diff mbox series

[net-next,v3] xsk: support use vaddr as ring

Message ID 20230214015112.12094-1-xuanzhuo@linux.alibaba.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series [net-next,v3] xsk: support use vaddr as ring | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Single patches do not need cover letters
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 2 this patch: 2
netdev/cc_maintainers success CCed 14 of 14 maintainers
netdev/build_clang success Errors and warnings before: 1 this patch: 1
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 2 this patch: 2
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 65 lines checked
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Xuan Zhuo Feb. 14, 2023, 1:51 a.m. UTC
When we try to start AF_XDP on some machines with a long running time,
the machine's memory fragmentation means there may be no sufficient
contiguous physical memory available, which causes the start to fail.

If the size of the queue is 8 * 1024, then the size of the desc[] is
8 * 1024 * 8 = 16 * PAGE, but we also add the struct xdp_ring size, so it
is 16 pages plus a bit more. This requires a 4-order memory allocation. If
there are a lot of queues, such allocations are hard to satisfy on
machines with a long running time.

Note that we actually waste 15 pages: a 4-order allocation is 32 pages,
but we only use 17 of them.

This patch replaces __get_free_pages() with vmalloc() to allocate the
memory, solving these problems.

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
---
 net/xdp/xsk.c       |  9 ++-------
 net/xdp/xsk_queue.c | 10 ++++------
 net/xdp/xsk_queue.h |  1 +
 3 files changed, 7 insertions(+), 13 deletions(-)

Comments

Alexander Lobakin Feb. 14, 2023, 2:45 p.m. UTC | #1
From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Date: Tue, 14 Feb 2023 09:51:12 +0800

> When we try to start AF_XDP on some machines with long running time, due
> to the machine's memory fragmentation problem, there is no sufficient
> contiguous physical memory that will cause the start failure.

[...]

> @@ -1319,13 +1317,10 @@ static int xsk_mmap(struct file *file, struct socket *sock,
>  
>  	/* Matches the smp_wmb() in xsk_init_queue */
>  	smp_rmb();
> -	qpg = virt_to_head_page(q->ring);
> -	if (size > page_size(qpg))
> +	if (size > PAGE_ALIGN(q->ring_size))

You can set q->ring_size as PAGE_ALIGN(size) already at the allocation
to simplify this. I don't see any other places where you use it.

>  		return -EINVAL;
>  
> -	pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
> -	return remap_pfn_range(vma, vma->vm_start, pfn,
> -			       size, vma->vm_page_prot);
> +	return remap_vmalloc_range(vma, q->ring, 0);
>  }
>  
>  static int xsk_notifier(struct notifier_block *this,
> diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c
> index 6cf9586e5027..247316bdfcbe 100644
> --- a/net/xdp/xsk_queue.c
> +++ b/net/xdp/xsk_queue.c
> @@ -7,6 +7,7 @@
>  #include <linux/slab.h>
>  #include <linux/overflow.h>
>  #include <net/xdp_sock_drv.h>
> +#include <linux/vmalloc.h>

Alphabetic order maybe?

>  
>  #include "xsk_queue.h"
>  
> @@ -23,7 +24,6 @@ static size_t xskq_get_ring_size(struct xsk_queue *q, bool umem_queue)
>  struct xsk_queue *xskq_create(u32 nentries, bool umem_queue)
>  {
>  	struct xsk_queue *q;
> -	gfp_t gfp_flags;
>  	size_t size;
>  
>  	q = kzalloc(sizeof(*q), GFP_KERNEL);
> @@ -33,12 +33,10 @@ struct xsk_queue *xskq_create(u32 nentries, bool umem_queue)
>  	q->nentries = nentries;
>  	q->ring_mask = nentries - 1;
>  
> -	gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN |
> -		    __GFP_COMP  | __GFP_NORETRY;
>  	size = xskq_get_ring_size(q, umem_queue);
>  
> -	q->ring = (struct xdp_ring *)__get_free_pages(gfp_flags,
> -						      get_order(size));
> +	q->ring_size = size;

Maybe assign size only after successful allocation?

> +	q->ring = (struct xdp_ring *)vmalloc_user(size);

The cast from `void *` is redundant. It was needed for
__get_free_pages() since it returns pointer as long.

>  	if (!q->ring) {
>  		kfree(q);
>  		return NULL;
> @@ -52,6 +50,6 @@ void xskq_destroy(struct xsk_queue *q)
>  	if (!q)
>  		return;
>  
> -	page_frag_free(q->ring);
> +	vfree(q->ring);
>  	kfree(q);
>  }
> diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
> index c6fb6b763658..35922b8b92a8 100644
> --- a/net/xdp/xsk_queue.h
> +++ b/net/xdp/xsk_queue.h
> @@ -45,6 +45,7 @@ struct xsk_queue {
>  	struct xdp_ring *ring;
>  	u64 invalid_descs;
>  	u64 queue_empty_descs;
> +	size_t ring_size;
>  };
>  
>  /* The structure of the shared state of the rings are a simple
Thanks,
Olek
kernel test robot Feb. 14, 2023, 5:05 p.m. UTC | #2
Hi Xuan,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on net-next/master]

url:    https://github.com/intel-lab-lkp/linux/commits/Xuan-Zhuo/xsk-support-use-vaddr-as-ring/20230214-095210
patch link:    https://lore.kernel.org/r/20230214015112.12094-1-xuanzhuo%40linux.alibaba.com
patch subject: [PATCH net-next v3] xsk: support use vaddr as ring
config: sh-allmodconfig (https://download.01.org/0day-ci/archive/20230215/202302150059.M0lYLPTa-lkp@intel.com/config)
compiler: sh4-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/a2f7f17c84b0f4af1d0a8903b2b5e8e558f8359a
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Xuan-Zhuo/xsk-support-use-vaddr-as-ring/20230214-095210
        git checkout a2f7f17c84b0f4af1d0a8903b2b5e8e558f8359a
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=sh olddefconfig
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=sh SHELL=/bin/bash net/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
| Link: https://lore.kernel.org/oe-kbuild-all/202302150059.M0lYLPTa-lkp@intel.com/

All errors (new ones prefixed by >>):

   net/xdp/xsk.c: In function 'xsk_mmap':
>> net/xdp/xsk.c:1323:16: error: implicit declaration of function 'remap_vmalloc_range'; did you mean 'ida_alloc_range'? [-Werror=implicit-function-declaration]
    1323 |         return remap_vmalloc_range(vma, q->ring, 0);
         |                ^~~~~~~~~~~~~~~~~~~
         |                ida_alloc_range
   cc1: some warnings being treated as errors


vim +1323 net/xdp/xsk.c

  1290	
  1291	static int xsk_mmap(struct file *file, struct socket *sock,
  1292			    struct vm_area_struct *vma)
  1293	{
  1294		loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
  1295		unsigned long size = vma->vm_end - vma->vm_start;
  1296		struct xdp_sock *xs = xdp_sk(sock->sk);
  1297		struct xsk_queue *q = NULL;
  1298	
  1299		if (READ_ONCE(xs->state) != XSK_READY)
  1300			return -EBUSY;
  1301	
  1302		if (offset == XDP_PGOFF_RX_RING) {
  1303			q = READ_ONCE(xs->rx);
  1304		} else if (offset == XDP_PGOFF_TX_RING) {
  1305			q = READ_ONCE(xs->tx);
  1306		} else {
  1307			/* Matches the smp_wmb() in XDP_UMEM_REG */
  1308			smp_rmb();
  1309			if (offset == XDP_UMEM_PGOFF_FILL_RING)
  1310				q = READ_ONCE(xs->fq_tmp);
  1311			else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
  1312				q = READ_ONCE(xs->cq_tmp);
  1313		}
  1314	
  1315		if (!q)
  1316			return -EINVAL;
  1317	
  1318		/* Matches the smp_wmb() in xsk_init_queue */
  1319		smp_rmb();
  1320		if (size > PAGE_ALIGN(q->ring_size))
  1321			return -EINVAL;
  1322	
> 1323		return remap_vmalloc_range(vma, q->ring, 0);
  1324	}
  1325
Xuan Zhuo Feb. 15, 2023, 1:48 a.m. UTC | #3
On Tue, 14 Feb 2023 15:45:12 +0100, Alexander Lobakin <alexandr.lobakin@intel.com> wrote:
> From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> Date: Tue, 14 Feb 2023 09:51:12 +0800
>
> > When we try to start AF_XDP on some machines with long running time, due
> > to the machine's memory fragmentation problem, there is no sufficient
> > contiguous physical memory that will cause the start failure.
>
> [...]
>
> > @@ -1319,13 +1317,10 @@ static int xsk_mmap(struct file *file, struct socket *sock,
> >
> >  	/* Matches the smp_wmb() in xsk_init_queue */
> >  	smp_rmb();
> > -	qpg = virt_to_head_page(q->ring);
> > -	if (size > page_size(qpg))
> > +	if (size > PAGE_ALIGN(q->ring_size))
>
> You can set q->ring_size as PAGE_ALIGN(size) already at the allocation
> to simplify this. I don't see any other places where you use it.

That's it, but I think it is not particularly appropriate to change the
semantics of ring_size just to simplify this code. This may make
people feel strange.

I agree with you other opinions.

Thanks.


>
> >  		return -EINVAL;
> >
> > -	pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
> > -	return remap_pfn_range(vma, vma->vm_start, pfn,
> > -			       size, vma->vm_page_prot);
> > +	return remap_vmalloc_range(vma, q->ring, 0);
> >  }
> >
> >  static int xsk_notifier(struct notifier_block *this,
> > diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c
> > index 6cf9586e5027..247316bdfcbe 100644
> > --- a/net/xdp/xsk_queue.c
> > +++ b/net/xdp/xsk_queue.c
> > @@ -7,6 +7,7 @@
> >  #include <linux/slab.h>
> >  #include <linux/overflow.h>
> >  #include <net/xdp_sock_drv.h>
> > +#include <linux/vmalloc.h>
>
> Alphabetic order maybe?
>
> >
> >  #include "xsk_queue.h"
> >
> > @@ -23,7 +24,6 @@ static size_t xskq_get_ring_size(struct xsk_queue *q, bool umem_queue)
> >  struct xsk_queue *xskq_create(u32 nentries, bool umem_queue)
> >  {
> >  	struct xsk_queue *q;
> > -	gfp_t gfp_flags;
> >  	size_t size;
> >
> >  	q = kzalloc(sizeof(*q), GFP_KERNEL);
> > @@ -33,12 +33,10 @@ struct xsk_queue *xskq_create(u32 nentries, bool umem_queue)
> >  	q->nentries = nentries;
> >  	q->ring_mask = nentries - 1;
> >
> > -	gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN |
> > -		    __GFP_COMP  | __GFP_NORETRY;
> >  	size = xskq_get_ring_size(q, umem_queue);
> >
> > -	q->ring = (struct xdp_ring *)__get_free_pages(gfp_flags,
> > -						      get_order(size));
> > +	q->ring_size = size;
>
> Maybe assign size only after successful allocation?
>
> > +	q->ring = (struct xdp_ring *)vmalloc_user(size);
>
> The cast from `void *` is redundant. It was needed for
> __get_free_pages() since it returns pointer as long.
>
> >  	if (!q->ring) {
> >  		kfree(q);
> >  		return NULL;
> > @@ -52,6 +50,6 @@ void xskq_destroy(struct xsk_queue *q)
> >  	if (!q)
> >  		return;
> >
> > -	page_frag_free(q->ring);
> > +	vfree(q->ring);
> >  	kfree(q);
> >  }
> > diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
> > index c6fb6b763658..35922b8b92a8 100644
> > --- a/net/xdp/xsk_queue.h
> > +++ b/net/xdp/xsk_queue.h
> > @@ -45,6 +45,7 @@ struct xsk_queue {
> >  	struct xdp_ring *ring;
> >  	u64 invalid_descs;
> >  	u64 queue_empty_descs;
> > +	size_t ring_size;
> >  };
> >
> >  /* The structure of the shared state of the rings are a simple
> Thanks,
> Olek
Alexander Lobakin Feb. 15, 2023, 4:50 p.m. UTC | #4
From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Date: Wed, 15 Feb 2023 09:48:21 +0800

> On Tue, 14 Feb 2023 15:45:12 +0100, Alexander Lobakin <alexandr.lobakin@intel.com> wrote:
>> From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
>> Date: Tue, 14 Feb 2023 09:51:12 +0800
>>
>>> When we try to start AF_XDP on some machines with long running time, due
>>> to the machine's memory fragmentation problem, there is no sufficient
>>> contiguous physical memory that will cause the start failure.
>>
>> [...]
>>
>>> @@ -1319,13 +1317,10 @@ static int xsk_mmap(struct file *file, struct socket *sock,
>>>
>>>  	/* Matches the smp_wmb() in xsk_init_queue */
>>>  	smp_rmb();
>>> -	qpg = virt_to_head_page(q->ring);
>>> -	if (size > page_size(qpg))
>>> +	if (size > PAGE_ALIGN(q->ring_size))
>>
>> You can set q->ring_size as PAGE_ALIGN(size) already at the allocation
>> to simplify this. I don't see any other places where you use it.
> 
> That's it, but I think it is not particularly appropriate to change the
> the semantics of ring_size just for simplify this code. This may make
> people feel strange.

You can name it 'vmalloc_size' then. By "ring_size" I first of all
assume the number of elements, not the allocation size.

Also, wait, shouldn't you do this PAGE_ALIGN() *before* you actually
vmalloc() it? Can't here be out-of-bounds with the current approach?

> 
> I agree with you other opinions.
> 
> Thanks.

Thanks,
Olek
Xuan Zhuo Feb. 16, 2023, 8:22 a.m. UTC | #5
On Wed, 15 Feb 2023 17:50:54 +0100, Alexander Lobakin <aleksander.lobakin@intel.com> wrote:
> From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> Date: Wed, 15 Feb 2023 09:48:21 +0800
>
> > On Tue, 14 Feb 2023 15:45:12 +0100, Alexander Lobakin <alexandr.lobakin@intel.com> wrote:
> >> From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> >> Date: Tue, 14 Feb 2023 09:51:12 +0800
> >>
> >>> When we try to start AF_XDP on some machines with long running time, due
> >>> to the machine's memory fragmentation problem, there is no sufficient
> >>> contiguous physical memory that will cause the start failure.
> >>
> >> [...]
> >>
> >>> @@ -1319,13 +1317,10 @@ static int xsk_mmap(struct file *file, struct socket *sock,
> >>>
> >>>  	/* Matches the smp_wmb() in xsk_init_queue */
> >>>  	smp_rmb();
> >>> -	qpg = virt_to_head_page(q->ring);
> >>> -	if (size > page_size(qpg))
> >>> +	if (size > PAGE_ALIGN(q->ring_size))
> >>
> >> You can set q->ring_size as PAGE_ALIGN(size) already at the allocation
> >> to simplify this. I don't see any other places where you use it.
> >
> > That's it, but I think it is not particularly appropriate to change the
> > the semantics of ring_size just for simplify this code. This may make
> > people feel strange.
>
> You can name it 'vmalloc_size' then. By "ring_size" I first of all
> assume the number of elements, not the allocation size.


Maybe "ring_vmalloc_size"

>
> Also, wait, shouldn't you do this PAGE_ALIGN() *before* you actually
> vmalloc() it? Can't here be out-of-bounds with the current approach?


vmalloc_user() will do PAGE_ALIGN().

Thanks.


>
> >
> > I agree with you other opinions.
> >
> > Thanks.
>
> Thanks,
> Olek
diff mbox series

Patch

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 9f0561b67c12..6a588b99b670 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -1295,8 +1295,6 @@  static int xsk_mmap(struct file *file, struct socket *sock,
 	unsigned long size = vma->vm_end - vma->vm_start;
 	struct xdp_sock *xs = xdp_sk(sock->sk);
 	struct xsk_queue *q = NULL;
-	unsigned long pfn;
-	struct page *qpg;
 
 	if (READ_ONCE(xs->state) != XSK_READY)
 		return -EBUSY;
@@ -1319,13 +1317,10 @@  static int xsk_mmap(struct file *file, struct socket *sock,
 
 	/* Matches the smp_wmb() in xsk_init_queue */
 	smp_rmb();
-	qpg = virt_to_head_page(q->ring);
-	if (size > page_size(qpg))
+	if (size > PAGE_ALIGN(q->ring_size))
 		return -EINVAL;
 
-	pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
-	return remap_pfn_range(vma, vma->vm_start, pfn,
-			       size, vma->vm_page_prot);
+	return remap_vmalloc_range(vma, q->ring, 0);
 }
 
 static int xsk_notifier(struct notifier_block *this,
diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c
index 6cf9586e5027..247316bdfcbe 100644
--- a/net/xdp/xsk_queue.c
+++ b/net/xdp/xsk_queue.c
@@ -7,6 +7,7 @@ 
 #include <linux/slab.h>
 #include <linux/overflow.h>
 #include <net/xdp_sock_drv.h>
+#include <linux/vmalloc.h>
 
 #include "xsk_queue.h"
 
@@ -23,7 +24,6 @@  static size_t xskq_get_ring_size(struct xsk_queue *q, bool umem_queue)
 struct xsk_queue *xskq_create(u32 nentries, bool umem_queue)
 {
 	struct xsk_queue *q;
-	gfp_t gfp_flags;
 	size_t size;
 
 	q = kzalloc(sizeof(*q), GFP_KERNEL);
@@ -33,12 +33,10 @@  struct xsk_queue *xskq_create(u32 nentries, bool umem_queue)
 	q->nentries = nentries;
 	q->ring_mask = nentries - 1;
 
-	gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN |
-		    __GFP_COMP  | __GFP_NORETRY;
 	size = xskq_get_ring_size(q, umem_queue);
 
-	q->ring = (struct xdp_ring *)__get_free_pages(gfp_flags,
-						      get_order(size));
+	q->ring_size = size;
+	q->ring = (struct xdp_ring *)vmalloc_user(size);
 	if (!q->ring) {
 		kfree(q);
 		return NULL;
@@ -52,6 +50,6 @@  void xskq_destroy(struct xsk_queue *q)
 	if (!q)
 		return;
 
-	page_frag_free(q->ring);
+	vfree(q->ring);
 	kfree(q);
 }
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index c6fb6b763658..35922b8b92a8 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -45,6 +45,7 @@  struct xsk_queue {
 	struct xdp_ring *ring;
 	u64 invalid_descs;
 	u64 queue_empty_descs;
+	size_t ring_size;
 };
 
 /* The structure of the shared state of the rings are a simple