diff mbox

fs/select: add vmalloc fallback for select(2)

Message ID 20160922152831.24165-1-vbabka@suse.cz (mailing list archive)
State New, archived
Headers show

Commit Message

Vlastimil Babka Sept. 22, 2016, 3:28 p.m. UTC
The select(2) syscall performs a kmalloc(size, GFP_KERNEL) where size grows
with the number of fds passed. We had a customer report page allocation
failures of order-4 for this allocation. This is a costly order, so it might
easily fail, as the VM expects such allocation to have a lower-order fallback.

Such trivial fallback is vmalloc(), as the memory doesn't have to be
physically contiguous. Also the allocation is temporary for the duration of the
syscall, so it's unlikely to stress vmalloc too much.

Note that the poll(2) syscall seems to use a linked list of order-0 pages, so
it doesn't need this kind of fallback.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 fs/select.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

Comments

Eric Dumazet Sept. 22, 2016, 4:24 p.m. UTC | #1
On Thu, 2016-09-22 at 17:28 +0200, Vlastimil Babka wrote:
> The select(2) syscall performs a kmalloc(size, GFP_KERNEL) where size grows
> with the number of fds passed. We had a customer report page allocation
> failures of order-4 for this allocation. This is a costly order, so it might
> easily fail, as the VM expects such allocation to have a lower-order fallback.
> 
> Such trivial fallback is vmalloc(), as the memory doesn't have to be
> physically contiguous. Also the allocation is temporary for the duration of the
> syscall, so it's unlikely to stress vmalloc too much.
> 
> Note that the poll(2) syscall seems to use a linked list of order-0 pages, so
> it doesn't need this kind of fallback.
> 
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> ---
>  fs/select.c | 15 +++++++++++----
>  1 file changed, 11 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/select.c b/fs/select.c
> index 8ed9da50896a..8fe5bddbe99b 100644
> --- a/fs/select.c
> +++ b/fs/select.c
> @@ -29,6 +29,7 @@
>  #include <linux/sched/rt.h>
>  #include <linux/freezer.h>
>  #include <net/busy_poll.h>
> +#include <linux/vmalloc.h>
>  
>  #include <asm/uaccess.h>
>  
> @@ -558,6 +559,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
>  	struct fdtable *fdt;
>  	/* Allocate small arguments on the stack to save memory and be faster */
>  	long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
> +	unsigned long alloc_size;
>  
>  	ret = -EINVAL;
>  	if (n < 0)
> @@ -580,10 +582,15 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
>  	bits = stack_fds;
>  	if (size > sizeof(stack_fds) / 6) {
>  		/* Not enough space in on-stack array; must use kmalloc */
> +		alloc_size = 6 * size;
>  		ret = -ENOMEM;
> -		bits = kmalloc(6 * size, GFP_KERNEL);
> -		if (!bits)
> -			goto out_nofds;
> +		bits = kmalloc(alloc_size, GFP_KERNEL|__GFP_NOWARN);
> +		if (!bits && alloc_size > PAGE_SIZE) {
> +			bits = vmalloc(alloc_size);
> +
> +			if (!bits)
> +				goto out_nofds;

Test should happen if alloc_size <= PAGE_SIZE

> +		}

if (!bits && alloc_size > PAGE_SIZE)
    bits = vmalloc(alloc_size);

if (!bits)
      goto out_nofds;



>  	}
>  	fds.in      = bits;
>  	fds.out     = bits +   size;
> @@ -618,7 +625,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
>  
>  out:
>  	if (bits != stack_fds)
> -		kfree(bits);
> +		kvfree(bits);
>  out_nofds:
>  	return ret;
>  }


--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Vlastimil Babka Sept. 22, 2016, 4:40 p.m. UTC | #2
On 09/22/2016 06:24 PM, Eric Dumazet wrote:

>> +		bits = kmalloc(alloc_size, GFP_KERNEL|__GFP_NOWARN);
>> +		if (!bits && alloc_size > PAGE_SIZE) {
>> +			bits = vmalloc(alloc_size);
>> +
>> +			if (!bits)
>> +				goto out_nofds;
>
> Test should happen if alloc_size <= PAGE_SIZE
>
>> +		}
>
> if (!bits && alloc_size > PAGE_SIZE)
>     bits = vmalloc(alloc_size);
>
> if (!bits)
>       goto out_nofds;
>

Thanks... stupid last-minute changes.


--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Hillf Danton Sept. 23, 2016, 6:42 a.m. UTC | #3
> 
> The select(2) syscall performs a kmalloc(size, GFP_KERNEL) where size grows
> with the number of fds passed. We had a customer report page allocation
> failures of order-4 for this allocation. This is a costly order, so it might
> easily fail, as the VM expects such allocation to have a lower-order fallback.
> 
> Such trivial fallback is vmalloc(), as the memory doesn't have to be
> physically contiguous. Also the allocation is temporary for the duration of the
> syscall, so it's unlikely to stress vmalloc too much.
> 
> Note that the poll(2) syscall seems to use a linked list of order-0 pages, so
> it doesn't need this kind of fallback.
> 
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> ---
>  fs/select.c | 15 +++++++++++----
>  1 file changed, 11 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/select.c b/fs/select.c
> index 8ed9da50896a..8fe5bddbe99b 100644
> --- a/fs/select.c
> +++ b/fs/select.c
> @@ -29,6 +29,7 @@
>  #include <linux/sched/rt.h>
>  #include <linux/freezer.h>
>  #include <net/busy_poll.h>
> +#include <linux/vmalloc.h>
> 
>  #include <asm/uaccess.h>
> 
> @@ -558,6 +559,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
>  	struct fdtable *fdt;
>  	/* Allocate small arguments on the stack to save memory and be faster */
>  	long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
> +	unsigned long alloc_size;
> 
>  	ret = -EINVAL;
>  	if (n < 0)
> @@ -580,10 +582,15 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
>  	bits = stack_fds;
>  	if (size > sizeof(stack_fds) / 6) {
>  		/* Not enough space in on-stack array; must use kmalloc */
> +		alloc_size = 6 * size;
>  		ret = -ENOMEM;
> -		bits = kmalloc(6 * size, GFP_KERNEL);
> -		if (!bits)
> -			goto out_nofds;
> +		bits = kmalloc(alloc_size, GFP_KERNEL|__GFP_NOWARN);
> +		if (!bits && alloc_size > PAGE_SIZE) {
> +			bits = vmalloc(alloc_size);
> +
> +			if (!bits)
> +				goto out_nofds;
> +		}

Looks like we also have to bail out if kmalloc fails with 
alloc_size less than PAGE_SIZE.

thanks
Hillf
>  	}
>  	fds.in      = bits;
>  	fds.out     = bits +   size;
> @@ -618,7 +625,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
> 
>  out:
>  	if (bits != stack_fds)
> -		kfree(bits);
> +		kvfree(bits);
>  out_nofds:
>  	return ret;
>  }
> --
> 2.10.0

--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/select.c b/fs/select.c
index 8ed9da50896a..8fe5bddbe99b 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -29,6 +29,7 @@ 
 #include <linux/sched/rt.h>
 #include <linux/freezer.h>
 #include <net/busy_poll.h>
+#include <linux/vmalloc.h>
 
 #include <asm/uaccess.h>
 
@@ -558,6 +559,7 @@  int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
 	struct fdtable *fdt;
 	/* Allocate small arguments on the stack to save memory and be faster */
 	long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
+	unsigned long alloc_size;
 
 	ret = -EINVAL;
 	if (n < 0)
@@ -580,10 +582,15 @@  int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
 	bits = stack_fds;
 	if (size > sizeof(stack_fds) / 6) {
 		/* Not enough space in on-stack array; must use kmalloc */
+		alloc_size = 6 * size;
 		ret = -ENOMEM;
-		bits = kmalloc(6 * size, GFP_KERNEL);
-		if (!bits)
-			goto out_nofds;
+		bits = kmalloc(alloc_size, GFP_KERNEL|__GFP_NOWARN);
+		if (!bits && alloc_size > PAGE_SIZE) {
+			bits = vmalloc(alloc_size);
+
+			if (!bits)
+				goto out_nofds;
+		}
 	}
 	fds.in      = bits;
 	fds.out     = bits +   size;
@@ -618,7 +625,7 @@  int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
 
 out:
 	if (bits != stack_fds)
-		kfree(bits);
+		kvfree(bits);
 out_nofds:
 	return ret;
 }