diff mbox

[RESEND] fs: aio: fix the increment of aio-nr and counting against aio-max-nr

Message ID 1499262796-4022-1-git-send-email-mauricfo@linux.vnet.ibm.com (mailing list archive)
State New, archived
Headers show

Commit Message

Mauricio Faria de Oliveira July 5, 2017, 1:53 p.m. UTC
Currently, aio-nr is incremented in steps of 'num_possible_cpus() * 8'
for io_setup(nr_events, ..) with 'nr_events < num_possible_cpus() * 4':

    ioctx_alloc()
    ...
        nr_events = max(nr_events, num_possible_cpus() * 4);
        nr_events *= 2;
    ...
        ctx->max_reqs = nr_events;
    ...
        aio_nr += ctx->max_reqs;
    ....

This limits the number of aio contexts actually available to much less
than aio-max-nr, and is increasingly worse with greater number of CPUs.

For example, with 64 CPUs, only 256 aio contexts are actually available
(with aio-max-nr = 65536) because the increment is 512 in that scenario.

Note: 65536 [max aio contexts] / (64*4*2) [increment per aio context]
is 128, but make it 256 (double) as counting against 'aio-max-nr * 2':

    ioctx_alloc()
    ...
        if (aio_nr + nr_events > (aio_max_nr * 2UL) ||
        ...
            goto err_ctx;
    ...

This patch uses the original value of nr_events (from userspace) to
increment aio-nr and count against aio-max-nr, which resolves those.

Signed-off-by: Mauricio Faria de Oliveira <mauricfo@linux.vnet.ibm.com>
Reported-by: Lekshmi C. Pillai <lekshmi.cpillai@in.ibm.com>
Tested-by: Lekshmi C. Pillai <lekshmi.cpillai@in.ibm.com>
Tested-by: Paul Nguyen <nguyenp@us.ibm.com>
---
 fs/aio.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

Comments

Mauricio Faria de Oliveira July 5, 2017, 1:59 p.m. UTC | #1
On 07/05/2017 10:53 AM, Mauricio Faria de Oliveira wrote:
> This patch uses the original value of nr_events (from userspace) to
> increment aio-nr and count against aio-max-nr, which resolves those.

This has been tested with v4.12+ (commit 650fc870a2ef on Linus tree).

The test-case and test-suite validation steps are included later in
this message.


Example on a system with 64 CPUs:

# cat /sys/devices/system/cpu/possible
0-63

# grep . /proc/sys/fs/aio-*
/proc/sys/fs/aio-max-nr:65536
/proc/sys/fs/aio-nr:0


test 1)  number of aio contexts available with nr_events == 1
-------------------------------------------------------------

     This test calls io_setup(1, ..) up to 65536 times, exiting on error.


     - original kernel:

     Only 256 aio contexts could be created successfully,
     quickly falling into the aio-max-nr exceeded error path (-EAGAIN).

     # ./io_setup 1 65536 | grep -m1 . - /proc/sys/fs/aio-nr
     (standard input):io_setup(1, ): 256 calls with rc 0, last call with rc -11.
     /proc/sys/fs/aio-nr:131072

     One might notice the aio-nr value is twice the aio-max-nr limit,
     an effect of how the current code handles that 'nr_events *= 2'.


     - patched kernel:

     Almost all of the limit of aio contexts could be allocated,
     eventually falling into the insufficient resources error path 
(-ENOMEM):

     # ./io_setup 1 65536 | grep -m1 . - /proc/sys/fs/aio-nr
     (standard input):io_setup(1, ): 65516 calls with rc 0, last call with rc -12.
     /proc/sys/fs/aio-nr:65516

     Notice the aio-nr value is now _under_ the aio-max-nr limit.


test 2)  increment value for nr_events == 1
-------------------------------------------

     This test calls io_setup(1, ..) only 1 time, to show the increment:


     - original kernel:

     # ./io_setup 1 1 | grep -m1 . - /proc/sys/fs/aio-nr
     (standard input):io_setup(1, ): 1 calls with rc 0, last call with rc 0.
     /proc/sys/fs/aio-nr:512

     Notice the increment is 'num_possible_cpus() * 8'.


     - patched kernel:

     # ./io_setup 1 1 | grep -m1 . - /proc/sys/fs/aio-nr
     (standard input):io_setup(1, ): 1 calls with rc 0, last call with rc 0.
     /proc/sys/fs/aio-nr:1

     Notice the increment is exactly 1 (matches nr_events from userspace).



test 3)  more aio contexts available with great-enough nr_events
----------------------------------------------------------------

     The full aio-max-nr limit (65536) is available for greater nr_events.
     This test calls io_setup(1024, ) exactly 64 times, without error.


     - original kernel:

     # ./io_setup 1024 64 | grep -m1 . - /proc/sys/fs/aio-nr
     (standard input):io_setup(1024, ): 64 calls with rc 0, last call with rc 0.
     /proc/sys/fs/aio-nr:131072

     Notice the aio-nr value is twice the aio-max-nr limit.


     - patched kernel:

     # ./io_setup 1024 64 | grep -m1 . - /proc/sys/fs/aio-nr
     (standard input):io_setup(1024, ): 64 calls with rc 0, last call with rc 0.
     /proc/sys/fs/aio-nr:65536

     Notice the aio-nr value is now _exactly_ the aio-max-nr limit.


Test-case: io_setup.c # gcc -o io_setup io_setup.c -laio
---------

"""
#include <libaio.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/*
 * Test driver: call io_setup(nr_events, ..) up to nr_calls times, stopping
 * on the first error, then report how many calls succeeded and the rc of
 * the last call.  Usage: io_setup <nr_events> <max calls to io_setup()>
 */
int main(int argc, char *argv[]) {

     /* rc initialized: it is printed below even if the loop body never
      * runs (e.g. nr_calls parsed as 0) -- reading it uninitialized is UB */
     int nr_events, nr_calls, rc = 0, i;
     io_context_t *ioctx;

     /* usage: io_setup <nr_events for io_setup()> <max calls to io_setup()> */
     if (argc != 3)
	    return -1;

     nr_events = atoi(argv[1]);
     nr_calls = atoi(argv[2]);

     /* reject non-positive counts so the calloc size below is sane */
     if (nr_events <= 0 || nr_calls <= 0)
	    return -1;

     ioctx = calloc(nr_calls, sizeof(*ioctx));
     if (!ioctx)
	    return -2;

     /* double parentheses: the assignment-in-condition is intentional */
     for (i = 0; i < nr_calls; i++)
	    if ((rc = io_setup(nr_events, &ioctx[i])))
		    break;

     printf("io_setup(%d, ): %d calls with rc 0, last call with rc %d.\n",
	    nr_events, i, rc);
     fflush(stdout);

     /* hold the contexts briefly so /proc/sys/fs/aio-nr can be sampled
      * while they are still allocated */
     sleep(1);
     free(ioctx);
     return 0;
}
"""


Test-suite: libaio
----------

     # curl 
https://kojipkgs.fedoraproject.org/packages/libaio/0.3.110/7.fc26/src/libaio-0.3.110-7.fc26.src.rpm 
| rpm2cpio | cpio -mid

     # tar xf libaio-0.3.110.tar.gz
     # cd libaio-0.3.110

     # make
     # make check 2>&1 | grep '^test cases'
     test cases/2.t completed PASSED.
     test cases/3.t completed PASSED.
     test cases/4.t completed PASSED.
     test cases/5.t completed PASSED.
     test cases/6.t completed PASSED.
     test cases/7.t completed PASSED.
     test cases/11.t completed PASSED.
     test cases/12.t completed PASSED.
     test cases/13.t completed PASSED.
     test cases/14.t completed PASSED.
     test cases/15.t completed PASSED.
     test cases/16.t completed PASSED.
     test cases/10.t completed PASSED.
     test cases/8.t completed PASSED.
Jeff Moyer July 5, 2017, 7:28 p.m. UTC | #2
Mauricio Faria de Oliveira <mauricfo@linux.vnet.ibm.com> writes:

> Currently, aio-nr is incremented in steps of 'num_possible_cpus() * 8'
> for io_setup(nr_events, ..) with 'nr_events < num_possible_cpus() * 4':
>
>     ioctx_alloc()
>     ...
>         nr_events = max(nr_events, num_possible_cpus() * 4);
>         nr_events *= 2;
>     ...
>         ctx->max_reqs = nr_events;
>     ...
>         aio_nr += ctx->max_reqs;
>     ....
>
> This limits the number of aio contexts actually available to much less
> than aio-max-nr, and is increasingly worse with greater number of CPUs.
>
> For example, with 64 CPUs, only 256 aio contexts are actually available
> (with aio-max-nr = 65536) because the increment is 512 in that scenario.
>
> Note: 65536 [max aio contexts] / (64*4*2) [increment per aio context]
> is 128, but make it 256 (double) as counting against 'aio-max-nr * 2':
>
>     ioctx_alloc()
>     ...
>         if (aio_nr + nr_events > (aio_max_nr * 2UL) ||
>         ...
>             goto err_ctx;
>     ...
>
> This patch uses the original value of nr_events (from userspace) to
> increment aio-nr and count against aio-max-nr, which resolves those.
>
> Signed-off-by: Mauricio Faria de Oliveira <mauricfo@linux.vnet.ibm.com>
> Reported-by: Lekshmi C. Pillai <lekshmi.cpillai@in.ibm.com>
> Tested-by: Lekshmi C. Pillai <lekshmi.cpillai@in.ibm.com>
> Tested-by: Paul Nguyen <nguyenp@us.ibm.com>

Thanks for your persistence in re-posting this.  The fix looks good to
me.  Ben, can you queue this up?

Reviewed-by: Jeff Moyer <jmoyer@redhat.com>

> ---
>  fs/aio.c | 19 ++++++++++++-------
>  1 file changed, 12 insertions(+), 7 deletions(-)
>
> diff --git a/fs/aio.c b/fs/aio.c
> index f52d925ee259..3908480d7ccd 100644
> --- a/fs/aio.c
> +++ b/fs/aio.c
> @@ -441,10 +441,9 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
>  #endif
>  };
>  
> -static int aio_setup_ring(struct kioctx *ctx)
> +static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
>  {
>  	struct aio_ring *ring;
> -	unsigned nr_events = ctx->max_reqs;
>  	struct mm_struct *mm = current->mm;
>  	unsigned long size, unused;
>  	int nr_pages;
> @@ -707,6 +706,12 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
>  	int err = -ENOMEM;
>  
>  	/*
> +	 * Store the original nr_events -- what userspace passed to io_setup(),
> +	 * for counting against the global limit -- before it changes.
> +	 */
> +	unsigned int max_reqs = nr_events;
> +
> +	/*
>  	 * We keep track of the number of available ringbuffer slots, to prevent
>  	 * overflow (reqs_available), and we also use percpu counters for this.
>  	 *
> @@ -724,14 +729,14 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
>  		return ERR_PTR(-EINVAL);
>  	}
>  
> -	if (!nr_events || (unsigned long)nr_events > (aio_max_nr * 2UL))
> +	if (!nr_events || (unsigned long)max_reqs > aio_max_nr)
>  		return ERR_PTR(-EAGAIN);
>  
>  	ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);
>  	if (!ctx)
>  		return ERR_PTR(-ENOMEM);
>  
> -	ctx->max_reqs = nr_events;
> +	ctx->max_reqs = max_reqs;
>  
>  	spin_lock_init(&ctx->ctx_lock);
>  	spin_lock_init(&ctx->completion_lock);
> @@ -753,7 +758,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
>  	if (!ctx->cpu)
>  		goto err;
>  
> -	err = aio_setup_ring(ctx);
> +	err = aio_setup_ring(ctx, nr_events);
>  	if (err < 0)
>  		goto err;
>  
> @@ -764,8 +769,8 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
>  
>  	/* limit the number of system wide aios */
>  	spin_lock(&aio_nr_lock);
> -	if (aio_nr + nr_events > (aio_max_nr * 2UL) ||
> -	    aio_nr + nr_events < aio_nr) {
> +	if (aio_nr + ctx->max_reqs > aio_max_nr ||
> +	    aio_nr + ctx->max_reqs < aio_nr) {
>  		spin_unlock(&aio_nr_lock);
>  		err = -EAGAIN;
>  		goto err_ctx;
Benjamin LaHaise July 6, 2017, 9:07 p.m. UTC | #3
On Wed, Jul 05, 2017 at 03:28:14PM -0400, Jeff Moyer wrote:
> Mauricio Faria de Oliveira <mauricfo@linux.vnet.ibm.com> writes:
> 
> > Currently, aio-nr is incremented in steps of 'num_possible_cpus() * 8'
> > for io_setup(nr_events, ..) with 'nr_events < num_possible_cpus() * 4':
> >
> >     ioctx_alloc()
> >     ...
> >         nr_events = max(nr_events, num_possible_cpus() * 4);
> >         nr_events *= 2;
> >     ...
> >         ctx->max_reqs = nr_events;
> >     ...
> >         aio_nr += ctx->max_reqs;
> >     ....
> >
> > This limits the number of aio contexts actually available to much less
> > than aio-max-nr, and is increasingly worse with greater number of CPUs.
> >
> > For example, with 64 CPUs, only 256 aio contexts are actually available
> > (with aio-max-nr = 65536) because the increment is 512 in that scenario.
> >
> > Note: 65536 [max aio contexts] / (64*4*2) [increment per aio context]
> > is 128, but make it 256 (double) as counting against 'aio-max-nr * 2':
> >
> >     ioctx_alloc()
> >     ...
> >         if (aio_nr + nr_events > (aio_max_nr * 2UL) ||
> >         ...
> >             goto err_ctx;
> >     ...
> >
> > This patch uses the original value of nr_events (from userspace) to
> > increment aio-nr and count against aio-max-nr, which resolves those.
> >
> > Signed-off-by: Mauricio Faria de Oliveira <mauricfo@linux.vnet.ibm.com>
> > Reported-by: Lekshmi C. Pillai <lekshmi.cpillai@in.ibm.com>
> > Tested-by: Lekshmi C. Pillai <lekshmi.cpillai@in.ibm.com>
> > Tested-by: Paul Nguyen <nguyenp@us.ibm.com>
> 
> Thanks for your persistence in re-posting this.  The fix looks good to
> me.  Ben, can you queue this up?

I'm queuing this up in my aio-next and will push upstream after a few days
of soaking in linux-next.

		-ben

> Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
> 
> > ---
> >  fs/aio.c | 19 ++++++++++++-------
> >  1 file changed, 12 insertions(+), 7 deletions(-)
> >
> > diff --git a/fs/aio.c b/fs/aio.c
> > index f52d925ee259..3908480d7ccd 100644
> > --- a/fs/aio.c
> > +++ b/fs/aio.c
> > @@ -441,10 +441,9 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
> >  #endif
> >  };
> >  
> > -static int aio_setup_ring(struct kioctx *ctx)
> > +static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
> >  {
> >  	struct aio_ring *ring;
> > -	unsigned nr_events = ctx->max_reqs;
> >  	struct mm_struct *mm = current->mm;
> >  	unsigned long size, unused;
> >  	int nr_pages;
> > @@ -707,6 +706,12 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
> >  	int err = -ENOMEM;
> >  
> >  	/*
> > +	 * Store the original nr_events -- what userspace passed to io_setup(),
> > +	 * for counting against the global limit -- before it changes.
> > +	 */
> > +	unsigned int max_reqs = nr_events;
> > +
> > +	/*
> >  	 * We keep track of the number of available ringbuffer slots, to prevent
> >  	 * overflow (reqs_available), and we also use percpu counters for this.
> >  	 *
> > @@ -724,14 +729,14 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
> >  		return ERR_PTR(-EINVAL);
> >  	}
> >  
> > -	if (!nr_events || (unsigned long)nr_events > (aio_max_nr * 2UL))
> > +	if (!nr_events || (unsigned long)max_reqs > aio_max_nr)
> >  		return ERR_PTR(-EAGAIN);
> >  
> >  	ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);
> >  	if (!ctx)
> >  		return ERR_PTR(-ENOMEM);
> >  
> > -	ctx->max_reqs = nr_events;
> > +	ctx->max_reqs = max_reqs;
> >  
> >  	spin_lock_init(&ctx->ctx_lock);
> >  	spin_lock_init(&ctx->completion_lock);
> > @@ -753,7 +758,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
> >  	if (!ctx->cpu)
> >  		goto err;
> >  
> > -	err = aio_setup_ring(ctx);
> > +	err = aio_setup_ring(ctx, nr_events);
> >  	if (err < 0)
> >  		goto err;
> >  
> > @@ -764,8 +769,8 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
> >  
> >  	/* limit the number of system wide aios */
> >  	spin_lock(&aio_nr_lock);
> > -	if (aio_nr + nr_events > (aio_max_nr * 2UL) ||
> > -	    aio_nr + nr_events < aio_nr) {
> > +	if (aio_nr + ctx->max_reqs > aio_max_nr ||
> > +	    aio_nr + ctx->max_reqs < aio_nr) {
> >  		spin_unlock(&aio_nr_lock);
> >  		err = -EAGAIN;
> >  		goto err_ctx;
>
Jeff Moyer July 6, 2017, 10:25 p.m. UTC | #4
Benjamin LaHaise <bcrl@kvack.org> writes:

> I'm queuing this up in my aio-next and will push upstream after a few days
> of soaking in linux-next.

Thanks!

-Jeff
Mauricio Faria de Oliveira July 7, 2017, 12:44 p.m. UTC | #5
On 07/06/2017 06:07 PM, Benjamin LaHaise wrote:
> I'm queuing this up in my aio-next and will push upstream after a few days
> of soaking in linux-next.

Thanks, Ben.
Mauricio Faria de Oliveira July 14, 2017, 11:58 p.m. UTC | #6
Hi Ben,

On 07/06/2017 06:07 PM, Benjamin LaHaise wrote:
> I'm queuing this up in my aio-next and will push upstream after a few days
> of soaking in linux-next.

Apparently this patch could only make linux-next in today's tree (0714)
as seen in [1], and the merge window should close in 2 days (Jul, 16th)
according to LWN [2].

Wondering if you'd consider submitting it to the few initial v4.13-rcs ?
(after the testing period you mentioned) so that it can make v4.13, as
it can be considered a fix too, not just an improvement, IMHO :- )

Thank you.

[1] https://lkml.org/lkml/2017/7/13/888
[2] https://lwn.net/Articles/727385/
Mauricio Faria de Oliveira Sept. 7, 2017, 3:04 a.m. UTC | #7
Hi Benjamin,

On 07/06/2017 02:07 PM, Benjamin LaHaise wrote:

> On Wed, Jul 05, 2017 at 03:28:14PM -0400, Jeff Moyer wrote:
>> Thanks for your persistence in re-posting this.  The fix looks good to
>> me.  Ben, can you queue this up?
> I'm queuing this up in my aio-next and will push upstream after a few days
> of soaking in linux-next.

Do you plan to push this patch upstream in the current merge window?

cheers,
diff mbox

Patch

diff --git a/fs/aio.c b/fs/aio.c
index f52d925ee259..3908480d7ccd 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -441,10 +441,9 @@  static int aio_migratepage(struct address_space *mapping, struct page *new,
 #endif
 };
 
-static int aio_setup_ring(struct kioctx *ctx)
+static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
 {
 	struct aio_ring *ring;
-	unsigned nr_events = ctx->max_reqs;
 	struct mm_struct *mm = current->mm;
 	unsigned long size, unused;
 	int nr_pages;
@@ -707,6 +706,12 @@  static struct kioctx *ioctx_alloc(unsigned nr_events)
 	int err = -ENOMEM;
 
 	/*
+	 * Store the original nr_events -- what userspace passed to io_setup(),
+	 * for counting against the global limit -- before it changes.
+	 */
+	unsigned int max_reqs = nr_events;
+
+	/*
 	 * We keep track of the number of available ringbuffer slots, to prevent
 	 * overflow (reqs_available), and we also use percpu counters for this.
 	 *
@@ -724,14 +729,14 @@  static struct kioctx *ioctx_alloc(unsigned nr_events)
 		return ERR_PTR(-EINVAL);
 	}
 
-	if (!nr_events || (unsigned long)nr_events > (aio_max_nr * 2UL))
+	if (!nr_events || (unsigned long)max_reqs > aio_max_nr)
 		return ERR_PTR(-EAGAIN);
 
 	ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);
 	if (!ctx)
 		return ERR_PTR(-ENOMEM);
 
-	ctx->max_reqs = nr_events;
+	ctx->max_reqs = max_reqs;
 
 	spin_lock_init(&ctx->ctx_lock);
 	spin_lock_init(&ctx->completion_lock);
@@ -753,7 +758,7 @@  static struct kioctx *ioctx_alloc(unsigned nr_events)
 	if (!ctx->cpu)
 		goto err;
 
-	err = aio_setup_ring(ctx);
+	err = aio_setup_ring(ctx, nr_events);
 	if (err < 0)
 		goto err;
 
@@ -764,8 +769,8 @@  static struct kioctx *ioctx_alloc(unsigned nr_events)
 
 	/* limit the number of system wide aios */
 	spin_lock(&aio_nr_lock);
-	if (aio_nr + nr_events > (aio_max_nr * 2UL) ||
-	    aio_nr + nr_events < aio_nr) {
+	if (aio_nr + ctx->max_reqs > aio_max_nr ||
+	    aio_nr + ctx->max_reqs < aio_nr) {
 		spin_unlock(&aio_nr_lock);
 		err = -EAGAIN;
 		goto err_ctx;