
aio: Convert ioctx_table to XArray

Message ID 20181128183531.5139-1-willy@infradead.org (mailing list archive)
State New, archived
Series aio: Convert ioctx_table to XArray

Commit Message

Matthew Wilcox Nov. 28, 2018, 6:35 p.m. UTC
This custom resizing array was vulnerable to a Spectre attack (speculating
off the end of an array to a user-controlled offset).  The XArray is
not vulnerable to Spectre as it always masks its lookups to be within
the bounds of the array.
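
As a rough illustration of the difference, a standalone sketch (not code from
this patch or from lib/xarray.c); NR_SLOTS and the function names are made up:

/* Bounds-checked indexing vs. masked indexing, userspace toy. */
#include <stddef.h>

#define NR_SLOTS 64			/* stands in for XA_CHUNK_SIZE */

void *slots[NR_SLOTS];

/*
 * Old ioctx_table style: the branch can be speculated past, so a
 * user-controlled id may index beyond nr during speculation.
 */
void *lookup_checked(size_t id, size_t nr)
{
	if (id >= nr)
		return NULL;
	return slots[id];
}

/*
 * XArray style: the offset used at each level is masked to the node
 * size, so even mispredicted speculation stays inside the slot array.
 */
void *lookup_masked(size_t id)
{
	return slots[id & (NR_SLOTS - 1)];
}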

Signed-off-by: Matthew Wilcox <willy@infradead.org>
---
 fs/aio.c                 | 182 ++++++++++++++-------------------------
 include/linux/mm_types.h |   5 +-
 kernel/fork.c            |   3 +-
 mm/debug.c               |   6 --
 4 files changed, 67 insertions(+), 129 deletions(-)

Comments

Matthew Wilcox Dec. 6, 2018, 9:08 p.m. UTC | #1
ping

On Wed, Nov 28, 2018 at 10:35:31AM -0800, Matthew Wilcox wrote:
> This custom resizing array was vulnerable to a Spectre attack (speculating
> off the end of an array to a user-controlled offset).  The XArray is
> not vulnerable to Spectre as it always masks its lookups to be within
> the bounds of the array.
> 
> Signed-off-by: Matthew Wilcox <willy@infradead.org>
> ---
>  fs/aio.c                 | 182 ++++++++++++++-------------------------
>  include/linux/mm_types.h |   5 +-
>  kernel/fork.c            |   3 +-
>  mm/debug.c               |   6 --
>  4 files changed, 67 insertions(+), 129 deletions(-)
> 
> diff --git a/fs/aio.c b/fs/aio.c
> index 301e6314183b..51ba7446a24f 100644
> --- a/fs/aio.c
> +++ b/fs/aio.c
> @@ -71,12 +71,6 @@ struct aio_ring {
>  
>  #define AIO_RING_PAGES	8
>  
> -struct kioctx_table {
> -	struct rcu_head		rcu;
> -	unsigned		nr;
> -	struct kioctx __rcu	*table[];
> -};
> -
>  struct kioctx_cpu {
>  	unsigned		reqs_available;
>  };
> @@ -313,27 +307,22 @@ static int aio_ring_mremap(struct vm_area_struct *vma)
>  {
>  	struct file *file = vma->vm_file;
>  	struct mm_struct *mm = vma->vm_mm;
> -	struct kioctx_table *table;
> -	int i, res = -EINVAL;
> +	struct kioctx *ctx;
> +	unsigned long index;
> +	int res = -EINVAL;
>  
> -	spin_lock(&mm->ioctx_lock);
> -	rcu_read_lock();
> -	table = rcu_dereference(mm->ioctx_table);
> -	for (i = 0; i < table->nr; i++) {
> -		struct kioctx *ctx;
> -
> -		ctx = rcu_dereference(table->table[i]);
> -		if (ctx && ctx->aio_ring_file == file) {
> -			if (!atomic_read(&ctx->dead)) {
> -				ctx->user_id = ctx->mmap_base = vma->vm_start;
> -				res = 0;
> -			}
> -			break;
> +	xa_lock(&mm->ioctx);
> +	xa_for_each(&mm->ioctx, ctx, index, ULONG_MAX, XA_PRESENT) {
> +		if (ctx->aio_ring_file != file)
> +			continue;
> +		if (!atomic_read(&ctx->dead)) {
> +			ctx->user_id = ctx->mmap_base = vma->vm_start;
> +			res = 0;
>  		}
> +		break;
>  	}
> +	xa_unlock(&mm->ioctx);
>  
> -	rcu_read_unlock();
> -	spin_unlock(&mm->ioctx_lock);
>  	return res;
>  }
>  
> @@ -617,57 +606,21 @@ static void free_ioctx_users(struct percpu_ref *ref)
>  
>  static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
>  {
> -	unsigned i, new_nr;
> -	struct kioctx_table *table, *old;
>  	struct aio_ring *ring;
> +	int err;
>  
> -	spin_lock(&mm->ioctx_lock);
> -	table = rcu_dereference_raw(mm->ioctx_table);
> -
> -	while (1) {
> -		if (table)
> -			for (i = 0; i < table->nr; i++)
> -				if (!rcu_access_pointer(table->table[i])) {
> -					ctx->id = i;
> -					rcu_assign_pointer(table->table[i], ctx);
> -					spin_unlock(&mm->ioctx_lock);
> -
> -					/* While kioctx setup is in progress,
> -					 * we are protected from page migration
> -					 * changes ring_pages by ->ring_lock.
> -					 */
> -					ring = kmap_atomic(ctx->ring_pages[0]);
> -					ring->id = ctx->id;
> -					kunmap_atomic(ring);
> -					return 0;
> -				}
> -
> -		new_nr = (table ? table->nr : 1) * 4;
> -		spin_unlock(&mm->ioctx_lock);
> -
> -		table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) *
> -				new_nr, GFP_KERNEL);
> -		if (!table)
> -			return -ENOMEM;
> -
> -		table->nr = new_nr;
> -
> -		spin_lock(&mm->ioctx_lock);
> -		old = rcu_dereference_raw(mm->ioctx_table);
> -
> -		if (!old) {
> -			rcu_assign_pointer(mm->ioctx_table, table);
> -		} else if (table->nr > old->nr) {
> -			memcpy(table->table, old->table,
> -			       old->nr * sizeof(struct kioctx *));
> +	err = xa_alloc(&mm->ioctx, &ctx->id, UINT_MAX, ctx, GFP_KERNEL);
> +	if (err)
> +		return err;
>  
> -			rcu_assign_pointer(mm->ioctx_table, table);
> -			kfree_rcu(old, rcu);
> -		} else {
> -			kfree(table);
> -			table = old;
> -		}
> -	}
> +	/*
> +	 * While kioctx setup is in progress, we are protected from
> +	 * page migration changing ring_pages by ->ring_lock.
> +	 */
> +	ring = kmap_atomic(ctx->ring_pages[0]);
> +	ring->id = ctx->id;
> +	kunmap_atomic(ring);
> +	return 0;
>  }
>  
>  static void aio_nr_sub(unsigned nr)
> @@ -793,27 +746,8 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
>  	return ERR_PTR(err);
>  }
>  
> -/* kill_ioctx
> - *	Cancels all outstanding aio requests on an aio context.  Used
> - *	when the processes owning a context have all exited to encourage
> - *	the rapid destruction of the kioctx.
> - */
> -static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
> -		      struct ctx_rq_wait *wait)
> +static void __kill_ioctx(struct kioctx *ctx, struct ctx_rq_wait *wait)
>  {
> -	struct kioctx_table *table;
> -
> -	spin_lock(&mm->ioctx_lock);
> -	if (atomic_xchg(&ctx->dead, 1)) {
> -		spin_unlock(&mm->ioctx_lock);
> -		return -EINVAL;
> -	}
> -
> -	table = rcu_dereference_raw(mm->ioctx_table);
> -	WARN_ON(ctx != rcu_access_pointer(table->table[ctx->id]));
> -	RCU_INIT_POINTER(table->table[ctx->id], NULL);
> -	spin_unlock(&mm->ioctx_lock);
> -
>  	/* free_ioctx_reqs() will do the necessary RCU synchronization */
>  	wake_up_all(&ctx->wait);
>  
> @@ -831,6 +765,30 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
>  
>  	ctx->rq_wait = wait;
>  	percpu_ref_kill(&ctx->users);
> +}
> +
> +/* kill_ioctx
> + *	Cancels all outstanding aio requests on an aio context.  Used
> + *	when the processes owning a context have all exited to encourage
> + *	the rapid destruction of the kioctx.
> + */
> +static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
> +		      struct ctx_rq_wait *wait)
> +{
> +	struct kioctx *old;
> +
> +	/* I don't understand what this lock is protecting against */
> +	xa_lock(&mm->ioctx);
> +	if (atomic_xchg(&ctx->dead, 1)) {
> +		xa_unlock(&mm->ioctx);
> +		return -EINVAL;
> +	}
> +
> +	old = __xa_erase(&mm->ioctx, ctx->id);
> +	WARN_ON(old != ctx);
> +	xa_unlock(&mm->ioctx);
> +
> +	__kill_ioctx(ctx, wait);
>  	return 0;
>  }
>  
> @@ -844,26 +802,21 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
>   */
>  void exit_aio(struct mm_struct *mm)
>  {
> -	struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table);
> +	struct kioctx *ctx;
>  	struct ctx_rq_wait wait;
> -	int i, skipped;
> +	unsigned long index;
>  
> -	if (!table)
> +	if (xa_empty(&mm->ioctx))
>  		return;
>  
> -	atomic_set(&wait.count, table->nr);
> +	/*
> +	 * Prevent count from getting to 0 until we're ready for it
> +	 */
> +	atomic_set(&wait.count, 1);
>  	init_completion(&wait.comp);
>  
> -	skipped = 0;
> -	for (i = 0; i < table->nr; ++i) {
> -		struct kioctx *ctx =
> -			rcu_dereference_protected(table->table[i], true);
> -
> -		if (!ctx) {
> -			skipped++;
> -			continue;
> -		}
> -
> +	xa_lock(&mm->ioctx);
> +	xa_for_each(&mm->ioctx, ctx, index, ULONG_MAX, XA_PRESENT) {
>  		/*
>  		 * We don't need to bother with munmap() here - exit_mmap(mm)
>  		 * is coming and it'll unmap everything. And we simply can't,
> @@ -872,16 +825,16 @@ void exit_aio(struct mm_struct *mm)
>  		 * that it needs to unmap the area, just set it to 0.
>  		 */
>  		ctx->mmap_size = 0;
> -		kill_ioctx(mm, ctx, &wait);
> +		atomic_inc(&wait.count);
> +		__xa_erase(&mm->ioctx, ctx->id);
> +		__kill_ioctx(ctx, &wait);
>  	}
> +	xa_unlock(&mm->ioctx);
>  
> -	if (!atomic_sub_and_test(skipped, &wait.count)) {
> +	if (!atomic_sub_and_test(1, &wait.count)) {
>  		/* Wait until all IO for the context are done. */
>  		wait_for_completion(&wait.comp);
>  	}
> -
> -	RCU_INIT_POINTER(mm->ioctx_table, NULL);
> -	kfree(table);
>  }
>  
>  static void put_reqs_available(struct kioctx *ctx, unsigned nr)
> @@ -1026,24 +979,17 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
>  	struct aio_ring __user *ring  = (void __user *)ctx_id;
>  	struct mm_struct *mm = current->mm;
>  	struct kioctx *ctx, *ret = NULL;
> -	struct kioctx_table *table;
>  	unsigned id;
>  
>  	if (get_user(id, &ring->id))
>  		return NULL;
>  
>  	rcu_read_lock();
> -	table = rcu_dereference(mm->ioctx_table);
> -
> -	if (!table || id >= table->nr)
> -		goto out;
> -
> -	ctx = rcu_dereference(table->table[id]);
> +	ctx = xa_load(&mm->ioctx, id);
>  	if (ctx && ctx->user_id == ctx_id) {
>  		if (percpu_ref_tryget_live(&ctx->users))
>  			ret = ctx;
>  	}
> -out:
>  	rcu_read_unlock();
>  	return ret;
>  }
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 5ed8f6292a53..1a95bb27f5a7 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -14,6 +14,7 @@
>  #include <linux/uprobes.h>
>  #include <linux/page-flags-layout.h>
>  #include <linux/workqueue.h>
> +#include <linux/xarray.h>
>  
>  #include <asm/mmu.h>
>  
> @@ -336,7 +337,6 @@ struct core_state {
>  	struct completion startup;
>  };
>  
> -struct kioctx_table;
>  struct mm_struct {
>  	struct {
>  		struct vm_area_struct *mmap;		/* list of VMAs */
> @@ -431,8 +431,7 @@ struct mm_struct {
>  		atomic_t membarrier_state;
>  #endif
>  #ifdef CONFIG_AIO
> -		spinlock_t			ioctx_lock;
> -		struct kioctx_table __rcu	*ioctx_table;
> +		struct xarray ioctx;
>  #endif
>  #ifdef CONFIG_MEMCG
>  		/*
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 07cddff89c7b..acb775f9592d 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -946,8 +946,7 @@ __setup("coredump_filter=", coredump_filter_setup);
>  static void mm_init_aio(struct mm_struct *mm)
>  {
>  #ifdef CONFIG_AIO
> -	spin_lock_init(&mm->ioctx_lock);
> -	mm->ioctx_table = NULL;
> +	xa_init_flags(&mm->ioctx, XA_FLAGS_ALLOC);
>  #endif
>  }
>  
> diff --git a/mm/debug.c b/mm/debug.c
> index cdacba12e09a..d45ec63aed8c 100644
> --- a/mm/debug.c
> +++ b/mm/debug.c
> @@ -127,9 +127,6 @@ void dump_mm(const struct mm_struct *mm)
>  		"start_brk %lx brk %lx start_stack %lx\n"
>  		"arg_start %lx arg_end %lx env_start %lx env_end %lx\n"
>  		"binfmt %px flags %lx core_state %px\n"
> -#ifdef CONFIG_AIO
> -		"ioctx_table %px\n"
> -#endif
>  #ifdef CONFIG_MEMCG
>  		"owner %px "
>  #endif
> @@ -158,9 +155,6 @@ void dump_mm(const struct mm_struct *mm)
>  		mm->start_brk, mm->brk, mm->start_stack,
>  		mm->arg_start, mm->arg_end, mm->env_start, mm->env_end,
>  		mm->binfmt, mm->flags, mm->core_state,
> -#ifdef CONFIG_AIO
> -		mm->ioctx_table,
> -#endif
>  #ifdef CONFIG_MEMCG
>  		mm->owner,
>  #endif
> -- 
> 2.19.1
>
Jeff Moyer Dec. 6, 2018, 10:21 p.m. UTC | #2
Matthew Wilcox <willy@infradead.org> writes:

> This custom resizing array was vulnerable to a Spectre attack (speculating
> off the end of an array to a user-controlled offset).  The XArray is
> not vulnerable to Spectre as it always masks its lookups to be within
> the bounds of the array.

I'm not a big fan of completely re-writing the code to fix this.  Isn't
the below patch sufficient?

-Jeff

diff --git a/fs/aio.c b/fs/aio.c
index 97f983592925..9402ae0b63d5 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1038,6 +1038,7 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
 	if (!table || id >= table->nr)
 		goto out;
 
+	id = array_index_nospec(index, table->nr);
 	ctx = rcu_dereference(table->table[id]);
 	if (ctx && ctx->user_id == ctx_id) {
 		if (percpu_ref_tryget_live(&ctx->users))
Jeff Moyer Dec. 6, 2018, 10:26 p.m. UTC | #3
Jeff Moyer <jmoyer@redhat.com> writes:

> Matthew Wilcox <willy@infradead.org> writes:
>
>> This custom resizing array was vulnerable to a Spectre attack (speculating
>> off the end of an array to a user-controlled offset).  The XArray is
>> not vulnerable to Spectre as it always masks its lookups to be within
>> the bounds of the array.
>
> I'm not a big fan of completely re-writing the code to fix this.  Isn't
> the below patch sufficient?

Too quick on the draw.  Here's a patch that compiles.  ;-)

Cheers,
Jeff

diff --git a/fs/aio.c b/fs/aio.c
index 97f983592925..aac9659381d2 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -45,6 +45,7 @@
 
 #include <asm/kmap_types.h>
 #include <linux/uaccess.h>
+#include <linux/nospec.h>
 
 #include "internal.h"
 
@@ -1038,6 +1039,7 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
 	if (!table || id >= table->nr)
 		goto out;
 
+	id = array_index_nospec(id, table->nr);
 	ctx = rcu_dereference(table->table[id]);
 	if (ctx && ctx->user_id == ctx_id) {
 		if (percpu_ref_tryget_live(&ctx->users))
Jeff Moyer Dec. 11, 2018, 5:21 p.m. UTC | #4
Jeff Moyer <jmoyer@redhat.com> writes:

> Jeff Moyer <jmoyer@redhat.com> writes:
>
>> Matthew Wilcox <willy@infradead.org> writes:
>>
>>> This custom resizing array was vulnerable to a Spectre attack (speculating
>>> off the end of an array to a user-controlled offset).  The XArray is
>>> not vulnerable to Spectre as it always masks its lookups to be within
>>> the bounds of the array.
>>
>> I'm not a big fan of completely re-writing the code to fix this.  Isn't
>> the below patch sufficient?
>
> Too quick on the draw.  Here's a patch that compiles.  ;-)

Hi, Matthew,

I'm going to submit this version formally.  If you're interested in
converting the ioctx_table to xarray, you can do that separately from a
security fix.  I would include a performance analysis with that patch,
though.  The idea of using a radix tree for the ioctx table was
discarded due to performance reasons--see commit db446a08c23d5 ("aio:
convert the ioctx list to table lookup v3").  I suspect using the xarray
will perform similarly.

Cheers,
Jeff

> diff --git a/fs/aio.c b/fs/aio.c
> index 97f983592925..aac9659381d2 100644
> --- a/fs/aio.c
> +++ b/fs/aio.c
> @@ -45,6 +45,7 @@
>  
>  #include <asm/kmap_types.h>
>  #include <linux/uaccess.h>
> +#include <linux/nospec.h>
>  
>  #include "internal.h"
>  
> @@ -1038,6 +1039,7 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
>  	if (!table || id >= table->nr)
>  		goto out;
>  
> +	id = array_index_nospec(id, table->nr);
>  	ctx = rcu_dereference(table->table[id]);
>  	if (ctx && ctx->user_id == ctx_id) {
>  		if (percpu_ref_tryget_live(&ctx->users))
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-aio' in
> the body to majordomo@kvack.org.  For more info on Linux AIO,
> see: http://www.kvack.org/aio/
> Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>
Matthew Wilcox Dec. 11, 2018, 5:51 p.m. UTC | #5
On Tue, Dec 11, 2018 at 12:21:52PM -0500, Jeff Moyer wrote:
> I'm going to submit this version formally.  If you're interested in
> converting the ioctx_table to xarray, you can do that separately from a
> security fix.  I would include a performance analysis with that patch,
> though.  The idea of using a radix tree for the ioctx table was
> discarded due to performance reasons--see commit db446a08c23d5 ("aio:
> convert the ioctx list to table lookup v3").  I suspect using the xarray
> will perform similarly.

There's a big difference between Octavian's patch and mine.  That patch
indexed into the radix tree by 'ctx_id' directly, which was pretty
much guaranteed to exhibit some close-to-worst-case behaviour from the
radix tree due to IDs being sparsely assigned.  My patch uses the ring
ID which _we_ assigned, and so is nicely behaved, being usually a very
small integer.
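
As a back-of-the-envelope sketch of that difference (assuming, as a
simplification, that both structures consume 6 bits of the key per level;
the address below is a made-up example, not a value taken from either patch):

#include <stdio.h>

/* levels a 64-slot-per-node tree needs to cover a given key */
static unsigned int levels(unsigned long key)
{
	unsigned int n = 1;

	while (key >>= 6)
		n++;
	return n;
}

int main(void)
{
	/* sparse key: ctx_id is the userspace address of the ring mapping */
	printf("ctx_id  0x7f2a1c200000 -> %u levels\n", levels(0x7f2a1c200000UL));
	/* dense key: the ring id this patch hands out with xa_alloc() */
	printf("ring id 3              -> %u levels\n", levels(3UL));
	return 0;
}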

What performance analysis would you find compelling?  Octavian's original
fio script:

> rw=randrw; size=256k ;directory=/mnt/fio; ioengine=libaio; iodepth=1
> blocksize=1024; numjobs=512; thread; loops=100
> 
> on an EXT2 filesystem mounted on top of a ramdisk

or something else?
Jeff Moyer Dec. 11, 2018, 6:02 p.m. UTC | #6
Matthew Wilcox <willy@infradead.org> writes:

> On Tue, Dec 11, 2018 at 12:21:52PM -0500, Jeff Moyer wrote:
>> I'm going to submit this version formally.  If you're interested in
>> converting the ioctx_table to xarray, you can do that separately from a
>> security fix.  I would include a performance analysis with that patch,
>> though.  The idea of using a radix tree for the ioctx table was
>> discarded due to performance reasons--see commit db446a08c23d5 ("aio:
>> convert the ioctx list to table lookup v3").  I suspect using the xarray
>> will perform similarly.
>
> There's a big difference between Octavian's patch and mine.  That patch
> indexed into the radix tree by 'ctx_id' directly, which was pretty
> much guaranteed to exhibit some close-to-worst-case behaviour from the
> radix tree due to IDs being sparsely assigned.  My patch uses the ring
> ID which _we_ assigned, and so is nicely behaved, being usually a very
> small integer.

OK, good to know.  I obviously didn't look too closely at the two.

> What performance analysis would you find compelling?  Octavian's original
> fio script:
>
>> rw=randrw; size=256k ;directory=/mnt/fio; ioengine=libaio; iodepth=1
>> blocksize=1024; numjobs=512; thread; loops=100
>> 
>> on an EXT2 filesystem mounted on top of a ramdisk
>
> or something else?

I think the most common use case is a small number of ioctx-s, so I'd
like to see that use case not regress (that should be easy, right?).
Kent, what were the tests you were using when doing this work?  Jens,
since you're doing performance work in this area now, are there any
particular test cases you care about?

Cheers,
Jeff
Jens Axboe Dec. 11, 2018, 6:05 p.m. UTC | #7
On 12/11/18 11:02 AM, Jeff Moyer wrote:
> Matthew Wilcox <willy@infradead.org> writes:
> 
>> On Tue, Dec 11, 2018 at 12:21:52PM -0500, Jeff Moyer wrote:
>>> I'm going to submit this version formally.  If you're interested in
>>> converting the ioctx_table to xarray, you can do that separately from a
>>> security fix.  I would include a performance analysis with that patch,
>>> though.  The idea of using a radix tree for the ioctx table was
>>> discarded due to performance reasons--see commit db446a08c23d5 ("aio:
>>> convert the ioctx list to table lookup v3").  I suspect using the xarray
>>> will perform similarly.
>>
>> There's a big difference between Octavian's patch and mine.  That patch
>> indexed into the radix tree by 'ctx_id' directly, which was pretty
>> much guaranteed to exhibit some close-to-worst-case behaviour from the
>> radix tree due to IDs being sparsely assigned.  My patch uses the ring
>> ID which _we_ assigned, and so is nicely behaved, being usually a very
>> small integer.
> 
> OK, good to know.  I obviously didn't look too closely at the two.
> 
>> What performance analysis would you find compelling?  Octavian's original
>> fio script:
>>
>>> rw=randrw; size=256k ;directory=/mnt/fio; ioengine=libaio; iodepth=1
>>> blocksize=1024; numjobs=512; thread; loops=100
>>>
>>> on an EXT2 filesystem mounted on top of a ramdisk
>>
>> or something else?
> 
> I think the most common use case is a small number of ioctx-s, so I'd
> like to see that use case not regress (that should be easy, right?).
> Kent, what were the tests you were using when doing this work?  Jens,
> since you're doing performance work in this area now, are there any
> particular test cases you care about?

I can give it a spin, ioctx lookup is in the fast path, and for "classic"
aio we do it twice for each IO...
Jeff Moyer Dec. 11, 2018, 6:09 p.m. UTC | #8
Jens Axboe <axboe@kernel.dk> writes:

> On 12/11/18 11:02 AM, Jeff Moyer wrote:
>> Matthew Wilcox <willy@infradead.org> writes:
>> 
>>> On Tue, Dec 11, 2018 at 12:21:52PM -0500, Jeff Moyer wrote:
>>>> I'm going to submit this version formally.  If you're interested in
>>>> converting the ioctx_table to xarray, you can do that separately from a
>>>> security fix.  I would include a performance analysis with that patch,
>>>> though.  The idea of using a radix tree for the ioctx table was
>>>> discarded due to performance reasons--see commit db446a08c23d5 ("aio:
>>>> convert the ioctx list to table lookup v3").  I suspect using the xarray
>>>> will perform similarly.
>>>
>>> There's a big difference between Octavian's patch and mine.  That patch
>>> indexed into the radix tree by 'ctx_id' directly, which was pretty
>>> much guaranteed to exhibit some close-to-worst-case behaviour from the
>>> radix tree due to IDs being sparsely assigned.  My patch uses the ring
>>> ID which _we_ assigned, and so is nicely behaved, being usually a very
>>> small integer.
>> 
>> OK, good to know.  I obviously didn't look too closely at the two.
>> 
>>> What performance analysis would you find compelling?  Octavian's original
>>> fio script:
>>>
>>>> rw=randrw; size=256k ;directory=/mnt/fio; ioengine=libaio; iodepth=1
>>>> blocksize=1024; numjobs=512; thread; loops=100
>>>>
>>>> on an EXT2 filesystem mounted on top of a ramdisk
>>>
>>> or something else?
>> 
>> I think the most common use case is a small number of ioctx-s, so I'd
>> like to see that use case not regress (that should be easy, right?).

Bah, I meant a small number of threads doing submit/getevents.

>> Kent, what were the tests you were using when doing this work?  Jens,
>> since you're doing performance work in this area now, are there any
>> particular test cases you care about?
>
> I can give it a spin, ioctx lookup is in the fast path, and for "classic"
> aio we do it twice for each IO...

Thanks!

-Jeff
Jens Axboe Dec. 11, 2018, 6:32 p.m. UTC | #9
On 12/11/18 11:05 AM, Jens Axboe wrote:
> On 12/11/18 11:02 AM, Jeff Moyer wrote:
>> Matthew Wilcox <willy@infradead.org> writes:
>>
>>> On Tue, Dec 11, 2018 at 12:21:52PM -0500, Jeff Moyer wrote:
>>>> I'm going to submit this version formally.  If you're interested in
>>>> converting the ioctx_table to xarray, you can do that separately from a
>>>> security fix.  I would include a performance analysis with that patch,
>>>> though.  The idea of using a radix tree for the ioctx table was
>>>> discarded due to performance reasons--see commit db446a08c23d5 ("aio:
>>>> convert the ioctx list to table lookup v3").  I suspect using the xarray
>>>> will perform similarly.
>>>
>>> There's a big difference between Octavian's patch and mine.  That patch
>>> indexed into the radix tree by 'ctx_id' directly, which was pretty
>>> much guaranteed to exhibit some close-to-worst-case behaviour from the
>>> radix tree due to IDs being sparsely assigned.  My patch uses the ring
>>> ID which _we_ assigned, and so is nicely behaved, being usually a very
>>> small integer.
>>
>> OK, good to know.  I obviously didn't look too closely at the two.
>>
>>> What performance analysis would you find compelling?  Octavian's original
>>> fio script:
>>>
>>>> rw=randrw; size=256k ;directory=/mnt/fio; ioengine=libaio; iodepth=1
>>>> blocksize=1024; numjobs=512; thread; loops=100
>>>>
>>>> on an EXT2 filesystem mounted on top of a ramdisk
>>>
>>> or something else?
>>
>> I think the most common use case is a small number of ioctx-s, so I'd
>> like to see that use case not regress (that should be easy, right?).
>> Kent, what were the tests you were using when doing this work?  Jens,
>> since you're doing performance work in this area now, are there any
>> particular test cases you care about?
> 
> I can give it a spin, ioctx lookup is in the fast path, and for "classic"
> aio we do it twice for each IO...

Don't see any regressions. But if we're fiddling with it anyway, can't
we do something smarter? Make the fast path just index a table, and put
all the big hammers in setup/destroy. We're spending a non-substantial
amount of time doing lookups, that's really no different before and
after the patch.
Jens Axboe Dec. 11, 2018, 6:36 p.m. UTC | #10
On 12/11/18 11:32 AM, Jens Axboe wrote:
> On 12/11/18 11:05 AM, Jens Axboe wrote:
>> On 12/11/18 11:02 AM, Jeff Moyer wrote:
>>> Matthew Wilcox <willy@infradead.org> writes:
>>>
>>>> On Tue, Dec 11, 2018 at 12:21:52PM -0500, Jeff Moyer wrote:
>>>>> I'm going to submit this version formally.  If you're interested in
>>>>> converting the ioctx_table to xarray, you can do that separately from a
>>>>> security fix.  I would include a performance analysis with that patch,
>>>>> though.  The idea of using a radix tree for the ioctx table was
>>>>> discarded due to performance reasons--see commit db446a08c23d5 ("aio:
>>>>> convert the ioctx list to table lookup v3").  I suspect using the xarray
>>>>> will perform similarly.
>>>>
>>>> There's a big difference between Octavian's patch and mine.  That patch
>>>> indexed into the radix tree by 'ctx_id' directly, which was pretty
>>>> much guaranteed to exhibit some close-to-worst-case behaviour from the
>>>> radix tree due to IDs being sparsely assigned.  My patch uses the ring
>>>> ID which _we_ assigned, and so is nicely behaved, being usually a very
>>>> small integer.
>>>
>>> OK, good to know.  I obviously didn't look too closely at the two.
>>>
>>>> What performance analysis would you find compelling?  Octavian's original
>>>> fio script:
>>>>
>>>>> rw=randrw; size=256k ;directory=/mnt/fio; ioengine=libaio; iodepth=1
>>>>> blocksize=1024; numjobs=512; thread; loops=100
>>>>>
>>>>> on an EXT2 filesystem mounted on top of a ramdisk
>>>>
>>>> or something else?
>>>
>>> I think the most common use case is a small number of ioctx-s, so I'd
>>> like to see that use case not regress (that should be easy, right?).
>>> Kent, what were the tests you were using when doing this work?  Jens,
>>> since you're doing performance work in this area now, are there any
>>> particular test cases you care about?
>>
>> I can give it a spin, ioctx lookup is in the fast path, and for "classic"
>> aio we do it twice for each IO...
> 
> Don't see any regressions. But if we're fiddling with it anyway, can't
> we do something smarter? Make the fast path just index a table, and put
> all the big hammers in setup/destroy. We're spending a non-substantial
> amount of time doing lookups, that's really no different before and
> after the patch.

Looks like it's the percpu ref get, in terms of "lookup" we already
look pretty good.
Jens Axboe Dec. 11, 2018, 6:37 p.m. UTC | #11
On 12/11/18 11:09 AM, Jeff Moyer wrote:
> Jens Axboe <axboe@kernel.dk> writes:
> 
>> On 12/11/18 11:02 AM, Jeff Moyer wrote:
>>> Matthew Wilcox <willy@infradead.org> writes:
>>>
>>>> On Tue, Dec 11, 2018 at 12:21:52PM -0500, Jeff Moyer wrote:
>>>>> I'm going to submit this version formally.  If you're interested in
>>>>> converting the ioctx_table to xarray, you can do that separately from a
>>>>> security fix.  I would include a performance analysis with that patch,
>>>>> though.  The idea of using a radix tree for the ioctx table was
>>>>> discarded due to performance reasons--see commit db446a08c23d5 ("aio:
>>>>> convert the ioctx list to table lookup v3").  I suspect using the xarray
>>>>> will perform similarly.
>>>>
>>>> There's a big difference between Octavian's patch and mine.  That patch
>>>> indexed into the radix tree by 'ctx_id' directly, which was pretty
>>>> much guaranteed to exhibit some close-to-worst-case behaviour from the
>>>> radix tree due to IDs being sparsely assigned.  My patch uses the ring
>>>> ID which _we_ assigned, and so is nicely behaved, being usually a very
>>>> small integer.
>>>
>>> OK, good to know.  I obviously didn't look too closely at the two.
>>>
>>>> What performance analysis would you find compelling?  Octavian's original
>>>> fio script:
>>>>
>>>>> rw=randrw; size=256k ;directory=/mnt/fio; ioengine=libaio; iodepth=1
>>>>> blocksize=1024; numjobs=512; thread; loops=100
>>>>>
>>>>> on an EXT2 filesystem mounted on top of a ramdisk
>>>>
>>>> or something else?
>>>
>>> I think the most common use case is a small number of ioctx-s, so I'd
>>> like to see that use case not regress (that should be easy, right?).
> 
> Bah, I meant a small number of threads doing submit/getevents.
> 
>>> Kent, what were the tests you were using when doing this work?  Jens,
>>> since you're doing performance work in this area now, are there any
>>> particular test cases you care about?
>>
>> I can give it a spin, ioctx lookup is in the fast path, and for "classic"
>> aio we do it twice for each IO...
> 
> Thanks!

You can add my reviewed-by/tested-by. Do you want me to carry this one?
I can rebase on top of the aio.c nospec lookup patch, we should do
those separately.
Jens Axboe Dec. 11, 2018, 6:41 p.m. UTC | #12
On Wed, Nov 28, 2018 at 11:35 AM Matthew Wilcox <willy@infradead.org> wrote:
> @@ -1026,24 +979,17 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
>         struct aio_ring __user *ring  = (void __user *)ctx_id;
>         struct mm_struct *mm = current->mm;
>         struct kioctx *ctx, *ret = NULL;
> -       struct kioctx_table *table;
>         unsigned id;
>
>         if (get_user(id, &ring->id))
>                 return NULL;
>
>         rcu_read_lock();
> -       table = rcu_dereference(mm->ioctx_table);
> -
> -       if (!table || id >= table->nr)
> -               goto out;
> -
> -       ctx = rcu_dereference(table->table[id]);
> +       ctx = xa_load(&mm->ioctx, id);
>         if (ctx && ctx->user_id == ctx_id) {
>                 if (percpu_ref_tryget_live(&ctx->users))
>                         ret = ctx;
>         }

Question on this part - do we need that RCU read lock around this now? I
don't think we do.
Matthew Wilcox Dec. 11, 2018, 6:45 p.m. UTC | #13
On Tue, Dec 11, 2018 at 11:41:55AM -0700, Jens Axboe wrote:
> On Wed, Nov 28, 2018 at 11:35 AM Matthew Wilcox <willy@infradead.org> wrote:
> >
> >         rcu_read_lock();
> > -       table = rcu_dereference(mm->ioctx_table);
> > -
> > -       if (!table || id >= table->nr)
> > -               goto out;
> > -
> > -       ctx = rcu_dereference(table->table[id]);
> > +       ctx = xa_load(&mm->ioctx, id);
> >         if (ctx && ctx->user_id == ctx_id) {
> >                 if (percpu_ref_tryget_live(&ctx->users))
> >                         ret = ctx;
> >         }
> 
> Question on this part - do we need that RCU read lock around this now? I
> don't think we do.

I think we need the rcu read lock here to prevent ctx from being freed
under us by free_ioctx().
Jens Axboe Dec. 11, 2018, 6:46 p.m. UTC | #14
On 12/11/18 11:45 AM, Matthew Wilcox wrote:
> On Tue, Dec 11, 2018 at 11:41:55AM -0700, Jens Axboe wrote:
>> On Wed, Nov 28, 2018 at 11:35 AM Matthew Wilcox <willy@infradead.org> wrote:
>>>
>>>         rcu_read_lock();
>>> -       table = rcu_dereference(mm->ioctx_table);
>>> -
>>> -       if (!table || id >= table->nr)
>>> -               goto out;
>>> -
>>> -       ctx = rcu_dereference(table->table[id]);
>>> +       ctx = xa_load(&mm->ioctx, id);
>>>         if (ctx && ctx->user_id == ctx_id) {
>>>                 if (percpu_ref_tryget_live(&ctx->users))
>>>                         ret = ctx;
>>>         }
>>
>> Question on this part - do we need that RCU read lock around this now? I
>> don't think we do.
> 
> I think we need the rcu read lock here to prevent ctx from being freed
> under us by free_ioctx().

Then that begs the question, how about __xa_load() that is already called
under RCU read lock?
Matthew Wilcox Dec. 11, 2018, 6:51 p.m. UTC | #15
On Tue, Dec 11, 2018 at 11:32:54AM -0700, Jens Axboe wrote:
> Don't see any regressions. But if we're fiddling with it anyway, can't
> we do something smarter? Make the fast path just index a table, and put
> all the big hammers in setup/destroy. We're spending a non-substantial
> amount of time doing lookups, that's really no different before and
> after the patch.

Thanks for checking it out.

I think the fast path does just index a table.  Until you have more than
64 pointers in the XArray, it's just xa->head->slots[i].  And then up
to 4096 pointers, it's xa->head->slots[i >> 6]->slots[i].  It has the
advantage that if you only have one kioctx (which is surely the case for
many programs using AIO), it's just xa->head, so even better than a table
lookup.

It'll start to deteriorate after 4096 kioctxs, with one extra indirection
for every 6 bits, but by that point, we'd've been straining the memory
allocator to allocate a large table anyway.
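
A toy model of that layout (64 slots per node, one extra level per 6 bits of
index; illustration only, the real walk is xas_load()/xas_descend() in
lib/xarray.c, and a single entry at index 0 is stored directly in the head
pointer with no node at all):

#define SLOTS 64

struct toy_node {
	void *slots[SLOTS];	/* inner levels hold child nodes, leaves hold entries */
};

/*
 * depth 1: head->slots[i]                       (up to 64 entries)
 * depth 2: head->slots[i >> 6]->slots[i & 63]   (up to 4096 entries)
 */
void *toy_lookup(struct toy_node *head, unsigned long index, int depth)
{
	struct toy_node *node = head;

	while (depth-- > 1)
		node = node->slots[(index >> (6 * depth)) & (SLOTS - 1)];
	return node->slots[index & (SLOTS - 1)];
}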
Jens Axboe Dec. 11, 2018, 6:52 p.m. UTC | #16
On 12/11/18 11:51 AM, Matthew Wilcox wrote:
> On Tue, Dec 11, 2018 at 11:32:54AM -0700, Jens Axboe wrote:
>> Don't see any regressions. But if we're fiddling with it anyway, can't
>> we do something smarter? Make the fast path just index a table, and put
>> all the big hammers in setup/destroy. We're spending a non-substantial
>> amount of time doing lookups, that's really no different before and
>> after the patch.
> 
> Thanks for checking it out.
> 
> I think the fast path does just index a table.  Until you have more than
> 64 pointers in the XArray, it's just xa->head->slots[i].  And then up
> to 4096 pointers, it's xa->head->slots[i >> 6]->slots[i].  It has the
> advantage that if you only have one kioctx (which is surely the case for
> many programs using AIO), it's just xa->head, so even better than a table
> lookup.
> 
> It'll start to deteriorate after 4096 kioctxs, with one extra indirection
> for every 6 bits, but by that point, we'd've been straining the memory
> allocator to allocate a large table anyway.

I agree, and nobody cares about 4k kioctxs, you're way into the weeds
at that point anyway.

So as the followup said, I think we're fine as-is for this particular
case.
Matthew Wilcox Dec. 11, 2018, 6:53 p.m. UTC | #17
On Tue, Dec 11, 2018 at 11:46:53AM -0700, Jens Axboe wrote:
> On 12/11/18 11:45 AM, Matthew Wilcox wrote:
> > I think we need the rcu read lock here to prevent ctx from being freed
> > under us by free_ioctx().
> 
> Then that begs the question, how about __xa_load() that is already called
> under RCU read lock?

I've been considering adding it to the API, yes.  I was under the
impression that nested rcu_read_lock() calls were not expensive, even
with CONFIG_PREEMPT.
Jens Axboe Dec. 11, 2018, 6:53 p.m. UTC | #18
On 12/11/18 11:46 AM, Jens Axboe wrote:
> On 12/11/18 11:45 AM, Matthew Wilcox wrote:
>> On Tue, Dec 11, 2018 at 11:41:55AM -0700, Jens Axboe wrote:
>>> On Wed, Nov 28, 2018 at 11:35 AM Matthew Wilcox <willy@infradead.org> wrote:
>>>>
>>>>         rcu_read_lock();
>>>> -       table = rcu_dereference(mm->ioctx_table);
>>>> -
>>>> -       if (!table || id >= table->nr)
>>>> -               goto out;
>>>> -
>>>> -       ctx = rcu_dereference(table->table[id]);
>>>> +       ctx = xa_load(&mm->ioctx, id);
>>>>         if (ctx && ctx->user_id == ctx_id) {
>>>>                 if (percpu_ref_tryget_live(&ctx->users))
>>>>                         ret = ctx;
>>>>         }
>>>
>>> Question on this part - do we need that RCU read lock around this now? I
>>> don't think we do.
>>
>> I think we need the rcu read lock here to prevent ctx from being freed
>> under us by free_ioctx().
> 
> Then that begs the question, how about __xa_load() that is already called
> under RCU read lock?

Something like this; memremap has an existing user that can use this
too already.
Jens Axboe Dec. 11, 2018, 6:54 p.m. UTC | #19
On 12/11/18 11:53 AM, Matthew Wilcox wrote:
> On Tue, Dec 11, 2018 at 11:46:53AM -0700, Jens Axboe wrote:
>> On 12/11/18 11:45 AM, Matthew Wilcox wrote:
>>> I think we need the rcu read lock here to prevent ctx from being freed
>>> under us by free_ioctx().
>>
>> Then that begs the question, how about __xa_load() that is already called
>> under RCU read lock?
> 
> I've been considering adding it to the API, yes.  I was under the
> impression that nested rcu_read_lock() calls were not expensive, even
> with CONFIG_PREEMPT.

They are not expensive, but they are not free either. And if we know we
are already under a rcu read lock, it seems pretty pointless. For the
two cases (memremap and aio), the rcu read lock is right there, before
the call. Easy to verify that it's safe.
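
For reference, a hypothetical sketch of the helper being discussed;
__xa_load() is not part of this patch, and the name and shape here are only
a guess at what the thread has in mind:

/*
 * Caller must already hold rcu_read_lock(); skips the nested lock that
 * xa_load() takes internally.
 */
static void *__xa_load(struct xarray *xa, unsigned long index)
{
	XA_STATE(xas, xa, index);
	void *entry;

	RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
			 "__xa_load() called outside an RCU read-side section");
	do {
		entry = xas_load(&xas);
	} while (xas_retry(&xas, entry));

	return entry;
}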

Patch

diff --git a/fs/aio.c b/fs/aio.c
index 301e6314183b..51ba7446a24f 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -71,12 +71,6 @@  struct aio_ring {
 
 #define AIO_RING_PAGES	8
 
-struct kioctx_table {
-	struct rcu_head		rcu;
-	unsigned		nr;
-	struct kioctx __rcu	*table[];
-};
-
 struct kioctx_cpu {
 	unsigned		reqs_available;
 };
@@ -313,27 +307,22 @@  static int aio_ring_mremap(struct vm_area_struct *vma)
 {
 	struct file *file = vma->vm_file;
 	struct mm_struct *mm = vma->vm_mm;
-	struct kioctx_table *table;
-	int i, res = -EINVAL;
+	struct kioctx *ctx;
+	unsigned long index;
+	int res = -EINVAL;
 
-	spin_lock(&mm->ioctx_lock);
-	rcu_read_lock();
-	table = rcu_dereference(mm->ioctx_table);
-	for (i = 0; i < table->nr; i++) {
-		struct kioctx *ctx;
-
-		ctx = rcu_dereference(table->table[i]);
-		if (ctx && ctx->aio_ring_file == file) {
-			if (!atomic_read(&ctx->dead)) {
-				ctx->user_id = ctx->mmap_base = vma->vm_start;
-				res = 0;
-			}
-			break;
+	xa_lock(&mm->ioctx);
+	xa_for_each(&mm->ioctx, ctx, index, ULONG_MAX, XA_PRESENT) {
+		if (ctx->aio_ring_file != file)
+			continue;
+		if (!atomic_read(&ctx->dead)) {
+			ctx->user_id = ctx->mmap_base = vma->vm_start;
+			res = 0;
 		}
+		break;
 	}
+	xa_unlock(&mm->ioctx);
 
-	rcu_read_unlock();
-	spin_unlock(&mm->ioctx_lock);
 	return res;
 }
 
@@ -617,57 +606,21 @@  static void free_ioctx_users(struct percpu_ref *ref)
 
 static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
 {
-	unsigned i, new_nr;
-	struct kioctx_table *table, *old;
 	struct aio_ring *ring;
+	int err;
 
-	spin_lock(&mm->ioctx_lock);
-	table = rcu_dereference_raw(mm->ioctx_table);
-
-	while (1) {
-		if (table)
-			for (i = 0; i < table->nr; i++)
-				if (!rcu_access_pointer(table->table[i])) {
-					ctx->id = i;
-					rcu_assign_pointer(table->table[i], ctx);
-					spin_unlock(&mm->ioctx_lock);
-
-					/* While kioctx setup is in progress,
-					 * we are protected from page migration
-					 * changes ring_pages by ->ring_lock.
-					 */
-					ring = kmap_atomic(ctx->ring_pages[0]);
-					ring->id = ctx->id;
-					kunmap_atomic(ring);
-					return 0;
-				}
-
-		new_nr = (table ? table->nr : 1) * 4;
-		spin_unlock(&mm->ioctx_lock);
-
-		table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) *
-				new_nr, GFP_KERNEL);
-		if (!table)
-			return -ENOMEM;
-
-		table->nr = new_nr;
-
-		spin_lock(&mm->ioctx_lock);
-		old = rcu_dereference_raw(mm->ioctx_table);
-
-		if (!old) {
-			rcu_assign_pointer(mm->ioctx_table, table);
-		} else if (table->nr > old->nr) {
-			memcpy(table->table, old->table,
-			       old->nr * sizeof(struct kioctx *));
+	err = xa_alloc(&mm->ioctx, &ctx->id, UINT_MAX, ctx, GFP_KERNEL);
+	if (err)
+		return err;
 
-			rcu_assign_pointer(mm->ioctx_table, table);
-			kfree_rcu(old, rcu);
-		} else {
-			kfree(table);
-			table = old;
-		}
-	}
+	/*
+	 * While kioctx setup is in progress, we are protected from
+	 * page migration changing ring_pages by ->ring_lock.
+	 */
+	ring = kmap_atomic(ctx->ring_pages[0]);
+	ring->id = ctx->id;
+	kunmap_atomic(ring);
+	return 0;
 }
 
 static void aio_nr_sub(unsigned nr)
@@ -793,27 +746,8 @@  static struct kioctx *ioctx_alloc(unsigned nr_events)
 	return ERR_PTR(err);
 }
 
-/* kill_ioctx
- *	Cancels all outstanding aio requests on an aio context.  Used
- *	when the processes owning a context have all exited to encourage
- *	the rapid destruction of the kioctx.
- */
-static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
-		      struct ctx_rq_wait *wait)
+static void __kill_ioctx(struct kioctx *ctx, struct ctx_rq_wait *wait)
 {
-	struct kioctx_table *table;
-
-	spin_lock(&mm->ioctx_lock);
-	if (atomic_xchg(&ctx->dead, 1)) {
-		spin_unlock(&mm->ioctx_lock);
-		return -EINVAL;
-	}
-
-	table = rcu_dereference_raw(mm->ioctx_table);
-	WARN_ON(ctx != rcu_access_pointer(table->table[ctx->id]));
-	RCU_INIT_POINTER(table->table[ctx->id], NULL);
-	spin_unlock(&mm->ioctx_lock);
-
 	/* free_ioctx_reqs() will do the necessary RCU synchronization */
 	wake_up_all(&ctx->wait);
 
@@ -831,6 +765,30 @@  static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
 
 	ctx->rq_wait = wait;
 	percpu_ref_kill(&ctx->users);
+}
+
+/* kill_ioctx
+ *	Cancels all outstanding aio requests on an aio context.  Used
+ *	when the processes owning a context have all exited to encourage
+ *	the rapid destruction of the kioctx.
+ */
+static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
+		      struct ctx_rq_wait *wait)
+{
+	struct kioctx *old;
+
+	/* I don't understand what this lock is protecting against */
+	xa_lock(&mm->ioctx);
+	if (atomic_xchg(&ctx->dead, 1)) {
+		xa_unlock(&mm->ioctx);
+		return -EINVAL;
+	}
+
+	old = __xa_erase(&mm->ioctx, ctx->id);
+	WARN_ON(old != ctx);
+	xa_unlock(&mm->ioctx);
+
+	__kill_ioctx(ctx, wait);
 	return 0;
 }
 
@@ -844,26 +802,21 @@  static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
  */
 void exit_aio(struct mm_struct *mm)
 {
-	struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table);
+	struct kioctx *ctx;
 	struct ctx_rq_wait wait;
-	int i, skipped;
+	unsigned long index;
 
-	if (!table)
+	if (xa_empty(&mm->ioctx))
 		return;
 
-	atomic_set(&wait.count, table->nr);
+	/*
+	 * Prevent count from getting to 0 until we're ready for it
+	 */
+	atomic_set(&wait.count, 1);
 	init_completion(&wait.comp);
 
-	skipped = 0;
-	for (i = 0; i < table->nr; ++i) {
-		struct kioctx *ctx =
-			rcu_dereference_protected(table->table[i], true);
-
-		if (!ctx) {
-			skipped++;
-			continue;
-		}
-
+	xa_lock(&mm->ioctx);
+	xa_for_each(&mm->ioctx, ctx, index, ULONG_MAX, XA_PRESENT) {
 		/*
 		 * We don't need to bother with munmap() here - exit_mmap(mm)
 		 * is coming and it'll unmap everything. And we simply can't,
@@ -872,16 +825,16 @@  void exit_aio(struct mm_struct *mm)
 		 * that it needs to unmap the area, just set it to 0.
 		 */
 		ctx->mmap_size = 0;
-		kill_ioctx(mm, ctx, &wait);
+		atomic_inc(&wait.count);
+		__xa_erase(&mm->ioctx, ctx->id);
+		__kill_ioctx(ctx, &wait);
 	}
+	xa_unlock(&mm->ioctx);
 
-	if (!atomic_sub_and_test(skipped, &wait.count)) {
+	if (!atomic_sub_and_test(1, &wait.count)) {
 		/* Wait until all IO for the context are done. */
 		wait_for_completion(&wait.comp);
 	}
-
-	RCU_INIT_POINTER(mm->ioctx_table, NULL);
-	kfree(table);
 }
 
 static void put_reqs_available(struct kioctx *ctx, unsigned nr)
@@ -1026,24 +979,17 @@  static struct kioctx *lookup_ioctx(unsigned long ctx_id)
 	struct aio_ring __user *ring  = (void __user *)ctx_id;
 	struct mm_struct *mm = current->mm;
 	struct kioctx *ctx, *ret = NULL;
-	struct kioctx_table *table;
 	unsigned id;
 
 	if (get_user(id, &ring->id))
 		return NULL;
 
 	rcu_read_lock();
-	table = rcu_dereference(mm->ioctx_table);
-
-	if (!table || id >= table->nr)
-		goto out;
-
-	ctx = rcu_dereference(table->table[id]);
+	ctx = xa_load(&mm->ioctx, id);
 	if (ctx && ctx->user_id == ctx_id) {
 		if (percpu_ref_tryget_live(&ctx->users))
 			ret = ctx;
 	}
-out:
 	rcu_read_unlock();
 	return ret;
 }
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 5ed8f6292a53..1a95bb27f5a7 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -14,6 +14,7 @@ 
 #include <linux/uprobes.h>
 #include <linux/page-flags-layout.h>
 #include <linux/workqueue.h>
+#include <linux/xarray.h>
 
 #include <asm/mmu.h>
 
@@ -336,7 +337,6 @@  struct core_state {
 	struct completion startup;
 };
 
-struct kioctx_table;
 struct mm_struct {
 	struct {
 		struct vm_area_struct *mmap;		/* list of VMAs */
@@ -431,8 +431,7 @@  struct mm_struct {
 		atomic_t membarrier_state;
 #endif
 #ifdef CONFIG_AIO
-		spinlock_t			ioctx_lock;
-		struct kioctx_table __rcu	*ioctx_table;
+		struct xarray ioctx;
 #endif
 #ifdef CONFIG_MEMCG
 		/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 07cddff89c7b..acb775f9592d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -946,8 +946,7 @@  __setup("coredump_filter=", coredump_filter_setup);
 static void mm_init_aio(struct mm_struct *mm)
 {
 #ifdef CONFIG_AIO
-	spin_lock_init(&mm->ioctx_lock);
-	mm->ioctx_table = NULL;
+	xa_init_flags(&mm->ioctx, XA_FLAGS_ALLOC);
 #endif
 }
 
diff --git a/mm/debug.c b/mm/debug.c
index cdacba12e09a..d45ec63aed8c 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -127,9 +127,6 @@  void dump_mm(const struct mm_struct *mm)
 		"start_brk %lx brk %lx start_stack %lx\n"
 		"arg_start %lx arg_end %lx env_start %lx env_end %lx\n"
 		"binfmt %px flags %lx core_state %px\n"
-#ifdef CONFIG_AIO
-		"ioctx_table %px\n"
-#endif
 #ifdef CONFIG_MEMCG
 		"owner %px "
 #endif
@@ -158,9 +155,6 @@  void dump_mm(const struct mm_struct *mm)
 		mm->start_brk, mm->brk, mm->start_stack,
 		mm->arg_start, mm->arg_end, mm->env_start, mm->env_end,
 		mm->binfmt, mm->flags, mm->core_state,
-#ifdef CONFIG_AIO
-		mm->ioctx_table,
-#endif
 #ifdef CONFIG_MEMCG
 		mm->owner,
 #endif