Message ID | 20181128183531.5139-1-willy@infradead.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | aio: Convert ioctx_table to XArray | expand |
ping On Wed, Nov 28, 2018 at 10:35:31AM -0800, Matthew Wilcox wrote: > This custom resizing array was vulnerable to a Spectre attack (speculating > off the end of an array to a user-controlled offset). The XArray is > not vulnerable to Spectre as it always masks its lookups to be within > the bounds of the array. > > Signed-off-by: Matthew Wilcox <willy@infradead.org> > --- > fs/aio.c | 182 ++++++++++++++------------------------- > include/linux/mm_types.h | 5 +- > kernel/fork.c | 3 +- > mm/debug.c | 6 -- > 4 files changed, 67 insertions(+), 129 deletions(-) > > diff --git a/fs/aio.c b/fs/aio.c > index 301e6314183b..51ba7446a24f 100644 > --- a/fs/aio.c > +++ b/fs/aio.c > @@ -71,12 +71,6 @@ struct aio_ring { > > #define AIO_RING_PAGES 8 > > -struct kioctx_table { > - struct rcu_head rcu; > - unsigned nr; > - struct kioctx __rcu *table[]; > -}; > - > struct kioctx_cpu { > unsigned reqs_available; > }; > @@ -313,27 +307,22 @@ static int aio_ring_mremap(struct vm_area_struct *vma) > { > struct file *file = vma->vm_file; > struct mm_struct *mm = vma->vm_mm; > - struct kioctx_table *table; > - int i, res = -EINVAL; > + struct kioctx *ctx; > + unsigned long index; > + int res = -EINVAL; > > - spin_lock(&mm->ioctx_lock); > - rcu_read_lock(); > - table = rcu_dereference(mm->ioctx_table); > - for (i = 0; i < table->nr; i++) { > - struct kioctx *ctx; > - > - ctx = rcu_dereference(table->table[i]); > - if (ctx && ctx->aio_ring_file == file) { > - if (!atomic_read(&ctx->dead)) { > - ctx->user_id = ctx->mmap_base = vma->vm_start; > - res = 0; > - } > - break; > + xa_lock(&mm->ioctx); > + xa_for_each(&mm->ioctx, ctx, index, ULONG_MAX, XA_PRESENT) { > + if (ctx->aio_ring_file != file) > + continue; > + if (!atomic_read(&ctx->dead)) { > + ctx->user_id = ctx->mmap_base = vma->vm_start; > + res = 0; > } > + break; > } > + xa_unlock(&mm->ioctx); > > - rcu_read_unlock(); > - spin_unlock(&mm->ioctx_lock); > return res; > } > > @@ -617,57 +606,21 @@ static void free_ioctx_users(struct percpu_ref *ref) > > static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) > { > - unsigned i, new_nr; > - struct kioctx_table *table, *old; > struct aio_ring *ring; > + int err; > > - spin_lock(&mm->ioctx_lock); > - table = rcu_dereference_raw(mm->ioctx_table); > - > - while (1) { > - if (table) > - for (i = 0; i < table->nr; i++) > - if (!rcu_access_pointer(table->table[i])) { > - ctx->id = i; > - rcu_assign_pointer(table->table[i], ctx); > - spin_unlock(&mm->ioctx_lock); > - > - /* While kioctx setup is in progress, > - * we are protected from page migration > - * changes ring_pages by ->ring_lock. > - */ > - ring = kmap_atomic(ctx->ring_pages[0]); > - ring->id = ctx->id; > - kunmap_atomic(ring); > - return 0; > - } > - > - new_nr = (table ? table->nr : 1) * 4; > - spin_unlock(&mm->ioctx_lock); > - > - table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) * > - new_nr, GFP_KERNEL); > - if (!table) > - return -ENOMEM; > - > - table->nr = new_nr; > - > - spin_lock(&mm->ioctx_lock); > - old = rcu_dereference_raw(mm->ioctx_table); > - > - if (!old) { > - rcu_assign_pointer(mm->ioctx_table, table); > - } else if (table->nr > old->nr) { > - memcpy(table->table, old->table, > - old->nr * sizeof(struct kioctx *)); > + err = xa_alloc(&mm->ioctx, &ctx->id, UINT_MAX, ctx, GFP_KERNEL); > + if (err) > + return err; > > - rcu_assign_pointer(mm->ioctx_table, table); > - kfree_rcu(old, rcu); > - } else { > - kfree(table); > - table = old; > - } > - } > + /* > + * While kioctx setup is in progress, we are protected from > + * page migration changing ring_pages by ->ring_lock. > + */ > + ring = kmap_atomic(ctx->ring_pages[0]); > + ring->id = ctx->id; > + kunmap_atomic(ring); > + return 0; > } > > static void aio_nr_sub(unsigned nr) > @@ -793,27 +746,8 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) > return ERR_PTR(err); > } > > -/* kill_ioctx > - * Cancels all outstanding aio requests on an aio context. Used > - * when the processes owning a context have all exited to encourage > - * the rapid destruction of the kioctx. > - */ > -static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx, > - struct ctx_rq_wait *wait) > +static void __kill_ioctx(struct kioctx *ctx, struct ctx_rq_wait *wait) > { > - struct kioctx_table *table; > - > - spin_lock(&mm->ioctx_lock); > - if (atomic_xchg(&ctx->dead, 1)) { > - spin_unlock(&mm->ioctx_lock); > - return -EINVAL; > - } > - > - table = rcu_dereference_raw(mm->ioctx_table); > - WARN_ON(ctx != rcu_access_pointer(table->table[ctx->id])); > - RCU_INIT_POINTER(table->table[ctx->id], NULL); > - spin_unlock(&mm->ioctx_lock); > - > /* free_ioctx_reqs() will do the necessary RCU synchronization */ > wake_up_all(&ctx->wait); > > @@ -831,6 +765,30 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx, > > ctx->rq_wait = wait; > percpu_ref_kill(&ctx->users); > +} > + > +/* kill_ioctx > + * Cancels all outstanding aio requests on an aio context. Used > + * when the processes owning a context have all exited to encourage > + * the rapid destruction of the kioctx. > + */ > +static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx, > + struct ctx_rq_wait *wait) > +{ > + struct kioctx *old; > + > + /* I don't understand what this lock is protecting against */ > + xa_lock(&mm->ioctx); > + if (atomic_xchg(&ctx->dead, 1)) { > + xa_unlock(&mm->ioctx); > + return -EINVAL; > + } > + > + old = __xa_erase(&mm->ioctx, ctx->id); > + WARN_ON(old != ctx); > + xa_unlock(&mm->ioctx); > + > + __kill_ioctx(ctx, wait); > return 0; > } > > @@ -844,26 +802,21 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx, > */ > void exit_aio(struct mm_struct *mm) > { > - struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table); > + struct kioctx *ctx; > struct ctx_rq_wait wait; > - int i, skipped; > + unsigned long index; > > - if (!table) > + if (xa_empty(&mm->ioctx)) > return; > > - atomic_set(&wait.count, table->nr); > + /* > + * Prevent count from getting to 0 until we're ready for it > + */ > + atomic_set(&wait.count, 1); > init_completion(&wait.comp); > > - skipped = 0; > - for (i = 0; i < table->nr; ++i) { > - struct kioctx *ctx = > - rcu_dereference_protected(table->table[i], true); > - > - if (!ctx) { > - skipped++; > - continue; > - } > - > + xa_lock(&mm->ioctx); > + xa_for_each(&mm->ioctx, ctx, index, ULONG_MAX, XA_PRESENT) { > /* > * We don't need to bother with munmap() here - exit_mmap(mm) > * is coming and it'll unmap everything. And we simply can't, > @@ -872,16 +825,16 @@ void exit_aio(struct mm_struct *mm) > * that it needs to unmap the area, just set it to 0. > */ > ctx->mmap_size = 0; > - kill_ioctx(mm, ctx, &wait); > + atomic_inc(&wait.count); > + __xa_erase(&mm->ioctx, ctx->id); > + __kill_ioctx(ctx, &wait); > } > + xa_unlock(&mm->ioctx); > > - if (!atomic_sub_and_test(skipped, &wait.count)) { > + if (!atomic_sub_and_test(1, &wait.count)) { > /* Wait until all IO for the context are done. */ > wait_for_completion(&wait.comp); > } > - > - RCU_INIT_POINTER(mm->ioctx_table, NULL); > - kfree(table); > } > > static void put_reqs_available(struct kioctx *ctx, unsigned nr) > @@ -1026,24 +979,17 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) > struct aio_ring __user *ring = (void __user *)ctx_id; > struct mm_struct *mm = current->mm; > struct kioctx *ctx, *ret = NULL; > - struct kioctx_table *table; > unsigned id; > > if (get_user(id, &ring->id)) > return NULL; > > rcu_read_lock(); > - table = rcu_dereference(mm->ioctx_table); > - > - if (!table || id >= table->nr) > - goto out; > - > - ctx = rcu_dereference(table->table[id]); > + ctx = xa_load(&mm->ioctx, id); > if (ctx && ctx->user_id == ctx_id) { > if (percpu_ref_tryget_live(&ctx->users)) > ret = ctx; > } > -out: > rcu_read_unlock(); > return ret; > } > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h > index 5ed8f6292a53..1a95bb27f5a7 100644 > --- a/include/linux/mm_types.h > +++ b/include/linux/mm_types.h > @@ -14,6 +14,7 @@ > #include <linux/uprobes.h> > #include <linux/page-flags-layout.h> > #include <linux/workqueue.h> > +#include <linux/xarray.h> > > #include <asm/mmu.h> > > @@ -336,7 +337,6 @@ struct core_state { > struct completion startup; > }; > > -struct kioctx_table; > struct mm_struct { > struct { > struct vm_area_struct *mmap; /* list of VMAs */ > @@ -431,8 +431,7 @@ struct mm_struct { > atomic_t membarrier_state; > #endif > #ifdef CONFIG_AIO > - spinlock_t ioctx_lock; > - struct kioctx_table __rcu *ioctx_table; > + struct xarray ioctx; > #endif > #ifdef CONFIG_MEMCG > /* > diff --git a/kernel/fork.c b/kernel/fork.c > index 07cddff89c7b..acb775f9592d 100644 > --- a/kernel/fork.c > +++ b/kernel/fork.c > @@ -946,8 +946,7 @@ __setup("coredump_filter=", coredump_filter_setup); > static void mm_init_aio(struct mm_struct *mm) > { > #ifdef CONFIG_AIO > - spin_lock_init(&mm->ioctx_lock); > - mm->ioctx_table = NULL; > + xa_init_flags(&mm->ioctx, XA_FLAGS_ALLOC); > #endif > } > > diff --git a/mm/debug.c b/mm/debug.c > index cdacba12e09a..d45ec63aed8c 100644 > --- a/mm/debug.c > +++ b/mm/debug.c > @@ -127,9 +127,6 @@ void dump_mm(const struct mm_struct *mm) > "start_brk %lx brk %lx start_stack %lx\n" > "arg_start %lx arg_end %lx env_start %lx env_end %lx\n" > "binfmt %px flags %lx core_state %px\n" > -#ifdef CONFIG_AIO > - "ioctx_table %px\n" > -#endif > #ifdef CONFIG_MEMCG > "owner %px " > #endif > @@ -158,9 +155,6 @@ void dump_mm(const struct mm_struct *mm) > mm->start_brk, mm->brk, mm->start_stack, > mm->arg_start, mm->arg_end, mm->env_start, mm->env_end, > mm->binfmt, mm->flags, mm->core_state, > -#ifdef CONFIG_AIO > - mm->ioctx_table, > -#endif > #ifdef CONFIG_MEMCG > mm->owner, > #endif > -- > 2.19.1 >
Matthew Wilcox <willy@infradead.org> writes: > This custom resizing array was vulnerable to a Spectre attack (speculating > off the end of an array to a user-controlled offset). The XArray is > not vulnerable to Spectre as it always masks its lookups to be within > the bounds of the array. I'm not a big fan of completely re-writing the code to fix this. Isn't the below patch sufficient? -Jeff diff --git a/fs/aio.c b/fs/aio.c index 97f983592925..9402ae0b63d5 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1038,6 +1038,7 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) if (!table || id >= table->nr) goto out; + id = array_index_nospec(index, table->nr); ctx = rcu_dereference(table->table[id]); if (ctx && ctx->user_id == ctx_id) { if (percpu_ref_tryget_live(&ctx->users))
Jeff Moyer <jmoyer@redhat.com> writes: > Matthew Wilcox <willy@infradead.org> writes: > >> This custom resizing array was vulnerable to a Spectre attack (speculating >> off the end of an array to a user-controlled offset). The XArray is >> not vulnerable to Spectre as it always masks its lookups to be within >> the bounds of the array. > > I'm not a big fan of completely re-writing the code to fix this. Isn't > the below patch sufficient? Too quick on the draw. Here's a patch that compiles. ;-) Cheers, Jeff diff --git a/fs/aio.c b/fs/aio.c index 97f983592925..aac9659381d2 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -45,6 +45,7 @@ #include <asm/kmap_types.h> #include <linux/uaccess.h> +#include <linux/nospec.h> #include "internal.h" @@ -1038,6 +1039,7 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) if (!table || id >= table->nr) goto out; + id = array_index_nospec(id, table->nr); ctx = rcu_dereference(table->table[id]); if (ctx && ctx->user_id == ctx_id) { if (percpu_ref_tryget_live(&ctx->users))
Jeff Moyer <jmoyer@redhat.com> writes: > Jeff Moyer <jmoyer@redhat.com> writes: > >> Matthew Wilcox <willy@infradead.org> writes: >> >>> This custom resizing array was vulnerable to a Spectre attack (speculating >>> off the end of an array to a user-controlled offset). The XArray is >>> not vulnerable to Spectre as it always masks its lookups to be within >>> the bounds of the array. >> >> I'm not a big fan of completely re-writing the code to fix this. Isn't >> the below patch sufficient? > > Too quick on the draw. Here's a patch that compiles. ;-) Hi, Matthew, I'm going to submit this version formally. If you're interested in converting the ioctx_table to xarray, you can do that separately from a security fix. I would include a performance analysis with that patch, though. The idea of using a radix tree for the ioctx table was discarded due to performance reasons--see commit db446a08c23d5 ("aio: convert the ioctx list to table lookup v3"). I suspect using the xarray will perform similarly. Cheers, Jeff > diff --git a/fs/aio.c b/fs/aio.c > index 97f983592925..aac9659381d2 100644 > --- a/fs/aio.c > +++ b/fs/aio.c > @@ -45,6 +45,7 @@ > > #include <asm/kmap_types.h> > #include <linux/uaccess.h> > +#include <linux/nospec.h> > > #include "internal.h" > > @@ -1038,6 +1039,7 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) > if (!table || id >= table->nr) > goto out; > > + id = array_index_nospec(id, table->nr); > ctx = rcu_dereference(table->table[id]); > if (ctx && ctx->user_id == ctx_id) { > if (percpu_ref_tryget_live(&ctx->users)) > > -- > To unsubscribe, send a message with 'unsubscribe linux-aio' in > the body to majordomo@kvack.org. For more info on Linux AIO, > see: http://www.kvack.org/aio/ > Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>
On Tue, Dec 11, 2018 at 12:21:52PM -0500, Jeff Moyer wrote: > I'm going to submit this version formally. If you're interested in > converting the ioctx_table to xarray, you can do that separately from a > security fix. I would include a performance analysis with that patch, > though. The idea of using a radix tree for the ioctx table was > discarded due to performance reasons--see commit db446a08c23d5 ("aio: > convert the ioctx list to table lookup v3"). I suspect using the xarray > will perform similarly. There's a big difference between Octavian's patch and mine. That patch indexed into the radix tree by 'ctx_id' directly, which was pretty much guaranteed to exhibit some close-to-worst-case behaviour from the radix tree due to IDs being sparsely assigned. My patch uses the ring ID which _we_ assigned, and so is nicely behaved, being usually a very small integer. What performance analysis would you find compelling? Octavian's original fio script: > rw=randrw; size=256k ;directory=/mnt/fio; ioengine=libaio; iodepth=1 > blocksize=1024; numjobs=512; thread; loops=100 > > on an EXT2 filesystem mounted on top of a ramdisk or something else?
Matthew Wilcox <willy@infradead.org> writes: > On Tue, Dec 11, 2018 at 12:21:52PM -0500, Jeff Moyer wrote: >> I'm going to submit this version formally. If you're interested in >> converting the ioctx_table to xarray, you can do that separately from a >> security fix. I would include a performance analysis with that patch, >> though. The idea of using a radix tree for the ioctx table was >> discarded due to performance reasons--see commit db446a08c23d5 ("aio: >> convert the ioctx list to table lookup v3"). I suspect using the xarray >> will perform similarly. > > There's a big difference between Octavian's patch and mine. That patch > indexed into the radix tree by 'ctx_id' directly, which was pretty > much guaranteed to exhibit some close-to-worst-case behaviour from the > radix tree due to IDs being sparsely assigned. My patch uses the ring > ID which _we_ assigned, and so is nicely behaved, being usually a very > small integer. OK, good to know. I obviously didn't look too closely at the two. > What performance analysis would you find compelling? Octavian's original > fio script: > >> rw=randrw; size=256k ;directory=/mnt/fio; ioengine=libaio; iodepth=1 >> blocksize=1024; numjobs=512; thread; loops=100 >> >> on an EXT2 filesystem mounted on top of a ramdisk > > or something else? I think the most common use case is a small number of ioctx-s, so I'd like to see that use case not regress (that should be easy, right?). Kent, what were the tests you were using when doing this work? Jens, since you're doing performance work in this area now, are there any particular test cases you care about? Cheers, Jeff
On 12/11/18 11:02 AM, Jeff Moyer wrote: > Matthew Wilcox <willy@infradead.org> writes: > >> On Tue, Dec 11, 2018 at 12:21:52PM -0500, Jeff Moyer wrote: >>> I'm going to submit this version formally. If you're interested in >>> converting the ioctx_table to xarray, you can do that separately from a >>> security fix. I would include a performance analysis with that patch, >>> though. The idea of using a radix tree for the ioctx table was >>> discarded due to performance reasons--see commit db446a08c23d5 ("aio: >>> convert the ioctx list to table lookup v3"). I suspect using the xarray >>> will perform similarly. >> >> There's a big difference between Octavian's patch and mine. That patch >> indexed into the radix tree by 'ctx_id' directly, which was pretty >> much guaranteed to exhibit some close-to-worst-case behaviour from the >> radix tree due to IDs being sparsely assigned. My patch uses the ring >> ID which _we_ assigned, and so is nicely behaved, being usually a very >> small integer. > > OK, good to know. I obviously didn't look too closely at the two. > >> What performance analysis would you find compelling? Octavian's original >> fio script: >> >>> rw=randrw; size=256k ;directory=/mnt/fio; ioengine=libaio; iodepth=1 >>> blocksize=1024; numjobs=512; thread; loops=100 >>> >>> on an EXT2 filesystem mounted on top of a ramdisk >> >> or something else? > > I think the most common use case is a small number of ioctx-s, so I'd > like to see that use case not regress (that should be easy, right?). > Kent, what were the tests you were using when doing this work? Jens, > since you're doing performance work in this area now, are there any > particular test cases you care about? I can give it a spin, ioctx lookup is in the fast path, and for "classic" aio we do it twice for each IO...
Jens Axboe <axboe@kernel.dk> writes: > On 12/11/18 11:02 AM, Jeff Moyer wrote: >> Matthew Wilcox <willy@infradead.org> writes: >> >>> On Tue, Dec 11, 2018 at 12:21:52PM -0500, Jeff Moyer wrote: >>>> I'm going to submit this version formally. If you're interested in >>>> converting the ioctx_table to xarray, you can do that separately from a >>>> security fix. I would include a performance analysis with that patch, >>>> though. The idea of using a radix tree for the ioctx table was >>>> discarded due to performance reasons--see commit db446a08c23d5 ("aio: >>>> convert the ioctx list to table lookup v3"). I suspect using the xarray >>>> will perform similarly. >>> >>> There's a big difference between Octavian's patch and mine. That patch >>> indexed into the radix tree by 'ctx_id' directly, which was pretty >>> much guaranteed to exhibit some close-to-worst-case behaviour from the >>> radix tree due to IDs being sparsely assigned. My patch uses the ring >>> ID which _we_ assigned, and so is nicely behaved, being usually a very >>> small integer. >> >> OK, good to know. I obviously didn't look too closely at the two. >> >>> What performance analysis would you find compelling? Octavian's original >>> fio script: >>> >>>> rw=randrw; size=256k ;directory=/mnt/fio; ioengine=libaio; iodepth=1 >>>> blocksize=1024; numjobs=512; thread; loops=100 >>>> >>>> on an EXT2 filesystem mounted on top of a ramdisk >>> >>> or something else? >> >> I think the most common use case is a small number of ioctx-s, so I'd >> like to see that use case not regress (that should be easy, right?). Bah, I meant a small number of threads doing submit/getevents. >> Kent, what were the tests you were using when doing this work? Jens, >> since you're doing performance work in this area now, are there any >> particular test cases you care about? > > I can give it a spin, ioctx lookup is in the fast path, and for "classic" > aio we do it twice for each IO... Thanks! -Jeff
On 12/11/18 11:05 AM, Jens Axboe wrote: > On 12/11/18 11:02 AM, Jeff Moyer wrote: >> Matthew Wilcox <willy@infradead.org> writes: >> >>> On Tue, Dec 11, 2018 at 12:21:52PM -0500, Jeff Moyer wrote: >>>> I'm going to submit this version formally. If you're interested in >>>> converting the ioctx_table to xarray, you can do that separately from a >>>> security fix. I would include a performance analysis with that patch, >>>> though. The idea of using a radix tree for the ioctx table was >>>> discarded due to performance reasons--see commit db446a08c23d5 ("aio: >>>> convert the ioctx list to table lookup v3"). I suspect using the xarray >>>> will perform similarly. >>> >>> There's a big difference between Octavian's patch and mine. That patch >>> indexed into the radix tree by 'ctx_id' directly, which was pretty >>> much guaranteed to exhibit some close-to-worst-case behaviour from the >>> radix tree due to IDs being sparsely assigned. My patch uses the ring >>> ID which _we_ assigned, and so is nicely behaved, being usually a very >>> small integer. >> >> OK, good to know. I obviously didn't look too closely at the two. >> >>> What performance analysis would you find compelling? Octavian's original >>> fio script: >>> >>>> rw=randrw; size=256k ;directory=/mnt/fio; ioengine=libaio; iodepth=1 >>>> blocksize=1024; numjobs=512; thread; loops=100 >>>> >>>> on an EXT2 filesystem mounted on top of a ramdisk >>> >>> or something else? >> >> I think the most common use case is a small number of ioctx-s, so I'd >> like to see that use case not regress (that should be easy, right?). >> Kent, what were the tests you were using when doing this work? Jens, >> since you're doing performance work in this area now, are there any >> particular test cases you care about? > > I can give it a spin, ioctx lookup is in the fast path, and for "classic" > aio we do it twice for each IO... Don't see any regressions. But if we're fiddling with it anyway, can't we do something smarter? Make the fast path just index a table, and put all the big hammers in setup/destroy. We're spending a non-substantial amount of time doing lookups, that's really no different before and after the patch.
On 12/11/18 11:32 AM, Jens Axboe wrote: > On 12/11/18 11:05 AM, Jens Axboe wrote: >> On 12/11/18 11:02 AM, Jeff Moyer wrote: >>> Matthew Wilcox <willy@infradead.org> writes: >>> >>>> On Tue, Dec 11, 2018 at 12:21:52PM -0500, Jeff Moyer wrote: >>>>> I'm going to submit this version formally. If you're interested in >>>>> converting the ioctx_table to xarray, you can do that separately from a >>>>> security fix. I would include a performance analysis with that patch, >>>>> though. The idea of using a radix tree for the ioctx table was >>>>> discarded due to performance reasons--see commit db446a08c23d5 ("aio: >>>>> convert the ioctx list to table lookup v3"). I suspect using the xarray >>>>> will perform similarly. >>>> >>>> There's a big difference between Octavian's patch and mine. That patch >>>> indexed into the radix tree by 'ctx_id' directly, which was pretty >>>> much guaranteed to exhibit some close-to-worst-case behaviour from the >>>> radix tree due to IDs being sparsely assigned. My patch uses the ring >>>> ID which _we_ assigned, and so is nicely behaved, being usually a very >>>> small integer. >>> >>> OK, good to know. I obviously didn't look too closely at the two. >>> >>>> What performance analysis would you find compelling? Octavian's original >>>> fio script: >>>> >>>>> rw=randrw; size=256k ;directory=/mnt/fio; ioengine=libaio; iodepth=1 >>>>> blocksize=1024; numjobs=512; thread; loops=100 >>>>> >>>>> on an EXT2 filesystem mounted on top of a ramdisk >>>> >>>> or something else? >>> >>> I think the most common use case is a small number of ioctx-s, so I'd >>> like to see that use case not regress (that should be easy, right?). >>> Kent, what were the tests you were using when doing this work? Jens, >>> since you're doing performance work in this area now, are there any >>> particular test cases you care about? >> >> I can give it a spin, ioctx lookup is in the fast path, and for "classic" >> aio we do it twice for each IO... > > Don't see any regressions. But if we're fiddling with it anyway, can't > we do something smarter? Make the fast path just index a table, and put > all the big hammers in setup/destroy. We're spending a non-substantial > amount of time doing lookups, that's really no different before and > after the patch. Looks like it's the percpu ref get, in terms of "lookup" we already look pretty good.
On 12/11/18 11:09 AM, Jeff Moyer wrote: > Jens Axboe <axboe@kernel.dk> writes: > >> On 12/11/18 11:02 AM, Jeff Moyer wrote: >>> Matthew Wilcox <willy@infradead.org> writes: >>> >>>> On Tue, Dec 11, 2018 at 12:21:52PM -0500, Jeff Moyer wrote: >>>>> I'm going to submit this version formally. If you're interested in >>>>> converting the ioctx_table to xarray, you can do that separately from a >>>>> security fix. I would include a performance analysis with that patch, >>>>> though. The idea of using a radix tree for the ioctx table was >>>>> discarded due to performance reasons--see commit db446a08c23d5 ("aio: >>>>> convert the ioctx list to table lookup v3"). I suspect using the xarray >>>>> will perform similarly. >>>> >>>> There's a big difference between Octavian's patch and mine. That patch >>>> indexed into the radix tree by 'ctx_id' directly, which was pretty >>>> much guaranteed to exhibit some close-to-worst-case behaviour from the >>>> radix tree due to IDs being sparsely assigned. My patch uses the ring >>>> ID which _we_ assigned, and so is nicely behaved, being usually a very >>>> small integer. >>> >>> OK, good to know. I obviously didn't look too closely at the two. >>> >>>> What performance analysis would you find compelling? Octavian's original >>>> fio script: >>>> >>>>> rw=randrw; size=256k ;directory=/mnt/fio; ioengine=libaio; iodepth=1 >>>>> blocksize=1024; numjobs=512; thread; loops=100 >>>>> >>>>> on an EXT2 filesystem mounted on top of a ramdisk >>>> >>>> or something else? >>> >>> I think the most common use case is a small number of ioctx-s, so I'd >>> like to see that use case not regress (that should be easy, right?). > > Bah, I meant a small number of threads doing submit/getevents. > >>> Kent, what were the tests you were using when doing this work? Jens, >>> since you're doing performance work in this area now, are there any >>> particular test cases you care about? >> >> I can give it a spin, ioctx lookup is in the fast path, and for "classic" >> aio we do it twice for each IO... > > Thanks! You can add my reviewed-by/tested-by. Do you want me to carry this one? I can rebase on top of the aio.c nospec lookup patch, we should do those separately.
On Wed, Nov 28, 2018 at 11:35 AM Matthew Wilcox <willy@infradead.org> wrote: > @@ -1026,24 +979,17 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) > struct aio_ring __user *ring = (void __user *)ctx_id; > struct mm_struct *mm = current->mm; > struct kioctx *ctx, *ret = NULL; > - struct kioctx_table *table; > unsigned id; > > if (get_user(id, &ring->id)) > return NULL; > > rcu_read_lock(); > - table = rcu_dereference(mm->ioctx_table); > - > - if (!table || id >= table->nr) > - goto out; > - > - ctx = rcu_dereference(table->table[id]); > + ctx = xa_load(&mm->ioctx, id); > if (ctx && ctx->user_id == ctx_id) { > if (percpu_ref_tryget_live(&ctx->users)) > ret = ctx; > } Question on this part - do we need that RCU read lock around this now? I don't think we do.
On Tue, Dec 11, 2018 at 11:41:55AM -0700, Jens Axboe wrote: > On Wed, Nov 28, 2018 at 11:35 AM Matthew Wilcox <willy@infradead.org> wrote: > > > > rcu_read_lock(); > > - table = rcu_dereference(mm->ioctx_table); > > - > > - if (!table || id >= table->nr) > > - goto out; > > - > > - ctx = rcu_dereference(table->table[id]); > > + ctx = xa_load(&mm->ioctx, id); > > if (ctx && ctx->user_id == ctx_id) { > > if (percpu_ref_tryget_live(&ctx->users)) > > ret = ctx; > > } > > Question on this part - do we need that RCU read lock around this now? I > don't think we do. I think we need the rcu read lock here to prevent ctx from being freed under us by free_ioctx().
On 12/11/18 11:45 AM, Matthew Wilcox wrote: > On Tue, Dec 11, 2018 at 11:41:55AM -0700, Jens Axboe wrote: >> On Wed, Nov 28, 2018 at 11:35 AM Matthew Wilcox <willy@infradead.org> wrote: >>> >>> rcu_read_lock(); >>> - table = rcu_dereference(mm->ioctx_table); >>> - >>> - if (!table || id >= table->nr) >>> - goto out; >>> - >>> - ctx = rcu_dereference(table->table[id]); >>> + ctx = xa_load(&mm->ioctx, id); >>> if (ctx && ctx->user_id == ctx_id) { >>> if (percpu_ref_tryget_live(&ctx->users)) >>> ret = ctx; >>> } >> >> Question on this part - do we need that RCU read lock around this now? I >> don't think we do. > > I think we need the rcu read lock here to prevent ctx from being freed > under us by free_ioctx(). Then that begs the question, how about __xa_load() that is already called under RCU read lock?
On Tue, Dec 11, 2018 at 11:32:54AM -0700, Jens Axboe wrote: > Don't see any regressions. But if we're fiddling with it anyway, can't > we do something smarter? Make the fast path just index a table, and put > all the big hammers in setup/destroy. We're spending a non-substantial > amount of time doing lookups, that's really no different before and > after the patch. Thanks for checking it out. I think the fast path does just index a table. Until you have more than 64 pointers in the XArray, it's just xa->head->slots[i]. And then up to 4096 pointers, it's xa->head->slots[i >> 6]->slots[i]. It has the advantage that if you only have one kioctx (which is surely many programs using AIO), it's just xa->head, so even better than a table lookup. It'll start to deteriorate after 4096 kioctxs, with one extra indirection for every 6 bits, but by that point, we'd've been straining the memory allocator to allocate a large table anyway.
On 12/11/18 11:51 AM, Matthew Wilcox wrote: > On Tue, Dec 11, 2018 at 11:32:54AM -0700, Jens Axboe wrote: >> Don't see any regressions. But if we're fiddling with it anyway, can't >> we do something smarter? Make the fast path just index a table, and put >> all the big hammers in setup/destroy. We're spending a non-substantial >> amount of time doing lookups, that's really no different before and >> after the patch. > > Thanks for checking it out. > > I think the fast path does just index a table. Until you have more than > 64 pointers in the XArray, it's just xa->head->slots[i]. And then up > to 4096 pointers, it's xa->head->slots[i >> 6]->slots[i]. It has the > advantage that if you only have one kioctx (which is surely many programs > using AIO), it's just xa->head, so even better than a table lookup. > > It'll start to deteriorate after 4096 kioctxs, with one extra indirection > for every 6 bits, but by that point, we'd've been straining the memory > allocator to allocate a large table anyway. I agree, and nobody cares about 4k kioctxs, you're way into the weeds at that point anyway. So as the followup said, I think we're fine as-is for this particular case.
On Tue, Dec 11, 2018 at 11:46:53AM -0700, Jens Axboe wrote: > On 12/11/18 11:45 AM, Matthew Wilcox wrote: > > I think we need the rcu read lock here to prevent ctx from being freed > > under us by free_ioctx(). > > Then that begs the question, how about __xa_load() that is already called > under RCU read lock? I've been considering adding it to the API, yes. I was under the impression that nested rcu_read_lock() calls were not expensive, even with CONFIG_PREEMPT.
On 12/11/18 11:46 AM, Jens Axboe wrote: > On 12/11/18 11:45 AM, Matthew Wilcox wrote: >> On Tue, Dec 11, 2018 at 11:41:55AM -0700, Jens Axboe wrote: >>> On Wed, Nov 28, 2018 at 11:35 AM Matthew Wilcox <willy@infradead.org> wrote: >>>> >>>> rcu_read_lock(); >>>> - table = rcu_dereference(mm->ioctx_table); >>>> - >>>> - if (!table || id >= table->nr) >>>> - goto out; >>>> - >>>> - ctx = rcu_dereference(table->table[id]); >>>> + ctx = xa_load(&mm->ioctx, id); >>>> if (ctx && ctx->user_id == ctx_id) { >>>> if (percpu_ref_tryget_live(&ctx->users)) >>>> ret = ctx; >>>> } >>> >>> Question on this part - do we need that RCU read lock around this now? I >>> don't think we do. >> >> I think we need the rcu read lock here to prevent ctx from being freed >> under us by free_ioctx(). > > Then that begs the question, how about __xa_load() that is already called > under RCU read lock? Something like this, mem remap has an existing user that can use this too already.
On 12/11/18 11:53 AM, Matthew Wilcox wrote: > On Tue, Dec 11, 2018 at 11:46:53AM -0700, Jens Axboe wrote: >> On 12/11/18 11:45 AM, Matthew Wilcox wrote: >>> I think we need the rcu read lock here to prevent ctx from being freed >>> under us by free_ioctx(). >> >> Then that begs the question, how about __xa_load() that is already called >> under RCU read lock? > > I've been considering adding it to the API, yes. I was under the > impression that nested rcu_read_lock() calls were not expensive, even > with CONFIG_PREEMPT. They are not expensive, but they are not free either. And if we know we are already under a rcu read lock, it seems pretty pointless. For the two cases (memremap and aio), the rcu read lock is right there, before the call. Easy to verify that it's safe.
diff --git a/fs/aio.c b/fs/aio.c index 301e6314183b..51ba7446a24f 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -71,12 +71,6 @@ struct aio_ring { #define AIO_RING_PAGES 8 -struct kioctx_table { - struct rcu_head rcu; - unsigned nr; - struct kioctx __rcu *table[]; -}; - struct kioctx_cpu { unsigned reqs_available; }; @@ -313,27 +307,22 @@ static int aio_ring_mremap(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct mm_struct *mm = vma->vm_mm; - struct kioctx_table *table; - int i, res = -EINVAL; + struct kioctx *ctx; + unsigned long index; + int res = -EINVAL; - spin_lock(&mm->ioctx_lock); - rcu_read_lock(); - table = rcu_dereference(mm->ioctx_table); - for (i = 0; i < table->nr; i++) { - struct kioctx *ctx; - - ctx = rcu_dereference(table->table[i]); - if (ctx && ctx->aio_ring_file == file) { - if (!atomic_read(&ctx->dead)) { - ctx->user_id = ctx->mmap_base = vma->vm_start; - res = 0; - } - break; + xa_lock(&mm->ioctx); + xa_for_each(&mm->ioctx, ctx, index, ULONG_MAX, XA_PRESENT) { + if (ctx->aio_ring_file != file) + continue; + if (!atomic_read(&ctx->dead)) { + ctx->user_id = ctx->mmap_base = vma->vm_start; + res = 0; } + break; } + xa_unlock(&mm->ioctx); - rcu_read_unlock(); - spin_unlock(&mm->ioctx_lock); return res; } @@ -617,57 +606,21 @@ static void free_ioctx_users(struct percpu_ref *ref) static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) { - unsigned i, new_nr; - struct kioctx_table *table, *old; struct aio_ring *ring; + int err; - spin_lock(&mm->ioctx_lock); - table = rcu_dereference_raw(mm->ioctx_table); - - while (1) { - if (table) - for (i = 0; i < table->nr; i++) - if (!rcu_access_pointer(table->table[i])) { - ctx->id = i; - rcu_assign_pointer(table->table[i], ctx); - spin_unlock(&mm->ioctx_lock); - - /* While kioctx setup is in progress, - * we are protected from page migration - * changes ring_pages by ->ring_lock. - */ - ring = kmap_atomic(ctx->ring_pages[0]); - ring->id = ctx->id; - kunmap_atomic(ring); - return 0; - } - - new_nr = (table ? table->nr : 1) * 4; - spin_unlock(&mm->ioctx_lock); - - table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) * - new_nr, GFP_KERNEL); - if (!table) - return -ENOMEM; - - table->nr = new_nr; - - spin_lock(&mm->ioctx_lock); - old = rcu_dereference_raw(mm->ioctx_table); - - if (!old) { - rcu_assign_pointer(mm->ioctx_table, table); - } else if (table->nr > old->nr) { - memcpy(table->table, old->table, - old->nr * sizeof(struct kioctx *)); + err = xa_alloc(&mm->ioctx, &ctx->id, UINT_MAX, ctx, GFP_KERNEL); + if (err) + return err; - rcu_assign_pointer(mm->ioctx_table, table); - kfree_rcu(old, rcu); - } else { - kfree(table); - table = old; - } - } + /* + * While kioctx setup is in progress, we are protected from + * page migration changing ring_pages by ->ring_lock. + */ + ring = kmap_atomic(ctx->ring_pages[0]); + ring->id = ctx->id; + kunmap_atomic(ring); + return 0; } static void aio_nr_sub(unsigned nr) @@ -793,27 +746,8 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) return ERR_PTR(err); } -/* kill_ioctx - * Cancels all outstanding aio requests on an aio context. Used - * when the processes owning a context have all exited to encourage - * the rapid destruction of the kioctx. - */ -static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx, - struct ctx_rq_wait *wait) +static void __kill_ioctx(struct kioctx *ctx, struct ctx_rq_wait *wait) { - struct kioctx_table *table; - - spin_lock(&mm->ioctx_lock); - if (atomic_xchg(&ctx->dead, 1)) { - spin_unlock(&mm->ioctx_lock); - return -EINVAL; - } - - table = rcu_dereference_raw(mm->ioctx_table); - WARN_ON(ctx != rcu_access_pointer(table->table[ctx->id])); - RCU_INIT_POINTER(table->table[ctx->id], NULL); - spin_unlock(&mm->ioctx_lock); - /* free_ioctx_reqs() will do the necessary RCU synchronization */ wake_up_all(&ctx->wait); @@ -831,6 +765,30 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx, ctx->rq_wait = wait; percpu_ref_kill(&ctx->users); +} + +/* kill_ioctx + * Cancels all outstanding aio requests on an aio context. Used + * when the processes owning a context have all exited to encourage + * the rapid destruction of the kioctx. + */ +static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx, + struct ctx_rq_wait *wait) +{ + struct kioctx *old; + + /* I don't understand what this lock is protecting against */ + xa_lock(&mm->ioctx); + if (atomic_xchg(&ctx->dead, 1)) { + xa_unlock(&mm->ioctx); + return -EINVAL; + } + + old = __xa_erase(&mm->ioctx, ctx->id); + WARN_ON(old != ctx); + xa_unlock(&mm->ioctx); + + __kill_ioctx(ctx, wait); return 0; } @@ -844,26 +802,21 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx, */ void exit_aio(struct mm_struct *mm) { - struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table); + struct kioctx *ctx; struct ctx_rq_wait wait; - int i, skipped; + unsigned long index; - if (!table) + if (xa_empty(&mm->ioctx)) return; - atomic_set(&wait.count, table->nr); + /* + * Prevent count from getting to 0 until we're ready for it + */ + atomic_set(&wait.count, 1); init_completion(&wait.comp); - skipped = 0; - for (i = 0; i < table->nr; ++i) { - struct kioctx *ctx = - rcu_dereference_protected(table->table[i], true); - - if (!ctx) { - skipped++; - continue; - } - + xa_lock(&mm->ioctx); + xa_for_each(&mm->ioctx, ctx, index, ULONG_MAX, XA_PRESENT) { /* * We don't need to bother with munmap() here - exit_mmap(mm) * is coming and it'll unmap everything. And we simply can't, @@ -872,16 +825,16 @@ void exit_aio(struct mm_struct *mm) * that it needs to unmap the area, just set it to 0. */ ctx->mmap_size = 0; - kill_ioctx(mm, ctx, &wait); + atomic_inc(&wait.count); + __xa_erase(&mm->ioctx, ctx->id); + __kill_ioctx(ctx, &wait); } + xa_unlock(&mm->ioctx); - if (!atomic_sub_and_test(skipped, &wait.count)) { + if (!atomic_sub_and_test(1, &wait.count)) { /* Wait until all IO for the context are done. */ wait_for_completion(&wait.comp); } - - RCU_INIT_POINTER(mm->ioctx_table, NULL); - kfree(table); } static void put_reqs_available(struct kioctx *ctx, unsigned nr) @@ -1026,24 +979,17 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) struct aio_ring __user *ring = (void __user *)ctx_id; struct mm_struct *mm = current->mm; struct kioctx *ctx, *ret = NULL; - struct kioctx_table *table; unsigned id; if (get_user(id, &ring->id)) return NULL; rcu_read_lock(); - table = rcu_dereference(mm->ioctx_table); - - if (!table || id >= table->nr) - goto out; - - ctx = rcu_dereference(table->table[id]); + ctx = xa_load(&mm->ioctx, id); if (ctx && ctx->user_id == ctx_id) { if (percpu_ref_tryget_live(&ctx->users)) ret = ctx; } -out: rcu_read_unlock(); return ret; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5ed8f6292a53..1a95bb27f5a7 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -14,6 +14,7 @@ #include <linux/uprobes.h> #include <linux/page-flags-layout.h> #include <linux/workqueue.h> +#include <linux/xarray.h> #include <asm/mmu.h> @@ -336,7 +337,6 @@ struct core_state { struct completion startup; }; -struct kioctx_table; struct mm_struct { struct { struct vm_area_struct *mmap; /* list of VMAs */ @@ -431,8 +431,7 @@ struct mm_struct { atomic_t membarrier_state; #endif #ifdef CONFIG_AIO - spinlock_t ioctx_lock; - struct kioctx_table __rcu *ioctx_table; + struct xarray ioctx; #endif #ifdef CONFIG_MEMCG /* diff --git a/kernel/fork.c b/kernel/fork.c index 07cddff89c7b..acb775f9592d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -946,8 +946,7 @@ __setup("coredump_filter=", coredump_filter_setup); static void mm_init_aio(struct mm_struct *mm) { #ifdef CONFIG_AIO - spin_lock_init(&mm->ioctx_lock); - mm->ioctx_table = NULL; + xa_init_flags(&mm->ioctx, XA_FLAGS_ALLOC); #endif } diff --git a/mm/debug.c b/mm/debug.c index cdacba12e09a..d45ec63aed8c 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -127,9 +127,6 @@ void dump_mm(const struct mm_struct *mm) "start_brk %lx brk %lx start_stack %lx\n" "arg_start %lx arg_end %lx env_start %lx env_end %lx\n" "binfmt %px flags %lx core_state %px\n" -#ifdef CONFIG_AIO - "ioctx_table %px\n" -#endif #ifdef CONFIG_MEMCG "owner %px " #endif @@ -158,9 +155,6 @@ void dump_mm(const struct mm_struct *mm) mm->start_brk, mm->brk, mm->start_stack, mm->arg_start, mm->arg_end, mm->env_start, mm->env_end, mm->binfmt, mm->flags, mm->core_state, -#ifdef CONFIG_AIO - mm->ioctx_table, -#endif #ifdef CONFIG_MEMCG mm->owner, #endif
This custom resizing array was vulnerable to a Spectre attack (speculating off the end of an array to a user-controlled offset). The XArray is not vulnerable to Spectre as it always masks its lookups to be within the bounds of the array. Signed-off-by: Matthew Wilcox <willy@infradead.org> --- fs/aio.c | 182 ++++++++++++++------------------------- include/linux/mm_types.h | 5 +- kernel/fork.c | 3 +- mm/debug.c | 6 -- 4 files changed, 67 insertions(+), 129 deletions(-)