Message ID | 7f5eb1b89e8dcf93739607c79bbf7aec1784cbbe.1680187408.git.asml.silence@gmail.com (mailing list archive) |
---|---|
State | New |
Series | optimise registered buffer/file updates |
Pavel Begunkov <asml.silence@gmail.com> writes:

> Add allocation cache for struct io_rsrc_node, it's always allocated and
> put under ->uring_lock, so it doesn't need any extra synchronisation
> around caches.

Hi Pavel,

I'm curious if you considered using kmem_cache instead of the custom
cache for this case? I'm wondering if this provokes visible difference in
performance in your benchmark.

> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
> ---
>  include/linux/io_uring_types.h |  1 +
>  io_uring/io_uring.c            | 11 +++++++++--
>  io_uring/rsrc.c                | 23 +++++++++++++++--------
>  io_uring/rsrc.h                |  5 ++++-
>  4 files changed, 29 insertions(+), 11 deletions(-)
>
> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> index 47496059e13a..5d772e36e7fc 100644
> --- a/include/linux/io_uring_types.h
> +++ b/include/linux/io_uring_types.h
> @@ -332,6 +332,7 @@ struct io_ring_ctx {
>
>          /* protected by ->uring_lock */
>          struct list_head        rsrc_ref_list;
> +        struct io_alloc_cache   rsrc_node_cache;
>
>          struct list_head        io_buffers_pages;
>
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index 8c3886a4ca96..beedaf403284 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -310,6 +310,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
>          INIT_LIST_HEAD(&ctx->sqd_list);
>          INIT_LIST_HEAD(&ctx->cq_overflow_list);
>          INIT_LIST_HEAD(&ctx->io_buffers_cache);
> +        io_alloc_cache_init(&ctx->rsrc_node_cache, sizeof(struct io_rsrc_node));
>          io_alloc_cache_init(&ctx->apoll_cache, sizeof(struct async_poll));
>          io_alloc_cache_init(&ctx->netmsg_cache, sizeof(struct io_async_msghdr));
>          init_completion(&ctx->ref_comp);
> @@ -2791,6 +2792,11 @@ static void io_req_caches_free(struct io_ring_ctx *ctx)
>          mutex_unlock(&ctx->uring_lock);
>  }
>
> +void io_rsrc_node_cache_free(struct io_cache_entry *entry)
> +{
> +        kfree(container_of(entry, struct io_rsrc_node, cache));
> +}
> +
>  static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
>  {
>          io_sq_thread_finish(ctx);
> @@ -2816,9 +2822,9 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
>
>          /* there are no registered resources left, nobody uses it */
>          if (ctx->rsrc_node)
> -                io_rsrc_node_destroy(ctx->rsrc_node);
> +                io_rsrc_node_destroy(ctx, ctx->rsrc_node);
>          if (ctx->rsrc_backup_node)
> -                io_rsrc_node_destroy(ctx->rsrc_backup_node);
> +                io_rsrc_node_destroy(ctx, ctx->rsrc_backup_node);
>
>          WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
>
> @@ -2830,6 +2836,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
>  #endif
>          WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
>
> +        io_alloc_cache_free(&ctx->rsrc_node_cache, io_rsrc_node_cache_free);
>          if (ctx->mm_account) {
>                  mmdrop(ctx->mm_account);
>                  ctx->mm_account = NULL;
> diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
> index 0f4e245dee1b..345631091d80 100644
> --- a/io_uring/rsrc.c
> +++ b/io_uring/rsrc.c
> @@ -164,7 +164,7 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
>                          kfree(prsrc);
>          }
>
> -        io_rsrc_node_destroy(ref_node);
> +        io_rsrc_node_destroy(rsrc_data->ctx, ref_node);
>          if (atomic_dec_and_test(&rsrc_data->refs))
>                  complete(&rsrc_data->done);
>  }
> @@ -175,9 +175,10 @@ void io_wait_rsrc_data(struct io_rsrc_data *data)
>                  wait_for_completion(&data->done);
>  }
>
> -void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
> +void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
>  {
> -        kfree(ref_node);
> +        if (!io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache))
> +                kfree(node);
>  }
>
>  void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
> @@ -198,13 +199,19 @@ void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
>          }
>  }
>
> -static struct io_rsrc_node *io_rsrc_node_alloc(void)
> +static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
>  {
>          struct io_rsrc_node *ref_node;
> +        struct io_cache_entry *entry;
>
> -        ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
> -        if (!ref_node)
> -                return NULL;
> +        entry = io_alloc_cache_get(&ctx->rsrc_node_cache);
> +        if (entry) {
> +                ref_node = container_of(entry, struct io_rsrc_node, cache);
> +        } else {
> +                ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
> +                if (!ref_node)
> +                        return NULL;
> +        }
>
>          ref_node->refs = 1;
>          INIT_LIST_HEAD(&ref_node->node);
> @@ -243,7 +250,7 @@ int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
>  {
>          if (ctx->rsrc_backup_node)
>                  return 0;
> -        ctx->rsrc_backup_node = io_rsrc_node_alloc();
> +        ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx);
>          return ctx->rsrc_backup_node ? 0 : -ENOMEM;
>  }
>
> diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
> index 17293ab90f64..d1555eaae81a 100644
> --- a/io_uring/rsrc.h
> +++ b/io_uring/rsrc.h
> @@ -4,6 +4,8 @@
>
>  #include <net/af_unix.h>
>
> +#include "alloc_cache.h"
> +
>  #define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3)
>  #define IO_RSRC_TAG_TABLE_MAX   (1U << IO_RSRC_TAG_TABLE_SHIFT)
>  #define IO_RSRC_TAG_TABLE_MASK  (IO_RSRC_TAG_TABLE_MAX - 1)
> @@ -37,6 +39,7 @@ struct io_rsrc_data {
>  };
>
>  struct io_rsrc_node {
> +        struct io_cache_entry           cache;
>          int                             refs;
>          struct list_head                node;
>          struct io_rsrc_data             *rsrc_data;
> @@ -65,7 +68,7 @@ void io_rsrc_put_tw(struct callback_head *cb);
>  void io_rsrc_node_ref_zero(struct io_rsrc_node *node);
>  void io_rsrc_put_work(struct work_struct *work);
>  void io_wait_rsrc_data(struct io_rsrc_data *data);
> -void io_rsrc_node_destroy(struct io_rsrc_node *ref_node);
> +void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node);
>  int io_rsrc_node_switch_start(struct io_ring_ctx *ctx);
>  int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
>                            struct io_rsrc_node *node, void *rsrc);
On 3/31/23 15:09, Gabriel Krisman Bertazi wrote:
> Pavel Begunkov <asml.silence@gmail.com> writes:
>
>> Add allocation cache for struct io_rsrc_node, it's always allocated and
>> put under ->uring_lock, so it doesn't need any extra synchronisation
>> around caches.
>
> Hi Pavel,
>
> I'm curious if you considered using kmem_cache instead of the custom
> cache for this case? I'm wondering if this provokes visible difference in
> performance in your benchmark.

I didn't try it, but kmem_cache vs kmalloc, IIRC, doesn't bring us
much, definitely doesn't spare from locking, and the overhead
definitely wasn't satisfactory for requests before.

A quick note that I want to further limit the cache size,
IO_ALLOC_CACHE_MAX=512 for nodes doesn't sound great.
Pavel Begunkov <asml.silence@gmail.com> writes:

> On 3/31/23 15:09, Gabriel Krisman Bertazi wrote:
>> Pavel Begunkov <asml.silence@gmail.com> writes:
>>
>>> Add allocation cache for struct io_rsrc_node, it's always allocated and
>>> put under ->uring_lock, so it doesn't need any extra synchronisation
>>> around caches.
>> Hi Pavel,
>> I'm curious if you considered using kmem_cache instead of the custom
>> cache for this case? I'm wondering if this provokes visible difference in
>> performance in your benchmark.
>
> I didn't try it, but kmem_cache vs kmalloc, IIRC, doesn't bring us
> much, definitely doesn't spare from locking, and the overhead
> definitely wasn't satisfactory for requests before.

There are no locks in the fast path of slub, as far as I know. It has a
per-cpu cache that is refilled once empty, quite similar to the fastpath
of this cache. I imagine the performance hit in slub comes from the
barrier and atomic operations?

kmem_cache works fine for most hot paths of the kernel. I think this
custom cache makes sense for the request cache, where objects are
allocated at an incredibly high rate. But is this level of update
frequency a valid use case here?

If it is indeed a significant performance improvement, I guess it is
fine to have another user of the cache. But I'd be curious to know how
much of the performance improvement you mentioned in the cover letter is
due to this patch!
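For comparison, the kmem_cache route Gabriel is asking about would look roughly like the sketch below. This is not part of the posted series; the cache pointer and setup helper names are invented for illustration, and the struct io_rsrc_node definition is assumed from io_uring/rsrc.h.

    /* Sketch only: a kmem_cache-backed variant for comparison; the cache
     * pointer and setup helper are hypothetical, not from the patch. */
    #include <linux/slab.h>

    static struct kmem_cache *io_rsrc_node_cachep;

    static int io_rsrc_node_cache_setup(void)
    {
            io_rsrc_node_cachep = kmem_cache_create("io_rsrc_node",
                                                    sizeof(struct io_rsrc_node),
                                                    0, SLAB_HWCACHE_ALIGN, NULL);
            return io_rsrc_node_cachep ? 0 : -ENOMEM;
    }

    static struct io_rsrc_node *io_rsrc_node_alloc(void)
    {
            /* Goes through slub's per-cpu freelist instead of a per-ring list. */
            return kmem_cache_zalloc(io_rsrc_node_cachep, GFP_KERNEL);
    }

    static void io_rsrc_node_destroy(struct io_rsrc_node *node)
    {
            kmem_cache_free(io_rsrc_node_cachep, node);
    }

The trade-off under discussion: this path still pays for slub's per-cpu freelist handling on every alloc/free, whereas the patch reuses nodes from a plain list that is already serialised by ->uring_lock.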
On 4/1/23 01:04, Gabriel Krisman Bertazi wrote:
> Pavel Begunkov <asml.silence@gmail.com> writes:
>
>> On 3/31/23 15:09, Gabriel Krisman Bertazi wrote:
>>> Pavel Begunkov <asml.silence@gmail.com> writes:
>>>
>>>> Add allocation cache for struct io_rsrc_node, it's always allocated and
>>>> put under ->uring_lock, so it doesn't need any extra synchronisation
>>>> around caches.
>>> Hi Pavel,
>>> I'm curious if you considered using kmem_cache instead of the custom
>>> cache for this case? I'm wondering if this provokes visible difference in
>>> performance in your benchmark.
>>
>> I didn't try it, but kmem_cache vs kmalloc, IIRC, doesn't bring us
>> much, definitely doesn't spare from locking, and the overhead
>> definitely wasn't satisfactory for requests before.
>
> There are no locks in the fast path of slub, as far as I know. It has a
> per-cpu cache that is refilled once empty, quite similar to the fastpath
> of this cache. I imagine the performance hit in slub comes from the
> barrier and atomic operations?

Yeah, I mean all kinds of synchronisation. And I don't think
that's the main offender here, the test is single threaded without
contention and the system was mostly idle.

> kmem_cache works fine for most hot paths of the kernel. I think this

It doesn't for io_uring. There are caches for the net side and now
in the block layer as well. I wouldn't say it necessarily halves
performance but definitely takes a share of CPU.

> custom cache makes sense for the request cache, where objects are
> allocated at an incredibly high rate. But is this level of update
> frequency a valid use case here?

I can think of some. For example, it was of interest before to install
a file for just 2-3 IO operations and also fully bypassing the normal
file table. I rather don't see why we wouldn't use it.

> If it is indeed a significant performance improvement, I guess it is
> fine to have another user of the cache. But I'd be curious to know how
> much of the performance improvement you mentioned in the cover letter is
> due to this patch!

It was definitely sticking out in profiles, 5-10% of cycles, maybe more.
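The use case Pavel mentions maps onto direct descriptors. Below is a rough liburing sketch of that pattern (file name, slot number and the absence of error handling are illustrative only); on the kernel side, each update of the registered file table like this cycles an io_rsrc_node, which is what the cache in this patch targets.

    #include <fcntl.h>
    #include <liburing.h>

    /* Sketch: open a file straight into the registered file table, use it
     * for a couple of requests, then drop it again. */
    static int short_lived_direct_file(struct io_uring *ring)
    {
            struct io_uring_sqe *sqe;
            struct io_uring_cqe *cqe;
            char buf[4096];
            int ret, i;

            /* One sparse fixed-file slot that gets updated at high frequency. */
            io_uring_register_files_sparse(ring, 1);

            /* Install the file into slot 0, bypassing the normal fd table. */
            sqe = io_uring_get_sqe(ring);
            io_uring_prep_openat_direct(sqe, AT_FDCWD, "data.bin", O_RDONLY, 0, 0);
            sqe->flags |= IOSQE_IO_LINK;

            /* A couple of IOs against the fixed file... */
            sqe = io_uring_get_sqe(ring);
            io_uring_prep_read(sqe, 0, buf, sizeof(buf), 0);
            sqe->flags |= IOSQE_FIXED_FILE | IOSQE_IO_LINK;

            /* ...then remove it from the table again. */
            sqe = io_uring_get_sqe(ring);
            io_uring_prep_close_direct(sqe, 0);

            ret = io_uring_submit(ring);
            for (i = 0; i < ret; i++) {
                    io_uring_wait_cqe(ring, &cqe);
                    io_uring_cqe_seen(ring, cqe);
            }
            return 0;
    }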
Pavel Begunkov <asml.silence@gmail.com> writes:

> On 4/1/23 01:04, Gabriel Krisman Bertazi wrote:
>> Pavel Begunkov <asml.silence@gmail.com> writes:
>>> I didn't try it, but kmem_cache vs kmalloc, IIRC, doesn't bring us
>>> much, definitely doesn't spare from locking, and the overhead
>>> definitely wasn't satisfactory for requests before.
>> There are no locks in the fast path of slub, as far as I know. It has a
>> per-cpu cache that is refilled once empty, quite similar to the fastpath
>> of this cache. I imagine the performance hit in slub comes from the
>> barrier and atomic operations?
>
> Yeah, I mean all kinds of synchronisation. And I don't think
> that's the main offender here, the test is single threaded without
> contention and the system was mostly idle.
>
>> kmem_cache works fine for most hot paths of the kernel. I think this
>
> It doesn't for io_uring. There are caches for the net side and now
> in the block layer as well. I wouldn't say it necessarily halves
> performance but definitely takes a share of CPU.

Right. My point is that all these caches (block, io_uring) duplicate
what the slab cache is meant to do. Since slab became a bottleneck, I'm
looking at how to improve the situation on their side, to see if we can
drop the caching here and in block/.

>> If it is indeed a significant performance improvement, I guess it is
>> fine to have another user of the cache. But I'd be curious to know how
>> much of the performance improvement you mentioned in the cover letter is
>> due to this patch!
>
> It was definitely sticking out in profiles, 5-10% of cycles, maybe more.

That's surprisingly high. Hopefully we can avoid this caching soon. For
now, feel free to add to this patch:

Reviewed-by: Gabriel Krisman Bertazi <krisman@suse.de>
On 4/4/23 9:48 AM, Gabriel Krisman Bertazi wrote:
> Pavel Begunkov <asml.silence@gmail.com> writes:
>
>> On 4/1/23 01:04, Gabriel Krisman Bertazi wrote:
>>> Pavel Begunkov <asml.silence@gmail.com> writes:
>
>>>> I didn't try it, but kmem_cache vs kmalloc, IIRC, doesn't bring us
>>>> much, definitely doesn't spare from locking, and the overhead
>>>> definitely wasn't satisfactory for requests before.
>>> There are no locks in the fast path of slub, as far as I know. It has a
>>> per-cpu cache that is refilled once empty, quite similar to the fastpath
>>> of this cache. I imagine the performance hit in slub comes from the
>>> barrier and atomic operations?
>>
>> Yeah, I mean all kinds of synchronisation. And I don't think
>> that's the main offender here, the test is single threaded without
>> contention and the system was mostly idle.
>>
>>> kmem_cache works fine for most hot paths of the kernel. I think this
>>
>> It doesn't for io_uring. There are caches for the net side and now
>> in the block layer as well. I wouldn't say it necessarily halves
>> performance but definitely takes a share of CPU.
>
> Right. My point is that all these caches (block, io_uring) duplicate
> what the slab cache is meant to do. Since slab became a bottleneck, I'm
> looking at how to improve the situation on their side, to see if we can
> drop the caching here and in block/.

That would certainly be a worthy goal, and I do agree that these caches
are (largely) working around deficiencies. One important point that you
may miss is that most of this caching gets its performance from both
avoiding atomics in slub, but also because we can guarantee that both
alloc and free happen from process context. The block IRQ bits are a bit
different, but apart from that, it's true elsewhere. Caching that needs
to even disable IRQs locally generally doesn't beat out slub by much,
the big wins are the cases where we know free+alloc is done in process
context.
Jens Axboe <axboe@kernel.dk> writes:

> On 4/4/23 9:48 AM, Gabriel Krisman Bertazi wrote:
>> Pavel Begunkov <asml.silence@gmail.com> writes:
>>
>>> On 4/1/23 01:04, Gabriel Krisman Bertazi wrote:
>>>> Pavel Begunkov <asml.silence@gmail.com> writes:
>>
>>>>> I didn't try it, but kmem_cache vs kmalloc, IIRC, doesn't bring us
>>>>> much, definitely doesn't spare from locking, and the overhead
>>>>> definitely wasn't satisfactory for requests before.
>>>> There are no locks in the fast path of slub, as far as I know. It has a
>>>> per-cpu cache that is refilled once empty, quite similar to the fastpath
>>>> of this cache. I imagine the performance hit in slub comes from the
>>>> barrier and atomic operations?
>>>
>>> Yeah, I mean all kinds of synchronisation. And I don't think
>>> that's the main offender here, the test is single threaded without
>>> contention and the system was mostly idle.
>>>
>>>> kmem_cache works fine for most hot paths of the kernel. I think this
>>>
>>> It doesn't for io_uring. There are caches for the net side and now
>>> in the block layer as well. I wouldn't say it necessarily halves
>>> performance but definitely takes a share of CPU.
>>
>> Right. My point is that all these caches (block, io_uring) duplicate
>> what the slab cache is meant to do. Since slab became a bottleneck, I'm
>> looking at how to improve the situation on their side, to see if we can
>> drop the caching here and in block/.
>
> That would certainly be a worthy goal, and I do agree that these caches
> are (largely) working around deficiencies. One important point that you
> may miss is that most of this caching gets its performance from both
> avoiding atomics in slub, but also because we can guarantee that both
> alloc and free happen from process context. The block IRQ bits are a bit
> different, but apart from that, it's true elsewhere. Caching that needs
> to even disable IRQs locally generally doesn't beat out slub by much,
> the big wins are the cases where we know free+alloc is done in process
> context.

Yes, I noticed that. I was thinking of exposing a flag at kmem_cache
creation time to tell slab the user promises not to use it in IRQ
context, so it doesn't need to worry about nested invocation in the
allocation/free path. Then, for those caches, have a
kmem_cache_alloc_locked variant, where the synchronization is maintained
by the caller (i.e. by ->uring_lock here), so it can manipulate the
cache without atomics.

I was looking at your implementation of the block cache for inspiration
and saw how you kept a second list for IRQ. I'm thinking how to fit a
similar change inside slub. But for now, I want to get the simpler
case, which is all io_uring needs.

I'll try to get a prototype before lsfmm and see if I get the MM folks'
input there.
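To make the shape of that proposal concrete, a sketch follows. SLAB_NO_IRQ and kmem_cache_alloc_locked() are hypothetical names for the interface Gabriel describes; nothing like them exists in the slab allocator today, and the cache pointer is likewise invented.

    /* Hypothetical interface, only to illustrate the proposal above:
     * neither SLAB_NO_IRQ nor kmem_cache_alloc_locked() exists today. */
    static struct kmem_cache *rsrc_node_cachep;    /* hypothetical cache */

    static int rsrc_node_cache_setup(void)
    {
            /* Creation-time promise: this cache is never used from IRQ context. */
            rsrc_node_cachep = kmem_cache_create("io_rsrc_node",
                                                 sizeof(struct io_rsrc_node),
                                                 0, SLAB_NO_IRQ, NULL);
            return rsrc_node_cachep ? 0 : -ENOMEM;
    }

    static struct io_rsrc_node *rsrc_node_get(struct io_ring_ctx *ctx)
    {
            /* Caller-provided synchronisation replaces atomics and irq fiddling. */
            lockdep_assert_held(&ctx->uring_lock);
            return kmem_cache_alloc_locked(rsrc_node_cachep, GFP_KERNEL);
    }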
On 4/4/23 17:53, Gabriel Krisman Bertazi wrote:
> Jens Axboe <axboe@kernel.dk> writes:
>
>> On 4/4/23 9:48 AM, Gabriel Krisman Bertazi wrote:
>>> Pavel Begunkov <asml.silence@gmail.com> writes:
>>>
>>>> On 4/1/23 01:04, Gabriel Krisman Bertazi wrote:
>>>>> Pavel Begunkov <asml.silence@gmail.com> writes:
>>>
>>>>>> I didn't try it, but kmem_cache vs kmalloc, IIRC, doesn't bring us
>>>>>> much, definitely doesn't spare from locking, and the overhead
>>>>>> definitely wasn't satisfactory for requests before.
>>>>> There are no locks in the fast path of slub, as far as I know. It has a
>>>>> per-cpu cache that is refilled once empty, quite similar to the fastpath
>>>>> of this cache. I imagine the performance hit in slub comes from the
>>>>> barrier and atomic operations?
>>>>
>>>> Yeah, I mean all kinds of synchronisation. And I don't think
>>>> that's the main offender here, the test is single threaded without
>>>> contention and the system was mostly idle.
>>>>
>>>>> kmem_cache works fine for most hot paths of the kernel. I think this
>>>>
>>>> It doesn't for io_uring. There are caches for the net side and now
>>>> in the block layer as well. I wouldn't say it necessarily halves
>>>> performance but definitely takes a share of CPU.
>>>
>>> Right. My point is that all these caches (block, io_uring) duplicate
>>> what the slab cache is meant to do. Since slab became a bottleneck, I'm
>>> looking at how to improve the situation on their side, to see if we can
>>> drop the caching here and in block/.
>>
>> That would certainly be a worthy goal, and I do agree that these caches
>> are (largely) working around deficiencies. One important point that you
>> may miss is that most of this caching gets its performance from both
>> avoiding atomics in slub, but also because we can guarantee that both
>> alloc and free happen from process context. The block IRQ bits are a bit
>> different, but apart from that, it's true elsewhere. Caching that needs
>> to even disable IRQs locally generally doesn't beat out slub by much,
>> the big wins are the cases where we know free+alloc is done in process
>> context.
>
> Yes, I noticed that. I was thinking of exposing a flag at kmem_cache
> creation time to tell slab the user promises not to use it in IRQ
> context, so it doesn't need to worry about nested invocation in the
> allocation/free path. Then, for those caches, have a
> kmem_cache_alloc_locked variant, where the synchronization is maintained
> by the caller (i.e. by ->uring_lock here), so it can manipulate the
> cache without atomics.
>
> I was looking at your implementation of the block cache for inspiration
> and saw how you kept a second list for IRQ. I'm thinking how to fit a

It works fine, but one problem I had while implementing it is
local_irq_save() in bio_put_percpu_cache() not being so cheap even when
interrupts are already disabled. Apparently, raw_local_save_flags() is
not as free as I wished it to be, and we can't easily pass the current
context. Would be nice to do something about it at some point.

> similar change inside slub. But for now, I want to get the simpler
> case, which is all io_uring needs.
>
> I'll try to get a prototype before lsfmm and see if I get the MM folks'
> input there.
>
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 47496059e13a..5d772e36e7fc 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -332,6 +332,7 @@ struct io_ring_ctx {
 
         /* protected by ->uring_lock */
         struct list_head        rsrc_ref_list;
+        struct io_alloc_cache   rsrc_node_cache;
 
         struct list_head        io_buffers_pages;
 
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 8c3886a4ca96..beedaf403284 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -310,6 +310,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
         INIT_LIST_HEAD(&ctx->sqd_list);
         INIT_LIST_HEAD(&ctx->cq_overflow_list);
         INIT_LIST_HEAD(&ctx->io_buffers_cache);
+        io_alloc_cache_init(&ctx->rsrc_node_cache, sizeof(struct io_rsrc_node));
         io_alloc_cache_init(&ctx->apoll_cache, sizeof(struct async_poll));
         io_alloc_cache_init(&ctx->netmsg_cache, sizeof(struct io_async_msghdr));
         init_completion(&ctx->ref_comp);
@@ -2791,6 +2792,11 @@ static void io_req_caches_free(struct io_ring_ctx *ctx)
         mutex_unlock(&ctx->uring_lock);
 }
 
+void io_rsrc_node_cache_free(struct io_cache_entry *entry)
+{
+        kfree(container_of(entry, struct io_rsrc_node, cache));
+}
+
 static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 {
         io_sq_thread_finish(ctx);
@@ -2816,9 +2822,9 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 
         /* there are no registered resources left, nobody uses it */
         if (ctx->rsrc_node)
-                io_rsrc_node_destroy(ctx->rsrc_node);
+                io_rsrc_node_destroy(ctx, ctx->rsrc_node);
         if (ctx->rsrc_backup_node)
-                io_rsrc_node_destroy(ctx->rsrc_backup_node);
+                io_rsrc_node_destroy(ctx, ctx->rsrc_backup_node);
 
         WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
 
@@ -2830,6 +2836,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 #endif
         WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
 
+        io_alloc_cache_free(&ctx->rsrc_node_cache, io_rsrc_node_cache_free);
         if (ctx->mm_account) {
                 mmdrop(ctx->mm_account);
                 ctx->mm_account = NULL;
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 0f4e245dee1b..345631091d80 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -164,7 +164,7 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
                         kfree(prsrc);
         }
 
-        io_rsrc_node_destroy(ref_node);
+        io_rsrc_node_destroy(rsrc_data->ctx, ref_node);
         if (atomic_dec_and_test(&rsrc_data->refs))
                 complete(&rsrc_data->done);
 }
@@ -175,9 +175,10 @@ void io_wait_rsrc_data(struct io_rsrc_data *data)
                 wait_for_completion(&data->done);
 }
 
-void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
+void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
 {
-        kfree(ref_node);
+        if (!io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache))
+                kfree(node);
 }
 
 void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
@@ -198,13 +199,19 @@ void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
         }
 }
 
-static struct io_rsrc_node *io_rsrc_node_alloc(void)
+static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
 {
         struct io_rsrc_node *ref_node;
+        struct io_cache_entry *entry;
 
-        ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
-        if (!ref_node)
-                return NULL;
+        entry = io_alloc_cache_get(&ctx->rsrc_node_cache);
+        if (entry) {
+                ref_node = container_of(entry, struct io_rsrc_node, cache);
+        } else {
+                ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
+                if (!ref_node)
+                        return NULL;
+        }
 
         ref_node->refs = 1;
         INIT_LIST_HEAD(&ref_node->node);
@@ -243,7 +250,7 @@ int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
 {
         if (ctx->rsrc_backup_node)
                 return 0;
-        ctx->rsrc_backup_node = io_rsrc_node_alloc();
+        ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx);
         return ctx->rsrc_backup_node ? 0 : -ENOMEM;
 }
 
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 17293ab90f64..d1555eaae81a 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -4,6 +4,8 @@
 
 #include <net/af_unix.h>
 
+#include "alloc_cache.h"
+
 #define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3)
 #define IO_RSRC_TAG_TABLE_MAX   (1U << IO_RSRC_TAG_TABLE_SHIFT)
 #define IO_RSRC_TAG_TABLE_MASK  (IO_RSRC_TAG_TABLE_MAX - 1)
@@ -37,6 +39,7 @@ struct io_rsrc_data {
 };
 
 struct io_rsrc_node {
+        struct io_cache_entry           cache;
         int                             refs;
         struct list_head                node;
         struct io_rsrc_data             *rsrc_data;
@@ -65,7 +68,7 @@ void io_rsrc_put_tw(struct callback_head *cb);
 void io_rsrc_node_ref_zero(struct io_rsrc_node *node);
 void io_rsrc_put_work(struct work_struct *work);
 void io_wait_rsrc_data(struct io_rsrc_data *data);
-void io_rsrc_node_destroy(struct io_rsrc_node *ref_node);
+void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node);
 int io_rsrc_node_switch_start(struct io_ring_ctx *ctx);
 int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
                           struct io_rsrc_node *node, void *rsrc);
Add allocation cache for struct io_rsrc_node, it's always allocated and
put under ->uring_lock, so it doesn't need any extra synchronisation
around caches.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 include/linux/io_uring_types.h |  1 +
 io_uring/io_uring.c            | 11 +++++++++--
 io_uring/rsrc.c                | 23 +++++++++++++++--------
 io_uring/rsrc.h                |  5 ++++-
 4 files changed, 29 insertions(+), 11 deletions(-)