| Message ID | 20250203154517.937623-7-kbusch@meta.com (mailing list archive) |
|---|---|
| State | New |
| Series | [1/6] block: const blk_rq_nr_phys_segments request |
On 2/3/25 15:45, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
>
> Frequent alloc/free cycles on these is pretty costly. Use an io cache to
> more efficiently reuse these buffers.
>
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
>  include/linux/io_uring_types.h |  16 ++---
>  io_uring/filetable.c           |   2 +-
>  io_uring/rsrc.c                | 108 ++++++++++++++++++++++++---------
>  io_uring/rsrc.h                |   2 +-
>  4 files changed, 92 insertions(+), 36 deletions(-)
>
> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> index aa661ebfd6568..c0e0c1f92e5b1 100644
> --- a/include/linux/io_uring_types.h
> +++ b/include/linux/io_uring_types.h
> @@ -67,8 +67,17 @@ struct io_file_table {
>  	unsigned int alloc_hint;
>  };
>
> +struct io_alloc_cache {
> +	void **entries;
> +	unsigned int nr_cached;
> +	unsigned int max_cached;
> +	size_t elem_size;
> +};
> +
>  struct io_buf_table {
>  	struct io_rsrc_data data;
> +	struct io_alloc_cache node_cache;
> +	struct io_alloc_cache imu_cache;

We can avoid all churn if you kill patch 5/6 and put the
caches directly into struct io_ring_ctx. It's a bit better for
future cache improvements and we can even reuse the node cache
for files.

...

> diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
> index 864c2eabf8efd..5434b0d992d62 100644
> --- a/io_uring/rsrc.c
> +++ b/io_uring/rsrc.c
> @@ -117,23 +117,39 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
>  			unpin_user_page(imu->bvec[i].bv_page);
>  		if (imu->acct_pages)
>  			io_unaccount_mem(ctx, imu->acct_pages);
> -		kvfree(imu);
> +		if (struct_size(imu, bvec, imu->nr_bvecs) >
> +				ctx->buf_table.imu_cache.elem_size ||

It could be quite a large allocation, let's not cache it if
it hasn't come from the cache for now. We can always improve
on top.

And can we invert how it's calculated? See below. You'll have
fewer calculations in the fast path, and I don't really like
users looking at ->elem_size when it's not necessary.


#define IO_CACHED_BVEC_SEGS	N

io_alloc_cache_init(&table->imu_cache, ...,
		    /* elem_size */ struct_size(imu, ..., IO_CACHED_BVEC_SEGS));

alloc(bvec_segs) {
	if (bvec_segs <= IO_CACHED_BVEC_SEGS)
		/* use the cache */
	...
}

free() {
	if (imu->nr_segs == IO_CACHED_BVEC_SEGS)
		/* return to cache */
	else {
		WARN_ON_ONCE(imu->nr_segs < IO_CACHED_BVEC_SEGS);
		...
	}
}

> +		    !io_alloc_cache_put(&ctx->buf_table.imu_cache, imu))
> +			kvfree(imu);
>  	}
>  }
>

On Fri, Feb 07, 2025 at 12:41:17PM +0000, Pavel Begunkov wrote:
> On 2/3/25 15:45, Keith Busch wrote:
> > From: Keith Busch <kbusch@kernel.org>
> >
> > Frequent alloc/free cycles on these is pretty costly. Use an io cache to
> > more efficiently reuse these buffers.
> >
> > Signed-off-by: Keith Busch <kbusch@kernel.org>
> > ---
> >  include/linux/io_uring_types.h |  16 ++---
> >  io_uring/filetable.c           |   2 +-
> >  io_uring/rsrc.c                | 108 ++++++++++++++++++++++++---------
> >  io_uring/rsrc.h                |   2 +-
> >  4 files changed, 92 insertions(+), 36 deletions(-)
> >
> > diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> > index aa661ebfd6568..c0e0c1f92e5b1 100644
> > --- a/include/linux/io_uring_types.h
> > +++ b/include/linux/io_uring_types.h
> > @@ -67,8 +67,17 @@ struct io_file_table {
> >  	unsigned int alloc_hint;
> >  };
> > +struct io_alloc_cache {
> > +	void **entries;
> > +	unsigned int nr_cached;
> > +	unsigned int max_cached;
> > +	size_t elem_size;
> > +};
> > +
> >  struct io_buf_table {
> >  	struct io_rsrc_data data;
> > +	struct io_alloc_cache node_cache;
> > +	struct io_alloc_cache imu_cache;
>
> We can avoid all churn if you kill patch 5/6 and put the
> caches directly into struct io_ring_ctx. It's a bit better for
> future cache improvements and we can even reuse the node cache
> for files.
>
> ...
> > diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
> > index 864c2eabf8efd..5434b0d992d62 100644
> > --- a/io_uring/rsrc.c
> > +++ b/io_uring/rsrc.c
> > @@ -117,23 +117,39 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
> >  			unpin_user_page(imu->bvec[i].bv_page);
> >  		if (imu->acct_pages)
> >  			io_unaccount_mem(ctx, imu->acct_pages);
> > -		kvfree(imu);
> > +		if (struct_size(imu, bvec, imu->nr_bvecs) >
> > +				ctx->buf_table.imu_cache.elem_size ||
>
> It could be quite a large allocation, let's not cache it if
> it hasn't come from the cache for now. We can always improve
> on top.

Eh? This already skips inserting into the cache if it wasn't allocated
out of the cache.

I picked an arbitrary size, 512b, as the threshold for caching. If you
need more bvecs than fit in that, it falls back to a kvmalloc/kvfree.
The allocation overhead is pretty insignificant when you're transferring
large payloads like that, and 14 vectors was chosen as the tipping point
because it fits in a nice round number.

> And can we invert how it's calculated? See below. You'll have
> fewer calculations in the fast path, and I don't really like
> users looking at ->elem_size when it's not necessary.
>
>
> #define IO_CACHED_BVEC_SEGS	N

Yah, that's fine.

On Fri, Feb 07, 2025 at 12:41:17PM +0000, Pavel Begunkov wrote:
> On 2/3/25 15:45, Keith Busch wrote:
> > +struct io_alloc_cache {
> > +	void **entries;
> > +	unsigned int nr_cached;
> > +	unsigned int max_cached;
> > +	size_t elem_size;
> > +};
> > +
> >  struct io_buf_table {
> >  	struct io_rsrc_data data;
> > +	struct io_alloc_cache node_cache;
> > +	struct io_alloc_cache imu_cache;
>
> We can avoid all churn if you kill patch 5/6 and put the
> caches directly into struct io_ring_ctx. It's a bit better for
> future cache improvements and we can even reuse the node cache
> for files.

I had it that way in an earlier version. The cache is tightly
connected to the buf table, though, so splitting them up makes for some
awkward cleanup. Grouping them together makes it clear their lifetimes
are as a single unit.

The filetable could have moved its bitmap into io_ring_ctx too, but it's
in its own structure like this, and it conceptually makes sense. This is
following in that same pattern.

On 2/7/25 15:33, Keith Busch wrote:
...
>>> diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
>>> index 864c2eabf8efd..5434b0d992d62 100644
>>> --- a/io_uring/rsrc.c
>>> +++ b/io_uring/rsrc.c
>>> @@ -117,23 +117,39 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
>>>  			unpin_user_page(imu->bvec[i].bv_page);
>>>  		if (imu->acct_pages)
>>>  			io_unaccount_mem(ctx, imu->acct_pages);
>>> -		kvfree(imu);
>>> +		if (struct_size(imu, bvec, imu->nr_bvecs) >
>>> +				ctx->buf_table.imu_cache.elem_size ||
>>
>> It could be quite a large allocation, let's not cache it if
>> it hasn't come from the cache for now. We can always improve
>> on top.
>
> Eh? This already skips inserting into the cache if it wasn't allocated
> out of the cache.

Ah yes, you're right, I overlooked it.

>
> I picked an arbitrary size, 512b, as the threshold for caching. If you
> need more bvecs than fit in that, it falls back to a kvmalloc/kvfree.
> The allocation overhead is pretty insignificant when you're transferring
> large payloads like that, and 14 vectors was chosen as the tipping point
> because it fits in a nice round number.
>
>> And can we invert how it's calculated? See below. You'll have
>> fewer calculations in the fast path, and I don't really like
>> users looking at ->elem_size when it's not necessary.
>>
>>
>> #define IO_CACHED_BVEC_SEGS	N
>
> Yah, that's fine.

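For reference, the inversion agreed on above could look roughly like the
following sketch. It reuses only the helpers already visible in this patch
(io_alloc_cache_init(), io_cache_alloc(), io_alloc_cache_put()); the value 14
follows Keith's 512-byte/14-vector rationale, and the names io_alloc_imu() and
io_free_imu() are illustrative, not code posted in this series.

#define IO_CACHED_BVEC_SEGS	14

/* cache elements are sized for IO_CACHED_BVEC_SEGS bvecs up front */
static int io_imu_cache_init(struct io_buf_table *table, unsigned nr)
{
	return io_alloc_cache_init(&table->imu_cache, nr,
				   struct_size_t(struct io_mapped_ubuf, bvec,
						 IO_CACHED_BVEC_SEGS));
}

/* small registrations come from the cache, large ones from kvmalloc() */
static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
					   int nr_bvecs)
{
	if (nr_bvecs <= IO_CACHED_BVEC_SEGS)
		return io_cache_alloc(&ctx->buf_table.imu_cache, GFP_KERNEL,
				      NULL);
	return kvmalloc(struct_size_t(struct io_mapped_ubuf, bvec, nr_bvecs),
			GFP_KERNEL);
}

/* only cache-sized imus go back to the cache; oversized ones are freed */
static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
{
	if (imu->nr_bvecs <= IO_CACHED_BVEC_SEGS &&
	    io_alloc_cache_put(&ctx->buf_table.imu_cache, imu))
		return;
	kvfree(imu);
}

With this shape, the fast path compares nr_bvecs against a compile-time
constant instead of reading ->elem_size, which is the point Pavel raises above.
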
On 2/7/25 15:59, Keith Busch wrote:
> On Fri, Feb 07, 2025 at 12:41:17PM +0000, Pavel Begunkov wrote:
>> On 2/3/25 15:45, Keith Busch wrote:
>>> +struct io_alloc_cache {
>>> +	void **entries;
>>> +	unsigned int nr_cached;
>>> +	unsigned int max_cached;
>>> +	size_t elem_size;
>>> +};
>>> +
>>>  struct io_buf_table {
>>>  	struct io_rsrc_data data;
>>> +	struct io_alloc_cache node_cache;
>>> +	struct io_alloc_cache imu_cache;
>>
>> We can avoid all churn if you kill patch 5/6 and put the
>> caches directly into struct io_ring_ctx. It's a bit better for
>> future cache improvements and we can even reuse the node cache
>> for files.
>
> I had it that way in an earlier version. The cache is tightly
> connected to the buf table, though, so splitting them up makes for some
> awkward cleanup. Grouping them together makes it clear their lifetimes
> are as a single unit.

I'd say it's tightly coupled with the context, since it depends on the
ctx sync, and the table doesn't really care where the memory comes from.
Anyway, there are basically two reasons. If we're adding a node cache,
we need to reuse it for files as well. And I'll likely need to pull it
all back into the ctx for cleaning up caches in general, and I'd rather
avoid such back and forths as it's not so great for backports and things
like that.

Is there a good reason why it needs to be in there?

> The filetable could have moved its bitmap into io_ring_ctx too, but it's
> in its own structure like this, and it conceptually makes sense. This is
> following in that same pattern.

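To make the placement question concrete, Pavel's alternative amounts to
something like the sketch below. The field placement in io_ring_ctx and the
single shared node cache are assumptions drawn from his comments, not code
from this series, and the surrounding fields are elided.

/* caches hang off the context and are shared by all resource types */
struct io_ring_ctx {
	/* ... existing fields ... */
	struct io_buf_table	buf_table;
	struct io_alloc_cache	node_cache;
	struct io_alloc_cache	imu_cache;
	/* ... */
};

/* io_rsrc_node_alloc() might then look like this */
static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
{
	struct io_rsrc_node *node;

	/* file and buffer nodes draw from the same ctx-level cache */
	node = io_cache_alloc(&ctx->node_cache, GFP_KERNEL, NULL);
	if (node) {
		node->type = type;
		node->refs = 1;
		node->tag = 0;
		node->file_ptr = 0;
	}
	return node;
}
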
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index aa661ebfd6568..c0e0c1f92e5b1 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -67,8 +67,17 @@ struct io_file_table {
 	unsigned int alloc_hint;
 };
 
+struct io_alloc_cache {
+	void **entries;
+	unsigned int nr_cached;
+	unsigned int max_cached;
+	size_t elem_size;
+};
+
 struct io_buf_table {
 	struct io_rsrc_data data;
+	struct io_alloc_cache node_cache;
+	struct io_alloc_cache imu_cache;
 };
 
 struct io_hash_bucket {
@@ -222,13 +231,6 @@ struct io_submit_state {
 	struct blk_plug plug;
 };
 
-struct io_alloc_cache {
-	void **entries;
-	unsigned int nr_cached;
-	unsigned int max_cached;
-	size_t elem_size;
-};
-
 struct io_ring_ctx {
 	/* const or read-mostly hot data */
 	struct {
diff --git a/io_uring/filetable.c b/io_uring/filetable.c
index dd8eeec97acf6..a21660e3145ab 100644
--- a/io_uring/filetable.c
+++ b/io_uring/filetable.c
@@ -68,7 +68,7 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
 	if (slot_index >= ctx->file_table.data.nr)
 		return -EINVAL;
 
-	node = io_rsrc_node_alloc(IORING_RSRC_FILE);
+	node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
 	if (!node)
 		return -ENOMEM;
 
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 864c2eabf8efd..5434b0d992d62 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -117,23 +117,39 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
 			unpin_user_page(imu->bvec[i].bv_page);
 		if (imu->acct_pages)
 			io_unaccount_mem(ctx, imu->acct_pages);
-		kvfree(imu);
+		if (struct_size(imu, bvec, imu->nr_bvecs) >
+				ctx->buf_table.imu_cache.elem_size ||
+		    !io_alloc_cache_put(&ctx->buf_table.imu_cache, imu))
+			kvfree(imu);
 	}
 }
 
-struct io_rsrc_node *io_rsrc_node_alloc(int type)
+struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
 {
 	struct io_rsrc_node *node;
 
-	node = kzalloc(sizeof(*node), GFP_KERNEL);
+	if (type == IORING_RSRC_FILE)
+		node = kmalloc(sizeof(*node), GFP_KERNEL);
+	else
+		node = io_cache_alloc(&ctx->buf_table.node_cache, GFP_KERNEL, NULL);
 	if (node) {
 		node->type = type;
 		node->refs = 1;
+		node->tag = 0;
+		node->file_ptr = 0;
 	}
 	return node;
 }
 
-__cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data)
+static __cold void __io_rsrc_data_free(struct io_rsrc_data *data)
+{
+	kvfree(data->nodes);
+	data->nodes = NULL;
+	data->nr = 0;
+}
+
+__cold void io_rsrc_data_free(struct io_ring_ctx *ctx,
+			      struct io_rsrc_data *data)
 {
 	if (!data->nr)
 		return;
@@ -141,9 +157,7 @@ __cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data
 		if (data->nodes[data->nr])
 			io_put_rsrc_node(ctx, data->nodes[data->nr]);
 	}
-	kvfree(data->nodes);
-	data->nodes = NULL;
-	data->nr = 0;
+	__io_rsrc_data_free(data);
 }
 
 __cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
@@ -157,6 +171,31 @@ __cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
 	return -ENOMEM;
 }
 
+static __cold int io_rsrc_buffer_alloc(struct io_buf_table *table, unsigned nr)
+{
+	int ret;
+
+	ret = io_rsrc_data_alloc(&table->data, nr);
+	if (ret)
+		return ret;
+
+	ret = io_alloc_cache_init(&table->node_cache, nr,
+				  sizeof(struct io_rsrc_node));
+	if (ret)
+		goto out_1;
+
+	ret = io_alloc_cache_init(&table->imu_cache, nr, 512);
+	if (ret)
+		goto out_2;
+
+	return 0;
+out_2:
+	io_alloc_cache_free(&table->node_cache, kfree);
+out_1:
+	__io_rsrc_data_free(&table->data);
+	return ret;
+}
+
 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 				 struct io_uring_rsrc_update2 *up,
 				 unsigned nr_args)
@@ -206,7 +245,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 			err = -EBADF;
 			break;
 		}
-		node = io_rsrc_node_alloc(IORING_RSRC_FILE);
+		node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
 		if (!node) {
 			err = -ENOMEM;
 			fput(file);
@@ -466,6 +505,8 @@ void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
 	case IORING_RSRC_KBUF:
 		if (node->buf)
 			io_buffer_unmap(ctx, node);
+		if (io_alloc_cache_put(&ctx->buf_table.node_cache, node))
+			return;
 		break;
 	default:
 		WARN_ON_ONCE(1);
@@ -534,7 +575,7 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 			goto fail;
 		}
 		ret = -ENOMEM;
-		node = io_rsrc_node_alloc(IORING_RSRC_FILE);
+		node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
 		if (!node) {
 			fput(file);
 			goto fail;
@@ -554,11 +595,19 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 	return ret;
 }
 
+static void io_rsrc_buffer_free(struct io_ring_ctx *ctx,
+				struct io_buf_table *table)
+{
+	io_rsrc_data_free(ctx, &table->data);
+	io_alloc_cache_free(&table->node_cache, kfree);
+	io_alloc_cache_free(&table->imu_cache, kfree);
+}
+
 int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
 {
 	if (!ctx->buf_table.data.nr)
 		return -ENXIO;
-	io_rsrc_data_free(ctx, &ctx->buf_table.data);
+	io_rsrc_buffer_free(ctx, &ctx->buf_table);
 	return 0;
 }
 
@@ -739,7 +788,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
 	if (!iov->iov_base)
 		return NULL;
 
-	node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
+	node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
 	if (!node)
 		return ERR_PTR(-ENOMEM);
 	node->buf = NULL;
@@ -759,7 +808,10 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
 		coalesced = io_coalesce_buffer(&pages, &nr_pages, &data);
 	}
 
-	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
+	if (struct_size(imu, bvec, nr_pages) > ctx->buf_table.imu_cache.elem_size)
+		imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
+	else
+		imu = io_cache_alloc(&ctx->buf_table.imu_cache, GFP_KERNEL, NULL);
 	if (!imu)
 		goto done;
 
@@ -805,9 +857,9 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 			    unsigned int nr_args, u64 __user *tags)
 {
 	struct page *last_hpage = NULL;
-	struct io_rsrc_data data;
 	struct iovec fast_iov, *iov = &fast_iov;
 	const struct iovec __user *uvec;
+	struct io_buf_table table;
 	int i, ret;
 
 	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
@@ -816,13 +868,14 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 		return -EBUSY;
 	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
 		return -EINVAL;
-	ret = io_rsrc_data_alloc(&data, nr_args);
+	ret = io_rsrc_buffer_alloc(&table, nr_args);
 	if (ret)
 		return ret;
 
 	if (!arg)
 		memset(iov, 0, sizeof(*iov));
 
+	ctx->buf_table = table;
 	for (i = 0; i < nr_args; i++) {
 		struct io_rsrc_node *node;
 		u64 tag = 0;
@@ -862,10 +915,8 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 			}
 			node->tag = tag;
 		}
-		data.nodes[i] = node;
+		table.data.nodes[i] = node;
 	}
-
-	ctx->buf_table.data = data;
 	if (ret)
 		io_sqe_buffers_unregister(ctx);
 	return ret;
@@ -878,11 +929,14 @@ static struct io_rsrc_node *io_buffer_alloc_node(struct io_ring_ctx *ctx,
 	struct io_mapped_ubuf *imu;
 	struct io_rsrc_node *node;
 
-	node = io_rsrc_node_alloc(IORING_RSRC_KBUF);
+	node = io_rsrc_node_alloc(ctx, IORING_RSRC_KBUF);
 	if (!node)
 		return NULL;
 
-	imu = kvmalloc(struct_size(imu, bvec, nr_bvecs), GFP_KERNEL);
+	if (struct_size(imu, bvec, nr_bvecs) > ctx->buf_table.imu_cache.elem_size)
+		imu = kvmalloc(struct_size(imu, bvec, nr_bvecs), GFP_KERNEL);
+	else
+		imu = io_cache_alloc(&ctx->buf_table.imu_cache, GFP_KERNEL, NULL);
 	if (!imu) {
 		io_put_rsrc_node(ctx, node);
 		return NULL;
 	}
@@ -1036,7 +1090,7 @@ static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2)
 static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
 			    struct io_uring_clone_buffers *arg)
 {
-	struct io_rsrc_data data;
+	struct io_buf_table table;
 	int i, ret, off, nr;
 	unsigned int nbufs;
 
@@ -1067,7 +1121,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 	if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
 		return -EOVERFLOW;
 
-	ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.data.nr));
+	ret = io_rsrc_buffer_alloc(&table, max(nbufs, ctx->buf_table.data.nr));
 	if (ret)
 		return ret;
 
@@ -1076,7 +1130,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 		struct io_rsrc_node *src_node = ctx->buf_table.data.nodes[i];
 
 		if (src_node) {
-			data.nodes[i] = src_node;
+			table.data.nodes[i] = src_node;
 			src_node->refs++;
 		}
 	}
@@ -1106,7 +1160,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 		if (!src_node) {
 			dst_node = NULL;
 		} else {
-			dst_node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
+			dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
 			if (!dst_node) {
 				ret = -ENOMEM;
 				goto out_free;
@@ -1115,12 +1169,12 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 			refcount_inc(&src_node->buf->refs);
 			dst_node->buf = src_node->buf;
 		}
-		data.nodes[off++] = dst_node;
+		table.data.nodes[off++] = dst_node;
 		i++;
 	}
 
 	/*
-	 * If asked for replace, put the old table. data->nodes[] holds both
+	 * If asked for replace, put the old table. table.data->nodes[] holds both
 	 * old and new nodes at this point.
 	 */
 	if (arg->flags & IORING_REGISTER_DST_REPLACE)
@@ -1133,10 +1187,10 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 	 * entry).
 	 */
 	WARN_ON_ONCE(ctx->buf_table.data.nr);
-	ctx->buf_table.data = data;
+	ctx->buf_table = table;
 	return 0;
 out_free:
-	io_rsrc_data_free(ctx, &data);
+	io_rsrc_buffer_free(ctx, &table);
 	return ret;
 }
 
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index d1d90d9cd2b43..759ac373b0dc6 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -46,7 +46,7 @@ struct io_imu_folio_data {
 	unsigned int nr_folios;
 };
 
-struct io_rsrc_node *io_rsrc_node_alloc(int type);
+struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type);
 void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node);
 void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data);
 int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr);
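For readers unfamiliar with the io_uring allocation cache, the calls used
throughout this patch fit together roughly as shown below. This is a condensed
sketch mirroring the call sites above; the wrapper function name is
hypothetical, and locking and meaningful error handling are elided.

static int io_buf_table_cache_demo(struct io_buf_table *table, unsigned nr)
{
	struct io_rsrc_node *node;
	int ret;

	/* reserve room for up to nr cached elements of a fixed size */
	ret = io_alloc_cache_init(&table->node_cache, nr,
				  sizeof(struct io_rsrc_node));
	if (ret)
		return ret;

	/* pops a cached element, or allocates a fresh one when the cache is empty */
	node = io_cache_alloc(&table->node_cache, GFP_KERNEL, NULL);
	if (node) {
		/* on release, try to stash the element for reuse ... */
		if (!io_alloc_cache_put(&table->node_cache, node))
			kfree(node);	/* ... and free it if the cache is full */
	}

	/* teardown frees whatever is still cached via the given callback */
	io_alloc_cache_free(&table->node_cache, kfree);
	return 0;
}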