Message ID | 841b4d5b039b9db84aa1e1415a6d249ea57646f6.1741014186.git.asml.silence@gmail.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | Add support for vectored registered buffers | expand |
On Mon, Mar 3, 2025 at 7:51 AM Pavel Begunkov <asml.silence@gmail.com> wrote: > > Add io_import_reg_vec(), which will be responsible for importing > vectored registered buffers. iovecs are overlapped with the resulting > bvec in memory, which is why the iovec is expected to be padded in > iou_vec. > > Signed-off-by: Pavel Begunkov <asml.silence@gmail.com> > --- > include/linux/io_uring_types.h | 5 +- > io_uring/rsrc.c | 124 +++++++++++++++++++++++++++++++++ > io_uring/rsrc.h | 5 ++ > 3 files changed, 133 insertions(+), 1 deletion(-) > > diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h > index 9101f12d21ef..b770a2b12da6 100644 > --- a/include/linux/io_uring_types.h > +++ b/include/linux/io_uring_types.h > @@ -111,7 +111,10 @@ struct io_uring_task { > }; > > struct iou_vec { > - struct iovec *iovec; > + union { > + struct iovec *iovec; > + struct bio_vec *bvec; > + }; If I understand correctly, io_import_reg_vec() converts the iovecs to bio_vecs in place. If an iovec expands to more than one bio_vec (i.e. crosses a folio boundary), wouldn't the bio_vecs overwrite iovecs that hadn't been processed yet? 
> unsigned nr; > }; > > diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c > index 9b05e614819e..1ec1f5b3e385 100644 > --- a/io_uring/rsrc.c > +++ b/io_uring/rsrc.c > @@ -1267,9 +1267,133 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg) > > void io_vec_free(struct iou_vec *iv) > { > + BUILD_BUG_ON(sizeof(struct bio_vec) > sizeof(struct iovec)); > + > if (!iv->iovec) > return; > kfree(iv->iovec); > iv->iovec = NULL; > iv->nr = 0; > } > + > +int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries) > +{ > + struct iovec *iov; > + > + WARN_ON_ONCE(nr_entries <= 0); > + > + iov = kmalloc_array(nr_entries, sizeof(iov[0]), GFP_KERNEL); > + if (!iov) > + return -ENOMEM; > + > + io_vec_free(iv); > + iv->iovec = iov; > + iv->nr = nr_entries; > + return 0; > +} > + > +static int io_vec_fill_bvec(int ddir, struct iov_iter *iter, > + struct io_mapped_ubuf *imu, > + struct iovec *iovec, int nr_iovs, > + struct iou_vec *vec) > +{ > + unsigned long folio_size = (1 << imu->folio_shift); > + unsigned long folio_mask = folio_size - 1; > + struct bio_vec *res_bvec = vec->bvec; > + size_t total_len = 0; > + int bvec_idx = 0; > + int iov_idx; > + > + for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) { > + size_t iov_len = iovec[iov_idx].iov_len; > + u64 buf_addr = (u64)iovec[iov_idx].iov_base; > + u64 folio_addr = imu->ubuf & ~folio_mask; The computation of folio_addr could be moved out of the loop. 
> + struct bio_vec *src_bvec; > + size_t offset; > + u64 buf_end; > + > + if (unlikely(check_add_overflow(buf_addr, (u64)iov_len, &buf_end))) > + return -EFAULT; > + if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len))) > + return -EFAULT; > + > + total_len += iov_len; > + /* by using folio address it also accounts for bvec offset */ > + offset = buf_addr - folio_addr; > + src_bvec = imu->bvec + (offset >> imu->folio_shift); > + offset &= folio_mask; > + > + for (; iov_len; offset = 0, bvec_idx++, src_bvec++) { > + size_t seg_size = min_t(size_t, iov_len, > + folio_size - offset); > + > + res_bvec[bvec_idx].bv_page = src_bvec->bv_page; > + res_bvec[bvec_idx].bv_offset = offset; > + res_bvec[bvec_idx].bv_len = seg_size; Could just increment res_bvec to avoid the variable bvec_idx? > + iov_len -= seg_size; > + } > + } > + if (total_len > MAX_RW_COUNT) > + return -EINVAL; > + > + iov_iter_bvec(iter, ddir, res_bvec, bvec_idx, total_len); > + return 0; > +} > + > +static int io_estimate_bvec_size(struct iovec *iov, unsigned nr_iovs, > + struct io_mapped_ubuf *imu) > +{ > + unsigned shift = imu->folio_shift; > + size_t max_segs = 0; > + unsigned i; > + > + for (i = 0; i < nr_iovs; i++) > + max_segs += (iov[i].iov_len >> shift) + 2; Seems like this may overestimate a bit. I think something like this would give the exact number of segments for each iovec? 
(((u64)iov_base & folio_mask) + iov_len + folio_mask) >> folio_shift > + return max_segs; > +} > + > +int io_import_reg_vec(int ddir, struct iov_iter *iter, > + struct io_kiocb *req, struct iou_vec *vec, > + int nr_iovs, unsigned iovec_off, > + unsigned issue_flags) > +{ > + struct io_rsrc_node *node; > + struct io_mapped_ubuf *imu; > + unsigned cache_nr; > + struct iovec *iov; > + unsigned nr_segs; > + int ret; > + > + node = io_find_buf_node(req, issue_flags); > + if (!node) > + return -EFAULT; > + imu = node->buf; > + if (imu->is_kbuf) > + return -EOPNOTSUPP; > + > + iov = vec->iovec + iovec_off; > + ret = io_estimate_bvec_size(iov, nr_iovs, imu); > + if (ret < 0) > + return ret; io_estimate_bvec_size() doesn't (intentionally) return an error code, just an unsigned value cast to an int. Best, Caleb > + nr_segs = ret; > + cache_nr = vec->nr; > + > + if (WARN_ON_ONCE(iovec_off + nr_iovs != cache_nr) || > + nr_segs > cache_nr) { > + struct iou_vec tmp_vec = {}; > + > + ret = io_vec_realloc(&tmp_vec, nr_segs); > + if (ret) > + return ret; > + > + iovec_off = tmp_vec.nr - nr_iovs; > + memcpy(tmp_vec.iovec + iovec_off, iov, sizeof(*iov) * nr_iovs); > + io_vec_free(vec); > + > + *vec = tmp_vec; > + iov = vec->iovec + iovec_off; > + req->flags |= REQ_F_NEED_CLEANUP; > + } > + > + return io_vec_fill_bvec(ddir, iter, imu, iov, nr_iovs, vec); > +} > diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h > index e3f1cfb2ff7b..769ef5d76a4b 100644 > --- a/io_uring/rsrc.h > +++ b/io_uring/rsrc.h > @@ -61,6 +61,10 @@ struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req, > int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter, > u64 buf_addr, size_t len, int ddir, > unsigned issue_flags); > +int io_import_reg_vec(int ddir, struct iov_iter *iter, > + struct io_kiocb *req, struct iou_vec *vec, > + int nr_iovs, unsigned iovec_off, > + unsigned issue_flags); > > int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg); > int io_sqe_buffers_unregister(struct 
io_ring_ctx *ctx); > @@ -146,6 +150,7 @@ static inline void __io_unaccount_mem(struct user_struct *user, > } > > void io_vec_free(struct iou_vec *iv); > +int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries); > > static inline void io_vec_reset_iovec(struct iou_vec *iv, > struct iovec *iovec, unsigned nr) > -- > 2.48.1 > >
On Mon, Mar 03, 2025 at 12:49:38PM -0800, Caleb Sander Mateos wrote: > > + u64 buf_end; > > + > > + if (unlikely(check_add_overflow(buf_addr, (u64)iov_len, &buf_end))) > > + return -EFAULT; > > + if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len))) > > + return -EFAULT; > > + > > + total_len += iov_len; > > + /* by using folio address it also accounts for bvec offset */ > > + offset = buf_addr - folio_addr; > > + src_bvec = imu->bvec + (offset >> imu->folio_shift); > > + offset &= folio_mask; > > + > > + for (; iov_len; offset = 0, bvec_idx++, src_bvec++) { > > + size_t seg_size = min_t(size_t, iov_len, > > + folio_size - offset); > > + > > + res_bvec[bvec_idx].bv_page = src_bvec->bv_page; > > + res_bvec[bvec_idx].bv_offset = offset; > > + res_bvec[bvec_idx].bv_len = seg_size; > > Could just increment res_bvec to avoid the variable bvec_idx? And utilizing bvec_set_page() to initialize looks a bit cleaner too.
On 3/3/25 20:49, Caleb Sander Mateos wrote: > On Mon, Mar 3, 2025 at 7:51 AM Pavel Begunkov <asml.silence@gmail.com> wrote: ... > > If I understand correctly, io_import_reg_vec() converts the iovecs to > bio_vecs in place. If an iovec expands to more than one bio_vec (i.e. > crosses a folio boundary), wouldn't the bio_vecs overwrite iovecs that > hadn't been processed yet? It's handled, obviously, you missed that the vectors are offset'ed from each other. > >> diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c >> index 9b05e614819e..1ec1f5b3e385 100644 >> --- a/io_uring/rsrc.c >> +++ b/io_uring/rsrc.c ... >> + for (; iov_len; offset = 0, bvec_idx++, src_bvec++) { >> + size_t seg_size = min_t(size_t, iov_len, >> + folio_size - offset); >> + >> + res_bvec[bvec_idx].bv_page = src_bvec->bv_page; >> + res_bvec[bvec_idx].bv_offset = offset; >> + res_bvec[bvec_idx].bv_len = seg_size; > > Could just increment res_bvec to avoid the variable bvec_idx? I don't see the benefit. >> + for (i = 0; i < nr_iovs; i++) >> + max_segs += (iov[i].iov_len >> shift) + 2; > > Sees like this may overestimate a bit. I think something like this > would give the exact number of segments for each iovec? > (((u64)iov_base & folio_mask) + iov_len + folio_mask) >> folio_shift It's overestimated exactly to avoid a beast like this.
On 3/4/25 10:05, Pavel Begunkov wrote: > On 3/3/25 20:49, Caleb Sander Mateos wrote: >> On Mon, Mar 3, 2025 at 7:51 AM Pavel Begunkov <asml.silence@gmail.com> wrote: > ... >>> + for (i = 0; i < nr_iovs; i++) >>> + max_segs += (iov[i].iov_len >> shift) + 2; >> >> Sees like this may overestimate a bit. I think something like this >> would give the exact number of segments for each iovec? >> (((u64)iov_base & folio_mask) + iov_len + folio_mask) >> folio_shift > > It's overestimated exactly to avoid a beast like this. And it'd be broken as well for 0 len.
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 9101f12d21ef..b770a2b12da6 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -111,7 +111,10 @@ struct io_uring_task { }; struct iou_vec { - struct iovec *iovec; + union { + struct iovec *iovec; + struct bio_vec *bvec; + }; unsigned nr; }; diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 9b05e614819e..1ec1f5b3e385 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1267,9 +1267,133 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg) void io_vec_free(struct iou_vec *iv) { + BUILD_BUG_ON(sizeof(struct bio_vec) > sizeof(struct iovec)); + if (!iv->iovec) return; kfree(iv->iovec); iv->iovec = NULL; iv->nr = 0; } + +int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries) +{ + struct iovec *iov; + + WARN_ON_ONCE(nr_entries <= 0); + + iov = kmalloc_array(nr_entries, sizeof(iov[0]), GFP_KERNEL); + if (!iov) + return -ENOMEM; + + io_vec_free(iv); + iv->iovec = iov; + iv->nr = nr_entries; + return 0; +} + +static int io_vec_fill_bvec(int ddir, struct iov_iter *iter, + struct io_mapped_ubuf *imu, + struct iovec *iovec, int nr_iovs, + struct iou_vec *vec) +{ + unsigned long folio_size = (1 << imu->folio_shift); + unsigned long folio_mask = folio_size - 1; + struct bio_vec *res_bvec = vec->bvec; + size_t total_len = 0; + int bvec_idx = 0; + int iov_idx; + + for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) { + size_t iov_len = iovec[iov_idx].iov_len; + u64 buf_addr = (u64)iovec[iov_idx].iov_base; + u64 folio_addr = imu->ubuf & ~folio_mask; + struct bio_vec *src_bvec; + size_t offset; + u64 buf_end; + + if (unlikely(check_add_overflow(buf_addr, (u64)iov_len, &buf_end))) + return -EFAULT; + if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len))) + return -EFAULT; + + total_len += iov_len; + /* by using folio address it also accounts for bvec offset */ + offset = buf_addr - folio_addr; + src_bvec = imu->bvec + (offset 
>> imu->folio_shift); + offset &= folio_mask; + + for (; iov_len; offset = 0, bvec_idx++, src_bvec++) { + size_t seg_size = min_t(size_t, iov_len, + folio_size - offset); + + res_bvec[bvec_idx].bv_page = src_bvec->bv_page; + res_bvec[bvec_idx].bv_offset = offset; + res_bvec[bvec_idx].bv_len = seg_size; + iov_len -= seg_size; + } + } + if (total_len > MAX_RW_COUNT) + return -EINVAL; + + iov_iter_bvec(iter, ddir, res_bvec, bvec_idx, total_len); + return 0; +} + +static int io_estimate_bvec_size(struct iovec *iov, unsigned nr_iovs, + struct io_mapped_ubuf *imu) +{ + unsigned shift = imu->folio_shift; + size_t max_segs = 0; + unsigned i; + + for (i = 0; i < nr_iovs; i++) + max_segs += (iov[i].iov_len >> shift) + 2; + return max_segs; +} + +int io_import_reg_vec(int ddir, struct iov_iter *iter, + struct io_kiocb *req, struct iou_vec *vec, + int nr_iovs, unsigned iovec_off, + unsigned issue_flags) +{ + struct io_rsrc_node *node; + struct io_mapped_ubuf *imu; + unsigned cache_nr; + struct iovec *iov; + unsigned nr_segs; + int ret; + + node = io_find_buf_node(req, issue_flags); + if (!node) + return -EFAULT; + imu = node->buf; + if (imu->is_kbuf) + return -EOPNOTSUPP; + + iov = vec->iovec + iovec_off; + ret = io_estimate_bvec_size(iov, nr_iovs, imu); + if (ret < 0) + return ret; + nr_segs = ret; + cache_nr = vec->nr; + + if (WARN_ON_ONCE(iovec_off + nr_iovs != cache_nr) || + nr_segs > cache_nr) { + struct iou_vec tmp_vec = {}; + + ret = io_vec_realloc(&tmp_vec, nr_segs); + if (ret) + return ret; + + iovec_off = tmp_vec.nr - nr_iovs; + memcpy(tmp_vec.iovec + iovec_off, iov, sizeof(*iov) * nr_iovs); + io_vec_free(vec); + + *vec = tmp_vec; + iov = vec->iovec + iovec_off; + req->flags |= REQ_F_NEED_CLEANUP; + } + + return io_vec_fill_bvec(ddir, iter, imu, iov, nr_iovs, vec); +} diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index e3f1cfb2ff7b..769ef5d76a4b 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -61,6 +61,10 @@ struct io_rsrc_node *io_find_buf_node(struct 
io_kiocb *req, int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter, u64 buf_addr, size_t len, int ddir, unsigned issue_flags); +int io_import_reg_vec(int ddir, struct iov_iter *iter, + struct io_kiocb *req, struct iou_vec *vec, + int nr_iovs, unsigned iovec_off, + unsigned issue_flags); int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg); int io_sqe_buffers_unregister(struct io_ring_ctx *ctx); @@ -146,6 +150,7 @@ static inline void __io_unaccount_mem(struct user_struct *user, } void io_vec_free(struct iou_vec *iv); +int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries); static inline void io_vec_reset_iovec(struct iou_vec *iv, struct iovec *iovec, unsigned nr)
Add io_import_reg_vec(), which will be responsible for importing vectored registered buffers. iovecs are overlapped with the resulting bvec in memory, which is why the iovec is expected to be padded in iou_vec. Signed-off-by: Pavel Begunkov <asml.silence@gmail.com> --- include/linux/io_uring_types.h | 5 +- io_uring/rsrc.c | 124 +++++++++++++++++++++++++++++++++ io_uring/rsrc.h | 5 ++ 3 files changed, 133 insertions(+), 1 deletion(-)