Message ID | b054a88092767f7767f8447e7a5bdab15fcc0759.1741102644.git.asml.silence@gmail.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | Add support for vectored registered buffers | expand |
On 3/4/25 15:40, Pavel Begunkov wrote: > Add io_import_reg_vec(), which will be responsible for importing > vectored registered buffers. iovecs are overlapped with the resulting > bvec in memory, which is why the iovec is expected to be padded in > iou_vec. > > Signed-off-by: Pavel Begunkov <asml.silence@gmail.com> > --- ... > +int io_import_reg_vec(int ddir, struct iov_iter *iter, > + struct io_kiocb *req, struct iou_vec *vec, > + unsigned nr_iovs, unsigned iovec_off, > + unsigned issue_flags) > +{ > + struct io_rsrc_node *node; > + struct io_mapped_ubuf *imu; > + struct iovec *iov; > + unsigned nr_segs; > + > + node = io_find_buf_node(req, issue_flags); > + if (!node) > + return -EFAULT; > + imu = node->buf; > + if (imu->is_kbuf) > + return -EOPNOTSUPP; > + if (!(imu->dir & (1 << ddir))) > + return -EFAULT; > + > + iov = vec->iovec + iovec_off; > + nr_segs = io_estimate_bvec_size(iov, nr_iovs, imu); if (sizeof(struct bio_vec) > sizeof(struct iovec)) { size_t entry_sz = sizeof(struct iovec); size_t bvec_bytes = nr_segs * sizeof(struct bio_vec); size_t iovec_off = (bvec_bytes + entry_sz - 1) / entry_sz; nr_segs += iovec_off; } How about fixing it up like this for now? Instead of overlapping bvec with iovec, it'd put them back to back and waste some memory on 32bit. I can try to make it a bit tighter, remove the if and let the compiler optimise it into a no-op for x64, or allocate max(bvec, iovec) * nr and see where it leads. But either way IMHO it's better to be left until I get more time. 
> + > + if (WARN_ON_ONCE(iovec_off + nr_iovs != vec->nr) || > + nr_segs > vec->nr) { > + struct iou_vec tmp_vec = {}; > + int ret; > + > + ret = io_vec_realloc(&tmp_vec, nr_segs); > + if (ret) > + return ret; > + > + iovec_off = tmp_vec.nr - nr_iovs; > + memcpy(tmp_vec.iovec + iovec_off, iov, sizeof(*iov) * nr_iovs); > + io_vec_free(vec); > + > + *vec = tmp_vec; > + iov = vec->iovec + iovec_off; > + req->flags |= REQ_F_NEED_CLEANUP; > + } > + > + return io_vec_fill_bvec(ddir, iter, imu, iov, nr_iovs, vec); > +}
On 3/7/25 7:07 AM, Pavel Begunkov wrote: > On 3/4/25 15:40, Pavel Begunkov wrote: >> Add io_import_reg_vec(), which will be responsible for importing >> vectored registered buffers. iovecs are overlapped with the resulting >> bvec in memory, which is why the iovec is expected to be padded in >> iou_vec. >> >> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com> >> --- > ... >> +int io_import_reg_vec(int ddir, struct iov_iter *iter, >> + struct io_kiocb *req, struct iou_vec *vec, >> + unsigned nr_iovs, unsigned iovec_off, >> + unsigned issue_flags) >> +{ >> + struct io_rsrc_node *node; >> + struct io_mapped_ubuf *imu; >> + struct iovec *iov; >> + unsigned nr_segs; >> + >> + node = io_find_buf_node(req, issue_flags); >> + if (!node) >> + return -EFAULT; >> + imu = node->buf; >> + if (imu->is_kbuf) >> + return -EOPNOTSUPP; >> + if (!(imu->dir & (1 << ddir))) >> + return -EFAULT; >> + >> + iov = vec->iovec + iovec_off; >> + nr_segs = io_estimate_bvec_size(iov, nr_iovs, imu); > > if (sizeof(struct bio_vec) > sizeof(struct iovec)) { > size_t entry_sz = sizeof(struct iovec); > size_t bvec_bytes = nr_segs * sizeof(struct bio_vec); > size_t iovec_off = (bvec_bytes + entry_sz - 1) / entry_sz; > > nr_segs += iovec_off; > } > > How about fixing it up like this for now? Instead of overlapping > bvec with iovec, it'd put them back to back and waste some memory > on 32bit. > > I can try to make it a bit tighter, remove the if and let > the compiler optimise it into a no-op for x64, or allocate > max(bvec, iovec) * nr and see where it leads. But either > way IMHO it's better to be left until I get more time. I think that looks reasonable. Nobody cares about this for 32-bit, outside of needing to work, obviously.
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 9101f12d21ef..b770a2b12da6 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -111,7 +111,10 @@ struct io_uring_task { }; struct iou_vec { - struct iovec *iovec; + union { + struct iovec *iovec; + struct bio_vec *bvec; + }; unsigned nr; }; diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 9b05e614819e..38743886bbf4 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1267,9 +1267,131 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg) void io_vec_free(struct iou_vec *iv) { + BUILD_BUG_ON(sizeof(struct bio_vec) > sizeof(struct iovec)); + if (!iv->iovec) return; kfree(iv->iovec); iv->iovec = NULL; iv->nr = 0; } + +int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries) +{ + gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; + struct iovec *iov; + + iov = kmalloc_array(nr_entries, sizeof(iov[0]), gfp); + if (!iov) + return -ENOMEM; + + io_vec_free(iv); + iv->iovec = iov; + iv->nr = nr_entries; + return 0; +} + +static int io_vec_fill_bvec(int ddir, struct iov_iter *iter, + struct io_mapped_ubuf *imu, + struct iovec *iovec, unsigned nr_iovs, + struct iou_vec *vec) +{ + unsigned long folio_size = 1 << imu->folio_shift; + unsigned long folio_mask = folio_size - 1; + u64 folio_addr = imu->ubuf & ~folio_mask; + struct bio_vec *res_bvec = vec->bvec; + size_t total_len = 0; + unsigned bvec_idx = 0; + unsigned iov_idx; + + for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) { + size_t iov_len = iovec[iov_idx].iov_len; + u64 buf_addr = (u64)iovec[iov_idx].iov_base; + struct bio_vec *src_bvec; + size_t offset; + u64 buf_end; + + if (unlikely(check_add_overflow(buf_addr, (u64)iov_len, &buf_end))) + return -EFAULT; + if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len))) + return -EFAULT; + if (unlikely(!iov_len)) + return -EFAULT; + if (unlikely(check_add_overflow(total_len, iov_len, &total_len))) + return -EOVERFLOW; + + /* by 
using folio address it also accounts for bvec offset */ + offset = buf_addr - folio_addr; + src_bvec = imu->bvec + (offset >> imu->folio_shift); + offset &= folio_mask; + + for (; iov_len; offset = 0, bvec_idx++, src_bvec++) { + size_t seg_size = min_t(size_t, iov_len, + folio_size - offset); + + bvec_set_page(&res_bvec[bvec_idx], + src_bvec->bv_page, seg_size, offset); + iov_len -= seg_size; + } + } + if (total_len > MAX_RW_COUNT) + return -EINVAL; + + iov_iter_bvec(iter, ddir, res_bvec, bvec_idx, total_len); + return 0; +} + +static int io_estimate_bvec_size(struct iovec *iov, unsigned nr_iovs, + struct io_mapped_ubuf *imu) +{ + unsigned shift = imu->folio_shift; + size_t max_segs = 0; + unsigned i; + + for (i = 0; i < nr_iovs; i++) + max_segs += (iov[i].iov_len >> shift) + 2; + return max_segs; +} + +int io_import_reg_vec(int ddir, struct iov_iter *iter, + struct io_kiocb *req, struct iou_vec *vec, + unsigned nr_iovs, unsigned iovec_off, + unsigned issue_flags) +{ + struct io_rsrc_node *node; + struct io_mapped_ubuf *imu; + struct iovec *iov; + unsigned nr_segs; + + node = io_find_buf_node(req, issue_flags); + if (!node) + return -EFAULT; + imu = node->buf; + if (imu->is_kbuf) + return -EOPNOTSUPP; + if (!(imu->dir & (1 << ddir))) + return -EFAULT; + + iov = vec->iovec + iovec_off; + nr_segs = io_estimate_bvec_size(iov, nr_iovs, imu); + + if (WARN_ON_ONCE(iovec_off + nr_iovs != vec->nr) || + nr_segs > vec->nr) { + struct iou_vec tmp_vec = {}; + int ret; + + ret = io_vec_realloc(&tmp_vec, nr_segs); + if (ret) + return ret; + + iovec_off = tmp_vec.nr - nr_iovs; + memcpy(tmp_vec.iovec + iovec_off, iov, sizeof(*iov) * nr_iovs); + io_vec_free(vec); + + *vec = tmp_vec; + iov = vec->iovec + iovec_off; + req->flags |= REQ_F_NEED_CLEANUP; + } + + return io_vec_fill_bvec(ddir, iter, imu, iov, nr_iovs, vec); +} diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index ff78ead6bc75..f1496f7d844f 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -62,6 +62,10 @@ struct 
io_rsrc_node *io_find_buf_node(struct io_kiocb *req, int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter, u64 buf_addr, size_t len, int ddir, unsigned issue_flags); +int io_import_reg_vec(int ddir, struct iov_iter *iter, + struct io_kiocb *req, struct iou_vec *vec, + unsigned nr_iovs, unsigned iovec_off, + unsigned issue_flags); int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg); int io_sqe_buffers_unregister(struct io_ring_ctx *ctx); @@ -147,6 +151,7 @@ static inline void __io_unaccount_mem(struct user_struct *user, } void io_vec_free(struct iou_vec *iv); +int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries); static inline void io_vec_reset_iovec(struct iou_vec *iv, struct iovec *iovec, unsigned nr)
Add io_import_reg_vec(), which will be responsible for importing vectored registered buffers. iovecs are overlapped with the resulting bvec in memory, which is why the iovec is expected to be padded in iou_vec. Signed-off-by: Pavel Begunkov <asml.silence@gmail.com> --- include/linux/io_uring_types.h | 5 +- io_uring/rsrc.c | 122 +++++++++++++++++++++++++++++++++ io_uring/rsrc.h | 5 ++ 3 files changed, 131 insertions(+), 1 deletion(-)