
[V6,6/8] io_uring: support providing sqe group buffer

Message ID 20240912104933.1875409-7-ming.lei@redhat.com (mailing list archive)
State New, archived
Series io_uring: support sqe group and provide group kbuf

Commit Message

Ming Lei Sept. 12, 2024, 10:49 a.m. UTC
SQE group with REQ_F_SQE_GROUP_DEP introduces a new mechanism for sharing
a resource among a group of requests, where all member requests can
consume the resource provided by the group leader efficiently and in
parallel.

This patch uses the added sqe group feature REQ_F_SQE_GROUP_DEP to share
a kernel buffer within an sqe group:

- the group leader provides a kernel buffer to member requests

- member requests use the provided buffer to do FS or network IO, or more
operations in the future

- the kernel buffer is returned back after the member requests have
consumed it

This looks a bit similar to the kernel's pipe/splice, but there are some
important differences:

- splice transfers data between two FDs via a pipe, and fd_out can only
read data from the pipe but can't write data into it; this feature lends
the group leader's buffer to the members, so a member request can also
write data into the buffer if the provided buffer allows writing.

- splice implements data transfer by moving pages between a subsystem and
the pipe, which means page ownership is transferred; this is one of the
most complicated parts of splice. This patch supports scenarios in which
the buffer can't be transferred: the buffer is only borrowed by member
requests and is returned back after the member requests consume it, so
the buffer lifetime is aligned with the group leader's lifetime and is
simplified a lot. In particular, the buffer is guaranteed to be returned
back.

- splice basically can't run asynchronously

It can help to implement generic zero copy between device and related
operations, such as ublk, fuse, vdpa, even network receive or whatever.
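
A rough sketch of the intended kernel-side flow (illustrative only: the
leader here is a hypothetical driver handler, the demo_* names and the
allocation strategy are made up, and error handling is trimmed):

/* io_provide_group_kbuf()/io_import_group_kbuf() are added in io_uring/kbuf.h */

static void demo_kbuf_ack(const struct io_uring_kernel_buf *kbuf)
{
	/*
	 * All members are done with the buffer; the provider can now
	 * release whatever backs kbuf->bvec (pages, backing request, ...)
	 * and free the io_uring_kernel_buf itself.
	 */
}

/* leader side, called from the leader request's ->issue() */
static int demo_provide(struct io_kiocb *req, struct bio_vec *bvecs,
			unsigned short nr_bvecs, unsigned long len)
{
	struct io_uring_kernel_buf *kbuf;

	kbuf = kzalloc(sizeof(*kbuf), GFP_KERNEL);
	if (!kbuf)
		return -ENOMEM;

	kbuf->bvec = bvecs;		/* provider-owned multipage bvec */
	kbuf->nr_bvecs = nr_bvecs;
	kbuf->len = len;
	kbuf->offset = 0;
	kbuf->dir = ITER_DEST;		/* members write into the buffer */
	kbuf->grp_kbuf_ack = demo_kbuf_ack;

	/* publish the immutable buffer to the group */
	return io_provide_group_kbuf(req, kbuf);
}

/* member side, e.g. from a read/recv style ->issue() handler */
static int demo_member_import(struct io_kiocb *req, unsigned int len,
			      struct iov_iter *iter)
{
	return io_import_group_kbuf(req, 0, len, ITER_DEST, iter);
}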

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 include/linux/io_uring_types.h | 33 +++++++++++++++++++
 io_uring/io_uring.c            | 10 +++++-
 io_uring/io_uring.h            | 10 ++++++
 io_uring/kbuf.c                | 60 ++++++++++++++++++++++++++++++++++
 io_uring/kbuf.h                | 13 ++++++++
 io_uring/net.c                 | 23 ++++++++++++-
 io_uring/opdef.c               |  4 +++
 io_uring/opdef.h               |  2 ++
 io_uring/rw.c                  | 20 +++++++++++-
 9 files changed, 172 insertions(+), 3 deletions(-)

Comments

Pavel Begunkov Oct. 4, 2024, 3:32 p.m. UTC | #1
On 9/12/24 11:49, Ming Lei wrote:
...
> It can help to implement generic zero copy between device and related
> operations, such as ublk, fuse, vdpa,
> even network receive or whatever.

That's exactly the thing it can't sanely work with because
of this design.

> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
>   include/linux/io_uring_types.h | 33 +++++++++++++++++++
>   io_uring/io_uring.c            | 10 +++++-
>   io_uring/io_uring.h            | 10 ++++++
>   io_uring/kbuf.c                | 60 ++++++++++++++++++++++++++++++++++
>   io_uring/kbuf.h                | 13 ++++++++
>   io_uring/net.c                 | 23 ++++++++++++-
>   io_uring/opdef.c               |  4 +++
>   io_uring/opdef.h               |  2 ++
>   io_uring/rw.c                  | 20 +++++++++++-
>   9 files changed, 172 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> index 793d5a26d9b8..445e5507565a 100644
> --- a/include/linux/io_uring_types.h
> +++ b/include/linux/io_uring_types.h
> @@ -6,6 +6,7 @@
>   #include <linux/task_work.h>
>   #include <linux/bitmap.h>
>   #include <linux/llist.h>
> +#include <linux/bvec.h>
>   #include <uapi/linux/io_uring.h>
>   
>   enum {
> @@ -39,6 +40,26 @@ enum io_uring_cmd_flags {
>   	IO_URING_F_COMPAT		= (1 << 12),
>   };
>   
> +struct io_uring_kernel_buf;
> +typedef void (io_uring_buf_giveback_t) (const struct io_uring_kernel_buf *);
> +
> +/* buffer provided from kernel */
> +struct io_uring_kernel_buf {
> +	unsigned long		len;
> +	unsigned short		nr_bvecs;
> +	unsigned char		dir;	/* ITER_SOURCE or ITER_DEST */
> +
> +	/* offset in the 1st bvec */
> +	unsigned int		offset;
> +	const struct bio_vec	*bvec;
> +
> +	/* called when we are done with this buffer */
> +	io_uring_buf_giveback_t	*grp_kbuf_ack;
> +
> +	/* private field, user don't touch it */
> +	struct bio_vec		__bvec[];
> +};
> +
>   struct io_wq_work_node {
>   	struct io_wq_work_node *next;
>   };
> @@ -473,6 +494,7 @@ enum {
>   	REQ_F_BUFFERS_COMMIT_BIT,
>   	REQ_F_SQE_GROUP_LEADER_BIT,
>   	REQ_F_SQE_GROUP_DEP_BIT,
> +	REQ_F_GROUP_KBUF_BIT,
>   
>   	/* not a real bit, just to check we're not overflowing the space */
>   	__REQ_F_LAST_BIT,
> @@ -557,6 +579,8 @@ enum {
>   	REQ_F_SQE_GROUP_LEADER	= IO_REQ_FLAG(REQ_F_SQE_GROUP_LEADER_BIT),
>   	/* sqe group with members depending on leader */
>   	REQ_F_SQE_GROUP_DEP	= IO_REQ_FLAG(REQ_F_SQE_GROUP_DEP_BIT),
> +	/* group lead provides kbuf for members, set for both lead and member */
> +	REQ_F_GROUP_KBUF	= IO_REQ_FLAG(REQ_F_GROUP_KBUF_BIT),

We have a huge flag problem here. It's a 4th group flag, that gives
me an idea that it's overabused. We're adding state machines based on
them "set group, clear group, but if last set it again. And clear
group lead if refs are of particular value". And it's not really
clear what these two flags are here for or what they do.

 From what I see you need here just one flag to mark requests
that provide a buffer, ala REQ_F_PROVIDING_KBUF. On the import
side:

if ((req->flags & GROUP) && (req->lead->flags & REQ_F_PROVIDING_KBUF))
	...

And when you kill the request:

if (req->flags & REQ_F_PROVIDING_KBUF)
	io_group_kbuf_drop();

And I don't think you need opdef::accept_group_kbuf since the
request handler should already know that and, importantly, you don't
imbue any semantics based on it.

FWIW, would be nice if during init figure we can verify that the leader
provides a buffer IFF there is someone consuming it, but I don't think
the semantics is flexible enough to do it sanely. i.e. there are many
members in a group, some might want to use the buffer and some might not.


>   typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts);
> @@ -640,6 +664,15 @@ struct io_kiocb {
>   		 * REQ_F_BUFFER_RING is set.
>   		 */
>   		struct io_buffer_list	*buf_list;
> +
> +		/*
> +		 * store kernel buffer provided by sqe group lead, valid
> +		 * IFF REQ_F_GROUP_KBUF
> +		 *
> +		 * The buffer meta is immutable since it is shared by
> +		 * all member requests
> +		 */
> +		const struct io_uring_kernel_buf *grp_kbuf;
>   	};
>   
>   	union {
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index 99b44b6babd6..80c4d9192657 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -116,7 +116,7 @@
>   
>   #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
>   				REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \
> -				REQ_F_ASYNC_DATA)
> +				REQ_F_ASYNC_DATA | REQ_F_GROUP_KBUF)
>   
>   #define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\
>   				 REQ_F_SQE_GROUP | REQ_F_SQE_GROUP_LEADER | \
> @@ -387,6 +387,11 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq)
>   
>   static void io_clean_op(struct io_kiocb *req)
>   {
> +	/* GROUP_KBUF is only available for REQ_F_SQE_GROUP_DEP */
> +	if ((req->flags & (REQ_F_GROUP_KBUF | REQ_F_SQE_GROUP_DEP)) ==
> +			(REQ_F_GROUP_KBUF | REQ_F_SQE_GROUP_DEP))
> +		io_group_kbuf_drop(req);
> +
>   	if (req->flags & REQ_F_BUFFER_SELECTED) {
>   		spin_lock(&req->ctx->completion_lock);
>   		io_kbuf_drop(req);
> @@ -914,9 +919,12 @@ static void io_queue_group_members(struct io_kiocb *req)
>   
>   	req->grp_link = NULL;
>   	while (member) {
> +		const struct io_issue_def *def = &io_issue_defs[member->opcode];
>   		struct io_kiocb *next = member->grp_link;
>   
>   		member->grp_leader = req;
> +		if ((req->flags & REQ_F_GROUP_KBUF) && def->accept_group_kbuf)
> +			member->flags |= REQ_F_GROUP_KBUF;

As per above I suspect that is not needed.

>   		if (unlikely(member->flags & REQ_F_FAIL)) {
>   			io_req_task_queue_fail(member, member->cqe.res);
>   		} else if (unlikely(req->flags & REQ_F_FAIL)) {
> diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
> index df2be7353414..8e111d24c02d 100644
> --- a/io_uring/io_uring.h
> +++ b/io_uring/io_uring.h
> @@ -349,6 +349,16 @@ static inline bool req_is_group_leader(struct io_kiocb *req)
>   	return req->flags & REQ_F_SQE_GROUP_LEADER;
>   }
>   
...
> +int io_import_group_kbuf(struct io_kiocb *req, unsigned long buf_off,
> +		unsigned int len, int dir, struct iov_iter *iter)
> +{
> +	struct io_kiocb *lead = req->grp_link;
> +	const struct io_uring_kernel_buf *kbuf;
> +	unsigned long offset;
> +
> +	WARN_ON_ONCE(!(req->flags & REQ_F_GROUP_KBUF));
> +
> +	if (!req_is_group_member(req))
> +		return -EINVAL;
> +
> +	if (!lead || !req_support_group_dep(lead) || !lead->grp_kbuf)
> +		return -EINVAL;
> +
> +	/* req->fused_cmd_kbuf is immutable */
> +	kbuf = lead->grp_kbuf;
> +	offset = kbuf->offset;
> +
> +	if (!kbuf->bvec)
> +		return -EINVAL;

How can this happen?

> +	if (dir != kbuf->dir)
> +		return -EINVAL;
> +
> +	if (unlikely(buf_off > kbuf->len))
> +		return -EFAULT;
> +
> +	if (unlikely(len > kbuf->len - buf_off))
> +		return -EFAULT;

check_add_overflow() would be more readable
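e.g. something along these lines (untested sketch):

unsigned long end;

if (check_add_overflow(buf_off, (unsigned long)len, &end) ||
    end > kbuf->len)
	return -EFAULT;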

> +
> +	/* don't use io_import_fixed which doesn't support multipage bvec */
> +	offset += buf_off;
> +	iov_iter_bvec(iter, dir, kbuf->bvec, kbuf->nr_bvecs, offset + len);
> +
> +	if (offset)
> +		iov_iter_advance(iter, offset);
> +
> +	return 0;
> +}
> diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
> index 36aadfe5ac00..37d18324e840 100644
> --- a/io_uring/kbuf.h
> +++ b/io_uring/kbuf.h
> @@ -89,6 +89,11 @@ struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
>   				      unsigned long bgid);
>   int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma);
>   
> +int io_provide_group_kbuf(struct io_kiocb *req,
> +		const struct io_uring_kernel_buf *grp_kbuf);
> +int io_import_group_kbuf(struct io_kiocb *req, unsigned long buf_off,
> +		unsigned int len, int dir, struct iov_iter *iter);
> +
>   static inline bool io_kbuf_recycle_ring(struct io_kiocb *req)
>   {
>   	/*
> @@ -220,4 +225,12 @@ static inline unsigned int io_put_kbufs(struct io_kiocb *req, int len,
>   {
>   	return __io_put_kbufs(req, len, nbufs, issue_flags);
>   }
> +
> +static inline void io_group_kbuf_drop(struct io_kiocb *req)
> +{
> +	const struct io_uring_kernel_buf *gbuf = req->grp_kbuf;
> +
> +	if (gbuf && gbuf->grp_kbuf_ack)

How can ->grp_kbuf_ack be missing?

> +		gbuf->grp_kbuf_ack(gbuf);
> +}
>   #endif
> diff --git a/io_uring/net.c b/io_uring/net.c
> index f10f5a22d66a..ad24dd5924d2 100644
> --- a/io_uring/net.c
> +++ b/io_uring/net.c
> @@ -89,6 +89,13 @@ struct io_sr_msg {
>    */
>   #define MULTISHOT_MAX_RETRY	32
>   
> +#define user_ptr_to_u64(x) (		\
> +{					\
> +	typecheck(void __user *, (x));	\
> +	(u64)(unsigned long)(x);	\
> +}					\
> +)
> +
>   int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>   {
>   	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);
> @@ -375,7 +382,7 @@ static int io_send_setup(struct io_kiocb *req)
>   		kmsg->msg.msg_name = &kmsg->addr;
>   		kmsg->msg.msg_namelen = sr->addr_len;
>   	}
> -	if (!io_do_buffer_select(req)) {
> +	if (!io_do_buffer_select(req) && !(req->flags & REQ_F_GROUP_KBUF)) {
>   		ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len,
>   				  &kmsg->msg.msg_iter);
>   		if (unlikely(ret < 0))
> @@ -593,6 +600,15 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
>   	if (issue_flags & IO_URING_F_NONBLOCK)
>   		flags |= MSG_DONTWAIT;
>   
> +	if (req->flags & REQ_F_GROUP_KBUF) {

Does anything prevent the request to be marked by both
GROUP_KBUF and BUFFER_SELECT? In which case we'd set up
a group kbuf and then go to the io_do_buffer_select()
overriding all of that

> +		ret = io_import_group_kbuf(req,
> +					user_ptr_to_u64(sr->buf),
> +					sr->len, ITER_SOURCE,
> +					&kmsg->msg.msg_iter);
> +		if (unlikely(ret))
> +			return ret;
> +	}
> +
>   retry_bundle:
>   	if (io_do_buffer_select(req)) {
>   		struct buf_sel_arg arg = {
> @@ -1154,6 +1170,11 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags)
>   			goto out_free;
>   		}
>   		sr->buf = NULL;
> +	} else if (req->flags & REQ_F_GROUP_KBUF) {

What happens if we get a short read/recv?

> +		ret = io_import_group_kbuf(req, user_ptr_to_u64(sr->buf),
> +				sr->len, ITER_DEST, &kmsg->msg.msg_iter);
> +		if (unlikely(ret))
> +			goto out_free;
>   	}
>   
>   	kmsg->msg.msg_flags = 0;
...
Ming Lei Oct. 6, 2024, 8:20 a.m. UTC | #2
On Fri, Oct 04, 2024 at 04:32:04PM +0100, Pavel Begunkov wrote:
> On 9/12/24 11:49, Ming Lei wrote:
> ...
> > It can help to implement generic zero copy between device and related
> > operations, such as ublk, fuse, vdpa,
> > even network receive or whatever.
> 
> That's exactly the thing it can't sanely work with because
> of this design.

The provide buffer design is absolutely generic, and basically

- group leader provides buffer for member OPs, and member OPs can borrow
the buffer if leader allows by calling io_provide_group_kbuf()

- after member OPs consumes the buffer, the buffer is returned back by
the callback implemented in group leader subsystem, so group leader can
release related resources;

- and it is guaranteed that the buffer can be released always

The ublk implementation is pretty simple, it can be reused in device driver
to share buffer with other kernel subsystems.

I don't see anything insane with the design.

> 
> > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> > ---
> >   include/linux/io_uring_types.h | 33 +++++++++++++++++++
> >   io_uring/io_uring.c            | 10 +++++-
> >   io_uring/io_uring.h            | 10 ++++++
> >   io_uring/kbuf.c                | 60 ++++++++++++++++++++++++++++++++++
> >   io_uring/kbuf.h                | 13 ++++++++
> >   io_uring/net.c                 | 23 ++++++++++++-
> >   io_uring/opdef.c               |  4 +++
> >   io_uring/opdef.h               |  2 ++
> >   io_uring/rw.c                  | 20 +++++++++++-
> >   9 files changed, 172 insertions(+), 3 deletions(-)
> > 
> > diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> > index 793d5a26d9b8..445e5507565a 100644
> > --- a/include/linux/io_uring_types.h
> > +++ b/include/linux/io_uring_types.h
> > @@ -6,6 +6,7 @@
> >   #include <linux/task_work.h>
> >   #include <linux/bitmap.h>
> >   #include <linux/llist.h>
> > +#include <linux/bvec.h>
> >   #include <uapi/linux/io_uring.h>
> >   enum {
> > @@ -39,6 +40,26 @@ enum io_uring_cmd_flags {
> >   	IO_URING_F_COMPAT		= (1 << 12),
> >   };
> > +struct io_uring_kernel_buf;
> > +typedef void (io_uring_buf_giveback_t) (const struct io_uring_kernel_buf *);
> > +
> > +/* buffer provided from kernel */
> > +struct io_uring_kernel_buf {
> > +	unsigned long		len;
> > +	unsigned short		nr_bvecs;
> > +	unsigned char		dir;	/* ITER_SOURCE or ITER_DEST */
> > +
> > +	/* offset in the 1st bvec */
> > +	unsigned int		offset;
> > +	const struct bio_vec	*bvec;
> > +
> > +	/* called when we are done with this buffer */
> > +	io_uring_buf_giveback_t	*grp_kbuf_ack;
> > +
> > +	/* private field, user don't touch it */
> > +	struct bio_vec		__bvec[];
> > +};
> > +
> >   struct io_wq_work_node {
> >   	struct io_wq_work_node *next;
> >   };
> > @@ -473,6 +494,7 @@ enum {
> >   	REQ_F_BUFFERS_COMMIT_BIT,
> >   	REQ_F_SQE_GROUP_LEADER_BIT,
> >   	REQ_F_SQE_GROUP_DEP_BIT,
> > +	REQ_F_GROUP_KBUF_BIT,
> >   	/* not a real bit, just to check we're not overflowing the space */
> >   	__REQ_F_LAST_BIT,
> > @@ -557,6 +579,8 @@ enum {
> >   	REQ_F_SQE_GROUP_LEADER	= IO_REQ_FLAG(REQ_F_SQE_GROUP_LEADER_BIT),
> >   	/* sqe group with members depending on leader */
> >   	REQ_F_SQE_GROUP_DEP	= IO_REQ_FLAG(REQ_F_SQE_GROUP_DEP_BIT),
> > +	/* group lead provides kbuf for members, set for both lead and member */
> > +	REQ_F_GROUP_KBUF	= IO_REQ_FLAG(REQ_F_GROUP_KBUF_BIT),
> 
> We have a huge flag problem here. It's a 4th group flag, that gives
> me an idea that it's overabused. We're adding state machines based on
> them "set group, clear group, but if last set it again.

I have explained it is just for reusing SQE_GROUP flag to mark the last
member.

And now REQ_F_SQE_GROUP_DEP_BIT can be killed.

> And clear
> group lead if refs are of particular value".

It is actually dead code for supporting concurrent leader & members,
so it will be removed too.

> And it's not really
> clear what these two flags are here for or what they do.
> 
> From what I see you need here just one flag to mark requests
> that provide a buffer, ala REQ_F_PROVIDING_KBUF. On the import
> side:
> 
> if ((req->flags & GROUP) && (req->lead->flags & REQ_F_PROVIDING_KBUF))
> 	...
> 
> And when you kill the request:
> 
> if (req->flags & REQ_F_PROVIDING_KBUF)
> 	io_group_kbuf_drop();

Yeah, this way works; here using REQ_F_GROUP_KBUF can remove the extra
indirect ->lead->flags check. I am fine with switching to this way by
adding one helper, io_use_group_provided_buf(), to cover the check.

> 
> And I don't think you need opdef::accept_group_kbuf since the
> request handler should already know that and, importantly, you don't
> imbue any semantics based on it.

Yeah, and it just follows logic of buffer_select. I guess def->buffer_select
may be removed too?

> 
> FWIW, would be nice if during init figure we can verify that the leader
> provides a buffer IFF there is someone consuming it, but I don't think

It isn't doable, same reason with IORING_OP_PROVIDE_BUFFERS, since buffer can
only be provided in ->issue().

> the semantics is flexible enough to do it sanely. i.e. there are many
> members in a group, some might want to use the buffer and some might not.
> 
> 
> >   typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts);
> > @@ -640,6 +664,15 @@ struct io_kiocb {
> >   		 * REQ_F_BUFFER_RING is set.
> >   		 */
> >   		struct io_buffer_list	*buf_list;
> > +
> > +		/*
> > +		 * store kernel buffer provided by sqe group lead, valid
> > +		 * IFF REQ_F_GROUP_KBUF
> > +		 *
> > +		 * The buffer meta is immutable since it is shared by
> > +		 * all member requests
> > +		 */
> > +		const struct io_uring_kernel_buf *grp_kbuf;
> >   	};
> >   	union {
> > diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> > index 99b44b6babd6..80c4d9192657 100644
> > --- a/io_uring/io_uring.c
> > +++ b/io_uring/io_uring.c
> > @@ -116,7 +116,7 @@
> >   #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
> >   				REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \
> > -				REQ_F_ASYNC_DATA)
> > +				REQ_F_ASYNC_DATA | REQ_F_GROUP_KBUF)
> >   #define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\
> >   				 REQ_F_SQE_GROUP | REQ_F_SQE_GROUP_LEADER | \
> > @@ -387,6 +387,11 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq)
> >   static void io_clean_op(struct io_kiocb *req)
> >   {
> > +	/* GROUP_KBUF is only available for REQ_F_SQE_GROUP_DEP */
> > +	if ((req->flags & (REQ_F_GROUP_KBUF | REQ_F_SQE_GROUP_DEP)) ==
> > +			(REQ_F_GROUP_KBUF | REQ_F_SQE_GROUP_DEP))
> > +		io_group_kbuf_drop(req);
> > +
> >   	if (req->flags & REQ_F_BUFFER_SELECTED) {
> >   		spin_lock(&req->ctx->completion_lock);
> >   		io_kbuf_drop(req);
> > @@ -914,9 +919,12 @@ static void io_queue_group_members(struct io_kiocb *req)
> >   	req->grp_link = NULL;
> >   	while (member) {
> > +		const struct io_issue_def *def = &io_issue_defs[member->opcode];
> >   		struct io_kiocb *next = member->grp_link;
> >   		member->grp_leader = req;
> > +		if ((req->flags & REQ_F_GROUP_KBUF) && def->accept_group_kbuf)
> > +			member->flags |= REQ_F_GROUP_KBUF;
> 
> As per above I suspect that is not needed.
> 
> >   		if (unlikely(member->flags & REQ_F_FAIL)) {
> >   			io_req_task_queue_fail(member, member->cqe.res);
> >   		} else if (unlikely(req->flags & REQ_F_FAIL)) {
> > diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
> > index df2be7353414..8e111d24c02d 100644
> > --- a/io_uring/io_uring.h
> > +++ b/io_uring/io_uring.h
> > @@ -349,6 +349,16 @@ static inline bool req_is_group_leader(struct io_kiocb *req)
> >   	return req->flags & REQ_F_SQE_GROUP_LEADER;
> >   }
> ...
> > +int io_import_group_kbuf(struct io_kiocb *req, unsigned long buf_off,
> > +		unsigned int len, int dir, struct iov_iter *iter)
> > +{
> > +	struct io_kiocb *lead = req->grp_link;
> > +	const struct io_uring_kernel_buf *kbuf;
> > +	unsigned long offset;
> > +
> > +	WARN_ON_ONCE(!(req->flags & REQ_F_GROUP_KBUF));
> > +
> > +	if (!req_is_group_member(req))
> > +		return -EINVAL;
> > +
> > +	if (!lead || !req_support_group_dep(lead) || !lead->grp_kbuf)
> > +		return -EINVAL;
> > +
> > +	/* req->fused_cmd_kbuf is immutable */
> > +	kbuf = lead->grp_kbuf;
> > +	offset = kbuf->offset;
> > +
> > +	if (!kbuf->bvec)
> > +		return -EINVAL;
> 
> How can this happen?

OK, we can run the check in uring_cmd API.

> 
> > +	if (dir != kbuf->dir)
> > +		return -EINVAL;
> > +
> > +	if (unlikely(buf_off > kbuf->len))
> > +		return -EFAULT;
> > +
> > +	if (unlikely(len > kbuf->len - buf_off))
> > +		return -EFAULT;
> 
> check_add_overflow() would be more readable

OK.

> 
> > +
> > +	/* don't use io_import_fixed which doesn't support multipage bvec */
> > +	offset += buf_off;
> > +	iov_iter_bvec(iter, dir, kbuf->bvec, kbuf->nr_bvecs, offset + len);
> > +
> > +	if (offset)
> > +		iov_iter_advance(iter, offset);
> > +
> > +	return 0;
> > +}
> > diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
> > index 36aadfe5ac00..37d18324e840 100644
> > --- a/io_uring/kbuf.h
> > +++ b/io_uring/kbuf.h
> > @@ -89,6 +89,11 @@ struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
> >   				      unsigned long bgid);
> >   int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma);
> > +int io_provide_group_kbuf(struct io_kiocb *req,
> > +		const struct io_uring_kernel_buf *grp_kbuf);
> > +int io_import_group_kbuf(struct io_kiocb *req, unsigned long buf_off,
> > +		unsigned int len, int dir, struct iov_iter *iter);
> > +
> >   static inline bool io_kbuf_recycle_ring(struct io_kiocb *req)
> >   {
> >   	/*
> > @@ -220,4 +225,12 @@ static inline unsigned int io_put_kbufs(struct io_kiocb *req, int len,
> >   {
> >   	return __io_put_kbufs(req, len, nbufs, issue_flags);
> >   }
> > +
> > +static inline void io_group_kbuf_drop(struct io_kiocb *req)
> > +{
> > +	const struct io_uring_kernel_buf *gbuf = req->grp_kbuf;
> > +
> > +	if (gbuf && gbuf->grp_kbuf_ack)
> 
> How can ->grp_kbuf_ack be missing?

OK, the check can be removed here.

> 
> > +		gbuf->grp_kbuf_ack(gbuf);
> > +}
> >   #endif
> > diff --git a/io_uring/net.c b/io_uring/net.c
> > index f10f5a22d66a..ad24dd5924d2 100644
> > --- a/io_uring/net.c
> > +++ b/io_uring/net.c
> > @@ -89,6 +89,13 @@ struct io_sr_msg {
> >    */
> >   #define MULTISHOT_MAX_RETRY	32
> > +#define user_ptr_to_u64(x) (		\
> > +{					\
> > +	typecheck(void __user *, (x));	\
> > +	(u64)(unsigned long)(x);	\
> > +}					\
> > +)
> > +
> >   int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
> >   {
> >   	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);
> > @@ -375,7 +382,7 @@ static int io_send_setup(struct io_kiocb *req)
> >   		kmsg->msg.msg_name = &kmsg->addr;
> >   		kmsg->msg.msg_namelen = sr->addr_len;
> >   	}
> > -	if (!io_do_buffer_select(req)) {
> > +	if (!io_do_buffer_select(req) && !(req->flags & REQ_F_GROUP_KBUF)) {
> >   		ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len,
> >   				  &kmsg->msg.msg_iter);
> >   		if (unlikely(ret < 0))
> > @@ -593,6 +600,15 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
> >   	if (issue_flags & IO_URING_F_NONBLOCK)
> >   		flags |= MSG_DONTWAIT;
> > +	if (req->flags & REQ_F_GROUP_KBUF) {
> 
> Does anything prevent the request to be marked by both
> GROUP_KBUF and BUFFER_SELECT? In which case we'd set up
> a group kbuf and then go to the io_do_buffer_select()
> overriding all of that

It could be used in this way, and we can fail the member in
io_queue_group_members().

> 
> > +		ret = io_import_group_kbuf(req,
> > +					user_ptr_to_u64(sr->buf),
> > +					sr->len, ITER_SOURCE,
> > +					&kmsg->msg.msg_iter);
> > +		if (unlikely(ret))
> > +			return ret;
> > +	}
> > +
> >   retry_bundle:
> >   	if (io_do_buffer_select(req)) {
> >   		struct buf_sel_arg arg = {
> > @@ -1154,6 +1170,11 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags)
> >   			goto out_free;
> >   		}
> >   		sr->buf = NULL;
> > +	} else if (req->flags & REQ_F_GROUP_KBUF) {
> 
> What happens if we get a short read/recv?

For short read/recv, any progress is stored in iterator, nothing to do
with the provide buffer, which is immutable.

One problem for read is reissue, but it can be handled by saving iter
state after the group buffer is imported, I will fix it in next version.
For net recv, offset/len of buffer is updated in case of short recv, so
it works as expected.
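
Something like this (just a sketch, names are illustrative):

static int demo_import_and_save(struct io_kiocb *req, unsigned long buf_off,
				unsigned int len, struct iov_iter *iter,
				struct iov_iter_state *state)
{
	int ret;

	ret = io_import_group_kbuf(req, buf_off, len, ITER_DEST, iter);
	if (unlikely(ret))
		return ret;

	/* snapshot right after import so a reissue can rewind the iterator */
	iov_iter_save_state(iter, state);
	return 0;
}

/* and on reissue: iov_iter_restore(iter, state); */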

Or any other issue with short read/recv? Can you explain in detail?

Thanks,
Ming
Ming Lei Oct. 6, 2024, 9:47 a.m. UTC | #3
On Fri, Oct 04, 2024 at 04:32:04PM +0100, Pavel Begunkov wrote:
> On 9/12/24 11:49, Ming Lei wrote:
> ...
...
> > @@ -473,6 +494,7 @@ enum {
> >   	REQ_F_BUFFERS_COMMIT_BIT,
> >   	REQ_F_SQE_GROUP_LEADER_BIT,
> >   	REQ_F_SQE_GROUP_DEP_BIT,
> > +	REQ_F_GROUP_KBUF_BIT,
> >   	/* not a real bit, just to check we're not overflowing the space */
> >   	__REQ_F_LAST_BIT,
> > @@ -557,6 +579,8 @@ enum {
> >   	REQ_F_SQE_GROUP_LEADER	= IO_REQ_FLAG(REQ_F_SQE_GROUP_LEADER_BIT),
> >   	/* sqe group with members depending on leader */
> >   	REQ_F_SQE_GROUP_DEP	= IO_REQ_FLAG(REQ_F_SQE_GROUP_DEP_BIT),
> > +	/* group lead provides kbuf for members, set for both lead and member */
> > +	REQ_F_GROUP_KBUF	= IO_REQ_FLAG(REQ_F_GROUP_KBUF_BIT),
> 
> We have a huge flag problem here. It's a 4th group flag, that gives
> me an idea that it's overabused. We're adding state machines based on
> them "set group, clear group, but if last set it again. And clear
> group lead if refs are of particular value". And it's not really
> clear what these two flags are here for or what they do.
> 
> From what I see you need here just one flag to mark requests
> that provide a buffer, ala REQ_F_PROVIDING_KBUF. On the import
> side:
> 
> if ((req->flags & GROUP) && (req->lead->flags & REQ_F_PROVIDING_KBUF))
> 	...
> 
> And when you kill the request:
> 
> if (req->flags & REQ_F_PROVIDING_KBUF)
> 	io_group_kbuf_drop();

REQ_F_PROVIDING_KBUF may be killed too, and the check helper can become:

bool io_use_group_provided_buf(req)
{
	return (req->flags & GROUP) && req->lead->grp_buf;
}


Thanks,
Ming
Pavel Begunkov Oct. 9, 2024, 11:57 a.m. UTC | #4
On 10/6/24 10:47, Ming Lei wrote:
> On Fri, Oct 04, 2024 at 04:32:04PM +0100, Pavel Begunkov wrote:
>> On 9/12/24 11:49, Ming Lei wrote:
>> ...
> ...
>>> @@ -473,6 +494,7 @@ enum {
>>>    	REQ_F_BUFFERS_COMMIT_BIT,
>>>    	REQ_F_SQE_GROUP_LEADER_BIT,
>>>    	REQ_F_SQE_GROUP_DEP_BIT,
>>> +	REQ_F_GROUP_KBUF_BIT,
>>>    	/* not a real bit, just to check we're not overflowing the space */
>>>    	__REQ_F_LAST_BIT,
>>> @@ -557,6 +579,8 @@ enum {
>>>    	REQ_F_SQE_GROUP_LEADER	= IO_REQ_FLAG(REQ_F_SQE_GROUP_LEADER_BIT),
>>>    	/* sqe group with members depending on leader */
>>>    	REQ_F_SQE_GROUP_DEP	= IO_REQ_FLAG(REQ_F_SQE_GROUP_DEP_BIT),
>>> +	/* group lead provides kbuf for members, set for both lead and member */
>>> +	REQ_F_GROUP_KBUF	= IO_REQ_FLAG(REQ_F_GROUP_KBUF_BIT),
>>
>> We have a huge flag problem here. It's a 4th group flag, that gives
>> me an idea that it's overabused. We're adding state machines based on
>> them "set group, clear group, but if last set it again. And clear
>> group lead if refs are of particular value". And it's not really
>> clear what these two flags are here for or what they do.
>>
>>  From what I see you need here just one flag to mark requests
>> that provide a buffer, ala REQ_F_PROVIDING_KBUF. On the import
>> side:
>>
>> if ((req->flags & GROUP) && (req->lead->flags & REQ_F_PROVIDING_KBUF))
>> 	...
>>
>> And when you kill the request:
>>
>> if (req->flags & REQ_F_PROVIDING_KBUF)
>> 	io_group_kbuf_drop();
> 
> REQ_F_PROVIDING_KBUF may be killed too, and the check helper can become:
> 
> bool io_use_group_provided_buf(req)
> {
> 	return (req->flags & GROUP) && req->lead->grp_buf;
> }

->grp_kbuf is unionised, so for that to work you need to ensure that
only a buffer providing cmd / request could be a leader of a group,
which doesn't sound right.
Ming Lei Oct. 9, 2024, 12:21 p.m. UTC | #5
On Wed, Oct 09, 2024 at 12:57:48PM +0100, Pavel Begunkov wrote:
> On 10/6/24 10:47, Ming Lei wrote:
> > On Fri, Oct 04, 2024 at 04:32:04PM +0100, Pavel Begunkov wrote:
> > > On 9/12/24 11:49, Ming Lei wrote:
> > > ...
> > ...
> > > > @@ -473,6 +494,7 @@ enum {
> > > >    	REQ_F_BUFFERS_COMMIT_BIT,
> > > >    	REQ_F_SQE_GROUP_LEADER_BIT,
> > > >    	REQ_F_SQE_GROUP_DEP_BIT,
> > > > +	REQ_F_GROUP_KBUF_BIT,
> > > >    	/* not a real bit, just to check we're not overflowing the space */
> > > >    	__REQ_F_LAST_BIT,
> > > > @@ -557,6 +579,8 @@ enum {
> > > >    	REQ_F_SQE_GROUP_LEADER	= IO_REQ_FLAG(REQ_F_SQE_GROUP_LEADER_BIT),
> > > >    	/* sqe group with members depending on leader */
> > > >    	REQ_F_SQE_GROUP_DEP	= IO_REQ_FLAG(REQ_F_SQE_GROUP_DEP_BIT),
> > > > +	/* group lead provides kbuf for members, set for both lead and member */
> > > > +	REQ_F_GROUP_KBUF	= IO_REQ_FLAG(REQ_F_GROUP_KBUF_BIT),
> > > 
> > > We have a huge flag problem here. It's a 4th group flag, that gives
> > > me an idea that it's overabused. We're adding state machines based on
> > > them "set group, clear group, but if last set it again. And clear
> > > group lead if refs are of particular value". And it's not really
> > > clear what these two flags are here for or what they do.
> > > 
> > >  From what I see you need here just one flag to mark requests
> > > that provide a buffer, ala REQ_F_PROVIDING_KBUF. On the import
> > > side:
> > > 
> > > if ((req->flags & GROUP) && (req->lead->flags & REQ_F_PROVIDING_KBUF))
> > > 	...
> > > 
> > > And when you kill the request:
> > > 
> > > if (req->flags & REQ_F_PROVIDING_KBUF)
> > > 	io_group_kbuf_drop();
> > 
> > REQ_F_PROVIDING_KBUF may be killed too, and the check helper can become:
> > 
> > bool io_use_group_provided_buf(req)
> > {
> > 	return (req->flags & GROUP) && req->lead->grp_buf;
> > }
> 
> ->grp_kbuf is unionised, so for that to work you need to ensure that
> only a buffer providing cmd / request could be a leader of a group,
> which doesn't sound right.

Yes, both 'req->lead->flags & REQ_F_PROVIDING_KBUF' and 'req->lead->grp_buf'
may not work because the helper may be called in ->prep(), when req->lead
isn't set up yet.

Another idea is to reuse one of the three unused flags (LINK, HARDLINK and
DRAIN) of members for marking GROUP_KBUF; then it is aligned with
BUFFER_SELECT and the implementation can be cleaner. What do you think of
this approach?

Thanks,
Ming
Pavel Begunkov Oct. 9, 2024, 2:25 p.m. UTC | #6
On 10/6/24 09:20, Ming Lei wrote:
> On Fri, Oct 04, 2024 at 04:32:04PM +0100, Pavel Begunkov wrote:
>> On 9/12/24 11:49, Ming Lei wrote:
>> ...
>>> It can help to implement generic zero copy between device and related
>>> operations, such as ublk, fuse, vdpa,
>>> even network receive or whatever.
>>
>> That's exactly the thing it can't sanely work with because
>> of this design.
> 
> The provide buffer design is absolutely generic, and basically
> 
> - group leader provides buffer for member OPs, and member OPs can borrow
> the buffer if leader allows by calling io_provide_group_kbuf()
> 
> - after member OPs consumes the buffer, the buffer is returned back by
> the callback implemented in group leader subsystem, so group leader can
> release related sources;
> 
> - and it is guaranteed that the buffer can be released always
> 
> The ublk implementation is pretty simple, it can be reused in device driver
> to share buffer with other kernel subsystems.
> 
> I don't see anything insane with the design.

There is nothing insane with the design, but the problem is cross
request error handling, same thing that makes links a pain to use.
It's good that with storage reads are reasonably idempotent and you
can be retried if needed. With sockets and streams, however, you
can't sanely borrow a buffer without consuming it, so if a member
request processing the buffer fails for any reason, the user data
will be dropped on the floor. I mentioned quite a while before,
if for example you stash the buffer somewhere you can access
across syscalls like the io_uring's registered buffer table, the
user at least would be able to find an error and then memcpy the
unprocessed data as a fallback.

>>> Signed-off-by: Ming Lei <ming.lei@redhat.com>
>>> ---
>>>    include/linux/io_uring_types.h | 33 +++++++++++++++++++
>>>    io_uring/io_uring.c            | 10 +++++-
>>>    io_uring/io_uring.h            | 10 ++++++
>>>    io_uring/kbuf.c                | 60 ++++++++++++++++++++++++++++++++++
>>>    io_uring/kbuf.h                | 13 ++++++++
>>>    io_uring/net.c                 | 23 ++++++++++++-
>>>    io_uring/opdef.c               |  4 +++
>>>    io_uring/opdef.h               |  2 ++
>>>    io_uring/rw.c                  | 20 +++++++++++-
>>>    9 files changed, 172 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
>>> index 793d5a26d9b8..445e5507565a 100644
>>> --- a/include/linux/io_uring_types.h
>>> +++ b/include/linux/io_uring_types.h
...
>>
>> And I don't think you need opdef::accept_group_kbuf since the
>> request handler should already know that and, importantly, you don't
>> imbue any semantics based on it.
> 
> Yeah, and it just follows logic of buffer_select. I guess def->buffer_select
> may be removed too?

->buffer_select helps to fail IOSQE_BUFFER_SELECT for requests
that don't support it, so we don't need to add the check every
time we add a new request opcode.

In your case requests just ignore ->accept_group_kbuf /
REQ_F_GROUP_KBUF if they don't expect to use the buffer, so
it's different in several aspects.

fwiw, I don't mind ->accept_group_kbuf, I just don't see
what purpose it serves. Would be nice to have a sturdier uAPI,
where the user sets a flag to each member that want to use
these provided buffers and then the kernel checks leader vs
that flag and fails misconfigurations, but likely we don't
have flags / sqe space for it.


>> FWIW, would be nice if during init figure we can verify that the leader
>> provides a buffer IFF there is someone consuming it, but I don't think
> 
> It isn't doable, same reason with IORING_OP_PROVIDE_BUFFERS, since buffer can
> only be provided in ->issue().

In theory we could, in practise it'd be too much of a pain, I agree.

IORING_OP_PROVIDE_BUFFERS is different as you just stash the buffer
in the io_uring instance, and it's used at an unspecified time later
by some request. In this sense the API is explicit, requests that don't
support it but marked with IOSQE_BUFFER_SELECT will be failed by the
kernel.

>> the semantics is flexible enough to do it sanely. i.e. there are many
>> members in a group, some might want to use the buffer and some might not.
>>
...
>>> diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
>>> index df2be7353414..8e111d24c02d 100644
>>> --- a/io_uring/io_uring.h
>>> +++ b/io_uring/io_uring.h
>>> @@ -349,6 +349,16 @@ static inline bool req_is_group_leader(struct io_kiocb *req)
>>>    	return req->flags & REQ_F_SQE_GROUP_LEADER;
>>>    }
>> ...
>>> +int io_import_group_kbuf(struct io_kiocb *req, unsigned long buf_off,
>>> +		unsigned int len, int dir, struct iov_iter *iter)
>>> +{
>>> +	struct io_kiocb *lead = req->grp_link;
>>> +	const struct io_uring_kernel_buf *kbuf;
>>> +	unsigned long offset;
>>> +
>>> +	WARN_ON_ONCE(!(req->flags & REQ_F_GROUP_KBUF));
>>> +
>>> +	if (!req_is_group_member(req))
>>> +		return -EINVAL;
>>> +
>>> +	if (!lead || !req_support_group_dep(lead) || !lead->grp_kbuf)
>>> +		return -EINVAL;
>>> +
>>> +	/* req->fused_cmd_kbuf is immutable */
>>> +	kbuf = lead->grp_kbuf;
>>> +	offset = kbuf->offset;
>>> +
>>> +	if (!kbuf->bvec)
>>> +		return -EINVAL;
>>
>> How can this happen?
> 
> OK, we can run the check in uring_cmd API.

Not sure I follow, if a request providing a buffer can't set
a bvec it should just fail, without exposing half made
io_uring_kernel_buf to other requests.

Is it rather a WARN_ON_ONCE check?


>>> diff --git a/io_uring/net.c b/io_uring/net.c
>>> index f10f5a22d66a..ad24dd5924d2 100644
>>> --- a/io_uring/net.c
>>> +++ b/io_uring/net.c
>>> @@ -89,6 +89,13 @@ struct io_sr_msg {
>>>     */
>>>    #define MULTISHOT_MAX_RETRY	32
>>> +#define user_ptr_to_u64(x) (		\
>>> +{					\
>>> +	typecheck(void __user *, (x));	\
>>> +	(u64)(unsigned long)(x);	\
>>> +}					\
>>> +)
>>> +
>>>    int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>>    {
>>>    	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);
>>> @@ -375,7 +382,7 @@ static int io_send_setup(struct io_kiocb *req)
>>>    		kmsg->msg.msg_name = &kmsg->addr;
>>>    		kmsg->msg.msg_namelen = sr->addr_len;
>>>    	}
>>> -	if (!io_do_buffer_select(req)) {
>>> +	if (!io_do_buffer_select(req) && !(req->flags & REQ_F_GROUP_KBUF)) {
>>>    		ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len,
>>>    				  &kmsg->msg.msg_iter);
>>>    		if (unlikely(ret < 0))
>>> @@ -593,6 +600,15 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
>>>    	if (issue_flags & IO_URING_F_NONBLOCK)
>>>    		flags |= MSG_DONTWAIT;
>>> +	if (req->flags & REQ_F_GROUP_KBUF) {
>>
>> Does anything prevent the request to be marked by both
>> GROUP_KBUF and BUFFER_SELECT? In which case we'd set up
>> a group kbuf and then go to the io_do_buffer_select()
>> overriding all of that
> 
> It could be used in this way, and we can fail the member in
> io_queue_group_members().

That's where the opdef flag could actually be useful,

if (opdef[member]->accept_group_kbuf &&
     member->flags & SELECT_BUF)
	fail;


>>> +		ret = io_import_group_kbuf(req,
>>> +					user_ptr_to_u64(sr->buf),
>>> +					sr->len, ITER_SOURCE,
>>> +					&kmsg->msg.msg_iter);
>>> +		if (unlikely(ret))
>>> +			return ret;
>>> +	}
>>> +
>>>    retry_bundle:
>>>    	if (io_do_buffer_select(req)) {
>>>    		struct buf_sel_arg arg = {
>>> @@ -1154,6 +1170,11 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags)
>>>    			goto out_free;
>>>    		}
>>>    		sr->buf = NULL;
>>> +	} else if (req->flags & REQ_F_GROUP_KBUF) {
>>
>> What happens if we get a short read/recv?
> 
> For short read/recv, any progress is stored in iterator, nothing to do
> with the provide buffer, which is immutable.
> 
> One problem for read is reissue, but it can be handled by saving iter
> state after the group buffer is imported, I will fix it in next version.
> For net recv, offset/len of buffer is updated in case of short recv, so
> it works as expected.

That was one of my worries.

> Or any other issue with short read/recv? Can you explain in detail?

To sum up design wise, when members that are using the buffer as a
source, e.g. write/send, fail, the user is expected to usually reissue
both the write and the ublk cmd.

Let's say you have a ublk leader command providing a 4K buffer, and
you group it with a 4K send using the buffer. Let's assume the send
is short and does only 2K of data. Then the user would normally
reissue:

ublk(4K, GROUP), send(off=2K)

That's fine assuming short IO is rare.

I worry more about the backward flow, ublk provides an "empty" buffer
to receive/read into. ublk wants to do something with the buffer in
the callback. What happens when read/recv is short (and cannot be
retried by io_uring)?

1. ublk(provide empty 4K buffer)
2. recv, ret=2K
3. ->grp_kbuf_ack: ublk should commit back only 2K
    of data and not assume it's 4K

Another option is to fail ->grp_kbuf_ack if any member fails, but
the API might be a bit too awkward and inconvenient.
Ming Lei Oct. 10, 2024, 3 a.m. UTC | #7
On Wed, Oct 09, 2024 at 03:25:25PM +0100, Pavel Begunkov wrote:
> On 10/6/24 09:20, Ming Lei wrote:
> > On Fri, Oct 04, 2024 at 04:32:04PM +0100, Pavel Begunkov wrote:
> > > On 9/12/24 11:49, Ming Lei wrote:
> > > ...
> > > > It can help to implement generic zero copy between device and related
> > > > operations, such as ublk, fuse, vdpa,
> > > > even network receive or whatever.
> > > 
> > > That's exactly the thing it can't sanely work with because
> > > of this design.
> > 
> > The provide buffer design is absolutely generic, and basically
> > 
> > - group leader provides buffer for member OPs, and member OPs can borrow
> > the buffer if leader allows by calling io_provide_group_kbuf()
> > 
> > - after member OPs consumes the buffer, the buffer is returned back by
> > the callback implemented in group leader subsystem, so group leader can
> > release related resources;
> > 
> > - and it is guaranteed that the buffer can be released always
> > 
> > The ublk implementation is pretty simple, it can be reused in device driver
> > to share buffer with other kernel subsystems.
> > 
> > I don't see anything insane with the design.
> 
> There is nothing insane with the design, but the problem is cross
> request error handling, same thing that makes links a pain to use.

Wrt. link, the whole group is linked in the chain, and it respects
all existing link rules; care to share the trouble in the link use case?

The only thing I thought of is that group internal link isn't supported
yet, but it may be added in future if use case is coming.

> It's good that with storage reads are reasonably idempotent and you
> can be retried if needed. With sockets and streams, however, you
> can't sanely borrow a buffer without consuming it, so if a member
> request processing the buffer fails for any reason, the user data
> will be dropped on the floor. I mentioned quite a while before,
> if for example you stash the buffer somewhere you can access
> across syscalls like the io_uring's registered buffer table, the
> user at least would be able to find an error and then memcpy the
> unprocessed data as a fallback.

I guess it is net rx case, which requires buffer to cross syscalls,
then the buffer has to be owned by userspace, otherwise the buffer
can be leaked easily.

That may not match with sqe group which is supposed to borrow kernel
buffer consumed by users.

> 
> > > > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> > > > ---
> > > >    include/linux/io_uring_types.h | 33 +++++++++++++++++++
> > > >    io_uring/io_uring.c            | 10 +++++-
> > > >    io_uring/io_uring.h            | 10 ++++++
> > > >    io_uring/kbuf.c                | 60 ++++++++++++++++++++++++++++++++++
> > > >    io_uring/kbuf.h                | 13 ++++++++
> > > >    io_uring/net.c                 | 23 ++++++++++++-
> > > >    io_uring/opdef.c               |  4 +++
> > > >    io_uring/opdef.h               |  2 ++
> > > >    io_uring/rw.c                  | 20 +++++++++++-
> > > >    9 files changed, 172 insertions(+), 3 deletions(-)
> > > > 
> > > > diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> > > > index 793d5a26d9b8..445e5507565a 100644
> > > > --- a/include/linux/io_uring_types.h
> > > > +++ b/include/linux/io_uring_types.h
> ...
> > > 
> > > And I don't think you need opdef::accept_group_kbuf since the
> > > request handler should already know that and, importantly, you don't
> > > imbue any semantics based on it.
> > 
> > Yeah, and it just follows logic of buffer_select. I guess def->buffer_select
> > may be removed too?
> 
> ->buffer_select helps to fail IOSQE_BUFFER_SELECT for requests
> that don't support it, so we don't need to add the check every
> time we add a new request opcode.
> 
> In your case requests just ignore ->accept_group_kbuf /
> REQ_F_GROUP_KBUF if they don't expect to use the buffer, so
> it's different in several aspects.
> 
> fwiw, I don't mind ->accept_group_kbuf, I just don't see
> what purpose it serves. Would be nice to have a sturdier uAPI,
> where the user sets a flag to each member that want to use
> these provided buffers and then the kernel checks leader vs
> that flag and fails misconfigurations, but likely we don't
> have flags / sqe space for it.

As I replied in previous email, members have three flags available,
we can map one of them to REQ_F_GROUP_KBUF.

> 
> 
> > > FWIW, would be nice if during init figure we can verify that the leader
> > > provides a buffer IFF there is someone consuming it, but I don't think
> > 
> > It isn't doable, same reason with IORING_OP_PROVIDE_BUFFERS, since buffer can
> > only be provided in ->issue().
> 
> In theory we could, in practise it'd be too much of a pain, I agree.
> 
> IORING_OP_PROVIDE_BUFFERS is different as you just stash the buffer
> in the io_uring instance, and it's used at an unspecified time later
> by some request. In this sense the API is explicit, requests that don't
> support it but marked with IOSQE_BUFFER_SELECT will be failed by the
> kernel.

That is also one reason why I add ->accept_group_kbuf.

> 
> > > the semantics is flexible enough to do it sanely. i.e. there are many
> > > members in a group, some might want to use the buffer and some might not.
> > > 
> ...
> > > > diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
> > > > index df2be7353414..8e111d24c02d 100644
> > > > --- a/io_uring/io_uring.h
> > > > +++ b/io_uring/io_uring.h
> > > > @@ -349,6 +349,16 @@ static inline bool req_is_group_leader(struct io_kiocb *req)
> > > >    	return req->flags & REQ_F_SQE_GROUP_LEADER;
> > > >    }
> > > ...
> > > > +int io_import_group_kbuf(struct io_kiocb *req, unsigned long buf_off,
> > > > +		unsigned int len, int dir, struct iov_iter *iter)
> > > > +{
> > > > +	struct io_kiocb *lead = req->grp_link;
> > > > +	const struct io_uring_kernel_buf *kbuf;
> > > > +	unsigned long offset;
> > > > +
> > > > +	WARN_ON_ONCE(!(req->flags & REQ_F_GROUP_KBUF));
> > > > +
> > > > +	if (!req_is_group_member(req))
> > > > +		return -EINVAL;
> > > > +
> > > > +	if (!lead || !req_support_group_dep(lead) || !lead->grp_kbuf)
> > > > +		return -EINVAL;
> > > > +
> > > > +	/* req->fused_cmd_kbuf is immutable */
> > > > +	kbuf = lead->grp_kbuf;
> > > > +	offset = kbuf->offset;
> > > > +
> > > > +	if (!kbuf->bvec)
> > > > +		return -EINVAL;
> > > 
> > > How can this happen?
> > 
> > OK, we can run the check in uring_cmd API.
> 
> Not sure I follow, if a request providing a buffer can't set
> a bvec it should just fail, without exposing half made
> io_uring_kernel_buf to other requests.
> 
> Is it rather a WARN_ON_ONCE check?

I meant we can check it in API of io_provide_group_kbuf() since the group
buffer is filled by driver, since then the buffer is immutable, and we
needn't any other check.

> 
> 
> > > > diff --git a/io_uring/net.c b/io_uring/net.c
> > > > index f10f5a22d66a..ad24dd5924d2 100644
> > > > --- a/io_uring/net.c
> > > > +++ b/io_uring/net.c
> > > > @@ -89,6 +89,13 @@ struct io_sr_msg {
> > > >     */
> > > >    #define MULTISHOT_MAX_RETRY	32
> > > > +#define user_ptr_to_u64(x) (		\
> > > > +{					\
> > > > +	typecheck(void __user *, (x));	\
> > > > +	(u64)(unsigned long)(x);	\
> > > > +}					\
> > > > +)
> > > > +
> > > >    int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
> > > >    {
> > > >    	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);
> > > > @@ -375,7 +382,7 @@ static int io_send_setup(struct io_kiocb *req)
> > > >    		kmsg->msg.msg_name = &kmsg->addr;
> > > >    		kmsg->msg.msg_namelen = sr->addr_len;
> > > >    	}
> > > > -	if (!io_do_buffer_select(req)) {
> > > > +	if (!io_do_buffer_select(req) && !(req->flags & REQ_F_GROUP_KBUF)) {
> > > >    		ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len,
> > > >    				  &kmsg->msg.msg_iter);
> > > >    		if (unlikely(ret < 0))
> > > > @@ -593,6 +600,15 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
> > > >    	if (issue_flags & IO_URING_F_NONBLOCK)
> > > >    		flags |= MSG_DONTWAIT;
> > > > +	if (req->flags & REQ_F_GROUP_KBUF) {
> > > 
> > > Does anything prevent the request to be marked by both
> > > GROUP_KBUF and BUFFER_SELECT? In which case we'd set up
> > > a group kbuf and then go to the io_do_buffer_select()
> > > overriding all of that
> > 
> > It could be used in this way, and we can fail the member in
> > io_queue_group_members().
> 
> That's where the opdef flag could actually be useful,
> 
> if (opdef[member]->accept_group_kbuf &&
>     member->flags & SELECT_BUF)
> 	fail;
> 
> 
> > > > +		ret = io_import_group_kbuf(req,
> > > > +					user_ptr_to_u64(sr->buf),
> > > > +					sr->len, ITER_SOURCE,
> > > > +					&kmsg->msg.msg_iter);
> > > > +		if (unlikely(ret))
> > > > +			return ret;
> > > > +	}
> > > > +
> > > >    retry_bundle:
> > > >    	if (io_do_buffer_select(req)) {
> > > >    		struct buf_sel_arg arg = {
> > > > @@ -1154,6 +1170,11 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags)
> > > >    			goto out_free;
> > > >    		}
> > > >    		sr->buf = NULL;
> > > > +	} else if (req->flags & REQ_F_GROUP_KBUF) {
> > > 
> > > What happens if we get a short read/recv?
> > 
> > For short read/recv, any progress is stored in iterator, nothing to do
> > with the provide buffer, which is immutable.
> > 
> > One problem for read is reissue, but it can be handled by saving iter
> > state after the group buffer is imported, I will fix it in next version.
> > For net recv, offset/len of buffer is updated in case of short recv, so
> > it works as expected.
> 
> That was one of my worries.
> 
> > Or any other issue with short read/recv? Can you explain in detail?
> 
> To sum up design wise, when members that are using the buffer as a
> source, e.g. write/send, fail, the user is expected to usually reissue
> both the write and the ublk cmd.
> 
> Let's say you have a ublk leader command providing a 4K buffer, and
> you group it with a 4K send using the buffer. Let's assume the send
> is short and does't only 2K of data. Then the user would normally
> reissue:
> 
> ublk(4K, GROUP), send(off=2K)
> 
> That's fine assuming short IO is rare.
> 
> I worry more about the backward flow, ublk provides an "empty" buffer
> to receive/read into. ublk wants to do something with the buffer in
> the callback. What happens when read/recv is short (and cannot be
> retried by io_uring)?
> 
> 1. ublk(provide empty 4K buffer)
> 2. recv, ret=2K
> 3. ->grp_kbuf_ack: ublk should commit back only 2K
>    of data and not assume it's 4K

->grp_kbuf_ack is supposed to only return back the buffer to the
owner, and it doesn't care result of buffer consumption.

When ->grp_kbuf_ack() is done, it means this time buffer borrow is
over.

When userspace figures out it is one short read, it will send one
ublk uring_cmd to notify that this io command is completed with
result(2k). ublk driver may decide to requeue this io command for
retrying the remained bytes, when only remained part of the buffer
is allowed to borrow in following provide uring command originated
from userspace.

For ublk use case, so far so good.

> 
> Another option is to fail ->grp_kbuf_ack if any member fails, but
> the API might be a bit too awkward and inconvenient.

We don't need ->grp_kbuf_ack() to cover buffer consumption.


Thanks,
Ming
Pavel Begunkov Oct. 10, 2024, 6:51 p.m. UTC | #8
On 10/10/24 04:00, Ming Lei wrote:
> On Wed, Oct 09, 2024 at 03:25:25PM +0100, Pavel Begunkov wrote:
>> On 10/6/24 09:20, Ming Lei wrote:
>>> On Fri, Oct 04, 2024 at 04:32:04PM +0100, Pavel Begunkov wrote:
>>>> On 9/12/24 11:49, Ming Lei wrote:
>>>> ...
>>>>> It can help to implement generic zero copy between device and related
>>>>> operations, such as ublk, fuse, vdpa,
>>>>> even network receive or whatever.
>>>>
>>>> That's exactly the thing it can't sanely work with because
>>>> of this design.
>>>
>>> The provide buffer design is absolutely generic, and basically
>>>
>>> - group leader provides buffer for member OPs, and member OPs can borrow
>>> the buffer if leader allows by calling io_provide_group_kbuf()
>>>
>>> - after member OPs consumes the buffer, the buffer is returned back by
>>> the callback implemented in group leader subsystem, so group leader can
>>> release related resources;
>>>
>>> - and it is guaranteed that the buffer can be released always
>>>
>>> The ublk implementation is pretty simple, it can be reused in device driver
>>> to share buffer with other kernel subsystems.
>>>
>>> I don't see anything insane with the design.
>>
>> There is nothing insane with the design, but the problem is cross
>> request error handling, same thing that makes links a pain to use.
> 
> Wrt. link, the whole group is linked in the chain, and it respects
> all existing link rules; care to share the trouble in the link use case?

Error handling is a pain, it has been, even for pure link without
any groups. Even with a simple req1 -> req2, you need to track that if
the first request fails you should expect another failed CQE for
the second request, probably refcount (let's say non-atomically)
some structure and clean it up when you get both CQEs. It's not
prettier when the 2nd fails, especially if you consider short IO
and that you can't fully retry that partial IO, e.g. you consumed
data from the socket. And so on.

> The only thing I thought of is that group internal link isn't supported
> yet, but it may be added in future if use case is coming.
> 
>> It's good that with storage reads are reasonably idempotent and you
>> can be retried if needed. With sockets and streams, however, you
>> can't sanely borrow a buffer without consuming it, so if a member
>> request processing the buffer fails for any reason, the user data
>> will be dropped on the floor. I mentioned quite a while before,
>> if for example you stash the buffer somewhere you can access
>> across syscalls like the io_uring's registered buffer table, the
>> user at least would be able to find an error and then memcpy the
>> unprocessed data as a fallback.
> 
> I guess it is net rx case, which requires buffer to cross syscalls,
> then the buffer has to be owned by userspace, otherwise the buffer
> can be leaked easily.
> 
> That may not match with sqe group which is supposed to borrow kernel
> buffer consumed by users.

It doesn't necessarily require keeping buffers across syscalls
per se, it just can't drop the data it got on the floor. It's
just that storage can read the data again.

...
>>>>> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
>>>>> index 793d5a26d9b8..445e5507565a 100644
>>>>> --- a/include/linux/io_uring_types.h
>>>>> +++ b/include/linux/io_uring_types.h
...
>>>> FWIW, would be nice if during init figure we can verify that the leader
>>>> provides a buffer IFF there is someone consuming it, but I don't think
>>>
>>> It isn't doable, same reason with IORING_OP_PROVIDE_BUFFERS, since buffer can
>>> only be provided in ->issue().
>>
>> In theory we could, in practise it'd be too much of a pain, I agree.
>>
>> IORING_OP_PROVIDE_BUFFERS is different as you just stash the buffer
>> in the io_uring instance, and it's used at an unspecified time later
>> by some request. In this sense the API is explicit, requests that don't
>> support it but marked with IOSQE_BUFFER_SELECT will be failed by the
>> kernel.
> 
> That is also one reason why I add ->accept_group_kbuf.

I probably missed that, but I haven't seen that

>>>> the semantics is flexible enough to do it sanely. i.e. there are many
>>>> members in a group, some might want to use the buffer and some might not.
>>>>
...
>>>>> +	if (!kbuf->bvec)
>>>>> +		return -EINVAL;
>>>>
>>>> How can this happen?
>>>
>>> OK, we can run the check in uring_cmd API.
>>
>> Not sure I follow, if a request providing a buffer can't set
>> a bvec it should just fail, without exposing half made
>> io_uring_kernel_buf to other requests.
>>
>> Is it rather a WARN_ON_ONCE check?
> 
> I meant we can check it in API of io_provide_group_kbuf() since the group
> buffer is filled by driver, since then the buffer is immutable, and we
> needn't any other check.

That'd be a buggy provider, so it sounds like WARN_ON_ONCE
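
i.e. in the provide path it could just be something like (sketch):

	if (WARN_ON_ONCE(!kbuf->bvec))
		return -EINVAL;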

...
>>>>>     		if (unlikely(ret < 0))
>>>>> @@ -593,6 +600,15 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
>>>>>     	if (issue_flags & IO_URING_F_NONBLOCK)
>>>>>     		flags |= MSG_DONTWAIT;
>>>>> +	if (req->flags & REQ_F_GROUP_KBUF) {
>>>>
>>>> Does anything prevent the request to be marked by both
>>>> GROUP_KBUF and BUFFER_SELECT? In which case we'd set up
>>>> a group kbuf and then go to the io_do_buffer_select()
>>>> overriding all of that
>>>
>>> It could be used in this way, and we can fail the member in
>>> io_queue_group_members().
>>
>> That's where the opdef flag could actually be useful,
>>
>> if (opdef[member]->accept_group_kbuf &&
>>      member->flags & SELECT_BUF)
>> 	fail;
>>
>>
>>>>> +		ret = io_import_group_kbuf(req,
>>>>> +					user_ptr_to_u64(sr->buf),
>>>>> +					sr->len, ITER_SOURCE,
>>>>> +					&kmsg->msg.msg_iter);
>>>>> +		if (unlikely(ret))
>>>>> +			return ret;
>>>>> +	}
>>>>> +
>>>>>     retry_bundle:
>>>>>     	if (io_do_buffer_select(req)) {
>>>>>     		struct buf_sel_arg arg = {
>>>>> @@ -1154,6 +1170,11 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags)
>>>>>     			goto out_free;
>>>>>     		}
>>>>>     		sr->buf = NULL;
>>>>> +	} else if (req->flags & REQ_F_GROUP_KBUF) {
>>>>
>>>> What happens if we get a short read/recv?
>>>
>>> For short read/recv, any progress is stored in iterator, nothing to do
>>> with the provide buffer, which is immutable.
>>>
>>> One problem for read is reissue, but it can be handled by saving iter
>>> state after the group buffer is imported, I will fix it in next version.
>>> For net recv, offset/len of buffer is updated in case of short recv, so
>>> it works as expected.
>>
>> That was one of my worries.
>>
>>> Or any other issue with short read/recv? Can you explain in detail?
>>
>> To sum up design wise, when members that are using the buffer as a
>> source, e.g. write/send, fail, the user is expected to usually reissue
>> both the write and the ublk cmd.
>>
>> Let's say you have a ublk leader command providing a 4K buffer, and
>> you group it with a 4K send using the buffer. Let's assume the send
>> is short and does only 2K of data. Then the user would normally
>> reissue:
>>
>> ublk(4K, GROUP), send(off=2K)
>>
>> That's fine assuming short IO is rare.
>>
>> I worry more about the backward flow, ublk provides an "empty" buffer
>> to receive/read into. ublk wants to do something with the buffer in
>> the callback. What happens when read/recv is short (and cannot be
>> retried by io_uring)?
>>
>> 1. ublk(provide empty 4K buffer)
>> 2. recv, ret=2K
>> 3. ->grp_kbuf_ack: ublk should commit back only 2K
>>     of data and not assume it's 4K
> 
> ->grp_kbuf_ack is supposed to only return back the buffer to the
> owner, and it doesn't care result of buffer consumption.
> 
> When ->grp_kbuf_ack() is done, it means this time buffer borrow is
> over.
> 
> When userspace figures out it is one short read, it will send one
> ublk uring_cmd to notify that this io command is completed with
> result(2k). ublk driver may decide to requeue this io command for
> retrying the remained bytes, when only remained part of the buffer
> is allowed to borrow in following provide uring command originated
> from userspace.

My apologies, I failed to notice that moment, even though I should've
given it some thinking at the very beginning. I think that part would
be a terrible interface. It might be good enough for ublk, but we can't
be creating ublk-specific features that change the entire io_uring.
Without knowing how much data it actually got, in the generic case you
1) need to require the buffer to be fully initialised / zeroed before
handing it out, and 2) can't ever commit the data from the callback; it
would need to wait until userspace reacts. Yes, it works in the
specific context of ublk, but I don't think it works as a generic
interface.

We need to step back again and think about whether we can reuse the
registered buffer table or something else, and make it much cleaner and
more accommodating to other users. Jens, can you give the API a quick
thought? You've done a lot of interfaces before and hopefully have some
ideas here.


> For ublk use case, so far so good.
> 
>>
>> Another option is to fail ->grp_kbuf_ack if any member fails, but
>> the API might be a bit too awkward and inconvenient.
> 
> We needn't ->grp_kbuf_ack() to cover buffer consumption.
Ming Lei Oct. 11, 2024, 2 a.m. UTC | #9
On Thu, Oct 10, 2024 at 07:51:19PM +0100, Pavel Begunkov wrote:
> On 10/10/24 04:00, Ming Lei wrote:
> > On Wed, Oct 09, 2024 at 03:25:25PM +0100, Pavel Begunkov wrote:
> > > On 10/6/24 09:20, Ming Lei wrote:
> > > > On Fri, Oct 04, 2024 at 04:32:04PM +0100, Pavel Begunkov wrote:
> > > > > On 9/12/24 11:49, Ming Lei wrote:
> > > > > ...
> > > > > > It can help to implement generic zero copy between device and related
> > > > > > operations, such as ublk, fuse, vdpa,
> > > > > > even network receive or whatever.
> > > > > 
> > > > > That's exactly the thing it can't sanely work with because
> > > > > of this design.
> > > > 
> > > > The provide buffer design is absolutely generic, and basically
> > > > 
> > > > - group leader provides buffer for member OPs, and member OPs can borrow
> > > > the buffer if leader allows by calling io_provide_group_kbuf()
> > > > 
> > > > - after member OPs consumes the buffer, the buffer is returned back by
> > > > the callback implemented in group leader subsystem, so group leader can
> > > > release related sources;
> > > > 
> > > > - and it is guaranteed that the buffer can be released always
> > > > 
> > > > The ublk implementation is pretty simple, it can be reused in device driver
> > > > to share buffer with other kernel subsystems.
> > > > 
> > > > I don't see anything insane with the design.
> > > 
> > > There is nothing insane with the design, but the problem is cross
> > > request error handling, same thing that makes links a pain to use.
> > 
> > Wrt. link, the whole group is linked in the chain, and it respects
> > all existed link rule, care to share the trouble in link use case?
> 
> Error handling is a pain, it has been, even for pure link without
> any groups. Even with a simple req1 -> req2, you need to track if
> the first request fails you need to expect another failed CQE for
> the second request, probably refcount (let's say non-atomically)
> some structure and clean it up when you get both CQEs. It's not
> prettier when the 2nd fails, especially if you consider short IO
> and that you can't fully retry that partial IO, e.g. you consumed
> data from the socket. And so on.
> 
> > The only thing I thought of is that group internal link isn't supported
> > yet, but it may be added in future if use case is coming.
> > 
> > > It's good that with storage reads are reasonably idempotent and you
> > > can be retried if needed. With sockets and streams, however, you
> > > can't sanely borrow a buffer without consuming it, so if a member
> > > request processing the buffer fails for any reason, the user data
> > > will be dropped on the floor. I mentioned quite a while before,
> > > if for example you stash the buffer somewhere you can access
> > > across syscalls like the io_uring's registered buffer table, the
> > > user at least would be able to find an error and then memcpy the
> > > unprocessed data as a fallback.
> > 
> > I guess it is net rx case, which requires buffer to cross syscalls,
> > then the buffer has to be owned by userspace, otherwise the buffer
> > can be leaked easily.
> > 
> > That may not match with sqe group which is supposed to borrow kernel
> > buffer consumed by users.
> 
> It doesn't necessarily require to keep buffers across syscalls
> per se, it just can't drop the data it got on the floor. It's
> just storage can read data again.

In case of a short read, the data is really stored (not dropped) in the
provided buffer, and you can consume the short-read data or continue to
read more into the same buffer.

What is your real issue here?
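
To make the flow concrete, here is a rough liburing sketch of the
reissue described earlier in the thread (4K group buffer, the send came
up 2K short). The IOSQE_SQE_GROUP name/value and the grouping semantics
are only assumed here from earlier patches in this series,
prep_ublk_provide_buf_cmd() is a hypothetical stand-in for the
driver-specific leader command, and the member's buf field is an offset
into the leader-provided buffer, matching io_import_group_kbuf() in
this patch:

#include <liburing.h>
#include <stdint.h>

/* placeholder: the real flag comes from earlier patches in this series */
#ifndef IOSQE_SQE_GROUP
#define IOSQE_SQE_GROUP		(1U << 7)	/* value assumed */
#endif

/* hypothetical helper: ublk uring_cmd which (re)provides the 4K buffer */
void prep_ublk_provide_buf_cmd(struct io_uring_sqe *sqe, int ublk_fd,
			       int tag);

static int resend_remainder(struct io_uring *ring, int ublk_fd, int tag,
			    int sock_fd, unsigned done)
{
	struct io_uring_sqe *lead, *member;

	/* leader: provides the same kernel buffer again for this group */
	lead = io_uring_get_sqe(ring);
	prep_ublk_provide_buf_cmd(lead, ublk_fd, tag);
	lead->flags |= IOSQE_SQE_GROUP;	/* members follow (semantics assumed) */

	/*
	 * member: for a REQ_F_GROUP_KBUF consumer the buf field is an
	 * offset into the group kernel buffer, not a user pointer, so
	 * resume the send at the 'done' mark.
	 */
	member = io_uring_get_sqe(ring);
	io_uring_prep_send(member, sock_fd, (void *)(uintptr_t)done,
			   4096 - done, 0);

	return io_uring_submit(ring);
}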

> 
> ...
> > > > > > diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
> > > > > > index 793d5a26d9b8..445e5507565a 100644
> > > > > > --- a/include/linux/io_uring_types.h
> > > > > > +++ b/include/linux/io_uring_types.h
> ...
> > > > > FWIW, would be nice if during init figure we can verify that the leader
> > > > > provides a buffer IFF there is someone consuming it, but I don't think
> > > > 
> > > > It isn't doable, same reason with IORING_OP_PROVIDE_BUFFERS, since buffer can
> > > > only be provided in ->issue().
> > > 
> > > In theory we could, in practise it'd be too much of a pain, I agree.
> > > 
> > > IORING_OP_PROVIDE_BUFFERS is different as you just stash the buffer
> > > in the io_uring instance, and it's used at an unspecified time later
> > > by some request. In this sense the API is explicit, requests that don't
> > > support it but marked with IOSQE_BUFFER_SELECT will be failed by the
> > > kernel.
> > 
> > That is also one reason why I add ->accept_group_kbuf.
> 
> I probably missed that, but I haven't seen that

For example, any OP with a fixed buffer can't set ->accept_group_kbuf.
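
And the flag makes the conflict check you suggested earlier trivial; a
small sketch (the helper is hypothetical, this patch only checks
->accept_group_kbuf inside io_queue_group_members() and the
REQ_F_BUFFER_SELECT part is the proposed addition):

/* decide whether a member may be handed the leader's group kbuf */
static bool io_member_may_use_group_kbuf(const struct io_kiocb *leader,
					 const struct io_kiocb *member)
{
	const struct io_issue_def *def = &io_issue_defs[member->opcode];

	if (!(leader->flags & REQ_F_GROUP_KBUF))
		return false;
	/* conflicting buffer source: such a member should be failed */
	if (member->flags & REQ_F_BUFFER_SELECT)
		return false;
	return def->accept_group_kbuf;
}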

> 
> > > > > the semantics is flexible enough to do it sanely. i.e. there are many
> > > > > members in a group, some might want to use the buffer and some might not.
> > > > > 
> ...
> > > > > > +	if (!kbuf->bvec)
> > > > > > +		return -EINVAL;
> > > > > 
> > > > > How can this happen?
> > > > 
> > > > OK, we can run the check in uring_cmd API.
> > > 
> > > Not sure I follow, if a request providing a buffer can't set
> > > a bvec it should just fail, without exposing half made
> > > io_uring_kernel_buf to other requests.
> > > 
> > > Is it rather a WARN_ON_ONCE check?
> > 
> > I meant we can check it in API of io_provide_group_kbuf() since the group
> > buffer is filled by driver, since then the buffer is immutable, and we
> > needn't any other check.
> 
> That's be a buggy provider, so sounds like WARN_ON_ONCE

Not at all.

If the driver provides a bad buffer, the group leader and all member
OPs will be failed, and userspace gets notified.
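
For example, the check could be folded into io_provide_group_kbuf()
from this patch roughly like below (just a sketch, not the committed
code; whether this should be -EINVAL as here or a WARN_ON_ONCE is
exactly the point under discussion):

int io_provide_group_kbuf(struct io_kiocb *req,
		const struct io_uring_kernel_buf *grp_kbuf)
{
	if (unlikely(!req_support_group_dep(req)))
		return -EINVAL;

	/* reject a malformed buffer so only the provider's request fails */
	if (unlikely(!grp_kbuf->bvec || !grp_kbuf->nr_bvecs ||
		     !grp_kbuf->len ||
		     (grp_kbuf->dir != ITER_SOURCE &&
		      grp_kbuf->dir != ITER_DEST)))
		return -EINVAL;

	/*
	 * Borrow this buffer from one kernel subsystem, and return it by
	 * calling `grp_kbuf_ack` when the group leader is freed.
	 */
	req->grp_kbuf = grp_kbuf;
	req->flags |= REQ_F_GROUP_KBUF;
	return 0;
}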

> 
> ...
> > > > > >     		if (unlikely(ret < 0))
> > > > > > @@ -593,6 +600,15 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
> > > > > >     	if (issue_flags & IO_URING_F_NONBLOCK)
> > > > > >     		flags |= MSG_DONTWAIT;
> > > > > > +	if (req->flags & REQ_F_GROUP_KBUF) {
> > > > > 
> > > > > Does anything prevent the request to be marked by both
> > > > > GROUP_KBUF and BUFFER_SELECT? In which case we'd set up
> > > > > a group kbuf and then go to the io_do_buffer_select()
> > > > > overriding all of that
> > > > 
> > > > It could be used in this way, and we can fail the member in
> > > > io_queue_group_members().
> > > 
> > > That's where the opdef flag could actually be useful,
> > > 
> > > if (opdef[member]->accept_group_kbuf &&
> > >      member->flags & SELECT_BUF)
> > > 	fail;
> > > 
> > > 
> > > > > > +		ret = io_import_group_kbuf(req,
> > > > > > +					user_ptr_to_u64(sr->buf),
> > > > > > +					sr->len, ITER_SOURCE,
> > > > > > +					&kmsg->msg.msg_iter);
> > > > > > +		if (unlikely(ret))
> > > > > > +			return ret;
> > > > > > +	}
> > > > > > +
> > > > > >     retry_bundle:
> > > > > >     	if (io_do_buffer_select(req)) {
> > > > > >     		struct buf_sel_arg arg = {
> > > > > > @@ -1154,6 +1170,11 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags)
> > > > > >     			goto out_free;
> > > > > >     		}
> > > > > >     		sr->buf = NULL;
> > > > > > +	} else if (req->flags & REQ_F_GROUP_KBUF) {
> > > > > 
> > > > > What happens if we get a short read/recv?
> > > > 
> > > > For short read/recv, any progress is stored in iterator, nothing to do
> > > > with the provide buffer, which is immutable.
> > > > 
> > > > One problem for read is reissue, but it can be handled by saving iter
> > > > state after the group buffer is imported, I will fix it in next version.
> > > > For net recv, offset/len of buffer is updated in case of short recv, so
> > > > it works as expected.
> > > 
> > > That was one of my worries.
> > > 
> > > > Or any other issue with short read/recv? Can you explain in detail?
> > > 
> > > To sum up design wise, when members that are using the buffer as a
> > > source, e.g. write/send, fail, the user is expected to usually reissue
> > > both the write and the ublk cmd.
> > > 
> > > Let's say you have a ublk leader command providing a 4K buffer, and
> > > you group it with a 4K send using the buffer. Let's assume the send
> > > is short and does't only 2K of data. Then the user would normally
> > > reissue:
> > > 
> > > ublk(4K, GROUP), send(off=2K)
> > > 
> > > That's fine assuming short IO is rare.
> > > 
> > > I worry more about the backward flow, ublk provides an "empty" buffer
> > > to receive/read into. ublk wants to do something with the buffer in
> > > the callback. What happens when read/recv is short (and cannot be
> > > retried by io_uring)?
> > > 
> > > 1. ublk(provide empty 4K buffer)
> > > 2. recv, ret=2K
> > > 3. ->grp_kbuf_ack: ublk should commit back only 2K
> > >     of data and not assume it's 4K
> > 
> > ->grp_kbuf_ack is supposed to only return back the buffer to the
> > owner, and it doesn't care result of buffer consumption.
> > 
> > When ->grp_kbuf_ack() is done, it means this time buffer borrow is
> > over.
> > 
> > When userspace figures out it is one short read, it will send one
> > ublk uring_cmd to notify that this io command is completed with
> > result(2k). ublk driver may decide to requeue this io command for
> > retrying the remained bytes, when only remained part of the buffer
> > is allowed to borrow in following provide uring command originated
> > from userspace.
> 
> My apologies, I failed to notice that moment, even though should've
> given it some thinking at the very beginning. I think that part would
> be a terrible interface. Might be good enough for ublk, but we can't
> be creating a ublk specific features that change the entire io_uring.
> Without knowing how much data it actually got, in generic case you

You do know how much data the member OP actually got, don't you?

> 1) need to require the buffer to be fully initialised / zeroed
> before handing it.

The buffer really is initialized before being provided via
io_provide_group_kbuf(). And it is one bvec buffer; any time part of it
is consumed, the iterator is advanced, so the consumer OP is always
handed an initialized buffer.

> 2) Can't ever commit the data from the callback,

What do you mean by `commit`?

The callback has been documented clearly from the beginning: it is for
returning the buffer back to its owner.

Only member OPs consume the buffer, the group leader provides a valid
buffer for the member OPs, and the buffer lifetime is aligned with the
group leader request.
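
A hypothetical provider-side sketch, just to illustrate that contract
(only the struct, the flags and io_provide_group_kbuf() come from this
patch; the rest is made up, and a real driver such as ublk builds the
bvecs from its own request pages):

/* called when the group leader is freed, i.e. after all member OPs
 * are done with the buffer; the provider can now complete or requeue
 * its own request and release the pages */
static void my_grp_kbuf_ack(const struct io_uring_kernel_buf *kbuf)
{
	kfree((void *)kbuf);
}

static int my_provide_buf(struct io_kiocb *leader, struct page **pages,
			  int nr_pages, unsigned long len,
			  unsigned char dir)
{
	struct io_uring_kernel_buf *kbuf;
	int i;

	/* trailing __bvec[] gives per-buffer bvec storage in one go */
	kbuf = kmalloc(struct_size(kbuf, __bvec, nr_pages), GFP_NOIO);
	if (!kbuf)
		return -ENOMEM;

	/* for simplicity each page is full; a real driver trims the
	 * last bvec to 'len' */
	for (i = 0; i < nr_pages; i++)
		bvec_set_page(&kbuf->__bvec[i], pages[i], PAGE_SIZE, 0);

	kbuf->bvec = kbuf->__bvec;
	kbuf->nr_bvecs = nr_pages;
	kbuf->len = len;
	kbuf->offset = 0;
	kbuf->dir = dir;		/* ITER_SOURCE or ITER_DEST */
	kbuf->grp_kbuf_ack = my_grp_kbuf_ack;

	/* fails unless 'leader' is a group leader with REQ_F_SQE_GROUP_DEP */
	return io_provide_group_kbuf(leader, kbuf);
}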

> but it would need to wait until the userspace reacts. Yes, it
> works in the specific context of ublk, but I don't think it works
> as a generic interface.

That is just how ublk uses the group buffer; it doesn't have to be done
exactly this way.

Any time the buffer is provided successfully via io_provide_group_kbuf(),
the member OPs can consume it safely, and the buffer is finally returned
back once all member OPs are completed. That is all.

Please explain why it isn't a generic interface.


Thanks, 
Ming
Ming Lei Oct. 11, 2024, 4:06 a.m. UTC | #10
On Fri, Oct 11, 2024 at 10:00:06AM +0800, Ming Lei wrote:
> On Thu, Oct 10, 2024 at 07:51:19PM +0100, Pavel Begunkov wrote:
> ...
> > My apologies, I failed to notice that moment, even though should've
> > given it some thinking at the very beginning. I think that part would
> > be a terrible interface. Might be good enough for ublk, but we can't
> > be creating a ublk specific features that change the entire io_uring.
> > Without knowing how much data it actually got, in generic case you
> 
> You do know how much data actually got in the member OP, don't you?
> 
> > 1) need to require the buffer to be fully initialised / zeroed
> > before handing it.
> 
> The buffer is really initialized before being provided via
> io_provide_group_kbuf(). And it is one bvec buffer, anytime the part
> is consumed, the iterator is advanced, so always initialized buffer
> is provided to consumer OP.
> 
> > 2) Can't ever commit the data from the callback,
> 
> What do you mean `commit`?
> 
> The callback is documented clearly from beginning that it is for
> returning back the buffer to the owner.
> 
> Only member OPs consume buffer, and group leader provides valid buffer
> for member OP, and the buffer lifetime is aligned with group leader
> request.
> 
> > but it would need to wait until the userspace reacts. Yes, it
> > works in the specific context of ublk, but I don't think it works
> > as a generic interface.
> 
> It is just how ublk uses group buffer, but not necessary to be exactly
> this way.
> 
> Anytime the buffer is provided via io_provide_group_kbuf() successfully,
> the member OPs can consume it safely, and finally the buffer is returned
> back if all member OPs are completed. That is all.

Forgot to mention:

The same buffer can be provided multiple times while it is valid, and
an offset can easily be added (not done yet in this patchset) to the
provide-buffer uring_cmd, so the buffer can be advanced on the provider
side in case of a short recv.
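
A rough provider-side sketch of that idea (the helper is hypothetical
and the offset handling is the not-yet-posted extension; the fields are
the ones added by this patch):

/*
 * After userspace commits a short completion of 'done' bytes,
 * re-provide only the remainder of the same buffer:
 * io_import_group_kbuf() advances the iterator by kbuf->offset, so
 * member OPs only ever see the untouched tail.
 */
static int my_reprovide_remainder(struct io_kiocb *leader,
				  struct io_uring_kernel_buf *kbuf,
				  unsigned long done)
{
	if (done >= kbuf->len)
		return -EINVAL;		/* nothing left to hand out */

	kbuf->offset += done;
	kbuf->len -= done;

	return io_provide_group_kbuf(leader, kbuf);
}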

> Please explain why it isn't generic interface.


Thanks,
Ming
diff mbox series

Patch

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 793d5a26d9b8..445e5507565a 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -6,6 +6,7 @@ 
 #include <linux/task_work.h>
 #include <linux/bitmap.h>
 #include <linux/llist.h>
+#include <linux/bvec.h>
 #include <uapi/linux/io_uring.h>
 
 enum {
@@ -39,6 +40,26 @@  enum io_uring_cmd_flags {
 	IO_URING_F_COMPAT		= (1 << 12),
 };
 
+struct io_uring_kernel_buf;
+typedef void (io_uring_buf_giveback_t) (const struct io_uring_kernel_buf *);
+
+/* buffer provided from kernel */
+struct io_uring_kernel_buf {
+	unsigned long		len;
+	unsigned short		nr_bvecs;
+	unsigned char		dir;	/* ITER_SOURCE or ITER_DEST */
+
+	/* offset in the 1st bvec */
+	unsigned int		offset;
+	const struct bio_vec	*bvec;
+
+	/* called when we are done with this buffer */
+	io_uring_buf_giveback_t	*grp_kbuf_ack;
+
+	/* private field, user don't touch it */
+	struct bio_vec		__bvec[];
+};
+
 struct io_wq_work_node {
 	struct io_wq_work_node *next;
 };
@@ -473,6 +494,7 @@  enum {
 	REQ_F_BUFFERS_COMMIT_BIT,
 	REQ_F_SQE_GROUP_LEADER_BIT,
 	REQ_F_SQE_GROUP_DEP_BIT,
+	REQ_F_GROUP_KBUF_BIT,
 
 	/* not a real bit, just to check we're not overflowing the space */
 	__REQ_F_LAST_BIT,
@@ -557,6 +579,8 @@  enum {
 	REQ_F_SQE_GROUP_LEADER	= IO_REQ_FLAG(REQ_F_SQE_GROUP_LEADER_BIT),
 	/* sqe group with members depending on leader */
 	REQ_F_SQE_GROUP_DEP	= IO_REQ_FLAG(REQ_F_SQE_GROUP_DEP_BIT),
+	/* group lead provides kbuf for members, set for both lead and member */
+	REQ_F_GROUP_KBUF	= IO_REQ_FLAG(REQ_F_GROUP_KBUF_BIT),
 };
 
 typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts);
@@ -640,6 +664,15 @@  struct io_kiocb {
 		 * REQ_F_BUFFER_RING is set.
 		 */
 		struct io_buffer_list	*buf_list;
+
+		/*
+		 * store kernel buffer provided by sqe group lead, valid
+		 * IFF REQ_F_GROUP_KBUF
+		 *
+		 * The buffer meta is immutable since it is shared by
+		 * all member requests
+		 */
+		const struct io_uring_kernel_buf *grp_kbuf;
 	};
 
 	union {
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 99b44b6babd6..80c4d9192657 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -116,7 +116,7 @@ 
 
 #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
 				REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \
-				REQ_F_ASYNC_DATA)
+				REQ_F_ASYNC_DATA | REQ_F_GROUP_KBUF)
 
 #define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\
 				 REQ_F_SQE_GROUP | REQ_F_SQE_GROUP_LEADER | \
@@ -387,6 +387,11 @@  static bool req_need_defer(struct io_kiocb *req, u32 seq)
 
 static void io_clean_op(struct io_kiocb *req)
 {
+	/* GROUP_KBUF is only available for REQ_F_SQE_GROUP_DEP */
+	if ((req->flags & (REQ_F_GROUP_KBUF | REQ_F_SQE_GROUP_DEP)) ==
+			(REQ_F_GROUP_KBUF | REQ_F_SQE_GROUP_DEP))
+		io_group_kbuf_drop(req);
+
 	if (req->flags & REQ_F_BUFFER_SELECTED) {
 		spin_lock(&req->ctx->completion_lock);
 		io_kbuf_drop(req);
@@ -914,9 +919,12 @@  static void io_queue_group_members(struct io_kiocb *req)
 
 	req->grp_link = NULL;
 	while (member) {
+		const struct io_issue_def *def = &io_issue_defs[member->opcode];
 		struct io_kiocb *next = member->grp_link;
 
 		member->grp_leader = req;
+		if ((req->flags & REQ_F_GROUP_KBUF) && def->accept_group_kbuf)
+			member->flags |= REQ_F_GROUP_KBUF;
 		if (unlikely(member->flags & REQ_F_FAIL)) {
 			io_req_task_queue_fail(member, member->cqe.res);
 		} else if (unlikely(req->flags & REQ_F_FAIL)) {
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index df2be7353414..8e111d24c02d 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -349,6 +349,16 @@  static inline bool req_is_group_leader(struct io_kiocb *req)
 	return req->flags & REQ_F_SQE_GROUP_LEADER;
 }
 
+static inline bool req_is_group_member(struct io_kiocb *req)
+{
+	return !req_is_group_leader(req) && (req->flags & REQ_F_SQE_GROUP);
+}
+
+static inline bool req_support_group_dep(struct io_kiocb *req)
+{
+	return req_is_group_leader(req) && (req->flags & REQ_F_SQE_GROUP_DEP);
+}
+
 /*
  * Don't complete immediately but use deferred completion infrastructure.
  * Protected by ->uring_lock and can only be used either with
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 1f503bcc9c9f..ead0f85c05ac 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -838,3 +838,63 @@  int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma)
 	io_put_bl(ctx, bl);
 	return ret;
 }
+
+int io_provide_group_kbuf(struct io_kiocb *req,
+		const struct io_uring_kernel_buf *grp_kbuf)
+{
+	if (unlikely(!req_support_group_dep(req)))
+		return -EINVAL;
+
+	/*
+	 * Borrow this buffer from one kernel subsystem, and return them
+	 * by calling `grp_kbuf_ack` when the group lead is freed.
+	 *
+	 * Not like pipe/splice, this kernel buffer is always owned by the
+	 * provider, and has to be returned back.
+	 */
+	req->grp_kbuf = grp_kbuf;
+	req->flags |= REQ_F_GROUP_KBUF;
+
+	return 0;
+}
+
+int io_import_group_kbuf(struct io_kiocb *req, unsigned long buf_off,
+		unsigned int len, int dir, struct iov_iter *iter)
+{
+	struct io_kiocb *lead = req->grp_link;
+	const struct io_uring_kernel_buf *kbuf;
+	unsigned long offset;
+
+	WARN_ON_ONCE(!(req->flags & REQ_F_GROUP_KBUF));
+
+	if (!req_is_group_member(req))
+		return -EINVAL;
+
+	if (!lead || !req_support_group_dep(lead) || !lead->grp_kbuf)
+		return -EINVAL;
+
+	/* req->fused_cmd_kbuf is immutable */
+	kbuf = lead->grp_kbuf;
+	offset = kbuf->offset;
+
+	if (!kbuf->bvec)
+		return -EINVAL;
+
+	if (dir != kbuf->dir)
+		return -EINVAL;
+
+	if (unlikely(buf_off > kbuf->len))
+		return -EFAULT;
+
+	if (unlikely(len > kbuf->len - buf_off))
+		return -EFAULT;
+
+	/* don't use io_import_fixed which doesn't support multipage bvec */
+	offset += buf_off;
+	iov_iter_bvec(iter, dir, kbuf->bvec, kbuf->nr_bvecs, offset + len);
+
+	if (offset)
+		iov_iter_advance(iter, offset);
+
+	return 0;
+}
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index 36aadfe5ac00..37d18324e840 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -89,6 +89,11 @@  struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
 				      unsigned long bgid);
 int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma);
 
+int io_provide_group_kbuf(struct io_kiocb *req,
+		const struct io_uring_kernel_buf *grp_kbuf);
+int io_import_group_kbuf(struct io_kiocb *req, unsigned long buf_off,
+		unsigned int len, int dir, struct iov_iter *iter);
+
 static inline bool io_kbuf_recycle_ring(struct io_kiocb *req)
 {
 	/*
@@ -220,4 +225,12 @@  static inline unsigned int io_put_kbufs(struct io_kiocb *req, int len,
 {
 	return __io_put_kbufs(req, len, nbufs, issue_flags);
 }
+
+static inline void io_group_kbuf_drop(struct io_kiocb *req)
+{
+	const struct io_uring_kernel_buf *gbuf = req->grp_kbuf;
+
+	if (gbuf && gbuf->grp_kbuf_ack)
+		gbuf->grp_kbuf_ack(gbuf);
+}
 #endif
diff --git a/io_uring/net.c b/io_uring/net.c
index f10f5a22d66a..ad24dd5924d2 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -89,6 +89,13 @@  struct io_sr_msg {
  */
 #define MULTISHOT_MAX_RETRY	32
 
+#define user_ptr_to_u64(x) (		\
+{					\
+	typecheck(void __user *, (x));	\
+	(u64)(unsigned long)(x);	\
+}					\
+)
+
 int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);
@@ -375,7 +382,7 @@  static int io_send_setup(struct io_kiocb *req)
 		kmsg->msg.msg_name = &kmsg->addr;
 		kmsg->msg.msg_namelen = sr->addr_len;
 	}
-	if (!io_do_buffer_select(req)) {
+	if (!io_do_buffer_select(req) && !(req->flags & REQ_F_GROUP_KBUF)) {
 		ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len,
 				  &kmsg->msg.msg_iter);
 		if (unlikely(ret < 0))
@@ -593,6 +600,15 @@  int io_send(struct io_kiocb *req, unsigned int issue_flags)
 	if (issue_flags & IO_URING_F_NONBLOCK)
 		flags |= MSG_DONTWAIT;
 
+	if (req->flags & REQ_F_GROUP_KBUF) {
+		ret = io_import_group_kbuf(req,
+					user_ptr_to_u64(sr->buf),
+					sr->len, ITER_SOURCE,
+					&kmsg->msg.msg_iter);
+		if (unlikely(ret))
+			return ret;
+	}
+
 retry_bundle:
 	if (io_do_buffer_select(req)) {
 		struct buf_sel_arg arg = {
@@ -1154,6 +1170,11 @@  int io_recv(struct io_kiocb *req, unsigned int issue_flags)
 			goto out_free;
 		}
 		sr->buf = NULL;
+	} else if (req->flags & REQ_F_GROUP_KBUF) {
+		ret = io_import_group_kbuf(req, user_ptr_to_u64(sr->buf),
+				sr->len, ITER_DEST, &kmsg->msg.msg_iter);
+		if (unlikely(ret))
+			goto out_free;
 	}
 
 	kmsg->msg.msg_flags = 0;
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index a2be3bbca5ff..c12f57619a33 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -246,6 +246,7 @@  const struct io_issue_def io_issue_defs[] = {
 		.ioprio			= 1,
 		.iopoll			= 1,
 		.iopoll_queue		= 1,
+		.accept_group_kbuf	= 1,
 		.async_size		= sizeof(struct io_async_rw),
 		.prep			= io_prep_read,
 		.issue			= io_read,
@@ -260,6 +261,7 @@  const struct io_issue_def io_issue_defs[] = {
 		.ioprio			= 1,
 		.iopoll			= 1,
 		.iopoll_queue		= 1,
+		.accept_group_kbuf	= 1,
 		.async_size		= sizeof(struct io_async_rw),
 		.prep			= io_prep_write,
 		.issue			= io_write,
@@ -282,6 +284,7 @@  const struct io_issue_def io_issue_defs[] = {
 		.audit_skip		= 1,
 		.ioprio			= 1,
 		.buffer_select		= 1,
+		.accept_group_kbuf	= 1,
 #if defined(CONFIG_NET)
 		.async_size		= sizeof(struct io_async_msghdr),
 		.prep			= io_sendmsg_prep,
@@ -297,6 +300,7 @@  const struct io_issue_def io_issue_defs[] = {
 		.buffer_select		= 1,
 		.audit_skip		= 1,
 		.ioprio			= 1,
+		.accept_group_kbuf	= 1,
 #if defined(CONFIG_NET)
 		.async_size		= sizeof(struct io_async_msghdr),
 		.prep			= io_recvmsg_prep,
diff --git a/io_uring/opdef.h b/io_uring/opdef.h
index 14456436ff74..328c8a3c4fa7 100644
--- a/io_uring/opdef.h
+++ b/io_uring/opdef.h
@@ -27,6 +27,8 @@  struct io_issue_def {
 	unsigned		iopoll_queue : 1;
 	/* vectored opcode, set if 1) vectored, and 2) handler needs to know */
 	unsigned		vectored : 1;
+	/* opcodes which accept provided group kbuf */
+	unsigned		accept_group_kbuf : 1;
 
 	/* size of async data needed, if any */
 	unsigned short		async_size;
diff --git a/io_uring/rw.c b/io_uring/rw.c
index f023ff49c688..402d80436ffd 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -235,7 +235,8 @@  static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import)
 	if (io_rw_alloc_async(req))
 		return -ENOMEM;
 
-	if (!do_import || io_do_buffer_select(req))
+	if (!do_import || io_do_buffer_select(req) ||
+	    (req->flags & REQ_F_GROUP_KBUF))
 		return 0;
 
 	rw = req->async_data;
@@ -619,11 +620,16 @@  static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
  */
 static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter)
 {
+	struct io_kiocb *req = cmd_to_io_kiocb(rw);
 	struct kiocb *kiocb = &rw->kiocb;
 	struct file *file = kiocb->ki_filp;
 	ssize_t ret = 0;
 	loff_t *ppos;
 
+	/* group buffer is kernel buffer and doesn't have userspace addr */
+	if (req->flags & REQ_F_GROUP_KBUF)
+		return -EOPNOTSUPP;
+
 	/*
 	 * Don't support polled IO through this interface, and we can't
 	 * support non-blocking either. For the latter, this just causes
@@ -830,6 +836,11 @@  static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
 		ret = io_import_iovec(ITER_DEST, req, io, issue_flags);
 		if (unlikely(ret < 0))
 			return ret;
+	} else if (req->flags & REQ_F_GROUP_KBUF) {
+		ret = io_import_group_kbuf(req, rw->addr, rw->len, ITER_DEST,
+				&io->iter);
+		if (unlikely(ret))
+			return ret;
 	}
 	ret = io_rw_init_file(req, FMODE_READ, READ);
 	if (unlikely(ret))
@@ -1019,6 +1030,13 @@  int io_write(struct io_kiocb *req, unsigned int issue_flags)
 	ssize_t ret, ret2;
 	loff_t *ppos;
 
+	if (req->flags & REQ_F_GROUP_KBUF) {
+		ret = io_import_group_kbuf(req, rw->addr, rw->len, ITER_SOURCE,
+				&io->iter);
+		if (unlikely(ret))
+			return ret;
+	}
+
 	ret = io_rw_init_file(req, FMODE_WRITE, WRITE);
 	if (unlikely(ret))
 		return ret;