diff mbox series

[3/4] io_uring: add IORING_OP_READ[WRITE]_SPLICE_BUF

Message ID 20230210153212.733006-4-ming.lei@redhat.com (mailing list archive)
State New, archived
Headers show
Series io_uring: add IORING_OP_READ[WRITE]_SPLICE_BUF | expand

Commit Message

Ming Lei Feb. 10, 2023, 3:32 p.m. UTC
IORING_OP_READ_SPLICE_BUF: read to buffer which is built from
->read_splice() of specified fd, so user needs to provide (splice_fd, offset, len)
for building buffer.

IORING_OP_WRITE_SPLICE_BUF: write from buffer which is built from
->read_splice() of specified fd, so user needs to provide (splice_fd, offset, len)
for building buffer.

The typical use case is for supporting ublk/fuse io_uring zero copy,
and READ/WRITE OP retrieves ublk/fuse request buffer via direct pipe
from device->read_splice(), then READ/WRITE can be done to/from this
buffer directly.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 include/uapi/linux/io_uring.h |   2 +
 io_uring/opdef.c              |  37 ++++++++
 io_uring/rw.c                 | 174 +++++++++++++++++++++++++++++++++-
 io_uring/rw.h                 |   1 +
 4 files changed, 213 insertions(+), 1 deletion(-)

Comments

Jens Axboe Feb. 11, 2023, 3:45 p.m. UTC | #1
On 2/10/23 8:32?AM, Ming Lei wrote:
> IORING_OP_READ_SPLICE_BUF: read to buffer which is built from
> ->read_splice() of specified fd, so user needs to provide (splice_fd, offset, len)
> for building buffer.
> 
> IORING_OP_WRITE_SPLICE_BUF: write from buffer which is built from
> ->read_splice() of specified fd, so user needs to provide (splice_fd, offset, len)
> for building buffer.
> 
> The typical use case is for supporting ublk/fuse io_uring zero copy,
> and READ/WRITE OP retrieves ublk/fuse request buffer via direct pipe
> from device->read_splice(), then READ/WRITE can be done to/from this
> buffer directly.

Main question here - would this be better not plumbed up through the rw
path? Might be cleaner, even if it either requires a bit of helper
refactoring or accepting a bit of duplication. But would still be better
than polluting the rw fast path imho.

Also seems like this should be separately testable. We can't add new
opcodes that don't have a feature test at least, and should also have
various corner case tests. A bit of commenting outside of this below.

> diff --git a/io_uring/opdef.c b/io_uring/opdef.c
> index 5238ecd7af6a..91e8d8f96134 100644
> --- a/io_uring/opdef.c
> +++ b/io_uring/opdef.c
> @@ -427,6 +427,31 @@ const struct io_issue_def io_issue_defs[] = {
>  		.prep			= io_eopnotsupp_prep,
>  #endif
>  	},
> +	[IORING_OP_READ_SPLICE_BUF] = {
> +		.needs_file		= 1,
> +		.unbound_nonreg_file	= 1,
> +		.pollin			= 1,
> +		.plug			= 1,
> +		.audit_skip		= 1,
> +		.ioprio			= 1,
> +		.iopoll			= 1,
> +		.iopoll_queue		= 1,
> +		.prep			= io_prep_rw,
> +		.issue			= io_read,
> +	},
> +	[IORING_OP_WRITE_SPLICE_BUF] = {
> +		.needs_file		= 1,
> +		.hash_reg_file		= 1,
> +		.unbound_nonreg_file	= 1,
> +		.pollout		= 1,
> +		.plug			= 1,
> +		.audit_skip		= 1,
> +		.ioprio			= 1,
> +		.iopoll			= 1,
> +		.iopoll_queue		= 1,
> +		.prep			= io_prep_rw,
> +		.issue			= io_write,
> +	},

Are these really safe with iopoll?

> +static int io_prep_rw_splice_buf(struct io_kiocb *req,
> +				 const struct io_uring_sqe *sqe)
> +{
> +	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
> +	unsigned nr_pages = io_rw_splice_buf_nr_bvecs(rw->len);
> +	loff_t splice_off = READ_ONCE(sqe->splice_off_in);
> +	struct io_rw_splice_buf_data data;
> +	struct io_mapped_ubuf *imu;
> +	struct fd splice_fd;
> +	int ret;
> +
> +	splice_fd = fdget(READ_ONCE(sqe->splice_fd_in));
> +	if (!splice_fd.file)
> +		return -EBADF;

Seems like this should check for SPLICE_F_FD_IN_FIXED, and also use
io_file_get_normal() for the non-fixed case in case someone passed in an
io_uring fd.

> +	data.imu = &imu;
> +
> +	rw->addr = 0;
> +	req->flags |= REQ_F_NEED_CLEANUP;
> +
> +	ret = __io_prep_rw_splice_buf(req, &data, splice_fd.file, rw->len,
> +			splice_off);
> +	imu = *data.imu;
> +	imu->acct_pages = 0;
> +	imu->ubuf = 0;
> +	imu->ubuf_end = data.total;
> +	rw->len = data.total;
> +	req->imu = imu;
> +	if (!data.total) {
> +		io_rw_cleanup_splice_buf(req);
> +	} else  {
> +		ret = 0;
> +	}
> +out_put_fd:
> +	if (splice_fd.file)
> +		fdput(splice_fd);
> +
> +	return ret;
> +}

If the operation is done, clear NEED_CLEANUP and do the cleanup here?
That'll be faster.
Ming Lei Feb. 11, 2023, 4:12 p.m. UTC | #2
On Sat, Feb 11, 2023 at 08:45:18AM -0700, Jens Axboe wrote:
> On 2/10/23 8:32?AM, Ming Lei wrote:
> > IORING_OP_READ_SPLICE_BUF: read to buffer which is built from
> > ->read_splice() of specified fd, so user needs to provide (splice_fd, offset, len)
> > for building buffer.
> > 
> > IORING_OP_WRITE_SPLICE_BUF: write from buffer which is built from
> > ->read_splice() of specified fd, so user needs to provide (splice_fd, offset, len)
> > for building buffer.
> > 
> > The typical use case is for supporting ublk/fuse io_uring zero copy,
> > and READ/WRITE OP retrieves ublk/fuse request buffer via direct pipe
> > from device->read_splice(), then READ/WRITE can be done to/from this
> > buffer directly.
> 
> Main question here - would this be better not plumbed up through the rw
> path? Might be cleaner, even if it either requires a bit of helper
> refactoring or accepting a bit of duplication. But would still be better
> than polluting the rw fast path imho.

The buffer is actually IO buffer, which has to be plumbed up in IO path,
and it can't be done like the registered buffer.

The only affect on fast path is :

		if (io_rw_splice_buf(req))	//which just check opcode
              return io_prep_rw_splice_buf(req, sqe);

and the cleanup code which is only done for the two new OPs.

Or maybe I misunderstand your point? Or any detailed suggestion?

Actually the code should be factored into generic helper, since net.c
need to use them too. Probably it needs to move to rsrc.c?

> 
> Also seems like this should be separately testable. We can't add new
> opcodes that don't have a feature test at least, and should also have
> various corner case tests. A bit of commenting outside of this below.

OK, I will write/add one very simple ublk userspace to liburing for
test purpose.

> 
> > diff --git a/io_uring/opdef.c b/io_uring/opdef.c
> > index 5238ecd7af6a..91e8d8f96134 100644
> > --- a/io_uring/opdef.c
> > +++ b/io_uring/opdef.c
> > @@ -427,6 +427,31 @@ const struct io_issue_def io_issue_defs[] = {
> >  		.prep			= io_eopnotsupp_prep,
> >  #endif
> >  	},
> > +	[IORING_OP_READ_SPLICE_BUF] = {
> > +		.needs_file		= 1,
> > +		.unbound_nonreg_file	= 1,
> > +		.pollin			= 1,
> > +		.plug			= 1,
> > +		.audit_skip		= 1,
> > +		.ioprio			= 1,
> > +		.iopoll			= 1,
> > +		.iopoll_queue		= 1,
> > +		.prep			= io_prep_rw,
> > +		.issue			= io_read,
> > +	},
> > +	[IORING_OP_WRITE_SPLICE_BUF] = {
> > +		.needs_file		= 1,
> > +		.hash_reg_file		= 1,
> > +		.unbound_nonreg_file	= 1,
> > +		.pollout		= 1,
> > +		.plug			= 1,
> > +		.audit_skip		= 1,
> > +		.ioprio			= 1,
> > +		.iopoll			= 1,
> > +		.iopoll_queue		= 1,
> > +		.prep			= io_prep_rw,
> > +		.issue			= io_write,
> > +	},
> 
> Are these really safe with iopoll?

Yeah, after the buffer is built, the handling is basically
same with IORING_OP_WRITE_FIXED, so I think it is safe.

> 
> > +static int io_prep_rw_splice_buf(struct io_kiocb *req,
> > +				 const struct io_uring_sqe *sqe)
> > +{
> > +	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
> > +	unsigned nr_pages = io_rw_splice_buf_nr_bvecs(rw->len);
> > +	loff_t splice_off = READ_ONCE(sqe->splice_off_in);
> > +	struct io_rw_splice_buf_data data;
> > +	struct io_mapped_ubuf *imu;
> > +	struct fd splice_fd;
> > +	int ret;
> > +
> > +	splice_fd = fdget(READ_ONCE(sqe->splice_fd_in));
> > +	if (!splice_fd.file)
> > +		return -EBADF;
> 
> Seems like this should check for SPLICE_F_FD_IN_FIXED, and also use
> io_file_get_normal() for the non-fixed case in case someone passed in an
> io_uring fd.

SPLICE_F_FD_IN_FIXED needs one extra word for holding splice flags, if
we can use sqe->addr3, I think it is doable.

> 
> > +	data.imu = &imu;
> > +
> > +	rw->addr = 0;
> > +	req->flags |= REQ_F_NEED_CLEANUP;
> > +
> > +	ret = __io_prep_rw_splice_buf(req, &data, splice_fd.file, rw->len,
> > +			splice_off);
> > +	imu = *data.imu;
> > +	imu->acct_pages = 0;
> > +	imu->ubuf = 0;
> > +	imu->ubuf_end = data.total;
> > +	rw->len = data.total;
> > +	req->imu = imu;
> > +	if (!data.total) {
> > +		io_rw_cleanup_splice_buf(req);
> > +	} else  {
> > +		ret = 0;
> > +	}
> > +out_put_fd:
> > +	if (splice_fd.file)
> > +		fdput(splice_fd);
> > +
> > +	return ret;
> > +}
> 
> If the operation is done, clear NEED_CLEANUP and do the cleanup here?
> That'll be faster.

The buffer has to be cleaned up after req is completed, since bvec
table is needed for bio, and page reference need to be dropped after
IO is done too.


thanks,
Ming
Jens Axboe Feb. 11, 2023, 4:52 p.m. UTC | #3
On 2/11/23 9:12?AM, Ming Lei wrote:
> On Sat, Feb 11, 2023 at 08:45:18AM -0700, Jens Axboe wrote:
>> On 2/10/23 8:32?AM, Ming Lei wrote:
>>> IORING_OP_READ_SPLICE_BUF: read to buffer which is built from
>>> ->read_splice() of specified fd, so user needs to provide (splice_fd, offset, len)
>>> for building buffer.
>>>
>>> IORING_OP_WRITE_SPLICE_BUF: write from buffer which is built from
>>> ->read_splice() of specified fd, so user needs to provide (splice_fd, offset, len)
>>> for building buffer.
>>>
>>> The typical use case is for supporting ublk/fuse io_uring zero copy,
>>> and READ/WRITE OP retrieves ublk/fuse request buffer via direct pipe
>>> from device->read_splice(), then READ/WRITE can be done to/from this
>>> buffer directly.
>>
>> Main question here - would this be better not plumbed up through the rw
>> path? Might be cleaner, even if it either requires a bit of helper
>> refactoring or accepting a bit of duplication. But would still be better
>> than polluting the rw fast path imho.
> 
> The buffer is actually IO buffer, which has to be plumbed up in IO path,
> and it can't be done like the registered buffer.
> 
> The only affect on fast path is :
> 
> 		if (io_rw_splice_buf(req))	//which just check opcode
>               return io_prep_rw_splice_buf(req, sqe);
> 
> and the cleanup code which is only done for the two new OPs.
> 
> Or maybe I misunderstand your point? Or any detailed suggestion?
> 
> Actually the code should be factored into generic helper, since net.c
> need to use them too. Probably it needs to move to rsrc.c?

Yep, just refactoring out those bits as a prep thing. rsrc could work,
or perhaps a new file for that.

>> Also seems like this should be separately testable. We can't add new
>> opcodes that don't have a feature test at least, and should also have
>> various corner case tests. A bit of commenting outside of this below.
> 
> OK, I will write/add one very simple ublk userspace to liburing for
> test purpose.

Thanks!

>>> diff --git a/io_uring/opdef.c b/io_uring/opdef.c
>>> index 5238ecd7af6a..91e8d8f96134 100644
>>> --- a/io_uring/opdef.c
>>> +++ b/io_uring/opdef.c
>>> @@ -427,6 +427,31 @@ const struct io_issue_def io_issue_defs[] = {
>>>  		.prep			= io_eopnotsupp_prep,
>>>  #endif
>>>  	},
>>> +	[IORING_OP_READ_SPLICE_BUF] = {
>>> +		.needs_file		= 1,
>>> +		.unbound_nonreg_file	= 1,
>>> +		.pollin			= 1,
>>> +		.plug			= 1,
>>> +		.audit_skip		= 1,
>>> +		.ioprio			= 1,
>>> +		.iopoll			= 1,
>>> +		.iopoll_queue		= 1,
>>> +		.prep			= io_prep_rw,
>>> +		.issue			= io_read,
>>> +	},
>>> +	[IORING_OP_WRITE_SPLICE_BUF] = {
>>> +		.needs_file		= 1,
>>> +		.hash_reg_file		= 1,
>>> +		.unbound_nonreg_file	= 1,
>>> +		.pollout		= 1,
>>> +		.plug			= 1,
>>> +		.audit_skip		= 1,
>>> +		.ioprio			= 1,
>>> +		.iopoll			= 1,
>>> +		.iopoll_queue		= 1,
>>> +		.prep			= io_prep_rw,
>>> +		.issue			= io_write,
>>> +	},
>>
>> Are these really safe with iopoll?
> 
> Yeah, after the buffer is built, the handling is basically
> same with IORING_OP_WRITE_FIXED, so I think it is safe.

Yeah, on a second look, as these are just using the normal read/write
path after that should be fine indeed.

>>
>>> +static int io_prep_rw_splice_buf(struct io_kiocb *req,
>>> +				 const struct io_uring_sqe *sqe)
>>> +{
>>> +	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
>>> +	unsigned nr_pages = io_rw_splice_buf_nr_bvecs(rw->len);
>>> +	loff_t splice_off = READ_ONCE(sqe->splice_off_in);
>>> +	struct io_rw_splice_buf_data data;
>>> +	struct io_mapped_ubuf *imu;
>>> +	struct fd splice_fd;
>>> +	int ret;
>>> +
>>> +	splice_fd = fdget(READ_ONCE(sqe->splice_fd_in));
>>> +	if (!splice_fd.file)
>>> +		return -EBADF;
>>
>> Seems like this should check for SPLICE_F_FD_IN_FIXED, and also use
>> io_file_get_normal() for the non-fixed case in case someone passed in an
>> io_uring fd.
> 
> SPLICE_F_FD_IN_FIXED needs one extra word for holding splice flags, if
> we can use sqe->addr3, I think it is doable.

I haven't checked the rest, but you can't just use ->splice_flags for
this?

In any case, the get path needs to look like io_tee() here, and:

>>> +out_put_fd:
>>> +	if (splice_fd.file)
>>> +		fdput(splice_fd);

this put needs to be gated on whether it's a fixed file or not.

>> If the operation is done, clear NEED_CLEANUP and do the cleanup here?
>> That'll be faster.
> 
> The buffer has to be cleaned up after req is completed, since bvec
> table is needed for bio, and page reference need to be dropped after
> IO is done too.

I mean when you clear that flag, call the cleanup bits you otherwise
would've called on later cleanup.
Jens Axboe Feb. 11, 2023, 5:13 p.m. UTC | #4
On 2/10/23 8:32?AM, Ming Lei wrote:

One more comment on this.

> +static int __io_prep_rw_splice_buf(struct io_kiocb *req,
> +				   struct io_rw_splice_buf_data *data,
> +				   struct file *splice_f,
> +				   size_t len,
> +				   loff_t splice_off)
> +{
> +	unsigned flags = req->opcode == IORING_OP_READ_SPLICE_BUF ?
> +			SPLICE_F_KERN_FOR_READ : SPLICE_F_KERN_FOR_WRITE;
> +	struct splice_desc sd = {
> +		.total_len = len,
> +		.flags = flags | SPLICE_F_NONBLOCK | SPLICE_F_KERN_NEED_CONFIRM,
> +		.pos = splice_off,
> +		.u.data = data,
> +		.ignore_sig = true,
> +	};
> +
> +	return splice_direct_to_actor(splice_f, &sd,
> +			io_splice_buf_direct_actor);

Is this safe? We end up using current->splice_pipe here, which should be
fine as long as things are left in a sane state after every operation.
Which they should be, just like a syscall would. Just wanted to make
sure you've considered that part.
Ming Lei Feb. 12, 2023, 1:48 a.m. UTC | #5
On Sat, Feb 11, 2023 at 10:13:37AM -0700, Jens Axboe wrote:
> On 2/10/23 8:32?AM, Ming Lei wrote:
> 
> One more comment on this.
> 
> > +static int __io_prep_rw_splice_buf(struct io_kiocb *req,
> > +				   struct io_rw_splice_buf_data *data,
> > +				   struct file *splice_f,
> > +				   size_t len,
> > +				   loff_t splice_off)
> > +{
> > +	unsigned flags = req->opcode == IORING_OP_READ_SPLICE_BUF ?
> > +			SPLICE_F_KERN_FOR_READ : SPLICE_F_KERN_FOR_WRITE;
> > +	struct splice_desc sd = {
> > +		.total_len = len,
> > +		.flags = flags | SPLICE_F_NONBLOCK | SPLICE_F_KERN_NEED_CONFIRM,
> > +		.pos = splice_off,
> > +		.u.data = data,
> > +		.ignore_sig = true,
> > +	};
> > +
> > +	return splice_direct_to_actor(splice_f, &sd,
> > +			io_splice_buf_direct_actor);
> 
> Is this safe? We end up using current->splice_pipe here, which should be
> fine as long as things are left in a sane state after every operation.
> Which they should be, just like a syscall would. Just wanted to make
> sure you've considered that part.

Yeah.

Direct pipe is always left as empty when splice_direct_to_actor()
returns. Pipe buffers(pages) are produced from ->splice_read()
called from splice_direct_to_actor(), and consumed by
io_splice_buf_direct_actor().

If any error is returned, direct pipe is empty too, and we just
need to drop reference of sliced pages by io_rw_cleanup_splice_buf().


Thanks,
Ming
Jens Axboe Feb. 12, 2023, 2:42 a.m. UTC | #6
On 2/11/23 6:48 PM, Ming Lei wrote:
> On Sat, Feb 11, 2023 at 10:13:37AM -0700, Jens Axboe wrote:
>> On 2/10/23 8:32?AM, Ming Lei wrote:
>>
>> One more comment on this.
>>
>>> +static int __io_prep_rw_splice_buf(struct io_kiocb *req,
>>> +				   struct io_rw_splice_buf_data *data,
>>> +				   struct file *splice_f,
>>> +				   size_t len,
>>> +				   loff_t splice_off)
>>> +{
>>> +	unsigned flags = req->opcode == IORING_OP_READ_SPLICE_BUF ?
>>> +			SPLICE_F_KERN_FOR_READ : SPLICE_F_KERN_FOR_WRITE;
>>> +	struct splice_desc sd = {
>>> +		.total_len = len,
>>> +		.flags = flags | SPLICE_F_NONBLOCK | SPLICE_F_KERN_NEED_CONFIRM,
>>> +		.pos = splice_off,
>>> +		.u.data = data,
>>> +		.ignore_sig = true,
>>> +	};
>>> +
>>> +	return splice_direct_to_actor(splice_f, &sd,
>>> +			io_splice_buf_direct_actor);
>>
>> Is this safe? We end up using current->splice_pipe here, which should be
>> fine as long as things are left in a sane state after every operation.
>> Which they should be, just like a syscall would. Just wanted to make
>> sure you've considered that part.
> 
> Yeah.
> 
> Direct pipe is always left as empty when splice_direct_to_actor()
> returns. Pipe buffers(pages) are produced from ->splice_read()
> called from splice_direct_to_actor(), and consumed by
> io_splice_buf_direct_actor().
> 
> If any error is returned, direct pipe is empty too, and we just
> need to drop reference of sliced pages by io_rw_cleanup_splice_buf().

OK thanks for confirming, then that should be fine as we can
obviously not have two syscalls (or sendfile(2) and task_work from
io_uring) running at the same time.
Ming Lei Feb. 12, 2023, 3:22 a.m. UTC | #7
On Sat, Feb 11, 2023 at 09:52:58AM -0700, Jens Axboe wrote:
> On 2/11/23 9:12?AM, Ming Lei wrote:
> > On Sat, Feb 11, 2023 at 08:45:18AM -0700, Jens Axboe wrote:
> >> On 2/10/23 8:32?AM, Ming Lei wrote:
> >>> IORING_OP_READ_SPLICE_BUF: read to buffer which is built from
> >>> ->read_splice() of specified fd, so user needs to provide (splice_fd, offset, len)
> >>> for building buffer.
> >>>
> >>> IORING_OP_WRITE_SPLICE_BUF: write from buffer which is built from
> >>> ->read_splice() of specified fd, so user needs to provide (splice_fd, offset, len)
> >>> for building buffer.
> >>>
> >>> The typical use case is for supporting ublk/fuse io_uring zero copy,
> >>> and READ/WRITE OP retrieves ublk/fuse request buffer via direct pipe
> >>> from device->read_splice(), then READ/WRITE can be done to/from this
> >>> buffer directly.
> >>
> >> Main question here - would this be better not plumbed up through the rw
> >> path? Might be cleaner, even if it either requires a bit of helper
> >> refactoring or accepting a bit of duplication. But would still be better
> >> than polluting the rw fast path imho.
> > 
> > The buffer is actually IO buffer, which has to be plumbed up in IO path,
> > and it can't be done like the registered buffer.
> > 
> > The only affect on fast path is :
> > 
> > 		if (io_rw_splice_buf(req))	//which just check opcode
> >               return io_prep_rw_splice_buf(req, sqe);
> > 
> > and the cleanup code which is only done for the two new OPs.
> > 
> > Or maybe I misunderstand your point? Or any detailed suggestion?
> > 
> > Actually the code should be factored into generic helper, since net.c
> > need to use them too. Probably it needs to move to rsrc.c?
> 
> Yep, just refactoring out those bits as a prep thing. rsrc could work,
> or perhaps a new file for that.

OK.

> 
> >> Also seems like this should be separately testable. We can't add new
> >> opcodes that don't have a feature test at least, and should also have
> >> various corner case tests. A bit of commenting outside of this below.
> > 
> > OK, I will write/add one very simple ublk userspace to liburing for
> > test purpose.
> 
> Thanks!

Thinking of further, if we use ublk for liburing test purpose, root is
often needed, even though we support un-privileged mode, which needs
administrator to grant access, so is it still good to do so?

It could be easier to add ->splice_read() on /dev/zero for test
purpose, just allocate zeroed pages in ->splice_read(), and add
them to pipe like ublk->splice_read(), and sink side can read
from or write to these pages, but zero's read_iter_zero() won't
be affected. And normal splice/tee won't connect to zero too
because we only allow it from kernel use.

> 
> >>> diff --git a/io_uring/opdef.c b/io_uring/opdef.c
> >>> index 5238ecd7af6a..91e8d8f96134 100644
> >>> --- a/io_uring/opdef.c
> >>> +++ b/io_uring/opdef.c
> >>> @@ -427,6 +427,31 @@ const struct io_issue_def io_issue_defs[] = {
> >>>  		.prep			= io_eopnotsupp_prep,
> >>>  #endif
> >>>  	},
> >>> +	[IORING_OP_READ_SPLICE_BUF] = {
> >>> +		.needs_file		= 1,
> >>> +		.unbound_nonreg_file	= 1,
> >>> +		.pollin			= 1,
> >>> +		.plug			= 1,
> >>> +		.audit_skip		= 1,
> >>> +		.ioprio			= 1,
> >>> +		.iopoll			= 1,
> >>> +		.iopoll_queue		= 1,
> >>> +		.prep			= io_prep_rw,
> >>> +		.issue			= io_read,
> >>> +	},
> >>> +	[IORING_OP_WRITE_SPLICE_BUF] = {
> >>> +		.needs_file		= 1,
> >>> +		.hash_reg_file		= 1,
> >>> +		.unbound_nonreg_file	= 1,
> >>> +		.pollout		= 1,
> >>> +		.plug			= 1,
> >>> +		.audit_skip		= 1,
> >>> +		.ioprio			= 1,
> >>> +		.iopoll			= 1,
> >>> +		.iopoll_queue		= 1,
> >>> +		.prep			= io_prep_rw,
> >>> +		.issue			= io_write,
> >>> +	},
> >>
> >> Are these really safe with iopoll?
> > 
> > Yeah, after the buffer is built, the handling is basically
> > same with IORING_OP_WRITE_FIXED, so I think it is safe.
> 
> Yeah, on a second look, as these are just using the normal read/write
> path after that should be fine indeed.
> 
> >>
> >>> +static int io_prep_rw_splice_buf(struct io_kiocb *req,
> >>> +				 const struct io_uring_sqe *sqe)
> >>> +{
> >>> +	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
> >>> +	unsigned nr_pages = io_rw_splice_buf_nr_bvecs(rw->len);
> >>> +	loff_t splice_off = READ_ONCE(sqe->splice_off_in);
> >>> +	struct io_rw_splice_buf_data data;
> >>> +	struct io_mapped_ubuf *imu;
> >>> +	struct fd splice_fd;
> >>> +	int ret;
> >>> +
> >>> +	splice_fd = fdget(READ_ONCE(sqe->splice_fd_in));
> >>> +	if (!splice_fd.file)
> >>> +		return -EBADF;
> >>
> >> Seems like this should check for SPLICE_F_FD_IN_FIXED, and also use
> >> io_file_get_normal() for the non-fixed case in case someone passed in an
> >> io_uring fd.
> > 
> > SPLICE_F_FD_IN_FIXED needs one extra word for holding splice flags, if
> > we can use sqe->addr3, I think it is doable.
> 
> I haven't checked the rest, but you can't just use ->splice_flags for
> this?

->splice_flags shares memory with rwflags, so can't be used.

I think it is fine to use ->addr3, given io_getxattr()/io_setxattr()/
io_msg_ring() has used that.

> 
> In any case, the get path needs to look like io_tee() here, and:
> 
> >>> +out_put_fd:
> >>> +	if (splice_fd.file)
> >>> +		fdput(splice_fd);
> 
> this put needs to be gated on whether it's a fixed file or not.

Yeah.

> 
> >> If the operation is done, clear NEED_CLEANUP and do the cleanup here?
> >> That'll be faster.
> > 
> > The buffer has to be cleaned up after req is completed, since bvec
> > table is needed for bio, and page reference need to be dropped after
> > IO is done too.
> 
> I mean when you clear that flag, call the cleanup bits you otherwise
> would've called on later cleanup.

Got it.

Thanks,
Ming
Jens Axboe Feb. 12, 2023, 3:55 a.m. UTC | #8
On 2/11/23 8:22?PM, Ming Lei wrote:
>>>> Also seems like this should be separately testable. We can't add new
>>>> opcodes that don't have a feature test at least, and should also have
>>>> various corner case tests. A bit of commenting outside of this below.
>>>
>>> OK, I will write/add one very simple ublk userspace to liburing for
>>> test purpose.
>>
>> Thanks!
> 
> Thinking of further, if we use ublk for liburing test purpose, root is
> often needed, even though we support un-privileged mode, which needs
> administrator to grant access, so is it still good to do so?

That's fine, some tests already depend on root for certain things, like
passthrough. When I run the tests, I do a pass as both a regular user
and as root. The important bit is just that the tests skip when they are
not root rather than fail.

> It could be easier to add ->splice_read() on /dev/zero for test
> purpose, just allocate zeroed pages in ->splice_read(), and add
> them to pipe like ublk->splice_read(), and sink side can read
> from or write to these pages, but zero's read_iter_zero() won't
> be affected. And normal splice/tee won't connect to zero too
> because we only allow it from kernel use.

Arguably /dev/zero should still support splice_read() as a regression
fix as I argued to Linus, so I'd just add that as a prep patch.

>>>> Seems like this should check for SPLICE_F_FD_IN_FIXED, and also use
>>>> io_file_get_normal() for the non-fixed case in case someone passed in an
>>>> io_uring fd.
>>>
>>> SPLICE_F_FD_IN_FIXED needs one extra word for holding splice flags, if
>>> we can use sqe->addr3, I think it is doable.
>>
>> I haven't checked the rest, but you can't just use ->splice_flags for
>> this?
> 
> ->splice_flags shares memory with rwflags, so can't be used.
> 
> I think it is fine to use ->addr3, given io_getxattr()/io_setxattr()/
> io_msg_ring() has used that.

This is part of the confusion, as you treat it basically like a
read/write internally, but the opcode names indicate differently. Why
not just have a separate prep helper for these and then use a layout
that makes more sense, surely rwflags aren't applicable for these
anyway? I think that'd make it a lot cleaner.

Yeah, addr3 could easily be used, but it's makes for a really confusing
command structure when the command is kinda-read but also kinda-splice.
And it arguable makes more sense to treat it as the latter, as it takes
the two fds like splice.
Ming Lei Feb. 13, 2023, 1:06 a.m. UTC | #9
On Sat, Feb 11, 2023 at 08:55:23PM -0700, Jens Axboe wrote:
> On 2/11/23 8:22?PM, Ming Lei wrote:
> >>>> Also seems like this should be separately testable. We can't add new
> >>>> opcodes that don't have a feature test at least, and should also have
> >>>> various corner case tests. A bit of commenting outside of this below.
> >>>
> >>> OK, I will write/add one very simple ublk userspace to liburing for
> >>> test purpose.
> >>
> >> Thanks!
> > 
> > Thinking of further, if we use ublk for liburing test purpose, root is
> > often needed, even though we support un-privileged mode, which needs
> > administrator to grant access, so is it still good to do so?
> 
> That's fine, some tests already depend on root for certain things, like
> passthrough. When I run the tests, I do a pass as both a regular user
> and as root. The important bit is just that the tests skip when they are
> not root rather than fail.

OK, that is nice! I am going to write one minimized ublk userspace,
which can be used in both liburing & blktests for test purpose.

> 
> > It could be easier to add ->splice_read() on /dev/zero for test
> > purpose, just allocate zeroed pages in ->splice_read(), and add
> > them to pipe like ublk->splice_read(), and sink side can read
> > from or write to these pages, but zero's read_iter_zero() won't
> > be affected. And normal splice/tee won't connect to zero too
> > because we only allow it from kernel use.
> 
> Arguably /dev/zero should still support splice_read() as a regression
> fix as I argued to Linus, so I'd just add that as a prep patch.

Understood.

> 
> >>>> Seems like this should check for SPLICE_F_FD_IN_FIXED, and also use
> >>>> io_file_get_normal() for the non-fixed case in case someone passed in an
> >>>> io_uring fd.
> >>>
> >>> SPLICE_F_FD_IN_FIXED needs one extra word for holding splice flags, if
> >>> we can use sqe->addr3, I think it is doable.
> >>
> >> I haven't checked the rest, but you can't just use ->splice_flags for
> >> this?
> > 
> > ->splice_flags shares memory with rwflags, so can't be used.
> > 
> > I think it is fine to use ->addr3, given io_getxattr()/io_setxattr()/
> > io_msg_ring() has used that.
> 
> This is part of the confusion, as you treat it basically like a
> read/write internally, but the opcode names indicate differently. Why
> not just have a separate prep helper for these and then use a layout

Looks separate prep is cleaner.

> that makes more sense,surely rwflags aren't applicable for these
> anyway? I think that'd make it a lot cleaner.
> 
> Yeah, addr3 could easily be used, but it's makes for a really confusing
> command structure when the command is kinda-read but also kinda-splice.
> And it arguable makes more sense to treat it as the latter, as it takes
> the two fds like splice.

Yeah, it can be thought as one direct splice between device and generic
file, and I'd take more words to share the whole story here.

1) traditional splice is: 

file A(read file A to pipe buffer) -> pipe -> file B(write pipe buffer to file B)

which implements zero copy for 'cp file_A file_B'.

2) device splice (device -> pipe -> file)

If only for doing 'cp device file', the current splice works just fine, however
we do not have syscall for handling the following more generic scenario:

	dev(produce buffer to pipe) -> pipe -> file(consume buffer from pipe)

splice is supposed for transferring pages over pipe, so the above model
is reasonable. And we do have kernel APIs for doing the above by direct
pipe: splice_direct_to_actor & __splice_from_pipe.

That is basically what this patch does, and consumer could be READ
or WRITE. The current splice syscall only supports WRITE consumer(
write pipe buffer to file) in pipe's read end.

It can be used for fuse to support READ zero copy(fuse write zero copy
was supported in 2010, but never support read zero copy because of missing
such syscall), and for supporting ublk zero copy.

Actually it can help to implement any "virt" drivers, most of which just
talks with file or socket, or anything which can be handled in userspace
efficiently.

Follows the usage, suppose the syscall is named as dev_splice()

	1) "poll" device A for incoming request
	- "poll' just represents one kernel/user communication, once it
	returns, there is request peeked

    - device is one virt device and exposed to userspace and for
	receiving request from userspace

	2) handling driver specific logic
	   - it could be request mapping, such as logical volume manager,
		 the logic can be quite complicated to require Btree to map
		 request from A to underlying files, such as dm-thin or bcache

	   - it could be network based device, nbd, ceph RBD, drbd, ..., usb
	   over IP, .., there could be meta lookup over socket, send
	   command/recv reply, crypto enc/dec, ...

	3) dev_splice(A, A_offset, len, B, B_offset, OP)
	- A is the device
	- B is one generic file(regular file, block device, socket, ...)
	- OP is the consumer operation(could just be read/write or net
	  recv/send)
	- A(produce buffer) -> direct pipe -> B(consume the buffer from
	pipe by OP)

	4) after the above device_splice() is completed, request is
	completed, and send notification to userspace

Now we have io_uring command for handling 1) & 4) efficiently, the
test over ublk has shown this point. If 3) can be supported, the
complicated driver/device logic in 2) can be moved to userspace.
Then the whole driver can be implemented in userspace efficiently,
performance could even be better than kernel driver[1][2].

The trouble is that when dev_splice() becomes much more generic, it is
harder to define it as syscall. It could be easier with io_uring
compared with splice() syscall, but still not easy enough:

- if the sqe interface(for providing parameters) can be stable from
  beginning, or can be extended in future easily

- REQ_F_* has been used up

- may cause some trouble inside io_uring implementation, such as, how
to convert to other io_uring OP handling with the provide consumer op code.

That is why I treat it as io_uring read/write from the beginning, the other
two could be just treated as net recv/send, and only difference is that
buffer is retrieved from direct pipe via splice(device, offset, len).

So which type is better?  Treating it as dev_spice() or specific consumer
OP? If we treat it as splice, is it better to define it as one single
generic OP, or muliple OPs and each one is mapped to single consumer OP?

I am open about the interface definition, any comment/suggestion is
highly welcome!

[1] https://lore.kernel.org/linux-block/Yza1u1KfKa7ycQm0@T590/
[2] https://lore.kernel.org/lkml/20230126040822.GA2858@1wt.eu/T/

Thanks,
Ming
diff mbox series

Patch

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 636a4c2c1294..bada0c91a350 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -223,6 +223,8 @@  enum io_uring_op {
 	IORING_OP_URING_CMD,
 	IORING_OP_SEND_ZC,
 	IORING_OP_SENDMSG_ZC,
+	IORING_OP_READ_SPLICE_BUF,
+	IORING_OP_WRITE_SPLICE_BUF,
 
 	/* this goes last, obviously */
 	IORING_OP_LAST,
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 5238ecd7af6a..91e8d8f96134 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -427,6 +427,31 @@  const struct io_issue_def io_issue_defs[] = {
 		.prep			= io_eopnotsupp_prep,
 #endif
 	},
+	[IORING_OP_READ_SPLICE_BUF] = {
+		.needs_file		= 1,
+		.unbound_nonreg_file	= 1,
+		.pollin			= 1,
+		.plug			= 1,
+		.audit_skip		= 1,
+		.ioprio			= 1,
+		.iopoll			= 1,
+		.iopoll_queue		= 1,
+		.prep			= io_prep_rw,
+		.issue			= io_read,
+	},
+	[IORING_OP_WRITE_SPLICE_BUF] = {
+		.needs_file		= 1,
+		.hash_reg_file		= 1,
+		.unbound_nonreg_file	= 1,
+		.pollout		= 1,
+		.plug			= 1,
+		.audit_skip		= 1,
+		.ioprio			= 1,
+		.iopoll			= 1,
+		.iopoll_queue		= 1,
+		.prep			= io_prep_rw,
+		.issue			= io_write,
+	},
 };
 
 
@@ -647,6 +672,18 @@  const struct io_cold_def io_cold_defs[] = {
 		.fail			= io_sendrecv_fail,
 #endif
 	},
+	[IORING_OP_READ_SPLICE_BUF] = {
+		.async_size		= sizeof(struct io_async_rw),
+		.name			= "READ_TO_SPLICE_BUF",
+		.cleanup		= io_read_write_cleanup,
+		.fail			= io_rw_fail,
+	},
+	[IORING_OP_WRITE_SPLICE_BUF] = {
+		.async_size		= sizeof(struct io_async_rw),
+		.name			= "WRITE_FROM_SPICE_BUF",
+		.cleanup		= io_read_write_cleanup,
+		.fail			= io_rw_fail,
+	},
 };
 
 const char *io_uring_get_opcode(u8 opcode)
diff --git a/io_uring/rw.c b/io_uring/rw.c
index efe6bfda9ca9..381514fd1bc5 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -73,6 +73,175 @@  static int io_iov_buffer_select_prep(struct io_kiocb *req)
 	return 0;
 }
 
+struct io_rw_splice_buf_data {
+	unsigned long total;
+	unsigned int  max_bvecs;
+	struct io_mapped_ubuf **imu;
+};
+
+/* the max size of whole 'io_mapped_ubuf' allocation is one page */
+static inline unsigned int io_rw_max_splice_buf_bvecs(void)
+{
+	return (PAGE_SIZE - sizeof(struct io_mapped_ubuf)) /
+			sizeof(struct bio_vec);
+}
+
+static inline unsigned int io_rw_splice_buf_nr_bvecs(unsigned long len)
+{
+	return min_t(unsigned int, (len + PAGE_SIZE - 1) >> PAGE_SHIFT,
+			io_rw_max_splice_buf_bvecs());
+}
+
+static inline bool io_rw_splice_buf(struct io_kiocb *req)
+{
+	return req->opcode == IORING_OP_READ_SPLICE_BUF ||
+		req->opcode == IORING_OP_WRITE_SPLICE_BUF;
+}
+
+static void io_rw_cleanup_splice_buf(struct io_kiocb *req)
+{
+	struct io_mapped_ubuf *imu = req->imu;
+	int i;
+
+	if (!imu)
+		return;
+
+	for (i = 0; i < imu->nr_bvecs; i++)
+		put_page(imu->bvec[i].bv_page);
+
+	req->imu = NULL;
+	kfree(imu);
+}
+
+static int io_splice_buf_actor(struct pipe_inode_info *pipe,
+			       struct pipe_buffer *buf,
+			       struct splice_desc *sd)
+{
+	struct io_rw_splice_buf_data *data = sd->u.data;
+	struct io_mapped_ubuf *imu = *data->imu;
+	struct bio_vec *bvec;
+
+	if (imu->nr_bvecs >= data->max_bvecs) {
+		/*
+		 * Double bvec allocation given we don't know
+		 * how many remains
+		 */
+		unsigned nr_bvecs = min(data->max_bvecs * 2,
+				io_rw_max_splice_buf_bvecs());
+		struct io_mapped_ubuf *new_imu;
+
+		/* can't grow, given up */
+		if (nr_bvecs <= data->max_bvecs)
+			return 0;
+
+		new_imu = krealloc(imu, struct_size(imu, bvec, nr_bvecs),
+				GFP_KERNEL);
+		if (!new_imu)
+			return -ENOMEM;
+		imu = new_imu;
+		data->max_bvecs = nr_bvecs;
+		*data->imu = imu;
+	}
+
+	if (!try_get_page(buf->page))
+		return -EINVAL;
+
+	bvec = &imu->bvec[imu->nr_bvecs];
+	bvec->bv_page = buf->page;
+	bvec->bv_offset = buf->offset;
+	bvec->bv_len = buf->len;
+	imu->nr_bvecs++;
+	data->total += buf->len;
+
+	return buf->len;
+}
+
+static int io_splice_buf_direct_actor(struct pipe_inode_info *pipe,
+			       struct splice_desc *sd)
+{
+	return __splice_from_pipe(pipe, sd, io_splice_buf_actor);
+}
+
+static int __io_prep_rw_splice_buf(struct io_kiocb *req,
+				   struct io_rw_splice_buf_data *data,
+				   struct file *splice_f,
+				   size_t len,
+				   loff_t splice_off)
+{
+	unsigned flags = req->opcode == IORING_OP_READ_SPLICE_BUF ?
+			SPLICE_F_KERN_FOR_READ : SPLICE_F_KERN_FOR_WRITE;
+	struct splice_desc sd = {
+		.total_len = len,
+		.flags = flags | SPLICE_F_NONBLOCK | SPLICE_F_KERN_NEED_CONFIRM,
+		.pos = splice_off,
+		.u.data = data,
+		.ignore_sig = true,
+	};
+
+	return splice_direct_to_actor(splice_f, &sd,
+			io_splice_buf_direct_actor);
+}
+
+static int io_prep_rw_splice_buf(struct io_kiocb *req,
+				 const struct io_uring_sqe *sqe)
+{
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+	unsigned nr_pages = io_rw_splice_buf_nr_bvecs(rw->len);
+	loff_t splice_off = READ_ONCE(sqe->splice_off_in);
+	struct io_rw_splice_buf_data data;
+	struct io_mapped_ubuf *imu;
+	struct fd splice_fd;
+	int ret;
+
+	splice_fd = fdget(READ_ONCE(sqe->splice_fd_in));
+	if (!splice_fd.file)
+		return -EBADF;
+
+	ret = -EBADF;
+	if (!(splice_fd.file->f_mode & FMODE_READ))
+		goto out_put_fd;
+
+	ret = -ENOMEM;
+	imu = kmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
+	if (!imu)
+		goto out_put_fd;
+
+	/* splice buffer actually hasn't virtual address */
+	imu->nr_bvecs = 0;
+
+	data.max_bvecs = nr_pages;
+	data.total = 0;
+	data.imu = &imu;
+
+	rw->addr = 0;
+	req->flags |= REQ_F_NEED_CLEANUP;
+
+	ret = __io_prep_rw_splice_buf(req, &data, splice_fd.file, rw->len,
+			splice_off);
+	imu = *data.imu;
+	imu->acct_pages = 0;
+	imu->ubuf = 0;
+	imu->ubuf_end = data.total;
+	rw->len = data.total;
+	req->imu = imu;
+	if (!data.total) {
+		io_rw_cleanup_splice_buf(req);
+	} else  {
+		ret = 0;
+	}
+out_put_fd:
+	if (splice_fd.file)
+		fdput(splice_fd);
+
+	return ret;
+}
+
+void io_read_write_cleanup(struct io_kiocb *req)
+{
+	if (io_rw_splice_buf(req))
+		io_rw_cleanup_splice_buf(req);
+}
+
 int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
@@ -117,6 +286,8 @@  int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		ret = io_iov_buffer_select_prep(req);
 		if (ret)
 			return ret;
+	} else if (io_rw_splice_buf(req)) {
+		return io_prep_rw_splice_buf(req, sqe);
 	}
 
 	return 0;
@@ -371,7 +542,8 @@  static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req,
 	size_t sqe_len;
 	ssize_t ret;
 
-	if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
+	if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED ||
+			io_rw_splice_buf(req)) {
 		ret = io_import_fixed(ddir, iter, req->imu, rw->addr, rw->len);
 		if (ret)
 			return ERR_PTR(ret);
diff --git a/io_uring/rw.h b/io_uring/rw.h
index 3b733f4b610a..b37d6f6ecb6a 100644
--- a/io_uring/rw.h
+++ b/io_uring/rw.h
@@ -21,4 +21,5 @@  int io_readv_prep_async(struct io_kiocb *req);
 int io_write(struct io_kiocb *req, unsigned int issue_flags);
 int io_writev_prep_async(struct io_kiocb *req);
 void io_readv_writev_cleanup(struct io_kiocb *req);
+void io_read_write_cleanup(struct io_kiocb *req);
 void io_rw_fail(struct io_kiocb *req);