diff mbox series

[1/4] fs/splice: enhance direct pipe & splice for moving pages in kernel

Message ID 20230210153212.733006-2-ming.lei@redhat.com (mailing list archive)
State New, archived
Headers show
Series io_uring: add IORING_OP_READ[WRITE]_SPLICE_BUF | expand

Commit Message

Ming Lei Feb. 10, 2023, 3:32 p.m. UTC
Per-task direct pipe can transfer page between two files or one
file and other kernel components, especially by splice_direct_to_actor
and __splice_from_pipe().

This way is helpful for fuse/ublk to implement zero copy by transferring
pages from device to file or socket. However, when device's ->splice_read()
produces pages, the kernel consumer may read from or write to these pages,
and from device viewpoint, there could be unexpected read or write on
pages.

Enhance the limit by the following approach:

1) add kernel splice flags of SPLICE_F_KERN_FOR_[READ|WRITE] which is
   passed to device's ->read_splice(), then device can check if this
   READ or WRITE is expected on pages filled to pipe together with
   information from ppos & len

2) add kernel splice flag of SPLICE_F_KERN_NEED_CONFIRM which is passed
   to device's ->read_splice() for asking device to confirm if it
   really supports this kind of usage of feeding pages by ->read_splice().
   If device does support, pipe->ack_page_consuming is set. This way
   can avoid misuse.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 fs/splice.c               | 15 +++++++++++++++
 include/linux/pipe_fs_i.h | 10 ++++++++++
 include/linux/splice.h    | 22 ++++++++++++++++++++++
 3 files changed, 47 insertions(+)

Comments

Ming Lei Feb. 11, 2023, 3:42 p.m. UTC | #1
On Fri, Feb 10, 2023 at 11:32:09PM +0800, Ming Lei wrote:
> Per-task direct pipe can transfer page between two files or one
> file and other kernel components, especially by splice_direct_to_actor
> and __splice_from_pipe().
> 
> This way is helpful for fuse/ublk to implement zero copy by transferring
> pages from device to file or socket. However, when device's ->splice_read()
> produces pages, the kernel consumer may read from or write to these pages,
> and from device viewpoint, there could be unexpected read or write on
> pages.
> 
> Enhance the limit by the following approach:
> 
> 1) add kernel splice flags of SPLICE_F_KERN_FOR_[READ|WRITE] which is
>    passed to device's ->read_splice(), then device can check if this
>    READ or WRITE is expected on pages filled to pipe together with
>    information from ppos & len
> 
> 2) add kernel splice flag of SPLICE_F_KERN_NEED_CONFIRM which is passed
>    to device's ->read_splice() for asking device to confirm if it
>    really supports this kind of usage of feeding pages by ->read_splice().
>    If device does support, pipe->ack_page_consuming is set. This way
>    can avoid misuse.
> 
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
>  fs/splice.c               | 15 +++++++++++++++
>  include/linux/pipe_fs_i.h | 10 ++++++++++
>  include/linux/splice.h    | 22 ++++++++++++++++++++++
>  3 files changed, 47 insertions(+)
> 
> diff --git a/fs/splice.c b/fs/splice.c
> index 87d9b19349de..c4770e1644cc 100644
> --- a/fs/splice.c
> +++ b/fs/splice.c
> @@ -792,6 +792,14 @@ static long do_splice_to(struct file *in, loff_t *ppos,
>  	return in->f_op->splice_read(in, ppos, pipe, len, flags);
>  }
>  
> +static inline bool slice_read_acked(const struct pipe_inode_info *pipe,
> +			       int flags)
> +{
> +	if (flags & SPLICE_F_KERN_NEED_CONFIRM)
> +		return pipe->ack_page_consuming;
> +	return true;
> +}
> +
>  /**
>   * splice_direct_to_actor - splices data directly between two non-pipes
>   * @in:		file to splice from
> @@ -861,10 +869,17 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
>  		size_t read_len;
>  		loff_t pos = sd->pos, prev_pos = pos;
>  
> +		pipe->ack_page_consuming = false;
>  		ret = do_splice_to(in, &pos, pipe, len, flags);
>  		if (unlikely(ret <= 0))
>  			goto out_release;
>  
> +		if (!slice_read_acked(pipe, flags)) {
> +			bytes = 0;
> +			ret = -EACCES;
> +			goto out_release;
> +		}
> +
>  		read_len = ret;
>  		sd->total_len = read_len;
>  
> diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
> index 6cb65df3e3ba..09ee1a9380ec 100644
> --- a/include/linux/pipe_fs_i.h
> +++ b/include/linux/pipe_fs_i.h
> @@ -72,6 +72,7 @@ struct pipe_inode_info {
>  	unsigned int r_counter;
>  	unsigned int w_counter;
>  	bool poll_usage;
> +	bool ack_page_consuming;	/* only for direct pipe */
>  	struct page *tmp_page;
>  	struct fasync_struct *fasync_readers;
>  	struct fasync_struct *fasync_writers;
> @@ -218,6 +219,15 @@ static inline void pipe_discard_from(struct pipe_inode_info *pipe,
>  		pipe_buf_release(pipe, &pipe->bufs[--pipe->head & mask]);
>  }
>  
> +/*
> + * Called in ->splice_read() for confirming the READ/WRITE page is allowed
> + */
> +static inline void pipe_ack_page_consume(struct pipe_inode_info *pipe)
> +{
> +	if (!WARN_ON_ONCE(current->splice_pipe != pipe))
> +		pipe->ack_page_consuming = true;
> +}
> +
>  /* Differs from PIPE_BUF in that PIPE_SIZE is the length of the actual
>     memory allocation, whereas PIPE_BUF makes atomicity guarantees.  */
>  #define PIPE_SIZE		PAGE_SIZE
> diff --git a/include/linux/splice.h b/include/linux/splice.h
> index a55179fd60fc..98c471fd918d 100644
> --- a/include/linux/splice.h
> +++ b/include/linux/splice.h
> @@ -23,6 +23,28 @@
>  
>  #define SPLICE_F_ALL (SPLICE_F_MOVE|SPLICE_F_NONBLOCK|SPLICE_F_MORE|SPLICE_F_GIFT)
>  
> +/*
> + * Flags used for kernel internal page move from ->splice_read()
> + * by internal direct pipe, and user pipe can't touch these
> + * flags.
> + *
> + * Pages filled from ->splice_read() are usually moved/copied to
> + * ->splice_write(). Here address fuse/ublk zero copy by transferring
> + * page from device to file/socket for either READ or WRITE. So we
> + * need ->splice_read() to confirm if this READ/WRITE is allowed on
> + * pages filled in ->splice_read().
> + */
> +/* The page consumer is for READ from pages moved from direct pipe */
> +#define SPLICE_F_KERN_FOR_READ	(0x100)
> +/* The page consumer is for WRITE to pages moved from direct pipe */
> +#define SPLICE_F_KERN_FOR_WRITE	(0x200)
> +/*
> + * ->splice_read() has to confirm if consumer's READ/WRITE on pages
> + * is allow. If yes, ->splice_read() has to set pipe->ack_page_consuming,
> + * otherwise pipe->ack_page_consuming has to be cleared.
> + */
> +#define SPLICE_F_KERN_NEED_CONFIRM	(0x400)
> +

As Linus commented in another thread, this patch is really ugly.

I'd suggest to change to the following one, any comment is welcome!


From fb9340ce72a1c58c9428d2af7cb00b55fa4ba799 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 10 Feb 2023 09:16:46 +0000
Subject: [PATCH 2/4] fs/splice: add private flags for cross-check in two ends

Pages are transferred via pipe/splice between different subsystems.

The source subsystem may know if spliced pages are readable or
writable. The sink subsystem may know if spliced pages need to
write to or read from.

Add two pair of private flags, so that the source subsystem and
sink subsystem can run cross-check about page use correctness,
and they are supposed to be used in case of communicating over
direct pipe only. Generic splice and pipe code is supposed to
not touch the two pair of private flags.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 include/linux/pipe_fs_i.h | 8 ++++++++
 include/linux/splice.h    | 9 +++++++++
 2 files changed, 17 insertions(+)

diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 6cb65df3e3ba..959c8b9287f4 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -14,6 +14,14 @@
 #define PIPE_BUF_FLAG_LOSS	0x40	/* Message loss happened after this buffer */
 #endif
 
+/*
+ * Used by source/sink end only, don't touch them in generic
+ * splice/pipe code. Set in source side, and check in sink
+ * side
+ */
+#define PIPE_BUF_PRIV_FLAG_MAY_READ	0x1000 /* sink can read from page */
+#define PIPE_BUF_PRIV_FLAG_MAY_WRITE	0x2000 /* sink can write to page */
+
 /**
  *	struct pipe_buffer - a linux kernel pipe buffer
  *	@page: the page containing the data for the pipe buffer
diff --git a/include/linux/splice.h b/include/linux/splice.h
index a55179fd60fc..90d1d2b5327d 100644
--- a/include/linux/splice.h
+++ b/include/linux/splice.h
@@ -23,6 +23,15 @@
 
 #define SPLICE_F_ALL (SPLICE_F_MOVE|SPLICE_F_NONBLOCK|SPLICE_F_MORE|SPLICE_F_GIFT)
 
+/*
+ * Use for source/sink side only, don't touch them in generic splice/pipe
+ * code. Pass from sink side, and check in source side.
+ *
+ * So far, it is only for communicating over direct pipe.
+ */
+#define SPLICE_F_PRIV_FOR_READ	(0x100)	/* sink side will read from page */
+#define SPLICE_F_PRIV_FOR_WRITE	(0x200) /* sink side will write page */
+
 /*
  * Passed to the actors
  */
Linus Torvalds Feb. 11, 2023, 6:57 p.m. UTC | #2
On Sat, Feb 11, 2023 at 7:42 AM Ming Lei <ming.lei@redhat.com> wrote:
>
> +/*
> + * Used by source/sink end only, don't touch them in generic
> + * splice/pipe code. Set in source side, and check in sink
> + * side
> + */
> +#define PIPE_BUF_PRIV_FLAG_MAY_READ    0x1000 /* sink can read from page */
> +#define PIPE_BUF_PRIV_FLAG_MAY_WRITE   0x2000 /* sink can write to page */
> +

So this sounds much more sane and understandable, but I have two worries:

 (a) what's the point of MAY_READ? A non-readable page sounds insane
and wrong. All sinks expect to be able to read.

 (b) what about 'tee()' which duplicates a pipe buffer that has
MAY_WRITE? Particularly one that may already have been *partially*
given to something that thinks it can write to it?

So at a minimum I think the tee code then needs to clear that flag.
And I think MAY_READ is nonsense.

          Linus
Ming Lei Feb. 12, 2023, 1:39 a.m. UTC | #3
On Sat, Feb 11, 2023 at 10:57:08AM -0800, Linus Torvalds wrote:
> On Sat, Feb 11, 2023 at 7:42 AM Ming Lei <ming.lei@redhat.com> wrote:
> >
> > +/*
> > + * Used by source/sink end only, don't touch them in generic
> > + * splice/pipe code. Set in source side, and check in sink
> > + * side
> > + */
> > +#define PIPE_BUF_PRIV_FLAG_MAY_READ    0x1000 /* sink can read from page */
> > +#define PIPE_BUF_PRIV_FLAG_MAY_WRITE   0x2000 /* sink can write to page */
> > +
> 
> So this sounds much more sane and understandable, but I have two worries:
> 
>  (a) what's the point of MAY_READ? A non-readable page sounds insane
> and wrong. All sinks expect to be able to read.

For example, it is one page which needs sink end to fill data, so
we needn't to zero it in source end every time, just for avoiding
leak kernel data if (unexpected)sink end simply tried to read from
the spliced page instead of writing data to page.

> 
>  (b) what about 'tee()' which duplicates a pipe buffer that has
> MAY_WRITE? Particularly one that may already have been *partially*
> given to something that thinks it can write to it?

The flags added is for kernel code(io_uring) over direct pipe,
and the two pipe buf flags won't be copied cross tee, cause the kernel
code just use spliced pages. (io_uring -> tee())

Also because of the added SPLICE_F_PRIV_FOR_READ[WRITE], pipe buffer
flags won't be copied over tee() too, because tee() can't pass the two
flags to device's ->splice_read().

We may need to document that the two pair flags should be used
together.

thanks,
Ming
Linus Torvalds Feb. 13, 2023, 8:04 p.m. UTC | #4
On Sat, Feb 11, 2023 at 5:39 PM Ming Lei <ming.lei@redhat.com> wrote:
>
> >
> >  (a) what's the point of MAY_READ? A non-readable page sounds insane
> > and wrong. All sinks expect to be able to read.
>
> For example, it is one page which needs sink end to fill data, so
> we needn't to zero it in source end every time, just for avoiding
> leak kernel data if (unexpected)sink end simply tried to read from
> the spliced page instead of writing data to page.

I still don't understand.

A sink *reads* the data. It doesn't write the data.

There's no point trying to deal with "if unexpectedly doing crazy
things". If a sink writes the data, the sinkm is so unbelievably buggy
that it's not even funny.

And having two flags that you then say "have to be used together" is pointless.

It's not two different flags if you can't use them separately!

So I think your explanations are anything *but* explaining what you
want. They are just strange and not sensible.

Please explain to me in small words and simple sentences what it is
you want. And no, if the explanation is "the sink wants to write to
the buffer", then that's not an explanation, it's just insanity.

We *used* to have the concept of "gifting" the buffer explicitly to
the sink, so that the sink could - instead of reading from it - decide
to just use the whole buffer as-is long term. The idea was that tthe
buffer woudl literally be moved from the source to the destination,
ownership and all.

But if that's what you want, then it's not about "sink writes". It's
literally about the splice() wanting to move not just the data, but
the whole ownership of the buffer.

Anyway, I will NAK this as long as the explanations for what the
semantics are and what you want to do don't make sense. Right now they
don't.

              Linus
Ming Lei Feb. 14, 2023, 12:52 a.m. UTC | #5
On Mon, Feb 13, 2023 at 12:04:27PM -0800, Linus Torvalds wrote:
> On Sat, Feb 11, 2023 at 5:39 PM Ming Lei <ming.lei@redhat.com> wrote:
> >
> > >
> > >  (a) what's the point of MAY_READ? A non-readable page sounds insane
> > > and wrong. All sinks expect to be able to read.
> >
> > For example, it is one page which needs sink end to fill data, so
> > we needn't to zero it in source end every time, just for avoiding
> > leak kernel data if (unexpected)sink end simply tried to read from
> > the spliced page instead of writing data to page.
> 
> I still don't understand.
> 
> A sink *reads* the data. It doesn't write the data.
> 
> There's no point trying to deal with "if unexpectedly doing crazy
> things". If a sink writes the data, the sinkm is so unbelievably buggy
> that it's not even funny.
> 
> And having two flags that you then say "have to be used together" is pointless.

Actually I think it is fine to use the pipe buffer flags separately,
if MAY_READ/MAY_WRITE is set in source end, the sink side need to respect
it. All current in-tree source end actually implies both MAY_READ & MAY_WRITE.

> It's not two different flags if you can't use them separately!
> 
> So I think your explanations are anything *but* explaining what you
> want. They are just strange and not sensible.
> 
> Please explain to me in small words and simple sentences what it is
> you want. And no, if the explanation is "the sink wants to write to
> the buffer", then that's not an explanation, it's just insanity.
> 
> We *used* to have the concept of "gifting" the buffer explicitly to
> the sink, so that the sink could - instead of reading from it - decide
> to just use the whole buffer as-is long term. The idea was that tthe
> buffer woudl literally be moved from the source to the destination,
> ownership and all.
> 
> But if that's what you want, then it's not about "sink writes". It's
> literally about the splice() wanting to move not just the data, but
> the whole ownership of the buffer.

Yeah, it is actually transferring the buffer ownership, and looks
SPLICE_F_GIFT is exactly the case, but the driver side needs to set
QUEUE_FLAG_STABLE_WRITES for avoiding writeback to touch these pages.

Follows the idea:

file(devices(such as, fuse, ublk), produce pipe buffer) -> direct pipe -> file(consume the pipe buffer)

The 'consume' could be READ or WRITE.

So once SPLICE_F_GIFT is set from source side, the two buffer flags
aren't needed any more, right?

Please see the detailed explanation & use case in following link:

https://lore.kernel.org/linux-block/409656a0-7db5-d87c-3bb2-c05ff7af89af@kernel.dk/T/#m237e5973571b3d85df9fa519cf2c9762440009ba



Thanks,
Ming
Ming Lei Feb. 14, 2023, 2:35 a.m. UTC | #6
On Tue, Feb 14, 2023 at 08:52:23AM +0800, Ming Lei wrote:
> On Mon, Feb 13, 2023 at 12:04:27PM -0800, Linus Torvalds wrote:
> > On Sat, Feb 11, 2023 at 5:39 PM Ming Lei <ming.lei@redhat.com> wrote:
> > >
> > > >
> > > >  (a) what's the point of MAY_READ? A non-readable page sounds insane
> > > > and wrong. All sinks expect to be able to read.
> > >
> > > For example, it is one page which needs sink end to fill data, so
> > > we needn't to zero it in source end every time, just for avoiding
> > > leak kernel data if (unexpected)sink end simply tried to read from
> > > the spliced page instead of writing data to page.
> > 
> > I still don't understand.
> > 
> > A sink *reads* the data. It doesn't write the data.
> > 
> > There's no point trying to deal with "if unexpectedly doing crazy
> > things". If a sink writes the data, the sinkm is so unbelievably buggy
> > that it's not even funny.
> > 
> > And having two flags that you then say "have to be used together" is pointless.
> 
> Actually I think it is fine to use the pipe buffer flags separately,
> if MAY_READ/MAY_WRITE is set in source end, the sink side need to respect
> it. All current in-tree source end actually implies both MAY_READ & MAY_WRITE.
> 
> > It's not two different flags if you can't use them separately!
> > 
> > So I think your explanations are anything *but* explaining what you
> > want. They are just strange and not sensible.
> > 
> > Please explain to me in small words and simple sentences what it is
> > you want. And no, if the explanation is "the sink wants to write to
> > the buffer", then that's not an explanation, it's just insanity.
> > 
> > We *used* to have the concept of "gifting" the buffer explicitly to
> > the sink, so that the sink could - instead of reading from it - decide
> > to just use the whole buffer as-is long term. The idea was that tthe
> > buffer woudl literally be moved from the source to the destination,
> > ownership and all.
> > 
> > But if that's what you want, then it's not about "sink writes". It's
> > literally about the splice() wanting to move not just the data, but
> > the whole ownership of the buffer.
> 
> Yeah, it is actually transferring the buffer ownership, and looks
> SPLICE_F_GIFT is exactly the case, but the driver side needs to set
> QUEUE_FLAG_STABLE_WRITES for avoiding writeback to touch these pages.
> 
> Follows the idea:
> 
> file(devices(such as, fuse, ublk), produce pipe buffer) -> direct pipe -> file(consume the pipe buffer)
> 
> The 'consume' could be READ or WRITE.
> 
> So once SPLICE_F_GIFT is set from source side, the two buffer flags
> aren't needed any more, right?

Sorry, I meant PIPE_BUF_FLAG_GIFT actually.

> 
> Please see the detailed explanation & use case in following link:
> 
> https://lore.kernel.org/linux-block/409656a0-7db5-d87c-3bb2-c05ff7af89af@kernel.dk/T/#m237e5973571b3d85df9fa519cf2c9762440009ba
> 

Thanks, 
Ming
Miklos Szeredi Feb. 14, 2023, 11:03 a.m. UTC | #7
On Mon, 13 Feb 2023 at 21:04, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> On Sat, Feb 11, 2023 at 5:39 PM Ming Lei <ming.lei@redhat.com> wrote:
> >
> > >
> > >  (a) what's the point of MAY_READ? A non-readable page sounds insane
> > > and wrong. All sinks expect to be able to read.
> >
> > For example, it is one page which needs sink end to fill data, so
> > we needn't to zero it in source end every time, just for avoiding
> > leak kernel data if (unexpected)sink end simply tried to read from
> > the spliced page instead of writing data to page.
>
> I still don't understand.
>
> A sink *reads* the data. It doesn't write the data.

I think Ming is trying to generalize splice to allow flowing data in
the opposite direction.  So yes, sink would be writing to the buffer.
And it MUST NOT be reading the data since the buffer may be
uninitialized.

The problem is how to tell the original source that the buffer is
ready?  PG_uptodate comes to mind, but pipe buffers allow partial
pages to be passed around, and there's no mechanism to describe a
partially uptodate buffer.

Thanks,
Miklos
Ming Lei Feb. 14, 2023, 2:35 p.m. UTC | #8
Hi Miklos,

On Tue, Feb 14, 2023 at 12:03:44PM +0100, Miklos Szeredi wrote:
> On Mon, 13 Feb 2023 at 21:04, Linus Torvalds
> <torvalds@linux-foundation.org> wrote:
> >
> > On Sat, Feb 11, 2023 at 5:39 PM Ming Lei <ming.lei@redhat.com> wrote:
> > >
> > > >
> > > >  (a) what's the point of MAY_READ? A non-readable page sounds insane
> > > > and wrong. All sinks expect to be able to read.
> > >
> > > For example, it is one page which needs sink end to fill data, so
> > > we needn't to zero it in source end every time, just for avoiding
> > > leak kernel data if (unexpected)sink end simply tried to read from
> > > the spliced page instead of writing data to page.
> >
> > I still don't understand.
> >
> > A sink *reads* the data. It doesn't write the data.
> 
> I think Ming is trying to generalize splice to allow flowing data in
> the opposite direction.

I think it isn't opposite direction, it is just that sink may be
WRITE to buffer, and the model is:

device(produce buffer in ->splice_read()) -> direct pipe ->
	file(consume buffer via READ or WRITE)

Follows kernel side implementation:

	splice_direct_to_actor(pipe, sd, source_actor)

	direct_actor():
		__splice_from_pipe(pipe, sd, sink_actor)

	sink_actor():
		get_page()

then read from file/socket to page.

The current userspace owns the whole buffer, so I understand the buffer
ownership can be transferred to consumer/sink side.

> So yes, sink would be writing to the buffer.
> And it MUST NOT be reading the data since the buffer may be
> uninitialized.

The added SPLICE_F_PRIV_FOR_READ[WRITE] is enough to avoid
un-expected READ, but the source end needs to confirm the buffer
ownership can be transferred to consumer, probably PIPE_BUF_FLAG_GIFT
can be used for this purpose.

> 
> The problem is how to tell the original source that the buffer is
> ready?  PG_uptodate comes to mind, but pipe buffers allow partial
> pages to be passed around, and there's no mechanism to describe a
> partially uptodate buffer.

I understand it isn't one issue from block device driver viewpoint at
least, since the WRITE to buffer in sink end can be thought as DMA
to buffer from device, and it is the upper layer(FS)'s responsibility
for updating page flag. And block driver needn't to handle page
status update.

So seems it is one fuse specific issue?


Thanks,
Ming
Miklos Szeredi Feb. 14, 2023, 3:39 p.m. UTC | #9
On Tue, 14 Feb 2023 at 15:35, Ming Lei <ming.lei@redhat.com> wrote:

> I understand it isn't one issue from block device driver viewpoint at
> least, since the WRITE to buffer in sink end can be thought as DMA
> to buffer from device, and it is the upper layer(FS)'s responsibility
> for updating page flag. And block driver needn't to handle page
> status update.

The block device still needs to know when the DMA is finished, right?

Thanks,
Miklos
Ming Lei Feb. 15, 2023, 12:11 a.m. UTC | #10
On Tue, Feb 14, 2023 at 04:39:01PM +0100, Miklos Szeredi wrote:
> On Tue, 14 Feb 2023 at 15:35, Ming Lei <ming.lei@redhat.com> wrote:
> 
> > I understand it isn't one issue from block device driver viewpoint at
> > least, since the WRITE to buffer in sink end can be thought as DMA
> > to buffer from device, and it is the upper layer(FS)'s responsibility
> > for updating page flag. And block driver needn't to handle page
> > status update.
> 
> The block device still needs to know when the DMA is finished, right?

Yeah, for normal in-kernel device driver, the completion is triggered by
interrupt or io polling.

For ublk, io handling is done by userspace, here we use io_uring to
handle the IO in aio style. When the aio is completed, the userspace
gets notified of the completion.

Here the way is basically same with loop dio mode(losetup --direct-io=on),
it is still zero copy, pages from loop block driver are passed to FS(backing file)
directly for handling the original IO.


Thanks, 
Ming
Miklos Szeredi Feb. 15, 2023, 10:36 a.m. UTC | #11
On Wed, 15 Feb 2023 at 01:11, Ming Lei <ming.lei@redhat.com> wrote:
>
> On Tue, Feb 14, 2023 at 04:39:01PM +0100, Miklos Szeredi wrote:
> > On Tue, 14 Feb 2023 at 15:35, Ming Lei <ming.lei@redhat.com> wrote:
> >
> > > I understand it isn't one issue from block device driver viewpoint at
> > > least, since the WRITE to buffer in sink end can be thought as DMA
> > > to buffer from device, and it is the upper layer(FS)'s responsibility
> > > for updating page flag. And block driver needn't to handle page
> > > status update.
> >
> > The block device still needs to know when the DMA is finished, right?
>
> Yeah, for normal in-kernel device driver, the completion is triggered by
> interrupt or io polling.
>
> For ublk, io handling is done by userspace, here we use io_uring to
> handle the IO in aio style. When the aio is completed, the userspace
> gets notified of the completion.

I think it might be better if the write completion was directly
signaled to the original pipe buffer.  There are several advantages:

 - the kernel can guarantee (modulo bugs) that the buffer was
initialized, this is important if the userspace server is unprivileged

 - the server process does not need to be woken up on I/O completion

 - there could be a chain of splices involving various entities, yet
the original pipe buffer should always get the completion

I'm not sure what a good implementation would look like.  When a pipe
buffer is split, things get complicated.  Maybe just disallow
splitting on such buffers...

Thanks,
Miklos
diff mbox series

Patch

diff --git a/fs/splice.c b/fs/splice.c
index 87d9b19349de..c4770e1644cc 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -792,6 +792,14 @@  static long do_splice_to(struct file *in, loff_t *ppos,
 	return in->f_op->splice_read(in, ppos, pipe, len, flags);
 }
 
+static inline bool slice_read_acked(const struct pipe_inode_info *pipe,
+			       int flags)
+{
+	if (flags & SPLICE_F_KERN_NEED_CONFIRM)
+		return pipe->ack_page_consuming;
+	return true;
+}
+
 /**
  * splice_direct_to_actor - splices data directly between two non-pipes
  * @in:		file to splice from
@@ -861,10 +869,17 @@  ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
 		size_t read_len;
 		loff_t pos = sd->pos, prev_pos = pos;
 
+		pipe->ack_page_consuming = false;
 		ret = do_splice_to(in, &pos, pipe, len, flags);
 		if (unlikely(ret <= 0))
 			goto out_release;
 
+		if (!slice_read_acked(pipe, flags)) {
+			bytes = 0;
+			ret = -EACCES;
+			goto out_release;
+		}
+
 		read_len = ret;
 		sd->total_len = read_len;
 
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 6cb65df3e3ba..09ee1a9380ec 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -72,6 +72,7 @@  struct pipe_inode_info {
 	unsigned int r_counter;
 	unsigned int w_counter;
 	bool poll_usage;
+	bool ack_page_consuming;	/* only for direct pipe */
 	struct page *tmp_page;
 	struct fasync_struct *fasync_readers;
 	struct fasync_struct *fasync_writers;
@@ -218,6 +219,15 @@  static inline void pipe_discard_from(struct pipe_inode_info *pipe,
 		pipe_buf_release(pipe, &pipe->bufs[--pipe->head & mask]);
 }
 
+/*
+ * Called in ->splice_read() for confirming the READ/WRITE page is allowed
+ */
+static inline void pipe_ack_page_consume(struct pipe_inode_info *pipe)
+{
+	if (!WARN_ON_ONCE(current->splice_pipe != pipe))
+		pipe->ack_page_consuming = true;
+}
+
 /* Differs from PIPE_BUF in that PIPE_SIZE is the length of the actual
    memory allocation, whereas PIPE_BUF makes atomicity guarantees.  */
 #define PIPE_SIZE		PAGE_SIZE
diff --git a/include/linux/splice.h b/include/linux/splice.h
index a55179fd60fc..98c471fd918d 100644
--- a/include/linux/splice.h
+++ b/include/linux/splice.h
@@ -23,6 +23,28 @@ 
 
 #define SPLICE_F_ALL (SPLICE_F_MOVE|SPLICE_F_NONBLOCK|SPLICE_F_MORE|SPLICE_F_GIFT)
 
+/*
+ * Flags used for kernel internal page move from ->splice_read()
+ * by internal direct pipe, and user pipe can't touch these
+ * flags.
+ *
+ * Pages filled from ->splice_read() are usually moved/copied to
+ * ->splice_write(). Here address fuse/ublk zero copy by transferring
+ * page from device to file/socket for either READ or WRITE. So we
+ * need ->splice_read() to confirm if this READ/WRITE is allowed on
+ * pages filled in ->splice_read().
+ */
+/* The page consumer is for READ from pages moved from direct pipe */
+#define SPLICE_F_KERN_FOR_READ	(0x100)
+/* The page consumer is for WRITE to pages moved from direct pipe */
+#define SPLICE_F_KERN_FOR_WRITE	(0x200)
+/*
+ * ->splice_read() has to confirm if consumer's READ/WRITE on pages
+ * is allow. If yes, ->splice_read() has to set pipe->ack_page_consuming,
+ * otherwise pipe->ack_page_consuming has to be cleared.
+ */
+#define SPLICE_F_KERN_NEED_CONFIRM	(0x400)
+
 /*
  * Passed to the actors
  */