diff mbox series

pipe: don't update {a,c,m}time for anonymous pipes

Message ID 20250204132153.GA20921@redhat.com (mailing list archive)
State New
Headers show
Series pipe: don't update {a,c,m}time for anonymous pipes | expand

Commit Message

Oleg Nesterov Feb. 4, 2025, 1:21 p.m. UTC
These numbers are visible in fstat() but hopefully nobody uses this
information and file_accessed/file_update_time are not that cheap.
Stupid test-case:

	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>
	#include <assert.h>
	#include <sys/ioctl.h>
	#include <sys/time.h>

	static char buf[17 * 4096];
	static struct timeval TW, TR;

	/*
	 * Write `size`-byte chunks of the global buf to fd until write()
	 * returns a value <= 0, and add the elapsed wall-clock time of the
	 * whole loop to the global accumulator TW.
	 *
	 * Returns the total number of bytes written.
	 */
	int wr(int fd, int size)
	{
		int c, r;
		struct timeval t0, t1;

		gettimeofday(&t0, NULL);
		/* Empty loop body: c accumulates the byte counts returned by write(). */
		for (c = 0; (r = write(fd, buf, size)) > 0; c += r);
		gettimeofday(&t1, NULL);
		/* TW += t1 - t0, done as an add followed by a subtract. */
		timeradd(&TW, &t1, &TW);
		timersub(&TW, &t0, &TW);

		return c;
	}

	/*
	 * Read from fd into the global buf, up to `size` bytes per call, until
	 * read() returns a value <= 0, and add the elapsed wall-clock time of
	 * the whole loop to the global accumulator TR.
	 *
	 * Returns the total number of bytes read.
	 */
	int rd(int fd, int size)
	{
		int c, r;
		struct timeval t0, t1;

		gettimeofday(&t0, NULL);
		/* Empty loop body: c accumulates the byte counts returned by read(). */
		for (c = 0; (r = read(fd, buf, size)) > 0; c += r);
		gettimeofday(&t1, NULL);
		/* TR += t1 - t0, done as an add followed by a subtract. */
		timeradd(&TR, &t1, &TR);
		timersub(&TR, &t0, &TR);

		return c;
	}

	/*
	 * Usage: test <loop> <size>
	 *
	 * Creates a pipe, then `loop` times writes to it with wr() and reads
	 * it back with rd(), using `size`-byte transfers. Prints the
	 * accumulated write time (TW), read time (TR) and their sum (TT).
	 *
	 * NOTE: the pipe() and ioctl() calls live inside assert(), as do the
	 * wr()/rd() calls in the loop, so building with NDEBUG would remove
	 * the actual work.
	 */
	int main(int argc, const char *argv[])
	{
		int fd[2], nb = 1, loop, size;

		assert(argc == 3);
		loop = atoi(argv[1]);
		size = atoi(argv[2]);

		assert(pipe(fd) == 0);
		/*
		 * FIONBIO with a non-zero argument switches both ends to
		 * non-blocking mode; presumably this is what makes the wr()/rd()
		 * loops terminate once the pipe is full/empty instead of blocking.
		 */
		assert(ioctl(fd[0], FIONBIO, &nb) == 0);
		assert(ioctl(fd[1], FIONBIO, &nb) == 0);

		/* A single transfer must fit in the static buffer. */
		assert(size <= sizeof(buf));
		/* Each round must read back exactly as many bytes as were written. */
		while (loop--)
			assert(wr(fd[1], size) == rd(fd[0], size));

		struct timeval tt;
		timeradd(&TW, &TR, &tt);
		/* Times are printed as seconds.milliseconds (tv_usec / 1000). */
		printf("TW = %lu.%03lu TR = %lu.%03lu TT = %lu.%03lu\n",
			TW.tv_sec, TW.tv_usec/1000,
			TR.tv_sec, TR.tv_usec/1000,
			tt.tv_sec, tt.tv_usec/1000);

		return 0;
	}

Before:
	# for i in 1 2 3; do /host/tmp/test 10000 100; done
	TW = 8.047 TR = 5.845 TT = 13.893
	TW = 8.091 TR = 5.872 TT = 13.963
	TW = 8.083 TR = 5.885 TT = 13.969
After:
	# for i in 1 2 3; do /host/tmp/test 10000 100; done
	TW = 4.752 TR = 4.664 TT = 9.416
	TW = 4.684 TR = 4.608 TT = 9.293
	TW = 4.736 TR = 4.652 TT = 9.388

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 fs/pipe.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

Comments

Mateusz Guzik Feb. 4, 2025, 2:21 p.m. UTC | #1
On Tue, Feb 4, 2025 at 2:22 PM Oleg Nesterov <oleg@redhat.com> wrote:
> These numbers are visible in fstat() but hopefully nobody uses this
> information and file_accessed/file_update_time are not that cheap.

I'll note majority of the problem most likely stems from
mnt_get_write_access rolling with:
        preempt_disable();
        mnt_inc_writers(mnt);
        /*
         * The store to mnt_inc_writers must be visible before we pass
         * MNT_WRITE_HOLD loop below, so that the slowpath can see our
         * incremented count after it has set MNT_WRITE_HOLD.
         */
        smp_mb();

Unfortunately as is the MNT_WRITE_HOLD thing gets set under a
spinlock, so it may be this will be hard to rework in the same vein in
which percpu rwsems are operating, but perhaps someone(tm) may take an
honest shot?

Would be of benefit everywhere and possibly obsolete this patch.

> +       if (ret > 0 && !is_pipe_inode(file_inode(filp))) {

Total nit in case there is a v2:

ret is typically > 0 and most of the time this is going to be an
anonymous pipe, so I would swap these conditions around.

A not nit is that "is_pipe_inode" is imo misleading -- a named pipe
inode is also a pipe inode. How about "is_anon_pipe"?
Oleg Nesterov Feb. 4, 2025, 2:39 p.m. UTC | #2
On 02/04, Mateusz Guzik wrote:
>
> > +       if (ret > 0 && !is_pipe_inode(file_inode(filp))) {
>
> Total nit in case there is a v2:
>
> ret is typically > 0 and most of the time this is going to be an
> anonymous pipe, so I would swap these conditions around.
>
> A not nit is that "is_pipe_inode" is imo misleading -- a named pipe
> inode is also a pipe inode. How about "is_anon_pipe"?

OK, I'll wait for other reviews and send v2 with the changes you suggest.

Oleg.
Jeff Layton Feb. 4, 2025, 2:48 p.m. UTC | #3
On Tue, 2025-02-04 at 14:21 +0100, Oleg Nesterov wrote:
> These numbers are visible in fstat() but hopefully nobody uses this
> information and file_accessed/file_update_time are not that cheap.
> Stupid test-case:
> 
> 	#include <stdio.h>
> 	#include <stdlib.h>
> 	#include <unistd.h>
> 	#include <assert.h>
> 	#include <sys/ioctl.h>
> 	#include <sys/time.h>
> 
> 	static char buf[17 * 4096];
> 	static struct timeval TW, TR;
> 
> 	int wr(int fd, int size)
> 	{
> 		int c, r;
> 		struct timeval t0, t1;
> 
> 		gettimeofday(&t0, NULL);
> 		for (c = 0; (r = write(fd, buf, size)) > 0; c += r);
> 		gettimeofday(&t1, NULL);
> 		timeradd(&TW, &t1, &TW);
> 		timersub(&TW, &t0, &TW);
> 
> 		return c;
> 	}
> 
> 	int rd(int fd, int size)
> 	{
> 		int c, r;
> 		struct timeval t0, t1;
> 
> 		gettimeofday(&t0, NULL);
> 		for (c = 0; (r = read(fd, buf, size)) > 0; c += r);
> 		gettimeofday(&t1, NULL);
> 		timeradd(&TR, &t1, &TR);
> 		timersub(&TR, &t0, &TR);
> 
> 		return c;
> 	}
> 
> 	int main(int argc, const char *argv[])
> 	{
> 		int fd[2], nb = 1, loop, size;
> 
> 		assert(argc == 3);
> 		loop = atoi(argv[1]);
> 		size = atoi(argv[2]);
> 
> 		assert(pipe(fd) == 0);
> 		assert(ioctl(fd[0], FIONBIO, &nb) == 0);
> 		assert(ioctl(fd[1], FIONBIO, &nb) == 0);
> 
> 		assert(size <= sizeof(buf));
> 		while (loop--)
> 			assert(wr(fd[1], size) == rd(fd[0], size));
> 
> 		struct timeval tt;
> 		timeradd(&TW, &TR, &tt);
> 		printf("TW = %lu.%03lu TR = %lu.%03lu TT = %lu.%03lu\n",
> 			TW.tv_sec, TW.tv_usec/1000,
> 			TR.tv_sec, TR.tv_usec/1000,
> 			tt.tv_sec, tt.tv_usec/1000);
> 
> 		return 0;
> 	}
> 
> Before:
> 	# for i in 1 2 3; do /host/tmp/test 10000 100; done
> 	TW = 8.047 TR = 5.845 TT = 13.893
> 	TW = 8.091 TR = 5.872 TT = 13.963
> 	TW = 8.083 TR = 5.885 TT = 13.969
> After:
> 	# for i in 1 2 3; do /host/tmp/test 10000 100; done
> 	TW = 4.752 TR = 4.664 TT = 9.416
> 	TW = 4.684 TR = 4.608 TT = 9.293
> 	TW = 4.736 TR = 4.652 TT = 9.388
> 
> Signed-off-by: Oleg Nesterov <oleg@redhat.com>
> ---
>  fs/pipe.c | 21 ++++++++++++++-------
>  1 file changed, 14 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/pipe.c b/fs/pipe.c
> index 94b59045ab44..baaa8c0817f1 100644
> --- a/fs/pipe.c
> +++ b/fs/pipe.c
> @@ -247,6 +247,11 @@ static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe,
>  	return tail;
>  }
>  
> +static inline bool is_pipe_inode(struct inode *inode)
> +{
> +	return inode->i_sb->s_magic == PIPEFS_MAGIC;
> +}
> +
>  static ssize_t
>  pipe_read(struct kiocb *iocb, struct iov_iter *to)
>  {
> @@ -404,7 +409,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
>  	if (wake_next_reader)
>  		wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
>  	kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
> -	if (ret > 0)
> +	if (ret > 0 && !is_pipe_inode(file_inode(filp)))

pipe_read and pipe_write are the read_iter / write_iter ops for pipefs
inodes. Is there ever a situation where is_pipe_inode() will be false
here?

>  		file_accessed(filp);
>  	return ret;
>  }
> @@ -604,11 +609,13 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
>  	kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
>  	if (wake_next_writer)
>  		wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
> -	if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
> -		int err = file_update_time(filp);
> -		if (err)
> -			ret = err;
> -		sb_end_write(file_inode(filp)->i_sb);
> +	if (ret > 0 && !is_pipe_inode(file_inode(filp))) {

Ditto.

> +		if (sb_start_write_trylock(file_inode(filp)->i_sb)) {
> +			int err = file_update_time(filp);
> +			if (err)
> +				ret = err;
> +			sb_end_write(file_inode(filp)->i_sb);
> +		}
>  	}
>  	return ret;
>  }
> @@ -1108,7 +1115,7 @@ static void wake_up_partner(struct pipe_inode_info *pipe)
>  static int fifo_open(struct inode *inode, struct file *filp)
>  {
>  	struct pipe_inode_info *pipe;
> -	bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC;
> +	bool is_pipe = is_pipe_inode(inode);

Same here: In what case is is_pipe() ever false here?

>  	int ret;
>  
>  	filp->f_pipe = 0;
Jeff Layton Feb. 4, 2025, 2:53 p.m. UTC | #4
On Tue, 2025-02-04 at 14:21 +0100, Oleg Nesterov wrote:
> These numbers are visible in fstat() but hopefully nobody uses this
> information and file_accessed/file_update_time are not that cheap.
> Stupid test-case:
> 
> 	#include <stdio.h>
> 	#include <stdlib.h>
> 	#include <unistd.h>
> 	#include <assert.h>
> 	#include <sys/ioctl.h>
> 	#include <sys/time.h>
> 
> 	static char buf[17 * 4096];
> 	static struct timeval TW, TR;
> 
> 	int wr(int fd, int size)
> 	{
> 		int c, r;
> 		struct timeval t0, t1;
> 
> 		gettimeofday(&t0, NULL);
> 		for (c = 0; (r = write(fd, buf, size)) > 0; c += r);
> 		gettimeofday(&t1, NULL);
> 		timeradd(&TW, &t1, &TW);
> 		timersub(&TW, &t0, &TW);
> 
> 		return c;
> 	}
> 
> 	int rd(int fd, int size)
> 	{
> 		int c, r;
> 		struct timeval t0, t1;
> 
> 		gettimeofday(&t0, NULL);
> 		for (c = 0; (r = read(fd, buf, size)) > 0; c += r);
> 		gettimeofday(&t1, NULL);
> 		timeradd(&TR, &t1, &TR);
> 		timersub(&TR, &t0, &TR);
> 
> 		return c;
> 	}
> 
> 	int main(int argc, const char *argv[])
> 	{
> 		int fd[2], nb = 1, loop, size;
> 
> 		assert(argc == 3);
> 		loop = atoi(argv[1]);
> 		size = atoi(argv[2]);
> 
> 		assert(pipe(fd) == 0);
> 		assert(ioctl(fd[0], FIONBIO, &nb) == 0);
> 		assert(ioctl(fd[1], FIONBIO, &nb) == 0);
> 
> 		assert(size <= sizeof(buf));
> 		while (loop--)
> 			assert(wr(fd[1], size) == rd(fd[0], size));
> 
> 		struct timeval tt;
> 		timeradd(&TW, &TR, &tt);
> 		printf("TW = %lu.%03lu TR = %lu.%03lu TT = %lu.%03lu\n",
> 			TW.tv_sec, TW.tv_usec/1000,
> 			TR.tv_sec, TR.tv_usec/1000,
> 			tt.tv_sec, tt.tv_usec/1000);
> 
> 		return 0;
> 	}
> 
> Before:
> 	# for i in 1 2 3; do /host/tmp/test 10000 100; done
> 	TW = 8.047 TR = 5.845 TT = 13.893
> 	TW = 8.091 TR = 5.872 TT = 13.963
> 	TW = 8.083 TR = 5.885 TT = 13.969
> After:
> 	# for i in 1 2 3; do /host/tmp/test 10000 100; done
> 	TW = 4.752 TR = 4.664 TT = 9.416
> 	TW = 4.684 TR = 4.608 TT = 9.293
> 	TW = 4.736 TR = 4.652 TT = 9.388
> 
> Signed-off-by: Oleg Nesterov <oleg@redhat.com>
> ---
>  fs/pipe.c | 21 ++++++++++++++-------
>  1 file changed, 14 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/pipe.c b/fs/pipe.c
> index 94b59045ab44..baaa8c0817f1 100644
> --- a/fs/pipe.c
> +++ b/fs/pipe.c
> @@ -247,6 +247,11 @@ static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe,
>  	return tail;
>  }
>  
> +static inline bool is_pipe_inode(struct inode *inode)
> +{
> +	return inode->i_sb->s_magic == PIPEFS_MAGIC;
> +}
> +
>  static ssize_t
>  pipe_read(struct kiocb *iocb, struct iov_iter *to)
>  {
> @@ -404,7 +409,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
>  	if (wake_next_reader)
>  		wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
>  	kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
> -	if (ret > 0)
> +	if (ret > 0 && !is_pipe_inode(file_inode(filp)))
>  		file_accessed(filp);
>  	return ret;
>  }
> @@ -604,11 +609,13 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
>  	kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
>  	if (wake_next_writer)
>  		wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
> -	if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
> -		int err = file_update_time(filp);
> -		if (err)
> -			ret = err;
> -		sb_end_write(file_inode(filp)->i_sb);
> +	if (ret > 0 && !is_pipe_inode(file_inode(filp))) {
> +		if (sb_start_write_trylock(file_inode(filp)->i_sb)) {
> +			int err = file_update_time(filp);
> +			if (err)
> +				ret = err;
> +			sb_end_write(file_inode(filp)->i_sb);
> +		}
>  	}
>  	return ret;
>  }
> @@ -1108,7 +1115,7 @@ static void wake_up_partner(struct pipe_inode_info *pipe)
>  static int fifo_open(struct inode *inode, struct file *filp)
>  {
>  	struct pipe_inode_info *pipe;
> -	bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC;
> +	bool is_pipe = is_pipe_inode(inode);
>  	int ret;
>  
>  	filp->f_pipe = 0;

Oh _anonymous_ pipes. You can disregard my earlier questions:

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Linus Torvalds Feb. 4, 2025, 3:28 p.m. UTC | #5
On Tue, 4 Feb 2025 at 05:22, Oleg Nesterov <oleg@redhat.com> wrote:
>
> @@ -404,7 +409,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
>         if (wake_next_reader)
>                 wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
>         kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
> -       if (ret > 0)
> +       if (ret > 0 && !is_pipe_inode(file_inode(filp)))
>                 file_accessed(filp);
>         return ret;
>  }

So I really don't like that "is_pipe_inode()" thing.

Yes, it has superficial naming problems (it should be about
*anonymous* pipes, and that should be reflected in the name), but I
think the deeper problem is that I think this should just be a
different set of read/write functions entirely.

IOW, we should have "anon_pipe_read()" and "anon_pipe_write()", and
then the named pipes just do

    int ret = anon_pipe_read(...);
    if (ret > 0)
        file_accessed(filp);
    return ret;

instead. With no tests for "what kind of pipe is this" (same for the
write side, of course)

               Linus
Oleg Nesterov Feb. 4, 2025, 3:55 p.m. UTC | #6
On 02/04, Linus Torvalds wrote:
>
> IOW, we should have "anon_pipe_read()" and "anon_pipe_write()", and
> then the named pipes just do

OK, will do in v2.

Oleg.
Christian Brauner Feb. 4, 2025, 4:34 p.m. UTC | #7
On Tue, Feb 04, 2025 at 03:21:18PM +0100, Mateusz Guzik wrote:
> On Tue, Feb 4, 2025 at 2:22 PM Oleg Nesterov <oleg@redhat.com> wrote:
> > These numbers are visible in fstat() but hopefully nobody uses this
> > information and file_accessed/file_update_time are not that cheap.
> 
> I'll note majority of the problem most likely stems from
> mnt_get_write_access rolling with:

So anonymous pipes are based on pipefs which isn't exposed to userspace
at all. Neither the superblock nor an individual mount can go read-only
or change mount properties. The superblock cannot be frozen etc.

So really that mnt_get_write_access() should be pointless for
anonymous pipes. In other words, couldn't this also just be:

diff --git a/fs/pipe.c b/fs/pipe.c
index 8df42e1ab3a3..e9989621362c 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -604,12 +604,25 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
        kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
        if (wake_next_writer)
                wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
-       if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
-               int err = file_update_time(filp);
-               if (err)
-                       ret = err;
-               sb_end_write(file_inode(filp)->i_sb);
+       if (ret > 0) {
+               int err;
+
+               if (!is_anon_pipe(file)) {
+                       if (sb_start_write_trylock(file_inode(filp)->i_sb)) {
+                               err = file_update_time(filp);
+                               if (err)
+                                       ret = err;
+
+                               sb_end_write(file_inode(filp)->i_sb);
+                       }
+               } else {
+                       // Anonymous pipes don't need all the mount + sb brouhaha.
+                       err = inode_needs_update_time(inode);
+                       if (err > 0)
+                               ret = inode_update_time(file_inode(filp), err);
+               }
        }
+
        return ret;
 }

and then have zero regression risk?
Linus Torvalds Feb. 4, 2025, 4:49 p.m. UTC | #8
On Tue, 4 Feb 2025 at 08:34, Christian Brauner <brauner@kernel.org> wrote:
>
> So really that mnt_get_write_access() should be pointless for
> anonymous pipes. In other words, couldn't this also just be:

I bet you'll find that even just inode_update_times() isn't all that cheap.

Yes, the silly "get writability" thing is pointless for pipes, but the
"get current time" really isn't a no-op either. There's all the crazy
"coarse time vs fine time" crud, there's just a *lot* of overhead.

That's not really noticeable on a "real" file where you never access
just one byte anyway, and where any write costs are elsewhere. But
pipes can be very cheap, so the silly overheads really show up.

                 Linus
diff mbox series

Patch

diff --git a/fs/pipe.c b/fs/pipe.c
index 94b59045ab44..baaa8c0817f1 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -247,6 +247,11 @@  static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe,
 	return tail;
 }
 
+static inline bool is_pipe_inode(struct inode *inode)
+{
+	return inode->i_sb->s_magic == PIPEFS_MAGIC;
+}
+
 static ssize_t
 pipe_read(struct kiocb *iocb, struct iov_iter *to)
 {
@@ -404,7 +409,7 @@  pipe_read(struct kiocb *iocb, struct iov_iter *to)
 	if (wake_next_reader)
 		wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
 	kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
-	if (ret > 0)
+	if (ret > 0 && !is_pipe_inode(file_inode(filp)))
 		file_accessed(filp);
 	return ret;
 }
@@ -604,11 +609,13 @@  pipe_write(struct kiocb *iocb, struct iov_iter *from)
 	kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 	if (wake_next_writer)
 		wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
-	if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
-		int err = file_update_time(filp);
-		if (err)
-			ret = err;
-		sb_end_write(file_inode(filp)->i_sb);
+	if (ret > 0 && !is_pipe_inode(file_inode(filp))) {
+		if (sb_start_write_trylock(file_inode(filp)->i_sb)) {
+			int err = file_update_time(filp);
+			if (err)
+				ret = err;
+			sb_end_write(file_inode(filp)->i_sb);
+		}
 	}
 	return ret;
 }
@@ -1108,7 +1115,7 @@  static void wake_up_partner(struct pipe_inode_info *pipe)
 static int fifo_open(struct inode *inode, struct file *filp)
 {
 	struct pipe_inode_info *pipe;
-	bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC;
+	bool is_pipe = is_pipe_inode(inode);
 	int ret;
 
 	filp->f_pipe = 0;