Message ID | 20250204132153.GA20921@redhat.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | pipe: don't update {a,c,m}time for anonymous pipes | expand |
On Tue, Feb 4, 2025 at 2:22 PM Oleg Nesterov <oleg@redhat.com> wrote: > These numbers are visible in fstat() but hopefully nobody uses this > information and file_accessed/file_update_time are not that cheap. I'll note majority of the problem most likely stems from mnt_get_write_access rolling with: preempt_disable(); mnt_inc_writers(mnt); /* * The store to mnt_inc_writers must be visible before we pass * MNT_WRITE_HOLD loop below, so that the slowpath can see our * incremented count after it has set MNT_WRITE_HOLD. */ smp_mb(); Unfortunately as is the MNT_WRITE_HOLD thing gets set under a spinlock, so it may be this will be hard to rework in the same vein in which percpu rwsems are operating, but perhaps someone(tm) may take an honest shot? Would be of benefit everywhere and possibly obsolete this patch. > + if (ret > 0 && !is_pipe_inode(file_inode(filp))) { Total nit in case there is a v2: ret is typically > 0 and most of the time this is going to be an anonymous pipe, so I would swap these conditions around. A not nit is that "is_pipe_inode" is imo misleading -- a named pipe inode is also a pipe inode. How about "is_anon_pipe"?
On 02/04, Mateusz Guzik wrote: > > > + if (ret > 0 && !is_pipe_inode(file_inode(filp))) { > > Total nit in case there is a v2: > > ret is typically > 0 and most of the time this is going to be an > anonymous pipe, so I would swap these conditions around. > > A not nit is that "is_pipe_inode" is imo misleading -- a named pipe > inode is also a pipe inode. How about "is_anon_pipe"? OK, I'll wait for other reviews and send v2 with the changes you suggest. Oleg.
On Tue, 2025-02-04 at 14:21 +0100, Oleg Nesterov wrote: > These numbers are visible in fstat() but hopefully nobody uses this > information and file_accessed/file_update_time are not that cheap. > Stupid test-case: > > #include <stdio.h> > #include <stdlib.h> > #include <unistd.h> > #include <assert.h> > #include <sys/ioctl.h> > #include <sys/time.h> > > static char buf[17 * 4096]; > static struct timeval TW, TR; > > int wr(int fd, int size) > { > int c, r; > struct timeval t0, t1; > > gettimeofday(&t0, NULL); > for (c = 0; (r = write(fd, buf, size)) > 0; c += r); > gettimeofday(&t1, NULL); > timeradd(&TW, &t1, &TW); > timersub(&TW, &t0, &TW); > > return c; > } > > int rd(int fd, int size) > { > int c, r; > struct timeval t0, t1; > > gettimeofday(&t0, NULL); > for (c = 0; (r = read(fd, buf, size)) > 0; c += r); > gettimeofday(&t1, NULL); > timeradd(&TR, &t1, &TR); > timersub(&TR, &t0, &TR); > > return c; > } > > int main(int argc, const char *argv[]) > { > int fd[2], nb = 1, loop, size; > > assert(argc == 3); > loop = atoi(argv[1]); > size = atoi(argv[2]); > > assert(pipe(fd) == 0); > assert(ioctl(fd[0], FIONBIO, &nb) == 0); > assert(ioctl(fd[1], FIONBIO, &nb) == 0); > > assert(size <= sizeof(buf)); > while (loop--) > assert(wr(fd[1], size) == rd(fd[0], size)); > > struct timeval tt; > timeradd(&TW, &TR, &tt); > printf("TW = %lu.%03lu TR = %lu.%03lu TT = %lu.%03lu\n", > TW.tv_sec, TW.tv_usec/1000, > TR.tv_sec, TR.tv_usec/1000, > tt.tv_sec, tt.tv_usec/1000); > > return 0; > } > > Before: > # for i in 1 2 3; do /host/tmp/test 10000 100; done > TW = 8.047 TR = 5.845 TT = 13.893 > TW = 8.091 TR = 5.872 TT = 13.963 > TW = 8.083 TR = 5.885 TT = 13.969 > After: > # for i in 1 2 3; do /host/tmp/test 10000 100; done > TW = 4.752 TR = 4.664 TT = 9.416 > TW = 4.684 TR = 4.608 TT = 9.293 > TW = 4.736 TR = 4.652 TT = 9.388 > > Signed-off-by: Oleg Nesterov <oleg@redhat.com> > --- > fs/pipe.c | 21 ++++++++++++++------- > 1 file changed, 14 insertions(+), 7 deletions(-) > > diff --git a/fs/pipe.c b/fs/pipe.c > index 94b59045ab44..baaa8c0817f1 100644 > --- a/fs/pipe.c > +++ b/fs/pipe.c > @@ -247,6 +247,11 @@ static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe, > return tail; > } > > +static inline bool is_pipe_inode(struct inode *inode) > +{ > + return inode->i_sb->s_magic == PIPEFS_MAGIC; > +} > + > static ssize_t > pipe_read(struct kiocb *iocb, struct iov_iter *to) > { > @@ -404,7 +409,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) > if (wake_next_reader) > wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); > kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); > - if (ret > 0) > + if (ret > 0 && !is_pipe_inode(file_inode(filp))) pipe_read and pipe_write are the read_iter / write_iter ops for pipefs inodes. Is there ever a situation where is_pipe_inode() will be false here? > file_accessed(filp); > return ret; > } > @@ -604,11 +609,13 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) > kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); > if (wake_next_writer) > wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); > - if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) { > - int err = file_update_time(filp); > - if (err) > - ret = err; > - sb_end_write(file_inode(filp)->i_sb); > + if (ret > 0 && !is_pipe_inode(file_inode(filp))) { Ditto. > + if (sb_start_write_trylock(file_inode(filp)->i_sb)) { > + int err = file_update_time(filp); > + if (err) > + ret = err; > + sb_end_write(file_inode(filp)->i_sb); > + } > } > return ret; > } > @@ -1108,7 +1115,7 @@ static void wake_up_partner(struct pipe_inode_info *pipe) > static int fifo_open(struct inode *inode, struct file *filp) > { > struct pipe_inode_info *pipe; > - bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC; > + bool is_pipe = is_pipe_inode(inode); Same here: In what case is is_pipe() ever false here? > int ret; > > filp->f_pipe = 0;
On Tue, 2025-02-04 at 14:21 +0100, Oleg Nesterov wrote: > These numbers are visible in fstat() but hopefully nobody uses this > information and file_accessed/file_update_time are not that cheap. > Stupid test-case: > > #include <stdio.h> > #include <stdlib.h> > #include <unistd.h> > #include <assert.h> > #include <sys/ioctl.h> > #include <sys/time.h> > > static char buf[17 * 4096]; > static struct timeval TW, TR; > > int wr(int fd, int size) > { > int c, r; > struct timeval t0, t1; > > gettimeofday(&t0, NULL); > for (c = 0; (r = write(fd, buf, size)) > 0; c += r); > gettimeofday(&t1, NULL); > timeradd(&TW, &t1, &TW); > timersub(&TW, &t0, &TW); > > return c; > } > > int rd(int fd, int size) > { > int c, r; > struct timeval t0, t1; > > gettimeofday(&t0, NULL); > for (c = 0; (r = read(fd, buf, size)) > 0; c += r); > gettimeofday(&t1, NULL); > timeradd(&TR, &t1, &TR); > timersub(&TR, &t0, &TR); > > return c; > } > > int main(int argc, const char *argv[]) > { > int fd[2], nb = 1, loop, size; > > assert(argc == 3); > loop = atoi(argv[1]); > size = atoi(argv[2]); > > assert(pipe(fd) == 0); > assert(ioctl(fd[0], FIONBIO, &nb) == 0); > assert(ioctl(fd[1], FIONBIO, &nb) == 0); > > assert(size <= sizeof(buf)); > while (loop--) > assert(wr(fd[1], size) == rd(fd[0], size)); > > struct timeval tt; > timeradd(&TW, &TR, &tt); > printf("TW = %lu.%03lu TR = %lu.%03lu TT = %lu.%03lu\n", > TW.tv_sec, TW.tv_usec/1000, > TR.tv_sec, TR.tv_usec/1000, > tt.tv_sec, tt.tv_usec/1000); > > return 0; > } > > Before: > # for i in 1 2 3; do /host/tmp/test 10000 100; done > TW = 8.047 TR = 5.845 TT = 13.893 > TW = 8.091 TR = 5.872 TT = 13.963 > TW = 8.083 TR = 5.885 TT = 13.969 > After: > # for i in 1 2 3; do /host/tmp/test 10000 100; done > TW = 4.752 TR = 4.664 TT = 9.416 > TW = 4.684 TR = 4.608 TT = 9.293 > TW = 4.736 TR = 4.652 TT = 9.388 > > Signed-off-by: Oleg Nesterov <oleg@redhat.com> > --- > fs/pipe.c | 21 ++++++++++++++------- > 1 file changed, 14 insertions(+), 7 deletions(-) > > diff --git a/fs/pipe.c b/fs/pipe.c > index 94b59045ab44..baaa8c0817f1 100644 > --- a/fs/pipe.c > +++ b/fs/pipe.c > @@ -247,6 +247,11 @@ static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe, > return tail; > } > > +static inline bool is_pipe_inode(struct inode *inode) > +{ > + return inode->i_sb->s_magic == PIPEFS_MAGIC; > +} > + > static ssize_t > pipe_read(struct kiocb *iocb, struct iov_iter *to) > { > @@ -404,7 +409,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) > if (wake_next_reader) > wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); > kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); > - if (ret > 0) > + if (ret > 0 && !is_pipe_inode(file_inode(filp))) > file_accessed(filp); > return ret; > } > @@ -604,11 +609,13 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) > kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); > if (wake_next_writer) > wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); > - if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) { > - int err = file_update_time(filp); > - if (err) > - ret = err; > - sb_end_write(file_inode(filp)->i_sb); > + if (ret > 0 && !is_pipe_inode(file_inode(filp))) { > + if (sb_start_write_trylock(file_inode(filp)->i_sb)) { > + int err = file_update_time(filp); > + if (err) > + ret = err; > + sb_end_write(file_inode(filp)->i_sb); > + } > } > return ret; > } > @@ -1108,7 +1115,7 @@ static void wake_up_partner(struct pipe_inode_info *pipe) > static int fifo_open(struct inode *inode, struct file *filp) > { > struct pipe_inode_info *pipe; > - bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC; > + bool is_pipe = is_pipe_inode(inode); > int ret; > > filp->f_pipe = 0; Oh _anonymous_ pipes. You can disregard my earlier questions: Reviewed-by: Jeff Layton <jlayton@kernel.org>
On Tue, 4 Feb 2025 at 05:22, Oleg Nesterov <oleg@redhat.com> wrote: > > @@ -404,7 +409,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) > if (wake_next_reader) > wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); > kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); > - if (ret > 0) > + if (ret > 0 && !is_pipe_inode(file_inode(filp))) > file_accessed(filp); > return ret; > } So I really don't kike that "is_pipe_inode()" thing. Yes, it has superficial naming problems (it should be about *anonymous* pipes, and that should be reflected in the name), but I think the deeper problem is that I think this should just be a different set of read/write functions entirely. IOW, we should have "anon_pipe_read()" and "anon_pipe_write()", and then the named pipes just do int ret = anon_pipe_read(...); if (ret > 0) file_accessed(filp); return ret; instead. With no tests for "what kind of pipe is this" (same for the write side, of course) Linus
On 02/04, Linus Torvalds wrote: > > IOW, we should have "anon_pipe_read()" and "anon_pipe_write()", and > then the named pipes just do OK, will do in v2. Oleg.
On Tue, Feb 04, 2025 at 03:21:18PM +0100, Mateusz Guzik wrote: > On Tue, Feb 4, 2025 at 2:22 PM Oleg Nesterov <oleg@redhat.com> wrote: > > These numbers are visible in fstat() but hopefully nobody uses this > > information and file_accessed/file_update_time are not that cheap. > > I'll note majority of the problem most likely stems from > mnt_get_write_access rolling with: So anonymous pipes are based on pipefs which isn't exposed to userspace at all. Neither the superblock nor an individual mount can go read-only or change mount properties. The superblock cannot be frozen etc. So really that mnt_get_write_access() should be pointless for anonymous pipes. In other words, couldn't this also just be: diff --git a/fs/pipe.c b/fs/pipe.c index 8df42e1ab3a3..e9989621362c 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -604,12 +604,25 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); if (wake_next_writer) wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); - if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) { - int err = file_update_time(filp); - if (err) - ret = err; - sb_end_write(file_inode(filp)->i_sb); + if (ret > 0) { + int err; + + if (!is_anon_pipe(file)) { + if (sb_start_write_trylock(file_inode(filp)->i_sb)) { + err = file_update_time(filp); + if (err) + ret = err; + + sb_end_write(file_inode(filp)->i_sb); + } + } else { + // Anonymous pipes don't need all the mount + sb bruah. + err = inode_needs_update_time(inode); + if (err > 0) + ret = inode_update_time(file_inode(filp), err); + } } + return ret; } and then have zero regression risk?
On Tue, 4 Feb 2025 at 08:34, Christian Brauner <brauner@kernel.org> wrote: > > So really that mnt_get_write_access() should be pointless for > anonymous pipes. In other words, couldn't this also just be: I bet you'll find that even just inode_update_times() isn't all that cheap. Yes, the silly "get writability" thing is pointless for pipes, but the "get current time" really isn't a no-op either. There's all the crazy "coarse time vs fine time" crud, there's justv a *lot* of overhead. That's not really noticeable on a "real" file where you never access just one byte anyway, and where any write costs are elsewhere. But pipes can be very cheap, so the silly overheads really show up. Linus
diff --git a/fs/pipe.c b/fs/pipe.c index 94b59045ab44..baaa8c0817f1 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -247,6 +247,11 @@ static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe, return tail; } +static inline bool is_pipe_inode(struct inode *inode) +{ + return inode->i_sb->s_magic == PIPEFS_MAGIC; +} + static ssize_t pipe_read(struct kiocb *iocb, struct iov_iter *to) { @@ -404,7 +409,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) if (wake_next_reader) wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); - if (ret > 0) + if (ret > 0 && !is_pipe_inode(file_inode(filp))) file_accessed(filp); return ret; } @@ -604,11 +609,13 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); if (wake_next_writer) wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); - if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) { - int err = file_update_time(filp); - if (err) - ret = err; - sb_end_write(file_inode(filp)->i_sb); + if (ret > 0 && !is_pipe_inode(file_inode(filp))) { + if (sb_start_write_trylock(file_inode(filp)->i_sb)) { + int err = file_update_time(filp); + if (err) + ret = err; + sb_end_write(file_inode(filp)->i_sb); + } } return ret; } @@ -1108,7 +1115,7 @@ static void wake_up_partner(struct pipe_inode_info *pipe) static int fifo_open(struct inode *inode, struct file *filp) { struct pipe_inode_info *pipe; - bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC; + bool is_pipe = is_pipe_inode(inode); int ret; filp->f_pipe = 0;
These numbers are visible in fstat() but hopefully nobody uses this information and file_accessed/file_update_time are not that cheap. Stupid test-case: #include <stdio.h> #include <stdlib.h> #include <unistd.h> #include <assert.h> #include <sys/ioctl.h> #include <sys/time.h> static char buf[17 * 4096]; static struct timeval TW, TR; int wr(int fd, int size) { int c, r; struct timeval t0, t1; gettimeofday(&t0, NULL); for (c = 0; (r = write(fd, buf, size)) > 0; c += r); gettimeofday(&t1, NULL); timeradd(&TW, &t1, &TW); timersub(&TW, &t0, &TW); return c; } int rd(int fd, int size) { int c, r; struct timeval t0, t1; gettimeofday(&t0, NULL); for (c = 0; (r = read(fd, buf, size)) > 0; c += r); gettimeofday(&t1, NULL); timeradd(&TR, &t1, &TR); timersub(&TR, &t0, &TR); return c; } int main(int argc, const char *argv[]) { int fd[2], nb = 1, loop, size; assert(argc == 3); loop = atoi(argv[1]); size = atoi(argv[2]); assert(pipe(fd) == 0); assert(ioctl(fd[0], FIONBIO, &nb) == 0); assert(ioctl(fd[1], FIONBIO, &nb) == 0); assert(size <= sizeof(buf)); while (loop--) assert(wr(fd[1], size) == rd(fd[0], size)); struct timeval tt; timeradd(&TW, &TR, &tt); printf("TW = %lu.%03lu TR = %lu.%03lu TT = %lu.%03lu\n", TW.tv_sec, TW.tv_usec/1000, TR.tv_sec, TR.tv_usec/1000, tt.tv_sec, tt.tv_usec/1000); return 0; } Before: # for i in 1 2 3; do /host/tmp/test 10000 100; done TW = 8.047 TR = 5.845 TT = 13.893 TW = 8.091 TR = 5.872 TT = 13.963 TW = 8.083 TR = 5.885 TT = 13.969 After: # for i in 1 2 3; do /host/tmp/test 10000 100; done TW = 4.752 TR = 4.664 TT = 9.416 TW = 4.684 TR = 4.608 TT = 9.293 TW = 4.736 TR = 4.652 TT = 9.388 Signed-off-by: Oleg Nesterov <oleg@redhat.com> --- fs/pipe.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-)