Message ID | 20230718132112.461218-4-hao.xu@linux.dev (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | io_uring getdents | expand |
On 7/18/23 21:21, Hao Xu wrote: > From: Hao Xu <howeyxu@tencent.com> > > This add support for getdents64 to io_uring, acting exactly like the > syscall: the directory is iterated from it's current's position as > stored in the file struct, and the file's position is updated exactly as > if getdents64 had been called. > > For filesystems that support NOWAIT in iterate_shared(), try to use it > first; if a user already knows the filesystem they use do not support > nowait they can force async through IOSQE_ASYNC in the sqe flags, > avoiding the need to bounce back through a useless EAGAIN return. > > Co-developed-by: Dominique Martinet <asmadeus@codewreck.org> > Signed-off-by: Dominique Martinet <asmadeus@codewreck.org> > Signed-off-by: Hao Xu <howeyxu@tencent.com> > --- > include/uapi/linux/io_uring.h | 7 +++++ > io_uring/fs.c | 55 +++++++++++++++++++++++++++++++++++ > io_uring/fs.h | 3 ++ > io_uring/opdef.c | 8 +++++ > 4 files changed, 73 insertions(+) > > diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h > index 36f9c73082de..b200b2600622 100644 > --- a/include/uapi/linux/io_uring.h > +++ b/include/uapi/linux/io_uring.h > @@ -65,6 +65,7 @@ struct io_uring_sqe { > __u32 xattr_flags; > __u32 msg_ring_flags; > __u32 uring_cmd_flags; > + __u32 getdents_flags; Looks this is not needed anymore, I'll remove this in next version. > }; > __u64 user_data; /* data to be passed back at completion time */ > /* pack this to avoid bogus arm OABI complaints */ > @@ -235,6 +236,7 @@ enum io_uring_op { > IORING_OP_URING_CMD, > IORING_OP_SEND_ZC, > IORING_OP_SENDMSG_ZC, > + IORING_OP_GETDENTS, > > /* this goes last, obviously */ > IORING_OP_LAST, > @@ -273,6 +275,11 @@ enum io_uring_op { > */ > #define SPLICE_F_FD_IN_FIXED (1U << 31) /* the last bit of __u32 */ > > +/* > + * sqe->getdents_flags > + */ > +#define IORING_GETDENTS_REWIND (1U << 0) ditto > + > /* > * POLL_ADD flags. Note that since sqe->poll_events is the flag space, the > * command flags for POLL_ADD are stored in sqe->len. > diff --git a/io_uring/fs.c b/io_uring/fs.c > index f6a69a549fd4..480f25677fed 100644 > --- a/io_uring/fs.c > +++ b/io_uring/fs.c > @@ -47,6 +47,13 @@ struct io_link { > int flags; > }; > > +struct io_getdents { > + struct file *file; > + struct linux_dirent64 __user *dirent; > + unsigned int count; > + int flags; ditto
On Tue, Jul 18, 2023 at 09:21:10PM +0800, Hao Xu wrote: > From: Hao Xu <howeyxu@tencent.com> > > This add support for getdents64 to io_uring, acting exactly like the > syscall: the directory is iterated from it's current's position as > stored in the file struct, and the file's position is updated exactly as > if getdents64 had been called. > > For filesystems that support NOWAIT in iterate_shared(), try to use it > first; if a user already knows the filesystem they use do not support > nowait they can force async through IOSQE_ASYNC in the sqe flags, > avoiding the need to bounce back through a useless EAGAIN return. > > Co-developed-by: Dominique Martinet <asmadeus@codewreck.org> > Signed-off-by: Dominique Martinet <asmadeus@codewreck.org> > Signed-off-by: Hao Xu <howeyxu@tencent.com> > --- > include/uapi/linux/io_uring.h | 7 +++++ > io_uring/fs.c | 55 +++++++++++++++++++++++++++++++++++ > io_uring/fs.h | 3 ++ > io_uring/opdef.c | 8 +++++ > 4 files changed, 73 insertions(+) > > diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h > index 36f9c73082de..b200b2600622 100644 > --- a/include/uapi/linux/io_uring.h > +++ b/include/uapi/linux/io_uring.h > @@ -65,6 +65,7 @@ struct io_uring_sqe { > __u32 xattr_flags; > __u32 msg_ring_flags; > __u32 uring_cmd_flags; > + __u32 getdents_flags; > }; > __u64 user_data; /* data to be passed back at completion time */ > /* pack this to avoid bogus arm OABI complaints */ > @@ -235,6 +236,7 @@ enum io_uring_op { > IORING_OP_URING_CMD, > IORING_OP_SEND_ZC, > IORING_OP_SENDMSG_ZC, > + IORING_OP_GETDENTS, > > /* this goes last, obviously */ > IORING_OP_LAST, > @@ -273,6 +275,11 @@ enum io_uring_op { > */ > #define SPLICE_F_FD_IN_FIXED (1U << 31) /* the last bit of __u32 */ > > +/* > + * sqe->getdents_flags > + */ > +#define IORING_GETDENTS_REWIND (1U << 0) > + > /* > * POLL_ADD flags. Note that since sqe->poll_events is the flag space, the > * command flags for POLL_ADD are stored in sqe->len. > diff --git a/io_uring/fs.c b/io_uring/fs.c > index f6a69a549fd4..480f25677fed 100644 > --- a/io_uring/fs.c > +++ b/io_uring/fs.c > @@ -47,6 +47,13 @@ struct io_link { > int flags; > }; > > +struct io_getdents { > + struct file *file; > + struct linux_dirent64 __user *dirent; > + unsigned int count; > + int flags; > +}; > + > int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) > { > struct io_rename *ren = io_kiocb_to_cmd(req, struct io_rename); > @@ -291,3 +298,51 @@ void io_link_cleanup(struct io_kiocb *req) > putname(sl->oldpath); > putname(sl->newpath); > } > + > +int io_getdents_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) > +{ > + struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents); > + > + if (READ_ONCE(sqe->off) != 0) > + return -EINVAL; > + > + gd->dirent = u64_to_user_ptr(READ_ONCE(sqe->addr)); > + gd->count = READ_ONCE(sqe->len); > + > + return 0; > +} > + > +int io_getdents(struct io_kiocb *req, unsigned int issue_flags) > +{ > + struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents); > + struct file *file = req->file; > + unsigned long getdents_flags = 0; > + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; Hm, I'm not sure what exactly the rules are for IO_URING_F_NONBLOCK. But to point this out: vfs_getdents() -> iterate_dir() { if (shared) res = down_read_killable(&inode->i_rwsem); else res = down_write_killable(&inode->i_rwsem); } which means you can still end up sleeping here before you go into a filesystem that does actually support non-waiting getdents. So if you have concurrent operations that grab inode lock (touch, mkdir etc) you can end up sleeping here. Is that intentional or an oversight? If the former can someone please explain the rules and why it's fine in this case? > + bool should_lock = file->f_mode & FMODE_ATOMIC_POS; > + int ret; > + > + if (force_nonblock) { > + if (!(req->file->f_mode & FMODE_NOWAIT)) > + return -EAGAIN; > + > + getdents_flags = DIR_CONTEXT_F_NOWAIT; > + } > + > + if (should_lock) { > + if (!force_nonblock) > + mutex_lock(&file->f_pos_lock); > + else if (!mutex_trylock(&file->f_pos_lock)) > + return -EAGAIN; > + } That now looks like it works. > + > + ret = vfs_getdents(file, gd->dirent, gd->count, getdents_flags); > + if (should_lock) > + mutex_unlock(&file->f_pos_lock);
On 7/26/23 23:00, Christian Brauner wrote: > On Tue, Jul 18, 2023 at 09:21:10PM +0800, Hao Xu wrote: >> From: Hao Xu <howeyxu@tencent.com> >> >> This add support for getdents64 to io_uring, acting exactly like the >> syscall: the directory is iterated from it's current's position as >> stored in the file struct, and the file's position is updated exactly as >> if getdents64 had been called. >> >> For filesystems that support NOWAIT in iterate_shared(), try to use it >> first; if a user already knows the filesystem they use do not support >> nowait they can force async through IOSQE_ASYNC in the sqe flags, >> avoiding the need to bounce back through a useless EAGAIN return. >> >> Co-developed-by: Dominique Martinet <asmadeus@codewreck.org> >> Signed-off-by: Dominique Martinet <asmadeus@codewreck.org> >> Signed-off-by: Hao Xu <howeyxu@tencent.com> >> --- >> include/uapi/linux/io_uring.h | 7 +++++ >> io_uring/fs.c | 55 +++++++++++++++++++++++++++++++++++ >> io_uring/fs.h | 3 ++ >> io_uring/opdef.c | 8 +++++ >> 4 files changed, 73 insertions(+) >> >> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h >> index 36f9c73082de..b200b2600622 100644 >> --- a/include/uapi/linux/io_uring.h >> +++ b/include/uapi/linux/io_uring.h >> @@ -65,6 +65,7 @@ struct io_uring_sqe { >> __u32 xattr_flags; >> __u32 msg_ring_flags; >> __u32 uring_cmd_flags; >> + __u32 getdents_flags; >> }; >> __u64 user_data; /* data to be passed back at completion time */ >> /* pack this to avoid bogus arm OABI complaints */ >> @@ -235,6 +236,7 @@ enum io_uring_op { >> IORING_OP_URING_CMD, >> IORING_OP_SEND_ZC, >> IORING_OP_SENDMSG_ZC, >> + IORING_OP_GETDENTS, >> >> /* this goes last, obviously */ >> IORING_OP_LAST, >> @@ -273,6 +275,11 @@ enum io_uring_op { >> */ >> #define SPLICE_F_FD_IN_FIXED (1U << 31) /* the last bit of __u32 */ >> >> +/* >> + * sqe->getdents_flags >> + */ >> +#define IORING_GETDENTS_REWIND (1U << 0) >> + >> /* >> * POLL_ADD flags. Note that since sqe->poll_events is the flag space, the >> * command flags for POLL_ADD are stored in sqe->len. >> diff --git a/io_uring/fs.c b/io_uring/fs.c >> index f6a69a549fd4..480f25677fed 100644 >> --- a/io_uring/fs.c >> +++ b/io_uring/fs.c >> @@ -47,6 +47,13 @@ struct io_link { >> int flags; >> }; >> >> +struct io_getdents { >> + struct file *file; >> + struct linux_dirent64 __user *dirent; >> + unsigned int count; >> + int flags; >> +}; >> + >> int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) >> { >> struct io_rename *ren = io_kiocb_to_cmd(req, struct io_rename); >> @@ -291,3 +298,51 @@ void io_link_cleanup(struct io_kiocb *req) >> putname(sl->oldpath); >> putname(sl->newpath); >> } >> + >> +int io_getdents_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) >> +{ >> + struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents); >> + >> + if (READ_ONCE(sqe->off) != 0) >> + return -EINVAL; >> + >> + gd->dirent = u64_to_user_ptr(READ_ONCE(sqe->addr)); >> + gd->count = READ_ONCE(sqe->len); >> + >> + return 0; >> +} >> + >> +int io_getdents(struct io_kiocb *req, unsigned int issue_flags) >> +{ >> + struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents); >> + struct file *file = req->file; >> + unsigned long getdents_flags = 0; >> + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; > > Hm, I'm not sure what exactly the rules are for IO_URING_F_NONBLOCK. > But to point this out: > > vfs_getdents() > -> iterate_dir() > { > if (shared) > res = down_read_killable(&inode->i_rwsem); > else > res = down_write_killable(&inode->i_rwsem); > } > > which means you can still end up sleeping here before you go into a > filesystem that does actually support non-waiting getdents. So if you > have concurrent operations that grab inode lock (touch, mkdir etc) you > can end up sleeping here. > > Is that intentional or an oversight? If the former can someone please > explain the rules and why it's fine in this case? I actually saw this semaphore, and there is another xfs lock in file_accessed --> touch_atime --> inode_update_time --> inode->i_op->update_time == xfs_vn_update_time Forgot to point them out in the cover-letter..., I didn't modify them since I'm not very sure about if we should do so, and I saw Stefan's patchset didn't modify them too. My personnal thinking is we should apply trylock logic for this inode->i_rwsem. For xfs lock in touch_atime, we should do that since it doesn't make sense to rollback all the stuff while we are almost at the end of getdents because of a lock. > >> + bool should_lock = file->f_mode & FMODE_ATOMIC_POS; >> + int ret; >> + >> + if (force_nonblock) { >> + if (!(req->file->f_mode & FMODE_NOWAIT)) >> + return -EAGAIN; >> + >> + getdents_flags = DIR_CONTEXT_F_NOWAIT; >> + } >> + >> + if (should_lock) { >> + if (!force_nonblock) >> + mutex_lock(&file->f_pos_lock); >> + else if (!mutex_trylock(&file->f_pos_lock)) >> + return -EAGAIN; >> + } > > That now looks like it works. > >> + >> + ret = vfs_getdents(file, gd->dirent, gd->count, getdents_flags); >> + if (should_lock) >> + mutex_unlock(&file->f_pos_lock);
On Thu, Jul 27, 2023 at 07:51:19PM +0800, Hao Xu wrote: > On 7/26/23 23:00, Christian Brauner wrote: > > On Tue, Jul 18, 2023 at 09:21:10PM +0800, Hao Xu wrote: > > > From: Hao Xu <howeyxu@tencent.com> > > > > > > This add support for getdents64 to io_uring, acting exactly like the > > > syscall: the directory is iterated from it's current's position as > > > stored in the file struct, and the file's position is updated exactly as > > > if getdents64 had been called. > > > > > > For filesystems that support NOWAIT in iterate_shared(), try to use it > > > first; if a user already knows the filesystem they use do not support > > > nowait they can force async through IOSQE_ASYNC in the sqe flags, > > > avoiding the need to bounce back through a useless EAGAIN return. > > > > > > Co-developed-by: Dominique Martinet <asmadeus@codewreck.org> > > > Signed-off-by: Dominique Martinet <asmadeus@codewreck.org> > > > Signed-off-by: Hao Xu <howeyxu@tencent.com> > > > --- > > > include/uapi/linux/io_uring.h | 7 +++++ > > > io_uring/fs.c | 55 +++++++++++++++++++++++++++++++++++ > > > io_uring/fs.h | 3 ++ > > > io_uring/opdef.c | 8 +++++ > > > 4 files changed, 73 insertions(+) > > > > > > diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h > > > index 36f9c73082de..b200b2600622 100644 > > > --- a/include/uapi/linux/io_uring.h > > > +++ b/include/uapi/linux/io_uring.h > > > @@ -65,6 +65,7 @@ struct io_uring_sqe { > > > __u32 xattr_flags; > > > __u32 msg_ring_flags; > > > __u32 uring_cmd_flags; > > > + __u32 getdents_flags; > > > }; > > > __u64 user_data; /* data to be passed back at completion time */ > > > /* pack this to avoid bogus arm OABI complaints */ > > > @@ -235,6 +236,7 @@ enum io_uring_op { > > > IORING_OP_URING_CMD, > > > IORING_OP_SEND_ZC, > > > IORING_OP_SENDMSG_ZC, > > > + IORING_OP_GETDENTS, > > > /* this goes last, obviously */ > > > IORING_OP_LAST, > > > @@ -273,6 +275,11 @@ enum io_uring_op { > > > */ > > > #define SPLICE_F_FD_IN_FIXED (1U << 31) /* the last bit of __u32 */ > > > +/* > > > + * sqe->getdents_flags > > > + */ > > > +#define IORING_GETDENTS_REWIND (1U << 0) > > > + > > > /* > > > * POLL_ADD flags. Note that since sqe->poll_events is the flag space, the > > > * command flags for POLL_ADD are stored in sqe->len. > > > diff --git a/io_uring/fs.c b/io_uring/fs.c > > > index f6a69a549fd4..480f25677fed 100644 > > > --- a/io_uring/fs.c > > > +++ b/io_uring/fs.c > > > @@ -47,6 +47,13 @@ struct io_link { > > > int flags; > > > }; > > > +struct io_getdents { > > > + struct file *file; > > > + struct linux_dirent64 __user *dirent; > > > + unsigned int count; > > > + int flags; > > > +}; > > > + > > > int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) > > > { > > > struct io_rename *ren = io_kiocb_to_cmd(req, struct io_rename); > > > @@ -291,3 +298,51 @@ void io_link_cleanup(struct io_kiocb *req) > > > putname(sl->oldpath); > > > putname(sl->newpath); > > > } > > > + > > > +int io_getdents_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) > > > +{ > > > + struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents); > > > + > > > + if (READ_ONCE(sqe->off) != 0) > > > + return -EINVAL; > > > + > > > + gd->dirent = u64_to_user_ptr(READ_ONCE(sqe->addr)); > > > + gd->count = READ_ONCE(sqe->len); > > > + > > > + return 0; > > > +} > > > + > > > +int io_getdents(struct io_kiocb *req, unsigned int issue_flags) > > > +{ > > > + struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents); > > > + struct file *file = req->file; > > > + unsigned long getdents_flags = 0; > > > + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; > > > > Hm, I'm not sure what exactly the rules are for IO_URING_F_NONBLOCK. > > But to point this out: > > > > vfs_getdents() > > -> iterate_dir() > > { > > if (shared) > > res = down_read_killable(&inode->i_rwsem); > > else > > res = down_write_killable(&inode->i_rwsem); > > } > > > > which means you can still end up sleeping here before you go into a > > filesystem that does actually support non-waiting getdents. So if you > > have concurrent operations that grab inode lock (touch, mkdir etc) you > > can end up sleeping here. > > > > Is that intentional or an oversight? If the former can someone please > > explain the rules and why it's fine in this case? > > I actually saw this semaphore, and there is another xfs lock in > file_accessed > --> touch_atime > --> inode_update_time > --> inode->i_op->update_time == xfs_vn_update_time > > Forgot to point them out in the cover-letter..., I didn't modify them > since I'm not very sure about if we should do so, and I saw Stefan's > patchset didn't modify them too. > > My personnal thinking is we should apply trylock logic for this > inode->i_rwsem. For xfs lock in touch_atime, we should do that since it > doesn't make sense to rollback all the stuff while we are almost at the > end of getdents because of a lock. That manoeuvres around the problem. Which I'm slightly more sensitive too as this review is a rather expensive one. Plus, it seems fixable in at least two ways: For both we need to be able to tell the filesystem that a nowait atime update is requested. Simple thing seems to me to add a S_NOWAIT flag to file_time_flags and passing that via i_op->update_time() which already has a flag argument. That would likely also help kiocb_modified(). file_accessed() -> touch_atime() -> inode_update_time() -> i_op->update_time == xfs_vn_update_time() Then we have two options afaict: (1) best-effort atime update file_accessed() already has the builtin assumption that updating atime might fail for other reasons - see the comment in there. So it is somewhat best-effort already. (2) move atime update before calling into filesystem If we want to be sure that access time is updated when a readdir request is issued through io_uring then we need to have file_accessed() give a return value and expose a new helper for io_uring or modify vfs_getdents() to do something like: vfs_getdents() { if (nowait) down_read_trylock() if (!IS_DEADDIR(inode)) { ret = file_accessed(file); if (ret == -EAGAIN) goto out_unlock; f_op->iterate_shared() } } It's not unprecedented to do update atime before the actual operation has been done afaict. That's already the case in xfs_file_write_checks() which is called before anything is written. So that seems ok. Does any of these two options work for the xfs maintainers and Jens?
On 7/27/23 15:27, Christian Brauner wrote: > On Thu, Jul 27, 2023 at 07:51:19PM +0800, Hao Xu wrote: >> On 7/26/23 23:00, Christian Brauner wrote: >>> On Tue, Jul 18, 2023 at 09:21:10PM +0800, Hao Xu wrote: >>>> From: Hao Xu <howeyxu@tencent.com> >>>> >>>> This add support for getdents64 to io_uring, acting exactly like the >>>> syscall: the directory is iterated from it's current's position as >>>> stored in the file struct, and the file's position is updated exactly as >>>> if getdents64 had been called. >>>> >>>> For filesystems that support NOWAIT in iterate_shared(), try to use it >>>> first; if a user already knows the filesystem they use do not support >>>> nowait they can force async through IOSQE_ASYNC in the sqe flags, >>>> avoiding the need to bounce back through a useless EAGAIN return. >>>> >>>> Co-developed-by: Dominique Martinet <asmadeus@codewreck.org> >>>> Signed-off-by: Dominique Martinet <asmadeus@codewreck.org> >>>> Signed-off-by: Hao Xu <howeyxu@tencent.com> >>>> --- [...] >> I actually saw this semaphore, and there is another xfs lock in >> file_accessed >> --> touch_atime >> --> inode_update_time >> --> inode->i_op->update_time == xfs_vn_update_time >> >> Forgot to point them out in the cover-letter..., I didn't modify them >> since I'm not very sure about if we should do so, and I saw Stefan's >> patchset didn't modify them too. >> >> My personnal thinking is we should apply trylock logic for this >> inode->i_rwsem. For xfs lock in touch_atime, we should do that since it >> doesn't make sense to rollback all the stuff while we are almost at the >> end of getdents because of a lock. > > That manoeuvres around the problem. Which I'm slightly more sensitive > too as this review is a rather expensive one. > > Plus, it seems fixable in at least two ways: > > For both we need to be able to tell the filesystem that a nowait atime > update is requested. Simple thing seems to me to add a S_NOWAIT flag to > file_time_flags and passing that via i_op->update_time() which already > has a flag argument. That would likely also help kiocb_modified(). fwiw, we've just recently had similar problems with io_uring read/write and atime/mtime in prod environment, so we're interested in solving that regardless of this patchset. I.e. io_uring issues rw with NOWAIT, {a,m}time touch ignores that, that stalls other submissions and completely screws latency. > file_accessed() > -> touch_atime() > -> inode_update_time() > -> i_op->update_time == xfs_vn_update_time() > > Then we have two options afaict: > > (1) best-effort atime update > > file_accessed() already has the builtin assumption that updating atime > might fail for other reasons - see the comment in there. So it is > somewhat best-effort already. > > (2) move atime update before calling into filesystem > > If we want to be sure that access time is updated when a readdir request > is issued through io_uring then we need to have file_accessed() give a > return value and expose a new helper for io_uring or modify > vfs_getdents() to do something like: > > vfs_getdents() > { > if (nowait) > down_read_trylock() > > if (!IS_DEADDIR(inode)) { > ret = file_accessed(file); > if (ret == -EAGAIN) > goto out_unlock; > > f_op->iterate_shared() > } > } > > It's not unprecedented to do update atime before the actual operation > has been done afaict. That's already the case in xfs_file_write_checks() > which is called before anything is written. So that seems ok. > > Does any of these two options work for the xfs maintainers and Jens? It doesn't look (2) will solve it for reads/writes, at least without the pain of changing the {write,read}_iter callbacks. 1) sounds good to me from the io_uring perspective, but I guess it won't work for mtime?
On Thu, Jul 27, 2023 at 04:12:12PM +0100, Pavel Begunkov wrote: > On 7/27/23 15:27, Christian Brauner wrote: > > On Thu, Jul 27, 2023 at 07:51:19PM +0800, Hao Xu wrote: > > > On 7/26/23 23:00, Christian Brauner wrote: > > > > On Tue, Jul 18, 2023 at 09:21:10PM +0800, Hao Xu wrote: > > > > > From: Hao Xu <howeyxu@tencent.com> > > > > > > > > > > This add support for getdents64 to io_uring, acting exactly like the > > > > > syscall: the directory is iterated from it's current's position as > > > > > stored in the file struct, and the file's position is updated exactly as > > > > > if getdents64 had been called. > > > > > > > > > > For filesystems that support NOWAIT in iterate_shared(), try to use it > > > > > first; if a user already knows the filesystem they use do not support > > > > > nowait they can force async through IOSQE_ASYNC in the sqe flags, > > > > > avoiding the need to bounce back through a useless EAGAIN return. > > > > > > > > > > Co-developed-by: Dominique Martinet <asmadeus@codewreck.org> > > > > > Signed-off-by: Dominique Martinet <asmadeus@codewreck.org> > > > > > Signed-off-by: Hao Xu <howeyxu@tencent.com> > > > > > --- > [...] > > > I actually saw this semaphore, and there is another xfs lock in > > > file_accessed > > > --> touch_atime > > > --> inode_update_time > > > --> inode->i_op->update_time == xfs_vn_update_time > > > > > > Forgot to point them out in the cover-letter..., I didn't modify them > > > since I'm not very sure about if we should do so, and I saw Stefan's > > > patchset didn't modify them too. > > > > > > My personnal thinking is we should apply trylock logic for this > > > inode->i_rwsem. For xfs lock in touch_atime, we should do that since it > > > doesn't make sense to rollback all the stuff while we are almost at the > > > end of getdents because of a lock. > > > > That manoeuvres around the problem. Which I'm slightly more sensitive > > too as this review is a rather expensive one. > > > > Plus, it seems fixable in at least two ways: > > > > For both we need to be able to tell the filesystem that a nowait atime > > update is requested. Simple thing seems to me to add a S_NOWAIT flag to > > file_time_flags and passing that via i_op->update_time() which already > > has a flag argument. That would likely also help kiocb_modified(). > > fwiw, we've just recently had similar problems with io_uring read/write > and atime/mtime in prod environment, so we're interested in solving that > regardless of this patchset. I.e. io_uring issues rw with NOWAIT, {a,m}time > touch ignores that, that stalls other submissions and completely screws > latency. > > > file_accessed() > > -> touch_atime() > > -> inode_update_time() > > -> i_op->update_time == xfs_vn_update_time() > > > > Then we have two options afaict: > > > > (1) best-effort atime update > > > > file_accessed() already has the builtin assumption that updating atime > > might fail for other reasons - see the comment in there. So it is > > somewhat best-effort already. > > > > (2) move atime update before calling into filesystem > > > > If we want to be sure that access time is updated when a readdir request > > is issued through io_uring then we need to have file_accessed() give a > > return value and expose a new helper for io_uring or modify > > vfs_getdents() to do something like: > > > > vfs_getdents() > > { > > if (nowait) > > down_read_trylock() > > > > if (!IS_DEADDIR(inode)) { > > ret = file_accessed(file); > > if (ret == -EAGAIN) > > goto out_unlock; > > > > f_op->iterate_shared() > > } > > } > > > > It's not unprecedented to do update atime before the actual operation > > has been done afaict. That's already the case in xfs_file_write_checks() > > which is called before anything is written. So that seems ok. > > > > Does any of these two options work for the xfs maintainers and Jens? > > It doesn't look (2) will solve it for reads/writes, at least without It would also solve it for writes which is what my kiocb_modified() comment was about. So right now you have: kiocb_modified(IOCB_NOWAI) -> file_modified_flags(IOCB_NOWAI) -> file_remove_privs(IOCB_NOWAIT) // already fully non-blocking -> file_accessed(IOCB_NOWAIT) -> i_op->update_time(S_ATIME | S_NOWAIT) and since xfs_file_write_iter() calls xfs_file_write_checks() before doing any actual work you'd now be fine. For reads xfs_file_read_iter() would need to be changed to a similar logic but that's for xfs to decide ultimately. > the pain of changing the {write,read}_iter callbacks. 1) sounds good > to me from the io_uring perspective, but I guess it won't work > for mtime? I would prefer 2) which seems cleaner to me. But I might miss why this won't work. So input needed/wanted.
On 7/27/23 16:52, Christian Brauner wrote: > On Thu, Jul 27, 2023 at 04:12:12PM +0100, Pavel Begunkov wrote: >> On 7/27/23 15:27, Christian Brauner wrote: >>> On Thu, Jul 27, 2023 at 07:51:19PM +0800, Hao Xu wrote: >>>> On 7/26/23 23:00, Christian Brauner wrote: >>>>> On Tue, Jul 18, 2023 at 09:21:10PM +0800, Hao Xu wrote: >>>>>> From: Hao Xu <howeyxu@tencent.com> >>>>>> >>>>>> This add support for getdents64 to io_uring, acting exactly like the >>>>>> syscall: the directory is iterated from it's current's position as >>>>>> stored in the file struct, and the file's position is updated exactly as >>>>>> if getdents64 had been called. >>>>>> >>>>>> For filesystems that support NOWAIT in iterate_shared(), try to use it >>>>>> first; if a user already knows the filesystem they use do not support >>>>>> nowait they can force async through IOSQE_ASYNC in the sqe flags, >>>>>> avoiding the need to bounce back through a useless EAGAIN return. >>>>>> >>>>>> Co-developed-by: Dominique Martinet <asmadeus@codewreck.org> >>>>>> Signed-off-by: Dominique Martinet <asmadeus@codewreck.org> >>>>>> Signed-off-by: Hao Xu <howeyxu@tencent.com> >>>>>> --- >> [...] >>>> I actually saw this semaphore, and there is another xfs lock in >>>> file_accessed >>>> --> touch_atime >>>> --> inode_update_time >>>> --> inode->i_op->update_time == xfs_vn_update_time >>>> >>>> Forgot to point them out in the cover-letter..., I didn't modify them >>>> since I'm not very sure about if we should do so, and I saw Stefan's >>>> patchset didn't modify them too. >>>> >>>> My personnal thinking is we should apply trylock logic for this >>>> inode->i_rwsem. For xfs lock in touch_atime, we should do that since it >>>> doesn't make sense to rollback all the stuff while we are almost at the >>>> end of getdents because of a lock. >>> >>> That manoeuvres around the problem. Which I'm slightly more sensitive >>> too as this review is a rather expensive one. >>> >>> Plus, it seems fixable in at least two ways: >>> >>> For both we need to be able to tell the filesystem that a nowait atime >>> update is requested. Simple thing seems to me to add a S_NOWAIT flag to >>> file_time_flags and passing that via i_op->update_time() which already >>> has a flag argument. That would likely also help kiocb_modified(). >> >> fwiw, we've just recently had similar problems with io_uring read/write >> and atime/mtime in prod environment, so we're interested in solving that >> regardless of this patchset. I.e. io_uring issues rw with NOWAIT, {a,m}time >> touch ignores that, that stalls other submissions and completely screws >> latency. >> >>> file_accessed() >>> -> touch_atime() >>> -> inode_update_time() >>> -> i_op->update_time == xfs_vn_update_time() >>> >>> Then we have two options afaict: >>> >>> (1) best-effort atime update >>> >>> file_accessed() already has the builtin assumption that updating atime >>> might fail for other reasons - see the comment in there. So it is >>> somewhat best-effort already. >>> >>> (2) move atime update before calling into filesystem >>> >>> If we want to be sure that access time is updated when a readdir request >>> is issued through io_uring then we need to have file_accessed() give a >>> return value and expose a new helper for io_uring or modify >>> vfs_getdents() to do something like: >>> >>> vfs_getdents() >>> { >>> if (nowait) >>> down_read_trylock() >>> >>> if (!IS_DEADDIR(inode)) { >>> ret = file_accessed(file); >>> if (ret == -EAGAIN) >>> goto out_unlock; >>> >>> f_op->iterate_shared() >>> } >>> } >>> >>> It's not unprecedented to do update atime before the actual operation >>> has been done afaict. That's already the case in xfs_file_write_checks() >>> which is called before anything is written. So that seems ok. >>> >>> Does any of these two options work for the xfs maintainers and Jens? >> >> It doesn't look (2) will solve it for reads/writes, at least without > > It would also solve it for writes which is what my kiocb_modified() > comment was about. So right now you have: Great, I assumed there are stricter requirements for mtime not transiently failing. > kiocb_modified(IOCB_NOWAI) > -> file_modified_flags(IOCB_NOWAI) > -> file_remove_privs(IOCB_NOWAIT) // already fully non-blocking > -> file_accessed(IOCB_NOWAIT) > -> i_op->update_time(S_ATIME | S_NOWAIT) > > and since xfs_file_write_iter() calls xfs_file_write_checks() before > doing any actual work you'd now be fine. > > For reads xfs_file_read_iter() would need to be changed to a similar > logic but that's for xfs to decide ultimately. > >> the pain of changing the {write,read}_iter callbacks. 1) sounds good >> to me from the io_uring perspective, but I guess it won't work >> for mtime? > > I would prefer 2) which seems cleaner to me. But I might miss why this > won't work. So input needed/wanted. Maybe I didn't fully grasp the (2) idea 2.1: all read_iter, write_iter, etc. callbacks should do file_accessed() before doing IO, which sounds like a good option if everyone agrees with that. Taking a look at direct block io, it's already like this. 2.2: Having io_uring doing file_accessed(). Since it's all currently hidden behind {read,write}_iter() callbacks and not easily extractable, it doesn't like a good option, unless I missed sth. E.g. this ugliness comes to mind. io_uring_read() { file_accessed(); file->f_op->read_iter(DONT_TOUCH_ATIME); ... } read_iter_impl() { // some pre processing if (!(flags & DONT_TOUCH_ATIME)) file_accessed(); }
On Thu, Jul 27, 2023 at 05:17:30PM +0100, Pavel Begunkov wrote: > On 7/27/23 16:52, Christian Brauner wrote: > > On Thu, Jul 27, 2023 at 04:12:12PM +0100, Pavel Begunkov wrote: > > > On 7/27/23 15:27, Christian Brauner wrote: > > > > On Thu, Jul 27, 2023 at 07:51:19PM +0800, Hao Xu wrote: > > > > > On 7/26/23 23:00, Christian Brauner wrote: > > > > > > On Tue, Jul 18, 2023 at 09:21:10PM +0800, Hao Xu wrote: > > > > > > > From: Hao Xu <howeyxu@tencent.com> > > > > > > > > > > > > > > This add support for getdents64 to io_uring, acting exactly like the > > > > > > > syscall: the directory is iterated from it's current's position as > > > > > > > stored in the file struct, and the file's position is updated exactly as > > > > > > > if getdents64 had been called. > > > > > > > > > > > > > > For filesystems that support NOWAIT in iterate_shared(), try to use it > > > > > > > first; if a user already knows the filesystem they use do not support > > > > > > > nowait they can force async through IOSQE_ASYNC in the sqe flags, > > > > > > > avoiding the need to bounce back through a useless EAGAIN return. > > > > > > > > > > > > > > Co-developed-by: Dominique Martinet <asmadeus@codewreck.org> > > > > > > > Signed-off-by: Dominique Martinet <asmadeus@codewreck.org> > > > > > > > Signed-off-by: Hao Xu <howeyxu@tencent.com> > > > > > > > --- > > > [...] > > > > > I actually saw this semaphore, and there is another xfs lock in > > > > > file_accessed > > > > > --> touch_atime > > > > > --> inode_update_time > > > > > --> inode->i_op->update_time == xfs_vn_update_time > > > > > > > > > > Forgot to point them out in the cover-letter..., I didn't modify them > > > > > since I'm not very sure about if we should do so, and I saw Stefan's > > > > > patchset didn't modify them too. > > > > > > > > > > My personnal thinking is we should apply trylock logic for this > > > > > inode->i_rwsem. For xfs lock in touch_atime, we should do that since it > > > > > doesn't make sense to rollback all the stuff while we are almost at the > > > > > end of getdents because of a lock. > > > > > > > > That manoeuvres around the problem. Which I'm slightly more sensitive > > > > too as this review is a rather expensive one. > > > > > > > > Plus, it seems fixable in at least two ways: > > > > > > > > For both we need to be able to tell the filesystem that a nowait atime > > > > update is requested. Simple thing seems to me to add a S_NOWAIT flag to > > > > file_time_flags and passing that via i_op->update_time() which already > > > > has a flag argument. That would likely also help kiocb_modified(). > > > > > > fwiw, we've just recently had similar problems with io_uring read/write > > > and atime/mtime in prod environment, so we're interested in solving that > > > regardless of this patchset. I.e. io_uring issues rw with NOWAIT, {a,m}time > > > touch ignores that, that stalls other submissions and completely screws > > > latency. > > > > > > > file_accessed() > > > > -> touch_atime() > > > > -> inode_update_time() > > > > -> i_op->update_time == xfs_vn_update_time() > > > > > > > > Then we have two options afaict: > > > > > > > > (1) best-effort atime update > > > > > > > > file_accessed() already has the builtin assumption that updating atime > > > > might fail for other reasons - see the comment in there. So it is > > > > somewhat best-effort already. > > > > > > > > (2) move atime update before calling into filesystem > > > > > > > > If we want to be sure that access time is updated when a readdir request > > > > is issued through io_uring then we need to have file_accessed() give a > > > > return value and expose a new helper for io_uring or modify > > > > vfs_getdents() to do something like: > > > > > > > > vfs_getdents() > > > > { > > > > if (nowait) > > > > down_read_trylock() > > > > > > > > if (!IS_DEADDIR(inode)) { > > > > ret = file_accessed(file); > > > > if (ret == -EAGAIN) > > > > goto out_unlock; > > > > > > > > f_op->iterate_shared() > > > > } > > > > } > > > > > > > > It's not unprecedented to do update atime before the actual operation > > > > has been done afaict. That's already the case in xfs_file_write_checks() > > > > which is called before anything is written. So that seems ok. > > > > > > > > Does any of these two options work for the xfs maintainers and Jens? > > > > > > It doesn't look (2) will solve it for reads/writes, at least without > > > > It would also solve it for writes which is what my kiocb_modified() > > comment was about. So right now you have: > > Great, I assumed there are stricter requirements for mtime not > transiently failing. But I mean then wouldn't this already be a problem today? kiocb_modified() can error out with EAGAIN today: ret = inode_needs_update_time(inode, &now); if (ret <= 0) return ret; if (flags & IOCB_NOWAIT) return -EAGAIN; return __file_update_time(file, &now, ret); the thing is that it doesn't matter for ->write_iter() - for xfs at least - because xfs does it as part of preparatory checks before actually doing any real work. The problem happens when you do actual work and afterwards call kiocb_modified(). That's why I think (2) is preferable. > > > > kiocb_modified(IOCB_NOWAI) > > -> file_modified_flags(IOCB_NOWAI) > > -> file_remove_privs(IOCB_NOWAIT) // already fully non-blocking > > -> file_accessed(IOCB_NOWAIT) > > -> i_op->update_time(S_ATIME | S_NOWAIT) > > > > and since xfs_file_write_iter() calls xfs_file_write_checks() before > > doing any actual work you'd now be fine. > > > > For reads xfs_file_read_iter() would need to be changed to a similar > > logic but that's for xfs to decide ultimately. > > > > > the pain of changing the {write,read}_iter callbacks. 1) sounds good > > > to me from the io_uring perspective, but I guess it won't work > > > for mtime? > > > > I would prefer 2) which seems cleaner to me. But I might miss why this > > won't work. So input needed/wanted. > > Maybe I didn't fully grasp the (2) idea > > 2.1: all read_iter, write_iter, etc. callbacks should do file_accessed() > before doing IO, which sounds like a good option if everyone agrees with > that. Taking a look at direct block io, it's already like this. Yes, that's what I'm talking about. I'm asking whether that's ok for xfs maintainers basically. i_op->write_iter() already works like that since the dawn of time but i_op->read_iter doesn't and I'm proposing to make it work like that and wondering if there's any issues I'm unaware of. > > 2.2: Having io_uring doing file_accessed(). Since it's all currently > hidden behind {read,write}_iter() callbacks and not easily extractable, > it doesn't like a good option, unless I missed sth. I think that would be the wrong approach and is definitely not what I meant.
Hi Christian, On 7/27/23 22:27, Christian Brauner wrote: > On Thu, Jul 27, 2023 at 07:51:19PM +0800, Hao Xu wrote: >> On 7/26/23 23:00, Christian Brauner wrote: >>> On Tue, Jul 18, 2023 at 09:21:10PM +0800, Hao Xu wrote: >>>> From: Hao Xu <howeyxu@tencent.com> >>>> >>>> This add support for getdents64 to io_uring, acting exactly like the >>>> syscall: the directory is iterated from it's current's position as >>>> stored in the file struct, and the file's position is updated exactly as >>>> if getdents64 had been called. >>>> >>>> For filesystems that support NOWAIT in iterate_shared(), try to use it >>>> first; if a user already knows the filesystem they use do not support >>>> nowait they can force async through IOSQE_ASYNC in the sqe flags, >>>> avoiding the need to bounce back through a useless EAGAIN return. >>>> >>>> Co-developed-by: Dominique Martinet <asmadeus@codewreck.org> >>>> Signed-off-by: Dominique Martinet <asmadeus@codewreck.org> >>>> Signed-off-by: Hao Xu <howeyxu@tencent.com> >>>> --- >>>> include/uapi/linux/io_uring.h | 7 +++++ >>>> io_uring/fs.c | 55 +++++++++++++++++++++++++++++++++++ >>>> io_uring/fs.h | 3 ++ >>>> io_uring/opdef.c | 8 +++++ >>>> 4 files changed, 73 insertions(+) >>>> >>>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h >>>> index 36f9c73082de..b200b2600622 100644 >>>> --- a/include/uapi/linux/io_uring.h >>>> +++ b/include/uapi/linux/io_uring.h >>>> @@ -65,6 +65,7 @@ struct io_uring_sqe { >>>> __u32 xattr_flags; >>>> __u32 msg_ring_flags; >>>> __u32 uring_cmd_flags; >>>> + __u32 getdents_flags; >>>> }; >>>> __u64 user_data; /* data to be passed back at completion time */ >>>> /* pack this to avoid bogus arm OABI complaints */ >>>> @@ -235,6 +236,7 @@ enum io_uring_op { >>>> IORING_OP_URING_CMD, >>>> IORING_OP_SEND_ZC, >>>> IORING_OP_SENDMSG_ZC, >>>> + IORING_OP_GETDENTS, >>>> /* this goes last, obviously */ >>>> IORING_OP_LAST, >>>> @@ -273,6 +275,11 @@ enum io_uring_op { >>>> */ >>>> #define SPLICE_F_FD_IN_FIXED (1U << 31) /* the last bit of __u32 */ >>>> +/* >>>> + * sqe->getdents_flags >>>> + */ >>>> +#define IORING_GETDENTS_REWIND (1U << 0) >>>> + >>>> /* >>>> * POLL_ADD flags. Note that since sqe->poll_events is the flag space, the >>>> * command flags for POLL_ADD are stored in sqe->len. >>>> diff --git a/io_uring/fs.c b/io_uring/fs.c >>>> index f6a69a549fd4..480f25677fed 100644 >>>> --- a/io_uring/fs.c >>>> +++ b/io_uring/fs.c >>>> @@ -47,6 +47,13 @@ struct io_link { >>>> int flags; >>>> }; >>>> +struct io_getdents { >>>> + struct file *file; >>>> + struct linux_dirent64 __user *dirent; >>>> + unsigned int count; >>>> + int flags; >>>> +}; >>>> + >>>> int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) >>>> { >>>> struct io_rename *ren = io_kiocb_to_cmd(req, struct io_rename); >>>> @@ -291,3 +298,51 @@ void io_link_cleanup(struct io_kiocb *req) >>>> putname(sl->oldpath); >>>> putname(sl->newpath); >>>> } >>>> + >>>> +int io_getdents_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) >>>> +{ >>>> + struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents); >>>> + >>>> + if (READ_ONCE(sqe->off) != 0) >>>> + return -EINVAL; >>>> + >>>> + gd->dirent = u64_to_user_ptr(READ_ONCE(sqe->addr)); >>>> + gd->count = READ_ONCE(sqe->len); >>>> + >>>> + return 0; >>>> +} >>>> + >>>> +int io_getdents(struct io_kiocb *req, unsigned int issue_flags) >>>> +{ >>>> + struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents); >>>> + struct file *file = req->file; >>>> + unsigned long getdents_flags = 0; >>>> + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; >>> >>> Hm, I'm not sure what exactly the rules are for IO_URING_F_NONBLOCK. >>> But to point this out: >>> >>> vfs_getdents() >>> -> iterate_dir() >>> { >>> if (shared) >>> res = down_read_killable(&inode->i_rwsem); >>> else >>> res = down_write_killable(&inode->i_rwsem); >>> } >>> >>> which means you can still end up sleeping here before you go into a >>> filesystem that does actually support non-waiting getdents. So if you >>> have concurrent operations that grab inode lock (touch, mkdir etc) you >>> can end up sleeping here. >>> >>> Is that intentional or an oversight? If the former can someone please >>> explain the rules and why it's fine in this case? >> >> I actually saw this semaphore, and there is another xfs lock in >> file_accessed >> --> touch_atime >> --> inode_update_time >> --> inode->i_op->update_time == xfs_vn_update_time >> >> Forgot to point them out in the cover-letter..., I didn't modify them >> since I'm not very sure about if we should do so, and I saw Stefan's >> patchset didn't modify them too. >> >> My personnal thinking is we should apply trylock logic for this >> inode->i_rwsem. For xfs lock in touch_atime, we should do that since it >> doesn't make sense to rollback all the stuff while we are almost at the >> end of getdents because of a lock. > > That manoeuvres around the problem. Which I'm slightly more sensitive > too as this review is a rather expensive one. > > Plus, it seems fixable in at least two ways: > > For both we need to be able to tell the filesystem that a nowait atime > update is requested. Simple thing seems to me to add a S_NOWAIT flag to > file_time_flags and passing that via i_op->update_time() which already > has a flag argument. That would likely also help kiocb_modified(). > > file_accessed() > -> touch_atime() > -> inode_update_time() > -> i_op->update_time == xfs_vn_update_time() > > Then we have two options afaict: > > (1) best-effort atime update > > file_accessed() already has the builtin assumption that updating atime > might fail for other reasons - see the comment in there. So it is > somewhat best-effort already. > > (2) move atime update before calling into filesystem > > If we want to be sure that access time is updated when a readdir request > is issued through io_uring then we need to have file_accessed() give a > return value and expose a new helper for io_uring or modify > vfs_getdents() to do something like: > > vfs_getdents() > { > if (nowait) > down_read_trylock() > > if (!IS_DEADDIR(inode)) { > ret = file_accessed(file); > if (ret == -EAGAIN) > goto out_unlock; > > f_op->iterate_shared() > } > } > > It's not unprecedented to do update atime before the actual operation > has been done afaict. That's already the case in xfs_file_write_checks() > which is called before anything is written. So that seems ok. I'm not familiar with this part(the time update), I guess we should revert the updated time if we succeed to do file_accessed(file) but fail somewhere later in f_op->iterate_shared()? Or is it definitely counted as an "access" as long as we start to call getdents to a file? Thanks, Hao > > Does any of these two options work for the xfs maintainers and Jens?
On Thu, Jul 27, 2023 at 04:27:30PM +0200, Christian Brauner wrote: > On Thu, Jul 27, 2023 at 07:51:19PM +0800, Hao Xu wrote: > > I actually saw this semaphore, and there is another xfs lock in > > file_accessed > > --> touch_atime > > --> inode_update_time > > --> inode->i_op->update_time == xfs_vn_update_time > > > > Forgot to point them out in the cover-letter..., I didn't modify them > > since I'm not very sure about if we should do so, and I saw Stefan's > > patchset didn't modify them too. > > > > My personnal thinking is we should apply trylock logic for this > > inode->i_rwsem. For xfs lock in touch_atime, we should do that since it > > doesn't make sense to rollback all the stuff while we are almost at the > > end of getdents because of a lock. > > That manoeuvres around the problem. Which I'm slightly more sensitive > too as this review is a rather expensive one. > > Plus, it seems fixable in at least two ways: > > For both we need to be able to tell the filesystem that a nowait atime > update is requested. Simple thing seems to me to add a S_NOWAIT flag to > file_time_flags and passing that via i_op->update_time() which already > has a flag argument. That would likely also help kiocb_modified(). Wait - didn't we already fix this for mtime updates on IOCB_NOWAIT modification operations? Yeah, we did: kiocb_modified(iocb) file_modified_flags(iocb->ki_file, iocb->ki_flags) .... ret = inode_needs_update_time() if (ret <= 0) return ret; if (flags & IOCB_NOWAIT) return -EAGAIN; <does timestamp update> > file_accessed() > -> touch_atime() > -> inode_update_time() > -> i_op->update_time == xfs_vn_update_time() Yeah, so this needs the same treatment as file_modified_flags() - touch_atime() needs a flag variant that passes IOCB_NOWAIT, and after atime_needs_update() returns trues we should check IOCB_NOWAIT and return EAGAIN if it is set. That will punt the operation that needs to the update to a worker thread that can block.... > Then we have two options afaict: > > (1) best-effort atime update > > file_accessed() already has the builtin assumption that updating atime > might fail for other reasons - see the comment in there. So it is > somewhat best-effort already. > > (2) move atime update before calling into filesystem > > If we want to be sure that access time is updated when a readdir request > is issued through io_uring then we need to have file_accessed() give a > return value and expose a new helper for io_uring or modify > vfs_getdents() to do something like: > > vfs_getdents() > { > if (nowait) > down_read_trylock() > > if (!IS_DEADDIR(inode)) { > ret = file_accessed(file); > if (ret == -EAGAIN) > goto out_unlock; > > f_op->iterate_shared() > } > } Yup, that's the sort of thing that needs to be done. But as I said in the "llseek for io-uring" thread, we need to stop the game of whack-a-mole passing random nowait boolean flags to VFS operations before it starts in earnest. We really need a common context structure (like we have a kiocb for IO operations) that holds per operation control state so we have consistency across all the operations that we need different behaviours for. > It's not unprecedented to do update atime before the actual operation > has been done afaict. That's already the case in xfs_file_write_checks() > which is called before anything is written. So that seems ok. Writes don't update atime - they update mtime, and there are other considerations for doing the mtime update before the data modification. e.g. we have to strip permissions from the inode prior to any changes that are going to be made, so we've already got to do all the "change inode metadata" checks and modifications before the data is written anyway.... Cheers, Dave.
On Thu, Jul 27, 2023 at 06:28:52PM +0200, Christian Brauner wrote: > On Thu, Jul 27, 2023 at 05:17:30PM +0100, Pavel Begunkov wrote: > > On 7/27/23 16:52, Christian Brauner wrote: > > > On Thu, Jul 27, 2023 at 04:12:12PM +0100, Pavel Begunkov wrote: > > > It would also solve it for writes which is what my kiocb_modified() > > > comment was about. So right now you have: > > > > Great, I assumed there are stricter requirements for mtime not > > transiently failing. > > But I mean then wouldn't this already be a problem today? > kiocb_modified() can error out with EAGAIN today: > > ret = inode_needs_update_time(inode, &now); > if (ret <= 0) > return ret; > if (flags & IOCB_NOWAIT) > return -EAGAIN; > > return __file_update_time(file, &now, ret); > > the thing is that it doesn't matter for ->write_iter() - for xfs at > least - because xfs does it as part of preparatory checks before > actually doing any real work. The problem happens when you do actual > work and afterwards call kiocb_modified(). That's why I think (2) is > preferable. This has nothing to do with what "XFS does". It's actually an IOCB_NOWAIT API design constraint. That is, IOCB_NOWAIT means "complete the whole operation without blocking or return -EAGAIN having done nothing". If we have to do something that might block (like a timestamp update) then we need to punt the entire operation before anything has been modified. This requires all the "do we need to modify this" checks to be done up front before we start modifying anything. So while it looks like this might be "an XFS thing", that's because XFS tends to be the first filesystem that most io_uring NOWAIT functionality is implemented on. IOWs, what you see is XFS is doing things the way IOCB_NOWAIT requires to be done. i.e. it's a demonstration of how nonblocking filesystem modification operations need to be run, not an "XFS thing"... > > > I would prefer 2) which seems cleaner to me. But I might miss why this > > > won't work. So input needed/wanted. > > > > Maybe I didn't fully grasp the (2) idea > > > > 2.1: all read_iter, write_iter, etc. callbacks should do file_accessed() > > before doing IO, which sounds like a good option if everyone agrees with > > that. Taking a look at direct block io, it's already like this. > > Yes, that's what I'm talking about. I'm asking whether that's ok for xfs > maintainers basically. i_op->write_iter() already works like that since > the dawn of time but i_op->read_iter doesn't and I'm proposing to make > it work like that and wondering if there's any issues I'm unaware of. XFS already calls file_accessed() in the DIO read path before the read gets issued. I don't see any problem with lifting it to before the copy-out loop in filemap_read() because it is run regardless of whether any data is read or any error occurred. Hence it just doesn't look like it matters if it is run before or after the copy-out loop to me.... -Dave.
On 7/31/23 09:58, Dave Chinner wrote: > On Thu, Jul 27, 2023 at 06:28:52PM +0200, Christian Brauner wrote: >> On Thu, Jul 27, 2023 at 05:17:30PM +0100, Pavel Begunkov wrote: >>> On 7/27/23 16:52, Christian Brauner wrote: >>>> On Thu, Jul 27, 2023 at 04:12:12PM +0100, Pavel Begunkov wrote: >>>> It would also solve it for writes which is what my kiocb_modified() >>>> comment was about. So right now you have: >>> >>> Great, I assumed there are stricter requirements for mtime not >>> transiently failing. >> >> But I mean then wouldn't this already be a problem today? >> kiocb_modified() can error out with EAGAIN today: >> >> ret = inode_needs_update_time(inode, &now); >> if (ret <= 0) >> return ret; >> if (flags & IOCB_NOWAIT) >> return -EAGAIN; >> >> return __file_update_time(file, &now, ret); >> >> the thing is that it doesn't matter for ->write_iter() - for xfs at >> least - because xfs does it as part of preparatory checks before >> actually doing any real work. The problem happens when you do actual >> work and afterwards call kiocb_modified(). That's why I think (2) is >> preferable. > > This has nothing to do with what "XFS does". It's actually an > IOCB_NOWAIT API design constraint. > > That is, IOCB_NOWAIT means "complete the whole operation without > blocking or return -EAGAIN having done nothing". If we have to do > something that might block (like a timestamp update) then we need to > punt the entire operation before anything has been modified. This > requires all the "do we need to modify this" checks to be done up > front before we start modifying anything. > > So while it looks like this might be "an XFS thing", that's because > XFS tends to be the first filesystem that most io_uring NOWAIT > functionality is implemented on. IOWs, what you see is XFS is doing > things the way IOCB_NOWAIT requires to be done. i.e. it's a > demonstration of how nonblocking filesystem modification operations > need to be run, not an "XFS thing"... > >>>> I would prefer 2) which seems cleaner to me. But I might miss why this >>>> won't work. So input needed/wanted. >>> >>> Maybe I didn't fully grasp the (2) idea >>> >>> 2.1: all read_iter, write_iter, etc. callbacks should do file_accessed() >>> before doing IO, which sounds like a good option if everyone agrees with >>> that. Taking a look at direct block io, it's already like this. >> >> Yes, that's what I'm talking about. I'm asking whether that's ok for xfs >> maintainers basically. i_op->write_iter() already works like that since >> the dawn of time but i_op->read_iter doesn't and I'm proposing to make >> it work like that and wondering if there's any issues I'm unaware of. > > XFS already calls file_accessed() in the DIO read path before the > read gets issued. I don't see any problem with lifting it to before Hi Dave, Here I've a question, in DIO read path, if we update the time but later somehow got errors before actual reading, e.g. return -EAGAIN from the xfs_ilock_iocb(), shouldn't we revert the time update since we actually doesn't read the file? We can lazily update the time but on the contrary a false update sounds weird to me. Thanks, Hao > the copy-out loop in filemap_read() because it is run regardless of > whether any data is read or any error occurred. Hence it just > doesn't look like it matters if it is run before or after the > copy-out loop to me.... > > -Dave.
On Mon, Jul 31, 2023 at 11:58:50AM +1000, Dave Chinner wrote: > On Thu, Jul 27, 2023 at 06:28:52PM +0200, Christian Brauner wrote: > > On Thu, Jul 27, 2023 at 05:17:30PM +0100, Pavel Begunkov wrote: > > > On 7/27/23 16:52, Christian Brauner wrote: > > > > On Thu, Jul 27, 2023 at 04:12:12PM +0100, Pavel Begunkov wrote: > > > > It would also solve it for writes which is what my kiocb_modified() > > > > comment was about. So right now you have: > > > > > > Great, I assumed there are stricter requirements for mtime not > > > transiently failing. > > > > But I mean then wouldn't this already be a problem today? > > kiocb_modified() can error out with EAGAIN today: > > > > ret = inode_needs_update_time(inode, &now); > > if (ret <= 0) > > return ret; > > if (flags & IOCB_NOWAIT) > > return -EAGAIN; > > > > return __file_update_time(file, &now, ret); > > > > the thing is that it doesn't matter for ->write_iter() - for xfs at > > least - because xfs does it as part of preparatory checks before > > actually doing any real work. The problem happens when you do actual > > work and afterwards call kiocb_modified(). That's why I think (2) is > > preferable. > > This has nothing to do with what "XFS does". It's actually an > IOCB_NOWAIT API design constraint. > > That is, IOCB_NOWAIT means "complete the whole operation without > blocking or return -EAGAIN having done nothing". If we have to do > something that might block (like a timestamp update) then we need to > punt the entire operation before anything has been modified. This > requires all the "do we need to modify this" checks to be done up > front before we start modifying anything. > > So while it looks like this might be "an XFS thing", that's because > XFS tends to be the first filesystem that most io_uring NOWAIT > functionality is implemented on. IOWs, what you see is XFS is doing > things the way IOCB_NOWAIT requires to be done. i.e. it's a > demonstration of how nonblocking filesystem modification operations > need to be run, not an "XFS thing"... Yes, I'm aware. I was trying to pay xfs a compliment for that but somehow that didn't come through. > > > > > I would prefer 2) which seems cleaner to me. But I might miss why this > > > > won't work. So input needed/wanted. > > > > > > Maybe I didn't fully grasp the (2) idea > > > > > > 2.1: all read_iter, write_iter, etc. callbacks should do file_accessed() > > > before doing IO, which sounds like a good option if everyone agrees with > > > that. Taking a look at direct block io, it's already like this. > > > > Yes, that's what I'm talking about. I'm asking whether that's ok for xfs > > maintainers basically. i_op->write_iter() already works like that since > > the dawn of time but i_op->read_iter doesn't and I'm proposing to make > > it work like that and wondering if there's any issues I'm unaware of. > > XFS already calls file_accessed() in the DIO read path before the > read gets issued. I don't see any problem with lifting it to before > the copy-out loop in filemap_read() because it is run regardless of > whether any data is read or any error occurred. Hence it just > doesn't look like it matters if it is run before or after the > copy-out loop to me.... Great.
On Mon, Jul 31, 2023 at 03:34:48PM +0800, Hao Xu wrote: > On 7/31/23 09:58, Dave Chinner wrote: > > On Thu, Jul 27, 2023 at 06:28:52PM +0200, Christian Brauner wrote: > > > On Thu, Jul 27, 2023 at 05:17:30PM +0100, Pavel Begunkov wrote: > > > > On 7/27/23 16:52, Christian Brauner wrote: > > > > > On Thu, Jul 27, 2023 at 04:12:12PM +0100, Pavel Begunkov wrote: > > > > > It would also solve it for writes which is what my kiocb_modified() > > > > > comment was about. So right now you have: > > > > > > > > Great, I assumed there are stricter requirements for mtime not > > > > transiently failing. > > > > > > But I mean then wouldn't this already be a problem today? > > > kiocb_modified() can error out with EAGAIN today: > > > > > > ret = inode_needs_update_time(inode, &now); > > > if (ret <= 0) > > > return ret; > > > if (flags & IOCB_NOWAIT) > > > return -EAGAIN; > > > > > > return __file_update_time(file, &now, ret); > > > > > > the thing is that it doesn't matter for ->write_iter() - for xfs at > > > least - because xfs does it as part of preparatory checks before > > > actually doing any real work. The problem happens when you do actual > > > work and afterwards call kiocb_modified(). That's why I think (2) is > > > preferable. > > > > This has nothing to do with what "XFS does". It's actually an > > IOCB_NOWAIT API design constraint. > > > > That is, IOCB_NOWAIT means "complete the whole operation without > > blocking or return -EAGAIN having done nothing". If we have to do > > something that might block (like a timestamp update) then we need to > > punt the entire operation before anything has been modified. This > > requires all the "do we need to modify this" checks to be done up > > front before we start modifying anything. > > > > So while it looks like this might be "an XFS thing", that's because > > XFS tends to be the first filesystem that most io_uring NOWAIT > > functionality is implemented on. IOWs, what you see is XFS is doing > > things the way IOCB_NOWAIT requires to be done. i.e. it's a > > demonstration of how nonblocking filesystem modification operations > > need to be run, not an "XFS thing"... > > > > > > > I would prefer 2) which seems cleaner to me. But I might miss why this > > > > > won't work. So input needed/wanted. > > > > > > > > Maybe I didn't fully grasp the (2) idea > > > > > > > > 2.1: all read_iter, write_iter, etc. callbacks should do file_accessed() > > > > before doing IO, which sounds like a good option if everyone agrees with > > > > that. Taking a look at direct block io, it's already like this. > > > > > > Yes, that's what I'm talking about. I'm asking whether that's ok for xfs > > > maintainers basically. i_op->write_iter() already works like that since > > > the dawn of time but i_op->read_iter doesn't and I'm proposing to make > > > it work like that and wondering if there's any issues I'm unaware of. > > > > XFS already calls file_accessed() in the DIO read path before the > > read gets issued. I don't see any problem with lifting it to before > > Hi Dave, > > Here I've a question, in DIO read path, if we update the time but > later somehow got errors before actual reading, e.g. return -EAGAIN > from the xfs_ilock_iocb(), shouldn't we revert the time update since > we actually doesn't read the file? We can lazily update the time but > on the contrary a false update sounds weird to me. You've read the file and you failed but you did access the file. We have the same logic kinda for readdir in the sense that we call file_accessed() no matter if ->iterate_shared() or ->iterate() succeeded or not. Plus, you should just be able to reorder the read checks to mirror the write checks. Afaict, the order of the write checks are: xfs_file_write_checks() -> xfs_ilock_iocb() -> kiocb_modified() so we can just do: + xfs_file_read_checks() +-> xfs_ilock_iocb() +-> file_accessed()/kiocb_accessed() for the read checks. Then we managed to acquire the necessary lock.
On Mon, Jul 31, 2023 at 11:33:05AM +1000, Dave Chinner wrote: > On Thu, Jul 27, 2023 at 04:27:30PM +0200, Christian Brauner wrote: > > On Thu, Jul 27, 2023 at 07:51:19PM +0800, Hao Xu wrote: > > > I actually saw this semaphore, and there is another xfs lock in > > > file_accessed > > > --> touch_atime > > > --> inode_update_time > > > --> inode->i_op->update_time == xfs_vn_update_time > > > > > > Forgot to point them out in the cover-letter..., I didn't modify them > > > since I'm not very sure about if we should do so, and I saw Stefan's > > > patchset didn't modify them too. > > > > > > My personnal thinking is we should apply trylock logic for this > > > inode->i_rwsem. For xfs lock in touch_atime, we should do that since it > > > doesn't make sense to rollback all the stuff while we are almost at the > > > end of getdents because of a lock. > > > > That manoeuvres around the problem. Which I'm slightly more sensitive > > too as this review is a rather expensive one. > > > > Plus, it seems fixable in at least two ways: > > > > For both we need to be able to tell the filesystem that a nowait atime > > update is requested. Simple thing seems to me to add a S_NOWAIT flag to > > file_time_flags and passing that via i_op->update_time() which already > > has a flag argument. That would likely also help kiocb_modified(). > > Wait - didn't we already fix this for mtime updates on IOCB_NOWAIT > modification operations? Yeah, we did: > > kiocb_modified(iocb) > file_modified_flags(iocb->ki_file, iocb->ki_flags) > .... > ret = inode_needs_update_time() > if (ret <= 0) > return ret; > if (flags & IOCB_NOWAIT) > return -EAGAIN; > <does timestamp update> > > > file_accessed() > > -> touch_atime() > > -> inode_update_time() > > -> i_op->update_time == xfs_vn_update_time() > > Yeah, so this needs the same treatment as file_modified_flags() - > touch_atime() needs a flag variant that passes IOCB_NOWAIT, and > after atime_needs_update() returns trues we should check IOCB_NOWAIT > and return EAGAIN if it is set. That will punt the operation that > needs to the update to a worker thread that can block.... As I tried to explain, I would prefer if we could inform the filesystem through i_op->update_time() itself that this is async and give the filesystem the ability to try and acquire the locks it needs and return EAGAIN from i_op->update_time() itself if it can't acquire them. > > > Then we have two options afaict: > > > > (1) best-effort atime update > > > > file_accessed() already has the builtin assumption that updating atime > > might fail for other reasons - see the comment in there. So it is > > somewhat best-effort already. > > > > (2) move atime update before calling into filesystem > > > > If we want to be sure that access time is updated when a readdir request > > is issued through io_uring then we need to have file_accessed() give a > > return value and expose a new helper for io_uring or modify > > vfs_getdents() to do something like: > > > > vfs_getdents() > > { > > if (nowait) > > down_read_trylock() > > > > if (!IS_DEADDIR(inode)) { > > ret = file_accessed(file); > > if (ret == -EAGAIN) > > goto out_unlock; > > > > f_op->iterate_shared() > > } > > } > > Yup, that's the sort of thing that needs to be done. > > But as I said in the "llseek for io-uring" thread, we need to stop > the game of whack-a-mole passing random nowait boolean flags to VFS > operations before it starts in earnest. We really need a common > context structure (like we have a kiocb for IO operations) that > holds per operation control state so we have consistency across all > the operations that we need different behaviours for. Yes, I tend to agree and thought about the same. But right now we don't have a lot of context. So I would lean towards a flag argument at most. But I also wouldn't consider it necessarily wrong to start with booleans or a flag first and in a couple of months if the need for more context arises we know what kind of struct we want or need.
On Mon, Jul 31, 2023 at 02:02:25AM +0800, Hao Xu wrote: > Hi Christian, > > On 7/27/23 22:27, Christian Brauner wrote: > > On Thu, Jul 27, 2023 at 07:51:19PM +0800, Hao Xu wrote: > > > On 7/26/23 23:00, Christian Brauner wrote: > > > > On Tue, Jul 18, 2023 at 09:21:10PM +0800, Hao Xu wrote: > > > > > From: Hao Xu <howeyxu@tencent.com> > > > > > > > > > > This add support for getdents64 to io_uring, acting exactly like the > > > > > syscall: the directory is iterated from it's current's position as > > > > > stored in the file struct, and the file's position is updated exactly as > > > > > if getdents64 had been called. > > > > > > > > > > For filesystems that support NOWAIT in iterate_shared(), try to use it > > > > > first; if a user already knows the filesystem they use do not support > > > > > nowait they can force async through IOSQE_ASYNC in the sqe flags, > > > > > avoiding the need to bounce back through a useless EAGAIN return. > > > > > > > > > > Co-developed-by: Dominique Martinet <asmadeus@codewreck.org> > > > > > Signed-off-by: Dominique Martinet <asmadeus@codewreck.org> > > > > > Signed-off-by: Hao Xu <howeyxu@tencent.com> > > > > > --- > > > > > include/uapi/linux/io_uring.h | 7 +++++ > > > > > io_uring/fs.c | 55 +++++++++++++++++++++++++++++++++++ > > > > > io_uring/fs.h | 3 ++ > > > > > io_uring/opdef.c | 8 +++++ > > > > > 4 files changed, 73 insertions(+) > > > > > > > > > > diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h > > > > > index 36f9c73082de..b200b2600622 100644 > > > > > --- a/include/uapi/linux/io_uring.h > > > > > +++ b/include/uapi/linux/io_uring.h > > > > > @@ -65,6 +65,7 @@ struct io_uring_sqe { > > > > > __u32 xattr_flags; > > > > > __u32 msg_ring_flags; > > > > > __u32 uring_cmd_flags; > > > > > + __u32 getdents_flags; > > > > > }; > > > > > __u64 user_data; /* data to be passed back at completion time */ > > > > > /* pack this to avoid bogus arm OABI complaints */ > > > > > @@ -235,6 +236,7 @@ enum io_uring_op { > > > > > IORING_OP_URING_CMD, > > > > > IORING_OP_SEND_ZC, > > > > > IORING_OP_SENDMSG_ZC, > > > > > + IORING_OP_GETDENTS, > > > > > /* this goes last, obviously */ > > > > > IORING_OP_LAST, > > > > > @@ -273,6 +275,11 @@ enum io_uring_op { > > > > > */ > > > > > #define SPLICE_F_FD_IN_FIXED (1U << 31) /* the last bit of __u32 */ > > > > > +/* > > > > > + * sqe->getdents_flags > > > > > + */ > > > > > +#define IORING_GETDENTS_REWIND (1U << 0) > > > > > + > > > > > /* > > > > > * POLL_ADD flags. Note that since sqe->poll_events is the flag space, the > > > > > * command flags for POLL_ADD are stored in sqe->len. > > > > > diff --git a/io_uring/fs.c b/io_uring/fs.c > > > > > index f6a69a549fd4..480f25677fed 100644 > > > > > --- a/io_uring/fs.c > > > > > +++ b/io_uring/fs.c > > > > > @@ -47,6 +47,13 @@ struct io_link { > > > > > int flags; > > > > > }; > > > > > +struct io_getdents { > > > > > + struct file *file; > > > > > + struct linux_dirent64 __user *dirent; > > > > > + unsigned int count; > > > > > + int flags; > > > > > +}; > > > > > + > > > > > int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) > > > > > { > > > > > struct io_rename *ren = io_kiocb_to_cmd(req, struct io_rename); > > > > > @@ -291,3 +298,51 @@ void io_link_cleanup(struct io_kiocb *req) > > > > > putname(sl->oldpath); > > > > > putname(sl->newpath); > > > > > } > > > > > + > > > > > +int io_getdents_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) > > > > > +{ > > > > > + struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents); > > > > > + > > > > > + if (READ_ONCE(sqe->off) != 0) > > > > > + return -EINVAL; > > > > > + > > > > > + gd->dirent = u64_to_user_ptr(READ_ONCE(sqe->addr)); > > > > > + gd->count = READ_ONCE(sqe->len); > > > > > + > > > > > + return 0; > > > > > +} > > > > > + > > > > > +int io_getdents(struct io_kiocb *req, unsigned int issue_flags) > > > > > +{ > > > > > + struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents); > > > > > + struct file *file = req->file; > > > > > + unsigned long getdents_flags = 0; > > > > > + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; > > > > > > > > Hm, I'm not sure what exactly the rules are for IO_URING_F_NONBLOCK. > > > > But to point this out: > > > > > > > > vfs_getdents() > > > > -> iterate_dir() > > > > { > > > > if (shared) > > > > res = down_read_killable(&inode->i_rwsem); > > > > else > > > > res = down_write_killable(&inode->i_rwsem); > > > > } > > > > > > > > which means you can still end up sleeping here before you go into a > > > > filesystem that does actually support non-waiting getdents. So if you > > > > have concurrent operations that grab inode lock (touch, mkdir etc) you > > > > can end up sleeping here. > > > > > > > > Is that intentional or an oversight? If the former can someone please > > > > explain the rules and why it's fine in this case? > > > > > > I actually saw this semaphore, and there is another xfs lock in > > > file_accessed > > > --> touch_atime > > > --> inode_update_time > > > --> inode->i_op->update_time == xfs_vn_update_time > > > > > > Forgot to point them out in the cover-letter..., I didn't modify them > > > since I'm not very sure about if we should do so, and I saw Stefan's > > > patchset didn't modify them too. > > > > > > My personnal thinking is we should apply trylock logic for this > > > inode->i_rwsem. For xfs lock in touch_atime, we should do that since it > > > doesn't make sense to rollback all the stuff while we are almost at the > > > end of getdents because of a lock. > > > > That manoeuvres around the problem. Which I'm slightly more sensitive > > too as this review is a rather expensive one. > > > > Plus, it seems fixable in at least two ways: > > > > For both we need to be able to tell the filesystem that a nowait atime > > update is requested. Simple thing seems to me to add a S_NOWAIT flag to > > file_time_flags and passing that via i_op->update_time() which already > > has a flag argument. That would likely also help kiocb_modified(). > > > > file_accessed() > > -> touch_atime() > > -> inode_update_time() > > -> i_op->update_time == xfs_vn_update_time() > > > > Then we have two options afaict: > > > > (1) best-effort atime update > > > > file_accessed() already has the builtin assumption that updating atime > > might fail for other reasons - see the comment in there. So it is > > somewhat best-effort already. > > > > (2) move atime update before calling into filesystem > > > > If we want to be sure that access time is updated when a readdir request > > is issued through io_uring then we need to have file_accessed() give a > > return value and expose a new helper for io_uring or modify > > vfs_getdents() to do something like: > > > > vfs_getdents() > > { > > if (nowait) > > down_read_trylock() > > > > if (!IS_DEADDIR(inode)) { > > ret = file_accessed(file); > > if (ret == -EAGAIN) > > goto out_unlock; > > > > f_op->iterate_shared() > > } > > } > > > > It's not unprecedented to do update atime before the actual operation > > has been done afaict. That's already the case in xfs_file_write_checks() > > which is called before anything is written. So that seems ok. > > I'm not familiar with this part(the time update), I guess we should > revert the updated time if we succeed to do file_accessed(file) but > fail somewhere later in f_op->iterate_shared()? Or is it definitely > counted as an "access" as long as we start to call getdents to a file? To answer that you can simply take a look at readdir rn res = -ENOENT; if (!IS_DEADDIR(inode)) { ctx->pos = file->f_pos; if (shared) res = file->f_op->iterate_shared(file, ctx); else res = file->f_op->iterate(file, ctx); file->f_pos = ctx->pos; fsnotify_access(file); file_accessed(file); } Also, I've said this before: touch_atime() is currently best effort. It may fail for any kind of reason.
On 7/31/23 16:18, Christian Brauner wrote: > On Mon, Jul 31, 2023 at 02:02:25AM +0800, Hao Xu wrote: >> Hi Christian, >> >> On 7/27/23 22:27, Christian Brauner wrote: >>> On Thu, Jul 27, 2023 at 07:51:19PM +0800, Hao Xu wrote: >>>> On 7/26/23 23:00, Christian Brauner wrote: >>>>> On Tue, Jul 18, 2023 at 09:21:10PM +0800, Hao Xu wrote: >>>>>> From: Hao Xu <howeyxu@tencent.com> >>>>>> >>>>>> This add support for getdents64 to io_uring, acting exactly like the >>>>>> syscall: the directory is iterated from it's current's position as >>>>>> stored in the file struct, and the file's position is updated exactly as >>>>>> if getdents64 had been called. >>>>>> >>>>>> For filesystems that support NOWAIT in iterate_shared(), try to use it >>>>>> first; if a user already knows the filesystem they use do not support >>>>>> nowait they can force async through IOSQE_ASYNC in the sqe flags, >>>>>> avoiding the need to bounce back through a useless EAGAIN return. >>>>>> >>>>>> Co-developed-by: Dominique Martinet <asmadeus@codewreck.org> >>>>>> Signed-off-by: Dominique Martinet <asmadeus@codewreck.org> >>>>>> Signed-off-by: Hao Xu <howeyxu@tencent.com> >>>>>> --- >>>>>> include/uapi/linux/io_uring.h | 7 +++++ >>>>>> io_uring/fs.c | 55 +++++++++++++++++++++++++++++++++++ >>>>>> io_uring/fs.h | 3 ++ >>>>>> io_uring/opdef.c | 8 +++++ >>>>>> 4 files changed, 73 insertions(+) >>>>>> >>>>>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h >>>>>> index 36f9c73082de..b200b2600622 100644 >>>>>> --- a/include/uapi/linux/io_uring.h >>>>>> +++ b/include/uapi/linux/io_uring.h >>>>>> @@ -65,6 +65,7 @@ struct io_uring_sqe { >>>>>> __u32 xattr_flags; >>>>>> __u32 msg_ring_flags; >>>>>> __u32 uring_cmd_flags; >>>>>> + __u32 getdents_flags; >>>>>> }; >>>>>> __u64 user_data; /* data to be passed back at completion time */ >>>>>> /* pack this to avoid bogus arm OABI complaints */ >>>>>> @@ -235,6 +236,7 @@ enum io_uring_op { >>>>>> IORING_OP_URING_CMD, >>>>>> IORING_OP_SEND_ZC, >>>>>> IORING_OP_SENDMSG_ZC, >>>>>> + IORING_OP_GETDENTS, >>>>>> /* this goes last, obviously */ >>>>>> IORING_OP_LAST, >>>>>> @@ -273,6 +275,11 @@ enum io_uring_op { >>>>>> */ >>>>>> #define SPLICE_F_FD_IN_FIXED (1U << 31) /* the last bit of __u32 */ >>>>>> +/* >>>>>> + * sqe->getdents_flags >>>>>> + */ >>>>>> +#define IORING_GETDENTS_REWIND (1U << 0) >>>>>> + >>>>>> /* >>>>>> * POLL_ADD flags. Note that since sqe->poll_events is the flag space, the >>>>>> * command flags for POLL_ADD are stored in sqe->len. >>>>>> diff --git a/io_uring/fs.c b/io_uring/fs.c >>>>>> index f6a69a549fd4..480f25677fed 100644 >>>>>> --- a/io_uring/fs.c >>>>>> +++ b/io_uring/fs.c >>>>>> @@ -47,6 +47,13 @@ struct io_link { >>>>>> int flags; >>>>>> }; >>>>>> +struct io_getdents { >>>>>> + struct file *file; >>>>>> + struct linux_dirent64 __user *dirent; >>>>>> + unsigned int count; >>>>>> + int flags; >>>>>> +}; >>>>>> + >>>>>> int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) >>>>>> { >>>>>> struct io_rename *ren = io_kiocb_to_cmd(req, struct io_rename); >>>>>> @@ -291,3 +298,51 @@ void io_link_cleanup(struct io_kiocb *req) >>>>>> putname(sl->oldpath); >>>>>> putname(sl->newpath); >>>>>> } >>>>>> + >>>>>> +int io_getdents_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) >>>>>> +{ >>>>>> + struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents); >>>>>> + >>>>>> + if (READ_ONCE(sqe->off) != 0) >>>>>> + return -EINVAL; >>>>>> + >>>>>> + gd->dirent = u64_to_user_ptr(READ_ONCE(sqe->addr)); >>>>>> + gd->count = READ_ONCE(sqe->len); >>>>>> + >>>>>> + return 0; >>>>>> +} >>>>>> + >>>>>> +int io_getdents(struct io_kiocb *req, unsigned int issue_flags) >>>>>> +{ >>>>>> + struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents); >>>>>> + struct file *file = req->file; >>>>>> + unsigned long getdents_flags = 0; >>>>>> + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; >>>>> Hm, I'm not sure what exactly the rules are for IO_URING_F_NONBLOCK. >>>>> But to point this out: >>>>> >>>>> vfs_getdents() >>>>> -> iterate_dir() >>>>> { >>>>> if (shared) >>>>> res = down_read_killable(&inode->i_rwsem); >>>>> else >>>>> res = down_write_killable(&inode->i_rwsem); >>>>> } >>>>> >>>>> which means you can still end up sleeping here before you go into a >>>>> filesystem that does actually support non-waiting getdents. So if you >>>>> have concurrent operations that grab inode lock (touch, mkdir etc) you >>>>> can end up sleeping here. >>>>> >>>>> Is that intentional or an oversight? If the former can someone please >>>>> explain the rules and why it's fine in this case? >>>> I actually saw this semaphore, and there is another xfs lock in >>>> file_accessed >>>> --> touch_atime >>>> --> inode_update_time >>>> --> inode->i_op->update_time == xfs_vn_update_time >>>> >>>> Forgot to point them out in the cover-letter..., I didn't modify them >>>> since I'm not very sure about if we should do so, and I saw Stefan's >>>> patchset didn't modify them too. >>>> >>>> My personnal thinking is we should apply trylock logic for this >>>> inode->i_rwsem. For xfs lock in touch_atime, we should do that since it >>>> doesn't make sense to rollback all the stuff while we are almost at the >>>> end of getdents because of a lock. >>> That manoeuvres around the problem. Which I'm slightly more sensitive >>> too as this review is a rather expensive one. >>> >>> Plus, it seems fixable in at least two ways: >>> >>> For both we need to be able to tell the filesystem that a nowait atime >>> update is requested. Simple thing seems to me to add a S_NOWAIT flag to >>> file_time_flags and passing that via i_op->update_time() which already >>> has a flag argument. That would likely also help kiocb_modified(). >>> >>> file_accessed() >>> -> touch_atime() >>> -> inode_update_time() >>> -> i_op->update_time == xfs_vn_update_time() >>> >>> Then we have two options afaict: >>> >>> (1) best-effort atime update >>> >>> file_accessed() already has the builtin assumption that updating atime >>> might fail for other reasons - see the comment in there. So it is >>> somewhat best-effort already. >>> >>> (2) move atime update before calling into filesystem >>> >>> If we want to be sure that access time is updated when a readdir request >>> is issued through io_uring then we need to have file_accessed() give a >>> return value and expose a new helper for io_uring or modify >>> vfs_getdents() to do something like: >>> >>> vfs_getdents() >>> { >>> if (nowait) >>> down_read_trylock() >>> >>> if (!IS_DEADDIR(inode)) { >>> ret = file_accessed(file); >>> if (ret == -EAGAIN) >>> goto out_unlock; >>> >>> f_op->iterate_shared() >>> } >>> } >>> >>> It's not unprecedented to do update atime before the actual operation >>> has been done afaict. That's already the case in xfs_file_write_checks() >>> which is called before anything is written. So that seems ok. >> I'm not familiar with this part(the time update), I guess we should >> revert the updated time if we succeed to do file_accessed(file) but >> fail somewhere later in f_op->iterate_shared()? Or is it definitely >> counted as an "access" as long as we start to call getdents to a file? > To answer that you can simply take a look at readdir rn > > res = -ENOENT; > if (!IS_DEADDIR(inode)) { > ctx->pos = file->f_pos; > if (shared) > res = file->f_op->iterate_shared(file, ctx); > else > res = file->f_op->iterate(file, ctx); > file->f_pos = ctx->pos; > fsnotify_access(file); > file_accessed(file); > } > > Also, I've said this before: touch_atime() is currently best effort. It > may fail for any kind of reason. Gotcha, I checked the code and I think you are right, time updates even the fop->iterate() quits at the very beginning. I'll move file_accessed(file) to before f_op->the iterate(). Thanks.
On Mon, Jul 31, 2023 at 10:13:21AM +0200, Christian Brauner wrote: > On Mon, Jul 31, 2023 at 11:33:05AM +1000, Dave Chinner wrote: > > On Thu, Jul 27, 2023 at 04:27:30PM +0200, Christian Brauner wrote: > > > On Thu, Jul 27, 2023 at 07:51:19PM +0800, Hao Xu wrote: > > > > I actually saw this semaphore, and there is another xfs lock in > > > > file_accessed > > > > --> touch_atime > > > > --> inode_update_time > > > > --> inode->i_op->update_time == xfs_vn_update_time > > > > > > > > Forgot to point them out in the cover-letter..., I didn't modify them > > > > since I'm not very sure about if we should do so, and I saw Stefan's > > > > patchset didn't modify them too. > > > > > > > > My personnal thinking is we should apply trylock logic for this > > > > inode->i_rwsem. For xfs lock in touch_atime, we should do that since it > > > > doesn't make sense to rollback all the stuff while we are almost at the > > > > end of getdents because of a lock. > > > > > > That manoeuvres around the problem. Which I'm slightly more sensitive > > > too as this review is a rather expensive one. > > > > > > Plus, it seems fixable in at least two ways: > > > > > > For both we need to be able to tell the filesystem that a nowait atime > > > update is requested. Simple thing seems to me to add a S_NOWAIT flag to > > > file_time_flags and passing that via i_op->update_time() which already > > > has a flag argument. That would likely also help kiocb_modified(). > > > > Wait - didn't we already fix this for mtime updates on IOCB_NOWAIT > > modification operations? Yeah, we did: > > > > kiocb_modified(iocb) > > file_modified_flags(iocb->ki_file, iocb->ki_flags) > > .... > > ret = inode_needs_update_time() > > if (ret <= 0) > > return ret; > > if (flags & IOCB_NOWAIT) > > return -EAGAIN; > > <does timestamp update> > > > > > file_accessed() > > > -> touch_atime() > > > -> inode_update_time() > > > -> i_op->update_time == xfs_vn_update_time() > > > > Yeah, so this needs the same treatment as file_modified_flags() - > > touch_atime() needs a flag variant that passes IOCB_NOWAIT, and > > after atime_needs_update() returns trues we should check IOCB_NOWAIT > > and return EAGAIN if it is set. That will punt the operation that > > needs to the update to a worker thread that can block.... > > As I tried to explain, I would prefer if we could inform the filesystem > through i_op->update_time() itself that this is async and give the > filesystem the ability to try and acquire the locks it needs and return > EAGAIN from i_op->update_time() itself if it can't acquire them. > > > > > > Then we have two options afaict: > > > > > > (1) best-effort atime update > > > > > > file_accessed() already has the builtin assumption that updating atime > > > might fail for other reasons - see the comment in there. So it is > > > somewhat best-effort already. > > > > > > (2) move atime update before calling into filesystem > > > > > > If we want to be sure that access time is updated when a readdir request > > > is issued through io_uring then we need to have file_accessed() give a > > > return value and expose a new helper for io_uring or modify > > > vfs_getdents() to do something like: > > > > > > vfs_getdents() > > > { > > > if (nowait) > > > down_read_trylock() > > > > > > if (!IS_DEADDIR(inode)) { > > > ret = file_accessed(file); > > > if (ret == -EAGAIN) > > > goto out_unlock; > > > > > > f_op->iterate_shared() > > > } > > > } > > > > Yup, that's the sort of thing that needs to be done. > > > > But as I said in the "llseek for io-uring" thread, we need to stop > > the game of whack-a-mole passing random nowait boolean flags to VFS > > operations before it starts in earnest. We really need a common > > context structure (like we have a kiocb for IO operations) that > > holds per operation control state so we have consistency across all > > the operations that we need different behaviours for. > > Yes, I tend to agree and thought about the same. But right now we don't > have a lot of context. So I would lean towards a flag argument at most. > > But I also wouldn't consider it necessarily wrong to start with booleans > or a flag first and in a couple of months if the need for more context > arises we know what kind of struct we want or need. I'm probably missing a ton of context (because at the end of the day I don't care all that much about NOWAIT and still have never installed liburing) but AFAICT the goal seems to be that for a given io request, uring tries to execute it with trylocks in the originating process context. If that attempt fails, it'll punt the io to a workqueue and rerun the request with blocking locks. Right? I've watched quite a bit of NOWAIT whackamole going on over the past few years (i_rwsem, the ILOCK, the IO layer, memory allocations...). IIRC these filesystem ios all have to run in process context, right? If so, why don't we capture the NOWAIT state in a PF flag? We already do that for NOFS/NOIO memory allocations to make sure that /all/ reclaim attempts cannot recurse into the fs/io stacks. "I prefer EAGAIN errors to this process blocking" doesn't seem all that much different. But, what do I know... --D
On Mon, Jul 31, 2023 at 08:26:23AM -0700, Darrick J. Wong wrote: > On Mon, Jul 31, 2023 at 10:13:21AM +0200, Christian Brauner wrote: > > On Mon, Jul 31, 2023 at 11:33:05AM +1000, Dave Chinner wrote: > > > On Thu, Jul 27, 2023 at 04:27:30PM +0200, Christian Brauner wrote: > > > But as I said in the "llseek for io-uring" thread, we need to stop > > > the game of whack-a-mole passing random nowait boolean flags to VFS > > > operations before it starts in earnest. We really need a common > > > context structure (like we have a kiocb for IO operations) that > > > holds per operation control state so we have consistency across all > > > the operations that we need different behaviours for. > > > > Yes, I tend to agree and thought about the same. But right now we don't > > have a lot of context. So I would lean towards a flag argument at most. > > > > But I also wouldn't consider it necessarily wrong to start with booleans > > or a flag first and in a couple of months if the need for more context > > arises we know what kind of struct we want or need. > > I'm probably missing a ton of context (because at the end of the day I > don't care all that much about NOWAIT and still have never installed > liburing) but AFAICT the goal seems to be that for a given io request, > uring tries to execute it with trylocks in the originating process > context. If that attempt fails, it'll punt the io to a workqueue and > rerun the request with blocking locks. Right? Yes, that might be the case for the VFS level code we are talking about right now.... ... but, for example, I have no clue what task context nvmet_file_execute_rw() runs in but it definitely issues file IO with IOCB_NOWAIT... > I've watched quite a bit of NOWAIT whackamole going on over the past few > years (i_rwsem, the ILOCK, the IO layer, memory allocations...). IIRC > these filesystem ios all have to run in process context, right? If so, > why don't we capture the NOWAIT state in a PF flag? We already do that > for NOFS/NOIO memory allocations to make sure that /all/ reclaim > attempts cannot recurse into the fs/io stacks. Interesting idea. That would mean the high level code would have to toggle the task flags before calling into the VFS, which means we'd have to capture RWF_NOWAIT flags at the syscall level rather than propagating them into the iocb. That may impact speed racers, because the RWF flag propagation has been a target of significant micro-optimisation since io_uring came along.... > "I prefer EAGAIN errors to this process blocking" doesn't seem all that > much different. But, what do I know... Yeah, I can see how that might be advantageous from an API persepective, though my gut says "that's a can of worms" but I haven't spent enough time thinking about it to work out why I feel that way. -Dave.
On 7/31/23 9:26?AM, Darrick J. Wong wrote: > I've watched quite a bit of NOWAIT whackamole going on over the past few > years (i_rwsem, the ILOCK, the IO layer, memory allocations...). IIRC > these filesystem ios all have to run in process context, right? If so, > why don't we capture the NOWAIT state in a PF flag? We already do that > for NOFS/NOIO memory allocations to make sure that /all/ reclaim > attempts cannot recurse into the fs/io stacks. I would greatly prefer passing down the context rather than capitulating and adding a task_struct flag for this. I think it _kind of_ makes sense for things like allocations, as you cannot easily track that all the way down, but it's a really ugly solution. It certainly creates more churn passing it down, but it also reveals the parts that need to check it. WHen new code is added, it's much more likely you'll spot the fact that there's passed in context. For allocation, you end up in the allocator anyway, which can augment the gfp mask with whatever is set in the task. The same is not true for locking and other bits, as they don't return a value to begin with. When we know they are sane, we can flag the fs as supporting it (like we've done for async buffered reads, for example). It's also not an absolute thing, like memory allocations are. It's perfectly fine to grab a mutex under NOWAIT issue. What you should not do is grab a mutex that someone else can grab while waiting on IO. This kind of extra context is only available in the code in question, not generically for eg mutex locking. I'm not a huge fan of the "let's add a bool nowait". Most/all use cases pass down state anyway, putting it in a suitable type struct seems much cleaner to me than the out-of-band approach of just adding a current->please_nowait.
On Mon, Jul 31, 2023 at 06:28:02PM -0600, Jens Axboe wrote: > It's also not an absolute thing, like memory allocations are. It's > perfectly fine to grab a mutex under NOWAIT issue. What you should not > do is grab a mutex that someone else can grab while waiting on IO. This > kind of extra context is only available in the code in question, not > generically for eg mutex locking. Is that information documented somewhere? I didn't know that was the rule, and I wouldn't be surprised if that's news to several of the other people on this thread.
On 7/31/23 6:47?PM, Matthew Wilcox wrote: > On Mon, Jul 31, 2023 at 06:28:02PM -0600, Jens Axboe wrote: >> It's also not an absolute thing, like memory allocations are. It's >> perfectly fine to grab a mutex under NOWAIT issue. What you should not >> do is grab a mutex that someone else can grab while waiting on IO. This >> kind of extra context is only available in the code in question, not >> generically for eg mutex locking. > > Is that information documented somewhere? I didn't know that was the > rule, and I wouldn't be surprised if that's news to several of the other > people on this thread. I've mentioned it in various threads, but would probably be good to write down somewhere if that actually makes more people see it. I'm dubious, but since it applies to NOWAIT in general, would be a good thing to do regardless. And then at least I could point to that rather than have to write up a long description every time.
On Mon, Jul 31, 2023 at 06:49:45PM -0600, Jens Axboe wrote: > On 7/31/23 6:47?PM, Matthew Wilcox wrote: > > On Mon, Jul 31, 2023 at 06:28:02PM -0600, Jens Axboe wrote: > >> It's also not an absolute thing, like memory allocations are. It's > >> perfectly fine to grab a mutex under NOWAIT issue. What you should not > >> do is grab a mutex that someone else can grab while waiting on IO. This > >> kind of extra context is only available in the code in question, not > >> generically for eg mutex locking. > > > > Is that information documented somewhere? I didn't know that was the > > rule, and I wouldn't be surprised if that's news to several of the other > > people on this thread. > > I've mentioned it in various threads, but would probably be good to > write down somewhere if that actually makes more people see it. I'm > dubious, but since it applies to NOWAIT in general, would be a good > thing to do regardless. And then at least I could point to that rather > than have to write up a long description every time. Would be good to include whether it's OK to wait on a mutex that's held by another thread that's allocating memory without __GFP_NOFS (since obviously that can block on I/O)
On Mon, Jul 31, 2023 at 06:49:45PM -0600, Jens Axboe wrote: > On 7/31/23 6:47?PM, Matthew Wilcox wrote: > > On Mon, Jul 31, 2023 at 06:28:02PM -0600, Jens Axboe wrote: > >> It's also not an absolute thing, like memory allocations are. It's > >> perfectly fine to grab a mutex under NOWAIT issue. What you should not > >> do is grab a mutex that someone else can grab while waiting on IO. This > >> kind of extra context is only available in the code in question, not > >> generically for eg mutex locking. > > > > Is that information documented somewhere? I didn't know that was the > > rule, and I wouldn't be surprised if that's news to several of the other > > people on this thread. > > I've mentioned it in various threads, but would probably be good to > write down somewhere if that actually makes more people see it. I'm > dubious, but since it applies to NOWAIT in general, would be a good > thing to do regardless. And then at least I could point to that rather > than have to write up a long description every time. I've only been aware of this because we talked about it somewhere else. So adding it as documentation would be great.
On Tue, Aug 01, 2023 at 02:01:59AM +0100, Matthew Wilcox wrote: > On Mon, Jul 31, 2023 at 06:49:45PM -0600, Jens Axboe wrote: > > On 7/31/23 6:47?PM, Matthew Wilcox wrote: > > > On Mon, Jul 31, 2023 at 06:28:02PM -0600, Jens Axboe wrote: > > >> It's also not an absolute thing, like memory allocations are. It's > > >> perfectly fine to grab a mutex under NOWAIT issue. What you should not > > >> do is grab a mutex that someone else can grab while waiting on IO. This > > >> kind of extra context is only available in the code in question, not > > >> generically for eg mutex locking. > > > > > > Is that information documented somewhere? I didn't know that was the > > > rule, and I wouldn't be surprised if that's news to several of the other > > > people on this thread. > > > > I've mentioned it in various threads, but would probably be good to > > write down somewhere if that actually makes more people see it. I'm > > dubious, but since it applies to NOWAIT in general, would be a good > > thing to do regardless. And then at least I could point to that rather > > than have to write up a long description every time. > > Would be good to include whether it's OK to wait on a mutex that's held > by another thread that's allocating memory without __GFP_NOFS (since > obviously that can block on I/O) And adding a few illustrative examples would be helpful.
On Mon, Jul 31, 2023 at 06:28:02PM -0600, Jens Axboe wrote: > On 7/31/23 9:26?AM, Darrick J. Wong wrote: > > I've watched quite a bit of NOWAIT whackamole going on over the past few > > years (i_rwsem, the ILOCK, the IO layer, memory allocations...). IIRC > > these filesystem ios all have to run in process context, right? If so, > > why don't we capture the NOWAIT state in a PF flag? We already do that > > for NOFS/NOIO memory allocations to make sure that /all/ reclaim > > attempts cannot recurse into the fs/io stacks. > > I would greatly prefer passing down the context rather than capitulating > and adding a task_struct flag for this. I think it _kind of_ makes sense > for things like allocations, as you cannot easily track that all the way > down, but it's a really ugly solution. It certainly creates more churn > passing it down, but it also reveals the parts that need to check it. > WHen new code is added, it's much more likely you'll spot the fact that > there's passed in context. For allocation, you end up in the allocator > anyway, which can augment the gfp mask with whatever is set in the task. > The same is not true for locking and other bits, as they don't return a > value to begin with. When we know they are sane, we can flag the fs as > supporting it (like we've done for async buffered reads, for example). > > It's also not an absolute thing, like memory allocations are. It's > perfectly fine to grab a mutex under NOWAIT issue. What you should not > do is grab a mutex that someone else can grab while waiting on IO. This > kind of extra context is only available in the code in question, not > generically for eg mutex locking. > > I'm not a huge fan of the "let's add a bool nowait". Most/all use cases > pass down state anyway, putting it in a suitable type struct seems much We're only going to pass a struct if there really is a need for one though. Meaning, we're shouldn't end up passing a struct with a single element around in the hopes that we'll add more members at some point. > cleaner to me than the out-of-band approach of just adding a > current->please_nowait. I'm not convinced that abusing current/task_struct for this is sane. I not just very much doubt this will go down well with reviewers outside of fs/ we'd also rightly be told that we're punting on a design problem because it would be more churn.
On 7/31/23 16:13, Christian Brauner wrote: > On Mon, Jul 31, 2023 at 11:33:05AM +1000, Dave Chinner wrote: >> On Thu, Jul 27, 2023 at 04:27:30PM +0200, Christian Brauner wrote: >>> On Thu, Jul 27, 2023 at 07:51:19PM +0800, Hao Xu wrote: >>>> I actually saw this semaphore, and there is another xfs lock in >>>> file_accessed >>>> --> touch_atime >>>> --> inode_update_time >>>> --> inode->i_op->update_time == xfs_vn_update_time >>>> >>>> Forgot to point them out in the cover-letter..., I didn't modify them >>>> since I'm not very sure about if we should do so, and I saw Stefan's >>>> patchset didn't modify them too. >>>> >>>> My personnal thinking is we should apply trylock logic for this >>>> inode->i_rwsem. For xfs lock in touch_atime, we should do that since it >>>> doesn't make sense to rollback all the stuff while we are almost at the >>>> end of getdents because of a lock. >>> >>> That manoeuvres around the problem. Which I'm slightly more sensitive >>> too as this review is a rather expensive one. >>> >>> Plus, it seems fixable in at least two ways: >>> >>> For both we need to be able to tell the filesystem that a nowait atime >>> update is requested. Simple thing seems to me to add a S_NOWAIT flag to >>> file_time_flags and passing that via i_op->update_time() which already >>> has a flag argument. That would likely also help kiocb_modified(). >> >> Wait - didn't we already fix this for mtime updates on IOCB_NOWAIT >> modification operations? Yeah, we did: >> >> kiocb_modified(iocb) >> file_modified_flags(iocb->ki_file, iocb->ki_flags) >> .... >> ret = inode_needs_update_time() >> if (ret <= 0) >> return ret; >> if (flags & IOCB_NOWAIT) >> return -EAGAIN; >> <does timestamp update> >> >>> file_accessed() >>> -> touch_atime() >>> -> inode_update_time() >>> -> i_op->update_time == xfs_vn_update_time() >> >> Yeah, so this needs the same treatment as file_modified_flags() - >> touch_atime() needs a flag variant that passes IOCB_NOWAIT, and >> after atime_needs_update() returns trues we should check IOCB_NOWAIT >> and return EAGAIN if it is set. That will punt the operation that >> needs to the update to a worker thread that can block.... > > As I tried to explain, I would prefer if we could inform the filesystem > through i_op->update_time() itself that this is async and give the > filesystem the ability to try and acquire the locks it needs and return > EAGAIN from i_op->update_time() itself if it can't acquire them. I browse code in i_op->update_time = xfs_vn_update_time, it's mainly about xfs journal code. It creates a transaction and adds a item to it, not familiar with this part, from a quick look I found some locks and sleep point in it to modify. I think I need some time to sort out this part. Or maybe we can do it like what Dave said as a short term solution and change the block points in journal code later as a separate patchset, those journal code I believe are common code for xfs IO operations. I'm ok with both though. > >> >>> Then we have two options afaict: >>> >>> (1) best-effort atime update >>> >>> file_accessed() already has the builtin assumption that updating atime >>> might fail for other reasons - see the comment in there. So it is >>> somewhat best-effort already. >>> >>> (2) move atime update before calling into filesystem >>> >>> If we want to be sure that access time is updated when a readdir request >>> is issued through io_uring then we need to have file_accessed() give a >>> return value and expose a new helper for io_uring or modify >>> vfs_getdents() to do something like: >>> >>> vfs_getdents() >>> { >>> if (nowait) >>> down_read_trylock() >>> >>> if (!IS_DEADDIR(inode)) { >>> ret = file_accessed(file); >>> if (ret == -EAGAIN) >>> goto out_unlock; >>> >>> f_op->iterate_shared() >>> } >>> } >> >> Yup, that's the sort of thing that needs to be done. >> >> But as I said in the "llseek for io-uring" thread, we need to stop >> the game of whack-a-mole passing random nowait boolean flags to VFS >> operations before it starts in earnest. We really need a common >> context structure (like we have a kiocb for IO operations) that >> holds per operation control state so we have consistency across all >> the operations that we need different behaviours for. > > Yes, I tend to agree and thought about the same. But right now we don't > have a lot of context. So I would lean towards a flag argument at most. > > But I also wouldn't consider it necessarily wrong to start with booleans > or a flag first and in a couple of months if the need for more context > arises we know what kind of struct we want or need.
On 8/1/23 08:28, Jens Axboe wrote: > On 7/31/23 9:26?AM, Darrick J. Wong wrote: >> I've watched quite a bit of NOWAIT whackamole going on over the past few >> years (i_rwsem, the ILOCK, the IO layer, memory allocations...). IIRC >> these filesystem ios all have to run in process context, right? If so, >> why don't we capture the NOWAIT state in a PF flag? We already do that >> for NOFS/NOIO memory allocations to make sure that /all/ reclaim >> attempts cannot recurse into the fs/io stacks. > > I would greatly prefer passing down the context rather than capitulating > and adding a task_struct flag for this. I think it _kind of_ makes sense > for things like allocations, as you cannot easily track that all the way > down, but it's a really ugly solution. It certainly creates more churn > passing it down, but it also reveals the parts that need to check it. > WHen new code is added, it's much more likely you'll spot the fact that > there's passed in context. For allocation, you end up in the allocator > anyway, which can augment the gfp mask with whatever is set in the task. > The same is not true for locking and other bits, as they don't return a > value to begin with. When we know they are sane, we can flag the fs as > supporting it (like we've done for async buffered reads, for example). > > It's also not an absolute thing, like memory allocations are. It's > perfectly fine to grab a mutex under NOWAIT issue. What you should not Hi Jens, To make sure, I'd like to ask, for memory allocation, GFP_NOIO semantics is all we need in NOWAIT issue, GFP_NOWAIT is not necessary, do I understand it right? Thanks, Hao > do is grab a mutex that someone else can grab while waiting on IO. This > kind of extra context is only available in the code in question, not > generically for eg mutex locking. > > I'm not a huge fan of the "let's add a bool nowait". Most/all use cases > pass down state anyway, putting it in a suitable type struct seems much > cleaner to me than the out-of-band approach of just adding a > current->please_nowait. >
On 8/8/23 12:34, Hao Xu wrote: > On 8/1/23 08:28, Jens Axboe wrote: >> On 7/31/23 9:26?AM, Darrick J. Wong wrote: >>> I've watched quite a bit of NOWAIT whackamole going on over the past >>> few >>> years (i_rwsem, the ILOCK, the IO layer, memory allocations...). IIRC >>> these filesystem ios all have to run in process context, right? If so, >>> why don't we capture the NOWAIT state in a PF flag? We already do that >>> for NOFS/NOIO memory allocations to make sure that /all/ reclaim >>> attempts cannot recurse into the fs/io stacks. >> >> I would greatly prefer passing down the context rather than capitulating >> and adding a task_struct flag for this. I think it _kind of_ makes sense >> for things like allocations, as you cannot easily track that all the way >> down, but it's a really ugly solution. It certainly creates more churn >> passing it down, but it also reveals the parts that need to check it. >> WHen new code is added, it's much more likely you'll spot the fact that >> there's passed in context. For allocation, you end up in the allocator >> anyway, which can augment the gfp mask with whatever is set in the task. >> The same is not true for locking and other bits, as they don't return a >> value to begin with. When we know they are sane, we can flag the fs as >> supporting it (like we've done for async buffered reads, for example). >> >> It's also not an absolute thing, like memory allocations are. It's >> perfectly fine to grab a mutex under NOWAIT issue. What you should not > > Hi Jens, > To make sure, I'd like to ask, for memory allocation, GFP_NOIO semantics > is all we need in NOWAIT issue, GFP_NOWAIT is not necessary, do I > understand it right? > > Thanks, > Hao Trying to find a lock in mem allocation process that GFP_NOIO holds it while other normal GFP_* like GFP_KERNEL also holds it and does IO. Failed to find one such lock. So I guess though GFP_NOIO may cause sleep but won't wait on IO.
On 8/1/23 08:28, Jens Axboe wrote: > On 7/31/23 9:26?AM, Darrick J. Wong wrote: >> I've watched quite a bit of NOWAIT whackamole going on over the past few >> years (i_rwsem, the ILOCK, the IO layer, memory allocations...). IIRC >> these filesystem ios all have to run in process context, right? If so, >> why don't we capture the NOWAIT state in a PF flag? We already do that >> for NOFS/NOIO memory allocations to make sure that /all/ reclaim >> attempts cannot recurse into the fs/io stacks. > > I would greatly prefer passing down the context rather than capitulating > and adding a task_struct flag for this. I think it _kind of_ makes sense > for things like allocations, as you cannot easily track that all the way > down, but it's a really ugly solution. It certainly creates more churn > passing it down, but it also reveals the parts that need to check it. > WHen new code is added, it's much more likely you'll spot the fact that > there's passed in context. For allocation, you end up in the allocator > anyway, which can augment the gfp mask with whatever is set in the task. > The same is not true for locking and other bits, as they don't return a > value to begin with. When we know they are sane, we can flag the fs as > supporting it (like we've done for async buffered reads, for example). > > It's also not an absolute thing, like memory allocations are. It's > perfectly fine to grab a mutex under NOWAIT issue. What you should not Hi Jens, To make sure, I'd like to ask, for memory allocation, GFP_NOIO semantics is all we need in NOWAIT issue, GFP_NOWAIT is not necessary, do I understand it right? Thanks, Hao > do is grab a mutex that someone else can grab while waiting on IO. This > kind of extra context is only available in the code in question, not > generically for eg mutex locking. > > I'm not a huge fan of the "let's add a bool nowait". Most/all use cases > pass down state anyway, putting it in a suitable type struct seems much > cleaner to me than the out-of-band approach of just adding a > current->please_nowait. >
On 8/8/23 3:33?AM, Hao Xu wrote: > On 8/1/23 08:28, Jens Axboe wrote: >> On 7/31/23 9:26?AM, Darrick J. Wong wrote: >>> I've watched quite a bit of NOWAIT whackamole going on over the past few >>> years (i_rwsem, the ILOCK, the IO layer, memory allocations...). IIRC >>> these filesystem ios all have to run in process context, right? If so, >>> why don't we capture the NOWAIT state in a PF flag? We already do that >>> for NOFS/NOIO memory allocations to make sure that /all/ reclaim >>> attempts cannot recurse into the fs/io stacks. >> >> I would greatly prefer passing down the context rather than capitulating >> and adding a task_struct flag for this. I think it _kind of_ makes sense >> for things like allocations, as you cannot easily track that all the way >> down, but it's a really ugly solution. It certainly creates more churn >> passing it down, but it also reveals the parts that need to check it. >> WHen new code is added, it's much more likely you'll spot the fact that >> there's passed in context. For allocation, you end up in the allocator >> anyway, which can augment the gfp mask with whatever is set in the task. >> The same is not true for locking and other bits, as they don't return a >> value to begin with. When we know they are sane, we can flag the fs as >> supporting it (like we've done for async buffered reads, for example). >> >> It's also not an absolute thing, like memory allocations are. It's >> perfectly fine to grab a mutex under NOWAIT issue. What you should not > > Hi Jens, > To make sure, I'd like to ask, for memory allocation, GFP_NOIO semantics > is all we need in NOWAIT issue, GFP_NOWAIT is not necessary, do I > understand it right? Yep, GFP_NOIO should be just fine.
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 36f9c73082de..b200b2600622 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -65,6 +65,7 @@ struct io_uring_sqe { __u32 xattr_flags; __u32 msg_ring_flags; __u32 uring_cmd_flags; + __u32 getdents_flags; }; __u64 user_data; /* data to be passed back at completion time */ /* pack this to avoid bogus arm OABI complaints */ @@ -235,6 +236,7 @@ enum io_uring_op { IORING_OP_URING_CMD, IORING_OP_SEND_ZC, IORING_OP_SENDMSG_ZC, + IORING_OP_GETDENTS, /* this goes last, obviously */ IORING_OP_LAST, @@ -273,6 +275,11 @@ enum io_uring_op { */ #define SPLICE_F_FD_IN_FIXED (1U << 31) /* the last bit of __u32 */ +/* + * sqe->getdents_flags + */ +#define IORING_GETDENTS_REWIND (1U << 0) + /* * POLL_ADD flags. Note that since sqe->poll_events is the flag space, the * command flags for POLL_ADD are stored in sqe->len. diff --git a/io_uring/fs.c b/io_uring/fs.c index f6a69a549fd4..480f25677fed 100644 --- a/io_uring/fs.c +++ b/io_uring/fs.c @@ -47,6 +47,13 @@ struct io_link { int flags; }; +struct io_getdents { + struct file *file; + struct linux_dirent64 __user *dirent; + unsigned int count; + int flags; +}; + int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_rename *ren = io_kiocb_to_cmd(req, struct io_rename); @@ -291,3 +298,51 @@ void io_link_cleanup(struct io_kiocb *req) putname(sl->oldpath); putname(sl->newpath); } + +int io_getdents_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents); + + if (READ_ONCE(sqe->off) != 0) + return -EINVAL; + + gd->dirent = u64_to_user_ptr(READ_ONCE(sqe->addr)); + gd->count = READ_ONCE(sqe->len); + + return 0; +} + +int io_getdents(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents); + struct file *file = req->file; + unsigned long getdents_flags = 0; + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + bool should_lock = file->f_mode & FMODE_ATOMIC_POS; + int ret; + + if (force_nonblock) { + if (!(req->file->f_mode & FMODE_NOWAIT)) + return -EAGAIN; + + getdents_flags = DIR_CONTEXT_F_NOWAIT; + } + + if (should_lock) { + if (!force_nonblock) + mutex_lock(&file->f_pos_lock); + else if (!mutex_trylock(&file->f_pos_lock)) + return -EAGAIN; + } + + ret = vfs_getdents(file, gd->dirent, gd->count, getdents_flags); + if (should_lock) + mutex_unlock(&file->f_pos_lock); + + if (ret == -EAGAIN && force_nonblock) + return -EAGAIN; + + io_req_set_res(req, ret, 0); + return 0; +} + diff --git a/io_uring/fs.h b/io_uring/fs.h index 0bb5efe3d6bb..f83a6f3a678d 100644 --- a/io_uring/fs.h +++ b/io_uring/fs.h @@ -18,3 +18,6 @@ int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags); int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_linkat(struct io_kiocb *req, unsigned int issue_flags); void io_link_cleanup(struct io_kiocb *req); + +int io_getdents_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_getdents(struct io_kiocb *req, unsigned int issue_flags); diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 3b9c6489b8b6..1bae6b2a8d0b 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -428,6 +428,11 @@ const struct io_issue_def io_issue_defs[] = { .prep = io_eopnotsupp_prep, #endif }, + [IORING_OP_GETDENTS] = { + .needs_file = 1, + .prep = io_getdents_prep, + .issue = io_getdents, + }, }; @@ -648,6 +653,9 @@ const struct io_cold_def io_cold_defs[] = { .fail = io_sendrecv_fail, #endif }, + [IORING_OP_GETDENTS] = { + .name = "GETDENTS", + }, }; const char *io_uring_get_opcode(u8 opcode)