diff mbox series

[3/5] io_uring: add support for getdents

Message ID 20230718132112.461218-4-hao.xu@linux.dev (mailing list archive)
State New
Headers show
Series io_uring getdents | expand

Commit Message

Hao Xu July 18, 2023, 1:21 p.m. UTC
From: Hao Xu <howeyxu@tencent.com>

This add support for getdents64 to io_uring, acting exactly like the
syscall: the directory is iterated from it's current's position as
stored in the file struct, and the file's position is updated exactly as
if getdents64 had been called.

For filesystems that support NOWAIT in iterate_shared(), try to use it
first; if a user already knows the filesystem they use do not support
nowait they can force async through IOSQE_ASYNC in the sqe flags,
avoiding the need to bounce back through a useless EAGAIN return.

Co-developed-by: Dominique Martinet <asmadeus@codewreck.org>
Signed-off-by: Dominique Martinet <asmadeus@codewreck.org>
Signed-off-by: Hao Xu <howeyxu@tencent.com>
---
 include/uapi/linux/io_uring.h |  7 +++++
 io_uring/fs.c                 | 55 +++++++++++++++++++++++++++++++++++
 io_uring/fs.h                 |  3 ++
 io_uring/opdef.c              |  8 +++++
 4 files changed, 73 insertions(+)

Comments

Hao Xu July 19, 2023, 8:56 a.m. UTC | #1
On 7/18/23 21:21, Hao Xu wrote:
> From: Hao Xu <howeyxu@tencent.com>
> 
> This add support for getdents64 to io_uring, acting exactly like the
> syscall: the directory is iterated from it's current's position as
> stored in the file struct, and the file's position is updated exactly as
> if getdents64 had been called.
> 
> For filesystems that support NOWAIT in iterate_shared(), try to use it
> first; if a user already knows the filesystem they use do not support
> nowait they can force async through IOSQE_ASYNC in the sqe flags,
> avoiding the need to bounce back through a useless EAGAIN return.
> 
> Co-developed-by: Dominique Martinet <asmadeus@codewreck.org>
> Signed-off-by: Dominique Martinet <asmadeus@codewreck.org>
> Signed-off-by: Hao Xu <howeyxu@tencent.com>
> ---
>   include/uapi/linux/io_uring.h |  7 +++++
>   io_uring/fs.c                 | 55 +++++++++++++++++++++++++++++++++++
>   io_uring/fs.h                 |  3 ++
>   io_uring/opdef.c              |  8 +++++
>   4 files changed, 73 insertions(+)
> 
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index 36f9c73082de..b200b2600622 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -65,6 +65,7 @@ struct io_uring_sqe {
>   		__u32		xattr_flags;
>   		__u32		msg_ring_flags;
>   		__u32		uring_cmd_flags;
> +		__u32		getdents_flags;

Looks this is not needed anymore, I'll remove this in next version.

>   	};
>   	__u64	user_data;	/* data to be passed back at completion time */
>   	/* pack this to avoid bogus arm OABI complaints */
> @@ -235,6 +236,7 @@ enum io_uring_op {
>   	IORING_OP_URING_CMD,
>   	IORING_OP_SEND_ZC,
>   	IORING_OP_SENDMSG_ZC,
> +	IORING_OP_GETDENTS,
>   
>   	/* this goes last, obviously */
>   	IORING_OP_LAST,
> @@ -273,6 +275,11 @@ enum io_uring_op {
>    */
>   #define SPLICE_F_FD_IN_FIXED	(1U << 31) /* the last bit of __u32 */
>   
> +/*
> + * sqe->getdents_flags
> + */
> +#define IORING_GETDENTS_REWIND	(1U << 0)

ditto

> +
>   /*
>    * POLL_ADD flags. Note that since sqe->poll_events is the flag space, the
>    * command flags for POLL_ADD are stored in sqe->len.
> diff --git a/io_uring/fs.c b/io_uring/fs.c
> index f6a69a549fd4..480f25677fed 100644
> --- a/io_uring/fs.c
> +++ b/io_uring/fs.c
> @@ -47,6 +47,13 @@ struct io_link {
>   	int				flags;
>   };
>   
> +struct io_getdents {
> +	struct file			*file;
> +	struct linux_dirent64 __user	*dirent;
> +	unsigned int			count;
> +	int				flags;

ditto
Christian Brauner July 26, 2023, 3 p.m. UTC | #2
On Tue, Jul 18, 2023 at 09:21:10PM +0800, Hao Xu wrote:
> From: Hao Xu <howeyxu@tencent.com>
> 
> This add support for getdents64 to io_uring, acting exactly like the
> syscall: the directory is iterated from it's current's position as
> stored in the file struct, and the file's position is updated exactly as
> if getdents64 had been called.
> 
> For filesystems that support NOWAIT in iterate_shared(), try to use it
> first; if a user already knows the filesystem they use do not support
> nowait they can force async through IOSQE_ASYNC in the sqe flags,
> avoiding the need to bounce back through a useless EAGAIN return.
> 
> Co-developed-by: Dominique Martinet <asmadeus@codewreck.org>
> Signed-off-by: Dominique Martinet <asmadeus@codewreck.org>
> Signed-off-by: Hao Xu <howeyxu@tencent.com>
> ---
>  include/uapi/linux/io_uring.h |  7 +++++
>  io_uring/fs.c                 | 55 +++++++++++++++++++++++++++++++++++
>  io_uring/fs.h                 |  3 ++
>  io_uring/opdef.c              |  8 +++++
>  4 files changed, 73 insertions(+)
> 
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index 36f9c73082de..b200b2600622 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -65,6 +65,7 @@ struct io_uring_sqe {
>  		__u32		xattr_flags;
>  		__u32		msg_ring_flags;
>  		__u32		uring_cmd_flags;
> +		__u32		getdents_flags;
>  	};
>  	__u64	user_data;	/* data to be passed back at completion time */
>  	/* pack this to avoid bogus arm OABI complaints */
> @@ -235,6 +236,7 @@ enum io_uring_op {
>  	IORING_OP_URING_CMD,
>  	IORING_OP_SEND_ZC,
>  	IORING_OP_SENDMSG_ZC,
> +	IORING_OP_GETDENTS,
>  
>  	/* this goes last, obviously */
>  	IORING_OP_LAST,
> @@ -273,6 +275,11 @@ enum io_uring_op {
>   */
>  #define SPLICE_F_FD_IN_FIXED	(1U << 31) /* the last bit of __u32 */
>  
> +/*
> + * sqe->getdents_flags
> + */
> +#define IORING_GETDENTS_REWIND	(1U << 0)
> +
>  /*
>   * POLL_ADD flags. Note that since sqe->poll_events is the flag space, the
>   * command flags for POLL_ADD are stored in sqe->len.
> diff --git a/io_uring/fs.c b/io_uring/fs.c
> index f6a69a549fd4..480f25677fed 100644
> --- a/io_uring/fs.c
> +++ b/io_uring/fs.c
> @@ -47,6 +47,13 @@ struct io_link {
>  	int				flags;
>  };
>  
> +struct io_getdents {
> +	struct file			*file;
> +	struct linux_dirent64 __user	*dirent;
> +	unsigned int			count;
> +	int				flags;
> +};
> +
>  int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>  {
>  	struct io_rename *ren = io_kiocb_to_cmd(req, struct io_rename);
> @@ -291,3 +298,51 @@ void io_link_cleanup(struct io_kiocb *req)
>  	putname(sl->oldpath);
>  	putname(sl->newpath);
>  }
> +
> +int io_getdents_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
> +{
> +	struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents);
> +
> +	if (READ_ONCE(sqe->off) != 0)
> +		return -EINVAL;
> +
> +	gd->dirent = u64_to_user_ptr(READ_ONCE(sqe->addr));
> +	gd->count = READ_ONCE(sqe->len);
> +
> +	return 0;
> +}
> +
> +int io_getdents(struct io_kiocb *req, unsigned int issue_flags)
> +{
> +	struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents);
> +	struct file *file = req->file;
> +	unsigned long getdents_flags = 0;
> +	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;

Hm, I'm not sure what exactly the rules are for IO_URING_F_NONBLOCK.
But to point this out:

vfs_getdents()
-> iterate_dir()
   {
        if (shared)
                res = down_read_killable(&inode->i_rwsem);
        else
                res = down_write_killable(&inode->i_rwsem);
   }

which means you can still end up sleeping here before you go into a
filesystem that does actually support non-waiting getdents. So if you
have concurrent operations that grab inode lock (touch, mkdir etc) you
can end up sleeping here.

Is that intentional or an oversight? If the former can someone please
explain the rules and why it's fine in this case?

> +	bool should_lock = file->f_mode & FMODE_ATOMIC_POS;
> +	int ret;
> +
> +	if (force_nonblock) {
> +		if (!(req->file->f_mode & FMODE_NOWAIT))
> +			return -EAGAIN;
> +
> +		getdents_flags = DIR_CONTEXT_F_NOWAIT;
> +	}
> +
> +	if (should_lock) {
> +		if (!force_nonblock)
> +			mutex_lock(&file->f_pos_lock);
> +		else if (!mutex_trylock(&file->f_pos_lock))
> +			return -EAGAIN;
> +	}

That now looks like it works.

> +
> +	ret = vfs_getdents(file, gd->dirent, gd->count, getdents_flags);
> +	if (should_lock)
> +		mutex_unlock(&file->f_pos_lock);
Hao Xu July 27, 2023, 11:51 a.m. UTC | #3
On 7/26/23 23:00, Christian Brauner wrote:
> On Tue, Jul 18, 2023 at 09:21:10PM +0800, Hao Xu wrote:
>> From: Hao Xu <howeyxu@tencent.com>
>>
>> This add support for getdents64 to io_uring, acting exactly like the
>> syscall: the directory is iterated from it's current's position as
>> stored in the file struct, and the file's position is updated exactly as
>> if getdents64 had been called.
>>
>> For filesystems that support NOWAIT in iterate_shared(), try to use it
>> first; if a user already knows the filesystem they use do not support
>> nowait they can force async through IOSQE_ASYNC in the sqe flags,
>> avoiding the need to bounce back through a useless EAGAIN return.
>>
>> Co-developed-by: Dominique Martinet <asmadeus@codewreck.org>
>> Signed-off-by: Dominique Martinet <asmadeus@codewreck.org>
>> Signed-off-by: Hao Xu <howeyxu@tencent.com>
>> ---
>>   include/uapi/linux/io_uring.h |  7 +++++
>>   io_uring/fs.c                 | 55 +++++++++++++++++++++++++++++++++++
>>   io_uring/fs.h                 |  3 ++
>>   io_uring/opdef.c              |  8 +++++
>>   4 files changed, 73 insertions(+)
>>
>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
>> index 36f9c73082de..b200b2600622 100644
>> --- a/include/uapi/linux/io_uring.h
>> +++ b/include/uapi/linux/io_uring.h
>> @@ -65,6 +65,7 @@ struct io_uring_sqe {
>>   		__u32		xattr_flags;
>>   		__u32		msg_ring_flags;
>>   		__u32		uring_cmd_flags;
>> +		__u32		getdents_flags;
>>   	};
>>   	__u64	user_data;	/* data to be passed back at completion time */
>>   	/* pack this to avoid bogus arm OABI complaints */
>> @@ -235,6 +236,7 @@ enum io_uring_op {
>>   	IORING_OP_URING_CMD,
>>   	IORING_OP_SEND_ZC,
>>   	IORING_OP_SENDMSG_ZC,
>> +	IORING_OP_GETDENTS,
>>   
>>   	/* this goes last, obviously */
>>   	IORING_OP_LAST,
>> @@ -273,6 +275,11 @@ enum io_uring_op {
>>    */
>>   #define SPLICE_F_FD_IN_FIXED	(1U << 31) /* the last bit of __u32 */
>>   
>> +/*
>> + * sqe->getdents_flags
>> + */
>> +#define IORING_GETDENTS_REWIND	(1U << 0)
>> +
>>   /*
>>    * POLL_ADD flags. Note that since sqe->poll_events is the flag space, the
>>    * command flags for POLL_ADD are stored in sqe->len.
>> diff --git a/io_uring/fs.c b/io_uring/fs.c
>> index f6a69a549fd4..480f25677fed 100644
>> --- a/io_uring/fs.c
>> +++ b/io_uring/fs.c
>> @@ -47,6 +47,13 @@ struct io_link {
>>   	int				flags;
>>   };
>>   
>> +struct io_getdents {
>> +	struct file			*file;
>> +	struct linux_dirent64 __user	*dirent;
>> +	unsigned int			count;
>> +	int				flags;
>> +};
>> +
>>   int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>   {
>>   	struct io_rename *ren = io_kiocb_to_cmd(req, struct io_rename);
>> @@ -291,3 +298,51 @@ void io_link_cleanup(struct io_kiocb *req)
>>   	putname(sl->oldpath);
>>   	putname(sl->newpath);
>>   }
>> +
>> +int io_getdents_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>> +{
>> +	struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents);
>> +
>> +	if (READ_ONCE(sqe->off) != 0)
>> +		return -EINVAL;
>> +
>> +	gd->dirent = u64_to_user_ptr(READ_ONCE(sqe->addr));
>> +	gd->count = READ_ONCE(sqe->len);
>> +
>> +	return 0;
>> +}
>> +
>> +int io_getdents(struct io_kiocb *req, unsigned int issue_flags)
>> +{
>> +	struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents);
>> +	struct file *file = req->file;
>> +	unsigned long getdents_flags = 0;
>> +	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
> 
> Hm, I'm not sure what exactly the rules are for IO_URING_F_NONBLOCK.
> But to point this out:
> 
> vfs_getdents()
> -> iterate_dir()
>     {
>          if (shared)
>                  res = down_read_killable(&inode->i_rwsem);
>          else
>                  res = down_write_killable(&inode->i_rwsem);
>     }
> 
> which means you can still end up sleeping here before you go into a
> filesystem that does actually support non-waiting getdents. So if you
> have concurrent operations that grab inode lock (touch, mkdir etc) you
> can end up sleeping here.
> 
> Is that intentional or an oversight? If the former can someone please
> explain the rules and why it's fine in this case?

I actually saw this semaphore, and there is another xfs lock in
file_accessed
   --> touch_atime
     --> inode_update_time
       --> inode->i_op->update_time == xfs_vn_update_time

Forgot to point them out in the cover-letter..., I didn't modify them
since I'm not very sure about if we should do so, and I saw Stefan's
patchset didn't modify them too.

My personnal thinking is we should apply trylock logic for this
inode->i_rwsem. For xfs lock in touch_atime, we should do that since it
doesn't make sense to rollback all the stuff while we are almost at the
end of getdents because of a lock.


> 
>> +	bool should_lock = file->f_mode & FMODE_ATOMIC_POS;
>> +	int ret;
>> +
>> +	if (force_nonblock) {
>> +		if (!(req->file->f_mode & FMODE_NOWAIT))
>> +			return -EAGAIN;
>> +
>> +		getdents_flags = DIR_CONTEXT_F_NOWAIT;
>> +	}
>> +
>> +	if (should_lock) {
>> +		if (!force_nonblock)
>> +			mutex_lock(&file->f_pos_lock);
>> +		else if (!mutex_trylock(&file->f_pos_lock))
>> +			return -EAGAIN;
>> +	}
> 
> That now looks like it works.
> 
>> +
>> +	ret = vfs_getdents(file, gd->dirent, gd->count, getdents_flags);
>> +	if (should_lock)
>> +		mutex_unlock(&file->f_pos_lock);
Christian Brauner July 27, 2023, 2:27 p.m. UTC | #4
On Thu, Jul 27, 2023 at 07:51:19PM +0800, Hao Xu wrote:
> On 7/26/23 23:00, Christian Brauner wrote:
> > On Tue, Jul 18, 2023 at 09:21:10PM +0800, Hao Xu wrote:
> > > From: Hao Xu <howeyxu@tencent.com>
> > > 
> > > This add support for getdents64 to io_uring, acting exactly like the
> > > syscall: the directory is iterated from it's current's position as
> > > stored in the file struct, and the file's position is updated exactly as
> > > if getdents64 had been called.
> > > 
> > > For filesystems that support NOWAIT in iterate_shared(), try to use it
> > > first; if a user already knows the filesystem they use do not support
> > > nowait they can force async through IOSQE_ASYNC in the sqe flags,
> > > avoiding the need to bounce back through a useless EAGAIN return.
> > > 
> > > Co-developed-by: Dominique Martinet <asmadeus@codewreck.org>
> > > Signed-off-by: Dominique Martinet <asmadeus@codewreck.org>
> > > Signed-off-by: Hao Xu <howeyxu@tencent.com>
> > > ---
> > >   include/uapi/linux/io_uring.h |  7 +++++
> > >   io_uring/fs.c                 | 55 +++++++++++++++++++++++++++++++++++
> > >   io_uring/fs.h                 |  3 ++
> > >   io_uring/opdef.c              |  8 +++++
> > >   4 files changed, 73 insertions(+)
> > > 
> > > diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> > > index 36f9c73082de..b200b2600622 100644
> > > --- a/include/uapi/linux/io_uring.h
> > > +++ b/include/uapi/linux/io_uring.h
> > > @@ -65,6 +65,7 @@ struct io_uring_sqe {
> > >   		__u32		xattr_flags;
> > >   		__u32		msg_ring_flags;
> > >   		__u32		uring_cmd_flags;
> > > +		__u32		getdents_flags;
> > >   	};
> > >   	__u64	user_data;	/* data to be passed back at completion time */
> > >   	/* pack this to avoid bogus arm OABI complaints */
> > > @@ -235,6 +236,7 @@ enum io_uring_op {
> > >   	IORING_OP_URING_CMD,
> > >   	IORING_OP_SEND_ZC,
> > >   	IORING_OP_SENDMSG_ZC,
> > > +	IORING_OP_GETDENTS,
> > >   	/* this goes last, obviously */
> > >   	IORING_OP_LAST,
> > > @@ -273,6 +275,11 @@ enum io_uring_op {
> > >    */
> > >   #define SPLICE_F_FD_IN_FIXED	(1U << 31) /* the last bit of __u32 */
> > > +/*
> > > + * sqe->getdents_flags
> > > + */
> > > +#define IORING_GETDENTS_REWIND	(1U << 0)
> > > +
> > >   /*
> > >    * POLL_ADD flags. Note that since sqe->poll_events is the flag space, the
> > >    * command flags for POLL_ADD are stored in sqe->len.
> > > diff --git a/io_uring/fs.c b/io_uring/fs.c
> > > index f6a69a549fd4..480f25677fed 100644
> > > --- a/io_uring/fs.c
> > > +++ b/io_uring/fs.c
> > > @@ -47,6 +47,13 @@ struct io_link {
> > >   	int				flags;
> > >   };
> > > +struct io_getdents {
> > > +	struct file			*file;
> > > +	struct linux_dirent64 __user	*dirent;
> > > +	unsigned int			count;
> > > +	int				flags;
> > > +};
> > > +
> > >   int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
> > >   {
> > >   	struct io_rename *ren = io_kiocb_to_cmd(req, struct io_rename);
> > > @@ -291,3 +298,51 @@ void io_link_cleanup(struct io_kiocb *req)
> > >   	putname(sl->oldpath);
> > >   	putname(sl->newpath);
> > >   }
> > > +
> > > +int io_getdents_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
> > > +{
> > > +	struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents);
> > > +
> > > +	if (READ_ONCE(sqe->off) != 0)
> > > +		return -EINVAL;
> > > +
> > > +	gd->dirent = u64_to_user_ptr(READ_ONCE(sqe->addr));
> > > +	gd->count = READ_ONCE(sqe->len);
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +int io_getdents(struct io_kiocb *req, unsigned int issue_flags)
> > > +{
> > > +	struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents);
> > > +	struct file *file = req->file;
> > > +	unsigned long getdents_flags = 0;
> > > +	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
> > 
> > Hm, I'm not sure what exactly the rules are for IO_URING_F_NONBLOCK.
> > But to point this out:
> > 
> > vfs_getdents()
> > -> iterate_dir()
> >     {
> >          if (shared)
> >                  res = down_read_killable(&inode->i_rwsem);
> >          else
> >                  res = down_write_killable(&inode->i_rwsem);
> >     }
> > 
> > which means you can still end up sleeping here before you go into a
> > filesystem that does actually support non-waiting getdents. So if you
> > have concurrent operations that grab inode lock (touch, mkdir etc) you
> > can end up sleeping here.
> > 
> > Is that intentional or an oversight? If the former can someone please
> > explain the rules and why it's fine in this case?
> 
> I actually saw this semaphore, and there is another xfs lock in
> file_accessed
>   --> touch_atime
>     --> inode_update_time
>       --> inode->i_op->update_time == xfs_vn_update_time
> 
> Forgot to point them out in the cover-letter..., I didn't modify them
> since I'm not very sure about if we should do so, and I saw Stefan's
> patchset didn't modify them too.
> 
> My personnal thinking is we should apply trylock logic for this
> inode->i_rwsem. For xfs lock in touch_atime, we should do that since it
> doesn't make sense to rollback all the stuff while we are almost at the
> end of getdents because of a lock.

That manoeuvres around the problem. Which I'm slightly more sensitive
too as this review is a rather expensive one.

Plus, it seems fixable in at least two ways:

For both we need to be able to tell the filesystem that a nowait atime
update is requested. Simple thing seems to me to add a S_NOWAIT flag to
file_time_flags and passing that via i_op->update_time() which already
has a flag argument. That would likely also help kiocb_modified().

file_accessed()
-> touch_atime()
   -> inode_update_time()
      -> i_op->update_time == xfs_vn_update_time()

Then we have two options afaict:

(1) best-effort atime update

file_accessed() already has the builtin assumption that updating atime
might fail for other reasons - see the comment in there. So it is
somewhat best-effort already.

(2) move atime update before calling into filesystem

If we want to be sure that access time is updated when a readdir request
is issued through io_uring then we need to have file_accessed() give a
return value and expose a new helper for io_uring or modify
vfs_getdents() to do something like:

vfs_getdents()
{
	if (nowait)
		down_read_trylock()

	if (!IS_DEADDIR(inode)) {
		ret = file_accessed(file);
		if (ret == -EAGAIN)
			goto out_unlock;

		f_op->iterate_shared()
	}
}

It's not unprecedented to do update atime before the actual operation
has been done afaict. That's already the case in xfs_file_write_checks()
which is called before anything is written. So that seems ok.

Does any of these two options work for the xfs maintainers and Jens?
Pavel Begunkov July 27, 2023, 3:12 p.m. UTC | #5
On 7/27/23 15:27, Christian Brauner wrote:
> On Thu, Jul 27, 2023 at 07:51:19PM +0800, Hao Xu wrote:
>> On 7/26/23 23:00, Christian Brauner wrote:
>>> On Tue, Jul 18, 2023 at 09:21:10PM +0800, Hao Xu wrote:
>>>> From: Hao Xu <howeyxu@tencent.com>
>>>>
>>>> This add support for getdents64 to io_uring, acting exactly like the
>>>> syscall: the directory is iterated from it's current's position as
>>>> stored in the file struct, and the file's position is updated exactly as
>>>> if getdents64 had been called.
>>>>
>>>> For filesystems that support NOWAIT in iterate_shared(), try to use it
>>>> first; if a user already knows the filesystem they use do not support
>>>> nowait they can force async through IOSQE_ASYNC in the sqe flags,
>>>> avoiding the need to bounce back through a useless EAGAIN return.
>>>>
>>>> Co-developed-by: Dominique Martinet <asmadeus@codewreck.org>
>>>> Signed-off-by: Dominique Martinet <asmadeus@codewreck.org>
>>>> Signed-off-by: Hao Xu <howeyxu@tencent.com>
>>>> ---
[...]
>> I actually saw this semaphore, and there is another xfs lock in
>> file_accessed
>>    --> touch_atime
>>      --> inode_update_time
>>        --> inode->i_op->update_time == xfs_vn_update_time
>>
>> Forgot to point them out in the cover-letter..., I didn't modify them
>> since I'm not very sure about if we should do so, and I saw Stefan's
>> patchset didn't modify them too.
>>
>> My personnal thinking is we should apply trylock logic for this
>> inode->i_rwsem. For xfs lock in touch_atime, we should do that since it
>> doesn't make sense to rollback all the stuff while we are almost at the
>> end of getdents because of a lock.
> 
> That manoeuvres around the problem. Which I'm slightly more sensitive
> too as this review is a rather expensive one.
> 
> Plus, it seems fixable in at least two ways:
> 
> For both we need to be able to tell the filesystem that a nowait atime
> update is requested. Simple thing seems to me to add a S_NOWAIT flag to
> file_time_flags and passing that via i_op->update_time() which already
> has a flag argument. That would likely also help kiocb_modified().

fwiw, we've just recently had similar problems with io_uring read/write
and atime/mtime in prod environment, so we're interested in solving that
regardless of this patchset. I.e. io_uring issues rw with NOWAIT, {a,m}time
touch ignores that, that stalls other submissions and completely screws
latency.

> file_accessed()
> -> touch_atime()
>     -> inode_update_time()
>        -> i_op->update_time == xfs_vn_update_time()
> 
> Then we have two options afaict:
> 
> (1) best-effort atime update
> 
> file_accessed() already has the builtin assumption that updating atime
> might fail for other reasons - see the comment in there. So it is
> somewhat best-effort already.
> 
> (2) move atime update before calling into filesystem
> 
> If we want to be sure that access time is updated when a readdir request
> is issued through io_uring then we need to have file_accessed() give a
> return value and expose a new helper for io_uring or modify
> vfs_getdents() to do something like:
> 
> vfs_getdents()
> {
> 	if (nowait)
> 		down_read_trylock()
> 
> 	if (!IS_DEADDIR(inode)) {
> 		ret = file_accessed(file);
> 		if (ret == -EAGAIN)
> 			goto out_unlock;
> 
> 		f_op->iterate_shared()
> 	}
> }
> 
> It's not unprecedented to do update atime before the actual operation
> has been done afaict. That's already the case in xfs_file_write_checks()
> which is called before anything is written. So that seems ok.
> 
> Does any of these two options work for the xfs maintainers and Jens?

It doesn't look (2) will solve it for reads/writes, at least without
the pain of changing the {write,read}_iter callbacks. 1) sounds good
to me from the io_uring perspective, but I guess it won't work
for mtime?
Christian Brauner July 27, 2023, 3:52 p.m. UTC | #6
On Thu, Jul 27, 2023 at 04:12:12PM +0100, Pavel Begunkov wrote:
> On 7/27/23 15:27, Christian Brauner wrote:
> > On Thu, Jul 27, 2023 at 07:51:19PM +0800, Hao Xu wrote:
> > > On 7/26/23 23:00, Christian Brauner wrote:
> > > > On Tue, Jul 18, 2023 at 09:21:10PM +0800, Hao Xu wrote:
> > > > > From: Hao Xu <howeyxu@tencent.com>
> > > > > 
> > > > > This add support for getdents64 to io_uring, acting exactly like the
> > > > > syscall: the directory is iterated from it's current's position as
> > > > > stored in the file struct, and the file's position is updated exactly as
> > > > > if getdents64 had been called.
> > > > > 
> > > > > For filesystems that support NOWAIT in iterate_shared(), try to use it
> > > > > first; if a user already knows the filesystem they use do not support
> > > > > nowait they can force async through IOSQE_ASYNC in the sqe flags,
> > > > > avoiding the need to bounce back through a useless EAGAIN return.
> > > > > 
> > > > > Co-developed-by: Dominique Martinet <asmadeus@codewreck.org>
> > > > > Signed-off-by: Dominique Martinet <asmadeus@codewreck.org>
> > > > > Signed-off-by: Hao Xu <howeyxu@tencent.com>
> > > > > ---
> [...]
> > > I actually saw this semaphore, and there is another xfs lock in
> > > file_accessed
> > >    --> touch_atime
> > >      --> inode_update_time
> > >        --> inode->i_op->update_time == xfs_vn_update_time
> > > 
> > > Forgot to point them out in the cover-letter..., I didn't modify them
> > > since I'm not very sure about if we should do so, and I saw Stefan's
> > > patchset didn't modify them too.
> > > 
> > > My personnal thinking is we should apply trylock logic for this
> > > inode->i_rwsem. For xfs lock in touch_atime, we should do that since it
> > > doesn't make sense to rollback all the stuff while we are almost at the
> > > end of getdents because of a lock.
> > 
> > That manoeuvres around the problem. Which I'm slightly more sensitive
> > too as this review is a rather expensive one.
> > 
> > Plus, it seems fixable in at least two ways:
> > 
> > For both we need to be able to tell the filesystem that a nowait atime
> > update is requested. Simple thing seems to me to add a S_NOWAIT flag to
> > file_time_flags and passing that via i_op->update_time() which already
> > has a flag argument. That would likely also help kiocb_modified().
> 
> fwiw, we've just recently had similar problems with io_uring read/write
> and atime/mtime in prod environment, so we're interested in solving that
> regardless of this patchset. I.e. io_uring issues rw with NOWAIT, {a,m}time
> touch ignores that, that stalls other submissions and completely screws
> latency.
> 
> > file_accessed()
> > -> touch_atime()
> >     -> inode_update_time()
> >        -> i_op->update_time == xfs_vn_update_time()
> > 
> > Then we have two options afaict:
> > 
> > (1) best-effort atime update
> > 
> > file_accessed() already has the builtin assumption that updating atime
> > might fail for other reasons - see the comment in there. So it is
> > somewhat best-effort already.
> > 
> > (2) move atime update before calling into filesystem
> > 
> > If we want to be sure that access time is updated when a readdir request
> > is issued through io_uring then we need to have file_accessed() give a
> > return value and expose a new helper for io_uring or modify
> > vfs_getdents() to do something like:
> > 
> > vfs_getdents()
> > {
> > 	if (nowait)
> > 		down_read_trylock()
> > 
> > 	if (!IS_DEADDIR(inode)) {
> > 		ret = file_accessed(file);
> > 		if (ret == -EAGAIN)
> > 			goto out_unlock;
> > 
> > 		f_op->iterate_shared()
> > 	}
> > }
> > 
> > It's not unprecedented to do update atime before the actual operation
> > has been done afaict. That's already the case in xfs_file_write_checks()
> > which is called before anything is written. So that seems ok.
> > 
> > Does any of these two options work for the xfs maintainers and Jens?
> 
> It doesn't look (2) will solve it for reads/writes, at least without

It would also solve it for writes which is what my kiocb_modified()
comment was about. So right now you have:

kiocb_modified(IOCB_NOWAI)
-> file_modified_flags(IOCB_NOWAI)
   -> file_remove_privs(IOCB_NOWAIT) // already fully non-blocking
   -> file_accessed(IOCB_NOWAIT)
      -> i_op->update_time(S_ATIME | S_NOWAIT)

and since xfs_file_write_iter() calls xfs_file_write_checks() before
doing any actual work you'd now be fine.

For reads xfs_file_read_iter() would need to be changed to a similar
logic but that's for xfs to decide ultimately.

> the pain of changing the {write,read}_iter callbacks. 1) sounds good
> to me from the io_uring perspective, but I guess it won't work
> for mtime?

I would prefer 2) which seems cleaner to me. But I might miss why this
won't work. So input needed/wanted.
Pavel Begunkov July 27, 2023, 4:17 p.m. UTC | #7
On 7/27/23 16:52, Christian Brauner wrote:
> On Thu, Jul 27, 2023 at 04:12:12PM +0100, Pavel Begunkov wrote:
>> On 7/27/23 15:27, Christian Brauner wrote:
>>> On Thu, Jul 27, 2023 at 07:51:19PM +0800, Hao Xu wrote:
>>>> On 7/26/23 23:00, Christian Brauner wrote:
>>>>> On Tue, Jul 18, 2023 at 09:21:10PM +0800, Hao Xu wrote:
>>>>>> From: Hao Xu <howeyxu@tencent.com>
>>>>>>
>>>>>> This add support for getdents64 to io_uring, acting exactly like the
>>>>>> syscall: the directory is iterated from it's current's position as
>>>>>> stored in the file struct, and the file's position is updated exactly as
>>>>>> if getdents64 had been called.
>>>>>>
>>>>>> For filesystems that support NOWAIT in iterate_shared(), try to use it
>>>>>> first; if a user already knows the filesystem they use do not support
>>>>>> nowait they can force async through IOSQE_ASYNC in the sqe flags,
>>>>>> avoiding the need to bounce back through a useless EAGAIN return.
>>>>>>
>>>>>> Co-developed-by: Dominique Martinet <asmadeus@codewreck.org>
>>>>>> Signed-off-by: Dominique Martinet <asmadeus@codewreck.org>
>>>>>> Signed-off-by: Hao Xu <howeyxu@tencent.com>
>>>>>> ---
>> [...]
>>>> I actually saw this semaphore, and there is another xfs lock in
>>>> file_accessed
>>>>     --> touch_atime
>>>>       --> inode_update_time
>>>>         --> inode->i_op->update_time == xfs_vn_update_time
>>>>
>>>> Forgot to point them out in the cover-letter..., I didn't modify them
>>>> since I'm not very sure about if we should do so, and I saw Stefan's
>>>> patchset didn't modify them too.
>>>>
>>>> My personnal thinking is we should apply trylock logic for this
>>>> inode->i_rwsem. For xfs lock in touch_atime, we should do that since it
>>>> doesn't make sense to rollback all the stuff while we are almost at the
>>>> end of getdents because of a lock.
>>>
>>> That manoeuvres around the problem. Which I'm slightly more sensitive
>>> too as this review is a rather expensive one.
>>>
>>> Plus, it seems fixable in at least two ways:
>>>
>>> For both we need to be able to tell the filesystem that a nowait atime
>>> update is requested. Simple thing seems to me to add a S_NOWAIT flag to
>>> file_time_flags and passing that via i_op->update_time() which already
>>> has a flag argument. That would likely also help kiocb_modified().
>>
>> fwiw, we've just recently had similar problems with io_uring read/write
>> and atime/mtime in prod environment, so we're interested in solving that
>> regardless of this patchset. I.e. io_uring issues rw with NOWAIT, {a,m}time
>> touch ignores that, that stalls other submissions and completely screws
>> latency.
>>
>>> file_accessed()
>>> -> touch_atime()
>>>      -> inode_update_time()
>>>         -> i_op->update_time == xfs_vn_update_time()
>>>
>>> Then we have two options afaict:
>>>
>>> (1) best-effort atime update
>>>
>>> file_accessed() already has the builtin assumption that updating atime
>>> might fail for other reasons - see the comment in there. So it is
>>> somewhat best-effort already.
>>>
>>> (2) move atime update before calling into filesystem
>>>
>>> If we want to be sure that access time is updated when a readdir request
>>> is issued through io_uring then we need to have file_accessed() give a
>>> return value and expose a new helper for io_uring or modify
>>> vfs_getdents() to do something like:
>>>
>>> vfs_getdents()
>>> {
>>> 	if (nowait)
>>> 		down_read_trylock()
>>>
>>> 	if (!IS_DEADDIR(inode)) {
>>> 		ret = file_accessed(file);
>>> 		if (ret == -EAGAIN)
>>> 			goto out_unlock;
>>>
>>> 		f_op->iterate_shared()
>>> 	}
>>> }
>>>
>>> It's not unprecedented to do update atime before the actual operation
>>> has been done afaict. That's already the case in xfs_file_write_checks()
>>> which is called before anything is written. So that seems ok.
>>>
>>> Does any of these two options work for the xfs maintainers and Jens?
>>
>> It doesn't look (2) will solve it for reads/writes, at least without
> 
> It would also solve it for writes which is what my kiocb_modified()
> comment was about. So right now you have:

Great, I assumed there are stricter requirements for mtime not
transiently failing.


> kiocb_modified(IOCB_NOWAI)
> -> file_modified_flags(IOCB_NOWAI)
>     -> file_remove_privs(IOCB_NOWAIT) // already fully non-blocking
>     -> file_accessed(IOCB_NOWAIT)
>        -> i_op->update_time(S_ATIME | S_NOWAIT)
> 
> and since xfs_file_write_iter() calls xfs_file_write_checks() before
> doing any actual work you'd now be fine.
> 
> For reads xfs_file_read_iter() would need to be changed to a similar
> logic but that's for xfs to decide ultimately.
> 
>> the pain of changing the {write,read}_iter callbacks. 1) sounds good
>> to me from the io_uring perspective, but I guess it won't work
>> for mtime?
> 
> I would prefer 2) which seems cleaner to me. But I might miss why this
> won't work. So input needed/wanted.

Maybe I didn't fully grasp the (2) idea

2.1: all read_iter, write_iter, etc. callbacks should do file_accessed()
before doing IO, which sounds like a good option if everyone agrees with
that. Taking a look at direct block io, it's already like this.

2.2: Having io_uring doing file_accessed(). Since it's all currently
hidden behind {read,write}_iter() callbacks and not easily extractable,
it doesn't like a good option, unless I missed sth.
E.g. this ugliness comes to mind.

io_uring_read() {
	file_accessed();
	file->f_op->read_iter(DONT_TOUCH_ATIME);
	...
}

read_iter_impl() {
	// some pre processing

	if (!(flags & DONT_TOUCH_ATIME))
		file_accessed();
}
Christian Brauner July 27, 2023, 4:28 p.m. UTC | #8
On Thu, Jul 27, 2023 at 05:17:30PM +0100, Pavel Begunkov wrote:
> On 7/27/23 16:52, Christian Brauner wrote:
> > On Thu, Jul 27, 2023 at 04:12:12PM +0100, Pavel Begunkov wrote:
> > > On 7/27/23 15:27, Christian Brauner wrote:
> > > > On Thu, Jul 27, 2023 at 07:51:19PM +0800, Hao Xu wrote:
> > > > > On 7/26/23 23:00, Christian Brauner wrote:
> > > > > > On Tue, Jul 18, 2023 at 09:21:10PM +0800, Hao Xu wrote:
> > > > > > > From: Hao Xu <howeyxu@tencent.com>
> > > > > > > 
> > > > > > > This add support for getdents64 to io_uring, acting exactly like the
> > > > > > > syscall: the directory is iterated from it's current's position as
> > > > > > > stored in the file struct, and the file's position is updated exactly as
> > > > > > > if getdents64 had been called.
> > > > > > > 
> > > > > > > For filesystems that support NOWAIT in iterate_shared(), try to use it
> > > > > > > first; if a user already knows the filesystem they use do not support
> > > > > > > nowait they can force async through IOSQE_ASYNC in the sqe flags,
> > > > > > > avoiding the need to bounce back through a useless EAGAIN return.
> > > > > > > 
> > > > > > > Co-developed-by: Dominique Martinet <asmadeus@codewreck.org>
> > > > > > > Signed-off-by: Dominique Martinet <asmadeus@codewreck.org>
> > > > > > > Signed-off-by: Hao Xu <howeyxu@tencent.com>
> > > > > > > ---
> > > [...]
> > > > > I actually saw this semaphore, and there is another xfs lock in
> > > > > file_accessed
> > > > >     --> touch_atime
> > > > >       --> inode_update_time
> > > > >         --> inode->i_op->update_time == xfs_vn_update_time
> > > > > 
> > > > > Forgot to point them out in the cover-letter..., I didn't modify them
> > > > > since I'm not very sure about if we should do so, and I saw Stefan's
> > > > > patchset didn't modify them too.
> > > > > 
> > > > > My personnal thinking is we should apply trylock logic for this
> > > > > inode->i_rwsem. For xfs lock in touch_atime, we should do that since it
> > > > > doesn't make sense to rollback all the stuff while we are almost at the
> > > > > end of getdents because of a lock.
> > > > 
> > > > That manoeuvres around the problem. Which I'm slightly more sensitive
> > > > too as this review is a rather expensive one.
> > > > 
> > > > Plus, it seems fixable in at least two ways:
> > > > 
> > > > For both we need to be able to tell the filesystem that a nowait atime
> > > > update is requested. Simple thing seems to me to add a S_NOWAIT flag to
> > > > file_time_flags and passing that via i_op->update_time() which already
> > > > has a flag argument. That would likely also help kiocb_modified().
> > > 
> > > fwiw, we've just recently had similar problems with io_uring read/write
> > > and atime/mtime in prod environment, so we're interested in solving that
> > > regardless of this patchset. I.e. io_uring issues rw with NOWAIT, {a,m}time
> > > touch ignores that, that stalls other submissions and completely screws
> > > latency.
> > > 
> > > > file_accessed()
> > > > -> touch_atime()
> > > >      -> inode_update_time()
> > > >         -> i_op->update_time == xfs_vn_update_time()
> > > > 
> > > > Then we have two options afaict:
> > > > 
> > > > (1) best-effort atime update
> > > > 
> > > > file_accessed() already has the builtin assumption that updating atime
> > > > might fail for other reasons - see the comment in there. So it is
> > > > somewhat best-effort already.
> > > > 
> > > > (2) move atime update before calling into filesystem
> > > > 
> > > > If we want to be sure that access time is updated when a readdir request
> > > > is issued through io_uring then we need to have file_accessed() give a
> > > > return value and expose a new helper for io_uring or modify
> > > > vfs_getdents() to do something like:
> > > > 
> > > > vfs_getdents()
> > > > {
> > > > 	if (nowait)
> > > > 		down_read_trylock()
> > > > 
> > > > 	if (!IS_DEADDIR(inode)) {
> > > > 		ret = file_accessed(file);
> > > > 		if (ret == -EAGAIN)
> > > > 			goto out_unlock;
> > > > 
> > > > 		f_op->iterate_shared()
> > > > 	}
> > > > }
> > > > 
> > > > It's not unprecedented to do update atime before the actual operation
> > > > has been done afaict. That's already the case in xfs_file_write_checks()
> > > > which is called before anything is written. So that seems ok.
> > > > 
> > > > Does any of these two options work for the xfs maintainers and Jens?
> > > 
> > > It doesn't look (2) will solve it for reads/writes, at least without
> > 
> > It would also solve it for writes which is what my kiocb_modified()
> > comment was about. So right now you have:
> 
> Great, I assumed there are stricter requirements for mtime not
> transiently failing.

But I mean then wouldn't this already be a problem today?
kiocb_modified() can error out with EAGAIN today:

          ret = inode_needs_update_time(inode, &now);
          if (ret <= 0)
                  return ret;
          if (flags & IOCB_NOWAIT)
                  return -EAGAIN;

          return __file_update_time(file, &now, ret);

the thing is that it doesn't matter for ->write_iter() - for xfs at
least - because xfs does it as part of preparatory checks before
actually doing any real work. The problem happens when you do actual
work and afterwards call kiocb_modified(). That's why I think (2) is
preferable.

> 
> 
> > kiocb_modified(IOCB_NOWAI)
> > -> file_modified_flags(IOCB_NOWAI)
> >     -> file_remove_privs(IOCB_NOWAIT) // already fully non-blocking
> >     -> file_accessed(IOCB_NOWAIT)
> >        -> i_op->update_time(S_ATIME | S_NOWAIT)
> > 
> > and since xfs_file_write_iter() calls xfs_file_write_checks() before
> > doing any actual work you'd now be fine.
> > 
> > For reads xfs_file_read_iter() would need to be changed to a similar
> > logic but that's for xfs to decide ultimately.
> > 
> > > the pain of changing the {write,read}_iter callbacks. 1) sounds good
> > > to me from the io_uring perspective, but I guess it won't work
> > > for mtime?
> > 
> > I would prefer 2) which seems cleaner to me. But I might miss why this
> > won't work. So input needed/wanted.
> 
> Maybe I didn't fully grasp the (2) idea
> 
> 2.1: all read_iter, write_iter, etc. callbacks should do file_accessed()
> before doing IO, which sounds like a good option if everyone agrees with
> that. Taking a look at direct block io, it's already like this.

Yes, that's what I'm talking about. I'm asking whether that's ok for xfs
maintainers basically. i_op->write_iter() already works like that since
the dawn of time but i_op->read_iter doesn't and I'm proposing to make
it work like that and wondering if there's any issues I'm unaware of.

> 
> 2.2: Having io_uring doing file_accessed(). Since it's all currently
> hidden behind {read,write}_iter() callbacks and not easily extractable,
> it doesn't like a good option, unless I missed sth.

I think that would be the wrong approach and is definitely not what I
meant.
Hao Xu July 30, 2023, 6:02 p.m. UTC | #9
Hi Christian,

On 7/27/23 22:27, Christian Brauner wrote:
> On Thu, Jul 27, 2023 at 07:51:19PM +0800, Hao Xu wrote:
>> On 7/26/23 23:00, Christian Brauner wrote:
>>> On Tue, Jul 18, 2023 at 09:21:10PM +0800, Hao Xu wrote:
>>>> From: Hao Xu <howeyxu@tencent.com>
>>>>
>>>> This add support for getdents64 to io_uring, acting exactly like the
>>>> syscall: the directory is iterated from it's current's position as
>>>> stored in the file struct, and the file's position is updated exactly as
>>>> if getdents64 had been called.
>>>>
>>>> For filesystems that support NOWAIT in iterate_shared(), try to use it
>>>> first; if a user already knows the filesystem they use do not support
>>>> nowait they can force async through IOSQE_ASYNC in the sqe flags,
>>>> avoiding the need to bounce back through a useless EAGAIN return.
>>>>
>>>> Co-developed-by: Dominique Martinet <asmadeus@codewreck.org>
>>>> Signed-off-by: Dominique Martinet <asmadeus@codewreck.org>
>>>> Signed-off-by: Hao Xu <howeyxu@tencent.com>
>>>> ---
>>>>    include/uapi/linux/io_uring.h |  7 +++++
>>>>    io_uring/fs.c                 | 55 +++++++++++++++++++++++++++++++++++
>>>>    io_uring/fs.h                 |  3 ++
>>>>    io_uring/opdef.c              |  8 +++++
>>>>    4 files changed, 73 insertions(+)
>>>>
>>>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
>>>> index 36f9c73082de..b200b2600622 100644
>>>> --- a/include/uapi/linux/io_uring.h
>>>> +++ b/include/uapi/linux/io_uring.h
>>>> @@ -65,6 +65,7 @@ struct io_uring_sqe {
>>>>    		__u32		xattr_flags;
>>>>    		__u32		msg_ring_flags;
>>>>    		__u32		uring_cmd_flags;
>>>> +		__u32		getdents_flags;
>>>>    	};
>>>>    	__u64	user_data;	/* data to be passed back at completion time */
>>>>    	/* pack this to avoid bogus arm OABI complaints */
>>>> @@ -235,6 +236,7 @@ enum io_uring_op {
>>>>    	IORING_OP_URING_CMD,
>>>>    	IORING_OP_SEND_ZC,
>>>>    	IORING_OP_SENDMSG_ZC,
>>>> +	IORING_OP_GETDENTS,
>>>>    	/* this goes last, obviously */
>>>>    	IORING_OP_LAST,
>>>> @@ -273,6 +275,11 @@ enum io_uring_op {
>>>>     */
>>>>    #define SPLICE_F_FD_IN_FIXED	(1U << 31) /* the last bit of __u32 */
>>>> +/*
>>>> + * sqe->getdents_flags
>>>> + */
>>>> +#define IORING_GETDENTS_REWIND	(1U << 0)
>>>> +
>>>>    /*
>>>>     * POLL_ADD flags. Note that since sqe->poll_events is the flag space, the
>>>>     * command flags for POLL_ADD are stored in sqe->len.
>>>> diff --git a/io_uring/fs.c b/io_uring/fs.c
>>>> index f6a69a549fd4..480f25677fed 100644
>>>> --- a/io_uring/fs.c
>>>> +++ b/io_uring/fs.c
>>>> @@ -47,6 +47,13 @@ struct io_link {
>>>>    	int				flags;
>>>>    };
>>>> +struct io_getdents {
>>>> +	struct file			*file;
>>>> +	struct linux_dirent64 __user	*dirent;
>>>> +	unsigned int			count;
>>>> +	int				flags;
>>>> +};
>>>> +
>>>>    int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>>>    {
>>>>    	struct io_rename *ren = io_kiocb_to_cmd(req, struct io_rename);
>>>> @@ -291,3 +298,51 @@ void io_link_cleanup(struct io_kiocb *req)
>>>>    	putname(sl->oldpath);
>>>>    	putname(sl->newpath);
>>>>    }
>>>> +
>>>> +int io_getdents_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>>> +{
>>>> +	struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents);
>>>> +
>>>> +	if (READ_ONCE(sqe->off) != 0)
>>>> +		return -EINVAL;
>>>> +
>>>> +	gd->dirent = u64_to_user_ptr(READ_ONCE(sqe->addr));
>>>> +	gd->count = READ_ONCE(sqe->len);
>>>> +
>>>> +	return 0;
>>>> +}
>>>> +
>>>> +int io_getdents(struct io_kiocb *req, unsigned int issue_flags)
>>>> +{
>>>> +	struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents);
>>>> +	struct file *file = req->file;
>>>> +	unsigned long getdents_flags = 0;
>>>> +	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
>>>
>>> Hm, I'm not sure what exactly the rules are for IO_URING_F_NONBLOCK.
>>> But to point this out:
>>>
>>> vfs_getdents()
>>> -> iterate_dir()
>>>      {
>>>           if (shared)
>>>                   res = down_read_killable(&inode->i_rwsem);
>>>           else
>>>                   res = down_write_killable(&inode->i_rwsem);
>>>      }
>>>
>>> which means you can still end up sleeping here before you go into a
>>> filesystem that does actually support non-waiting getdents. So if you
>>> have concurrent operations that grab inode lock (touch, mkdir etc) you
>>> can end up sleeping here.
>>>
>>> Is that intentional or an oversight? If the former can someone please
>>> explain the rules and why it's fine in this case?
>>
>> I actually saw this semaphore, and there is another xfs lock in
>> file_accessed
>>    --> touch_atime
>>      --> inode_update_time
>>        --> inode->i_op->update_time == xfs_vn_update_time
>>
>> Forgot to point them out in the cover-letter..., I didn't modify them
>> since I'm not very sure about if we should do so, and I saw Stefan's
>> patchset didn't modify them too.
>>
>> My personnal thinking is we should apply trylock logic for this
>> inode->i_rwsem. For xfs lock in touch_atime, we should do that since it
>> doesn't make sense to rollback all the stuff while we are almost at the
>> end of getdents because of a lock.
> 
> That manoeuvres around the problem. Which I'm slightly more sensitive
> too as this review is a rather expensive one.
> 
> Plus, it seems fixable in at least two ways:
> 
> For both we need to be able to tell the filesystem that a nowait atime
> update is requested. Simple thing seems to me to add a S_NOWAIT flag to
> file_time_flags and passing that via i_op->update_time() which already
> has a flag argument. That would likely also help kiocb_modified().
> 
> file_accessed()
> -> touch_atime()
>     -> inode_update_time()
>        -> i_op->update_time == xfs_vn_update_time()
> 
> Then we have two options afaict:
> 
> (1) best-effort atime update
> 
> file_accessed() already has the builtin assumption that updating atime
> might fail for other reasons - see the comment in there. So it is
> somewhat best-effort already.
> 
> (2) move atime update before calling into filesystem
> 
> If we want to be sure that access time is updated when a readdir request
> is issued through io_uring then we need to have file_accessed() give a
> return value and expose a new helper for io_uring or modify
> vfs_getdents() to do something like:
> 
> vfs_getdents()
> {
> 	if (nowait)
> 		down_read_trylock()
> 
> 	if (!IS_DEADDIR(inode)) {
> 		ret = file_accessed(file);
> 		if (ret == -EAGAIN)
> 			goto out_unlock;
> 
> 		f_op->iterate_shared()
> 	}
> }
> 
> It's not unprecedented to do update atime before the actual operation
> has been done afaict. That's already the case in xfs_file_write_checks()
> which is called before anything is written. So that seems ok.

I'm not familiar with this part(the time update), I guess we should
revert the updated time if we succeed to do file_accessed(file) but
fail somewhere later in f_op->iterate_shared()? Or is it definitely
counted as an "access" as long as we start to call getdents to a file?

Thanks,
Hao

> 
> Does any of these two options work for the xfs maintainers and Jens?
Dave Chinner July 31, 2023, 1:33 a.m. UTC | #10
On Thu, Jul 27, 2023 at 04:27:30PM +0200, Christian Brauner wrote:
> On Thu, Jul 27, 2023 at 07:51:19PM +0800, Hao Xu wrote:
> > I actually saw this semaphore, and there is another xfs lock in
> > file_accessed
> >   --> touch_atime
> >     --> inode_update_time
> >       --> inode->i_op->update_time == xfs_vn_update_time
> > 
> > Forgot to point them out in the cover-letter..., I didn't modify them
> > since I'm not very sure about if we should do so, and I saw Stefan's
> > patchset didn't modify them too.
> > 
> > My personnal thinking is we should apply trylock logic for this
> > inode->i_rwsem. For xfs lock in touch_atime, we should do that since it
> > doesn't make sense to rollback all the stuff while we are almost at the
> > end of getdents because of a lock.
> 
> That manoeuvres around the problem. Which I'm slightly more sensitive
> too as this review is a rather expensive one.
> 
> Plus, it seems fixable in at least two ways:
> 
> For both we need to be able to tell the filesystem that a nowait atime
> update is requested. Simple thing seems to me to add a S_NOWAIT flag to
> file_time_flags and passing that via i_op->update_time() which already
> has a flag argument. That would likely also help kiocb_modified().

Wait - didn't we already fix this for mtime updates on IOCB_NOWAIT
modification operations? Yeah, we did:

kiocb_modified(iocb)
  file_modified_flags(iocb->ki_file, iocb->ki_flags)
    ....
    ret = inode_needs_update_time()
    if (ret <= 0)
	return ret;
    if (flags & IOCB_NOWAIT)
	return -EAGAIN;
    <does timestamp update>

> file_accessed()
> -> touch_atime()
>    -> inode_update_time()
>       -> i_op->update_time == xfs_vn_update_time()

Yeah, so this needs the same treatment as file_modified_flags() -
touch_atime() needs a flag variant that passes IOCB_NOWAIT, and
after atime_needs_update() returns trues we should check IOCB_NOWAIT
and return EAGAIN if it is set. That will punt the operation that
needs to the update to a worker thread that can block....

> Then we have two options afaict:
> 
> (1) best-effort atime update
> 
> file_accessed() already has the builtin assumption that updating atime
> might fail for other reasons - see the comment in there. So it is
> somewhat best-effort already.
> 
> (2) move atime update before calling into filesystem
> 
> If we want to be sure that access time is updated when a readdir request
> is issued through io_uring then we need to have file_accessed() give a
> return value and expose a new helper for io_uring or modify
> vfs_getdents() to do something like:
> 
> vfs_getdents()
> {
> 	if (nowait)
> 		down_read_trylock()
> 
> 	if (!IS_DEADDIR(inode)) {
> 		ret = file_accessed(file);
> 		if (ret == -EAGAIN)
> 			goto out_unlock;
> 
> 		f_op->iterate_shared()
> 	}
> }

Yup, that's the sort of thing that needs to be done.

But as I said in the "llseek for io-uring" thread, we need to stop
the game of whack-a-mole passing random nowait boolean flags to VFS
operations before it starts in earnest.  We really need a common
context structure (like we have a kiocb for IO operations) that
holds per operation control state so we have consistency across all
the operations that we need different behaviours for.

> It's not unprecedented to do update atime before the actual operation
> has been done afaict. That's already the case in xfs_file_write_checks()
> which is called before anything is written. So that seems ok.

Writes don't update atime - they update mtime, and there are other
considerations for doing the mtime update before the data
modification. e.g. we have to strip permissions from the inode prior
to any changes that are going to be made, so we've already got to do
all the "change inode metadata" checks and modifications before the
data is written anyway....

Cheers,

Dave.
Dave Chinner July 31, 2023, 1:58 a.m. UTC | #11
On Thu, Jul 27, 2023 at 06:28:52PM +0200, Christian Brauner wrote:
> On Thu, Jul 27, 2023 at 05:17:30PM +0100, Pavel Begunkov wrote:
> > On 7/27/23 16:52, Christian Brauner wrote:
> > > On Thu, Jul 27, 2023 at 04:12:12PM +0100, Pavel Begunkov wrote:
> > > It would also solve it for writes which is what my kiocb_modified()
> > > comment was about. So right now you have:
> > 
> > Great, I assumed there are stricter requirements for mtime not
> > transiently failing.
> 
> But I mean then wouldn't this already be a problem today?
> kiocb_modified() can error out with EAGAIN today:
> 
>           ret = inode_needs_update_time(inode, &now);
>           if (ret <= 0)
>                   return ret;
>           if (flags & IOCB_NOWAIT)
>                   return -EAGAIN;
> 
>           return __file_update_time(file, &now, ret);
> 
> the thing is that it doesn't matter for ->write_iter() - for xfs at
> least - because xfs does it as part of preparatory checks before
> actually doing any real work. The problem happens when you do actual
> work and afterwards call kiocb_modified(). That's why I think (2) is
> preferable.

This has nothing to do with what "XFS does". It's actually an
IOCB_NOWAIT API design constraint.

That is, IOCB_NOWAIT means "complete the whole operation without
blocking or return -EAGAIN having done nothing".  If we have to do
something that might block (like a timestamp update) then we need to
punt the entire operation before anything has been modified.  This
requires all the "do we need to modify this" checks to be done up
front before we start modifying anything.

So while it looks like this might be "an XFS thing", that's because
XFS tends to be the first filesystem that most io_uring NOWAIT
functionality is implemented on. IOWs, what you see is XFS is doing
things the way IOCB_NOWAIT requires to be done. i.e. it's a
demonstration of how nonblocking filesystem modification operations
need to be run, not an "XFS thing"...

> > > I would prefer 2) which seems cleaner to me. But I might miss why this
> > > won't work. So input needed/wanted.
> > 
> > Maybe I didn't fully grasp the (2) idea
> > 
> > 2.1: all read_iter, write_iter, etc. callbacks should do file_accessed()
> > before doing IO, which sounds like a good option if everyone agrees with
> > that. Taking a look at direct block io, it's already like this.
> 
> Yes, that's what I'm talking about. I'm asking whether that's ok for xfs
> maintainers basically. i_op->write_iter() already works like that since
> the dawn of time but i_op->read_iter doesn't and I'm proposing to make
> it work like that and wondering if there's any issues I'm unaware of.

XFS already calls file_accessed() in the DIO read path before the
read gets issued. I don't see any problem with lifting it to before
the copy-out loop in filemap_read() because it is run regardless of
whether any data is read or any error occurred.  Hence it just
doesn't look like it matters if it is run before or after the
copy-out loop to me....

-Dave.
Hao Xu July 31, 2023, 7:34 a.m. UTC | #12
On 7/31/23 09:58, Dave Chinner wrote:
> On Thu, Jul 27, 2023 at 06:28:52PM +0200, Christian Brauner wrote:
>> On Thu, Jul 27, 2023 at 05:17:30PM +0100, Pavel Begunkov wrote:
>>> On 7/27/23 16:52, Christian Brauner wrote:
>>>> On Thu, Jul 27, 2023 at 04:12:12PM +0100, Pavel Begunkov wrote:
>>>> It would also solve it for writes which is what my kiocb_modified()
>>>> comment was about. So right now you have:
>>>
>>> Great, I assumed there are stricter requirements for mtime not
>>> transiently failing.
>>
>> But I mean then wouldn't this already be a problem today?
>> kiocb_modified() can error out with EAGAIN today:
>>
>>            ret = inode_needs_update_time(inode, &now);
>>            if (ret <= 0)
>>                    return ret;
>>            if (flags & IOCB_NOWAIT)
>>                    return -EAGAIN;
>>
>>            return __file_update_time(file, &now, ret);
>>
>> the thing is that it doesn't matter for ->write_iter() - for xfs at
>> least - because xfs does it as part of preparatory checks before
>> actually doing any real work. The problem happens when you do actual
>> work and afterwards call kiocb_modified(). That's why I think (2) is
>> preferable.
> 
> This has nothing to do with what "XFS does". It's actually an
> IOCB_NOWAIT API design constraint.
> 
> That is, IOCB_NOWAIT means "complete the whole operation without
> blocking or return -EAGAIN having done nothing".  If we have to do
> something that might block (like a timestamp update) then we need to
> punt the entire operation before anything has been modified.  This
> requires all the "do we need to modify this" checks to be done up
> front before we start modifying anything.
> 
> So while it looks like this might be "an XFS thing", that's because
> XFS tends to be the first filesystem that most io_uring NOWAIT
> functionality is implemented on. IOWs, what you see is XFS is doing
> things the way IOCB_NOWAIT requires to be done. i.e. it's a
> demonstration of how nonblocking filesystem modification operations
> need to be run, not an "XFS thing"...
> 
>>>> I would prefer 2) which seems cleaner to me. But I might miss why this
>>>> won't work. So input needed/wanted.
>>>
>>> Maybe I didn't fully grasp the (2) idea
>>>
>>> 2.1: all read_iter, write_iter, etc. callbacks should do file_accessed()
>>> before doing IO, which sounds like a good option if everyone agrees with
>>> that. Taking a look at direct block io, it's already like this.
>>
>> Yes, that's what I'm talking about. I'm asking whether that's ok for xfs
>> maintainers basically. i_op->write_iter() already works like that since
>> the dawn of time but i_op->read_iter doesn't and I'm proposing to make
>> it work like that and wondering if there's any issues I'm unaware of.
> 
> XFS already calls file_accessed() in the DIO read path before the
> read gets issued. I don't see any problem with lifting it to before

Hi Dave,

Here I've a question, in DIO read path, if we update the time but
later somehow got errors before actual reading, e.g. return -EAGAIN
from the xfs_ilock_iocb(), shouldn't we revert the time update since
we actually doesn't read the file? We can lazily update the time but
on the contrary a false update sounds weird to me.

Thanks,
Hao

> the copy-out loop in filemap_read() because it is run regardless of
> whether any data is read or any error occurred.  Hence it just
> doesn't look like it matters if it is run before or after the
> copy-out loop to me....
> 
> -Dave.
Christian Brauner July 31, 2023, 7:40 a.m. UTC | #13
On Mon, Jul 31, 2023 at 11:58:50AM +1000, Dave Chinner wrote:
> On Thu, Jul 27, 2023 at 06:28:52PM +0200, Christian Brauner wrote:
> > On Thu, Jul 27, 2023 at 05:17:30PM +0100, Pavel Begunkov wrote:
> > > On 7/27/23 16:52, Christian Brauner wrote:
> > > > On Thu, Jul 27, 2023 at 04:12:12PM +0100, Pavel Begunkov wrote:
> > > > It would also solve it for writes which is what my kiocb_modified()
> > > > comment was about. So right now you have:
> > > 
> > > Great, I assumed there are stricter requirements for mtime not
> > > transiently failing.
> > 
> > But I mean then wouldn't this already be a problem today?
> > kiocb_modified() can error out with EAGAIN today:
> > 
> >           ret = inode_needs_update_time(inode, &now);
> >           if (ret <= 0)
> >                   return ret;
> >           if (flags & IOCB_NOWAIT)
> >                   return -EAGAIN;
> > 
> >           return __file_update_time(file, &now, ret);
> > 
> > the thing is that it doesn't matter for ->write_iter() - for xfs at
> > least - because xfs does it as part of preparatory checks before
> > actually doing any real work. The problem happens when you do actual
> > work and afterwards call kiocb_modified(). That's why I think (2) is
> > preferable.
> 
> This has nothing to do with what "XFS does". It's actually an
> IOCB_NOWAIT API design constraint.
> 
> That is, IOCB_NOWAIT means "complete the whole operation without
> blocking or return -EAGAIN having done nothing".  If we have to do
> something that might block (like a timestamp update) then we need to
> punt the entire operation before anything has been modified.  This
> requires all the "do we need to modify this" checks to be done up
> front before we start modifying anything.
> 
> So while it looks like this might be "an XFS thing", that's because
> XFS tends to be the first filesystem that most io_uring NOWAIT
> functionality is implemented on. IOWs, what you see is XFS is doing
> things the way IOCB_NOWAIT requires to be done. i.e. it's a
> demonstration of how nonblocking filesystem modification operations
> need to be run, not an "XFS thing"...

Yes, I'm aware. I was trying to pay xfs a compliment for that but
somehow that didn't come through.

> 
> > > > I would prefer 2) which seems cleaner to me. But I might miss why this
> > > > won't work. So input needed/wanted.
> > > 
> > > Maybe I didn't fully grasp the (2) idea
> > > 
> > > 2.1: all read_iter, write_iter, etc. callbacks should do file_accessed()
> > > before doing IO, which sounds like a good option if everyone agrees with
> > > that. Taking a look at direct block io, it's already like this.
> > 
> > Yes, that's what I'm talking about. I'm asking whether that's ok for xfs
> > maintainers basically. i_op->write_iter() already works like that since
> > the dawn of time but i_op->read_iter doesn't and I'm proposing to make
> > it work like that and wondering if there's any issues I'm unaware of.
> 
> XFS already calls file_accessed() in the DIO read path before the
> read gets issued. I don't see any problem with lifting it to before
> the copy-out loop in filemap_read() because it is run regardless of
> whether any data is read or any error occurred.  Hence it just
> doesn't look like it matters if it is run before or after the
> copy-out loop to me....

Great.
Christian Brauner July 31, 2023, 7:50 a.m. UTC | #14
On Mon, Jul 31, 2023 at 03:34:48PM +0800, Hao Xu wrote:
> On 7/31/23 09:58, Dave Chinner wrote:
> > On Thu, Jul 27, 2023 at 06:28:52PM +0200, Christian Brauner wrote:
> > > On Thu, Jul 27, 2023 at 05:17:30PM +0100, Pavel Begunkov wrote:
> > > > On 7/27/23 16:52, Christian Brauner wrote:
> > > > > On Thu, Jul 27, 2023 at 04:12:12PM +0100, Pavel Begunkov wrote:
> > > > > It would also solve it for writes which is what my kiocb_modified()
> > > > > comment was about. So right now you have:
> > > > 
> > > > Great, I assumed there are stricter requirements for mtime not
> > > > transiently failing.
> > > 
> > > But I mean then wouldn't this already be a problem today?
> > > kiocb_modified() can error out with EAGAIN today:
> > > 
> > >            ret = inode_needs_update_time(inode, &now);
> > >            if (ret <= 0)
> > >                    return ret;
> > >            if (flags & IOCB_NOWAIT)
> > >                    return -EAGAIN;
> > > 
> > >            return __file_update_time(file, &now, ret);
> > > 
> > > the thing is that it doesn't matter for ->write_iter() - for xfs at
> > > least - because xfs does it as part of preparatory checks before
> > > actually doing any real work. The problem happens when you do actual
> > > work and afterwards call kiocb_modified(). That's why I think (2) is
> > > preferable.
> > 
> > This has nothing to do with what "XFS does". It's actually an
> > IOCB_NOWAIT API design constraint.
> > 
> > That is, IOCB_NOWAIT means "complete the whole operation without
> > blocking or return -EAGAIN having done nothing".  If we have to do
> > something that might block (like a timestamp update) then we need to
> > punt the entire operation before anything has been modified.  This
> > requires all the "do we need to modify this" checks to be done up
> > front before we start modifying anything.
> > 
> > So while it looks like this might be "an XFS thing", that's because
> > XFS tends to be the first filesystem that most io_uring NOWAIT
> > functionality is implemented on. IOWs, what you see is XFS is doing
> > things the way IOCB_NOWAIT requires to be done. i.e. it's a
> > demonstration of how nonblocking filesystem modification operations
> > need to be run, not an "XFS thing"...
> > 
> > > > > I would prefer 2) which seems cleaner to me. But I might miss why this
> > > > > won't work. So input needed/wanted.
> > > > 
> > > > Maybe I didn't fully grasp the (2) idea
> > > > 
> > > > 2.1: all read_iter, write_iter, etc. callbacks should do file_accessed()
> > > > before doing IO, which sounds like a good option if everyone agrees with
> > > > that. Taking a look at direct block io, it's already like this.
> > > 
> > > Yes, that's what I'm talking about. I'm asking whether that's ok for xfs
> > > maintainers basically. i_op->write_iter() already works like that since
> > > the dawn of time but i_op->read_iter doesn't and I'm proposing to make
> > > it work like that and wondering if there's any issues I'm unaware of.
> > 
> > XFS already calls file_accessed() in the DIO read path before the
> > read gets issued. I don't see any problem with lifting it to before
> 
> Hi Dave,
> 
> Here I've a question, in DIO read path, if we update the time but
> later somehow got errors before actual reading, e.g. return -EAGAIN
> from the xfs_ilock_iocb(), shouldn't we revert the time update since
> we actually doesn't read the file? We can lazily update the time but
> on the contrary a false update sounds weird to me.

You've read the file and you failed but you did access the file. We have
the same logic kinda for readdir in the sense that we call
file_accessed() no matter if ->iterate_shared() or ->iterate() succeeded
or not.

Plus, you should just be able to reorder the read checks to mirror the
write checks. Afaict, the order of the write checks are:

xfs_file_write_checks()
-> xfs_ilock_iocb()
-> kiocb_modified()

so we can just do:

+ xfs_file_read_checks()
  +-> xfs_ilock_iocb()
      +-> file_accessed()/kiocb_accessed()

for the read checks. Then we managed to acquire the necessary lock.
Christian Brauner July 31, 2023, 8:13 a.m. UTC | #15
On Mon, Jul 31, 2023 at 11:33:05AM +1000, Dave Chinner wrote:
> On Thu, Jul 27, 2023 at 04:27:30PM +0200, Christian Brauner wrote:
> > On Thu, Jul 27, 2023 at 07:51:19PM +0800, Hao Xu wrote:
> > > I actually saw this semaphore, and there is another xfs lock in
> > > file_accessed
> > >   --> touch_atime
> > >     --> inode_update_time
> > >       --> inode->i_op->update_time == xfs_vn_update_time
> > > 
> > > Forgot to point them out in the cover-letter..., I didn't modify them
> > > since I'm not very sure about if we should do so, and I saw Stefan's
> > > patchset didn't modify them too.
> > > 
> > > My personnal thinking is we should apply trylock logic for this
> > > inode->i_rwsem. For xfs lock in touch_atime, we should do that since it
> > > doesn't make sense to rollback all the stuff while we are almost at the
> > > end of getdents because of a lock.
> > 
> > That manoeuvres around the problem. Which I'm slightly more sensitive
> > too as this review is a rather expensive one.
> > 
> > Plus, it seems fixable in at least two ways:
> > 
> > For both we need to be able to tell the filesystem that a nowait atime
> > update is requested. Simple thing seems to me to add a S_NOWAIT flag to
> > file_time_flags and passing that via i_op->update_time() which already
> > has a flag argument. That would likely also help kiocb_modified().
> 
> Wait - didn't we already fix this for mtime updates on IOCB_NOWAIT
> modification operations? Yeah, we did:
> 
> kiocb_modified(iocb)
>   file_modified_flags(iocb->ki_file, iocb->ki_flags)
>     ....
>     ret = inode_needs_update_time()
>     if (ret <= 0)
> 	return ret;
>     if (flags & IOCB_NOWAIT)
> 	return -EAGAIN;
>     <does timestamp update>
> 
> > file_accessed()
> > -> touch_atime()
> >    -> inode_update_time()
> >       -> i_op->update_time == xfs_vn_update_time()
> 
> Yeah, so this needs the same treatment as file_modified_flags() -
> touch_atime() needs a flag variant that passes IOCB_NOWAIT, and
> after atime_needs_update() returns trues we should check IOCB_NOWAIT
> and return EAGAIN if it is set. That will punt the operation that
> needs to the update to a worker thread that can block....

As I tried to explain, I would prefer if we could inform the filesystem
through i_op->update_time() itself that this is async and give the
filesystem the ability to try and acquire the locks it needs and return
EAGAIN from i_op->update_time() itself if it can't acquire them.

> 
> > Then we have two options afaict:
> > 
> > (1) best-effort atime update
> > 
> > file_accessed() already has the builtin assumption that updating atime
> > might fail for other reasons - see the comment in there. So it is
> > somewhat best-effort already.
> > 
> > (2) move atime update before calling into filesystem
> > 
> > If we want to be sure that access time is updated when a readdir request
> > is issued through io_uring then we need to have file_accessed() give a
> > return value and expose a new helper for io_uring or modify
> > vfs_getdents() to do something like:
> > 
> > vfs_getdents()
> > {
> > 	if (nowait)
> > 		down_read_trylock()
> > 
> > 	if (!IS_DEADDIR(inode)) {
> > 		ret = file_accessed(file);
> > 		if (ret == -EAGAIN)
> > 			goto out_unlock;
> > 
> > 		f_op->iterate_shared()
> > 	}
> > }
> 
> Yup, that's the sort of thing that needs to be done.
> 
> But as I said in the "llseek for io-uring" thread, we need to stop
> the game of whack-a-mole passing random nowait boolean flags to VFS
> operations before it starts in earnest.  We really need a common
> context structure (like we have a kiocb for IO operations) that
> holds per operation control state so we have consistency across all
> the operations that we need different behaviours for.

Yes, I tend to agree and thought about the same. But right now we don't
have a lot of context. So I would lean towards a flag argument at most.

But I also wouldn't consider it necessarily wrong to start with booleans
or a flag first and in a couple of months if the need for more context
arises we know what kind of struct we want or need.
Christian Brauner July 31, 2023, 8:18 a.m. UTC | #16
On Mon, Jul 31, 2023 at 02:02:25AM +0800, Hao Xu wrote:
> Hi Christian,
> 
> On 7/27/23 22:27, Christian Brauner wrote:
> > On Thu, Jul 27, 2023 at 07:51:19PM +0800, Hao Xu wrote:
> > > On 7/26/23 23:00, Christian Brauner wrote:
> > > > On Tue, Jul 18, 2023 at 09:21:10PM +0800, Hao Xu wrote:
> > > > > From: Hao Xu <howeyxu@tencent.com>
> > > > > 
> > > > > This add support for getdents64 to io_uring, acting exactly like the
> > > > > syscall: the directory is iterated from it's current's position as
> > > > > stored in the file struct, and the file's position is updated exactly as
> > > > > if getdents64 had been called.
> > > > > 
> > > > > For filesystems that support NOWAIT in iterate_shared(), try to use it
> > > > > first; if a user already knows the filesystem they use do not support
> > > > > nowait they can force async through IOSQE_ASYNC in the sqe flags,
> > > > > avoiding the need to bounce back through a useless EAGAIN return.
> > > > > 
> > > > > Co-developed-by: Dominique Martinet <asmadeus@codewreck.org>
> > > > > Signed-off-by: Dominique Martinet <asmadeus@codewreck.org>
> > > > > Signed-off-by: Hao Xu <howeyxu@tencent.com>
> > > > > ---
> > > > >    include/uapi/linux/io_uring.h |  7 +++++
> > > > >    io_uring/fs.c                 | 55 +++++++++++++++++++++++++++++++++++
> > > > >    io_uring/fs.h                 |  3 ++
> > > > >    io_uring/opdef.c              |  8 +++++
> > > > >    4 files changed, 73 insertions(+)
> > > > > 
> > > > > diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> > > > > index 36f9c73082de..b200b2600622 100644
> > > > > --- a/include/uapi/linux/io_uring.h
> > > > > +++ b/include/uapi/linux/io_uring.h
> > > > > @@ -65,6 +65,7 @@ struct io_uring_sqe {
> > > > >    		__u32		xattr_flags;
> > > > >    		__u32		msg_ring_flags;
> > > > >    		__u32		uring_cmd_flags;
> > > > > +		__u32		getdents_flags;
> > > > >    	};
> > > > >    	__u64	user_data;	/* data to be passed back at completion time */
> > > > >    	/* pack this to avoid bogus arm OABI complaints */
> > > > > @@ -235,6 +236,7 @@ enum io_uring_op {
> > > > >    	IORING_OP_URING_CMD,
> > > > >    	IORING_OP_SEND_ZC,
> > > > >    	IORING_OP_SENDMSG_ZC,
> > > > > +	IORING_OP_GETDENTS,
> > > > >    	/* this goes last, obviously */
> > > > >    	IORING_OP_LAST,
> > > > > @@ -273,6 +275,11 @@ enum io_uring_op {
> > > > >     */
> > > > >    #define SPLICE_F_FD_IN_FIXED	(1U << 31) /* the last bit of __u32 */
> > > > > +/*
> > > > > + * sqe->getdents_flags
> > > > > + */
> > > > > +#define IORING_GETDENTS_REWIND	(1U << 0)
> > > > > +
> > > > >    /*
> > > > >     * POLL_ADD flags. Note that since sqe->poll_events is the flag space, the
> > > > >     * command flags for POLL_ADD are stored in sqe->len.
> > > > > diff --git a/io_uring/fs.c b/io_uring/fs.c
> > > > > index f6a69a549fd4..480f25677fed 100644
> > > > > --- a/io_uring/fs.c
> > > > > +++ b/io_uring/fs.c
> > > > > @@ -47,6 +47,13 @@ struct io_link {
> > > > >    	int				flags;
> > > > >    };
> > > > > +struct io_getdents {
> > > > > +	struct file			*file;
> > > > > +	struct linux_dirent64 __user	*dirent;
> > > > > +	unsigned int			count;
> > > > > +	int				flags;
> > > > > +};
> > > > > +
> > > > >    int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
> > > > >    {
> > > > >    	struct io_rename *ren = io_kiocb_to_cmd(req, struct io_rename);
> > > > > @@ -291,3 +298,51 @@ void io_link_cleanup(struct io_kiocb *req)
> > > > >    	putname(sl->oldpath);
> > > > >    	putname(sl->newpath);
> > > > >    }
> > > > > +
> > > > > +int io_getdents_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
> > > > > +{
> > > > > +	struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents);
> > > > > +
> > > > > +	if (READ_ONCE(sqe->off) != 0)
> > > > > +		return -EINVAL;
> > > > > +
> > > > > +	gd->dirent = u64_to_user_ptr(READ_ONCE(sqe->addr));
> > > > > +	gd->count = READ_ONCE(sqe->len);
> > > > > +
> > > > > +	return 0;
> > > > > +}
> > > > > +
> > > > > +int io_getdents(struct io_kiocb *req, unsigned int issue_flags)
> > > > > +{
> > > > > +	struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents);
> > > > > +	struct file *file = req->file;
> > > > > +	unsigned long getdents_flags = 0;
> > > > > +	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
> > > > 
> > > > Hm, I'm not sure what exactly the rules are for IO_URING_F_NONBLOCK.
> > > > But to point this out:
> > > > 
> > > > vfs_getdents()
> > > > -> iterate_dir()
> > > >      {
> > > >           if (shared)
> > > >                   res = down_read_killable(&inode->i_rwsem);
> > > >           else
> > > >                   res = down_write_killable(&inode->i_rwsem);
> > > >      }
> > > > 
> > > > which means you can still end up sleeping here before you go into a
> > > > filesystem that does actually support non-waiting getdents. So if you
> > > > have concurrent operations that grab inode lock (touch, mkdir etc) you
> > > > can end up sleeping here.
> > > > 
> > > > Is that intentional or an oversight? If the former can someone please
> > > > explain the rules and why it's fine in this case?
> > > 
> > > I actually saw this semaphore, and there is another xfs lock in
> > > file_accessed
> > >    --> touch_atime
> > >      --> inode_update_time
> > >        --> inode->i_op->update_time == xfs_vn_update_time
> > > 
> > > Forgot to point them out in the cover-letter..., I didn't modify them
> > > since I'm not very sure about if we should do so, and I saw Stefan's
> > > patchset didn't modify them too.
> > > 
> > > My personnal thinking is we should apply trylock logic for this
> > > inode->i_rwsem. For xfs lock in touch_atime, we should do that since it
> > > doesn't make sense to rollback all the stuff while we are almost at the
> > > end of getdents because of a lock.
> > 
> > That manoeuvres around the problem. Which I'm slightly more sensitive
> > too as this review is a rather expensive one.
> > 
> > Plus, it seems fixable in at least two ways:
> > 
> > For both we need to be able to tell the filesystem that a nowait atime
> > update is requested. Simple thing seems to me to add a S_NOWAIT flag to
> > file_time_flags and passing that via i_op->update_time() which already
> > has a flag argument. That would likely also help kiocb_modified().
> > 
> > file_accessed()
> > -> touch_atime()
> >     -> inode_update_time()
> >        -> i_op->update_time == xfs_vn_update_time()
> > 
> > Then we have two options afaict:
> > 
> > (1) best-effort atime update
> > 
> > file_accessed() already has the builtin assumption that updating atime
> > might fail for other reasons - see the comment in there. So it is
> > somewhat best-effort already.
> > 
> > (2) move atime update before calling into filesystem
> > 
> > If we want to be sure that access time is updated when a readdir request
> > is issued through io_uring then we need to have file_accessed() give a
> > return value and expose a new helper for io_uring or modify
> > vfs_getdents() to do something like:
> > 
> > vfs_getdents()
> > {
> > 	if (nowait)
> > 		down_read_trylock()
> > 
> > 	if (!IS_DEADDIR(inode)) {
> > 		ret = file_accessed(file);
> > 		if (ret == -EAGAIN)
> > 			goto out_unlock;
> > 
> > 		f_op->iterate_shared()
> > 	}
> > }
> > 
> > It's not unprecedented to do update atime before the actual operation
> > has been done afaict. That's already the case in xfs_file_write_checks()
> > which is called before anything is written. So that seems ok.
> 
> I'm not familiar with this part(the time update), I guess we should
> revert the updated time if we succeed to do file_accessed(file) but
> fail somewhere later in f_op->iterate_shared()? Or is it definitely
> counted as an "access" as long as we start to call getdents to a file?

To answer that you can simply take a look at readdir rn

res = -ENOENT;
if (!IS_DEADDIR(inode)) {
        ctx->pos = file->f_pos;
        if (shared)
                res = file->f_op->iterate_shared(file, ctx);
        else
                res = file->f_op->iterate(file, ctx);
        file->f_pos = ctx->pos;
        fsnotify_access(file);
        file_accessed(file);
}

Also, I've said this before: touch_atime() is currently best effort. It
may fail for any kind of reason.
Hao Xu July 31, 2023, 9:31 a.m. UTC | #17
On 7/31/23 16:18, Christian Brauner wrote:
> On Mon, Jul 31, 2023 at 02:02:25AM +0800, Hao Xu wrote:
>> Hi Christian,
>>
>> On 7/27/23 22:27, Christian Brauner wrote:
>>> On Thu, Jul 27, 2023 at 07:51:19PM +0800, Hao Xu wrote:
>>>> On 7/26/23 23:00, Christian Brauner wrote:
>>>>> On Tue, Jul 18, 2023 at 09:21:10PM +0800, Hao Xu wrote:
>>>>>> From: Hao Xu <howeyxu@tencent.com>
>>>>>>
>>>>>> This add support for getdents64 to io_uring, acting exactly like the
>>>>>> syscall: the directory is iterated from it's current's position as
>>>>>> stored in the file struct, and the file's position is updated exactly as
>>>>>> if getdents64 had been called.
>>>>>>
>>>>>> For filesystems that support NOWAIT in iterate_shared(), try to use it
>>>>>> first; if a user already knows the filesystem they use do not support
>>>>>> nowait they can force async through IOSQE_ASYNC in the sqe flags,
>>>>>> avoiding the need to bounce back through a useless EAGAIN return.
>>>>>>
>>>>>> Co-developed-by: Dominique Martinet <asmadeus@codewreck.org>
>>>>>> Signed-off-by: Dominique Martinet <asmadeus@codewreck.org>
>>>>>> Signed-off-by: Hao Xu <howeyxu@tencent.com>
>>>>>> ---
>>>>>>     include/uapi/linux/io_uring.h |  7 +++++
>>>>>>     io_uring/fs.c                 | 55 +++++++++++++++++++++++++++++++++++
>>>>>>     io_uring/fs.h                 |  3 ++
>>>>>>     io_uring/opdef.c              |  8 +++++
>>>>>>     4 files changed, 73 insertions(+)
>>>>>>
>>>>>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
>>>>>> index 36f9c73082de..b200b2600622 100644
>>>>>> --- a/include/uapi/linux/io_uring.h
>>>>>> +++ b/include/uapi/linux/io_uring.h
>>>>>> @@ -65,6 +65,7 @@ struct io_uring_sqe {
>>>>>>     		__u32		xattr_flags;
>>>>>>     		__u32		msg_ring_flags;
>>>>>>     		__u32		uring_cmd_flags;
>>>>>> +		__u32		getdents_flags;
>>>>>>     	};
>>>>>>     	__u64	user_data;	/* data to be passed back at completion time */
>>>>>>     	/* pack this to avoid bogus arm OABI complaints */
>>>>>> @@ -235,6 +236,7 @@ enum io_uring_op {
>>>>>>     	IORING_OP_URING_CMD,
>>>>>>     	IORING_OP_SEND_ZC,
>>>>>>     	IORING_OP_SENDMSG_ZC,
>>>>>> +	IORING_OP_GETDENTS,
>>>>>>     	/* this goes last, obviously */
>>>>>>     	IORING_OP_LAST,
>>>>>> @@ -273,6 +275,11 @@ enum io_uring_op {
>>>>>>      */
>>>>>>     #define SPLICE_F_FD_IN_FIXED	(1U << 31) /* the last bit of __u32 */
>>>>>> +/*
>>>>>> + * sqe->getdents_flags
>>>>>> + */
>>>>>> +#define IORING_GETDENTS_REWIND	(1U << 0)
>>>>>> +
>>>>>>     /*
>>>>>>      * POLL_ADD flags. Note that since sqe->poll_events is the flag space, the
>>>>>>      * command flags for POLL_ADD are stored in sqe->len.
>>>>>> diff --git a/io_uring/fs.c b/io_uring/fs.c
>>>>>> index f6a69a549fd4..480f25677fed 100644
>>>>>> --- a/io_uring/fs.c
>>>>>> +++ b/io_uring/fs.c
>>>>>> @@ -47,6 +47,13 @@ struct io_link {
>>>>>>     	int				flags;
>>>>>>     };
>>>>>> +struct io_getdents {
>>>>>> +	struct file			*file;
>>>>>> +	struct linux_dirent64 __user	*dirent;
>>>>>> +	unsigned int			count;
>>>>>> +	int				flags;
>>>>>> +};
>>>>>> +
>>>>>>     int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>>>>>     {
>>>>>>     	struct io_rename *ren = io_kiocb_to_cmd(req, struct io_rename);
>>>>>> @@ -291,3 +298,51 @@ void io_link_cleanup(struct io_kiocb *req)
>>>>>>     	putname(sl->oldpath);
>>>>>>     	putname(sl->newpath);
>>>>>>     }
>>>>>> +
>>>>>> +int io_getdents_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>>>>> +{
>>>>>> +	struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents);
>>>>>> +
>>>>>> +	if (READ_ONCE(sqe->off) != 0)
>>>>>> +		return -EINVAL;
>>>>>> +
>>>>>> +	gd->dirent = u64_to_user_ptr(READ_ONCE(sqe->addr));
>>>>>> +	gd->count = READ_ONCE(sqe->len);
>>>>>> +
>>>>>> +	return 0;
>>>>>> +}
>>>>>> +
>>>>>> +int io_getdents(struct io_kiocb *req, unsigned int issue_flags)
>>>>>> +{
>>>>>> +	struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents);
>>>>>> +	struct file *file = req->file;
>>>>>> +	unsigned long getdents_flags = 0;
>>>>>> +	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
>>>>> Hm, I'm not sure what exactly the rules are for IO_URING_F_NONBLOCK.
>>>>> But to point this out:
>>>>>
>>>>> vfs_getdents()
>>>>> -> iterate_dir()
>>>>>       {
>>>>>            if (shared)
>>>>>                    res = down_read_killable(&inode->i_rwsem);
>>>>>            else
>>>>>                    res = down_write_killable(&inode->i_rwsem);
>>>>>       }
>>>>>
>>>>> which means you can still end up sleeping here before you go into a
>>>>> filesystem that does actually support non-waiting getdents. So if you
>>>>> have concurrent operations that grab inode lock (touch, mkdir etc) you
>>>>> can end up sleeping here.
>>>>>
>>>>> Is that intentional or an oversight? If the former can someone please
>>>>> explain the rules and why it's fine in this case?
>>>> I actually saw this semaphore, and there is another xfs lock in
>>>> file_accessed
>>>>     --> touch_atime
>>>>       --> inode_update_time
>>>>         --> inode->i_op->update_time == xfs_vn_update_time
>>>>
>>>> Forgot to point them out in the cover-letter..., I didn't modify them
>>>> since I'm not very sure about if we should do so, and I saw Stefan's
>>>> patchset didn't modify them too.
>>>>
>>>> My personnal thinking is we should apply trylock logic for this
>>>> inode->i_rwsem. For xfs lock in touch_atime, we should do that since it
>>>> doesn't make sense to rollback all the stuff while we are almost at the
>>>> end of getdents because of a lock.
>>> That manoeuvres around the problem. Which I'm slightly more sensitive
>>> too as this review is a rather expensive one.
>>>
>>> Plus, it seems fixable in at least two ways:
>>>
>>> For both we need to be able to tell the filesystem that a nowait atime
>>> update is requested. Simple thing seems to me to add a S_NOWAIT flag to
>>> file_time_flags and passing that via i_op->update_time() which already
>>> has a flag argument. That would likely also help kiocb_modified().
>>>
>>> file_accessed()
>>> -> touch_atime()
>>>      -> inode_update_time()
>>>         -> i_op->update_time == xfs_vn_update_time()
>>>
>>> Then we have two options afaict:
>>>
>>> (1) best-effort atime update
>>>
>>> file_accessed() already has the builtin assumption that updating atime
>>> might fail for other reasons - see the comment in there. So it is
>>> somewhat best-effort already.
>>>
>>> (2) move atime update before calling into filesystem
>>>
>>> If we want to be sure that access time is updated when a readdir request
>>> is issued through io_uring then we need to have file_accessed() give a
>>> return value and expose a new helper for io_uring or modify
>>> vfs_getdents() to do something like:
>>>
>>> vfs_getdents()
>>> {
>>> 	if (nowait)
>>> 		down_read_trylock()
>>>
>>> 	if (!IS_DEADDIR(inode)) {
>>> 		ret = file_accessed(file);
>>> 		if (ret == -EAGAIN)
>>> 			goto out_unlock;
>>>
>>> 		f_op->iterate_shared()
>>> 	}
>>> }
>>>
>>> It's not unprecedented to do update atime before the actual operation
>>> has been done afaict. That's already the case in xfs_file_write_checks()
>>> which is called before anything is written. So that seems ok.
>> I'm not familiar with this part(the time update), I guess we should
>> revert the updated time if we succeed to do file_accessed(file) but
>> fail somewhere later in f_op->iterate_shared()? Or is it definitely
>> counted as an "access" as long as we start to call getdents to a file?
> To answer that you can simply take a look at readdir rn
>
> res = -ENOENT;
> if (!IS_DEADDIR(inode)) {
>          ctx->pos = file->f_pos;
>          if (shared)
>                  res = file->f_op->iterate_shared(file, ctx);
>          else
>                  res = file->f_op->iterate(file, ctx);
>          file->f_pos = ctx->pos;
>          fsnotify_access(file);
>          file_accessed(file);
> }
>
> Also, I've said this before: touch_atime() is currently best effort. It
> may fail for any kind of reason.


Gotcha, I checked the code and I think you are right, time updates

even the fop->iterate() quits at the very beginning. I'll move 
file_accessed(file)

to before f_op->the iterate(). Thanks.
Darrick J. Wong July 31, 2023, 3:26 p.m. UTC | #18
On Mon, Jul 31, 2023 at 10:13:21AM +0200, Christian Brauner wrote:
> On Mon, Jul 31, 2023 at 11:33:05AM +1000, Dave Chinner wrote:
> > On Thu, Jul 27, 2023 at 04:27:30PM +0200, Christian Brauner wrote:
> > > On Thu, Jul 27, 2023 at 07:51:19PM +0800, Hao Xu wrote:
> > > > I actually saw this semaphore, and there is another xfs lock in
> > > > file_accessed
> > > >   --> touch_atime
> > > >     --> inode_update_time
> > > >       --> inode->i_op->update_time == xfs_vn_update_time
> > > > 
> > > > Forgot to point them out in the cover-letter..., I didn't modify them
> > > > since I'm not very sure about if we should do so, and I saw Stefan's
> > > > patchset didn't modify them too.
> > > > 
> > > > My personnal thinking is we should apply trylock logic for this
> > > > inode->i_rwsem. For xfs lock in touch_atime, we should do that since it
> > > > doesn't make sense to rollback all the stuff while we are almost at the
> > > > end of getdents because of a lock.
> > > 
> > > That manoeuvres around the problem. Which I'm slightly more sensitive
> > > too as this review is a rather expensive one.
> > > 
> > > Plus, it seems fixable in at least two ways:
> > > 
> > > For both we need to be able to tell the filesystem that a nowait atime
> > > update is requested. Simple thing seems to me to add a S_NOWAIT flag to
> > > file_time_flags and passing that via i_op->update_time() which already
> > > has a flag argument. That would likely also help kiocb_modified().
> > 
> > Wait - didn't we already fix this for mtime updates on IOCB_NOWAIT
> > modification operations? Yeah, we did:
> > 
> > kiocb_modified(iocb)
> >   file_modified_flags(iocb->ki_file, iocb->ki_flags)
> >     ....
> >     ret = inode_needs_update_time()
> >     if (ret <= 0)
> > 	return ret;
> >     if (flags & IOCB_NOWAIT)
> > 	return -EAGAIN;
> >     <does timestamp update>
> > 
> > > file_accessed()
> > > -> touch_atime()
> > >    -> inode_update_time()
> > >       -> i_op->update_time == xfs_vn_update_time()
> > 
> > Yeah, so this needs the same treatment as file_modified_flags() -
> > touch_atime() needs a flag variant that passes IOCB_NOWAIT, and
> > after atime_needs_update() returns trues we should check IOCB_NOWAIT
> > and return EAGAIN if it is set. That will punt the operation that
> > needs to the update to a worker thread that can block....
> 
> As I tried to explain, I would prefer if we could inform the filesystem
> through i_op->update_time() itself that this is async and give the
> filesystem the ability to try and acquire the locks it needs and return
> EAGAIN from i_op->update_time() itself if it can't acquire them.
> 
> > 
> > > Then we have two options afaict:
> > > 
> > > (1) best-effort atime update
> > > 
> > > file_accessed() already has the builtin assumption that updating atime
> > > might fail for other reasons - see the comment in there. So it is
> > > somewhat best-effort already.
> > > 
> > > (2) move atime update before calling into filesystem
> > > 
> > > If we want to be sure that access time is updated when a readdir request
> > > is issued through io_uring then we need to have file_accessed() give a
> > > return value and expose a new helper for io_uring or modify
> > > vfs_getdents() to do something like:
> > > 
> > > vfs_getdents()
> > > {
> > > 	if (nowait)
> > > 		down_read_trylock()
> > > 
> > > 	if (!IS_DEADDIR(inode)) {
> > > 		ret = file_accessed(file);
> > > 		if (ret == -EAGAIN)
> > > 			goto out_unlock;
> > > 
> > > 		f_op->iterate_shared()
> > > 	}
> > > }
> > 
> > Yup, that's the sort of thing that needs to be done.
> > 
> > But as I said in the "llseek for io-uring" thread, we need to stop
> > the game of whack-a-mole passing random nowait boolean flags to VFS
> > operations before it starts in earnest.  We really need a common
> > context structure (like we have a kiocb for IO operations) that
> > holds per operation control state so we have consistency across all
> > the operations that we need different behaviours for.
> 
> Yes, I tend to agree and thought about the same. But right now we don't
> have a lot of context. So I would lean towards a flag argument at most.
> 
> But I also wouldn't consider it necessarily wrong to start with booleans
> or a flag first and in a couple of months if the need for more context
> arises we know what kind of struct we want or need.

I'm probably missing a ton of context (because at the end of the day I
don't care all that much about NOWAIT and still have never installed
liburing) but AFAICT the goal seems to be that for a given io request,
uring tries to execute it with trylocks in the originating process
context.  If that attempt fails, it'll punt the io to a workqueue and
rerun the request with blocking locks.  Right?

I've watched quite a bit of NOWAIT whackamole going on over the past few
years (i_rwsem, the ILOCK, the IO layer, memory allocations...).  IIRC
these filesystem ios all have to run in process context, right?  If so,
why don't we capture the NOWAIT state in a PF flag?  We already do that
for NOFS/NOIO memory allocations to make sure that /all/ reclaim
attempts cannot recurse into the fs/io stacks.

"I prefer EAGAIN errors to this process blocking" doesn't seem all that
much different.  But, what do I know...

--D
Dave Chinner July 31, 2023, 10:18 p.m. UTC | #19
On Mon, Jul 31, 2023 at 08:26:23AM -0700, Darrick J. Wong wrote:
> On Mon, Jul 31, 2023 at 10:13:21AM +0200, Christian Brauner wrote:
> > On Mon, Jul 31, 2023 at 11:33:05AM +1000, Dave Chinner wrote:
> > > On Thu, Jul 27, 2023 at 04:27:30PM +0200, Christian Brauner wrote:
> > > But as I said in the "llseek for io-uring" thread, we need to stop
> > > the game of whack-a-mole passing random nowait boolean flags to VFS
> > > operations before it starts in earnest.  We really need a common
> > > context structure (like we have a kiocb for IO operations) that
> > > holds per operation control state so we have consistency across all
> > > the operations that we need different behaviours for.
> > 
> > Yes, I tend to agree and thought about the same. But right now we don't
> > have a lot of context. So I would lean towards a flag argument at most.
> > 
> > But I also wouldn't consider it necessarily wrong to start with booleans
> > or a flag first and in a couple of months if the need for more context
> > arises we know what kind of struct we want or need.
> 
> I'm probably missing a ton of context (because at the end of the day I
> don't care all that much about NOWAIT and still have never installed
> liburing) but AFAICT the goal seems to be that for a given io request,
> uring tries to execute it with trylocks in the originating process
> context.  If that attempt fails, it'll punt the io to a workqueue and
> rerun the request with blocking locks.  Right?

Yes, that might be the case for the VFS level code we are talking
about right now....

... but, for example, I have no clue what task context
nvmet_file_execute_rw() runs in but it definitely issues file IO
with IOCB_NOWAIT...

> I've watched quite a bit of NOWAIT whackamole going on over the past few
> years (i_rwsem, the ILOCK, the IO layer, memory allocations...).  IIRC
> these filesystem ios all have to run in process context, right?  If so,
> why don't we capture the NOWAIT state in a PF flag?  We already do that
> for NOFS/NOIO memory allocations to make sure that /all/ reclaim
> attempts cannot recurse into the fs/io stacks.

Interesting idea.

That would mean the high level code would have to toggle the task
flags before calling into the VFS, which means we'd have to capture
RWF_NOWAIT flags at the syscall level rather than propagating them
into the iocb. That may impact speed racers, because the RWF flag
propagation has been a target of significant micro-optimisation
since io_uring came along....

> "I prefer EAGAIN errors to this process blocking" doesn't seem all that
> much different.  But, what do I know...

Yeah, I can see how that might be advantageous from an API
persepective, though my gut says "that's a can of worms" but I
haven't spent enough time thinking about it to work out why I feel
that way.

-Dave.
Jens Axboe Aug. 1, 2023, 12:28 a.m. UTC | #20
On 7/31/23 9:26?AM, Darrick J. Wong wrote:
> I've watched quite a bit of NOWAIT whackamole going on over the past few
> years (i_rwsem, the ILOCK, the IO layer, memory allocations...).  IIRC
> these filesystem ios all have to run in process context, right?  If so,
> why don't we capture the NOWAIT state in a PF flag?  We already do that
> for NOFS/NOIO memory allocations to make sure that /all/ reclaim
> attempts cannot recurse into the fs/io stacks.

I would greatly prefer passing down the context rather than capitulating
and adding a task_struct flag for this. I think it _kind of_ makes sense
for things like allocations, as you cannot easily track that all the way
down, but it's a really ugly solution. It certainly creates more churn
passing it down, but it also reveals the parts that need to check it.
WHen new code is added, it's much more likely you'll spot the fact that
there's passed in context. For allocation, you end up in the allocator
anyway, which can augment the gfp mask with whatever is set in the task.
The same is not true for locking and other bits, as they don't return a
value to begin with. When we know they are sane, we can flag the fs as
supporting it (like we've done for async buffered reads, for example).

It's also not an absolute thing, like memory allocations are. It's
perfectly fine to grab a mutex under NOWAIT issue. What you should not
do is grab a mutex that someone else can grab while waiting on IO. This
kind of extra context is only available in the code in question, not
generically for eg mutex locking.

I'm not a huge fan of the "let's add a bool nowait". Most/all use cases
pass down state anyway, putting it in a suitable type struct seems much
cleaner to me than the out-of-band approach of just adding a
current->please_nowait.
Matthew Wilcox Aug. 1, 2023, 12:47 a.m. UTC | #21
On Mon, Jul 31, 2023 at 06:28:02PM -0600, Jens Axboe wrote:
> It's also not an absolute thing, like memory allocations are. It's
> perfectly fine to grab a mutex under NOWAIT issue. What you should not
> do is grab a mutex that someone else can grab while waiting on IO. This
> kind of extra context is only available in the code in question, not
> generically for eg mutex locking.

Is that information documented somewhere?  I didn't know that was the
rule, and I wouldn't be surprised if that's news to several of the other
people on this thread.
Jens Axboe Aug. 1, 2023, 12:49 a.m. UTC | #22
On 7/31/23 6:47?PM, Matthew Wilcox wrote:
> On Mon, Jul 31, 2023 at 06:28:02PM -0600, Jens Axboe wrote:
>> It's also not an absolute thing, like memory allocations are. It's
>> perfectly fine to grab a mutex under NOWAIT issue. What you should not
>> do is grab a mutex that someone else can grab while waiting on IO. This
>> kind of extra context is only available in the code in question, not
>> generically for eg mutex locking.
> 
> Is that information documented somewhere?  I didn't know that was the
> rule, and I wouldn't be surprised if that's news to several of the other
> people on this thread.

I've mentioned it in various threads, but would probably be good to
write down somewhere if that actually makes more people see it. I'm
dubious, but since it applies to NOWAIT in general, would be a good
thing to do regardless. And then at least I could point to that rather
than have to write up a long description every time.
Matthew Wilcox Aug. 1, 2023, 1:01 a.m. UTC | #23
On Mon, Jul 31, 2023 at 06:49:45PM -0600, Jens Axboe wrote:
> On 7/31/23 6:47?PM, Matthew Wilcox wrote:
> > On Mon, Jul 31, 2023 at 06:28:02PM -0600, Jens Axboe wrote:
> >> It's also not an absolute thing, like memory allocations are. It's
> >> perfectly fine to grab a mutex under NOWAIT issue. What you should not
> >> do is grab a mutex that someone else can grab while waiting on IO. This
> >> kind of extra context is only available in the code in question, not
> >> generically for eg mutex locking.
> > 
> > Is that information documented somewhere?  I didn't know that was the
> > rule, and I wouldn't be surprised if that's news to several of the other
> > people on this thread.
> 
> I've mentioned it in various threads, but would probably be good to
> write down somewhere if that actually makes more people see it. I'm
> dubious, but since it applies to NOWAIT in general, would be a good
> thing to do regardless. And then at least I could point to that rather
> than have to write up a long description every time.

Would be good to include whether it's OK to wait on a mutex that's held
by another thread that's allocating memory without __GFP_NOFS (since
obviously that can block on I/O)
Christian Brauner Aug. 1, 2023, 6:59 a.m. UTC | #24
On Mon, Jul 31, 2023 at 06:49:45PM -0600, Jens Axboe wrote:
> On 7/31/23 6:47?PM, Matthew Wilcox wrote:
> > On Mon, Jul 31, 2023 at 06:28:02PM -0600, Jens Axboe wrote:
> >> It's also not an absolute thing, like memory allocations are. It's
> >> perfectly fine to grab a mutex under NOWAIT issue. What you should not
> >> do is grab a mutex that someone else can grab while waiting on IO. This
> >> kind of extra context is only available in the code in question, not
> >> generically for eg mutex locking.
> > 
> > Is that information documented somewhere?  I didn't know that was the
> > rule, and I wouldn't be surprised if that's news to several of the other
> > people on this thread.
> 
> I've mentioned it in various threads, but would probably be good to
> write down somewhere if that actually makes more people see it. I'm
> dubious, but since it applies to NOWAIT in general, would be a good
> thing to do regardless. And then at least I could point to that rather
> than have to write up a long description every time.

I've only been aware of this because we talked about it somewhere else.
So adding it as documentation would be great.
Christian Brauner Aug. 1, 2023, 7 a.m. UTC | #25
On Tue, Aug 01, 2023 at 02:01:59AM +0100, Matthew Wilcox wrote:
> On Mon, Jul 31, 2023 at 06:49:45PM -0600, Jens Axboe wrote:
> > On 7/31/23 6:47?PM, Matthew Wilcox wrote:
> > > On Mon, Jul 31, 2023 at 06:28:02PM -0600, Jens Axboe wrote:
> > >> It's also not an absolute thing, like memory allocations are. It's
> > >> perfectly fine to grab a mutex under NOWAIT issue. What you should not
> > >> do is grab a mutex that someone else can grab while waiting on IO. This
> > >> kind of extra context is only available in the code in question, not
> > >> generically for eg mutex locking.
> > > 
> > > Is that information documented somewhere?  I didn't know that was the
> > > rule, and I wouldn't be surprised if that's news to several of the other
> > > people on this thread.
> > 
> > I've mentioned it in various threads, but would probably be good to
> > write down somewhere if that actually makes more people see it. I'm
> > dubious, but since it applies to NOWAIT in general, would be a good
> > thing to do regardless. And then at least I could point to that rather
> > than have to write up a long description every time.
> 
> Would be good to include whether it's OK to wait on a mutex that's held
> by another thread that's allocating memory without __GFP_NOFS (since
> obviously that can block on I/O)

And adding a few illustrative examples would be helpful.
Christian Brauner Aug. 1, 2023, 7:17 a.m. UTC | #26
On Mon, Jul 31, 2023 at 06:28:02PM -0600, Jens Axboe wrote:
> On 7/31/23 9:26?AM, Darrick J. Wong wrote:
> > I've watched quite a bit of NOWAIT whackamole going on over the past few
> > years (i_rwsem, the ILOCK, the IO layer, memory allocations...).  IIRC
> > these filesystem ios all have to run in process context, right?  If so,
> > why don't we capture the NOWAIT state in a PF flag?  We already do that
> > for NOFS/NOIO memory allocations to make sure that /all/ reclaim
> > attempts cannot recurse into the fs/io stacks.
> 
> I would greatly prefer passing down the context rather than capitulating
> and adding a task_struct flag for this. I think it _kind of_ makes sense
> for things like allocations, as you cannot easily track that all the way
> down, but it's a really ugly solution. It certainly creates more churn
> passing it down, but it also reveals the parts that need to check it.
> WHen new code is added, it's much more likely you'll spot the fact that
> there's passed in context. For allocation, you end up in the allocator
> anyway, which can augment the gfp mask with whatever is set in the task.
> The same is not true for locking and other bits, as they don't return a
> value to begin with. When we know they are sane, we can flag the fs as
> supporting it (like we've done for async buffered reads, for example).
> 
> It's also not an absolute thing, like memory allocations are. It's
> perfectly fine to grab a mutex under NOWAIT issue. What you should not
> do is grab a mutex that someone else can grab while waiting on IO. This
> kind of extra context is only available in the code in question, not
> generically for eg mutex locking.
> 
> I'm not a huge fan of the "let's add a bool nowait". Most/all use cases
> pass down state anyway, putting it in a suitable type struct seems much

We're only going to pass a struct if there really is a need for one
though. Meaning, we're shouldn't end up passing a struct with a single
element around in the hopes that we'll add more members at some point. 

> cleaner to me than the out-of-band approach of just adding a
> current->please_nowait.

I'm not convinced that abusing current/task_struct for this is sane. I
not just very much doubt this will go down well with reviewers outside
of fs/ we'd also rightly be told that we're punting on a design problem
because it would be more churn.
Hao Xu Aug. 1, 2023, 6:39 p.m. UTC | #27
On 7/31/23 16:13, Christian Brauner wrote:
> On Mon, Jul 31, 2023 at 11:33:05AM +1000, Dave Chinner wrote:
>> On Thu, Jul 27, 2023 at 04:27:30PM +0200, Christian Brauner wrote:
>>> On Thu, Jul 27, 2023 at 07:51:19PM +0800, Hao Xu wrote:
>>>> I actually saw this semaphore, and there is another xfs lock in
>>>> file_accessed
>>>>    --> touch_atime
>>>>      --> inode_update_time
>>>>        --> inode->i_op->update_time == xfs_vn_update_time
>>>>
>>>> Forgot to point them out in the cover-letter..., I didn't modify them
>>>> since I'm not very sure about if we should do so, and I saw Stefan's
>>>> patchset didn't modify them too.
>>>>
>>>> My personnal thinking is we should apply trylock logic for this
>>>> inode->i_rwsem. For xfs lock in touch_atime, we should do that since it
>>>> doesn't make sense to rollback all the stuff while we are almost at the
>>>> end of getdents because of a lock.
>>>
>>> That manoeuvres around the problem. Which I'm slightly more sensitive
>>> too as this review is a rather expensive one.
>>>
>>> Plus, it seems fixable in at least two ways:
>>>
>>> For both we need to be able to tell the filesystem that a nowait atime
>>> update is requested. Simple thing seems to me to add a S_NOWAIT flag to
>>> file_time_flags and passing that via i_op->update_time() which already
>>> has a flag argument. That would likely also help kiocb_modified().
>>
>> Wait - didn't we already fix this for mtime updates on IOCB_NOWAIT
>> modification operations? Yeah, we did:
>>
>> kiocb_modified(iocb)
>>    file_modified_flags(iocb->ki_file, iocb->ki_flags)
>>      ....
>>      ret = inode_needs_update_time()
>>      if (ret <= 0)
>> 	return ret;
>>      if (flags & IOCB_NOWAIT)
>> 	return -EAGAIN;
>>      <does timestamp update>
>>
>>> file_accessed()
>>> -> touch_atime()
>>>     -> inode_update_time()
>>>        -> i_op->update_time == xfs_vn_update_time()
>>
>> Yeah, so this needs the same treatment as file_modified_flags() -
>> touch_atime() needs a flag variant that passes IOCB_NOWAIT, and
>> after atime_needs_update() returns trues we should check IOCB_NOWAIT
>> and return EAGAIN if it is set. That will punt the operation that
>> needs to the update to a worker thread that can block....
> 
> As I tried to explain, I would prefer if we could inform the filesystem
> through i_op->update_time() itself that this is async and give the
> filesystem the ability to try and acquire the locks it needs and return
> EAGAIN from i_op->update_time() itself if it can't acquire them.

I browse code in i_op->update_time = xfs_vn_update_time, it's mainly
about xfs journal code. It creates a transaction and adds a item to
it, not familiar with this part, from a quick look I found some
locks and sleep point in it to modify. I think I need some time to sort
out this part. Or maybe we can do it like what Dave said as a short term
solution and change the block points in journal code later as a separate
patchset, those journal code I believe are common code for xfs IO
operations. I'm ok with both though.

> 
>>
>>> Then we have two options afaict:
>>>
>>> (1) best-effort atime update
>>>
>>> file_accessed() already has the builtin assumption that updating atime
>>> might fail for other reasons - see the comment in there. So it is
>>> somewhat best-effort already.
>>>
>>> (2) move atime update before calling into filesystem
>>>
>>> If we want to be sure that access time is updated when a readdir request
>>> is issued through io_uring then we need to have file_accessed() give a
>>> return value and expose a new helper for io_uring or modify
>>> vfs_getdents() to do something like:
>>>
>>> vfs_getdents()
>>> {
>>> 	if (nowait)
>>> 		down_read_trylock()
>>>
>>> 	if (!IS_DEADDIR(inode)) {
>>> 		ret = file_accessed(file);
>>> 		if (ret == -EAGAIN)
>>> 			goto out_unlock;
>>>
>>> 		f_op->iterate_shared()
>>> 	}
>>> }
>>
>> Yup, that's the sort of thing that needs to be done.
>>
>> But as I said in the "llseek for io-uring" thread, we need to stop
>> the game of whack-a-mole passing random nowait boolean flags to VFS
>> operations before it starts in earnest.  We really need a common
>> context structure (like we have a kiocb for IO operations) that
>> holds per operation control state so we have consistency across all
>> the operations that we need different behaviours for.
> 
> Yes, I tend to agree and thought about the same. But right now we don't
> have a lot of context. So I would lean towards a flag argument at most.
> 
> But I also wouldn't consider it necessarily wrong to start with booleans
> or a flag first and in a couple of months if the need for more context
> arises we know what kind of struct we want or need.
Hao Xu Aug. 8, 2023, 4:34 a.m. UTC | #28
On 8/1/23 08:28, Jens Axboe wrote:
> On 7/31/23 9:26?AM, Darrick J. Wong wrote:
>> I've watched quite a bit of NOWAIT whackamole going on over the past few
>> years (i_rwsem, the ILOCK, the IO layer, memory allocations...).  IIRC
>> these filesystem ios all have to run in process context, right?  If so,
>> why don't we capture the NOWAIT state in a PF flag?  We already do that
>> for NOFS/NOIO memory allocations to make sure that /all/ reclaim
>> attempts cannot recurse into the fs/io stacks.
> 
> I would greatly prefer passing down the context rather than capitulating
> and adding a task_struct flag for this. I think it _kind of_ makes sense
> for things like allocations, as you cannot easily track that all the way
> down, but it's a really ugly solution. It certainly creates more churn
> passing it down, but it also reveals the parts that need to check it.
> WHen new code is added, it's much more likely you'll spot the fact that
> there's passed in context. For allocation, you end up in the allocator
> anyway, which can augment the gfp mask with whatever is set in the task.
> The same is not true for locking and other bits, as they don't return a
> value to begin with. When we know they are sane, we can flag the fs as
> supporting it (like we've done for async buffered reads, for example).
> 
> It's also not an absolute thing, like memory allocations are. It's
> perfectly fine to grab a mutex under NOWAIT issue. What you should not

Hi Jens,
To make sure, I'd like to ask, for memory allocation, GFP_NOIO semantics
is all we need in NOWAIT issue, GFP_NOWAIT is not necessary, do I
understand it right?

Thanks,
Hao

> do is grab a mutex that someone else can grab while waiting on IO. This
> kind of extra context is only available in the code in question, not
> generically for eg mutex locking.
> 
> I'm not a huge fan of the "let's add a bool nowait". Most/all use cases
> pass down state anyway, putting it in a suitable type struct seems much
> cleaner to me than the out-of-band approach of just adding a
> current->please_nowait.
>
Hao Xu Aug. 8, 2023, 5:18 a.m. UTC | #29
On 8/8/23 12:34, Hao Xu wrote:
> On 8/1/23 08:28, Jens Axboe wrote:
>> On 7/31/23 9:26?AM, Darrick J. Wong wrote:
>>> I've watched quite a bit of NOWAIT whackamole going on over the past 
>>> few
>>> years (i_rwsem, the ILOCK, the IO layer, memory allocations...).  IIRC
>>> these filesystem ios all have to run in process context, right?  If so,
>>> why don't we capture the NOWAIT state in a PF flag?  We already do that
>>> for NOFS/NOIO memory allocations to make sure that /all/ reclaim
>>> attempts cannot recurse into the fs/io stacks.
>>
>> I would greatly prefer passing down the context rather than capitulating
>> and adding a task_struct flag for this. I think it _kind of_ makes sense
>> for things like allocations, as you cannot easily track that all the way
>> down, but it's a really ugly solution. It certainly creates more churn
>> passing it down, but it also reveals the parts that need to check it.
>> WHen new code is added, it's much more likely you'll spot the fact that
>> there's passed in context. For allocation, you end up in the allocator
>> anyway, which can augment the gfp mask with whatever is set in the task.
>> The same is not true for locking and other bits, as they don't return a
>> value to begin with. When we know they are sane, we can flag the fs as
>> supporting it (like we've done for async buffered reads, for example).
>>
>> It's also not an absolute thing, like memory allocations are. It's
>> perfectly fine to grab a mutex under NOWAIT issue. What you should not
>
> Hi Jens,
> To make sure, I'd like to ask, for memory allocation, GFP_NOIO semantics
> is all we need in NOWAIT issue, GFP_NOWAIT is not necessary, do I
> understand it right?
>
> Thanks,
> Hao


Trying to find a lock in mem allocation process that GFP_NOIO holds it while

other normal GFP_* like GFP_KERNEL also holds it and does IO.

Failed to find one such lock.  So I guess though GFP_NOIO may cause sleep

but won't wait on IO.
Hao Xu Aug. 8, 2023, 9:33 a.m. UTC | #30
On 8/1/23 08:28, Jens Axboe wrote:
> On 7/31/23 9:26?AM, Darrick J. Wong wrote:
>> I've watched quite a bit of NOWAIT whackamole going on over the past few
>> years (i_rwsem, the ILOCK, the IO layer, memory allocations...). IIRC
>> these filesystem ios all have to run in process context, right? If so,
>> why don't we capture the NOWAIT state in a PF flag? We already do that
>> for NOFS/NOIO memory allocations to make sure that /all/ reclaim
>> attempts cannot recurse into the fs/io stacks.
>
> I would greatly prefer passing down the context rather than capitulating
> and adding a task_struct flag for this. I think it _kind of_ makes sense
> for things like allocations, as you cannot easily track that all the way
> down, but it's a really ugly solution. It certainly creates more churn
> passing it down, but it also reveals the parts that need to check it.
> WHen new code is added, it's much more likely you'll spot the fact that
> there's passed in context. For allocation, you end up in the allocator
> anyway, which can augment the gfp mask with whatever is set in the task.
> The same is not true for locking and other bits, as they don't return a
> value to begin with. When we know they are sane, we can flag the fs as
> supporting it (like we've done for async buffered reads, for example).
>
> It's also not an absolute thing, like memory allocations are. It's
> perfectly fine to grab a mutex under NOWAIT issue. What you should not

Hi Jens,
To make sure, I'd like to ask, for memory allocation, GFP_NOIO semantics
is all we need in NOWAIT issue, GFP_NOWAIT is not necessary, do I
understand it right?

Thanks,
Hao

> do is grab a mutex that someone else can grab while waiting on IO. This
> kind of extra context is only available in the code in question, not
> generically for eg mutex locking.
>
> I'm not a huge fan of the "let's add a bool nowait". Most/all use cases
> pass down state anyway, putting it in a suitable type struct seems much
> cleaner to me than the out-of-band approach of just adding a
> current->please_nowait.
>
Jens Axboe Aug. 8, 2023, 10:55 p.m. UTC | #31
On 8/8/23 3:33?AM, Hao Xu wrote:
> On 8/1/23 08:28, Jens Axboe wrote:
>> On 7/31/23 9:26?AM, Darrick J. Wong wrote:
>>> I've watched quite a bit of NOWAIT whackamole going on over the past few
>>> years (i_rwsem, the ILOCK, the IO layer, memory allocations...). IIRC
>>> these filesystem ios all have to run in process context, right? If so,
>>> why don't we capture the NOWAIT state in a PF flag? We already do that
>>> for NOFS/NOIO memory allocations to make sure that /all/ reclaim
>>> attempts cannot recurse into the fs/io stacks.
>>
>> I would greatly prefer passing down the context rather than capitulating
>> and adding a task_struct flag for this. I think it _kind of_ makes sense
>> for things like allocations, as you cannot easily track that all the way
>> down, but it's a really ugly solution. It certainly creates more churn
>> passing it down, but it also reveals the parts that need to check it.
>> WHen new code is added, it's much more likely you'll spot the fact that
>> there's passed in context. For allocation, you end up in the allocator
>> anyway, which can augment the gfp mask with whatever is set in the task.
>> The same is not true for locking and other bits, as they don't return a
>> value to begin with. When we know they are sane, we can flag the fs as
>> supporting it (like we've done for async buffered reads, for example).
>>
>> It's also not an absolute thing, like memory allocations are. It's
>> perfectly fine to grab a mutex under NOWAIT issue. What you should not
> 
> Hi Jens,
> To make sure, I'd like to ask, for memory allocation, GFP_NOIO semantics
> is all we need in NOWAIT issue, GFP_NOWAIT is not necessary, do I
> understand it right?

Yep, GFP_NOIO should be just fine.
diff mbox series

Patch

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 36f9c73082de..b200b2600622 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -65,6 +65,7 @@  struct io_uring_sqe {
 		__u32		xattr_flags;
 		__u32		msg_ring_flags;
 		__u32		uring_cmd_flags;
+		__u32		getdents_flags;
 	};
 	__u64	user_data;	/* data to be passed back at completion time */
 	/* pack this to avoid bogus arm OABI complaints */
@@ -235,6 +236,7 @@  enum io_uring_op {
 	IORING_OP_URING_CMD,
 	IORING_OP_SEND_ZC,
 	IORING_OP_SENDMSG_ZC,
+	IORING_OP_GETDENTS,
 
 	/* this goes last, obviously */
 	IORING_OP_LAST,
@@ -273,6 +275,11 @@  enum io_uring_op {
  */
 #define SPLICE_F_FD_IN_FIXED	(1U << 31) /* the last bit of __u32 */
 
+/*
+ * sqe->getdents_flags
+ */
+#define IORING_GETDENTS_REWIND	(1U << 0)
+
 /*
  * POLL_ADD flags. Note that since sqe->poll_events is the flag space, the
  * command flags for POLL_ADD are stored in sqe->len.
diff --git a/io_uring/fs.c b/io_uring/fs.c
index f6a69a549fd4..480f25677fed 100644
--- a/io_uring/fs.c
+++ b/io_uring/fs.c
@@ -47,6 +47,13 @@  struct io_link {
 	int				flags;
 };
 
+struct io_getdents {
+	struct file			*file;
+	struct linux_dirent64 __user	*dirent;
+	unsigned int			count;
+	int				flags;
+};
+
 int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_rename *ren = io_kiocb_to_cmd(req, struct io_rename);
@@ -291,3 +298,51 @@  void io_link_cleanup(struct io_kiocb *req)
 	putname(sl->oldpath);
 	putname(sl->newpath);
 }
+
+int io_getdents_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents);
+
+	if (READ_ONCE(sqe->off) != 0)
+		return -EINVAL;
+
+	gd->dirent = u64_to_user_ptr(READ_ONCE(sqe->addr));
+	gd->count = READ_ONCE(sqe->len);
+
+	return 0;
+}
+
+int io_getdents(struct io_kiocb *req, unsigned int issue_flags)
+{
+	struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents);
+	struct file *file = req->file;
+	unsigned long getdents_flags = 0;
+	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+	bool should_lock = file->f_mode & FMODE_ATOMIC_POS;
+	int ret;
+
+	if (force_nonblock) {
+		if (!(req->file->f_mode & FMODE_NOWAIT))
+			return -EAGAIN;
+
+		getdents_flags = DIR_CONTEXT_F_NOWAIT;
+	}
+
+	if (should_lock) {
+		if (!force_nonblock)
+			mutex_lock(&file->f_pos_lock);
+		else if (!mutex_trylock(&file->f_pos_lock))
+			return -EAGAIN;
+	}
+
+	ret = vfs_getdents(file, gd->dirent, gd->count, getdents_flags);
+	if (should_lock)
+		mutex_unlock(&file->f_pos_lock);
+
+	if (ret == -EAGAIN && force_nonblock)
+		return -EAGAIN;
+
+	io_req_set_res(req, ret, 0);
+	return 0;
+}
+
diff --git a/io_uring/fs.h b/io_uring/fs.h
index 0bb5efe3d6bb..f83a6f3a678d 100644
--- a/io_uring/fs.h
+++ b/io_uring/fs.h
@@ -18,3 +18,6 @@  int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags);
 int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
 int io_linkat(struct io_kiocb *req, unsigned int issue_flags);
 void io_link_cleanup(struct io_kiocb *req);
+
+int io_getdents_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_getdents(struct io_kiocb *req, unsigned int issue_flags);
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 3b9c6489b8b6..1bae6b2a8d0b 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -428,6 +428,11 @@  const struct io_issue_def io_issue_defs[] = {
 		.prep			= io_eopnotsupp_prep,
 #endif
 	},
+	[IORING_OP_GETDENTS] = {
+		.needs_file		= 1,
+		.prep			= io_getdents_prep,
+		.issue			= io_getdents,
+	},
 };
 
 
@@ -648,6 +653,9 @@  const struct io_cold_def io_cold_defs[] = {
 		.fail			= io_sendrecv_fail,
 #endif
 	},
+	[IORING_OP_GETDENTS] = {
+		.name			= "GETDENTS",
+	},
 };
 
 const char *io_uring_get_opcode(u8 opcode)