diff mbox series

[v2,6/6] RFC: io_uring getdents: test returning an EOF flag in CQE

Message ID 20230422-uring-getdents-v2-6-2db1e37dc55e@codewreck.org (mailing list archive)
State New
Headers show
Series io_uring: add getdents support, take 2 | expand

Commit Message

Dominique Martinet May 10, 2023, 10:52 a.m. UTC
This turns out to be very slightly faster than an extra call to
getdents, but in practice it doesn't seem to be such an improvement as
the trailing getdents will return almost immediately be absorbed by the
scheduling noise in a find-like context (my ""server"" is too noisy to
get proper benchmarks out, but results look slightly better with this in
async mode, and almost identical in the NOWAIT path)

If the user is waiting the end of a single directory though it might be
worth it, so including the patch for comments.
(in particular I'm not really happy that the flag has become in-out for
vfs_getdents, especially when the getdents64 syscall does not use it,
but I don't see much other way around it)

If this approach is acceptable/wanted then this patch will be split down
further (at least dir_context/vfs_getdents, kernfs, libfs, uring in four
separate commits)

Signed-off-by: Dominique Martinet <asmadeus@codewreck.org>
---
 fs/internal.h                 |  2 +-
 fs/kernfs/dir.c               |  1 +
 fs/libfs.c                    |  9 ++++++---
 fs/readdir.c                  | 10 ++++++----
 include/linux/fs.h            |  2 ++
 include/uapi/linux/io_uring.h |  2 ++
 io_uring/fs.c                 |  8 ++++++--
 7 files changed, 24 insertions(+), 10 deletions(-)

Comments

Christian Brauner May 23, 2023, 2:30 p.m. UTC | #1
On Wed, May 10, 2023 at 07:52:54PM +0900, Dominique Martinet wrote:
> This turns out to be very slightly faster than an extra call to
> getdents, but in practice it doesn't seem to be such an improvement as
> the trailing getdents will return almost immediately be absorbed by the
> scheduling noise in a find-like context (my ""server"" is too noisy to
> get proper benchmarks out, but results look slightly better with this in
> async mode, and almost identical in the NOWAIT path)
> 
> If the user is waiting the end of a single directory though it might be
> worth it, so including the patch for comments.
> (in particular I'm not really happy that the flag has become in-out for
> vfs_getdents, especially when the getdents64 syscall does not use it,
> but I don't see much other way around it)
> 
> If this approach is acceptable/wanted then this patch will be split down
> further (at least dir_context/vfs_getdents, kernfs, libfs, uring in four
> separate commits)
> 
> Signed-off-by: Dominique Martinet <asmadeus@codewreck.org>
> ---
>  fs/internal.h                 |  2 +-
>  fs/kernfs/dir.c               |  1 +
>  fs/libfs.c                    |  9 ++++++---
>  fs/readdir.c                  | 10 ++++++----
>  include/linux/fs.h            |  2 ++
>  include/uapi/linux/io_uring.h |  2 ++
>  io_uring/fs.c                 |  8 ++++++--
>  7 files changed, 24 insertions(+), 10 deletions(-)
> 
> diff --git a/fs/internal.h b/fs/internal.h
> index 0264b001d99a..0b1552c7a870 100644
> --- a/fs/internal.h
> +++ b/fs/internal.h
> @@ -267,4 +267,4 @@ void mnt_idmap_put(struct mnt_idmap *idmap);
>  struct linux_dirent64;
>  
>  int vfs_getdents(struct file *file, struct linux_dirent64 __user *dirent,
> -		 unsigned int count, unsigned long flags);
> +		 unsigned int count, unsigned long *flags);
> diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
> index 5a5b3e7881bf..53a6b4804c34 100644
> --- a/fs/kernfs/dir.c
> +++ b/fs/kernfs/dir.c
> @@ -1860,6 +1860,7 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx)
>  	up_read(&root->kernfs_rwsem);
>  	file->private_data = NULL;
>  	ctx->pos = INT_MAX;
> +	ctx->flags |= DIR_CONTEXT_F_EOD;
>  	return 0;
>  }
>  
> diff --git a/fs/libfs.c b/fs/libfs.c
> index a3c7e42d90a7..b2a95dadffbd 100644
> --- a/fs/libfs.c
> +++ b/fs/libfs.c
> @@ -208,10 +208,12 @@ int dcache_readdir(struct file *file, struct dir_context *ctx)
>  		p = &next->d_child;
>  	}
>  	spin_lock(&dentry->d_lock);
> -	if (next)
> +	if (next) {
>  		list_move_tail(&cursor->d_child, &next->d_child);
> -	else
> +	} else {
>  		list_del_init(&cursor->d_child);
> +		ctx->flags |= DIR_CONTEXT_F_EOD;
> +	}
>  	spin_unlock(&dentry->d_lock);
>  	dput(next);
>  
> @@ -1347,7 +1349,8 @@ static loff_t empty_dir_llseek(struct file *file, loff_t offset, int whence)
>  
>  static int empty_dir_readdir(struct file *file, struct dir_context *ctx)
>  {
> -	dir_emit_dots(file, ctx);
> +	if (dir_emit_dots(file, ctx))
> +		ctx->flags |= DIR_CONTEXT_F_EOD;
>  	return 0;
>  }
>  
> diff --git a/fs/readdir.c b/fs/readdir.c
> index 1311b89d75e1..be75a2154b4f 100644
> --- a/fs/readdir.c
> +++ b/fs/readdir.c
> @@ -358,14 +358,14 @@ static bool filldir64(struct dir_context *ctx, const char *name, int namlen,
>   * @file    : pointer to file struct of directory
>   * @dirent  : pointer to user directory structure
>   * @count   : size of buffer
> - * @flags   : additional dir_context flags
> + * @flags   : pointer to additional dir_context flags
>   */
>  int vfs_getdents(struct file *file, struct linux_dirent64 __user *dirent,
> -		 unsigned int count, unsigned long flags)
> +		 unsigned int count, unsigned long *flags)
>  {
>  	struct getdents_callback64 buf = {
>  		.ctx.actor = filldir64,
> -		.ctx.flags = flags,
> +		.ctx.flags = flags ? *flags : 0,
>  		.count = count,
>  		.current_dir = dirent
>  	};
> @@ -384,6 +384,8 @@ int vfs_getdents(struct file *file, struct linux_dirent64 __user *dirent,
>  		else
>  			error = count - buf.count;
>  	}
> +	if (flags)
> +		*flags = buf.ctx.flags;
>  	return error;
>  }
>  
> @@ -397,7 +399,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
>  	if (!f.file)
>  		return -EBADF;
>  
> -	error = vfs_getdents(f.file, dirent, count, 0);
> +	error = vfs_getdents(f.file, dirent, count, NULL);
>  
>  	fdput_pos(f);
>  	return error;
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index f7de2b5ca38e..d1e31bccfb4f 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1723,8 +1723,10 @@ struct dir_context {
>   * flags for dir_context flags
>   * DIR_CONTEXT_F_NOWAIT: Request non-blocking iterate
>   *                       (requires file->f_mode & FMODE_NOWAIT)
> + * DIR_CONTEXT_F_EOD: Signal directory has been fully iterated, set by the fs
>   */
>  #define DIR_CONTEXT_F_NOWAIT	0x1
> +#define DIR_CONTEXT_F_EOD	0x2
>  
>  /*
>   * These flags let !MMU mmap() govern direct device mapping vs immediate
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index 35d0de18d893..35877132027e 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -381,11 +381,13 @@ struct io_uring_cqe {
>   * IORING_CQE_F_SOCK_NONEMPTY	If set, more data to read after socket recv
>   * IORING_CQE_F_NOTIF	Set for notification CQEs. Can be used to distinct
>   * 			them from sends.
> + * IORING_CQE_F_EOF     If set, file or directory has reached end of file.
>   */
>  #define IORING_CQE_F_BUFFER		(1U << 0)
>  #define IORING_CQE_F_MORE		(1U << 1)
>  #define IORING_CQE_F_SOCK_NONEMPTY	(1U << 2)
>  #define IORING_CQE_F_NOTIF		(1U << 3)
> +#define IORING_CQE_F_EOF		(1U << 4)
>  
>  enum {
>  	IORING_CQE_BUFFER_SHIFT		= 16,
> diff --git a/io_uring/fs.c b/io_uring/fs.c
> index b15ec81c1ed2..f6222b0148ef 100644
> --- a/io_uring/fs.c
> +++ b/io_uring/fs.c
> @@ -322,6 +322,7 @@ int io_getdents(struct io_kiocb *req, unsigned int issue_flags)
>  {
>  	struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents);
>  	unsigned long getdents_flags = 0;
> +	u32 cqe_flags = 0;
>  	int ret;
>  
>  	if (issue_flags & IO_URING_F_NONBLOCK) {
> @@ -338,13 +339,16 @@ int io_getdents(struct io_kiocb *req, unsigned int issue_flags)
>  			goto out;
>  	}
>  
> -	ret = vfs_getdents(req->file, gd->dirent, gd->count, getdents_flags);
> +	ret = vfs_getdents(req->file, gd->dirent, gd->count, &getdents_flags);

I don't understand how synchronization and updating of f_pos works here.
For example, what happens if a concurrent seek happens on the fd while
io_uring is using vfs_getdents which calls into iterate_dir() and
updates f_pos?
Dominique Martinet May 23, 2023, 9:05 p.m. UTC | #2
Christian Brauner wrote on Tue, May 23, 2023 at 04:30:14PM +0200:
> > index b15ec81c1ed2..f6222b0148ef 100644
> > --- a/io_uring/fs.c
> > +++ b/io_uring/fs.c
> > @@ -322,6 +322,7 @@ int io_getdents(struct io_kiocb *req, unsigned int issue_flags)
> >  {
> >  	struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents);
> >  	unsigned long getdents_flags = 0;
> > +	u32 cqe_flags = 0;
> >  	int ret;
> >  
> >  	if (issue_flags & IO_URING_F_NONBLOCK) {
> > @@ -338,13 +339,16 @@ int io_getdents(struct io_kiocb *req, unsigned int issue_flags)
> >  			goto out;
> >  	}
> >  
> > -	ret = vfs_getdents(req->file, gd->dirent, gd->count, getdents_flags);
> > +	ret = vfs_getdents(req->file, gd->dirent, gd->count, &getdents_flags);
> 
> I don't understand how synchronization and updating of f_pos works here.
> For example, what happens if a concurrent seek happens on the fd while
> io_uring is using vfs_getdents which calls into iterate_dir() and
> updates f_pos?

I don't see how different that is from a user spawning two threads and
calling getdents64 + lseek or two getdents64 in parallel?
(or any two other users of iterate_dir)

As far as I understand you'll either get the old or new pos as
obtained/updated by iterate_dir()?

That iterate_dir probably ought to be using READ_ONCE/WRITE_ONCE or some
atomic read/update wrappers as the shared case only has a read lock
around these, but that's not a new problem; and for all I care
about I'm happy to let users shoot themselves in the foot.
(although I guess that with filesystems not validating the offset as
was pointed out in a previous version comment having non-atomic update
might be a security issue at some point on architectures that don't
guarantee atomic 64bit updates, but if someone manages to abuse it
it's already possible to abuse it with the good old syscalls, so I'd
rather leave that up to someone who understand how atomicity in the
kernel works better than me...)
Dominique Martinet May 24, 2023, 9:36 p.m. UTC | #3
Christian Brauner wrote on Wed, May 24, 2023 at 03:52:45PM +0200:
> The main objection in [1] was to allow specifying an arbitrary offset
> from userspace. What [3] did was to implement a pread() variant for
> directories, i.e., pgetdents(). That can't work in principle/is
> prohibitively complex. Which is what your series avoids by not allowing
> any offsets to be specified.

Yes.

> However, there's still a problem here. Updates to f_pos happen under an
> approriate lock to guarantee consistency of the position between calls
> that move the cursor position. In the normal read-write path io_uring
> doesn't concern itself with f_pos as it keeps local state in
> kiocb->ki_pos.
> 
> But then it still does end up running into f_pos consistency problems
> for read-write because it does allow operating on the current f_pos if
> the offset if struct io_rw is set to -1.
> 
> In that case it does retrieve and update f_pos which should take
> f_pos_lock and a patchset for this was posted but it didn't go anywhere.
> It should probably hold that lock. See Jann's comments in the other
> thread how that currently can lead to issues.

Assuming that is this mail:
https://lore.kernel.org/io-uring/CAG48ez1O9VxSuWuLXBjke23YxUA8EhMP+6RCHo5PNQBf3B0pDQ@mail.gmail.com/

So, ok, I didn't realize fdget_pos() actually acted as a lock on the
file's f_pos_lock (apparently only if FMODE_ATMOIC_POS is set? but it
looks set on most if not all directories through finish_open(), that
looks called consistently enough)

What was confusing is that default_llseek updates f_pos under the
inode_lock (write), and getdents also takes that lock (for read only in
shared implem), so I assumed getdents also was just protected by this
read lock, but I guess that was a bad assumption (as I kept pointing
out, a shared read lock isn't good enough, we definitely agree there)


In practice, in the non-registered file case io_uring is also calling
fdget, so the lock is held exactly the same as the syscall and I wasn't
so far off -- but we need to figure something for the registered file
case.
openat variants don't allow working with the registered variant at all
on the parent fd, so as far as I'm concerned I'd be happy setting the
same limitation for getdents: just say it acts on fd and not files, and
call it a day...
It'd also be possible to check if REQ_F_FIXED_FILE flag was set and
manually take the lock somehow but we don't have any primitive that
takes f_pos_lock from a file (the only place that takes it is fdget
which requires a fd), so I'd rather not add such a new exception.
I assume the other patch you mentioned about adding that lock was this
one:
https://lore.kernel.org/all/20220222105504.3331010-1-dylany@fb.com/T/#m3609dc8057d0bc8e41ceab643e4d630f7b91bde6
and it just atkes the lock, but __fdget_pos also checks for
FMODE_ATOMIC_OPS and file_count and I'm not sure I understand how it
sets (f.flags & FDPUT_POS_UNLOCK) (for fdput_pos) so I'd rather not add
such a code path at this point..


So, ok, what do you think about just forbidding registered files?
I can't see where that wouldn't suffice but I might be missing something
else.

> For getdents() not protecting f_pos is equally bad or worse. The patch
> doesn't hold f_pos_lock and just updates f_pos prior _and_ post
> iterate_dir() arguing that this race is fine. But again, f_version and
> f_pos are consistent after each system call invocation.
> 
> But without that you can have a concurrent seek running and can end up
> with an inconsistent f_pos position within the same system call. IOW,
> you're breaking f_pos being in a well-known state. And you're not doing
> that just for io_uring you're doing it for the regular system call
> interface as well as both can be used on the same fd simultaneously.
> So that's a no go imho.

(so seek is fine, but I agree two concurrent getdents on registered
files won't have the required lock)
Christian Brauner May 25, 2023, 9:22 a.m. UTC | #4
On Thu, May 25, 2023 at 06:36:17AM +0900, Dominique Martinet wrote:
> Christian Brauner wrote on Wed, May 24, 2023 at 03:52:45PM +0200:
> > The main objection in [1] was to allow specifying an arbitrary offset
> > from userspace. What [3] did was to implement a pread() variant for
> > directories, i.e., pgetdents(). That can't work in principle/is
> > prohibitively complex. Which is what your series avoids by not allowing
> > any offsets to be specified.
> 
> Yes.
> 
> > However, there's still a problem here. Updates to f_pos happen under an
> > approriate lock to guarantee consistency of the position between calls
> > that move the cursor position. In the normal read-write path io_uring
> > doesn't concern itself with f_pos as it keeps local state in
> > kiocb->ki_pos.
> > 
> > But then it still does end up running into f_pos consistency problems
> > for read-write because it does allow operating on the current f_pos if
> > the offset if struct io_rw is set to -1.
> > 
> > In that case it does retrieve and update f_pos which should take
> > f_pos_lock and a patchset for this was posted but it didn't go anywhere.
> > It should probably hold that lock. See Jann's comments in the other
> > thread how that currently can lead to issues.
> 
> Assuming that is this mail:
> https://lore.kernel.org/io-uring/CAG48ez1O9VxSuWuLXBjke23YxUA8EhMP+6RCHo5PNQBf3B0pDQ@mail.gmail.com/
> 
> So, ok, I didn't realize fdget_pos() actually acted as a lock on the
> file's f_pos_lock (apparently only if FMODE_ATMOIC_POS is set? but it
> looks set on most if not all directories through finish_open(), that
> looks called consistently enough)

It's set for any regular file and directory.

> 
> What was confusing is that default_llseek updates f_pos under the
> inode_lock (write), and getdents also takes that lock (for read only in
> shared implem), so I assumed getdents also was just protected by this
> read lock, but I guess that was a bad assumption (as I kept pointing
> out, a shared read lock isn't good enough, we definitely agree there)
> 
> 
> In practice, in the non-registered file case io_uring is also calling
> fdget, so the lock is held exactly the same as the syscall and I wasn't

No, it really isn't. fdget() doesn't take f_pos_lock at all:

fdget()
-> __fdget()
   -> __fget_light()
      -> __fget()
         -> __fget_files()
            -> __fget_files_rcu()

If that were true then any system call that passes an fd and uses
fdget() would try to acquire a mutex on f_pos_lock. We'd be serializing
every *at based system call on f_pos_lock whenever we have multiple fds
referring to the same file trying to operate on it concurrently.

We do have fdget_pos() and fdput_pos() as a special purpose fdget() for
a select group of system calls that require this synchronization.

> so far off -- but we need to figure something for the registered file
> case.
> openat variants don't allow working with the registered variant at all
> on the parent fd, so as far as I'm concerned I'd be happy setting the
> same limitation for getdents: just say it acts on fd and not files, and
> call it a day...

I don't follow. Also this is hacky so no.

The reason why io_uring *at implementations don't work with fixed files
is that the VFS interface expect regular fds. You could very well make
this work for fixed files but why. It would mean exposing a whole new
set of vfs helpers to io_uring and would probably involve nasty corner
cases.

Also the connection between regular and fixed files in io_uring is
pretty much fluent. While fixed files can only remain pinned in an
io_uring instance it requires that the caller explicitly gave up all
references in their fdtable to that struct file by closing all fds
referring to the same file.

But there's no guarantee. For example, if another thread dups the fd or
the caller sends the fd via SCM_RIGHTS to another process or the caller
simply doesn't close the fd or another thread gets an fd to the same
file from that task via pidfd_getfd before it closed it this doesn't hold.

So it's very well possible to have an fd and a fixed io_uring reference
referring to the same file. The first one can be used with the regular
system call interface and io_uring *at requests that forbid fixed files.
And the other one can be used for i_uring fixed file operations. Doesn't
matter if that shouldn't be done, it's possible afaict.

For regular and fixed files you also have the same problem from within
the same io_uring instance where you can have concurrent getdent
requests. You'd end up producing the exact same inconsistencies.

> It'd also be possible to check if REQ_F_FIXED_FILE flag was set and
> manually take the lock somehow but we don't have any primitive that
> takes f_pos_lock from a file (the only place that takes it is fdget
> which requires a fd), so I'd rather not add such a new exception.
> I assume the other patch you mentioned about adding that lock was this
> one:
> https://lore.kernel.org/all/20220222105504.3331010-1-dylany@fb.com/T/#m3609dc8057d0bc8e41ceab643e4d630f7b91bde6
> and it just atkes the lock, but __fdget_pos also checks for
> FMODE_ATOMIC_OPS and file_count and I'm not sure I understand how it
> sets (f.flags & FDPUT_POS_UNLOCK) (for fdput_pos) so I'd rather not add
> such a code path at this point..
> 
> 
> So, ok, what do you think about just forbidding registered files?
> I can't see where that wouldn't suffice but I might be missing something
> else.

It doesn't help.
Dominique Martinet May 25, 2023, 11 a.m. UTC | #5
Christian Brauner wrote on Thu, May 25, 2023 at 11:22:08AM +0200:
> > What was confusing is that default_llseek updates f_pos under the
> > inode_lock (write), and getdents also takes that lock (for read only in
> > shared implem), so I assumed getdents also was just protected by this
> > read lock, but I guess that was a bad assumption (as I kept pointing
> > out, a shared read lock isn't good enough, we definitely agree there)
> > 
> > 
> > In practice, in the non-registered file case io_uring is also calling
> > fdget, so the lock is held exactly the same as the syscall and I wasn't
> 
> No, it really isn't. fdget() doesn't take f_pos_lock at all:
> 
> fdget()
> -> __fdget()
>    -> __fget_light()
>       -> __fget()
>          -> __fget_files()
>             -> __fget_files_rcu()

Ugh, I managed to not notice that I was looking at fdget_pos and that
it's not the same as fdget by the time I wrote two paragraphs... These
functions all have too many wrappers and too similar names for a quick
look before work.

> If that were true then any system call that passes an fd and uses
> fdget() would try to acquire a mutex on f_pos_lock. We'd be serializing
> every *at based system call on f_pos_lock whenever we have multiple fds
> referring to the same file trying to operate on it concurrently.
> 
> We do have fdget_pos() and fdput_pos() as a special purpose fdget() for
> a select group of system calls that require this synchronization.

Right, that makes sense, and invalidates everything I said after that
anyway but it's not like looking stupid ever killed anyone.

Ok so it would require adding a new wrapper from struct file to struct
fd that'd eventually take the lock and set FDPUT_POS_UNLOCK for... not
fdput_pos but another function for that stopping short of fdput...
Then just call that around both vfs_llseek and vfs_getdents calls; which
is the easy part.

(Or possibly call mutex_lock directly like Dylan did in [1]...)
[1] https://lore.kernel.org/all/20220222105504.3331010-1-dylany@fb.com/T/#m3609dc8057d0bc8e41ceab643e4d630f7b91bde6



I'll be honest though I'm thankful for your explanations but I think
I'll just do like Stefan and stop trying for now: the only reason I've
started this was because I wanted to play with io_uring for a new toy
project and it felt awkward without a getdents for crawling a tree; and
I'm long past the point where I should have thrown the towel and just
make that a sequential walk.
There's too many "conditional patches" (NOWAIT, end of dir indicator)
that I don't care about and require additional work to rebase
continuously so I'll just leave it up to someone else who does care.

So to that someone: feel free to continue from these branches (I've
included the fix for kernfs_fop_readdir that Dan Carpenter reported):
https://github.com/martinetd/linux/commits/io_uring_getdents
https://github.com/martinetd/liburing/commits/getdents

Or just start over, there's not that much code now hopefully the
baseline requirements have gotten a little bit clearer.


Sorry for stirring the mess and leaving halfway, if nobody does continue
I might send a v3 when I have more time/energy in a few months, but it
won't be quick.
Christian Brauner May 25, 2023, 12:33 p.m. UTC | #6
On Thu, May 25, 2023 at 08:00:02PM +0900, Dominique Martinet wrote:
> Christian Brauner wrote on Thu, May 25, 2023 at 11:22:08AM +0200:
> > > What was confusing is that default_llseek updates f_pos under the
> > > inode_lock (write), and getdents also takes that lock (for read only in
> > > shared implem), so I assumed getdents also was just protected by this
> > > read lock, but I guess that was a bad assumption (as I kept pointing
> > > out, a shared read lock isn't good enough, we definitely agree there)
> > > 
> > > 
> > > In practice, in the non-registered file case io_uring is also calling
> > > fdget, so the lock is held exactly the same as the syscall and I wasn't
> > 
> > No, it really isn't. fdget() doesn't take f_pos_lock at all:
> > 
> > fdget()
> > -> __fdget()
> >    -> __fget_light()
> >       -> __fget()
> >          -> __fget_files()
> >             -> __fget_files_rcu()
> 
> Ugh, I managed to not notice that I was looking at fdget_pos and that
> it's not the same as fdget by the time I wrote two paragraphs... These
> functions all have too many wrappers and too similar names for a quick
> look before work.
> 
> > If that were true then any system call that passes an fd and uses
> > fdget() would try to acquire a mutex on f_pos_lock. We'd be serializing
> > every *at based system call on f_pos_lock whenever we have multiple fds
> > referring to the same file trying to operate on it concurrently.
> > 
> > We do have fdget_pos() and fdput_pos() as a special purpose fdget() for
> > a select group of system calls that require this synchronization.
> 
> Right, that makes sense, and invalidates everything I said after that
> anyway but it's not like looking stupid ever killed anyone.

I strongly disagree with the looking stupid part. These callchains are
quite unwieldy and it's easy to get confused. Usually if you receive a
long mail about the semantics involved - as in the earlier thread - it
means there's landmines all over.

> 
> Ok so it would require adding a new wrapper from struct file to struct
> fd that'd eventually take the lock and set FDPUT_POS_UNLOCK for... not
> fdput_pos but another function for that stopping short of fdput...
> Then just call that around both vfs_llseek and vfs_getdents calls; which
> is the easy part.
> 
> (Or possibly call mutex_lock directly like Dylan did in [1]...)
> [1] https://lore.kernel.org/all/20220222105504.3331010-1-dylany@fb.com/T/#m3609dc8057d0bc8e41ceab643e4d630f7b91bde6

We'd need a consistent story whatever it ends up being.

> I'll be honest though I'm thankful for your explanations but I think
> I'll just do like Stefan and stop trying for now: the only reason I've
> started this was because I wanted to play with io_uring for a new toy
> project and it felt awkward without a getdents for crawling a tree; and
> I'm long past the point where I should have thrown the towel and just
> make that a sequential walk.
> There's too many "conditional patches" (NOWAIT, end of dir indicator)
> that I don't care about and require additional work to rebase
> continuously so I'll just leave it up to someone else who does care.
> 
> So to that someone: feel free to continue from these branches (I've
> included the fix for kernfs_fop_readdir that Dan Carpenter reported):
> https://github.com/martinetd/linux/commits/io_uring_getdents
> https://github.com/martinetd/liburing/commits/getdents
> 
> Or just start over, there's not that much code now hopefully the
> baseline requirements have gotten a little bit clearer.
> 
> 
> Sorry for stirring the mess and leaving halfway, if nobody does continue
> I might send a v3 when I have more time/energy in a few months, but it
> won't be quick.

It's fine.
Hao Xu July 11, 2023, 8:17 a.m. UTC | #7
On 5/25/23 19:00, Dominique Martinet wrote:
> Christian Brauner wrote on Thu, May 25, 2023 at 11:22:08AM +0200:
>>> What was confusing is that default_llseek updates f_pos under the
>>> inode_lock (write), and getdents also takes that lock (for read only in
>>> shared implem), so I assumed getdents also was just protected by this
>>> read lock, but I guess that was a bad assumption (as I kept pointing
>>> out, a shared read lock isn't good enough, we definitely agree there)
>>>
>>>
>>> In practice, in the non-registered file case io_uring is also calling
>>> fdget, so the lock is held exactly the same as the syscall and I wasn't
>>
>> No, it really isn't. fdget() doesn't take f_pos_lock at all:
>>
>> fdget()
>> -> __fdget()
>>     -> __fget_light()
>>        -> __fget()
>>           -> __fget_files()
>>              -> __fget_files_rcu()
> 
> Ugh, I managed to not notice that I was looking at fdget_pos and that
> it's not the same as fdget by the time I wrote two paragraphs... These
> functions all have too many wrappers and too similar names for a quick
> look before work.
> 
>> If that were true then any system call that passes an fd and uses
>> fdget() would try to acquire a mutex on f_pos_lock. We'd be serializing
>> every *at based system call on f_pos_lock whenever we have multiple fds
>> referring to the same file trying to operate on it concurrently.
>>
>> We do have fdget_pos() and fdput_pos() as a special purpose fdget() for
>> a select group of system calls that require this synchronization.
> 
> Right, that makes sense, and invalidates everything I said after that
> anyway but it's not like looking stupid ever killed anyone.
> 
> Ok so it would require adding a new wrapper from struct file to struct
> fd that'd eventually take the lock and set FDPUT_POS_UNLOCK for... not
> fdput_pos but another function for that stopping short of fdput...
> Then just call that around both vfs_llseek and vfs_getdents calls; which
> is the easy part.
> 
> (Or possibly call mutex_lock directly like Dylan did in [1]...)
> [1] https://lore.kernel.org/all/20220222105504.3331010-1-dylany@fb.com/T/#m3609dc8057d0bc8e41ceab643e4d630f7b91bde6
> 
> 
> 
> I'll be honest though I'm thankful for your explanations but I think
> I'll just do like Stefan and stop trying for now: the only reason I've
> started this was because I wanted to play with io_uring for a new toy
> project and it felt awkward without a getdents for crawling a tree; and
> I'm long past the point where I should have thrown the towel and just
> make that a sequential walk.
> There's too many "conditional patches" (NOWAIT, end of dir indicator)
> that I don't care about and require additional work to rebase
> continuously so I'll just leave it up to someone else who does care.
> 
> So to that someone: feel free to continue from these branches (I've
> included the fix for kernfs_fop_readdir that Dan Carpenter reported):
> https://github.com/martinetd/linux/commits/io_uring_getdents
> https://github.com/martinetd/liburing/commits/getdents
> 
> Or just start over, there's not that much code now hopefully the
> baseline requirements have gotten a little bit clearer.
> 
> 
> Sorry for stirring the mess and leaving halfway, if nobody does continue
> I might send a v3 when I have more time/energy in a few months, but it
> won't be quick.
> 

Hi Dominique,

I'd like to take this if you don't mind.

Regards,
Hao
Dominique Martinet July 11, 2023, 8:24 a.m. UTC | #8
Hao Xu wrote on Tue, Jul 11, 2023 at 04:17:11PM +0800:
> > So to that someone: feel free to continue from these branches (I've
> > included the fix for kernfs_fop_readdir that Dan Carpenter reported):
> > https://github.com/martinetd/linux/commits/io_uring_getdents
> > https://github.com/martinetd/liburing/commits/getdents
> > 
> > Or just start over, there's not that much code now hopefully the
> > baseline requirements have gotten a little bit clearer.
> > 
> > 
> > Sorry for stirring the mess and leaving halfway, if nobody does continue
> > I might send a v3 when I have more time/energy in a few months, but it
> > won't be quick.
> 
> I'd like to take this if you don't mind.

Sure, I haven't been working on this lately, feel free to work on this.

Will be happy to review anything you send based on what came out of the
previous discussions to save Christian and others some time so you can
keep me in Cc if you'd like.
diff mbox series

Patch

diff --git a/fs/internal.h b/fs/internal.h
index 0264b001d99a..0b1552c7a870 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -267,4 +267,4 @@  void mnt_idmap_put(struct mnt_idmap *idmap);
 struct linux_dirent64;
 
 int vfs_getdents(struct file *file, struct linux_dirent64 __user *dirent,
-		 unsigned int count, unsigned long flags);
+		 unsigned int count, unsigned long *flags);
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 5a5b3e7881bf..53a6b4804c34 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -1860,6 +1860,7 @@  static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx)
 	up_read(&root->kernfs_rwsem);
 	file->private_data = NULL;
 	ctx->pos = INT_MAX;
+	ctx->flags |= DIR_CONTEXT_F_EOD;
 	return 0;
 }
 
diff --git a/fs/libfs.c b/fs/libfs.c
index a3c7e42d90a7..b2a95dadffbd 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -208,10 +208,12 @@  int dcache_readdir(struct file *file, struct dir_context *ctx)
 		p = &next->d_child;
 	}
 	spin_lock(&dentry->d_lock);
-	if (next)
+	if (next) {
 		list_move_tail(&cursor->d_child, &next->d_child);
-	else
+	} else {
 		list_del_init(&cursor->d_child);
+		ctx->flags |= DIR_CONTEXT_F_EOD;
+	}
 	spin_unlock(&dentry->d_lock);
 	dput(next);
 
@@ -1347,7 +1349,8 @@  static loff_t empty_dir_llseek(struct file *file, loff_t offset, int whence)
 
 static int empty_dir_readdir(struct file *file, struct dir_context *ctx)
 {
-	dir_emit_dots(file, ctx);
+	if (dir_emit_dots(file, ctx))
+		ctx->flags |= DIR_CONTEXT_F_EOD;
 	return 0;
 }
 
diff --git a/fs/readdir.c b/fs/readdir.c
index 1311b89d75e1..be75a2154b4f 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -358,14 +358,14 @@  static bool filldir64(struct dir_context *ctx, const char *name, int namlen,
  * @file    : pointer to file struct of directory
  * @dirent  : pointer to user directory structure
  * @count   : size of buffer
- * @flags   : additional dir_context flags
+ * @flags   : pointer to additional dir_context flags
  */
 int vfs_getdents(struct file *file, struct linux_dirent64 __user *dirent,
-		 unsigned int count, unsigned long flags)
+		 unsigned int count, unsigned long *flags)
 {
 	struct getdents_callback64 buf = {
 		.ctx.actor = filldir64,
-		.ctx.flags = flags,
+		.ctx.flags = flags ? *flags : 0,
 		.count = count,
 		.current_dir = dirent
 	};
@@ -384,6 +384,8 @@  int vfs_getdents(struct file *file, struct linux_dirent64 __user *dirent,
 		else
 			error = count - buf.count;
 	}
+	if (flags)
+		*flags = buf.ctx.flags;
 	return error;
 }
 
@@ -397,7 +399,7 @@  SYSCALL_DEFINE3(getdents64, unsigned int, fd,
 	if (!f.file)
 		return -EBADF;
 
-	error = vfs_getdents(f.file, dirent, count, 0);
+	error = vfs_getdents(f.file, dirent, count, NULL);
 
 	fdput_pos(f);
 	return error;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f7de2b5ca38e..d1e31bccfb4f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1723,8 +1723,10 @@  struct dir_context {
  * flags for dir_context flags
  * DIR_CONTEXT_F_NOWAIT: Request non-blocking iterate
  *                       (requires file->f_mode & FMODE_NOWAIT)
+ * DIR_CONTEXT_F_EOD: Signal directory has been fully iterated, set by the fs
  */
 #define DIR_CONTEXT_F_NOWAIT	0x1
+#define DIR_CONTEXT_F_EOD	0x2
 
 /*
  * These flags let !MMU mmap() govern direct device mapping vs immediate
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 35d0de18d893..35877132027e 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -381,11 +381,13 @@  struct io_uring_cqe {
  * IORING_CQE_F_SOCK_NONEMPTY	If set, more data to read after socket recv
  * IORING_CQE_F_NOTIF	Set for notification CQEs. Can be used to distinct
  * 			them from sends.
+ * IORING_CQE_F_EOF     If set, file or directory has reached end of file.
  */
 #define IORING_CQE_F_BUFFER		(1U << 0)
 #define IORING_CQE_F_MORE		(1U << 1)
 #define IORING_CQE_F_SOCK_NONEMPTY	(1U << 2)
 #define IORING_CQE_F_NOTIF		(1U << 3)
+#define IORING_CQE_F_EOF		(1U << 4)
 
 enum {
 	IORING_CQE_BUFFER_SHIFT		= 16,
diff --git a/io_uring/fs.c b/io_uring/fs.c
index b15ec81c1ed2..f6222b0148ef 100644
--- a/io_uring/fs.c
+++ b/io_uring/fs.c
@@ -322,6 +322,7 @@  int io_getdents(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents);
 	unsigned long getdents_flags = 0;
+	u32 cqe_flags = 0;
 	int ret;
 
 	if (issue_flags & IO_URING_F_NONBLOCK) {
@@ -338,13 +339,16 @@  int io_getdents(struct io_kiocb *req, unsigned int issue_flags)
 			goto out;
 	}
 
-	ret = vfs_getdents(req->file, gd->dirent, gd->count, getdents_flags);
+	ret = vfs_getdents(req->file, gd->dirent, gd->count, &getdents_flags);
 out:
 	if (ret == -EAGAIN &&
 	    (issue_flags & IO_URING_F_NONBLOCK))
 			return -EAGAIN;
 
-	io_req_set_res(req, ret, 0);
+	if (getdents_flags & DIR_CONTEXT_F_EOD)
+		cqe_flags |= IORING_CQE_F_EOF;
+
+	io_req_set_res(req, ret, cqe_flags);
 	return 0;
 }