diff mbox series

[2/2] vfs: support statx(..., NULL, AT_EMPTY_PATH, ...)

Message ID 20240625110029.606032-3-mjguzik@gmail.com (mailing list archive)
State New
Headers show
Series statx NULL path support | expand

Commit Message

Mateusz Guzik June 25, 2024, 11 a.m. UTC
The newly used helper also checks for 0-sized buffers.

This avoids path lookup code, lockref management, memory allocation and
in case of NULL path userspace memory access (which can be quite
expensive with SMAP on x86_64).

statx with AT_EMPTY_PATH paired with "" or NULL argument as appropriate
issued on Sapphire Rapids (ops/s):
stock:     4231237
0-check:   5944063 (+40%)
NULL path: 6601619 (+11%/+56%)

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
---
 fs/internal.h    |  2 ++
 fs/stat.c        | 90 ++++++++++++++++++++++++++++++++++--------------
 io_uring/statx.c | 23 +++++++------
 3 files changed, 80 insertions(+), 35 deletions(-)

Comments

Xi Ruoyao June 25, 2024, 1:24 p.m. UTC | #1
On Tue, 2024-06-25 at 13:00 +0200, Mateusz Guzik wrote:
> +	if (flags == AT_EMPTY_PATH && vfs_empty_path(dfd, filename))

Could it be

if ((flags & AT_EMPTY_PATH) && vfs_empty_path(dfd, filename))

instead?

When fstatat is implemented with statx AT_NO_AUTOMOUNT is needed, or at
least Glibc developers think it's needed:

#if FSTATAT_USE_STATX

static inline int 
fstatat64_time64_statx (int fd, const char *file, struct __stat64_t64 *buf,
            int flag)
{
  /* 32-bit kABI with default 64-bit time_t, e.g. arc, riscv32.   Also
     64-bit time_t support is done through statx syscall.  */
  struct statx tmp;
  int r = INTERNAL_SYSCALL_CALL (statx, fd, file, AT_NO_AUTOMOUNT | flag,
                 STATX_BASIC_STATS, &tmp);

so "flags == AT_EMPTY_PATH" won't be true if Glibc implements fstatat
and fstat via statx (on LoongArch and 32-bit platforms).

I was just surprised when I saw a 100%+ improve for statx("",
AT_EMPTY_PATH) but not stat on the Loongson machine.

> +		return do_statx_fd(dfd, flags, mask, buffer);
> +
>  	name = getname_flags(filename, getname_statx_lookup_flags(flags), NULL);
Xi Ruoyao June 25, 2024, 1:28 p.m. UTC | #2
On Tue, 2024-06-25 at 21:24 +0800, Xi Ruoyao wrote:
> On Tue, 2024-06-25 at 13:00 +0200, Mateusz Guzik wrote:
> > +	if (flags == AT_EMPTY_PATH && vfs_empty_path(dfd, filename))
> 
> Could it be
> 
> if ((flags & AT_EMPTY_PATH) && vfs_empty_path(dfd, filename))
> 
> instead?
> 
> When fstatat is implemented with statx AT_NO_AUTOMOUNT is needed, or at
> least Glibc developers think it's needed:

/* snip */

> I was just surprised when I saw a 100%+ improve for statx("",
> AT_EMPTY_PATH) but not stat on the Loongson machine.
                         ^^^^ fstat

I cannot type :(
Mateusz Guzik June 25, 2024, 1:28 p.m. UTC | #3
On Tue, Jun 25, 2024 at 3:24 PM Xi Ruoyao <xry111@xry111.site> wrote:
>
> On Tue, 2024-06-25 at 13:00 +0200, Mateusz Guzik wrote:
> > +     if (flags == AT_EMPTY_PATH && vfs_empty_path(dfd, filename))
>
> Could it be
>
> if ((flags & AT_EMPTY_PATH) && vfs_empty_path(dfd, filename))
>
> instead?
>
> When fstatat is implemented with statx AT_NO_AUTOMOUNT is needed, or at
> least Glibc developers think it's needed:
>
> #if FSTATAT_USE_STATX
>
> static inline int
> fstatat64_time64_statx (int fd, const char *file, struct __stat64_t64 *buf,
>             int flag)
> {
>   /* 32-bit kABI with default 64-bit time_t, e.g. arc, riscv32.   Also
>      64-bit time_t support is done through statx syscall.  */
>   struct statx tmp;
>   int r = INTERNAL_SYSCALL_CALL (statx, fd, file, AT_NO_AUTOMOUNT | flag,
>                  STATX_BASIC_STATS, &tmp);
>
> so "flags == AT_EMPTY_PATH" won't be true if Glibc implements fstatat
> and fstat via statx (on LoongArch and 32-bit platforms).
>
> I was just surprised when I saw a 100%+ improve for statx("",
> AT_EMPTY_PATH) but not stat on the Loongson machine.
>

It can't be like that specifically because we still need to catch
bogus AT flags.

I'm going to poke a little bit and send a v2, thanks.
Huacai Chen June 25, 2024, 2:09 p.m. UTC | #4
On Tue, Jun 25, 2024 at 7:01 PM Mateusz Guzik <mjguzik@gmail.com> wrote:
>
> The newly used helper also checks for 0-sized buffers.
>
> This avoids path lookup code, lockref management, memory allocation and
> in case of NULL path userspace memory access (which can be quite
> expensive with SMAP on x86_64).
>
> statx with AT_EMPTY_PATH paired with "" or NULL argument as appropriate
> issued on Sapphire Rapids (ops/s):
> stock:     4231237
> 0-check:   5944063 (+40%)
> NULL path: 6601619 (+11%/+56%)
>
> Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Hi, Ruoyao,

I'm a bit confused. Ii this patch a replacement of your recent patch?

Huacai
> ---
>  fs/internal.h    |  2 ++
>  fs/stat.c        | 90 ++++++++++++++++++++++++++++++++++--------------
>  io_uring/statx.c | 23 +++++++------
>  3 files changed, 80 insertions(+), 35 deletions(-)
>
> diff --git a/fs/internal.h b/fs/internal.h
> index 1caa6a8f666f..0a018ebcaf49 100644
> --- a/fs/internal.h
> +++ b/fs/internal.h
> @@ -244,6 +244,8 @@ extern const struct dentry_operations ns_dentry_operations;
>  int getname_statx_lookup_flags(int flags);
>  int do_statx(int dfd, struct filename *filename, unsigned int flags,
>              unsigned int mask, struct statx __user *buffer);
> +int do_statx_fd(int fd, unsigned int flags, unsigned int mask,
> +               struct statx __user *buffer);
>
>  /*
>   * fs/splice.c:
> diff --git a/fs/stat.c b/fs/stat.c
> index 106684034fdb..1214826f3a36 100644
> --- a/fs/stat.c
> +++ b/fs/stat.c
> @@ -214,6 +214,43 @@ int getname_statx_lookup_flags(int flags)
>         return lookup_flags;
>  }
>
> +static int vfs_statx_path(struct path *path, int flags, struct kstat *stat,
> +                         u32 request_mask)
> +{
> +       int error = vfs_getattr(path, stat, request_mask, flags);
> +
> +       if (request_mask & STATX_MNT_ID_UNIQUE) {
> +               stat->mnt_id = real_mount(path->mnt)->mnt_id_unique;
> +               stat->result_mask |= STATX_MNT_ID_UNIQUE;
> +       } else {
> +               stat->mnt_id = real_mount(path->mnt)->mnt_id;
> +               stat->result_mask |= STATX_MNT_ID;
> +       }
> +
> +       if (path->mnt->mnt_root == path->dentry)
> +               stat->attributes |= STATX_ATTR_MOUNT_ROOT;
> +       stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT;
> +
> +       /* Handle STATX_DIOALIGN for block devices. */
> +       if (request_mask & STATX_DIOALIGN) {
> +               struct inode *inode = d_backing_inode(path->dentry);
> +
> +               if (S_ISBLK(inode->i_mode))
> +                       bdev_statx_dioalign(inode, stat);
> +       }
> +
> +       return error;
> +}
> +
> +static int vfs_statx_fd(int fd, int flags, struct kstat *stat,
> +                         u32 request_mask)
> +{
> +       CLASS(fd_raw, f)(fd);
> +       if (!f.file)
> +               return -EBADF;
> +       return vfs_statx_path(&f.file->f_path, flags, stat, request_mask);
> +}
> +
>  /**
>   * vfs_statx - Get basic and extra attributes by filename
>   * @dfd: A file descriptor representing the base dir for a relative filename
> @@ -243,36 +280,13 @@ static int vfs_statx(int dfd, struct filename *filename, int flags,
>  retry:
>         error = filename_lookup(dfd, filename, lookup_flags, &path, NULL);
>         if (error)
> -               goto out;
> -
> -       error = vfs_getattr(&path, stat, request_mask, flags);
> -
> -       if (request_mask & STATX_MNT_ID_UNIQUE) {
> -               stat->mnt_id = real_mount(path.mnt)->mnt_id_unique;
> -               stat->result_mask |= STATX_MNT_ID_UNIQUE;
> -       } else {
> -               stat->mnt_id = real_mount(path.mnt)->mnt_id;
> -               stat->result_mask |= STATX_MNT_ID;
> -       }
> -
> -       if (path.mnt->mnt_root == path.dentry)
> -               stat->attributes |= STATX_ATTR_MOUNT_ROOT;
> -       stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT;
> -
> -       /* Handle STATX_DIOALIGN for block devices. */
> -       if (request_mask & STATX_DIOALIGN) {
> -               struct inode *inode = d_backing_inode(path.dentry);
> -
> -               if (S_ISBLK(inode->i_mode))
> -                       bdev_statx_dioalign(inode, stat);
> -       }
> -
> +               return error;
> +       error = vfs_statx_path(&path, flags, stat, request_mask);
>         path_put(&path);
>         if (retry_estale(error, lookup_flags)) {
>                 lookup_flags |= LOOKUP_REVAL;
>                 goto retry;
>         }
> -out:
>         return error;
>  }
>
> @@ -677,6 +691,29 @@ int do_statx(int dfd, struct filename *filename, unsigned int flags,
>         return cp_statx(&stat, buffer);
>  }
>
> +int do_statx_fd(int fd, unsigned int flags, unsigned int mask,
> +            struct statx __user *buffer)
> +{
> +       struct kstat stat;
> +       int error;
> +
> +       if (mask & STATX__RESERVED)
> +               return -EINVAL;
> +       if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_TYPE)
> +               return -EINVAL;
> +
> +       /* STATX_CHANGE_COOKIE is kernel-only for now. Ignore requests
> +        * from userland.
> +        */
> +       mask &= ~STATX_CHANGE_COOKIE;
> +
> +       error = vfs_statx_fd(fd, flags, &stat, mask);
> +       if (error)
> +               return error;
> +
> +       return cp_statx(&stat, buffer);
> +}
> +
>  /**
>   * sys_statx - System call to get enhanced stats
>   * @dfd: Base directory to pathwalk from *or* fd to stat.
> @@ -696,6 +733,9 @@ SYSCALL_DEFINE5(statx,
>         int ret;
>         struct filename *name;
>
> +       if (flags == AT_EMPTY_PATH && vfs_empty_path(dfd, filename))
> +               return do_statx_fd(dfd, flags, mask, buffer);
> +
>         name = getname_flags(filename, getname_statx_lookup_flags(flags), NULL);
>         ret = do_statx(dfd, name, flags, mask, buffer);
>         putname(name);
> diff --git a/io_uring/statx.c b/io_uring/statx.c
> index abb874209caa..fe967ecb1762 100644
> --- a/io_uring/statx.c
> +++ b/io_uring/statx.c
> @@ -23,6 +23,7 @@ struct io_statx {
>  int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>  {
>         struct io_statx *sx = io_kiocb_to_cmd(req, struct io_statx);
> +       struct filename *filename;
>         const char __user *path;
>
>         if (sqe->buf_index || sqe->splice_fd_in)
> @@ -36,15 +37,14 @@ int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>         sx->buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
>         sx->flags = READ_ONCE(sqe->statx_flags);
>
> -       sx->filename = getname_flags(path,
> -                                    getname_statx_lookup_flags(sx->flags),
> -                                    NULL);
> -
> -       if (IS_ERR(sx->filename)) {
> -               int ret = PTR_ERR(sx->filename);
> -
> -               sx->filename = NULL;
> -               return ret;
> +       sx->filename = NULL;
> +       if (!(sx->flags == AT_EMPTY_PATH && vfs_empty_path(sx->dfd, path))) {
> +               filename = getname_flags(path,
> +                                        getname_statx_lookup_flags(sx->flags),
> +                                        NULL);
> +               if (IS_ERR(filename))
> +                       return PTR_ERR(filename);
> +               sx->filename = filename;
>         }
>
>         req->flags |= REQ_F_NEED_CLEANUP;
> @@ -59,7 +59,10 @@ int io_statx(struct io_kiocb *req, unsigned int issue_flags)
>
>         WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
>
> -       ret = do_statx(sx->dfd, sx->filename, sx->flags, sx->mask, sx->buffer);
> +       if (sx->filename == NULL)
> +               ret = do_statx_fd(sx->dfd, sx->flags, sx->mask, sx->buffer);
> +       else
> +               ret = do_statx(sx->dfd, sx->filename, sx->flags, sx->mask, sx->buffer);
>         io_req_set_res(req, ret, 0);
>         return IOU_OK;
>  }
> --
> 2.43.0
>
>
Xi Ruoyao June 25, 2024, 2:58 p.m. UTC | #5
On Tue, 2024-06-25 at 22:09 +0800, Huacai Chen wrote:
> On Tue, Jun 25, 2024 at 7:01 PM Mateusz Guzik <mjguzik@gmail.com>
> wrote:
> > 
> > The newly used helper also checks for 0-sized buffers.
> > 
> > This avoids path lookup code, lockref management, memory allocation
> > and
> > in case of NULL path userspace memory access (which can be quite
> > expensive with SMAP on x86_64).
> > 
> > statx with AT_EMPTY_PATH paired with "" or NULL argument as
> > appropriate
> > issued on Sapphire Rapids (ops/s):
> > stock:     4231237
> > 0-check:   5944063 (+40%)
> > NULL path: 6601619 (+11%/+56%)
> > 
> > Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
> Hi, Ruoyao,
> 
> I'm a bit confused. Ii this patch a replacement of your recent patch?

Yes, both Linus and Christian hates introducing a new AT_ flag for this.

This patch just makes statx(fd, NULL, AT_EMPTY_PATH, ...) behave like
statx(fd, "", AT_EMPTY_PATH, ...) instead.  NULL avoids the performance
issue and it's also audit-able by seccomp BPF.
Huacai Chen June 30, 2024, 1:40 a.m. UTC | #6
On Tue, Jun 25, 2024 at 11:00 PM Xi Ruoyao <xry111@xry111.site> wrote:
>
> On Tue, 2024-06-25 at 22:09 +0800, Huacai Chen wrote:
> > On Tue, Jun 25, 2024 at 7:01 PM Mateusz Guzik <mjguzik@gmail.com>
> > wrote:
> > >
> > > The newly used helper also checks for 0-sized buffers.
> > >
> > > This avoids path lookup code, lockref management, memory allocation
> > > and
> > > in case of NULL path userspace memory access (which can be quite
> > > expensive with SMAP on x86_64).
> > >
> > > statx with AT_EMPTY_PATH paired with "" or NULL argument as
> > > appropriate
> > > issued on Sapphire Rapids (ops/s):
> > > stock:     4231237
> > > 0-check:   5944063 (+40%)
> > > NULL path: 6601619 (+11%/+56%)
> > >
> > > Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
> > Hi, Ruoyao,
> >
> > I'm a bit confused. Ii this patch a replacement of your recent patch?
>
> Yes, both Linus and Christian hates introducing a new AT_ flag for this.
>
> This patch just makes statx(fd, NULL, AT_EMPTY_PATH, ...) behave like
> statx(fd, "", AT_EMPTY_PATH, ...) instead.  NULL avoids the performance
> issue and it's also audit-able by seccomp BPF.
To be honest, I still want to restore __ARCH_WANT_NEW_STAT. Because
even if statx() becomes audit-able, it is still blacklisted now.
Restoring __ARCH_WANT_NEW_STAT is a very small change that doesn't
introduce any complexity, but it makes life easier. And I think libLoL
also likes __ARCH_WANT_NEW_STAT, though it isn't an upstream
project...

Huacai

>
> --
> Xi Ruoyao <xry111@xry111.site>
> School of Aerospace Science and Technology, Xidian University
>
Xi Ruoyao June 30, 2024, 2:39 a.m. UTC | #7
On Sun, 2024-06-30 at 09:40 +0800, Huacai Chen wrote:
> On Tue, Jun 25, 2024 at 11:00 PM Xi Ruoyao <xry111@xry111.site> wrote:
> > 
> > On Tue, 2024-06-25 at 22:09 +0800, Huacai Chen wrote:
> > > On Tue, Jun 25, 2024 at 7:01 PM Mateusz Guzik <mjguzik@gmail.com>
> > > wrote:
> > > > 
> > > > The newly used helper also checks for 0-sized buffers.
> > > > 
> > > > This avoids path lookup code, lockref management, memory
> > > > allocation
> > > > and
> > > > in case of NULL path userspace memory access (which can be quite
> > > > expensive with SMAP on x86_64).
> > > > 
> > > > statx with AT_EMPTY_PATH paired with "" or NULL argument as
> > > > appropriate
> > > > issued on Sapphire Rapids (ops/s):
> > > > stock:     4231237
> > > > 0-check:   5944063 (+40%)
> > > > NULL path: 6601619 (+11%/+56%)
> > > > 
> > > > Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
> > > Hi, Ruoyao,
> > > 
> > > I'm a bit confused. Ii this patch a replacement of your recent
> > > patch?
> > 
> > Yes, both Linus and Christian hates introducing a new AT_ flag for
> > this.
> > 
> > This patch just makes statx(fd, NULL, AT_EMPTY_PATH, ...) behave
> > like
> > statx(fd, "", AT_EMPTY_PATH, ...) instead.  NULL avoids the
> > performance
> > issue and it's also audit-able by seccomp BPF.
> To be honest, I still want to restore __ARCH_WANT_NEW_STAT. Because
> even if statx() becomes audit-able, it is still blacklisted now.

Then patch the sandbox to allow it.

The sandbox **must** be patched anyway or it'll be broken on all 32-bit
systems after 2037.  [Unless they'll unsupport all 32-bit systems before
2037.]

> Restoring __ARCH_WANT_NEW_STAT is a very small change that doesn't
> introduce any complexity, but it makes life easier. And I think libLoL
> also likes __ARCH_WANT_NEW_STAT, though it isn't an upstream
> project...

At least you should not restore it for 32-bit.  libLoL also has nothing
to do with 32-bit systems anyway.  Maybe conditional it with a #if
checking __BITS_PER_LONG.

And the vendors should really port their software to the upstreamed ABI
instead of relying on liblol.  <rant>Is a recompiling so difficult, or
are the programmers so stupid to invoke plenty of low-level syscalls
directly (bypassing Glibc) in their code?</rant>
Huacai Chen June 30, 2024, 1:18 p.m. UTC | #8
On Sun, Jun 30, 2024 at 10:40 AM Xi Ruoyao <xry111@xry111.site> wrote:
>
> On Sun, 2024-06-30 at 09:40 +0800, Huacai Chen wrote:
> > On Tue, Jun 25, 2024 at 11:00 PM Xi Ruoyao <xry111@xry111.site> wrote:
> > >
> > > On Tue, 2024-06-25 at 22:09 +0800, Huacai Chen wrote:
> > > > On Tue, Jun 25, 2024 at 7:01 PM Mateusz Guzik <mjguzik@gmail.com>
> > > > wrote:
> > > > >
> > > > > The newly used helper also checks for 0-sized buffers.
> > > > >
> > > > > This avoids path lookup code, lockref management, memory
> > > > > allocation
> > > > > and
> > > > > in case of NULL path userspace memory access (which can be quite
> > > > > expensive with SMAP on x86_64).
> > > > >
> > > > > statx with AT_EMPTY_PATH paired with "" or NULL argument as
> > > > > appropriate
> > > > > issued on Sapphire Rapids (ops/s):
> > > > > stock:     4231237
> > > > > 0-check:   5944063 (+40%)
> > > > > NULL path: 6601619 (+11%/+56%)
> > > > >
> > > > > Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
> > > > Hi, Ruoyao,
> > > >
> > > > I'm a bit confused. Ii this patch a replacement of your recent
> > > > patch?
> > >
> > > Yes, both Linus and Christian hates introducing a new AT_ flag for
> > > this.
> > >
> > > This patch just makes statx(fd, NULL, AT_EMPTY_PATH, ...) behave
> > > like
> > > statx(fd, "", AT_EMPTY_PATH, ...) instead.  NULL avoids the
> > > performance
> > > issue and it's also audit-able by seccomp BPF.
> > To be honest, I still want to restore __ARCH_WANT_NEW_STAT. Because
> > even if statx() becomes audit-able, it is still blacklisted now.
>
> Then patch the sandbox to allow it.
>
> The sandbox **must** be patched anyway or it'll be broken on all 32-bit
> systems after 2037.  [Unless they'll unsupport all 32-bit systems before
> 2037.]
Yes, but it will not happen immediately.

>
> > Restoring __ARCH_WANT_NEW_STAT is a very small change that doesn't
> > introduce any complexity, but it makes life easier. And I think libLoL
> > also likes __ARCH_WANT_NEW_STAT, though it isn't an upstream
> > project...
>
> At least you should not restore it for 32-bit.  libLoL also has nothing
> to do with 32-bit systems anyway.  Maybe conditional it with a #if
> checking __BITS_PER_LONG.
Agree, but currently LoongArch only support 64bit, so we don't need
#ifdef now (Many Kconfig options also need to depend on 64bit, but
dependencies are removed when LoongArch get upstream).

>
> And the vendors should really port their software to the upstreamed ABI
> instead of relying on liblol.  <rant>Is a recompiling so difficult, or
> are the programmers so stupid to invoke plenty of low-level syscalls
> directly (bypassing Glibc) in their code?</rant>
Unfortunately, libLoL may exist for a very long time. Recompiling
isn't difficult, the real problem is "I have already ported to
LoongArch, why should I port again?".

Huacai

>
> --
> Xi Ruoyao <xry111@xry111.site>
> School of Aerospace Science and Technology, Xidian University
>
Arnd Bergmann July 1, 2024, 11:59 a.m. UTC | #9
On Sun, Jun 30, 2024, at 04:39, Xi Ruoyao wrote:
> On Sun, 2024-06-30 at 09:40 +0800, Huacai Chen wrote:
>> > 
>> > Yes, both Linus and Christian hates introducing a new AT_ flag for
>> > this.
>> > 
>> > This patch just makes statx(fd, NULL, AT_EMPTY_PATH, ...) behave
>> > like
>> > statx(fd, "", AT_EMPTY_PATH, ...) instead.  NULL avoids the
>> > performance
>> > issue and it's also audit-able by seccomp BPF.
>> To be honest, I still want to restore __ARCH_WANT_NEW_STAT. Because
>> even if statx() becomes audit-able, it is still blacklisted now.
>
> Then patch the sandbox to allow it.
>
> The sandbox **must** be patched anyway or it'll be broken on all 32-bit
> systems after 2037.  [Unless they'll unsupport all 32-bit systems before
> 2037.]

More importantly, the sandbox won't be able to support any 32-bit
targets that support running after 2037, regardless of how long
the sandbox supports them: if you turn off COMPAT_32BIT_TIME today
in order to be sure those don't get called by accident, the
fallback is immediately broken.

      Arnd
Huacai Chen July 2, 2024, 3:36 p.m. UTC | #10
Hi, Arnd,

On Mon, Jul 1, 2024 at 7:59 PM Arnd Bergmann <arnd@arndb.de> wrote:
>
> On Sun, Jun 30, 2024, at 04:39, Xi Ruoyao wrote:
> > On Sun, 2024-06-30 at 09:40 +0800, Huacai Chen wrote:
> >> >
> >> > Yes, both Linus and Christian hates introducing a new AT_ flag for
> >> > this.
> >> >
> >> > This patch just makes statx(fd, NULL, AT_EMPTY_PATH, ...) behave
> >> > like
> >> > statx(fd, "", AT_EMPTY_PATH, ...) instead.  NULL avoids the
> >> > performance
> >> > issue and it's also audit-able by seccomp BPF.
> >> To be honest, I still want to restore __ARCH_WANT_NEW_STAT. Because
> >> even if statx() becomes audit-able, it is still blacklisted now.
> >
> > Then patch the sandbox to allow it.
> >
> > The sandbox **must** be patched anyway or it'll be broken on all 32-bit
> > systems after 2037.  [Unless they'll unsupport all 32-bit systems before
> > 2037.]
>
> More importantly, the sandbox won't be able to support any 32-bit
> targets that support running after 2037, regardless of how long
> the sandbox supports them: if you turn off COMPAT_32BIT_TIME today
> in order to be sure those don't get called by accident, the
> fallback is immediately broken.
Would you mind if I restore newstat for LoongArch64 even if this patch exist?

Huacai

>
>       Arnd
Arnd Bergmann July 2, 2024, 5:06 p.m. UTC | #11
On Tue, Jul 2, 2024, at 17:36, Huacai Chen wrote:
> On Mon, Jul 1, 2024 at 7:59 PM Arnd Bergmann <arnd@arndb.de> wrote:
>> On Sun, Jun 30, 2024, at 04:39, Xi Ruoyao wrote:
>> > On Sun, 2024-06-30 at 09:40 +0800, Huacai Chen wrote:
>> >> >
>> >> > Yes, both Linus and Christian hates introducing a new AT_ flag for
>> >> > this.
>> >> >
>> >> > This patch just makes statx(fd, NULL, AT_EMPTY_PATH, ...) behave
>> >> > like
>> >> > statx(fd, "", AT_EMPTY_PATH, ...) instead.  NULL avoids the
>> >> > performance
>> >> > issue and it's also audit-able by seccomp BPF.
>> >> To be honest, I still want to restore __ARCH_WANT_NEW_STAT. Because
>> >> even if statx() becomes audit-able, it is still blacklisted now.
>> >
>> > Then patch the sandbox to allow it.
>> >
>> > The sandbox **must** be patched anyway or it'll be broken on all 32-bit
>> > systems after 2037.  [Unless they'll unsupport all 32-bit systems before
>> > 2037.]
>>
>> More importantly, the sandbox won't be able to support any 32-bit
>> targets that support running after 2037, regardless of how long
>> the sandbox supports them: if you turn off COMPAT_32BIT_TIME today
>> in order to be sure those don't get called by accident, the
>> fallback is immediately broken.
> Would you mind if I restore newstat for LoongArch64 even if this patch exist?

I still prefer not add newstat back: it's easier to
get applications to correctly implement the statx() code
path if there are more architectures that only have that.

       Arnd
Huacai Chen July 3, 2024, 4:30 a.m. UTC | #12
On Wed, Jul 3, 2024 at 1:07 AM Arnd Bergmann <arnd@arndb.de> wrote:
>
> On Tue, Jul 2, 2024, at 17:36, Huacai Chen wrote:
> > On Mon, Jul 1, 2024 at 7:59 PM Arnd Bergmann <arnd@arndb.de> wrote:
> >> On Sun, Jun 30, 2024, at 04:39, Xi Ruoyao wrote:
> >> > On Sun, 2024-06-30 at 09:40 +0800, Huacai Chen wrote:
> >> >> >
> >> >> > Yes, both Linus and Christian hates introducing a new AT_ flag for
> >> >> > this.
> >> >> >
> >> >> > This patch just makes statx(fd, NULL, AT_EMPTY_PATH, ...) behave
> >> >> > like
> >> >> > statx(fd, "", AT_EMPTY_PATH, ...) instead.  NULL avoids the
> >> >> > performance
> >> >> > issue and it's also audit-able by seccomp BPF.
> >> >> To be honest, I still want to restore __ARCH_WANT_NEW_STAT. Because
> >> >> even if statx() becomes audit-able, it is still blacklisted now.
> >> >
> >> > Then patch the sandbox to allow it.
> >> >
> >> > The sandbox **must** be patched anyway or it'll be broken on all 32-bit
> >> > systems after 2037.  [Unless they'll unsupport all 32-bit systems before
> >> > 2037.]
> >>
> >> More importantly, the sandbox won't be able to support any 32-bit
> >> targets that support running after 2037, regardless of how long
> >> the sandbox supports them: if you turn off COMPAT_32BIT_TIME today
> >> in order to be sure those don't get called by accident, the
> >> fallback is immediately broken.
> > Would you mind if I restore newstat for LoongArch64 even if this patch exist?
>
> I still prefer not add newstat back: it's easier to
> get applications to correctly implement the statx() code
> path if there are more architectures that only have that.
Yes, we need statx-only architecures to improve statx(), and so this
patch got upstream. But I'm considering bidirectional compatibility,
which means the kernel should work with future patched and existing
un-patched sandboxes. So I think now is the correct time to add
newstat back for LoongArch --- statx() has been improved, and existing
applications want to work on LoongArch.

Huacai

>
>        Arnd
Christian Brauner July 3, 2024, 8:45 a.m. UTC | #13
On Tue, Jul 02, 2024 at 07:06:53PM GMT, Arnd Bergmann wrote:
> On Tue, Jul 2, 2024, at 17:36, Huacai Chen wrote:
> > On Mon, Jul 1, 2024 at 7:59 PM Arnd Bergmann <arnd@arndb.de> wrote:
> >> On Sun, Jun 30, 2024, at 04:39, Xi Ruoyao wrote:
> >> > On Sun, 2024-06-30 at 09:40 +0800, Huacai Chen wrote:
> >> >> >
> >> >> > Yes, both Linus and Christian hates introducing a new AT_ flag for
> >> >> > this.
> >> >> >
> >> >> > This patch just makes statx(fd, NULL, AT_EMPTY_PATH, ...) behave
> >> >> > like
> >> >> > statx(fd, "", AT_EMPTY_PATH, ...) instead.  NULL avoids the
> >> >> > performance
> >> >> > issue and it's also audit-able by seccomp BPF.
> >> >> To be honest, I still want to restore __ARCH_WANT_NEW_STAT. Because
> >> >> even if statx() becomes audit-able, it is still blacklisted now.
> >> >
> >> > Then patch the sandbox to allow it.
> >> >
> >> > The sandbox **must** be patched anyway or it'll be broken on all 32-bit
> >> > systems after 2037.  [Unless they'll unsupport all 32-bit systems before
> >> > 2037.]
> >>
> >> More importantly, the sandbox won't be able to support any 32-bit
> >> targets that support running after 2037, regardless of how long
> >> the sandbox supports them: if you turn off COMPAT_32BIT_TIME today
> >> in order to be sure those don't get called by accident, the
> >> fallback is immediately broken.
> > Would you mind if I restore newstat for LoongArch64 even if this patch exist?
> 
> I still prefer not add newstat back: it's easier to
> get applications to correctly implement the statx() code
> path if there are more architectures that only have that.

I agree.

We've now added AT_EMPTY_PATH support with NULL names because we want to
allow that generically. But I clearly remember that this was requested
to make statx() work with these sandboxes. So the kernel has done its
part. Now it's for the sandbox to allow statx() with NULL paths and
AT_EMPTY_PATH but certainly not for the kernel to start reenabling old
system calls.
Huacai Chen July 3, 2024, 9:35 a.m. UTC | #14
Hi, Christian,

On Wed, Jul 3, 2024 at 4:46 PM Christian Brauner <brauner@kernel.org> wrote:
>
> On Tue, Jul 02, 2024 at 07:06:53PM GMT, Arnd Bergmann wrote:
> > On Tue, Jul 2, 2024, at 17:36, Huacai Chen wrote:
> > > On Mon, Jul 1, 2024 at 7:59 PM Arnd Bergmann <arnd@arndb.de> wrote:
> > >> On Sun, Jun 30, 2024, at 04:39, Xi Ruoyao wrote:
> > >> > On Sun, 2024-06-30 at 09:40 +0800, Huacai Chen wrote:
> > >> >> >
> > >> >> > Yes, both Linus and Christian hates introducing a new AT_ flag for
> > >> >> > this.
> > >> >> >
> > >> >> > This patch just makes statx(fd, NULL, AT_EMPTY_PATH, ...) behave
> > >> >> > like
> > >> >> > statx(fd, "", AT_EMPTY_PATH, ...) instead.  NULL avoids the
> > >> >> > performance
> > >> >> > issue and it's also audit-able by seccomp BPF.
> > >> >> To be honest, I still want to restore __ARCH_WANT_NEW_STAT. Because
> > >> >> even if statx() becomes audit-able, it is still blacklisted now.
> > >> >
> > >> > Then patch the sandbox to allow it.
> > >> >
> > >> > The sandbox **must** be patched anyway or it'll be broken on all 32-bit
> > >> > systems after 2037.  [Unless they'll unsupport all 32-bit systems before
> > >> > 2037.]
> > >>
> > >> More importantly, the sandbox won't be able to support any 32-bit
> > >> targets that support running after 2037, regardless of how long
> > >> the sandbox supports them: if you turn off COMPAT_32BIT_TIME today
> > >> in order to be sure those don't get called by accident, the
> > >> fallback is immediately broken.
> > > Would you mind if I restore newstat for LoongArch64 even if this patch exist?
> >
> > I still prefer not add newstat back: it's easier to
> > get applications to correctly implement the statx() code
> > path if there are more architectures that only have that.
>
> I agree.
>
> We've now added AT_EMPTY_PATH support with NULL names because we want to
> allow that generically. But I clearly remember that this was requested
> to make statx() work with these sandboxes. So the kernel has done its
> part. Now it's for the sandbox to allow statx() with NULL paths and
> AT_EMPTY_PATH but certainly not for the kernel to start reenabling old
> system calls.
Linux distributions don't use latest applications, so they still need
an out-of-tree kernel patch to restore newstat. Of course they can
also patch their applications, but patching the kernel is
significantly easier.

So in my opinion LoongArch has completed its task to drive statx()
improvement, now restoring newstat is a double-insurance for
compatibility.

Huacai
Xi Ruoyao July 3, 2024, 10:07 a.m. UTC | #15
On Wed, 2024-07-03 at 17:35 +0800, Huacai Chen wrote:
> Hi, Christian,
> 
> On Wed, Jul 3, 2024 at 4:46 PM Christian Brauner <brauner@kernel.org> wrote:
> > 
> > On Tue, Jul 02, 2024 at 07:06:53PM GMT, Arnd Bergmann wrote:
> > > On Tue, Jul 2, 2024, at 17:36, Huacai Chen wrote:
> > > > On Mon, Jul 1, 2024 at 7:59 PM Arnd Bergmann <arnd@arndb.de> wrote:
> > > > > On Sun, Jun 30, 2024, at 04:39, Xi Ruoyao wrote:
> > > > > > On Sun, 2024-06-30 at 09:40 +0800, Huacai Chen wrote:
> > > > > > > > 
> > > > > > > > Yes, both Linus and Christian hates introducing a new AT_ flag for
> > > > > > > > this.
> > > > > > > > 
> > > > > > > > This patch just makes statx(fd, NULL, AT_EMPTY_PATH, ...) behave
> > > > > > > > like
> > > > > > > > statx(fd, "", AT_EMPTY_PATH, ...) instead.  NULL avoids the
> > > > > > > > performance
> > > > > > > > issue and it's also audit-able by seccomp BPF.
> > > > > > > To be honest, I still want to restore __ARCH_WANT_NEW_STAT. Because
> > > > > > > even if statx() becomes audit-able, it is still blacklisted now.
> > > > > > 
> > > > > > Then patch the sandbox to allow it.
> > > > > > 
> > > > > > The sandbox **must** be patched anyway or it'll be broken on all 32-bit
> > > > > > systems after 2037.  [Unless they'll unsupport all 32-bit systems before
> > > > > > 2037.]
> > > > > 
> > > > > More importantly, the sandbox won't be able to support any 32-bit
> > > > > targets that support running after 2037, regardless of how long
> > > > > the sandbox supports them: if you turn off COMPAT_32BIT_TIME today
> > > > > in order to be sure those don't get called by accident, the
> > > > > fallback is immediately broken.
> > > > Would you mind if I restore newstat for LoongArch64 even if this patch exist?
> > > 
> > > I still prefer not add newstat back: it's easier to
> > > get applications to correctly implement the statx() code
> > > path if there are more architectures that only have that.
> > 
> > I agree.
> > 
> > We've now added AT_EMPTY_PATH support with NULL names because we want to
> > allow that generically. But I clearly remember that this was requested
> > to make statx() work with these sandboxes. So the kernel has done its
> > part. Now it's for the sandbox to allow statx() with NULL paths and
> > AT_EMPTY_PATH but certainly not for the kernel to start reenabling old
> > system calls.
> Linux distributions don't use latest applications, so they still need
> an out-of-tree kernel patch to restore newstat. Of course they can
> also patch their applications, but patching the kernel is
> significantly easier.
> 
> So in my opinion LoongArch has completed its task to drive statx()
> improvement

It'll only be finished once the apps are adapted, or they'll stop to
work after 2037 anyway.

I've informed Firefox at
https://bugzilla.mozilla.org/show_bug.cgi?id=1673771.  For Google
products I guess someone else will have to do (I'm really unfamiliar
with their things, and they often block my proxy server despite I've
never used the proxy to attack them).

> now restoring newstat is a double-insurance for compatibility.

It may also introduce incompatibility: consider a seccomp sandbox which
does not handle fstat on LoongArch because __NR_fstat is not defined in
the UAPI header.  Now the kernel is updated to provide fstat the sandbox
will be broken: a blocklist sandbox will fail to block fstat and leave a
security hole; a whitelist sandbox will fail to allow fstat and blow up
the app if some runtime library is updated to "take the advantage" of
fstat.

My preference (most preferable to least preferable):

1. Not to add them back at all.  Just let the downstream to patch the
kernel if they must support a broken userspace.
2. Add them back with a configurable option (depending on CONFIG_EXPERT:
the distros are already enabling this anyway), make them documented
clearly as only intended to support a broken userspace and removable in
the future.
3. Add it back only for 64-bit.  Add a #if **now** for ruling it out for
32-bit despite we don't have 32-bit support, to make it clear we'll not
flatter broken userspace anymore when we make the 32-bit port.
<rant>4. Remove seccomp.  Personally I really wish to put this on the
top.</rant>

BTW has anyone tried to use Landlock for those browser sandboxes
instead?
Linus Torvalds July 3, 2024, 4:31 p.m. UTC | #16
On Wed, 3 Jul 2024 at 01:46, Christian Brauner <brauner@kernel.org> wrote:
>
> We've now added AT_EMPTY_PATH support with NULL names because we want to
> allow that generically. But I clearly remember that this was requested
> to make statx() work with these sandboxes. So the kernel has done its
> part. Now it's for the sandbox to allow statx() with NULL paths and
> AT_EMPTY_PATH but certainly not for the kernel to start reenabling old
> system calls.

Those old system calls are still used.

Just enable them.

statx isn't the promised land. Existing applications matter. And there
is absolutely nothing wrong with plain old 'stat' (well, we call it
"newstat" in the kernel for historical reasons) on 64-bit
architectures.

Honestly, 'statx' is disgusting. I don't understand why anybody pushes
that thing that nobody actually uses or cares about.

                Linus
Xi Ruoyao July 3, 2024, 4:54 p.m. UTC | #17
On Wed, 2024-07-03 at 09:31 -0700, Linus Torvalds wrote:
> On Wed, 3 Jul 2024 at 01:46, Christian Brauner <brauner@kernel.org>
> wrote:
> > 
> > We've now added AT_EMPTY_PATH support with NULL names because we
> > want to
> > allow that generically. But I clearly remember that this was
> > requested
> > to make statx() work with these sandboxes. So the kernel has done
> > its
> > part. Now it's for the sandbox to allow statx() with NULL paths and
> > AT_EMPTY_PATH but certainly not for the kernel to start reenabling
> > old
> > system calls.
> 
> Those old system calls are still used.
> 
> Just enable them.
> 
> statx isn't the promised land. Existing applications matter. And there
> is absolutely nothing wrong with plain old 'stat' (well, we call it
> "newstat" in the kernel for historical reasons) on 64-bit
> architectures.
> 
> Honestly, 'statx' is disgusting. I don't understand why anybody pushes
> that thing that nobody actually uses or cares about.

Hmm why it was added in the first place then?  Why not just NAK it?  If
someone tries to add a "seccomp sandbox" into my project I'll
immediately NAK it anyway :).

And should we add stat_time64, fstat_time64, and fstatat_time64 to stop
using statx on 32-bit platforms too as it's disgusting?

Also some bad news: Glibc has this:

#if (__WORDSIZE == 32 \
     && (!defined __SYSCALL_WORDSIZE || __SYSCALL_WORDSIZE == 32)) \
    || defined STAT_HAS_TIME32 \
    || (!defined __NR_newfstatat && !defined __NR_fstatat64)
# define FSTATAT_USE_STATX 1
#else
# define FSTATAT_USE_STATX 0
#endif

So if a LoongArch Glibc is built with Linux kernel headers >= 6.11,
it'll use fstatat **even configured --with-kernel=5.19** and fail to run
on Linux kernel <= 6.10.  This will immediately blow up building Linux
From Scratch on a host distro with an "old" kernel.

<sarcasm>Alright, some Google project matters but Glibc does not matter
because it uses a disgusting syscall in the first place.</sarcasm>

We have to add some __ASSUME_blah_blah here now.

To make things worse Glibc 2.40 is being frozen today :(.  Copying to
libc-alpha and the RM.
Linus Torvalds July 3, 2024, 5:09 p.m. UTC | #18
On Wed, 3 Jul 2024 at 09:54, Xi Ruoyao <xry111@xry111.site> wrote:
>
> > Honestly, 'statx' is disgusting. I don't understand why anybody pushes
> > that thing that nobody actually uses or cares about.
>
> Hmm why it was added in the first place then?  Why not just NAK it?

There are valid uses of statx - they are just VERY very few and far between.

For example, if you want the "change cookie" or any number of other
special things that aren't standard, you have to use statx.

But _normal_ programs will never do that. It's unportable, and it
really is a specialty interface.

Pushing 'statx' as a replacement for 'stat' is just crazy. It's a
different thing. It's not a "better" thing. It's an extension, yes,
but "extension" doesn't mean "better".

That's true when it was mis-designed in certain ways that we now have
to fix up, but PARTICULARLY true when it's nonstandard and no other OS
has it.

> And should we add stat_time64, fstat_time64, and fstatat_time64 to stop
> using statx on 32-bit platforms too as it's disgusting?

We already have 'stat64' for 32-bit platforms. We have had it for over
25 years - it predates not only the kernel git tree, it predates the
BK tree too.

I think stat64 was introduced in 2.3.34. That is literally last century.

Anybody who tries to make this about 2037 is being actively dishonest.

Why are people even discussing this pointless thing?

            Linus
Xi Ruoyao July 3, 2024, 5:11 p.m. UTC | #19
On Thu, 2024-07-04 at 00:54 +0800, Xi Ruoyao wrote:
> On Wed, 2024-07-03 at 09:31 -0700, Linus Torvalds wrote:
> > On Wed, 3 Jul 2024 at 01:46, Christian Brauner <brauner@kernel.org>
> > wrote:
> > > 
> > > We've now added AT_EMPTY_PATH support with NULL names because we
> > > want to
> > > allow that generically. But I clearly remember that this was
> > > requested
> > > to make statx() work with these sandboxes. So the kernel has done
> > > its
> > > part. Now it's for the sandbox to allow statx() with NULL paths and
> > > AT_EMPTY_PATH but certainly not for the kernel to start reenabling
> > > old
> > > system calls.
> > 
> > Those old system calls are still used.
> > 
> > Just enable them.
> > 
> > statx isn't the promised land. Existing applications matter. And there
> > is absolutely nothing wrong with plain old 'stat' (well, we call it
> > "newstat" in the kernel for historical reasons) on 64-bit
> > architectures.
> > 
> > Honestly, 'statx' is disgusting. I don't understand why anybody pushes
> > that thing that nobody actually uses or cares about.
> 
> Hmm why it was added in the first place then?  Why not just NAK it?  If
> someone tries to add a "seccomp sandbox" into my project I'll
> immediately NAK it anyway :).
> 
> And should we add stat_time64, fstat_time64, and fstatat_time64 to stop
> using statx on 32-bit platforms too as it's disgusting?
> 
> Also some bad news: Glibc has this:
> 
> #if (__WORDSIZE == 32 \
>      && (!defined __SYSCALL_WORDSIZE || __SYSCALL_WORDSIZE == 32)) \
>     || defined STAT_HAS_TIME32 \
>     || (!defined __NR_newfstatat && !defined __NR_fstatat64)
> # define FSTATAT_USE_STATX 1
> #else
> # define FSTATAT_USE_STATX 0
> #endif
> 
> So if a LoongArch Glibc is built with Linux kernel headers >= 6.11,
> it'll use fstatat **even configured --with-kernel=5.19** and fail to run
> on Linux kernel <= 6.10.  This will immediately blow up building Linux
> From Scratch on a host distro with an "old" kernel.
> 
> <sarcasm>Alright, some Google project matters but Glibc does not matter
> because it uses a disgusting syscall in the first place.</sarcasm>
> 
> We have to add some __ASSUME_blah_blah here now.
> 
> To make things worse Glibc 2.40 is being frozen today :(.  Copying to
> libc-alpha and the RM.

Alright it's not an emergency issue.  It will only blow up when we run
update-syscall-lists.py the next time.  Thus this release should be OK
and I'm going to lying flat for now.
Xi Ruoyao July 3, 2024, 5:30 p.m. UTC | #20
On Wed, 2024-07-03 at 10:09 -0700, Linus Torvalds wrote:
> > And should we add stat_time64, fstat_time64, and fstatat_time64 to stop
> > using statx on 32-bit platforms too as it's disgusting?
> 
> We already have 'stat64' for 32-bit platforms. We have had it for over
> 25 years - it predates not only the kernel git tree, it predates the
> BK tree too.
> 
> I think stat64 was introduced in 2.3.34. That is literally last century.

struct stat64 {

// ...

    int     st_atime;   /* Time of last access.  */
    unsigned int    st_atime_nsec;
    int     st_mtime;   /* Time of last modification.  */
    unsigned int    st_mtime_nsec;
    int     st_ctime;   /* Time of last status change.  */
    unsigned int    st_ctime_nsec;
    unsigned int    __unused4;
    unsigned int    __unused5;
};

> Anybody who tries to make this about 2037 is being actively dishonest.

> Why are people even discussing this pointless thing?

So are we going to drop 32-bit support before 2037?  Then yes it'd be
pointless and I can live (even easier) without 32-bit things.

Otherwise, we still have 13 years before 2037 but this does not render
the thing pointless.  We still have to provide a 64-bit time stamp soon
or later.
Linus Torvalds July 3, 2024, 5:40 p.m. UTC | #21
On Wed, 3 Jul 2024 at 10:30, Xi Ruoyao <xry111@xry111.site> wrote:
>
> struct stat64 {
>
> // ...
>
>     int     st_atime;   /* Time of last access.  */

Oh wow. Shows just *how* long ago that was - and how long ago I looked
at 32-bit code. Because clearly, I was wrong.

I guess it shows how nobody actually cares about 32-bit any more, at
least in the 2037 sense.

The point stands, though - statx isn't a replacement for existing binaries.

             Linus
Linus Torvalds July 3, 2024, 5:54 p.m. UTC | #22
On Wed, 3 Jul 2024 at 10:40, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> Oh wow. Shows just *how* long ago that was - and how long ago I looked
> at 32-bit code. Because clearly, I was wrong.

Ok, so clearly any *new* 32-bit architecture should use 'struct statx'
as 'struct stat', and at least avoid the conversion pain.

Of course, if using <asm-generic/stat.h> like loongarch does, that is
very much not what happens. You get those old models with just 'long'.

So any architecture that didn't do that 'stat == statx' and has
binaries with old stat models should just continue to have them.

It's not like we can get rid of the kernel side code for that all _anyway_.

             Linus
Christian Brauner July 3, 2024, 6:14 p.m. UTC | #23
On Wed, Jul 03, 2024 at 10:54:53AM GMT, Linus Torvalds wrote:
> On Wed, 3 Jul 2024 at 10:40, Linus Torvalds
> <torvalds@linux-foundation.org> wrote:
> >
> > Oh wow. Shows just *how* long ago that was - and how long ago I looked
> > at 32-bit code. Because clearly, I was wrong.
> 
> Ok, so clearly any *new* 32-bit architecture should use 'struct statx'
> as 'struct stat', and at least avoid the conversion pain.
> 
> Of course, if using <asm-generic/stat.h> like loongarch does, that is
> very much not what happens. You get those old models with just 'long'.
> 
> So any architecture that didn't do that 'stat == statx' and has
> binaries with old stat models should just continue to have them.
> 
> It's not like we can get rid of the kernel side code for that all _anyway_.

Fwiw, the original motivation for that whole "let's do NULL with
AT_EMPTY_PATH" (somewhat independent from the generic use of it) that
somehow morphed into this discussion was that the Chrome Sandbox has
rewrites fstatat() system calls to fstat() via SECCOMP_RET_TRAP:

  if (args.nr == __NR_fstatat_default) {
    if (*reinterpret_cast<const char*>(args.args[1]) == '\0' &&
        args.args[3] == static_cast<uint64_t>(AT_EMPTY_PATH)) {
      return syscall(__NR_fstat_default, static_cast<int>(args.args[0]),
                     reinterpret_cast<default_stat_struct*>(args.args[2]));
    }

while also disabling statx() completely because they can't (easily)
rewrite it and don't want to allow it unless we have NULL for
AT_EMPTY_PATH (which we'll have soon ofc).

In any case in [1] I proposed they add back fstat()/fstatat64() which
should get that problem solved because they can rewrite that thing.

In any case, which one of these does a new architecture have to add for
reasonable backward compatibility:

fstat()
fstat64()
fstatat64()

lstat()
lstat64()

stat()
stat64()
statx()

newstat()
newlstat()
newfstat()
newfstatat()

Because really that's a complete mess and we have all sorts of overflow
issues and odd failures in the varioius variants. And the userspace
ifdefery in libcs is just as bad if not very much worse.

[1]: https://lore.kernel.org/lkml/20240226-altmodisch-gedeutet-91c5ba2f6071@brauner
Christian Brauner July 3, 2024, 6:39 p.m. UTC | #24
On Wed, Jul 03, 2024 at 08:14:15PM GMT, Christian Brauner wrote:
> On Wed, Jul 03, 2024 at 10:54:53AM GMT, Linus Torvalds wrote:
> > On Wed, 3 Jul 2024 at 10:40, Linus Torvalds
> > <torvalds@linux-foundation.org> wrote:
> > >
> > > Oh wow. Shows just *how* long ago that was - and how long ago I looked
> > > at 32-bit code. Because clearly, I was wrong.
> > 
> > Ok, so clearly any *new* 32-bit architecture should use 'struct statx'
> > as 'struct stat', and at least avoid the conversion pain.
> > 
> > Of course, if using <asm-generic/stat.h> like loongarch does, that is
> > very much not what happens. You get those old models with just 'long'.
> > 
> > So any architecture that didn't do that 'stat == statx' and has
> > binaries with old stat models should just continue to have them.
> > 
> > It's not like we can get rid of the kernel side code for that all _anyway_.
> 
> Fwiw, the original motivation for that whole "let's do NULL with
> AT_EMPTY_PATH" (somewhat independent from the generic use of it) that
> somehow morphed into this discussion was that the Chrome Sandbox has
> rewrites fstatat() system calls to fstat() via SECCOMP_RET_TRAP:
> 
>   if (args.nr == __NR_fstatat_default) {
>     if (*reinterpret_cast<const char*>(args.args[1]) == '\0' &&
>         args.args[3] == static_cast<uint64_t>(AT_EMPTY_PATH)) {
>       return syscall(__NR_fstat_default, static_cast<int>(args.args[0]),
>                      reinterpret_cast<default_stat_struct*>(args.args[2]));
>     }
> 
> while also disabling statx() completely because they can't (easily)
> rewrite it and don't want to allow it unless we have NULL for
> AT_EMPTY_PATH (which we'll have soon ofc).
> 
> In any case in [1] I proposed they add back fstat()/fstatat64() which
> should get that problem solved because they can rewrite that thing.
> 
> In any case, which one of these does a new architecture have to add for
> reasonable backward compatibility:

Going by riscv added in 2017 it would be:

newstat()
newlstat()
newfstat()
newfstatat()
statx()

> 
> fstat()
> fstat64()
> fstatat64()
> 
> lstat()
> lstat64()
> 
> stat()
> stat64()
> statx()
> 
> newstat()
> newlstat()
> newfstat()
> newfstatat()
> 
> Because really that's a complete mess and we have all sorts of overflow
> issues and odd failures in the varioius variants. And the userspace
> ifdefery in libcs is just as bad if not very much worse.
> 
> [1]: https://lore.kernel.org/lkml/20240226-altmodisch-gedeutet-91c5ba2f6071@brauner
Arnd Bergmann July 3, 2024, 6:44 p.m. UTC | #25
On Wed, Jul 3, 2024, at 19:40, Linus Torvalds wrote:
> On Wed, 3 Jul 2024 at 10:30, Xi Ruoyao <xry111@xry111.site> wrote:
>>
>> struct stat64 {
>>
>> // ...
>>
>>     int     st_atime;   /* Time of last access.  */
>
> Oh wow. Shows just *how* long ago that was - and how long ago I looked
> at 32-bit code. Because clearly, I was wrong.
>
> I guess it shows how nobody actually cares about 32-bit any more, at
> least in the 2037 sense.
>
> The point stands, though - statx isn't a replacement for existing binaries.

We had long discussions about adding another stat()/fstat()
variant with 64-bit timestamps from 2012 to 2017, the result
was that we mandated that a libc implementation with 64-bit
time_t must only use statx() and not fall back to the time32
syscalls on kernels that are new enough to have statx().
This is both for architectures that were introduced after
time64 support was added (riscv32 and the glibc port for
arc), and for userspace builds that are explicitly using
time64 syscalls only.

That may have been a mistake in hindsight, or it may have
been the right choice, but the thing is that if we now decide
that 32-bit userspace can not rely on statx() to be available,
then we need to introduce one or two new system calls.

    Arnd
Xi Ruoyao July 3, 2024, 6:48 p.m. UTC | #26
On Wed, 2024-07-03 at 10:54 -0700, Linus Torvalds wrote:
> On Wed, 3 Jul 2024 at 10:40, Linus Torvalds
> <torvalds@linux-foundation.org> wrote:
> > 
> > Oh wow. Shows just *how* long ago that was - and how long ago I
> > looked
> > at 32-bit code. Because clearly, I was wrong.
> 
> Ok, so clearly any *new* 32-bit architecture should use 'struct statx'
> as 'struct stat', and at least avoid the conversion pain.
> 
> Of course, if using <asm-generic/stat.h> like loongarch does, that is
> very much not what happens. You get those old models with just 'long'.

Fortunately LoongArch ILP32 ABI is not finalized yet (there's no 32-bit
kernel and 64-bit kernel does not support 32-bit userspace yet) so we
can still make it happen to use struct statx as (userspace) struct
stat...
Linus Torvalds July 3, 2024, 7 p.m. UTC | #27
On Wed, 3 Jul 2024 at 11:14, Christian Brauner <brauner@kernel.org> wrote:
>
> In any case, which one of these does a new architecture have to add for
> reasonable backward compatibility:
>
> fstat()
> fstat64()
> fstatat64()
>
> lstat()
> lstat64()
>
> stat()
> stat64()
> statx()
>
> newstat()
> newlstat()
> newfstat()
> newfstatat()

Well, I do think that a *new* architecture should indeed add all of
those, but make the 'struct stat' for all of them be the same.

So then if people do the system call rewriting thing - or just do the
manual system call thing with

    syscall(__NR_xyz, ...)

it is all available, but it ends up being all the same code.

Wouldn't that be lovely?

Of course, I also happen to think that "new architecture" and "32-bit"
is just crazy to begin with, so honestly, I don't even care. 32-bit
architectures aren't really relevant for any new development, and I
think we should just codify that.

And on 64-bit architectures, the standard 'stat' works fine.

            Linus
Linus Torvalds July 3, 2024, 7:05 p.m. UTC | #28
On Wed, 3 Jul 2024 at 11:48, Xi Ruoyao <xry111@xry111.site> wrote:
>
> Fortunately LoongArch ILP32 ABI is not finalized yet (there's no 32-bit
> kernel and 64-bit kernel does not support 32-bit userspace yet) so we
> can still make it happen to use struct statx as (userspace) struct
> stat...

Oh, no problem then. If there are no existing binaries, then yes,
please do that,

It avoids the compat issues too.

I think 'struct statx' is a horrid bloated thing (clearing those extra
"spare" words is a pain, and yes, the user copy for _regular_ 'stat()'
already shows up in profiles), but for some new 32-bit platform it's
definitely worth the pain just to avoid the compat code or new
structure definitions.

              Linus
Linus Torvalds July 3, 2024, 7:18 p.m. UTC | #29
On Wed, 3 Jul 2024 at 12:00, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> Well, I do think that a *new* architecture should indeed add all of
> those, but make the 'struct stat' for all of them be the same.

Just to clarify: by "all of those" I don't mean all the
stat64/oldstat/newstat variants that we have for compatibility with
older ABI's, but all the "calling conventions" variants.

IOW, all of stat/lstat/fstat and statx should exist as separate system
calls, and libc shouldn't have to rewrite arguments to make one into
another.

(And yes, things like 'strace()' and friends should show what the app
did, not what glibc had to rewrite things as, which is a private pet
peeve of mine)

         Linus
Christian Brauner July 3, 2024, 7:33 p.m. UTC | #30
On Wed, Jul 03, 2024 at 12:05:04PM GMT, Linus Torvalds wrote:
> On Wed, 3 Jul 2024 at 11:48, Xi Ruoyao <xry111@xry111.site> wrote:
> >
> > Fortunately LoongArch ILP32 ABI is not finalized yet (there's no 32-bit
> > kernel and 64-bit kernel does not support 32-bit userspace yet) so we
> > can still make it happen to use struct statx as (userspace) struct
> > stat...
> 
> Oh, no problem then. If there are no existing binaries, then yes,
> please do that,
> 
> It avoids the compat issues too.
> 
> I think 'struct statx' is a horrid bloated thing (clearing those extra
> "spare" words is a pain, and yes, the user copy for _regular_ 'stat()'
> already shows up in profiles), but for some new 32-bit platform it's
> definitely worth the pain just to avoid the compat code or new
> structure definitions.

Fwiw, that's why I prefer structs versioned by size which we added clean
handling for via copy_struct_from_user() as in e.g., struct clone_args,
struct mount_setattr, struct mnt_id_req and so on because then you don't
have such problems.

If the struct gets extended 100 times each time adding a 64 bit value
but all the caller cares about is the base information then they can
just pass the first, minimal struct version and be done with it. No need
to reserve any space for unknown future extensions as well.
Linus Torvalds July 3, 2024, 7:52 p.m. UTC | #31
On Wed, 3 Jul 2024 at 12:33, Christian Brauner <brauner@kernel.org> wrote:
>
> Fwiw, that's why I prefer structs versioned by size which we added clean
> handling for via copy_struct_from_user()

That works very well for the kernel interface for new things, but it
actually doesn't solve the issue for user space library versioning.

If you are something like 'glibc', you don't have the option of saying
"pass in struct and size". You are kind of stuck with the API rules,
and the rules are that you expose a 'struct stat' that has a fixed
size.

So I don't disagree that copy_struct_from_user() is a good model, but
what would happen is just that then glibc says "I will need to make a
decision", and would pick a size that is bigger than the current size
it uses, so that glibc later could do those extensions without
breaking the ABI.

And yes, it would pass that larger size to the kernel,. because it
would want the kernel to zero out the unused tail of the struct.

So the 'struct and extensible size' thing really only works when
everybody agrees on using it, and users pass the size end-to-end.

Side note: this is our original i386 'stat64':

        unsigned long   st_blocks;      /* Number 512-byte blocks allocated. */
        unsigned long   __pad4;         /* future possible st_blocks
high bits */

        unsigned long   st_atime;
        unsigned long   __pad5;

        unsigned long   st_mtime;
        unsigned long   __pad6;

        unsigned long   st_ctime;
        unsigned long   __pad7;         /* will be high 32 bits of
ctime someday */

which is kind of sad. The code was literally designed to extend the
time range, had a comment to that effect and all, and then we screwed
it up.

On little-endian, we could literally have done it as

        unsigned long long  st_ctime:44, st_ctime_usec:20;

without losin gbinary compatibility. But it's sadly not what we did.
Instead we gave the full 32-bit padding to the nsec field.

And yes, I had to go back a long time to find this screw-up. It
happened back in 2002.

Oh well. Not the first time we've royally screwed up, and it most
definitely won't be the last time either.

           Linus
Christian Brauner July 3, 2024, 7:55 p.m. UTC | #32
On Wed, Jul 03, 2024 at 08:44:50PM GMT, Arnd Bergmann wrote:
> On Wed, Jul 3, 2024, at 19:40, Linus Torvalds wrote:
> > On Wed, 3 Jul 2024 at 10:30, Xi Ruoyao <xry111@xry111.site> wrote:
> >>
> >> struct stat64 {
> >>
> >> // ...
> >>
> >>     int     st_atime;   /* Time of last access.  */
> >
> > Oh wow. Shows just *how* long ago that was - and how long ago I looked
> > at 32-bit code. Because clearly, I was wrong.
> >
> > I guess it shows how nobody actually cares about 32-bit any more, at
> > least in the 2037 sense.
> >
> > The point stands, though - statx isn't a replacement for existing binaries.
> 
> We had long discussions about adding another stat()/fstat()
> variant with 64-bit timestamps from 2012 to 2017, the result
> was that we mandated that a libc implementation with 64-bit
> time_t must only use statx() and not fall back to the time32
> syscalls on kernels that are new enough to have statx().
> This is both for architectures that were introduced after
> time64 support was added (riscv32 and the glibc port for
> arc), and for userspace builds that are explicitly using
> time64 syscalls only.
> 
> That may have been a mistake in hindsight, or it may have
> been the right choice, but the thing is that if we now decide
> that 32-bit userspace can not rely on statx() to be available,
> then we need to introduce one or two new system calls.

I'm not sure we need to now pull the rug out from everyone now and I
don't think this was where the discussion was going. Any new
architecture will implement statx(). And for 32bit I think that's
entirely fine and we don't need to add even more variants just for this
case. I don't think we need to add newnewstat_promiseitsthelastone().
Huacai Chen July 4, 2024, 2:38 a.m. UTC | #33
On Thu, Jul 4, 2024 at 12:54 AM Xi Ruoyao <xry111@xry111.site> wrote:
>
> On Wed, 2024-07-03 at 09:31 -0700, Linus Torvalds wrote:
> > On Wed, 3 Jul 2024 at 01:46, Christian Brauner <brauner@kernel.org>
> > wrote:
> > >
> > > We've now added AT_EMPTY_PATH support with NULL names because we
> > > want to
> > > allow that generically. But I clearly remember that this was
> > > requested
> > > to make statx() work with these sandboxes. So the kernel has done
> > > its
> > > part. Now it's for the sandbox to allow statx() with NULL paths and
> > > AT_EMPTY_PATH but certainly not for the kernel to start reenabling
> > > old
> > > system calls.
> >
> > Those old system calls are still used.
> >
> > Just enable them.
> >
> > statx isn't the promised land. Existing applications matter. And there
> > is absolutely nothing wrong with plain old 'stat' (well, we call it
> > "newstat" in the kernel for historical reasons) on 64-bit
> > architectures.
> >
> > Honestly, 'statx' is disgusting. I don't understand why anybody pushes
> > that thing that nobody actually uses or cares about.
>
> Hmm why it was added in the first place then?  Why not just NAK it?  If
> someone tries to add a "seccomp sandbox" into my project I'll
> immediately NAK it anyway :).
>
> And should we add stat_time64, fstat_time64, and fstatat_time64 to stop
> using statx on 32-bit platforms too as it's disgusting?
>
> Also some bad news: Glibc has this:
>
> #if (__WORDSIZE == 32 \
>      && (!defined __SYSCALL_WORDSIZE || __SYSCALL_WORDSIZE == 32)) \
>     || defined STAT_HAS_TIME32 \
>     || (!defined __NR_newfstatat && !defined __NR_fstatat64)
> # define FSTATAT_USE_STATX 1
> #else
> # define FSTATAT_USE_STATX 0
> #endif
>
> So if a LoongArch Glibc is built with Linux kernel headers >= 6.11,
> it'll use fstatat **even configured --with-kernel=5.19** and fail to run
> on Linux kernel <= 6.10.  This will immediately blow up building Linux
> From Scratch on a host distro with an "old" kernel.
The patch which adds newstat back will CC the stable list and be
backported to old kernels.

Huacai

>
> <sarcasm>Alright, some Google project matters but Glibc does not matter
> because it uses a disgusting syscall in the first place.</sarcasm>
>
> We have to add some __ASSUME_blah_blah here now.
>
> To make things worse Glibc 2.40 is being frozen today :(.  Copying to
> libc-alpha and the RM.
>
> --
> Xi Ruoyao <xry111@xry111.site>
> School of Aerospace Science and Technology, Xidian University
Xi Ruoyao July 4, 2024, 3:23 a.m. UTC | #34
On Thu, 2024-07-04 at 10:38 +0800, Huacai Chen wrote:
> > So if a LoongArch Glibc is built with Linux kernel headers >= 6.11,
> > it'll use fstatat **even configured --with-kernel=5.19** and fail to run
> > on Linux kernel <= 6.10.  This will immediately blow up building Linux
> > From Scratch on a host distro with an "old" kernel.
> The patch which adds newstat back will CC the stable list and be
> backported to old kernels.

AFAIK in Glibc --enable-kernel=x.y (not with, I was too sleepy
yesterday) means it'll work with even x.y.0.  And even if we "re-
purpose" x.y to mean "the latest x.y patch release" people can still
explicitly spell the patch level, like --enable-kernel=5.19.0.

Thus we still need to handle this in Glibc.

And the backport will raise another question: assume 6.6.40 gets the
backport, what should we do with --enable-kernel=6.6.40?  Maybe we
should we assume newfstatat is available but then people will start to
complain "hey 6.9.7 > 6.6.40 but my Glibc configured with --enable-
kernel=6.6.40 does not work on 6.9.7"...

To me the only rational way seems only assuming 6.11 or later has
newfstatat on LoongArch.
Xi Ruoyao July 4, 2024, 4:14 a.m. UTC | #35
On Thu, 2024-07-04 at 11:23 +0800, Xi Ruoyao wrote:
> On Thu, 2024-07-04 at 10:38 +0800, Huacai Chen wrote:
> > > So if a LoongArch Glibc is built with Linux kernel headers >= 6.11,
> > > it'll use fstatat **even configured --with-kernel=5.19** and fail to run
> > > on Linux kernel <= 6.10.  This will immediately blow up building Linux
> > > From Scratch on a host distro with an "old" kernel.
> > The patch which adds newstat back will CC the stable list and be
> > backported to old kernels.
> 
> AFAIK in Glibc --enable-kernel=x.y (not with, I was too sleepy
> yesterday) means it'll work with even x.y.0.  And even if we "re-
> purpose" x.y to mean "the latest x.y patch release" people can still
> explicitly spell the patch level, like --enable-kernel=5.19.0.
> 
> Thus we still need to handle this in Glibc.
> 
> And the backport will raise another question: assume 6.6.40 gets the
> backport, what should we do with --enable-kernel=6.6.40?  Maybe we
> should we assume newfstatat is available but then people will start to
> complain "hey 6.9.7 > 6.6.40 but my Glibc configured with --enable-
> kernel=6.6.40 does not work on 6.9.7"...
> 
> To me the only rational way seems only assuming 6.11 or later

Or the first 6.10.x which will get the backport.

> has newfstatat on LoongArch.
Florian Weimer July 4, 2024, 5:55 a.m. UTC | #36
* Xi Ruoyao:

> Also some bad news: Glibc has this:
>
> #if (__WORDSIZE == 32 \
>      && (!defined __SYSCALL_WORDSIZE || __SYSCALL_WORDSIZE == 32)) \
>     || defined STAT_HAS_TIME32 \
>     || (!defined __NR_newfstatat && !defined __NR_fstatat64)
> # define FSTATAT_USE_STATX 1
> #else
> # define FSTATAT_USE_STATX 0
> #endif

These __NR_* constants come from the glibc headers, not the kernel
headers.  In other words, the result of the preprocessor condition does
not depend on the kernel header version.

Thanks,
Florian
Xi Ruoyao July 4, 2024, 6:02 a.m. UTC | #37
On Thu, 2024-07-04 at 07:55 +0200, Florian Weimer wrote:
> * Xi Ruoyao:
> 
> > Also some bad news: Glibc has this:
> > 
> > #if (__WORDSIZE == 32 \
> >      && (!defined __SYSCALL_WORDSIZE || __SYSCALL_WORDSIZE == 32)) \
> >     || defined STAT_HAS_TIME32 \
> >     || (!defined __NR_newfstatat && !defined __NR_fstatat64)
> > # define FSTATAT_USE_STATX 1
> > #else
> > # define FSTATAT_USE_STATX 0
> > #endif
> 
> These __NR_* constants come from the glibc headers, not the kernel
> headers.  In other words, the result of the preprocessor condition does
> not depend on the kernel header version.

Yes, I realized it in https://sourceware.org/pipermail/libc-alpha/2024-
July/157975.html and 2.40 should be fine.  But the __NR_* constants will
be there once we run update-syscall-lists.py, so we still need to handle
the issue in the Glibc 2.41 dev cycle.
diff mbox series

Patch

diff --git a/fs/internal.h b/fs/internal.h
index 1caa6a8f666f..0a018ebcaf49 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -244,6 +244,8 @@  extern const struct dentry_operations ns_dentry_operations;
 int getname_statx_lookup_flags(int flags);
 int do_statx(int dfd, struct filename *filename, unsigned int flags,
 	     unsigned int mask, struct statx __user *buffer);
+int do_statx_fd(int fd, unsigned int flags, unsigned int mask,
+		struct statx __user *buffer);
 
 /*
  * fs/splice.c:
diff --git a/fs/stat.c b/fs/stat.c
index 106684034fdb..1214826f3a36 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -214,6 +214,43 @@  int getname_statx_lookup_flags(int flags)
 	return lookup_flags;
 }
 
+static int vfs_statx_path(struct path *path, int flags, struct kstat *stat,
+			  u32 request_mask)
+{
+	int error = vfs_getattr(path, stat, request_mask, flags);
+
+	if (request_mask & STATX_MNT_ID_UNIQUE) {
+		stat->mnt_id = real_mount(path->mnt)->mnt_id_unique;
+		stat->result_mask |= STATX_MNT_ID_UNIQUE;
+	} else {
+		stat->mnt_id = real_mount(path->mnt)->mnt_id;
+		stat->result_mask |= STATX_MNT_ID;
+	}
+
+	if (path->mnt->mnt_root == path->dentry)
+		stat->attributes |= STATX_ATTR_MOUNT_ROOT;
+	stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT;
+
+	/* Handle STATX_DIOALIGN for block devices. */
+	if (request_mask & STATX_DIOALIGN) {
+		struct inode *inode = d_backing_inode(path->dentry);
+
+		if (S_ISBLK(inode->i_mode))
+			bdev_statx_dioalign(inode, stat);
+	}
+
+	return error;
+}
+
+static int vfs_statx_fd(int fd, int flags, struct kstat *stat,
+			  u32 request_mask)
+{
+	CLASS(fd_raw, f)(fd);
+	if (!f.file)
+		return -EBADF;
+	return vfs_statx_path(&f.file->f_path, flags, stat, request_mask);
+}
+
 /**
  * vfs_statx - Get basic and extra attributes by filename
  * @dfd: A file descriptor representing the base dir for a relative filename
@@ -243,36 +280,13 @@  static int vfs_statx(int dfd, struct filename *filename, int flags,
 retry:
 	error = filename_lookup(dfd, filename, lookup_flags, &path, NULL);
 	if (error)
-		goto out;
-
-	error = vfs_getattr(&path, stat, request_mask, flags);
-
-	if (request_mask & STATX_MNT_ID_UNIQUE) {
-		stat->mnt_id = real_mount(path.mnt)->mnt_id_unique;
-		stat->result_mask |= STATX_MNT_ID_UNIQUE;
-	} else {
-		stat->mnt_id = real_mount(path.mnt)->mnt_id;
-		stat->result_mask |= STATX_MNT_ID;
-	}
-
-	if (path.mnt->mnt_root == path.dentry)
-		stat->attributes |= STATX_ATTR_MOUNT_ROOT;
-	stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT;
-
-	/* Handle STATX_DIOALIGN for block devices. */
-	if (request_mask & STATX_DIOALIGN) {
-		struct inode *inode = d_backing_inode(path.dentry);
-
-		if (S_ISBLK(inode->i_mode))
-			bdev_statx_dioalign(inode, stat);
-	}
-
+		return error;
+	error = vfs_statx_path(&path, flags, stat, request_mask);
 	path_put(&path);
 	if (retry_estale(error, lookup_flags)) {
 		lookup_flags |= LOOKUP_REVAL;
 		goto retry;
 	}
-out:
 	return error;
 }
 
@@ -677,6 +691,29 @@  int do_statx(int dfd, struct filename *filename, unsigned int flags,
 	return cp_statx(&stat, buffer);
 }
 
+int do_statx_fd(int fd, unsigned int flags, unsigned int mask,
+	     struct statx __user *buffer)
+{
+	struct kstat stat;
+	int error;
+
+	if (mask & STATX__RESERVED)
+		return -EINVAL;
+	if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_TYPE)
+		return -EINVAL;
+
+	/* STATX_CHANGE_COOKIE is kernel-only for now. Ignore requests
+	 * from userland.
+	 */
+	mask &= ~STATX_CHANGE_COOKIE;
+
+	error = vfs_statx_fd(fd, flags, &stat, mask);
+	if (error)
+		return error;
+
+	return cp_statx(&stat, buffer);
+}
+
 /**
  * sys_statx - System call to get enhanced stats
  * @dfd: Base directory to pathwalk from *or* fd to stat.
@@ -696,6 +733,9 @@  SYSCALL_DEFINE5(statx,
 	int ret;
 	struct filename *name;
 
+	if (flags == AT_EMPTY_PATH && vfs_empty_path(dfd, filename))
+		return do_statx_fd(dfd, flags, mask, buffer);
+
 	name = getname_flags(filename, getname_statx_lookup_flags(flags), NULL);
 	ret = do_statx(dfd, name, flags, mask, buffer);
 	putname(name);
diff --git a/io_uring/statx.c b/io_uring/statx.c
index abb874209caa..fe967ecb1762 100644
--- a/io_uring/statx.c
+++ b/io_uring/statx.c
@@ -23,6 +23,7 @@  struct io_statx {
 int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_statx *sx = io_kiocb_to_cmd(req, struct io_statx);
+	struct filename *filename;
 	const char __user *path;
 
 	if (sqe->buf_index || sqe->splice_fd_in)
@@ -36,15 +37,14 @@  int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	sx->buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
 	sx->flags = READ_ONCE(sqe->statx_flags);
 
-	sx->filename = getname_flags(path,
-				     getname_statx_lookup_flags(sx->flags),
-				     NULL);
-
-	if (IS_ERR(sx->filename)) {
-		int ret = PTR_ERR(sx->filename);
-
-		sx->filename = NULL;
-		return ret;
+	sx->filename = NULL;
+	if (!(sx->flags == AT_EMPTY_PATH && vfs_empty_path(sx->dfd, path))) {
+		filename = getname_flags(path,
+					 getname_statx_lookup_flags(sx->flags),
+					 NULL);
+		if (IS_ERR(filename))
+			return PTR_ERR(filename);
+		sx->filename = filename;
 	}
 
 	req->flags |= REQ_F_NEED_CLEANUP;
@@ -59,7 +59,10 @@  int io_statx(struct io_kiocb *req, unsigned int issue_flags)
 
 	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
 
-	ret = do_statx(sx->dfd, sx->filename, sx->flags, sx->mask, sx->buffer);
+	if (sx->filename == NULL)
+		ret = do_statx_fd(sx->dfd, sx->flags, sx->mask, sx->buffer);
+	else
+		ret = do_statx(sx->dfd, sx->filename, sx->flags, sx->mask, sx->buffer);
 	io_req_set_res(req, ret, 0);
 	return IOU_OK;
 }