
[RFC,3/3] add listmnt(2) syscall

Message ID 20230913152238.905247-4-mszeredi@redhat.com (mailing list archive)
State New, archived
Series querying mount attributes

Commit Message

Miklos Szeredi Sept. 13, 2023, 3:22 p.m. UTC
Add a way to query the children of a particular mount.  This is a more
flexible way to iterate the mount tree than having to parse the complete
/proc/self/mountinfo.

Look up the mount by the old (32bit) or new (64bit) mount ID.  If a mount
needs to be queried based on a path, then statx(2) can be used to first
query the mount ID belonging to that path.

Return an array of new (64bit) mount IDs.  Without privileges, only mounts
reachable from the task's root are listed.

Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
---
 arch/x86/entry/syscalls/syscall_64.tbl |  1 +
 fs/namespace.c                         | 51 ++++++++++++++++++++++++++
 include/linux/syscalls.h               |  2 +
 include/uapi/asm-generic/unistd.h      |  5 ++-
 4 files changed, 58 insertions(+), 1 deletion(-)
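
For illustration, a rough userspace sketch of the flow described above:
statx(2) yields the mount ID for a path, which is then passed to listmnt().
It assumes the x86-64 syscall number 455 proposed in this RFC, a libc that
exposes statx() and STATX_MNT_ID, and that bufsize counts u64 slots as in
do_listmnt() below; none of this is settled ABI.

  #define _GNU_SOURCE
  #include <fcntl.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <sys/stat.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  #ifndef __NR_listmnt
  #define __NR_listmnt 455                /* x86-64 number proposed here */
  #endif

  int main(int argc, char *argv[])
  {
          const char *path = argc > 1 ? argv[1] : "/";
          uint64_t children[256];
          struct statx sx;
          long n;

          /* statx(2) tells us which mount the path lives on. */
          if (statx(AT_FDCWD, path, 0, STATX_MNT_ID, &sx) == -1)
                  return 1;

          /* List the children of that mount (bufsize = number of slots). */
          n = syscall(__NR_listmnt, (uint64_t)sx.stx_mnt_id, children,
                      sizeof(children) / sizeof(children[0]), 0);
          if (n < 0)
                  return 1;

          for (long i = 0; i < n; i++)
                  printf("child mount id: %llu\n",
                         (unsigned long long)children[i]);
          return 0;
  }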

Comments

Amir Goldstein Sept. 14, 2023, 6 a.m. UTC | #1
On Wed, Sep 13, 2023 at 6:22 PM Miklos Szeredi <mszeredi@redhat.com> wrote:
>
> Add way to query the children of a particular mount.  This is a more
> flexible way to iterate the mount tree than having to parse the complete
> /proc/self/mountinfo.
>
> Lookup the mount by the old (32bit) or new (64bit) mount ID.  If a mount
> needs to be queried based on path, then statx(2) can be used to first query
> the mount ID belonging to the path.
>
> Return an array of new (64bit) mount ID's.  Without privileges only mounts
> are listed which are reachable from the task's root.
>
> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
> ---
>  arch/x86/entry/syscalls/syscall_64.tbl |  1 +
>  fs/namespace.c                         | 51 ++++++++++++++++++++++++++
>  include/linux/syscalls.h               |  2 +
>  include/uapi/asm-generic/unistd.h      |  5 ++-
>  4 files changed, 58 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index 6d807c30cd16..0d9a47b0ce9b 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -376,6 +376,7 @@
>  452    common  fchmodat2               sys_fchmodat2
>  453    64      map_shadow_stack        sys_map_shadow_stack
>  454    common  statmnt                 sys_statmnt
> +455    common  listmnt                 sys_listmnt
>
>  #
>  # Due to a historical design error, certain syscalls are numbered differently
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 088a52043bba..5362b1ffb26f 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -4988,6 +4988,57 @@ SYSCALL_DEFINE5(statmnt, u64, mnt_id,
>         return err;
>  }
>
> +static long do_listmnt(struct vfsmount *mnt, u64 __user *buf, size_t bufsize,
> +                     const struct path *root)
> +{
> +       struct mount *r, *m = real_mount(mnt);
> +       struct path rootmnt = { .mnt = root->mnt, .dentry = root->mnt->mnt_root };
> +       long ctr = 0;
> +
> +       if (!capable(CAP_SYS_ADMIN) &&
> +           !is_path_reachable(m, mnt->mnt_root, &rootmnt))
> +               return -EPERM;
> +
> +       list_for_each_entry(r, &m->mnt_mounts, mnt_child) {
> +               if (!capable(CAP_SYS_ADMIN) &&
> +                   !is_path_reachable(r, r->mnt.mnt_root, root))
> +                       continue;
> +
> +               if (ctr >= bufsize)
> +                       return -EOVERFLOW;
> +               if (put_user(r->mnt_id_unique, buf + ctr))
> +                       return -EFAULT;
> +               ctr++;
> +               if (ctr < 0)
> +                       return -ERANGE;

I think it'd be good for userspace to be able to query required
bufsize with NULL buf, listattr style, rather than having to
guess and re-guess on EOVERFLOW.

Thanks,
Amir.

> +       }
> +       return ctr;
> +}
> +
> +SYSCALL_DEFINE4(listmnt, u64, mnt_id, u64 __user *, buf, size_t, bufsize,
> +               unsigned int, flags)
> +{
> +       struct vfsmount *mnt;
> +       struct path root;
> +       long err;
> +
> +       if (flags)
> +               return -EINVAL;
> +
> +       down_read(&namespace_sem);
> +       mnt = lookup_mnt_in_ns(mnt_id, current->nsproxy->mnt_ns);
> +       err = -ENOENT;
> +       if (mnt) {
> +               get_fs_root(current->fs, &root);
> +               err = do_listmnt(mnt, buf, bufsize, &root);
> +               path_put(&root);
> +       }
> +       up_read(&namespace_sem);
> +
> +       return err;
> +}
> +
> +
>  static void __init init_mount_tree(void)
>  {
>         struct vfsmount *mnt;
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 1099bd307fa7..5d776cdb6f18 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -411,6 +411,8 @@ asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz,
>  asmlinkage long sys_statmnt(u64 mnt_id, u64 mask,
>                             struct statmnt __user *buf, size_t bufsize,
>                             unsigned int flags);
> +asmlinkage long sys_listmnt(u64 mnt_id, u64 __user *buf, size_t bufsize,
> +                           unsigned int flags);
>  asmlinkage long sys_truncate(const char __user *path, long length);
>  asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length);
>  #if BITS_PER_LONG == 32
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index 640997231ff6..a2b41370f603 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -826,8 +826,11 @@ __SYSCALL(__NR_fchmodat2, sys_fchmodat2)
>  #define __NR_statmnt   454
>  __SYSCALL(__NR_statmnt, sys_statmnt)
>
> +#define __NR_listmnt   455
> +__SYSCALL(__NR_listmnt, sys_listmnt)
> +
>  #undef __NR_syscalls
> -#define __NR_syscalls 455
> +#define __NR_syscalls 456
>
>  /*
>   * 32 bit systems traditionally used different
> --
> 2.41.0
>
Miklos Szeredi Sept. 14, 2023, 8:50 a.m. UTC | #2
On Thu, 14 Sept 2023 at 08:00, Amir Goldstein <amir73il@gmail.com> wrote:

> > +               if (ctr >= bufsize)
> > +                       return -EOVERFLOW;
> > +               if (put_user(r->mnt_id_unique, buf + ctr))
> > +                       return -EFAULT;
> > +               ctr++;
> > +               if (ctr < 0)
> > +                       return -ERANGE;
>
> I think it'd be good for userspace to be able to query required
> bufsize with NULL buf, listattr style, rather than having to
> guess and re-guess on EOVERFLOW.

The getxattr/listxattr style encourages the following code:

  size = get(NULL, 0);
  buf = alloc(size);
  err = get(buf, size);
  if (err)
      /* failure */

Which is wrong, since the needed buffer size could change between the two calls.

Doing it iteratively is the only correct way, and then adding
complexity to both userspace and the kernel for *optimizing* the
iteration is not really worth it, IMO.
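
A minimal sketch of that retry loop, purely for illustration
(listmnt_syscall() stands in for a raw syscall(__NR_listmnt, ...) wrapper
returning -errno; the starting size of 128 slots is arbitrary):

  /* Needs <errno.h>, <stdint.h> and <stdlib.h>. */
  static long list_children(uint64_t mnt_id, uint64_t **out)
  {
          size_t nr = 128;

          for (;;) {
                  uint64_t *ids = calloc(nr, sizeof(*ids));
                  long ret;

                  if (!ids)
                          return -ENOMEM;
                  ret = listmnt_syscall(mnt_id, ids, nr, 0);
                  if (ret >= 0) {
                          *out = ids;
                          return ret;             /* number of IDs stored */
                  }
                  free(ids);
                  if (ret != -EOVERFLOW)
                          return ret;
                  nr *= 2;                        /* too small: grow, retry */
          }
  }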

Thanks,
Miklos
Christian Brauner Sept. 14, 2023, 10:01 a.m. UTC | #3
On Thu, Sep 14, 2023 at 10:50:04AM +0200, Miklos Szeredi wrote:
> On Thu, 14 Sept 2023 at 08:00, Amir Goldstein <amir73il@gmail.com> wrote:
> 
> > > +               if (ctr >= bufsize)
> > > +                       return -EOVERFLOW;
> > > +               if (put_user(r->mnt_id_unique, buf + ctr))
> > > +                       return -EFAULT;
> > > +               ctr++;
> > > +               if (ctr < 0)
> > > +                       return -ERANGE;
> >
> > I think it'd be good for userspace to be able to query required
> > bufsize with NULL buf, listattr style, rather than having to
> > guess and re-guess on EOVERFLOW.
> 
> The getxattr/listxattr style encourages the following code:
> 
>   size = get(NULL, 0);
>   buf = alloc(size);
>   err = get(buf, size);
>   if (err)
>       /* failure */
> 
> Which is wrong, since the needed buffer size could change between the two calls.

Not a fan of this either tbh.

> 
> Doing it iteratively is the only correct way, and then adding
> complexity to both userspace and the kernel for *optimizing* the
> iteration is not really worth it, IMO.

So realistically, userspace knows that there's an upper bound on the number
of mounts in a mount namespace (expressed in /proc/sys/fs/mount-max, usually
100000 - which is often way too much, of course).

This is probably insane but I'll power through it: ideally we'd have an
iterator interface that keeps state between calls so we can continue
iterating similar to how readdir/getdents does.
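
Purely to illustrate that idea (not a proposed ABI - open_mnt_iter(),
read_mnt_iter() and handle_mount() are made-up names), a getdents-style
loop would let the kernel keep the cursor so userspace never has to size
the whole result up front:

  int it = open_mnt_iter(mnt_id, 0);              /* hypothetical */
  uint64_t ids[64];
  long n;

  while ((n = read_mnt_iter(it, ids, 64)) > 0)    /* hypothetical */
          for (long i = 0; i < n; i++)
                  handle_mount(ids[i]);           /* hypothetical */
  close(it);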
Ian Kent Sept. 15, 2023, 1 a.m. UTC | #4
On 14/9/23 14:00, Amir Goldstein wrote:
> On Wed, Sep 13, 2023 at 6:22 PM Miklos Szeredi <mszeredi@redhat.com> wrote:
>> Add way to query the children of a particular mount.  This is a more
>> flexible way to iterate the mount tree than having to parse the complete
>> /proc/self/mountinfo.
>>
>> Lookup the mount by the old (32bit) or new (64bit) mount ID.  If a mount
>> needs to be queried based on path, then statx(2) can be used to first query
>> the mount ID belonging to the path.
>>
>> Return an array of new (64bit) mount ID's.  Without privileges only mounts
>> are listed which are reachable from the task's root.
>>
>> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
>> ---
>>   arch/x86/entry/syscalls/syscall_64.tbl |  1 +
>>   fs/namespace.c                         | 51 ++++++++++++++++++++++++++
>>   include/linux/syscalls.h               |  2 +
>>   include/uapi/asm-generic/unistd.h      |  5 ++-
>>   4 files changed, 58 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
>> index 6d807c30cd16..0d9a47b0ce9b 100644
>> --- a/arch/x86/entry/syscalls/syscall_64.tbl
>> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
>> @@ -376,6 +376,7 @@
>>   452    common  fchmodat2               sys_fchmodat2
>>   453    64      map_shadow_stack        sys_map_shadow_stack
>>   454    common  statmnt                 sys_statmnt
>> +455    common  listmnt                 sys_listmnt
>>
>>   #
>>   # Due to a historical design error, certain syscalls are numbered differently
>> diff --git a/fs/namespace.c b/fs/namespace.c
>> index 088a52043bba..5362b1ffb26f 100644
>> --- a/fs/namespace.c
>> +++ b/fs/namespace.c
>> @@ -4988,6 +4988,57 @@ SYSCALL_DEFINE5(statmnt, u64, mnt_id,
>>          return err;
>>   }
>>
>> +static long do_listmnt(struct vfsmount *mnt, u64 __user *buf, size_t bufsize,
>> +                     const struct path *root)
>> +{
>> +       struct mount *r, *m = real_mount(mnt);
>> +       struct path rootmnt = { .mnt = root->mnt, .dentry = root->mnt->mnt_root };
>> +       long ctr = 0;
>> +
>> +       if (!capable(CAP_SYS_ADMIN) &&
>> +           !is_path_reachable(m, mnt->mnt_root, &rootmnt))
>> +               return -EPERM;
>> +
>> +       list_for_each_entry(r, &m->mnt_mounts, mnt_child) {
>> +               if (!capable(CAP_SYS_ADMIN) &&
>> +                   !is_path_reachable(r, r->mnt.mnt_root, root))
>> +                       continue;
>> +
>> +               if (ctr >= bufsize)
>> +                       return -EOVERFLOW;
>> +               if (put_user(r->mnt_id_unique, buf + ctr))
>> +                       return -EFAULT;
>> +               ctr++;
>> +               if (ctr < 0)
>> +                       return -ERANGE;
> I think it'd be good for userspace to be able to query required
> bufsize with NULL buf, listattr style, rather than having to
> guess and re-guess on EOVERFLOW.

Agreed, I also think that would be useful.


Ian
Matthew House Sept. 17, 2023, 12:54 a.m. UTC | #5
On Thu, Sep 14, 2023 at 12:02 PM Miklos Szeredi <mszeredi@redhat.com> wrote:
> Add way to query the children of a particular mount.  This is a more
> flexible way to iterate the mount tree than having to parse the complete
> /proc/self/mountinfo.
>
> Lookup the mount by the old (32bit) or new (64bit) mount ID.  If a mount
> needs to be queried based on path, then statx(2) can be used to first query
> the mount ID belonging to the path.
>
> Return an array of new (64bit) mount ID's.  Without privileges only mounts
> are listed which are reachable from the task's root.
>
> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
> ---
>  arch/x86/entry/syscalls/syscall_64.tbl |  1 +
>  fs/namespace.c                         | 51 ++++++++++++++++++++++++++
>  include/linux/syscalls.h               |  2 +
>  include/uapi/asm-generic/unistd.h      |  5 ++-
>  4 files changed, 58 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index 6d807c30cd16..0d9a47b0ce9b 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -376,6 +376,7 @@
>  452    common  fchmodat2               sys_fchmodat2
>  453    64      map_shadow_stack        sys_map_shadow_stack
>  454    common  statmnt                 sys_statmnt
> +455    common  listmnt                 sys_listmnt
>
>  #
>  # Due to a historical design error, certain syscalls are numbered differently
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 088a52043bba..5362b1ffb26f 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -4988,6 +4988,57 @@ SYSCALL_DEFINE5(statmnt, u64, mnt_id,
>         return err;
>  }
>
> +static long do_listmnt(struct vfsmount *mnt, u64 __user *buf, size_t bufsize,
> +                     const struct path *root)
> +{
> +       struct mount *r, *m = real_mount(mnt);
> +       struct path rootmnt = { .mnt = root->mnt, .dentry = root->mnt->mnt_root };
> +       long ctr = 0;
> +
> +       if (!capable(CAP_SYS_ADMIN) &&
> +           !is_path_reachable(m, mnt->mnt_root, &rootmnt))
> +               return -EPERM;
> +
> +       list_for_each_entry(r, &m->mnt_mounts, mnt_child) {
> +               if (!capable(CAP_SYS_ADMIN) &&
> +                   !is_path_reachable(r, r->mnt.mnt_root, root))
> +                       continue;

I'm not an expert on the kernel API, but to my eyes, it looks a bit weird
to silently include or exclude unreachable mounts from the list based on
the result of a capability check. I'd normally expect a more explicit
design, where (e.g.) the caller would set a flag to request unreachable
mounts, then get an -EPERM back if it didn't have the capability, as
opposed to this design, where the meaning of the output ("all mounts" vs.
"all reachable mounts") changes implicitly depending on the caller. Is
there any precedent for a design like this, where inaccessible results
are silently omitted from a returned list?
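
For concreteness, one possible shape of that explicit-flag variant
(illustrative only; LISTMNT_UNREACHABLE is a made-up name and the rest
mirrors the loop in the patch):

  /* Sketch only: the caller opts in to unreachable mounts via a flag. */
  if ((flags & LISTMNT_UNREACHABLE) && !capable(CAP_SYS_ADMIN))
          return -EPERM;                  /* asked for more than it may see */

  list_for_each_entry(r, &m->mnt_mounts, mnt_child) {
          if (!(flags & LISTMNT_UNREACHABLE) &&
              !is_path_reachable(r, r->mnt.mnt_root, root))
                  continue;               /* skipping is now explicit policy */
          /* ... copy r->mnt_id_unique as in the patch ... */
  }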

Thank you,
Matthew House

> +
> +               if (ctr >= bufsize)
> +                       return -EOVERFLOW;
> +               if (put_user(r->mnt_id_unique, buf + ctr))
> +                       return -EFAULT;
> +               ctr++;
> +               if (ctr < 0)
> +                       return -ERANGE;
> +       }
> +       return ctr;
> +}
> +
> +SYSCALL_DEFINE4(listmnt, u64, mnt_id, u64 __user *, buf, size_t, bufsize,
> +               unsigned int, flags)
> +{
> +       struct vfsmount *mnt;
> +       struct path root;
> +       long err;
> +
> +       if (flags)
> +               return -EINVAL;
> +
> +       down_read(&namespace_sem);
> +       mnt = lookup_mnt_in_ns(mnt_id, current->nsproxy->mnt_ns);
> +       err = -ENOENT;
> +       if (mnt) {
> +               get_fs_root(current->fs, &root);
> +               err = do_listmnt(mnt, buf, bufsize, &root);
> +               path_put(&root);
> +       }
> +       up_read(&namespace_sem);
> +
> +       return err;
> +}
> +
> +
>  static void __init init_mount_tree(void)
>  {
>         struct vfsmount *mnt;
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 1099bd307fa7..5d776cdb6f18 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -411,6 +411,8 @@ asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz,
>  asmlinkage long sys_statmnt(u64 mnt_id, u64 mask,
>                             struct statmnt __user *buf, size_t bufsize,
>                             unsigned int flags);
> +asmlinkage long sys_listmnt(u64 mnt_id, u64 __user *buf, size_t bufsize,
> +                           unsigned int flags);
>  asmlinkage long sys_truncate(const char __user *path, long length);
>  asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length);
>  #if BITS_PER_LONG == 32
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index 640997231ff6..a2b41370f603 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -826,8 +826,11 @@ __SYSCALL(__NR_fchmodat2, sys_fchmodat2)
>  #define __NR_statmnt   454
>  __SYSCALL(__NR_statmnt, sys_statmnt)
>
> +#define __NR_listmnt   455
> +__SYSCALL(__NR_listmnt, sys_listmnt)
> +
>  #undef __NR_syscalls
> -#define __NR_syscalls 455
> +#define __NR_syscalls 456
>
>  /*
>   * 32 bit systems traditionally used different
> --
> 2.41.0
Miklos Szeredi Sept. 17, 2023, 2:32 p.m. UTC | #6
On Sun, Sep 17, 2023 at 2:54 AM Matthew House <mattlloydhouse@gmail.com> wrote:

> > +       list_for_each_entry(r, &m->mnt_mounts, mnt_child) {
> > +               if (!capable(CAP_SYS_ADMIN) &&
> > +                   !is_path_reachable(r, r->mnt.mnt_root, root))
> > +                       continue;
>
> I'm not an expert on the kernel API, but to my eyes, it looks a bit weird
> to silently include or exclude unreachable mounts from the list based on
> the result of a capability check. I'd normally expect a more explicit
> design, where (e.g.) the caller would set a flag to request unreachable
> mounts, then get an -EPERM back if it didn't have the capability, as
> opposed to this design, where the meaning of the output ("all mounts" vs.
> "all reachable mounts") changes implicitly depending on the caller. Is
> there any precedent for a design like this, where inaccessible results
> are silently omitted from a returned list?

Good point.  That issue was nagging at the back of my mind.  Having an
explicit flag nicely solves the issue.

Thanks,
Miklos
Christian Brauner Sept. 18, 2023, 1:15 p.m. UTC | #7
On Sun, Sep 17, 2023 at 04:32:04PM +0200, Miklos Szeredi wrote:
> On Sun, Sep 17, 2023 at 2:54 AM Matthew House <mattlloydhouse@gmail.com> wrote:
> 
> > > +       list_for_each_entry(r, &m->mnt_mounts, mnt_child) {
> > > +               if (!capable(CAP_SYS_ADMIN) &&


> Good point.  That issue was nagging at the back of my mind.  Having an
> explicit flag nicely solves the issue.

Ideally we avoid multiple capable(CAP_SYS_ADMIN) calls by only doing it
once and saving the return value. capable() calls aren't that cheap.
Plus, we should decide whether this should trigger an audit event or
not: capable(CAP_SYS_ADMIN) triggers an audit event,
ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN) wouldn't.
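
A minimal sketch of that "check once" variant (illustrative only; which of
capable() or ns_capable_noaudit() to use is exactly the open question):

  bool allowed = capable(CAP_SYS_ADMIN); /* or ns_capable_noaudit(
                                            &init_user_ns, CAP_SYS_ADMIN) */

  if (!allowed && !is_path_reachable(m, mnt->mnt_root, &rootmnt))
          return -EPERM;

  list_for_each_entry(r, &m->mnt_mounts, mnt_child) {
          if (!allowed && !is_path_reachable(r, r->mnt.mnt_root, root))
                  continue;
          /* ... copy r->mnt_id_unique as in the patch ... */
  }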
Paul Moore Sept. 19, 2023, 4:47 p.m. UTC | #8
On Mon, Sep 18, 2023 at 12:52 PM Christian Brauner <brauner@kernel.org> wrote:
> On Sun, Sep 17, 2023 at 04:32:04PM +0200, Miklos Szeredi wrote:
> > On Sun, Sep 17, 2023 at 2:54 AM Matthew House <mattlloydhouse@gmail.com> wrote:
> >
> > > > +       list_for_each_entry(r, &m->mnt_mounts, mnt_child) {
> > > > +               if (!capable(CAP_SYS_ADMIN) &&
>
>
> > Good point.  That issue was nagging at the back of my mind.  Having an
> > explicit flag nicely solves the issue.
>
> Ideally we avoid multiple capable(CAP_SYS_ADMIN) calls by only doing it
> once and saving the return value. capable() call's aren't that cheap.

Agreed.  The capability check doesn't do any subject/object
comparisons so calling it for each mount is overkill.  However, I
would think we would want the LSM hook called from inside the loop as
that could involve a subject (@current) and object (individual mount
point) comparison.

> Plus, we should decide whether this should trigger an audit event or
> not: capable(CAP_SYS_ADMIN) triggers an audit event,
> ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN) wouldn't.

Why would we not want to audit the capable() call?
Miklos Szeredi Sept. 28, 2023, 10:07 a.m. UTC | #9
On Tue, 19 Sept 2023 at 18:48, Paul Moore <paul@paul-moore.com> wrote:

> > Ideally we avoid multiple capable(CAP_SYS_ADMIN) calls by only doing it
> > once and saving the return value. capable() call's aren't that cheap.
>
> Agreed.  The capability check doesn't do any subject/object
> comparisons so calling it for each mount is overkill.  However, I
> would think we would want the LSM hook called from inside the loop as
> that could involve a subject (@current) and object (individual mount
> point) comparison.

The security_sb_statfs() one?

Should a single failure result in a complete failure?

Why is it not enough to check permission on the parent?

Thanks,
Miklos
Paul Moore Oct. 4, 2023, 7:22 p.m. UTC | #10
On Thu, Sep 28, 2023 at 6:07 AM Miklos Szeredi <miklos@szeredi.hu> wrote:
> On Tue, 19 Sept 2023 at 18:48, Paul Moore <paul@paul-moore.com> wrote:
>
> > > Ideally we avoid multiple capable(CAP_SYS_ADMIN) calls by only doing it
> > > once and saving the return value. capable() call's aren't that cheap.
> >
> > Agreed.  The capability check doesn't do any subject/object
> > comparisons so calling it for each mount is overkill.  However, I
> > would think we would want the LSM hook called from inside the loop as
> > that could involve a subject (@current) and object (individual mount
> > point) comparison.

My apologies, I was traveling and while I was quickly checking my
email each day this message was lost.  I'm very sorry for the delay in
responding.

> The security_sb_statfs() one?

Yes.

> Should a single failure result in a complete failure?

My opinion is that it should only result in the failure of listing/stat'ing
that particular mount; if other mounts are allowed to be queried then the
operation should be allowed to continue.

> Why is it not enough to check permission on the parent?

Each mount has the potential to have a unique security identity in the
context of the LSM, and since the LSM access controls are generally
intended to support a subject-verb-object access control policy we
need to examine the subject and object together (the subject here is
@current, the object is the individual mount, and the verb is the
stat/list operation).

Does that make sense?
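
For concreteness, a minimal sketch of the per-mount check being described
(illustrative only; whether security_sb_statfs() is the right hook and how
a denial should be reported are still open):

  list_for_each_entry(r, &m->mnt_mounts, mnt_child) {
          if (security_sb_statfs(r->mnt.mnt_root))
                  continue;       /* an LSM denial hides this mount instead
                                     of failing the whole listmnt() call */
          /* ... reachability check and copy-out as in the patch ... */
  }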

I'm looking at the v3 patchset right now, I've got some small nits,
but I'll add those to that thread.

Patch

diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 6d807c30cd16..0d9a47b0ce9b 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -376,6 +376,7 @@ 
 452	common	fchmodat2		sys_fchmodat2
 453	64	map_shadow_stack	sys_map_shadow_stack
 454	common	statmnt			sys_statmnt
+455	common	listmnt			sys_listmnt
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/fs/namespace.c b/fs/namespace.c
index 088a52043bba..5362b1ffb26f 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -4988,6 +4988,57 @@  SYSCALL_DEFINE5(statmnt, u64, mnt_id,
 	return err;
 }
 
+static long do_listmnt(struct vfsmount *mnt, u64 __user *buf, size_t bufsize,
+		      const struct path *root)
+{
+	struct mount *r, *m = real_mount(mnt);
+	struct path rootmnt = { .mnt = root->mnt, .dentry = root->mnt->mnt_root };
+	long ctr = 0;
+
+	if (!capable(CAP_SYS_ADMIN) &&
+	    !is_path_reachable(m, mnt->mnt_root, &rootmnt))
+		return -EPERM;
+
+	list_for_each_entry(r, &m->mnt_mounts, mnt_child) {
+		if (!capable(CAP_SYS_ADMIN) &&
+		    !is_path_reachable(r, r->mnt.mnt_root, root))
+			continue;
+
+		if (ctr >= bufsize)
+			return -EOVERFLOW;
+		if (put_user(r->mnt_id_unique, buf + ctr))
+			return -EFAULT;
+		ctr++;
+		if (ctr < 0)
+			return -ERANGE;
+	}
+	return ctr;
+}
+
+SYSCALL_DEFINE4(listmnt, u64, mnt_id, u64 __user *, buf, size_t, bufsize,
+		unsigned int, flags)
+{
+	struct vfsmount *mnt;
+	struct path root;
+	long err;
+
+	if (flags)
+		return -EINVAL;
+
+	down_read(&namespace_sem);
+	mnt = lookup_mnt_in_ns(mnt_id, current->nsproxy->mnt_ns);
+	err = -ENOENT;
+	if (mnt) {
+		get_fs_root(current->fs, &root);
+		err = do_listmnt(mnt, buf, bufsize, &root);
+		path_put(&root);
+	}
+	up_read(&namespace_sem);
+
+	return err;
+}
+
+
 static void __init init_mount_tree(void)
 {
 	struct vfsmount *mnt;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 1099bd307fa7..5d776cdb6f18 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -411,6 +411,8 @@  asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz,
 asmlinkage long sys_statmnt(u64 mnt_id, u64 mask,
 			    struct statmnt __user *buf, size_t bufsize,
 			    unsigned int flags);
+asmlinkage long sys_listmnt(u64 mnt_id, u64 __user *buf, size_t bufsize,
+			    unsigned int flags);
 asmlinkage long sys_truncate(const char __user *path, long length);
 asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length);
 #if BITS_PER_LONG == 32
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 640997231ff6..a2b41370f603 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -826,8 +826,11 @@  __SYSCALL(__NR_fchmodat2, sys_fchmodat2)
 #define __NR_statmnt   454
 __SYSCALL(__NR_statmnt, sys_statmnt)
 
+#define __NR_listmnt   455
+__SYSCALL(__NR_listmnt, sys_listmnt)
+
 #undef __NR_syscalls
-#define __NR_syscalls 455
+#define __NR_syscalls 456
 
 /*
  * 32 bit systems traditionally used different