diff mbox series

[v3,4/4] add listmount(2) syscall

Message ID 20230928130147.564503-5-mszeredi@redhat.com (mailing list archive)
State New, archived
Headers show
Series querying mount attributes | expand

Commit Message

Miklos Szeredi Sept. 28, 2023, 1:01 p.m. UTC
Add way to query the children of a particular mount.  This is a more
flexible way to iterate the mount tree than having to parse the complete
/proc/self/mountinfo.

Lookup the mount by the new 64bit mount ID.  If a mount needs to be queried
based on path, then statx(2) can be used to first query the mount ID
belonging to the path.

Return an array of new (64bit) mount ID's.  Without privileges only mounts
are listed which are reachable from the task's root.

Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
---
 arch/x86/entry/syscalls/syscall_32.tbl |  1 +
 arch/x86/entry/syscalls/syscall_64.tbl |  1 +
 fs/namespace.c                         | 69 ++++++++++++++++++++++++++
 include/linux/syscalls.h               |  3 ++
 include/uapi/asm-generic/unistd.h      |  5 +-
 include/uapi/linux/mount.h             |  3 ++
 6 files changed, 81 insertions(+), 1 deletion(-)

Comments

Paul Moore Oct. 4, 2023, 7:37 p.m. UTC | #1
On Thu, Sep 28, 2023 at 9:04 AM Miklos Szeredi <mszeredi@redhat.com> wrote:
>
> Add way to query the children of a particular mount.  This is a more
> flexible way to iterate the mount tree than having to parse the complete
> /proc/self/mountinfo.
>
> Lookup the mount by the new 64bit mount ID.  If a mount needs to be queried
> based on path, then statx(2) can be used to first query the mount ID
> belonging to the path.
>
> Return an array of new (64bit) mount ID's.  Without privileges only mounts
> are listed which are reachable from the task's root.
>
> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
> ---
>  arch/x86/entry/syscalls/syscall_32.tbl |  1 +
>  arch/x86/entry/syscalls/syscall_64.tbl |  1 +
>  fs/namespace.c                         | 69 ++++++++++++++++++++++++++
>  include/linux/syscalls.h               |  3 ++
>  include/uapi/asm-generic/unistd.h      |  5 +-
>  include/uapi/linux/mount.h             |  3 ++
>  6 files changed, 81 insertions(+), 1 deletion(-)

...

> diff --git a/fs/namespace.c b/fs/namespace.c
> index 3326ba2b2810..050e2d2af110 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -4970,6 +4970,75 @@ SYSCALL_DEFINE4(statmount, const struct __mount_arg __user *, req,
>         return ret;
>  }
>
> +static long do_listmount(struct vfsmount *mnt, u64 __user *buf, size_t bufsize,
> +                        const struct path *root, unsigned int flags)
> +{
> +       struct mount *r, *m = real_mount(mnt);
> +       struct path rootmnt = {
> +               .mnt = root->mnt,
> +               .dentry = root->mnt->mnt_root
> +       };
> +       long ctr = 0;
> +       bool reachable_only = true;
> +       int err;
> +
> +       err = security_sb_statfs(mnt->mnt_root);
> +       if (err)
> +               return err;
> +
> +       if (flags & LISTMOUNT_UNREACHABLE) {
> +               if (!capable(CAP_SYS_ADMIN))
> +                       return -EPERM;
> +               reachable_only = false;
> +       }
> +
> +       if (reachable_only && !is_path_reachable(m, mnt->mnt_root, &rootmnt))
> +               return capable(CAP_SYS_ADMIN) ? 0 : -EPERM;
> +
> +       list_for_each_entry(r, &m->mnt_mounts, mnt_child) {
> +               if (reachable_only &&
> +                   !is_path_reachable(r, r->mnt.mnt_root, root))
> +                       continue;

I believe we would want to move the security_sb_statfs() call from
above to down here; something like this I think ...

  err = security_sb_statfs(r->mnt.mnt_root);
  if (err)
    /* if we can't access the mount, pretend it doesn't exist */
    continue;

> +               if (ctr >= bufsize)
> +                       return -EOVERFLOW;
> +               if (put_user(r->mnt_id_unique, buf + ctr))
> +                       return -EFAULT;
> +               ctr++;
> +               if (ctr < 0)
> +                       return -ERANGE;
> +       }
> +       return ctr;
> +}
Miklos Szeredi Oct. 5, 2023, 4:01 a.m. UTC | #2
On Wed, 4 Oct 2023 at 21:38, Paul Moore <paul@paul-moore.com> wrote:
>
> On Thu, Sep 28, 2023 at 9:04 AM Miklos Szeredi <mszeredi@redhat.com> wrote:
> >
> > Add way to query the children of a particular mount.  This is a more
> > flexible way to iterate the mount tree than having to parse the complete
> > /proc/self/mountinfo.
> >
> > Lookup the mount by the new 64bit mount ID.  If a mount needs to be queried
> > based on path, then statx(2) can be used to first query the mount ID
> > belonging to the path.
> >
> > Return an array of new (64bit) mount ID's.  Without privileges only mounts
> > are listed which are reachable from the task's root.
> >
> > Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
> > ---
> >  arch/x86/entry/syscalls/syscall_32.tbl |  1 +
> >  arch/x86/entry/syscalls/syscall_64.tbl |  1 +
> >  fs/namespace.c                         | 69 ++++++++++++++++++++++++++
> >  include/linux/syscalls.h               |  3 ++
> >  include/uapi/asm-generic/unistd.h      |  5 +-
> >  include/uapi/linux/mount.h             |  3 ++
> >  6 files changed, 81 insertions(+), 1 deletion(-)
>
> ...
>
> > diff --git a/fs/namespace.c b/fs/namespace.c
> > index 3326ba2b2810..050e2d2af110 100644
> > --- a/fs/namespace.c
> > +++ b/fs/namespace.c
> > @@ -4970,6 +4970,75 @@ SYSCALL_DEFINE4(statmount, const struct __mount_arg __user *, req,
> >         return ret;
> >  }
> >
> > +static long do_listmount(struct vfsmount *mnt, u64 __user *buf, size_t bufsize,
> > +                        const struct path *root, unsigned int flags)
> > +{
> > +       struct mount *r, *m = real_mount(mnt);
> > +       struct path rootmnt = {
> > +               .mnt = root->mnt,
> > +               .dentry = root->mnt->mnt_root
> > +       };
> > +       long ctr = 0;
> > +       bool reachable_only = true;
> > +       int err;
> > +
> > +       err = security_sb_statfs(mnt->mnt_root);
> > +       if (err)
> > +               return err;
> > +
> > +       if (flags & LISTMOUNT_UNREACHABLE) {
> > +               if (!capable(CAP_SYS_ADMIN))
> > +                       return -EPERM;
> > +               reachable_only = false;
> > +       }
> > +
> > +       if (reachable_only && !is_path_reachable(m, mnt->mnt_root, &rootmnt))
> > +               return capable(CAP_SYS_ADMIN) ? 0 : -EPERM;
> > +
> > +       list_for_each_entry(r, &m->mnt_mounts, mnt_child) {
> > +               if (reachable_only &&
> > +                   !is_path_reachable(r, r->mnt.mnt_root, root))
> > +                       continue;
>
> I believe we would want to move the security_sb_statfs() call from
> above to down here; something like this I think ...
>
>   err = security_sb_statfs(r->mnt.mnt_root);
>   if (err)
>     /* if we can't access the mount, pretend it doesn't exist */
>     continue;

Hmm.  Why is this specific to listing mounts (i.e. why doesn't readdir
have a similar filter)?

Also why hasn't this come up with regards to the proc interfaces that
list mounts?

I just want to understand the big picture here.

Thanks,
Miklos
Ian Kent Oct. 5, 2023, 4:23 a.m. UTC | #3
On 5/10/23 12:01, Miklos Szeredi wrote:
> On Wed, 4 Oct 2023 at 21:38, Paul Moore <paul@paul-moore.com> wrote:
>> On Thu, Sep 28, 2023 at 9:04 AM Miklos Szeredi <mszeredi@redhat.com> wrote:
>>> Add way to query the children of a particular mount.  This is a more
>>> flexible way to iterate the mount tree than having to parse the complete
>>> /proc/self/mountinfo.
>>>
>>> Lookup the mount by the new 64bit mount ID.  If a mount needs to be queried
>>> based on path, then statx(2) can be used to first query the mount ID
>>> belonging to the path.
>>>
>>> Return an array of new (64bit) mount ID's.  Without privileges only mounts
>>> are listed which are reachable from the task's root.
>>>
>>> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
>>> ---
>>>   arch/x86/entry/syscalls/syscall_32.tbl |  1 +
>>>   arch/x86/entry/syscalls/syscall_64.tbl |  1 +
>>>   fs/namespace.c                         | 69 ++++++++++++++++++++++++++
>>>   include/linux/syscalls.h               |  3 ++
>>>   include/uapi/asm-generic/unistd.h      |  5 +-
>>>   include/uapi/linux/mount.h             |  3 ++
>>>   6 files changed, 81 insertions(+), 1 deletion(-)
>> ...
>>
>>> diff --git a/fs/namespace.c b/fs/namespace.c
>>> index 3326ba2b2810..050e2d2af110 100644
>>> --- a/fs/namespace.c
>>> +++ b/fs/namespace.c
>>> @@ -4970,6 +4970,75 @@ SYSCALL_DEFINE4(statmount, const struct __mount_arg __user *, req,
>>>          return ret;
>>>   }
>>>
>>> +static long do_listmount(struct vfsmount *mnt, u64 __user *buf, size_t bufsize,
>>> +                        const struct path *root, unsigned int flags)
>>> +{
>>> +       struct mount *r, *m = real_mount(mnt);
>>> +       struct path rootmnt = {
>>> +               .mnt = root->mnt,
>>> +               .dentry = root->mnt->mnt_root
>>> +       };
>>> +       long ctr = 0;
>>> +       bool reachable_only = true;
>>> +       int err;
>>> +
>>> +       err = security_sb_statfs(mnt->mnt_root);
>>> +       if (err)
>>> +               return err;
>>> +
>>> +       if (flags & LISTMOUNT_UNREACHABLE) {
>>> +               if (!capable(CAP_SYS_ADMIN))
>>> +                       return -EPERM;
>>> +               reachable_only = false;
>>> +       }
>>> +
>>> +       if (reachable_only && !is_path_reachable(m, mnt->mnt_root, &rootmnt))
>>> +               return capable(CAP_SYS_ADMIN) ? 0 : -EPERM;
>>> +
>>> +       list_for_each_entry(r, &m->mnt_mounts, mnt_child) {
>>> +               if (reachable_only &&
>>> +                   !is_path_reachable(r, r->mnt.mnt_root, root))
>>> +                       continue;
>> I believe we would want to move the security_sb_statfs() call from
>> above to down here; something like this I think ...
>>
>>    err = security_sb_statfs(r->mnt.mnt_root);
>>    if (err)
>>      /* if we can't access the mount, pretend it doesn't exist */
>>      continue;
> Hmm.  Why is this specific to listing mounts (i.e. why doesn't readdir
> have a similar filter)?
>
> Also why hasn't this come up with regards to the proc interfaces that
> list mounts?

The proc interfaces essentially use <mount namespace>->list to provide

the mounts that can be seen so it's filtered by mount namespace of the

task that's doing the open().


See fs/namespace.c:mnt_list_next() and just below the m_start(), m_next(),

etc.


Ian

>
> I just want to understand the big picture here.
>
> Thanks,
> Miklos
Miklos Szeredi Oct. 5, 2023, 3:47 p.m. UTC | #4
On Thu, 5 Oct 2023 at 06:23, Ian Kent <raven@themaw.net> wrote:

> The proc interfaces essentially use <mount namespace>->list to provide
>
> the mounts that can be seen so it's filtered by mount namespace of the
>
> task that's doing the open().
>
>
> See fs/namespace.c:mnt_list_next() and just below the m_start(), m_next(),

/proc/$PID/mountinfo will list the mount namespace of $PID.  Whether
current task has permission to do so is decided at open time.

listmount() will list the children of the given mount ID.  The mount
ID is looked up in the task's mount namespace, so this cannot be used
to list mounts of other namespaces.  It's a more limited interface.

I sort of understand the reasoning behind calling into a security hook
on entry to statmount() and listmount().  And BTW I also think that if
statmount() and listmount() is limited in this way, then the same
limitation should be applied to the proc interfaces.  But that needs
to be done real carefully because it might cause regressions.  OTOH if
it's only done on the new interfaces, then what is the point, since
the old interfaces will be available indefinitely?

Also I cannot see the point in hiding some mount ID's from the list.
It seems to me that the list is just an array of numbers that in
itself doesn't carry any information.

Thanks,
Miklos
Ian Kent Oct. 6, 2023, 12:27 a.m. UTC | #5
On 5/10/23 23:47, Miklos Szeredi wrote:
> On Thu, 5 Oct 2023 at 06:23, Ian Kent <raven@themaw.net> wrote:
>
>> The proc interfaces essentially use <mount namespace>->list to provide
>>
>> the mounts that can be seen so it's filtered by mount namespace of the
>>
>> task that's doing the open().
>>
>>
>> See fs/namespace.c:mnt_list_next() and just below the m_start(), m_next(),
> /proc/$PID/mountinfo will list the mount namespace of $PID.  Whether
> current task has permission to do so is decided at open time.
>
> listmount() will list the children of the given mount ID.  The mount
> ID is looked up in the task's mount namespace, so this cannot be used
> to list mounts of other namespaces.  It's a more limited interface.

Yep. But isn't the ability to see these based on task privilege?


Is the proc style restriction actually what we need here (or some variation

of that implementation)?


An privileged task typically has the init namespace as its mount namespace

and mounts should propagate from there so it should be able to see all 
mounts.


If the file handle has been opened in a task that is using some other mount

namespace then presumably that's what the program author wants the task 
to see.

So I'm not sure I see a problem obeying the namespace of a given task.


Ian

>
> I sort of understand the reasoning behind calling into a security hook
> on entry to statmount() and listmount().  And BTW I also think that if
> statmount() and listmount() is limited in this way, then the same
> limitation should be applied to the proc interfaces.  But that needs
> to be done real carefully because it might cause regressions.  OTOH if
> it's only done on the new interfaces, then what is the point, since
> the old interfaces will be available indefinitely?
>
> Also I cannot see the point in hiding some mount ID's from the list.
> It seems to me that the list is just an array of numbers that in
> itself doesn't carry any information.
>
> Thanks,
> Miklos
Paul Moore Oct. 6, 2023, 2:56 a.m. UTC | #6
On Thu, Oct 5, 2023 at 11:47 AM Miklos Szeredi <miklos@szeredi.hu> wrote:
> On Thu, 5 Oct 2023 at 06:23, Ian Kent <raven@themaw.net> wrote:
> > The proc interfaces essentially use <mount namespace>->list to provide
> >
> > the mounts that can be seen so it's filtered by mount namespace of the
> >
> > task that's doing the open().
> >
> >
> > See fs/namespace.c:mnt_list_next() and just below the m_start(), m_next(),
>
> /proc/$PID/mountinfo will list the mount namespace of $PID.  Whether
> current task has permission to do so is decided at open time.
>
> listmount() will list the children of the given mount ID.  The mount
> ID is looked up in the task's mount namespace, so this cannot be used
> to list mounts of other namespaces.  It's a more limited interface.
>
> I sort of understand the reasoning behind calling into a security hook
> on entry to statmount() and listmount().  And BTW I also think that if
> statmount() and listmount() is limited in this way, then the same
> limitation should be applied to the proc interfaces.  But that needs
> to be done real carefully because it might cause regressions.  OTOH if
> it's only done on the new interfaces, then what is the point, since
> the old interfaces will be available indefinitely?

LSMs that are designed to enforce access controls on procfs interfaces
typically leverage the fact that the procfs interfaces are file based
and the normal file I/O access controls can be used.  In some cases,
e.g. /proc/self/attr, there may also be additional access controls
implemented via a dedicated set of LSM hooks.

> Also I cannot see the point in hiding some mount ID's from the list.
> It seems to me that the list is just an array of numbers that in
> itself doesn't carry any information.

I think it really comes down to the significance of the mount ID, and
I can't say I know enough of the details here to be entirely
comfortable taking a hard stance on this.  Can you help me understand
the mount ID concept a bit better?

While I'm reasonably confident that we want a security_sb_statfs()
control point in statmount(), it may turn out that we don't want/need
a call in the listmount() case.  Perhaps your original patch was
correct in the sense that we only want a single security_sb_statfs()
call for the root (implying that the child mount IDs are attributes of
the root/parent mount)?  Maybe it's something else entirely?
Miklos Szeredi Oct. 6, 2023, 8:53 a.m. UTC | #7
On Fri, 6 Oct 2023 at 04:56, Paul Moore <paul@paul-moore.com> wrote:

> > Also I cannot see the point in hiding some mount ID's from the list.
> > It seems to me that the list is just an array of numbers that in
> > itself doesn't carry any information.
>
> I think it really comes down to the significance of the mount ID, and
> I can't say I know enough of the details here to be entirely
> comfortable taking a hard stance on this.  Can you help me understand
> the mount ID concept a bit better?

Mount ID is a descriptor that allows referring to a specific struct
mount from userspace.

The old 32 bit mount id is allocated with IDA from a global pool.
Because it's non-referencing it doesn't allow uniquely identifying a
mount.  That was a design mistake that I made back in 2008, thinking
that the same sort of dense descriptor space as used for file
descriptors would work.  Originally it was used to identify the mount
and the parent mount in /proc/PID/mountinfo.  Later it was also added
to the following interfaces:

 - name_to_handle_at(2) returns 32 bit value
 - /proc/PID/FD/fdinfo
 - statx(2) returns 64 bit value

It was never used on the kernel interfaces as an input argument.

statmount(2) and listmount(2) require the mount to be identified by
userspace, so having a unique ID is important.  So the "[1/4] add
unique mount ID" adds a new 64 bit ID (still global) that is allocated
sequentially and only reused after reboot.   It is used as an input to
these syscalls.  It is returned by statx(2) if requested by
STATX_MNT_ID_UNIQUE and as an array of ID's by listmount(2).

I can see mild security problems with the global allocation, since a
task can observe mounts being done in other namespaces.  This doesn't
sound too serious, and the old ID has similar issues.  But I think
making the new ID be local to the mount namespace is also feasible.

> While I'm reasonably confident that we want a security_sb_statfs()
> control point in statmount(), it may turn out that we don't want/need
> a call in the listmount() case.  Perhaps your original patch was
> correct in the sense that we only want a single security_sb_statfs()
> call for the root (implying that the child mount IDs are attributes of
> the root/parent mount)?  Maybe it's something else entirely?

Mounts are arranged in a tree (I think it obvious how) and
listmount(2) just lists the IDs of the immediate children of a mount.

I don't see ID being an attribute of a mount, it's a descriptor.

Thanks,
Miklos
Paul Moore Oct. 6, 2023, 11:07 p.m. UTC | #8
On Fri, Oct 6, 2023 at 4:53 AM Miklos Szeredi <miklos@szeredi.hu> wrote:
> On Fri, 6 Oct 2023 at 04:56, Paul Moore <paul@paul-moore.com> wrote:
>
> > > Also I cannot see the point in hiding some mount ID's from the list.
> > > It seems to me that the list is just an array of numbers that in
> > > itself doesn't carry any information.
> >
> > I think it really comes down to the significance of the mount ID, and
> > I can't say I know enough of the details here to be entirely
> > comfortable taking a hard stance on this.  Can you help me understand
> > the mount ID concept a bit better?
>
> Mount ID is a descriptor that allows referring to a specific struct
> mount from userspace.
>
> The old 32 bit mount id is allocated with IDA from a global pool.
> Because it's non-referencing it doesn't allow uniquely identifying a
> mount.  That was a design mistake that I made back in 2008, thinking
> that the same sort of dense descriptor space as used for file
> descriptors would work.  Originally it was used to identify the mount
> and the parent mount in /proc/PID/mountinfo.  Later it was also added
> to the following interfaces:
>
>  - name_to_handle_at(2) returns 32 bit value
>  - /proc/PID/FD/fdinfo
>  - statx(2) returns 64 bit value
>
> It was never used on the kernel interfaces as an input argument.

Thanks for the background.

> statmount(2) and listmount(2) require the mount to be identified by
> userspace, so having a unique ID is important.  So the "[1/4] add
> unique mount ID" adds a new 64 bit ID (still global) that is allocated
> sequentially and only reused after reboot.   It is used as an input to
> these syscalls.  It is returned by statx(2) if requested by
> STATX_MNT_ID_UNIQUE and as an array of ID's by listmount(2).
>
> I can see mild security problems with the global allocation, since a
> task can observe mounts being done in other namespaces.  This doesn't
> sound too serious, and the old ID has similar issues.  But I think
> making the new ID be local to the mount namespace is also feasible.

The LSM hook API is designed to operate independently from any of the
kernel namespaces; while some LSMs may choose to be aware of
namespaces and adjust their controls accordingly, there is no
requirement that they do so.  For that reason, I'm not too bothered
either way if the mount ID is global or tied to a namespace.

> > While I'm reasonably confident that we want a security_sb_statfs()
> > control point in statmount(), it may turn out that we don't want/need
> > a call in the listmount() case.  Perhaps your original patch was
> > correct in the sense that we only want a single security_sb_statfs()
> > call for the root (implying that the child mount IDs are attributes of
> > the root/parent mount)?  Maybe it's something else entirely?
>
> Mounts are arranged in a tree (I think it obvious how) and
> listmount(2) just lists the IDs of the immediate children of a mount.
>
> I don't see ID being an attribute of a mount, it's a descriptor.

In this case I think the approach you took originally in this thread
is likely what we want, call security_sb_statfs() against the root
mount in listmount().  Just please move it after the capability
checks.

If you look at the two LSMs which implement the security_sb_statfs(),
Smack and SELinux, you see that Smack treats this as a read operation
between the current process and the specified mount and SELinux treats
this as a filesystem:getattr operations between the current process
and the specified mount.  In both cases I can see that being the right
approach for reading a list of child mounts off of a root mount.

Does that sound good?  I'm guessing that's okay since that was how you
wrote it in your original patch, but there has been a lot of
discussion since then :)
Ian Kent Oct. 13, 2023, 2:39 a.m. UTC | #9
On 6/10/23 08:27, Ian Kent wrote:
> On 5/10/23 23:47, Miklos Szeredi wrote:
>> On Thu, 5 Oct 2023 at 06:23, Ian Kent <raven@themaw.net> wrote:
>>
>>> The proc interfaces essentially use <mount namespace>->list to provide
>>>
>>> the mounts that can be seen so it's filtered by mount namespace of the
>>>
>>> task that's doing the open().
>>>
>>>
>>> See fs/namespace.c:mnt_list_next() and just below the m_start(), 
>>> m_next(),
>> /proc/$PID/mountinfo will list the mount namespace of $PID. Whether
>> current task has permission to do so is decided at open time.
>>
>> listmount() will list the children of the given mount ID.  The mount
>> ID is looked up in the task's mount namespace, so this cannot be used
>> to list mounts of other namespaces.  It's a more limited interface.
>
> Yep. But isn't the ability to see these based on task privilege?
>
>
> Is the proc style restriction actually what we need here (or some 
> variation
>
> of that implementation)?
>
>
> An privileged task typically has the init namespace as its mount 
> namespace
>
> and mounts should propagate from there so it should be able to see all 
> mounts.
>
>
> If the file handle has been opened in a task that is using some other 
> mount
>
> namespace then presumably that's what the program author wants the 
> task to see.
>
> So I'm not sure I see a problem obeying the namespace of a given task.

I've had a look through the code we had in the old fsinfo() proposal

because I think we need to consider the use cases that are needed.


IIRC initially we had a flag FSINFO_ATTR_MOUNT_CHILDREN that essentially

enumerated the children of the given mount in much the same way as is

done now in this system call.


But because we needed to enumerate mounts in the same way as the proc file

system mount tables a flag FSINFO_ATTR_MOUNT_ALL was added that essentially

used the mount namespace mounts list in a similar way to the proc file

system so that a list of mounts for a mount namespace could be retrieved.


This later use case is what is used by processes that monitor mounts and

is what's needed more so than enumerating the children as we do now.


I'm still looking at the mount id lookup.


Ian

>
>
> Ian
>
>>
>> I sort of understand the reasoning behind calling into a security hook
>> on entry to statmount() and listmount().  And BTW I also think that if
>> statmount() and listmount() is limited in this way, then the same
>> limitation should be applied to the proc interfaces.  But that needs
>> to be done real carefully because it might cause regressions. OTOH if
>> it's only done on the new interfaces, then what is the point, since
>> the old interfaces will be available indefinitely?
>>
>> Also I cannot see the point in hiding some mount ID's from the list.
>> It seems to me that the list is just an array of numbers that in
>> itself doesn't carry any information.
>>
>> Thanks,
>> Miklos
Miklos Szeredi Oct. 24, 2023, 1:57 p.m. UTC | #10
On Fri, 13 Oct 2023 at 04:40, Ian Kent <raven@themaw.net> wrote:

> But because we needed to enumerate mounts in the same way as the proc file
>
> system mount tables a flag FSINFO_ATTR_MOUNT_ALL was added that essentially
>
> used the mount namespace mounts list in a similar way to the proc file
>
> system so that a list of mounts for a mount namespace could be retrieved.

Makes sense.

Added a LISTMOUNT_RECURSIVE flag.

Thanks,
Miklos
diff mbox series

Patch

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 317b1320ad18..65e0185b47a9 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -458,3 +458,4 @@ 
 451	i386	cachestat		sys_cachestat
 452	i386	fchmodat2		sys_fchmodat2
 454	i386	statmount		sys_statmount
+455	i386	listmount		sys_listmount
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 7312c440978f..a1b3ce7d22cc 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -376,6 +376,7 @@ 
 452	common	fchmodat2		sys_fchmodat2
 453	64	map_shadow_stack	sys_map_shadow_stack
 454	common	statmount		sys_statmount
+455	common	listmount		sys_listmount
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/fs/namespace.c b/fs/namespace.c
index 3326ba2b2810..050e2d2af110 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -4970,6 +4970,75 @@  SYSCALL_DEFINE4(statmount, const struct __mount_arg __user *, req,
 	return ret;
 }
 
+static long do_listmount(struct vfsmount *mnt, u64 __user *buf, size_t bufsize,
+			 const struct path *root, unsigned int flags)
+{
+	struct mount *r, *m = real_mount(mnt);
+	struct path rootmnt = {
+		.mnt = root->mnt,
+		.dentry = root->mnt->mnt_root
+	};
+	long ctr = 0;
+	bool reachable_only = true;
+	int err;
+
+	err = security_sb_statfs(mnt->mnt_root);
+	if (err)
+		return err;
+
+	if (flags & LISTMOUNT_UNREACHABLE) {
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		reachable_only = false;
+	}
+
+	if (reachable_only && !is_path_reachable(m, mnt->mnt_root, &rootmnt))
+		return capable(CAP_SYS_ADMIN) ? 0 : -EPERM;
+
+	list_for_each_entry(r, &m->mnt_mounts, mnt_child) {
+		if (reachable_only &&
+		    !is_path_reachable(r, r->mnt.mnt_root, root))
+			continue;
+
+		if (ctr >= bufsize)
+			return -EOVERFLOW;
+		if (put_user(r->mnt_id_unique, buf + ctr))
+			return -EFAULT;
+		ctr++;
+		if (ctr < 0)
+			return -ERANGE;
+	}
+	return ctr;
+}
+
+SYSCALL_DEFINE4(listmount, const struct __mount_arg __user *, req,
+		u64 __user *, buf, size_t, bufsize, unsigned int, flags)
+{
+	struct __mount_arg kreq;
+	struct vfsmount *mnt;
+	struct path root;
+	long err;
+
+	if (flags & ~LISTMOUNT_UNREACHABLE)
+		return -EINVAL;
+
+	if (copy_from_user(&kreq, req, sizeof(kreq)))
+		return -EFAULT;
+
+	down_read(&namespace_sem);
+	mnt = lookup_mnt_in_ns(kreq.mnt_id, current->nsproxy->mnt_ns);
+	err = -ENOENT;
+	if (mnt) {
+		get_fs_root(current->fs, &root);
+		err = do_listmount(mnt, buf, bufsize, &root, flags);
+		path_put(&root);
+	}
+	up_read(&namespace_sem);
+
+	return err;
+}
+
+
 static void __init init_mount_tree(void)
 {
 	struct vfsmount *mnt;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index ba371024d902..38f3da7e04d1 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -413,6 +413,9 @@  asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz,
 asmlinkage long sys_statmount(const struct __mount_arg __user *req,
 			      struct statmnt __user *buf, size_t bufsize,
 			      unsigned int flags);
+asmlinkage long sys_listmount(const struct __mount_arg __user *req,
+			      u64 __user *buf, size_t bufsize,
+			      unsigned int flags);
 asmlinkage long sys_truncate(const char __user *path, long length);
 asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length);
 #if BITS_PER_LONG == 32
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 8f034e934a2e..8df6a747e21a 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -826,8 +826,11 @@  __SYSCALL(__NR_fchmodat2, sys_fchmodat2)
 #define __NR_statmount   454
 __SYSCALL(__NR_statmount, sys_statmount)
 
+#define __NR_listmount   455
+__SYSCALL(__NR_listmount, sys_listmount)
+
 #undef __NR_syscalls
-#define __NR_syscalls 455
+#define __NR_syscalls 456
 
 /*
  * 32 bit systems traditionally used different
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index d2c988ab526b..7aa9916659d2 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -194,4 +194,7 @@  struct __mount_arg {
 #define STMT_MNT_POINT		0x00000010U	/* Want/got mnt_point */
 #define STMT_FS_TYPE		0x00000020U	/* Want/got fs_type */
 
+/* listmount(2) flags */
+#define LISTMOUNT_UNREACHABLE	0x01	/* List unreachable mounts too */
+
 #endif /* _UAPI_LINUX_MOUNT_H */