diff mbox series

[v3,3/4] add statmount(2) syscall

Message ID 20230928130147.564503-4-mszeredi@redhat.com (mailing list archive)
State New, archived
Headers show
Series querying mount attributes | expand

Commit Message

Miklos Szeredi Sept. 28, 2023, 1:01 p.m. UTC
Add a way to query attributes of a single mount instead of having to parse
the complete /proc/$PID/mountinfo, which might be huge.

Look up the mount by the new 64bit mount ID.  If a mount needs to be queried
based on path, then statx(2) can be used to first query the mount ID
belonging to the path.

Design is based on a suggestion by Linus:

  "So I'd suggest something that is very much like "statfsat()", which gets
   a buffer and a length, and returns an extended "struct statfs" *AND*
   just a string description at the end."

The interface closely mimics that of statx.

Handle ASCII attributes by appending after the end of the structure (as per
above suggestion).  Offsets of the strings, relative to the start of the
string area, are stored in u32 members, so the structure is the same
regardless of pointer size.  Strings are nul terminated.

Link: https://lore.kernel.org/all/CAHk-=wh5YifP7hzKSbwJj94+DZ2czjrZsczy6GBimiogZws=rg@mail.gmail.com/
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
---
 arch/x86/entry/syscalls/syscall_32.tbl |   1 +
 arch/x86/entry/syscalls/syscall_64.tbl |   1 +
 fs/namespace.c                         | 283 +++++++++++++++++++++++++
 fs/statfs.c                            |   1 +
 include/linux/syscalls.h               |   5 +
 include/uapi/asm-generic/unistd.h      |   5 +-
 include/uapi/linux/mount.h             |  56 +++++
 7 files changed, 351 insertions(+), 1 deletion(-)

Comments

Ian Kent Sept. 29, 2023, 12:42 a.m. UTC | #1
On 28/9/23 21:01, Miklos Szeredi wrote:
> Add a way to query attributes of a single mount instead of having to parse
> the complete /proc/$PID/mountinfo, which might be huge.
>
> Lookup the mount the new 64bit mount ID.  If a mount needs to be queried
> based on path, then statx(2) can be used to first query the mount ID
> belonging to the path.
>
> Design is based on a suggestion by Linus:
>
>    "So I'd suggest something that is very much like "statfsat()", which gets
>     a buffer and a length, and returns an extended "struct statfs" *AND*
>     just a string description at the end."
>
> The interface closely mimics that of statx.
>
> Handle ASCII attributes by appending after the end of the structure (as per
> above suggestion).  Pointers to strings are stored in u64 members to make
> the structure the same regardless of pointer size.  Strings are nul
> terminated.
>
> Link: https://lore.kernel.org/all/CAHk-=wh5YifP7hzKSbwJj94+DZ2czjrZsczy6GBimiogZws=rg@mail.gmail.com/
> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
> ---
>   arch/x86/entry/syscalls/syscall_32.tbl |   1 +
>   arch/x86/entry/syscalls/syscall_64.tbl |   1 +
>   fs/namespace.c                         | 283 +++++++++++++++++++++++++
>   fs/statfs.c                            |   1 +
>   include/linux/syscalls.h               |   5 +
>   include/uapi/asm-generic/unistd.h      |   5 +-
>   include/uapi/linux/mount.h             |  56 +++++
>   7 files changed, 351 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
> index 2d0b1bd866ea..317b1320ad18 100644
> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> @@ -457,3 +457,4 @@
>   450	i386	set_mempolicy_home_node		sys_set_mempolicy_home_node
>   451	i386	cachestat		sys_cachestat
>   452	i386	fchmodat2		sys_fchmodat2
> +454	i386	statmount		sys_statmount
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index 1d6eee30eceb..7312c440978f 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -375,6 +375,7 @@
>   451	common	cachestat		sys_cachestat
>   452	common	fchmodat2		sys_fchmodat2
>   453	64	map_shadow_stack	sys_map_shadow_stack
> +454	common	statmount		sys_statmount
>   
>   #
>   # Due to a historical design error, certain syscalls are numbered differently
> diff --git a/fs/namespace.c b/fs/namespace.c
> index c3a41200fe70..3326ba2b2810 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -4687,6 +4687,289 @@ int show_path(struct seq_file *m, struct dentry *root)
>   	return 0;
>   }
>   
> +static struct vfsmount *lookup_mnt_in_ns(u64 id, struct mnt_namespace *ns)
> +{
> +	struct mount *mnt;
> +	struct vfsmount *res = NULL;
> +
> +	lock_ns_list(ns);
> +	list_for_each_entry(mnt, &ns->list, mnt_list) {
> +		if (!mnt_is_cursor(mnt) && id == mnt->mnt_id_unique) {
> +			res = &mnt->mnt;
> +			break;
> +		}
> +	}
> +	unlock_ns_list(ns);
> +	return res;
> +}

Seems like we might need to consider making (struct mnt_namespace)->list
a hashed list.


The number of mounts could be large, for example for people using autofs
direct mount setups.


It's not common for people to have on the order of 8k map entries (for
which there is a trigger mount per entry plus any mounts that have been
automounted) but it does happen. A small setup would be on the order of
1k map entries plus automounted mounts, so the benefit is likely still
there to some extent.


Ian

> +
> +struct stmt_state {
> +	struct statmnt __user *const buf;
> +	size_t const bufsize;
> +	struct vfsmount *const mnt;
> +	u64 const mask;
> +	struct seq_file seq;
> +	struct path root;
> +	struct statmnt sm;
> +	size_t pos;
> +	int err;
> +};
> +
> +typedef int (*stmt_func_t)(struct stmt_state *);
> +
> +static int stmt_string_seq(struct stmt_state *s, stmt_func_t func)
> +{
> +	size_t rem = s->bufsize - s->pos - sizeof(s->sm);
> +	struct seq_file *seq = &s->seq;
> +	int ret;
> +
> +	seq->count = 0;
> +	seq->size = min(seq->size, rem);
> +	seq->buf = kvmalloc(seq->size, GFP_KERNEL_ACCOUNT);
> +	if (!seq->buf)
> +		return -ENOMEM;
> +
> +	ret = func(s);
> +	if (ret)
> +		return ret;
> +
> +	if (seq_has_overflowed(seq)) {
> +		if (seq->size == rem)
> +			return -EOVERFLOW;
> +		seq->size *= 2;
> +		if (seq->size > MAX_RW_COUNT)
> +			return -ENOMEM;
> +		kvfree(seq->buf);
> +		return 0;
> +	}
> +
> +	/* Done */
> +	return 1;
> +}
> +
> +static void stmt_string(struct stmt_state *s, u64 mask, stmt_func_t func,
> +		       u32 *str)
> +{
> +	int ret = s->pos + sizeof(s->sm) >= s->bufsize ? -EOVERFLOW : 0;
> +	struct statmnt *sm = &s->sm;
> +	struct seq_file *seq = &s->seq;
> +
> +	if (s->err || !(s->mask & mask))
> +		return;
> +
> +	seq->size = PAGE_SIZE;
> +	while (!ret)
> +		ret = stmt_string_seq(s, func);
> +
> +	if (ret < 0) {
> +		s->err = ret;
> +	} else {
> +		seq->buf[seq->count++] = '\0';
> +		if (copy_to_user(s->buf->str + s->pos, seq->buf, seq->count)) {
> +			s->err = -EFAULT;
> +		} else {
> +			*str = s->pos;
> +			s->pos += seq->count;
> +		}
> +	}
> +	kvfree(seq->buf);
> +	sm->mask |= mask;
> +}
> +
> +static void stmt_numeric(struct stmt_state *s, u64 mask, stmt_func_t func)
> +{
> +	if (s->err || !(s->mask & mask))
> +		return;
> +
> +	s->err = func(s);
> +	s->sm.mask |= mask;
> +}
> +
> +static u64 mnt_to_attr_flags(struct vfsmount *mnt)
> +{
> +	unsigned int mnt_flags = READ_ONCE(mnt->mnt_flags);
> +	u64 attr_flags = 0;
> +
> +	if (mnt_flags & MNT_READONLY)
> +		attr_flags |= MOUNT_ATTR_RDONLY;
> +	if (mnt_flags & MNT_NOSUID)
> +		attr_flags |= MOUNT_ATTR_NOSUID;
> +	if (mnt_flags & MNT_NODEV)
> +		attr_flags |= MOUNT_ATTR_NODEV;
> +	if (mnt_flags & MNT_NOEXEC)
> +		attr_flags |= MOUNT_ATTR_NOEXEC;
> +	if (mnt_flags & MNT_NODIRATIME)
> +		attr_flags |= MOUNT_ATTR_NODIRATIME;
> +	if (mnt_flags & MNT_NOSYMFOLLOW)
> +		attr_flags |= MOUNT_ATTR_NOSYMFOLLOW;
> +
> +	if (mnt_flags & MNT_NOATIME)
> +		attr_flags |= MOUNT_ATTR_NOATIME;
> +	else if (mnt_flags & MNT_RELATIME)
> +		attr_flags |= MOUNT_ATTR_RELATIME;
> +	else
> +		attr_flags |= MOUNT_ATTR_STRICTATIME;
> +
> +	if (is_idmapped_mnt(mnt))
> +		attr_flags |= MOUNT_ATTR_IDMAP;
> +
> +	return attr_flags;
> +}
> +
> +static u64 mnt_to_propagation_flags(struct mount *m)
> +{
> +	u64 propagation = 0;
> +
> +	if (IS_MNT_SHARED(m))
> +		propagation |= MS_SHARED;
> +	if (IS_MNT_SLAVE(m))
> +		propagation |= MS_SLAVE;
> +	if (IS_MNT_UNBINDABLE(m))
> +		propagation |= MS_UNBINDABLE;
> +	if (!propagation)
> +		propagation |= MS_PRIVATE;
> +
> +	return propagation;
> +}
> +
> +static int stmt_sb_basic(struct stmt_state *s)
> +{
> +	struct super_block *sb = s->mnt->mnt_sb;
> +
> +	s->sm.sb_dev_major = MAJOR(sb->s_dev);
> +	s->sm.sb_dev_minor = MINOR(sb->s_dev);
> +	s->sm.sb_magic = sb->s_magic;
> +	s->sm.sb_flags = sb->s_flags & (SB_RDONLY|SB_SYNCHRONOUS|SB_DIRSYNC|SB_LAZYTIME);
> +
> +	return 0;
> +}
> +
> +static int stmt_mnt_basic(struct stmt_state *s)
> +{
> +	struct mount *m = real_mount(s->mnt);
> +
> +	s->sm.mnt_id = m->mnt_id_unique;
> +	s->sm.mnt_parent_id = m->mnt_parent->mnt_id_unique;
> +	s->sm.mnt_id_old = m->mnt_id;
> +	s->sm.mnt_parent_id_old = m->mnt_parent->mnt_id;
> +	s->sm.mnt_attr = mnt_to_attr_flags(&m->mnt);
> +	s->sm.mnt_propagation = mnt_to_propagation_flags(m);
> +	s->sm.mnt_peer_group = IS_MNT_SHARED(m) ? m->mnt_group_id : 0;
> +	s->sm.mnt_master = IS_MNT_SLAVE(m) ? m->mnt_master->mnt_group_id : 0;
> +
> +	return 0;
> +}
> +
> +static int stmt_propagate_from(struct stmt_state *s)
> +{
> +	struct mount *m = real_mount(s->mnt);
> +
> +	if (!IS_MNT_SLAVE(m))
> +		return 0;
> +
> +	s->sm.propagate_from = get_dominating_id(m, &current->fs->root);
> +
> +	return 0;
> +}
> +
> +static int stmt_mnt_root(struct stmt_state *s)
> +{
> +	struct seq_file *seq = &s->seq;
> +	int err = show_path(seq, s->mnt->mnt_root);
> +
> +	if (!err && !seq_has_overflowed(seq)) {
> +		seq->buf[seq->count] = '\0';
> +		seq->count = string_unescape_inplace(seq->buf, UNESCAPE_OCTAL);
> +	}
> +	return err;
> +}
> +
> +static int stmt_mnt_point(struct stmt_state *s)
> +{
> +	struct vfsmount *mnt = s->mnt;
> +	struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
> +	int err = seq_path_root(&s->seq, &mnt_path, &s->root, "");
> +
> +	return err == SEQ_SKIP ? 0 : err;
> +}
> +
> +static int stmt_fs_type(struct stmt_state *s)
> +{
> +	struct seq_file *seq = &s->seq;
> +	struct super_block *sb = s->mnt->mnt_sb;
> +
> +	seq_puts(seq, sb->s_type->name);
> +	return 0;
> +}
> +
> +static int do_statmount(struct stmt_state *s)
> +{
> +	struct statmnt *sm = &s->sm;
> +	struct mount *m = real_mount(s->mnt);
> +	size_t copysize = min_t(size_t, s->bufsize, sizeof(*sm));
> +	int err;
> +
> +	err = security_sb_statfs(s->mnt->mnt_root);
> +	if (err)
> +		return err;
> +
> +	if (!capable(CAP_SYS_ADMIN) &&
> +	    !is_path_reachable(m, m->mnt.mnt_root, &s->root))
> +		return -EPERM;
> +
> +	stmt_numeric(s, STMT_SB_BASIC, stmt_sb_basic);
> +	stmt_numeric(s, STMT_MNT_BASIC, stmt_mnt_basic);
> +	stmt_numeric(s, STMT_PROPAGATE_FROM, stmt_propagate_from);
> +	stmt_string(s, STMT_FS_TYPE, stmt_fs_type, &sm->fs_type);
> +	stmt_string(s, STMT_MNT_ROOT, stmt_mnt_root, &sm->mnt_root);
> +	stmt_string(s, STMT_MNT_POINT, stmt_mnt_point, &sm->mnt_point);
> +
> +	if (s->err)
> +		return s->err;
> +
> +	/* Return the number of bytes copied to the buffer */
> +	sm->size = copysize + s->pos;
> +
> +	if (copy_to_user(s->buf, sm, copysize))
> +		return -EFAULT;
> +
> +	return 0;
> +}
> +
> +SYSCALL_DEFINE4(statmount, const struct __mount_arg __user *, req,
> +		struct statmnt __user *, buf, size_t, bufsize,
> +		unsigned int, flags)
> +{
> +	struct vfsmount *mnt;
> +	struct __mount_arg kreq;
> +	int ret;
> +
> +	if (flags)
> +		return -EINVAL;
> +
> +	if (copy_from_user(&kreq, req, sizeof(kreq)))
> +		return -EFAULT;
> +
> +	down_read(&namespace_sem);
> +	mnt = lookup_mnt_in_ns(kreq.mnt_id, current->nsproxy->mnt_ns);
> +	ret = -ENOENT;
> +	if (mnt) {
> +		struct stmt_state s = {
> +			.mask = kreq.request_mask,
> +			.buf = buf,
> +			.bufsize = bufsize,
> +			.mnt = mnt,
> +		};
> +
> +		get_fs_root(current->fs, &s.root);
> +		ret = do_statmount(&s);
> +		path_put(&s.root);
> +	}
> +	up_read(&namespace_sem);
> +
> +	return ret;
> +}
> +
>   static void __init init_mount_tree(void)
>   {
>   	struct vfsmount *mnt;
> diff --git a/fs/statfs.c b/fs/statfs.c
> index 96d1c3edf289..cc774c2e2c9a 100644
> --- a/fs/statfs.c
> +++ b/fs/statfs.c
> @@ -9,6 +9,7 @@
>   #include <linux/security.h>
>   #include <linux/uaccess.h>
>   #include <linux/compat.h>
> +#include <uapi/linux/mount.h>
>   #include "internal.h"
>   
>   static int flags_by_mnt(int mnt_flags)
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 22bc6bc147f8..ba371024d902 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -74,6 +74,8 @@ struct landlock_ruleset_attr;
>   enum landlock_rule_type;
>   struct cachestat_range;
>   struct cachestat;
> +struct statmnt;
> +struct __mount_arg;
>   
>   #include <linux/types.h>
>   #include <linux/aio_abi.h>
> @@ -408,6 +410,9 @@ asmlinkage long sys_statfs64(const char __user *path, size_t sz,
>   asmlinkage long sys_fstatfs(unsigned int fd, struct statfs __user *buf);
>   asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz,
>   				struct statfs64 __user *buf);
> +asmlinkage long sys_statmount(const struct __mount_arg __user *req,
> +			      struct statmnt __user *buf, size_t bufsize,
> +			      unsigned int flags);
>   asmlinkage long sys_truncate(const char __user *path, long length);
>   asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length);
>   #if BITS_PER_LONG == 32
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index abe087c53b4b..8f034e934a2e 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -823,8 +823,11 @@ __SYSCALL(__NR_cachestat, sys_cachestat)
>   #define __NR_fchmodat2 452
>   __SYSCALL(__NR_fchmodat2, sys_fchmodat2)
>   
> +#define __NR_statmount   454
> +__SYSCALL(__NR_statmount, sys_statmount)
> +
>   #undef __NR_syscalls
> -#define __NR_syscalls 453
> +#define __NR_syscalls 455
>   
>   /*
>    * 32 bit systems traditionally used different
> diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
> index bb242fdcfe6b..d2c988ab526b 100644
> --- a/include/uapi/linux/mount.h
> +++ b/include/uapi/linux/mount.h
> @@ -138,4 +138,60 @@ struct mount_attr {
>   /* List of all mount_attr versions. */
>   #define MOUNT_ATTR_SIZE_VER0	32 /* sizeof first published struct */
>   
> +
> +/*
> + * Structure for getting mount/superblock/filesystem info with statmount(2).
> + *
> + * The interface is similar to statx(2): individual fields or groups can be
> + * selected with the @mask argument of statmount().  Kernel will set the @mask
> + * field according to the supported fields.
> + *
> + * If string fields are selected, then the caller needs to pass a buffer that
> + * has space after the fixed part of the structure.  Nul terminated strings are
> + * copied there and offsets relative to @str are stored in the relevant fields.
> + * If the buffer is too small, then EOVERFLOW is returned.  The actually used
> + * size is returned in @size.
> + */
> +struct statmnt {
> +	__u32 size;		/* Total size, including strings */
> +	__u32 __spare1;
> +	__u64 mask;		/* What results were written */
> +	__u32 sb_dev_major;	/* Device ID */
> +	__u32 sb_dev_minor;
> +	__u64 sb_magic;		/* ..._SUPER_MAGIC */
> +	__u32 sb_flags;		/* MS_{RDONLY,SYNCHRONOUS,DIRSYNC,LAZYTIME} */
> +	__u32 fs_type;		/* [str] Filesystem type */
> +	__u64 mnt_id;		/* Unique ID of mount */
> +	__u64 mnt_parent_id;	/* Unique ID of parent (for root == mnt_id) */
> +	__u32 mnt_id_old;	/* Reused IDs used in proc/.../mountinfo */
> +	__u32 mnt_parent_id_old;
> +	__u64 mnt_attr;		/* MOUNT_ATTR_... */
> +	__u64 mnt_propagation;	/* MS_{SHARED,SLAVE,PRIVATE,UNBINDABLE} */
> +	__u64 mnt_peer_group;	/* ID of shared peer group */
> +	__u64 mnt_master;	/* Mount receives propagation from this ID */
> +	__u64 propagate_from;	/* Propagation from in current namespace */
> +	__u32 mnt_root;		/* [str] Root of mount relative to root of fs */
> +	__u32 mnt_point;	/* [str] Mountpoint relative to current root */
> +	__u64 __spare2[50];
> +	char str[];		/* Variable size part containing strings */
> +};
> +
> +/*
> + * To be used on the kernel ABI only for passing 64bit arguments to statmount(2)
> + */
> +struct __mount_arg {
> +	__u64 mnt_id;
> +	__u64 request_mask;
> +};
> +
> +/*
> + * @mask bits for statmount(2)
> + */
> +#define STMT_SB_BASIC		0x00000001U     /* Want/got sb_... */
> +#define STMT_MNT_BASIC		0x00000002U	/* Want/got mnt_... */
> +#define STMT_PROPAGATE_FROM	0x00000004U	/* Want/got propagate_from */
> +#define STMT_MNT_ROOT		0x00000008U	/* Want/got mnt_root  */
> +#define STMT_MNT_POINT		0x00000010U	/* Want/got mnt_point */
> +#define STMT_FS_TYPE		0x00000020U	/* Want/got fs_type */
> +
>   #endif /* _UAPI_LINUX_MOUNT_H */
Miklos Szeredi Sept. 29, 2023, 9:10 a.m. UTC | #2
On Fri, 29 Sept 2023 at 02:42, Ian Kent <raven@themaw.net> wrote:
>
> On 28/9/23 21:01, Miklos Szeredi wrote:

> > +static struct vfsmount *lookup_mnt_in_ns(u64 id, struct mnt_namespace *ns)
> > +{
> > +     struct mount *mnt;
> > +     struct vfsmount *res = NULL;
> > +
> > +     lock_ns_list(ns);
> > +     list_for_each_entry(mnt, &ns->list, mnt_list) {
> > +             if (!mnt_is_cursor(mnt) && id == mnt->mnt_id_unique) {
> > +                     res = &mnt->mnt;
> > +                     break;
> > +             }
> > +     }
> > +     unlock_ns_list(ns);
> > +     return res;
> > +}
>
> Seems like we might need to consider making (struct mnt_namespace)->list
>
> a hashed list.

Yes, linear search needs to go.  A hash table is probably the easiest solution.

But I'd also consider replacing ns->list with an rbtree.  Not as
trivial as adding a system hash table and probably also slightly
slower, but it would have some advantages:

 - most space efficient (no overhead of hash buckets)

 - cursor can go away (f_pos can just contain last ID)

Thanks,
Miklos
Ian Kent Sept. 30, 2023, 1:16 a.m. UTC | #3
On 29/9/23 17:10, Miklos Szeredi wrote:
> On Fri, 29 Sept 2023 at 02:42, Ian Kent <raven@themaw.net> wrote:
>> On 28/9/23 21:01, Miklos Szeredi wrote:
>>> +static struct vfsmount *lookup_mnt_in_ns(u64 id, struct mnt_namespace *ns)
>>> +{
>>> +     struct mount *mnt;
>>> +     struct vfsmount *res = NULL;
>>> +
>>> +     lock_ns_list(ns);
>>> +     list_for_each_entry(mnt, &ns->list, mnt_list) {
>>> +             if (!mnt_is_cursor(mnt) && id == mnt->mnt_id_unique) {
>>> +                     res = &mnt->mnt;
>>> +                     break;
>>> +             }
>>> +     }
>>> +     unlock_ns_list(ns);
>>> +     return res;
>>> +}
>> Seems like we might need to consider making (struct mnt_namespace)->list
>>
>> a hashed list.
> Yes, linear search needs to go.  A hash table is probably the easiest solution.
>
> But I'd also consider replacing ns->list with an rbtree.  Not as
> trivial as adding a system hash table and probably also slightly
> slower, but it would have some advantages:
>
>   - most space efficient (no overhead of hash buckets)
>
>   - cursor can go away (f_pos can just contain last ID)

I guess that would be ok.

Avoiding the cursor is a big plus.


An rbtree is used in kernfs and its readdir function is rather painful, so
I wonder what the implications might be for other enumeration needs.


Ian
Paul Moore Oct. 4, 2023, 7:26 p.m. UTC | #4
On Thu, Sep 28, 2023 at 9:03 AM Miklos Szeredi <mszeredi@redhat.com> wrote:
>
> Add a way to query attributes of a single mount instead of having to parse
> the complete /proc/$PID/mountinfo, which might be huge.
>
> Lookup the mount the new 64bit mount ID.  If a mount needs to be queried
> based on path, then statx(2) can be used to first query the mount ID
> belonging to the path.
>
> Design is based on a suggestion by Linus:
>
>   "So I'd suggest something that is very much like "statfsat()", which gets
>    a buffer and a length, and returns an extended "struct statfs" *AND*
>    just a string description at the end."
>
> The interface closely mimics that of statx.
>
> Handle ASCII attributes by appending after the end of the structure (as per
> above suggestion).  Pointers to strings are stored in u64 members to make
> the structure the same regardless of pointer size.  Strings are nul
> terminated.
>
> Link: https://lore.kernel.org/all/CAHk-=wh5YifP7hzKSbwJj94+DZ2czjrZsczy6GBimiogZws=rg@mail.gmail.com/
> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
> ---
>  arch/x86/entry/syscalls/syscall_32.tbl |   1 +
>  arch/x86/entry/syscalls/syscall_64.tbl |   1 +
>  fs/namespace.c                         | 283 +++++++++++++++++++++++++
>  fs/statfs.c                            |   1 +
>  include/linux/syscalls.h               |   5 +
>  include/uapi/asm-generic/unistd.h      |   5 +-
>  include/uapi/linux/mount.h             |  56 +++++
>  7 files changed, 351 insertions(+), 1 deletion(-)

...

> diff --git a/fs/namespace.c b/fs/namespace.c
> index c3a41200fe70..3326ba2b2810 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c

...

> +static int do_statmount(struct stmt_state *s)
> +{
> +       struct statmnt *sm = &s->sm;
> +       struct mount *m = real_mount(s->mnt);
> +       size_t copysize = min_t(size_t, s->bufsize, sizeof(*sm));
> +       int err;
> +
> +       err = security_sb_statfs(s->mnt->mnt_root);
> +       if (err)
> +               return err;

Thank you for adding the security_sb_statfs() call to this operation,
however I believe we want to place it *after* the capability check to
be consistent with other LSM calls.

> +       if (!capable(CAP_SYS_ADMIN) &&
> +           !is_path_reachable(m, m->mnt.mnt_root, &s->root))
> +               return -EPERM;
> +
> +       stmt_numeric(s, STMT_SB_BASIC, stmt_sb_basic);
> +       stmt_numeric(s, STMT_MNT_BASIC, stmt_mnt_basic);
> +       stmt_numeric(s, STMT_PROPAGATE_FROM, stmt_propagate_from);
> +       stmt_string(s, STMT_FS_TYPE, stmt_fs_type, &sm->fs_type);
> +       stmt_string(s, STMT_MNT_ROOT, stmt_mnt_root, &sm->mnt_root);
> +       stmt_string(s, STMT_MNT_POINT, stmt_mnt_point, &sm->mnt_point);
> +
> +       if (s->err)
> +               return s->err;
> +
> +       /* Return the number of bytes copied to the buffer */
> +       sm->size = copysize + s->pos;
> +
> +       if (copy_to_user(s->buf, sm, copysize))
> +               return -EFAULT;
> +
> +       return 0;
> +}
diff mbox series

Patch

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 2d0b1bd866ea..317b1320ad18 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -457,3 +457,4 @@ 
 450	i386	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	i386	cachestat		sys_cachestat
 452	i386	fchmodat2		sys_fchmodat2
+454	i386	statmount		sys_statmount
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 1d6eee30eceb..7312c440978f 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -375,6 +375,7 @@ 
 451	common	cachestat		sys_cachestat
 452	common	fchmodat2		sys_fchmodat2
 453	64	map_shadow_stack	sys_map_shadow_stack
+454	common	statmount		sys_statmount
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/fs/namespace.c b/fs/namespace.c
index c3a41200fe70..3326ba2b2810 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -4687,6 +4687,289 @@  int show_path(struct seq_file *m, struct dentry *root)
 	return 0;
 }
 
+static struct vfsmount *lookup_mnt_in_ns(u64 id, struct mnt_namespace *ns)
+{
+	struct mount *mnt;
+	struct vfsmount *res = NULL;
+
+	lock_ns_list(ns);
+	list_for_each_entry(mnt, &ns->list, mnt_list) {
+		if (!mnt_is_cursor(mnt) && id == mnt->mnt_id_unique) {
+			res = &mnt->mnt;
+			break;
+		}
+	}
+	unlock_ns_list(ns);
+	return res;
+}
+
+struct stmt_state {
+	struct statmnt __user *const buf;
+	size_t const bufsize;
+	struct vfsmount *const mnt;
+	u64 const mask;
+	struct seq_file seq;
+	struct path root;
+	struct statmnt sm;
+	size_t pos;
+	int err;
+};
+
+typedef int (*stmt_func_t)(struct stmt_state *);
+
+static int stmt_string_seq(struct stmt_state *s, stmt_func_t func)
+{
+	size_t rem = s->bufsize - s->pos - sizeof(s->sm);
+	struct seq_file *seq = &s->seq;
+	int ret;
+
+	seq->count = 0;
+	seq->size = min(seq->size, rem);
+	seq->buf = kvmalloc(seq->size, GFP_KERNEL_ACCOUNT);
+	if (!seq->buf)
+		return -ENOMEM;
+
+	ret = func(s);
+	if (ret)
+		return ret;
+
+	if (seq_has_overflowed(seq)) {
+		if (seq->size == rem)
+			return -EOVERFLOW;
+		seq->size *= 2;
+		if (seq->size > MAX_RW_COUNT)
+			return -ENOMEM;
+		kvfree(seq->buf);
+		return 0;
+	}
+
+	/* Done */
+	return 1;
+}
+
+static void stmt_string(struct stmt_state *s, u64 mask, stmt_func_t func,
+		       u32 *str)
+{
+	int ret = s->pos + sizeof(s->sm) >= s->bufsize ? -EOVERFLOW : 0;
+	struct statmnt *sm = &s->sm;
+	struct seq_file *seq = &s->seq;
+
+	if (s->err || !(s->mask & mask))
+		return;
+
+	seq->size = PAGE_SIZE;
+	while (!ret)
+		ret = stmt_string_seq(s, func);
+
+	if (ret < 0) {
+		s->err = ret;
+	} else {
+		seq->buf[seq->count++] = '\0';
+		if (copy_to_user(s->buf->str + s->pos, seq->buf, seq->count)) {
+			s->err = -EFAULT;
+		} else {
+			*str = s->pos;
+			s->pos += seq->count;
+		}
+	}
+	kvfree(seq->buf);
+	sm->mask |= mask;
+}
+
+static void stmt_numeric(struct stmt_state *s, u64 mask, stmt_func_t func)
+{
+	if (s->err || !(s->mask & mask))
+		return;
+
+	s->err = func(s);
+	s->sm.mask |= mask;
+}
+
+static u64 mnt_to_attr_flags(struct vfsmount *mnt)
+{
+	unsigned int mnt_flags = READ_ONCE(mnt->mnt_flags);
+	u64 attr_flags = 0;
+
+	if (mnt_flags & MNT_READONLY)
+		attr_flags |= MOUNT_ATTR_RDONLY;
+	if (mnt_flags & MNT_NOSUID)
+		attr_flags |= MOUNT_ATTR_NOSUID;
+	if (mnt_flags & MNT_NODEV)
+		attr_flags |= MOUNT_ATTR_NODEV;
+	if (mnt_flags & MNT_NOEXEC)
+		attr_flags |= MOUNT_ATTR_NOEXEC;
+	if (mnt_flags & MNT_NODIRATIME)
+		attr_flags |= MOUNT_ATTR_NODIRATIME;
+	if (mnt_flags & MNT_NOSYMFOLLOW)
+		attr_flags |= MOUNT_ATTR_NOSYMFOLLOW;
+
+	if (mnt_flags & MNT_NOATIME)
+		attr_flags |= MOUNT_ATTR_NOATIME;
+	else if (mnt_flags & MNT_RELATIME)
+		attr_flags |= MOUNT_ATTR_RELATIME;
+	else
+		attr_flags |= MOUNT_ATTR_STRICTATIME;
+
+	if (is_idmapped_mnt(mnt))
+		attr_flags |= MOUNT_ATTR_IDMAP;
+
+	return attr_flags;
+}
+
+static u64 mnt_to_propagation_flags(struct mount *m)
+{
+	u64 propagation = 0;
+
+	if (IS_MNT_SHARED(m))
+		propagation |= MS_SHARED;
+	if (IS_MNT_SLAVE(m))
+		propagation |= MS_SLAVE;
+	if (IS_MNT_UNBINDABLE(m))
+		propagation |= MS_UNBINDABLE;
+	if (!propagation)
+		propagation |= MS_PRIVATE;
+
+	return propagation;
+}
+
+static int stmt_sb_basic(struct stmt_state *s)
+{
+	struct super_block *sb = s->mnt->mnt_sb;
+
+	s->sm.sb_dev_major = MAJOR(sb->s_dev);
+	s->sm.sb_dev_minor = MINOR(sb->s_dev);
+	s->sm.sb_magic = sb->s_magic;
+	s->sm.sb_flags = sb->s_flags & (SB_RDONLY|SB_SYNCHRONOUS|SB_DIRSYNC|SB_LAZYTIME);
+
+	return 0;
+}
+
+static int stmt_mnt_basic(struct stmt_state *s)
+{
+	struct mount *m = real_mount(s->mnt);
+
+	s->sm.mnt_id = m->mnt_id_unique;
+	s->sm.mnt_parent_id = m->mnt_parent->mnt_id_unique;
+	s->sm.mnt_id_old = m->mnt_id;
+	s->sm.mnt_parent_id_old = m->mnt_parent->mnt_id;
+	s->sm.mnt_attr = mnt_to_attr_flags(&m->mnt);
+	s->sm.mnt_propagation = mnt_to_propagation_flags(m);
+	s->sm.mnt_peer_group = IS_MNT_SHARED(m) ? m->mnt_group_id : 0;
+	s->sm.mnt_master = IS_MNT_SLAVE(m) ? m->mnt_master->mnt_group_id : 0;
+
+	return 0;
+}
+
+static int stmt_propagate_from(struct stmt_state *s)
+{
+	struct mount *m = real_mount(s->mnt);
+
+	if (!IS_MNT_SLAVE(m))
+		return 0;
+
+	s->sm.propagate_from = get_dominating_id(m, &current->fs->root);
+
+	return 0;
+}
+
+static int stmt_mnt_root(struct stmt_state *s)
+{
+	struct seq_file *seq = &s->seq;
+	int err = show_path(seq, s->mnt->mnt_root);
+
+	if (!err && !seq_has_overflowed(seq)) {
+		seq->buf[seq->count] = '\0';
+		seq->count = string_unescape_inplace(seq->buf, UNESCAPE_OCTAL);
+	}
+	return err;
+}
+
+static int stmt_mnt_point(struct stmt_state *s)
+{
+	struct vfsmount *mnt = s->mnt;
+	struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
+	int err = seq_path_root(&s->seq, &mnt_path, &s->root, "");
+
+	return err == SEQ_SKIP ? 0 : err;
+}
+
+static int stmt_fs_type(struct stmt_state *s)
+{
+	struct seq_file *seq = &s->seq;
+	struct super_block *sb = s->mnt->mnt_sb;
+
+	seq_puts(seq, sb->s_type->name);
+	return 0;
+}
+
+static int do_statmount(struct stmt_state *s)
+{
+	struct statmnt *sm = &s->sm;
+	struct mount *m = real_mount(s->mnt);
+	size_t copysize = min_t(size_t, s->bufsize, sizeof(*sm));
+	int err;
+
+	err = security_sb_statfs(s->mnt->mnt_root);
+	if (err)
+		return err;
+
+	if (!capable(CAP_SYS_ADMIN) &&
+	    !is_path_reachable(m, m->mnt.mnt_root, &s->root))
+		return -EPERM;
+
+	stmt_numeric(s, STMT_SB_BASIC, stmt_sb_basic);
+	stmt_numeric(s, STMT_MNT_BASIC, stmt_mnt_basic);
+	stmt_numeric(s, STMT_PROPAGATE_FROM, stmt_propagate_from);
+	stmt_string(s, STMT_FS_TYPE, stmt_fs_type, &sm->fs_type);
+	stmt_string(s, STMT_MNT_ROOT, stmt_mnt_root, &sm->mnt_root);
+	stmt_string(s, STMT_MNT_POINT, stmt_mnt_point, &sm->mnt_point);
+
+	if (s->err)
+		return s->err;
+
+	/* Return the number of bytes copied to the buffer */
+	sm->size = copysize + s->pos;
+
+	if (copy_to_user(s->buf, sm, copysize))
+		return -EFAULT;
+
+	return 0;
+}
+
+SYSCALL_DEFINE4(statmount, const struct __mount_arg __user *, req,
+		struct statmnt __user *, buf, size_t, bufsize,
+		unsigned int, flags)
+{
+	struct vfsmount *mnt;
+	struct __mount_arg kreq;
+	int ret;
+
+	if (flags)
+		return -EINVAL;
+
+	if (copy_from_user(&kreq, req, sizeof(kreq)))
+		return -EFAULT;
+
+	down_read(&namespace_sem);
+	mnt = lookup_mnt_in_ns(kreq.mnt_id, current->nsproxy->mnt_ns);
+	ret = -ENOENT;
+	if (mnt) {
+		struct stmt_state s = {
+			.mask = kreq.request_mask,
+			.buf = buf,
+			.bufsize = bufsize,
+			.mnt = mnt,
+		};
+
+		get_fs_root(current->fs, &s.root);
+		ret = do_statmount(&s);
+		path_put(&s.root);
+	}
+	up_read(&namespace_sem);
+
+	return ret;
+}
+
 static void __init init_mount_tree(void)
 {
 	struct vfsmount *mnt;
diff --git a/fs/statfs.c b/fs/statfs.c
index 96d1c3edf289..cc774c2e2c9a 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -9,6 +9,7 @@ 
 #include <linux/security.h>
 #include <linux/uaccess.h>
 #include <linux/compat.h>
+#include <uapi/linux/mount.h>
 #include "internal.h"
 
 static int flags_by_mnt(int mnt_flags)
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 22bc6bc147f8..ba371024d902 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -74,6 +74,8 @@  struct landlock_ruleset_attr;
 enum landlock_rule_type;
 struct cachestat_range;
 struct cachestat;
+struct statmnt;
+struct __mount_arg;
 
 #include <linux/types.h>
 #include <linux/aio_abi.h>
@@ -408,6 +410,9 @@  asmlinkage long sys_statfs64(const char __user *path, size_t sz,
 asmlinkage long sys_fstatfs(unsigned int fd, struct statfs __user *buf);
 asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz,
 				struct statfs64 __user *buf);
+asmlinkage long sys_statmount(const struct __mount_arg __user *req,
+			      struct statmnt __user *buf, size_t bufsize,
+			      unsigned int flags);
 asmlinkage long sys_truncate(const char __user *path, long length);
 asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length);
 #if BITS_PER_LONG == 32
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index abe087c53b4b..8f034e934a2e 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -823,8 +823,11 @@  __SYSCALL(__NR_cachestat, sys_cachestat)
 #define __NR_fchmodat2 452
 __SYSCALL(__NR_fchmodat2, sys_fchmodat2)
 
+#define __NR_statmount   454
+__SYSCALL(__NR_statmount, sys_statmount)
+
 #undef __NR_syscalls
-#define __NR_syscalls 453
+#define __NR_syscalls 455
 
 /*
  * 32 bit systems traditionally used different
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index bb242fdcfe6b..d2c988ab526b 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -138,4 +138,60 @@  struct mount_attr {
 /* List of all mount_attr versions. */
 #define MOUNT_ATTR_SIZE_VER0	32 /* sizeof first published struct */
 
+
+/*
+ * Structure for getting mount/superblock/filesystem info with statmount(2).
+ *
+ * The interface is similar to statx(2): individual fields or groups can be
+ * selected with the @mask argument of statmount().  Kernel will set the @mask
+ * field according to the supported fields.
+ *
+ * If string fields are selected, then the caller needs to pass a buffer that
+ * has space after the fixed part of the structure.  Nul terminated strings are
+ * copied there and offsets relative to @str are stored in the relevant fields.
+ * If the buffer is too small, then EOVERFLOW is returned.  The actually used
+ * size is returned in @size.
+ */
+struct statmnt {
+	__u32 size;		/* Total size, including strings */
+	__u32 __spare1;
+	__u64 mask;		/* What results were written */
+	__u32 sb_dev_major;	/* Device ID */
+	__u32 sb_dev_minor;
+	__u64 sb_magic;		/* ..._SUPER_MAGIC */
+	__u32 sb_flags;		/* MS_{RDONLY,SYNCHRONOUS,DIRSYNC,LAZYTIME} */
+	__u32 fs_type;		/* [str] Filesystem type */
+	__u64 mnt_id;		/* Unique ID of mount */
+	__u64 mnt_parent_id;	/* Unique ID of parent (for root == mnt_id) */
+	__u32 mnt_id_old;	/* Reused IDs used in proc/.../mountinfo */
+	__u32 mnt_parent_id_old;
+	__u64 mnt_attr;		/* MOUNT_ATTR_... */
+	__u64 mnt_propagation;	/* MS_{SHARED,SLAVE,PRIVATE,UNBINDABLE} */
+	__u64 mnt_peer_group;	/* ID of shared peer group */
+	__u64 mnt_master;	/* Mount receives propagation from this ID */
+	__u64 propagate_from;	/* Propagation from in current namespace */
+	__u32 mnt_root;		/* [str] Root of mount relative to root of fs */
+	__u32 mnt_point;	/* [str] Mountpoint relative to current root */
+	__u64 __spare2[50];
+	char str[];		/* Variable size part containing strings */
+};
+
+/*
+ * To be used on the kernel ABI only for passing 64bit arguments to statmount(2)
+ */
+struct __mount_arg {
+	__u64 mnt_id;
+	__u64 request_mask;
+};
+
+/*
+ * @mask bits for statmount(2)
+ */
+#define STMT_SB_BASIC		0x00000001U     /* Want/got sb_... */
+#define STMT_MNT_BASIC		0x00000002U	/* Want/got mnt_... */
+#define STMT_PROPAGATE_FROM	0x00000004U	/* Want/got propagate_from */
+#define STMT_MNT_ROOT		0x00000008U	/* Want/got mnt_root  */
+#define STMT_MNT_POINT		0x00000010U	/* Want/got mnt_point */
+#define STMT_FS_TYPE		0x00000020U	/* Want/got fs_type */
+
 #endif /* _UAPI_LINUX_MOUNT_H */