diff mbox

[06/14] VFS: Implement fsmount() to effect a pre-configured mount [ver #6]

Message ID 150730499269.6182.8121149524716523148.stgit@warthog.procyon.org.uk (mailing list archive)
State New, archived
Headers show

Commit Message

David Howells Oct. 6, 2017, 3:49 p.m. UTC
Provide a system call by which a filesystem opened with fsopen() and
configured by a series of writes can be mounted:

	int ret = fsmount(int fsfd, int dfd, const char *path,
			  unsigned int at_flags, unsigned int flags);

where fsfd is the fd returned by fsopen(), dfd, path and at_flags locate
the mountpoint and flags are the applicable MS_* flags.  dfd can be
AT_FDCWD or an fd open to a directory.

In the event that fsmount() fails, it may be possible to get an error
message by calling read().  If no message is available, ENODATA will be
reported.

Signed-off-by: David Howells <dhowells@redhat.com>
---

 arch/x86/entry/syscalls/syscall_32.tbl |    1 
 arch/x86/entry/syscalls/syscall_64.tbl |    1 
 fs/namespace.c                         |   82 ++++++++++++++++++++++++++++++++
 include/linux/syscalls.h               |    2 +
 kernel/sys_ni.c                        |    1 
 5 files changed, 87 insertions(+)

Comments

Miklos Szeredi Oct. 10, 2017, 8 a.m. UTC | #1
On Fri, Oct 6, 2017 at 5:49 PM, David Howells <dhowells@redhat.com> wrote:
> Provide a system call by which a filesystem opened with fsopen() and
> configured by a series of writes can be mounted:
>
>         int ret = fsmount(int fsfd, int dfd, const char *path,
>                           unsigned int at_flags, unsigned int flags);
>
> where fsfd is the fd returned by fsopen(), dfd, path and at_flags locate
> the mountpoint and flags are the applicable MS_* flags.  dfd can be
> AT_FDCWD or an fd open to a directory.
>
> In the event that fsmount() fails, it may be possible to get an error
> message by calling read().  If no message is available, ENODATA will be
> reported.
>
> Signed-off-by: David Howells <dhowells@redhat.com>
> ---
>
>  arch/x86/entry/syscalls/syscall_32.tbl |    1
>  arch/x86/entry/syscalls/syscall_64.tbl |    1
>  fs/namespace.c                         |   82 ++++++++++++++++++++++++++++++++
>  include/linux/syscalls.h               |    2 +
>  kernel/sys_ni.c                        |    1
>  5 files changed, 87 insertions(+)
>
> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
> index 9bf8d4c62f85..abe6ea95e0e6 100644
> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> @@ -392,3 +392,4 @@
>  383    i386    statx                   sys_statx
>  384    i386    arch_prctl              sys_arch_prctl                  compat_sys_arch_prctl
>  385    i386    fsopen                  sys_fsopen
> +386    i386    fsmount                 sys_fsmount
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index 9b198c5fc412..0977c5079831 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -340,6 +340,7 @@
>  331    common  pkey_free               sys_pkey_free
>  332    common  statx                   sys_statx
>  333    common  fsopen                  sys_fsopen
> +334    common  fsmount                 sys_fsmount
>
>  #
>  # x32-specific system call numbers start at 512 to avoid cache impact
> diff --git a/fs/namespace.c b/fs/namespace.c
> index d6b0b0067f6d..8676658b6b2c 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -3188,6 +3188,88 @@ struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
>  EXPORT_SYMBOL_GPL(kern_mount_data);
>
>  /*
> + * Mount a new, prepared superblock (specified by fs_fd) on the location
> + * specified by dfd and dir_name.  dfd can be AT_FDCWD, a dir fd or a container
> + * fd.  This cannot be used for binding, moving or remounting mounts.
> + */
> +SYSCALL_DEFINE5(fsmount, int, fs_fd, int, dfd, const char __user *, dir_name,
> +               unsigned int, at_flags, unsigned int, flags)
> +{
> +       struct fs_context *fc;
> +       struct path mountpoint;
> +       struct fd f;
> +       unsigned int lookup_flags, mnt_flags = 0;
> +       long ret;
> +
> +       if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
> +                         AT_EMPTY_PATH)) != 0)
> +               return -EINVAL;
> +
> +       if (flags & ~(MS_RDONLY | MS_NOSUID | MS_NODEV | MS_NOEXEC |
> +                     MS_NOATIME | MS_NODIRATIME | MS_RELATIME | MS_STRICTATIME))
> +               return -EINVAL;

How about propagation flags?  Those are also mount specific.




> +
> +       if (flags & MS_RDONLY)
> +               mnt_flags |= MNT_READONLY;
> +       if (flags & MS_NOSUID)
> +               mnt_flags |= MNT_NOSUID;
> +       if (flags & MS_NODEV)
> +               mnt_flags |= MNT_NODEV;
> +       if (flags & MS_NOEXEC)
> +               mnt_flags |= MNT_NOEXEC;
> +       if (flags & MS_NODIRATIME)
> +               mnt_flags |= MNT_NODIRATIME;
> +
> +       if (flags & MS_STRICTATIME) {
> +               if (flags & MS_NOATIME)
> +                       return -EINVAL;
> +       } else if (flags & MS_NOATIME) {
> +               mnt_flags |= MNT_NOATIME;
> +       } else {
> +               mnt_flags |= MNT_RELATIME;
> +       }

I'm not sure reusing the MS_FLAGS is the right choice.  Why not export
MNT_* to userspace?  That would get us a clean namespace without
confusion with sb flags and no need to convert back and forth.

> +
> +       f = fdget(fs_fd);
> +       if (!f.file)
> +               return -EBADF;
> +
> +       ret = -EINVAL;
> +       if (f.file->f_op != &fs_fs_fops)
> +               goto err_fsfd;
> +
> +       fc = f.file->private_data;
> +
> +       ret = -EPERM;
> +       if (!may_mount() ||
> +           ((fc->sb_flags & MS_MANDLOCK) && !may_mandlock()))
> +               goto err_fsfd;
> +
> +       /* There must be a valid superblock or we can't mount it */
> +       ret = -EINVAL;
> +       if (!fc->root)
> +               goto err_fsfd;
> +
> +       /* Find the mountpoint.  A container can be specified in dfd. */
> +       lookup_flags = LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT;
> +       if (at_flags & AT_SYMLINK_NOFOLLOW)
> +               lookup_flags &= ~LOOKUP_FOLLOW;
> +       if (at_flags & AT_NO_AUTOMOUNT)
> +               lookup_flags &= ~LOOKUP_AUTOMOUNT;
> +       if (at_flags & AT_EMPTY_PATH)
> +               lookup_flags |= LOOKUP_EMPTY;
> +       ret = user_path_at(dfd, dir_name, lookup_flags, &mountpoint);
> +       if (ret < 0)
> +               goto err_fsfd;
> +
> +       ret = do_new_mount_fc(fc, &mountpoint, mnt_flags);
> +
> +       path_put(&mountpoint);
> +err_fsfd:
> +       fdput(f);
> +       return ret;
> +}
> +
> +/*
>   * Return true if path is reachable from root
>   *
>   * namespace_sem or mount_lock is held
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 7cd1b65a4152..e82dde171ce8 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -942,5 +942,7 @@ asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
>                           unsigned mask, struct statx __user *buffer);
>  asmlinkage long sys_fsopen(const char *fs_name, unsigned int flags,
>                            void *reserved3, void *reserved4, void *reserved5);
> +asmlinkage long sys_fsmount(int fsfd, int dfd, const char *path, unsigned int at_flags,
> +                           unsigned int flags);
>
>  #endif
> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> index de1dc63e7e47..a0fe764bd5dd 100644
> --- a/kernel/sys_ni.c
> +++ b/kernel/sys_ni.c
> @@ -261,3 +261,4 @@ cond_syscall(sys_pkey_free);
>
>  /* fd-based mount */
>  cond_syscall(sys_fsopen);
> +cond_syscall(sys_fsmount);
>
Karel Zak Oct. 10, 2017, 9:51 a.m. UTC | #2
On Tue, Oct 10, 2017 at 10:00:01AM +0200, Miklos Szeredi wrote:
> > +
> > +       if (flags & MS_RDONLY)
> > +               mnt_flags |= MNT_READONLY;
> > +       if (flags & MS_NOSUID)
> > +               mnt_flags |= MNT_NOSUID;
> > +       if (flags & MS_NODEV)
> > +               mnt_flags |= MNT_NODEV;
> > +       if (flags & MS_NOEXEC)
> > +               mnt_flags |= MNT_NOEXEC;
> > +       if (flags & MS_NODIRATIME)
> > +               mnt_flags |= MNT_NODIRATIME;
> > +
> > +       if (flags & MS_STRICTATIME) {
> > +               if (flags & MS_NOATIME)
> > +                       return -EINVAL;
> > +       } else if (flags & MS_NOATIME) {
> > +               mnt_flags |= MNT_NOATIME;
> > +       } else {
> > +               mnt_flags |= MNT_RELATIME;
> > +       }
> 
> I'm not sure reusing the MS_FLAGS is the right choice.  Why not export
> MNT_* to userspace?  That would get us a clean namespace without
> confusion with sb flags and no need to convert back and forth.

Well, if you think about it as about two separated things -- VFS-flags
and FS-flags (and for example /proc/#/mountinfo already uses two
columns for the flags) than the question is why the API uses one
variable?

Would be better to use two variables everywhere? (mostly for the
syscall). 

It would be nice to keep for example propagation flags only in
vfs_flags, or use MS_RDONLY according to context (for FS or for VFS)
without extra MS_BIND, etc.

    Karel
Miklos Szeredi Oct. 10, 2017, 1:38 p.m. UTC | #3
On Tue, Oct 10, 2017 at 11:51 AM, Karel Zak <kzak@redhat.com> wrote:
> On Tue, Oct 10, 2017 at 10:00:01AM +0200, Miklos Szeredi wrote:
>> > +
>> > +       if (flags & MS_RDONLY)
>> > +               mnt_flags |= MNT_READONLY;
>> > +       if (flags & MS_NOSUID)
>> > +               mnt_flags |= MNT_NOSUID;
>> > +       if (flags & MS_NODEV)
>> > +               mnt_flags |= MNT_NODEV;
>> > +       if (flags & MS_NOEXEC)
>> > +               mnt_flags |= MNT_NOEXEC;
>> > +       if (flags & MS_NODIRATIME)
>> > +               mnt_flags |= MNT_NODIRATIME;
>> > +
>> > +       if (flags & MS_STRICTATIME) {
>> > +               if (flags & MS_NOATIME)
>> > +                       return -EINVAL;
>> > +       } else if (flags & MS_NOATIME) {
>> > +               mnt_flags |= MNT_NOATIME;
>> > +       } else {
>> > +               mnt_flags |= MNT_RELATIME;
>> > +       }
>>
>> I'm not sure reusing the MS_FLAGS is the right choice.  Why not export
>> MNT_* to userspace?  That would get us a clean namespace without
>> confusion with sb flags and no need to convert back and forth.
>
> Well, if you think about it as about two separated things -- VFS-flags
> and FS-flags (and for example /proc/#/mountinfo already uses two
> columns for the flags) than the question is why the API uses one
> variable?
>
> Would be better to use two variables everywhere? (mostly for the
> syscall).
>
> It would be nice to keep for example propagation flags only in
> vfs_flags, or use MS_RDONLY according to context (for FS or for VFS)
> without extra MS_BIND, etc.

MS_BIND will be gone in the new API.   The two separate columns in
/proc/#/mountinfo are going to be two separate things on the new
interface (one is writes to the fsfd provided by fsopen(2), the other
in flags for fsmount(2)).  The question is how to call the mount flags
(what you call vfs flags), "MS_RDONLY" or "MNT_RDONLY" on the uAPI.
Either is probably fine, but I feel that "MNT_FOO" is better, because
it's a relatively clean namespace concerned with mount flags and not
polluted with all the scum that mount(2) collected.

BTW, I think <linux-api@vger.kernel.org> should be CC-d on all patches
that concern the userspace API.

Thanks,
Miklos
Karel Zak Oct. 11, 2017, 8:54 a.m. UTC | #4
On Tue, Oct 10, 2017 at 03:38:21PM +0200, Miklos Szeredi wrote:
> On Tue, Oct 10, 2017 at 11:51 AM, Karel Zak <kzak@redhat.com> wrote:
> > On Tue, Oct 10, 2017 at 10:00:01AM +0200, Miklos Szeredi wrote:
> >> > +
> >> > +       if (flags & MS_RDONLY)
> >> > +               mnt_flags |= MNT_READONLY;
> >> > +       if (flags & MS_NOSUID)
> >> > +               mnt_flags |= MNT_NOSUID;
> >> > +       if (flags & MS_NODEV)
> >> > +               mnt_flags |= MNT_NODEV;
> >> > +       if (flags & MS_NOEXEC)
> >> > +               mnt_flags |= MNT_NOEXEC;
> >> > +       if (flags & MS_NODIRATIME)
> >> > +               mnt_flags |= MNT_NODIRATIME;
> >> > +
> >> > +       if (flags & MS_STRICTATIME) {
> >> > +               if (flags & MS_NOATIME)
> >> > +                       return -EINVAL;
> >> > +       } else if (flags & MS_NOATIME) {
> >> > +               mnt_flags |= MNT_NOATIME;
> >> > +       } else {
> >> > +               mnt_flags |= MNT_RELATIME;
> >> > +       }
> >>
> >> I'm not sure reusing the MS_FLAGS is the right choice.  Why not export
> >> MNT_* to userspace?  That would get us a clean namespace without
> >> confusion with sb flags and no need to convert back and forth.
> >
> > Well, if you think about it as about two separated things -- VFS-flags
> > and FS-flags (and for example /proc/#/mountinfo already uses two
> > columns for the flags) than the question is why the API uses one
> > variable?
> >
> > Would be better to use two variables everywhere? (mostly for the
> > syscall).
> >
> > It would be nice to keep for example propagation flags only in
> > vfs_flags, or use MS_RDONLY according to context (for FS or for VFS)
> > without extra MS_BIND, etc.
> 
> MS_BIND will be gone in the new API.   The two separate columns in
> /proc/#/mountinfo are going to be two separate things on the new
> interface (one is writes to the fsfd provided by fsopen(2), the other
> in flags for fsmount(2)).

Ah, nice. 

> The question is how to call the mount flags
> (what you call vfs flags), "MS_RDONLY" or "MNT_RDONLY" on the uAPI.
> Either is probably fine, but I feel that "MNT_FOO" is better, because
> it's a relatively clean namespace concerned with mount flags and not
> polluted with all the scum that mount(2) collected.

Hmm.. for example libmount already uses MNT_ namespace in header
files for all macros. So, I wont be happy with MNT_ in userspace ;-(

I like clone, epoll, etc flags ... there is no any abbreviation and
the prefix follows syscall or API name (CLONE_xxx, EPOLLxxx), what
about MOUNT_FOO ?

    Karel
diff mbox

Patch

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 9bf8d4c62f85..abe6ea95e0e6 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -392,3 +392,4 @@ 
 383	i386	statx			sys_statx
 384	i386	arch_prctl		sys_arch_prctl			compat_sys_arch_prctl
 385	i386	fsopen			sys_fsopen
+386	i386	fsmount			sys_fsmount
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 9b198c5fc412..0977c5079831 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -340,6 +340,7 @@ 
 331	common	pkey_free		sys_pkey_free
 332	common	statx			sys_statx
 333	common	fsopen			sys_fsopen
+334	common	fsmount			sys_fsmount
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/fs/namespace.c b/fs/namespace.c
index d6b0b0067f6d..8676658b6b2c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3188,6 +3188,88 @@  struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
 EXPORT_SYMBOL_GPL(kern_mount_data);
 
 /*
+ * Mount a new, prepared superblock (specified by fs_fd) on the location
+ * specified by dfd and dir_name.  dfd can be AT_FDCWD, a dir fd or a container
+ * fd.  This cannot be used for binding, moving or remounting mounts.
+ */
+SYSCALL_DEFINE5(fsmount, int, fs_fd, int, dfd, const char __user *, dir_name,
+		unsigned int, at_flags, unsigned int, flags)
+{
+	struct fs_context *fc;
+	struct path mountpoint;
+	struct fd f;
+	unsigned int lookup_flags, mnt_flags = 0;
+	long ret;
+
+	if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
+			  AT_EMPTY_PATH)) != 0)
+		return -EINVAL;
+
+	if (flags & ~(MS_RDONLY | MS_NOSUID | MS_NODEV | MS_NOEXEC |
+		      MS_NOATIME | MS_NODIRATIME | MS_RELATIME | MS_STRICTATIME))
+		return -EINVAL;
+
+	if (flags & MS_RDONLY)
+		mnt_flags |= MNT_READONLY;
+	if (flags & MS_NOSUID)
+		mnt_flags |= MNT_NOSUID;
+	if (flags & MS_NODEV)
+		mnt_flags |= MNT_NODEV;
+	if (flags & MS_NOEXEC)
+		mnt_flags |= MNT_NOEXEC;
+	if (flags & MS_NODIRATIME)
+		mnt_flags |= MNT_NODIRATIME;
+
+	if (flags & MS_STRICTATIME) {
+		if (flags & MS_NOATIME)
+			return -EINVAL;
+	} else if (flags & MS_NOATIME) {
+		mnt_flags |= MNT_NOATIME;
+	} else {
+		mnt_flags |= MNT_RELATIME;
+	}
+
+	f = fdget(fs_fd);
+	if (!f.file)
+		return -EBADF;
+
+	ret = -EINVAL;
+	if (f.file->f_op != &fs_fs_fops)
+		goto err_fsfd;
+
+	fc = f.file->private_data;
+
+	ret = -EPERM;
+	if (!may_mount() ||
+	    ((fc->sb_flags & MS_MANDLOCK) && !may_mandlock()))
+		goto err_fsfd;
+
+	/* There must be a valid superblock or we can't mount it */
+	ret = -EINVAL;
+	if (!fc->root)
+		goto err_fsfd;
+
+	/* Find the mountpoint.  A container can be specified in dfd. */
+	lookup_flags = LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT;
+	if (at_flags & AT_SYMLINK_NOFOLLOW)
+		lookup_flags &= ~LOOKUP_FOLLOW;
+	if (at_flags & AT_NO_AUTOMOUNT)
+		lookup_flags &= ~LOOKUP_AUTOMOUNT;
+	if (at_flags & AT_EMPTY_PATH)
+		lookup_flags |= LOOKUP_EMPTY;
+	ret = user_path_at(dfd, dir_name, lookup_flags, &mountpoint);
+	if (ret < 0)
+		goto err_fsfd;
+
+	ret = do_new_mount_fc(fc, &mountpoint, mnt_flags);
+
+	path_put(&mountpoint);
+err_fsfd:
+	fdput(f);
+	return ret;
+}
+
+/*
  * Return true if path is reachable from root
  *
  * namespace_sem or mount_lock is held
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 7cd1b65a4152..e82dde171ce8 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -942,5 +942,7 @@  asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
 			  unsigned mask, struct statx __user *buffer);
 asmlinkage long sys_fsopen(const char *fs_name, unsigned int flags,
 			   void *reserved3, void *reserved4, void *reserved5);
+asmlinkage long sys_fsmount(int fsfd, int dfd, const char *path, unsigned int at_flags,
+			    unsigned int flags);
 
 #endif
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index de1dc63e7e47..a0fe764bd5dd 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -261,3 +261,4 @@  cond_syscall(sys_pkey_free);
 
 /* fd-based mount */
 cond_syscall(sys_fsopen);
+cond_syscall(sys_fsmount);