diff mbox series

[RFC,07/27] containers: Make fsopen() able to create a superblock in a container

Message ID 155024689635.21651.15943029551519736259.stgit@warthog.procyon.org.uk (mailing list archive)
State New, archived
Headers show
Series Containers and using authenticated filesystems | expand

Commit Message

David Howells Feb. 15, 2019, 4:08 p.m. UTC
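Make it possible for fsopen() to create a superblock in a specified
container, using the namespaces associated with that container to cover UID
translation, networking and filesystem content.  This involves adding a new
fsconfig command, FSCONFIG_SET_CONTAINER, to specify the container.  The
command takes the container fd in the aux argument and must be issued before
any other parameter is set on the context.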

For example:

	cfd = container_create("fred", CONTAINER_NEW_FS_NS);

	fsfd = fsopen("ext4", 0);
	fsconfig(fsfd, FSCONFIG_SET_CONTAINER, NULL, NULL, cfd);
	fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "/dev/sda3", 0);
	fsconfig(fsfd, FSCONFIG_SET_FLAG, "user_xattr", NULL, 0);
	fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
	mfd = fsmount(fsfd, 0, MOUNT_ATTR_RDONLY);
	move_mount(mfd, "", cfd, "/",
		   MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_CONTAINER_ROOT);

Signed-off-by: David Howells <dhowells@redhat.com>
---

 fs/fs_context.c            |   19 +++++++++++++++
 fs/fsopen.c                |   54 +++++++++++++++++++++++++++++++++++++-------
 fs/namespace.c             |   19 +++++++++++----
 fs/proc/root.c             |   11 +++++++--
 include/linux/container.h  |    1 +
 include/linux/fs_context.h |    3 ++
 include/linux/pid.h        |    5 +++-
 include/linux/proc_ns.h    |    6 +++--
 include/uapi/linux/mount.h |    1 +
 kernel/container.c         |    4 +++
 kernel/fork.c              |    2 +-
 kernel/pid.c               |    4 ++-
 12 files changed, 108 insertions(+), 21 deletions(-)
diff mbox series

Patch

diff --git a/fs/fs_context.c b/fs/fs_context.c
index a47ccd5a4a78..fc76ac02d618 100644
--- a/fs/fs_context.c
+++ b/fs/fs_context.c
@@ -20,6 +20,7 @@ 
 #include <linux/slab.h>
 #include <linux/magic.h>
 #include <linux/security.h>
+#include <linux/container.h>
 #include <linux/mnt_namespace.h>
 #include <linux/pid_namespace.h>
 #include <linux/user_namespace.h>
@@ -169,6 +170,21 @@  int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param)
 }
 EXPORT_SYMBOL(vfs_parse_fs_param);
 
+/*
+ * Specify a container in which a superblock will exist.
+ */
+void vfs_set_container(struct fs_context *fc, struct container *container)
+{
+	if (container) {
+		put_user_ns(fc->user_ns);
+		put_net(fc->net_ns);
+
+		fc->container = get_container(container);
+		fc->user_ns = get_user_ns(container->cred->user_ns);
+		fc->net_ns = get_net(container->ns->net_ns);
+	}
+}
+
 /**
  * vfs_parse_fs_string - Convenience function to just parse a string.
  */
@@ -364,6 +380,8 @@  struct fs_context *vfs_dup_fs_context(struct fs_context *src_fc)
 	fc->source	= NULL;
 	fc->security	= NULL;
 	get_filesystem(fc->fs_type);
+	if (fc->container)
+		get_container(fc->container);
 	get_net(fc->net_ns);
 	get_user_ns(fc->user_ns);
 	get_cred(fc->cred);
@@ -510,6 +528,7 @@  void put_fs_context(struct fs_context *fc)
 	put_net(fc->net_ns);
 	put_user_ns(fc->user_ns);
 	put_cred(fc->cred);
+	put_container(fc->container);
 	kfree(fc->subtype);
 	put_fc_log(fc);
 	put_filesystem(fc->fs_type);
diff --git a/fs/fsopen.c b/fs/fsopen.c
index 3bb9c0c8cbcc..d0fe9e563ebb 100644
--- a/fs/fsopen.c
+++ b/fs/fsopen.c
@@ -17,11 +17,33 @@ 
 #include <linux/security.h>
 #include <linux/anon_inodes.h>
 #include <linux/namei.h>
+#include <linux/container.h>
 #include <linux/file.h>
 #include <uapi/linux/mount.h>
 #include "internal.h"
 #include "mount.h"
 
+/*
+ * Configure the destination container on a filesystem context.  This must be
+ * done before any other parameters are offered.  The container is specified
+ * by passing a file descriptor referring to it as the auxiliary parameter.
+ *
+ * For example:
+ *
+ *	fsconfig(fsfd, FSCONFIG_SET_CONTAINER, NULL, NULL, container_fd);
+ */
+static int fsconfig_set_container(struct fs_context *fc, struct fs_parameter *param)
+{
+	struct container *c;
+
+	if (!is_container_file(param->file))
+		return -EINVAL;
+
+	c = param->file->private_data;
+	vfs_set_container(fc, c);
+	return 0;
+}
+
 /*
  * Allow the user to read back any error, warning or informational messages.
  */
@@ -111,10 +133,6 @@  static int fscontext_alloc_log(struct fs_context *fc)
 
 /*
  * Open a filesystem by name so that it can be configured for mounting.
- *
- * We are allowed to specify a container in which the filesystem will be
- * opened, thereby indicating which namespaces will be used (notably, which
- * network namespace will be used for network filesystems).
  */
 SYSCALL_DEFINE2(fsopen, const char __user *, _fs_name, unsigned int, flags)
 {
@@ -143,7 +161,7 @@  SYSCALL_DEFINE2(fsopen, const char __user *, _fs_name, unsigned int, flags)
 	if (IS_ERR(fc))
 		return PTR_ERR(fc);
 
-	fc->phase = FS_CONTEXT_CREATE_PARAMS;
+	fc->phase = FS_CONTEXT_CREATE_NS;
 
 	ret = fscontext_alloc_log(fc);
 	if (ret < 0)
@@ -228,7 +246,8 @@  static int vfs_fsconfig_locked(struct fs_context *fc, int cmd,
 		return ret;
 	switch (cmd) {
 	case FSCONFIG_CMD_CREATE:
-		if (fc->phase != FS_CONTEXT_CREATE_PARAMS)
+		if (fc->phase != FS_CONTEXT_CREATE_NS &&
+		    fc->phase != FS_CONTEXT_CREATE_PARAMS)
 			return -EBUSY;
 		fc->phase = FS_CONTEXT_CREATING;
 		ret = vfs_get_tree(fc);
@@ -259,9 +278,17 @@  static int vfs_fsconfig_locked(struct fs_context *fc, int cmd,
 			break;
 		vfs_clean_context(fc);
 		return 0;
+
+	case FSCONFIG_SET_CONTAINER:
+		if (fc->phase != FS_CONTEXT_CREATE_NS)
+			return -EBUSY;
+		return fsconfig_set_container(fc, param);
+
 	default:
-		if (fc->phase != FS_CONTEXT_CREATE_PARAMS &&
-		    fc->phase != FS_CONTEXT_RECONF_PARAMS)
+		if (fc->phase == FS_CONTEXT_CREATE_NS)
+			fc->phase = FS_CONTEXT_CREATE_PARAMS;
+		else if (fc->phase != FS_CONTEXT_CREATE_PARAMS &&
+			 fc->phase != FS_CONTEXT_RECONF_PARAMS)
 			return -EBUSY;
 
 		return vfs_parse_fs_param(fc, param);
@@ -353,6 +380,10 @@  SYSCALL_DEFINE5(fsconfig,
 		if (!_key || _value || aux < 0)
 			return -EINVAL;
 		break;
+	case FSCONFIG_SET_CONTAINER:
+		if (_key || _value || aux < 0)
+			return -EINVAL;
+		break;
 	case FSCONFIG_CMD_CREATE:
 	case FSCONFIG_CMD_RECONFIGURE:
 		if (_key || _value || aux)
@@ -438,6 +469,12 @@  SYSCALL_DEFINE5(fsconfig,
 		if (!param.file)
 			goto out_key;
 		break;
+	case FSCONFIG_SET_CONTAINER:
+		ret = -EBADF;
+		param.file = fget(aux);
+		if (!param.file)
+			goto out_key;
+		break;
 	default:
 		break;
 	}
@@ -463,6 +500,7 @@  SYSCALL_DEFINE5(fsconfig,
 			putname(param.name);
 		break;
 	case FSCONFIG_SET_FD:
+	case FSCONFIG_SET_CONTAINER:
 		if (param.file)
 			fput(param.file);
 		break;
diff --git a/fs/namespace.c b/fs/namespace.c
index ea005f55ec4c..cc5d56f7ae29 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -781,9 +781,16 @@  static void put_mountpoint(struct mountpoint *mp)
 	}
 }
 
+static inline int __check_mnt(struct mount *mnt, struct mnt_namespace *mnt_ns)
+{
+	if (!mnt_ns)
+		mnt_ns = current->nsproxy->mnt_ns;
+	return mnt->mnt_ns == mnt_ns;
+}
+
 static inline int check_mnt(struct mount *mnt)
 {
-	return mnt->mnt_ns == current->nsproxy->mnt_ns;
+	return __check_mnt(mnt, NULL);
 }
 
 /*
@@ -2696,7 +2703,8 @@  static int do_move_mount_old(struct path *path, const char *old_name)
 /*
  * add a mount into a namespace's mount tree
  */
-static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
+static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags,
+			struct mnt_namespace *mnt_ns)
 {
 	struct mountpoint *mp;
 	struct mount *parent;
@@ -2710,7 +2718,7 @@  static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
 
 	parent = real_mount(path->mnt);
 	err = -EINVAL;
-	if (unlikely(!check_mnt(parent))) {
+	if (unlikely(!__check_mnt(parent, mnt_ns))) {
 		/* that's acceptable only for automounts done in private ns */
 		if (!(mnt_flags & MNT_SHRINKABLE))
 			goto unlock;
@@ -2765,7 +2773,8 @@  static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
 	if (IS_ERR(mnt))
 		return PTR_ERR(mnt);
 
-	error = do_add_mount(real_mount(mnt), mountpoint, mnt_flags);
+	error = do_add_mount(real_mount(mnt), mountpoint, mnt_flags,
+			     fc->container ? fc->container->ns->mnt_ns : NULL);
 	if (error < 0)
 		mntput(mnt);
 	return error;
@@ -2839,7 +2848,7 @@  int finish_automount(struct vfsmount *m, struct path *path)
 		goto fail;
 	}
 
-	err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
+	err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE, NULL);
 	if (!err)
 		return 0;
 fail:
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 6927b29ece76..aa802006d855 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -18,6 +18,7 @@ 
 #include <linux/sched/stat.h>
 #include <linux/module.h>
 #include <linux/bitops.h>
+#include <linux/container.h>
 #include <linux/user_namespace.h>
 #include <linux/fs_context.h>
 #include <linux/mount.h>
@@ -186,8 +187,12 @@  static int proc_init_fs_context(struct fs_context *fc)
 	ctx = kzalloc(sizeof(struct proc_fs_context), GFP_KERNEL);
 	if (!ctx)
 		return -ENOMEM;
+	
+	if (fc->container)
+		ctx->pid_ns = get_pid_ns(fc->container->pid_ns);
+	else
+		ctx->pid_ns = get_pid_ns(task_active_pid_ns(current));
 
-	ctx->pid_ns = get_pid_ns(task_active_pid_ns(current));
 	fc->fs_private = ctx;
 	fc->ops = &proc_fs_context_ops;
 	return 0;
@@ -300,7 +305,7 @@  struct proc_dir_entry proc_root = {
 	.name		= "/proc",
 };
 
-int pid_ns_prepare_proc(struct pid_namespace *ns)
+int pid_ns_prepare_proc(struct pid_namespace *ns, struct container *container)
 {
 	struct proc_fs_context *ctx;
 	struct fs_context *fc;
@@ -315,6 +320,8 @@  int pid_ns_prepare_proc(struct pid_namespace *ns)
 		fc->user_ns = get_user_ns(ns->user_ns);
 	}
 
+	vfs_set_container(fc, container);
+	
 	ctx = fc->fs_private;
 	if (ctx->pid_ns != ns) {
 		put_pid_ns(ctx->pid_ns);
diff --git a/include/linux/container.h b/include/linux/container.h
index 0a8918435097..087aa1885ef7 100644
--- a/include/linux/container.h
+++ b/include/linux/container.h
@@ -37,6 +37,7 @@  struct container {
 	struct path		root;		/* The root of the container's fs namespace */
 	struct task_struct	*init;		/* The 'init' task for this container */
 	struct container	*parent;	/* Parent of this container. */
+	struct pid_namespace	*pid_ns;	/* The process ID namespace for this container */
 	void			*security;	/* LSM data */
 	struct list_head	members;	/* Member processes, guarded with ->lock */
 	struct list_head	child_link;	/* Link in parent->children */
diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h
index dc8c9fcba341..45486080eb84 100644
--- a/include/linux/fs_context.h
+++ b/include/linux/fs_context.h
@@ -40,6 +40,7 @@  enum fs_context_purpose {
  * Userspace usage phase for fsopen/fspick.
  */
 enum fs_context_phase {
+	FS_CONTEXT_CREATE_NS,		/* Set namespaces for sb creation */
 	FS_CONTEXT_CREATE_PARAMS,	/* Loading params for sb creation */
 	FS_CONTEXT_CREATING,		/* A superblock is being created */
 	FS_CONTEXT_AWAITING_MOUNT,	/* Superblock created, awaiting fsmount() */
@@ -93,6 +94,7 @@  struct fs_context {
 	struct file_system_type	*fs_type;
 	void			*fs_private;	/* The filesystem's context */
 	struct dentry		*root;		/* The root and superblock */
+	struct container	*container;	/* The container in which the mount will exist */
 	struct user_namespace	*user_ns;	/* The user namespace for this mount */
 	struct net		*net_ns;	/* The network namespace for this mount */
 	const struct cred	*cred;		/* The mounter's credentials */
@@ -136,6 +138,7 @@  extern int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param)
 extern int vfs_parse_fs_string(struct fs_context *fc, const char *key,
 			       const char *value, size_t v_size);
 extern int generic_parse_monolithic(struct fs_context *fc, void *data);
+extern void vfs_set_container(struct fs_context *fc, struct container *container);
 extern int vfs_get_tree(struct fs_context *fc);
 extern void put_fs_context(struct fs_context *fc);
 
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 14a9a39da9c7..16dc152ceef1 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -73,6 +73,8 @@  static inline struct pid *get_pid(struct pid *pid)
 	return pid;
 }
 
+struct container;
+
 extern void put_pid(struct pid *pid);
 extern struct task_struct *pid_task(struct pid *pid, enum pid_type);
 extern struct task_struct *get_pid_task(struct pid *pid, enum pid_type);
@@ -111,7 +113,8 @@  extern struct pid *find_get_pid(int nr);
 extern struct pid *find_ge_pid(int nr, struct pid_namespace *);
 int next_pidmap(struct pid_namespace *pid_ns, unsigned int last);
 
-extern struct pid *alloc_pid(struct pid_namespace *ns);
+extern struct pid *alloc_pid(struct pid_namespace *ns,
+			     struct container *container);
 extern void free_pid(struct pid *pid);
 extern void disable_pid_allocation(struct pid_namespace *ns);
 
diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index d31cb6215905..dee0881eca5c 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -47,14 +47,16 @@  enum {
 
 #ifdef CONFIG_PROC_FS
 
-extern int pid_ns_prepare_proc(struct pid_namespace *ns);
+extern int pid_ns_prepare_proc(struct pid_namespace *ns,
+			       struct container *container);
 extern void pid_ns_release_proc(struct pid_namespace *ns);
 extern int proc_alloc_inum(unsigned int *pino);
 extern void proc_free_inum(unsigned int inum);
 
 #else /* CONFIG_PROC_FS */
 
-static inline int pid_ns_prepare_proc(struct pid_namespace *ns) { return 0; }
+static inline int pid_ns_prepare_proc(struct pid_namespace *ns, struct container *container)
+{ return 0; }
 static inline void pid_ns_release_proc(struct pid_namespace *ns) {}
 
 static inline int proc_alloc_inum(unsigned int *inum)
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index 96a0240f23fe..f60bbe6f4099 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -97,6 +97,7 @@  enum fsconfig_command {
 	FSCONFIG_SET_FD		= 5,	/* Set parameter, supplying an object by fd */
 	FSCONFIG_CMD_CREATE	= 6,	/* Invoke superblock creation */
 	FSCONFIG_CMD_RECONFIGURE = 7,	/* Invoke superblock reconfiguration */
+	FSCONFIG_SET_CONTAINER	= 8,	/* Set a container, supplied by fd */
 };
 
 /*
diff --git a/kernel/container.c b/kernel/container.c
index 1d2cb1c1e9b1..fd3b2a6849a1 100644
--- a/kernel/container.c
+++ b/kernel/container.c
@@ -30,6 +30,7 @@  struct container init_container = {
 	.cred		= &init_cred,
 	.ns		= &init_nsproxy,
 	.init		= &init_task,
+	.pid_ns		= &init_pid_ns,
 	.members.next	= &init_task.container_link,
 	.members.prev	= &init_task.container_link,
 	.children	= LIST_HEAD_INIT(init_container.children),
@@ -51,6 +52,8 @@  void put_container(struct container *c)
 
 	while (c && refcount_dec_and_test(&c->usage)) {
 		BUG_ON(!list_empty(&c->members));
+		if (c->pid_ns)
+			put_pid_ns(c->pid_ns);
 		if (c->ns)
 			put_nsproxy(c->ns);
 		path_put(&c->root);
@@ -391,6 +394,7 @@  static struct container *create_container(const char __user *name, unsigned int
 	}
 
 	c->ns = ns;
+	c->pid_ns = get_pid_ns(c->ns->pid_ns_for_children);
 	c->root = fs->root;
 	c->seq = fs->seq;
 	fs->root.mnt = NULL;
diff --git a/kernel/fork.c b/kernel/fork.c
index 71401deb4434..09de5f35d312 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1958,7 +1958,7 @@  static __latent_entropy struct task_struct *copy_process(
 	stackleak_task_init(p);
 
 	if (pid != &init_struct_pid) {
-		pid = alloc_pid(p->nsproxy->pid_ns_for_children);
+		pid = alloc_pid(p->nsproxy->pid_ns_for_children, dest_container);
 		if (IS_ERR(pid)) {
 			retval = PTR_ERR(pid);
 			goto bad_fork_cleanup_thread;
diff --git a/kernel/pid.c b/kernel/pid.c
index 20881598bdfa..6528a75e6c0d 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -156,7 +156,7 @@  void free_pid(struct pid *pid)
 	call_rcu(&pid->rcu, delayed_put_pid);
 }
 
-struct pid *alloc_pid(struct pid_namespace *ns)
+struct pid *alloc_pid(struct pid_namespace *ns, struct container *container)
 {
 	struct pid *pid;
 	enum pid_type type;
@@ -205,7 +205,7 @@  struct pid *alloc_pid(struct pid_namespace *ns)
 	}
 
 	if (unlikely(is_child_reaper(pid))) {
-		if (pid_ns_prepare_proc(ns))
+		if (pid_ns_prepare_proc(ns, container))
 			goto out_free;
 	}