diff mbox series

nsfs: add pid translation ioctls

Message ID 20240619-work-ns_ioctl-v1-1-7c0097e6bb6b@kernel.org (mailing list archive)
State New
Headers show
Series nsfs: add pid translation ioctls | expand

Commit Message

Christian Brauner June 19, 2024, 1:49 p.m. UTC
Add ioctl()s to translate pids between pid namespaces.

LXCFS is a tiny fuse filesystem used to virtualize various aspects of
procfs. LXCFS is run on the host. The files and directories it creates
can be bind-mounted by e.g. a container at startup and mounted over the
various procfs files the container wishes to have virtualized. When e.g.
a read request for uptime is received, LXCFS will receive the pid of the
reader. In order to virtualize the corresponding read, LXCFS needs to
know the pid of the init process of the reader's pid namespace. In order
to do this, LXCFS first needs to fork() two helper processes. The first
helper process setns() to the reader's pid namespace. The second helper
process is needed to create a process that is a proper member of the pid
namespace. The second helper process then creates a ucred message with
ucred.pid set to 1 and sends it back to LXCFS. The kernel will translate
the ucred.pid field to the corresponding pid number in LXCFS's pid
namespace. This way LXCFS can learn the init pid number of the reader's
pid namespace and can go on to virtualize. Since these two forks() are
costly, LXCFS maintains an init pid cache that caches a given pid for a
fixed amount of time. The cache is pruned during new read requests.
However, even with the cache the hit of the two forks() is significant
when a very large number of containers are running. With this simple
patch we add an ns ioctl that lets a caller retrieve the init pid nr of
a pid namespace through its pid namespace fd. This significantly
improves performance with a very simple change.

Support translation of pids and tgids. Other concepts can be added but
there are no obvious users for this right now.

To protect against races pidfds can be used to check whether the process
is still valid. If needed, this can also be extended to work on pidfds
directly.

Signed-off-by: Christian Brauner <brauner@kernel.org>
---
---
 fs/nsfs.c                 | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/nsfs.h |  8 ++++++++
 2 files changed, 55 insertions(+)


---
base-commit: 1613e604df0cd359cf2a7fbd9be7a0bcfacfabd0
change-id: 20240619-work-ns_ioctl-447979cf0820

Comments

Alexander Mikhalitsyn June 27, 2024, 7:31 p.m. UTC | #1
Am Mi., 19. Juni 2024 um 15:50 Uhr schrieb Christian Brauner
<brauner@kernel.org>:
>
> Add ioctl()s to translate pids between pid namespaces.
>
> LXCFS is a tiny fuse filesystem used to virtualize various aspects of
> procfs. LXCFS is run on the host. The files and directories it creates
> can be bind-mounted by e.g. a container at startup and mounted over the
> various procfs files the container wishes to have virtualized. When e.g.
> a read request for uptime is received, LXCFS will receive the pid of the
> reader. In order to virtualize the corresponding read, LXCFS needs to
> know the pid of the init process of the reader's pid namespace. In order
> to do this, LXCFS first needs to fork() two helper processes. The first
> helper process setns() to the readers pid namespace. The second helper
> process is needed to create a process that is a proper member of the pid
> namespace. The second helper process then creates a ucred message with
> ucred.pid set to 1 and sends it back to LXCFS. The kernel will translate
> the ucred.pid field to the corresponding pid number in LXCFS's pid
> namespace. This way LXCFS can learn the init pid number of the reader's
> pid namespace and can go on to virtualize. Since these two forks() are
> costly LXCFS maintains an init pid cache that caches a given pid for a
> fixed amount of time. The cache is pruned during new read requests.
> However, even with the cache the hit of the two forks() is singificant
> when a very large number of containers are running. With this simple
> patch we add an ns ioctl that let's a caller retrieve the init pid nr of
> a pid namespace through its pid namespace fd. This significantly
> improves performance with a very simple change.
>
> Support translation of pids and tgids. Other concepts can be added but
> there are no obvious users for this right now.
>
> To protect against races pidfds can be used to check whether the process
> is still valid. If needed, this can also be extended to work on pidfds
> directly.
>
> Signed-off-by: Christian Brauner <brauner@kernel.org>

Dear Christian,

This is an amazing idea! Thanks for implementing and posting this!

Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>

> ---
> ---
>  fs/nsfs.c                 | 47 +++++++++++++++++++++++++++++++++++++++++++++++
>  include/uapi/linux/nsfs.h |  8 ++++++++
>  2 files changed, 55 insertions(+)
>
> diff --git a/fs/nsfs.c b/fs/nsfs.c
> index 07e22a15ef02..4a4d7b1eb38c 100644
> --- a/fs/nsfs.c
> +++ b/fs/nsfs.c
> @@ -8,9 +8,11 @@
>  #include <linux/magic.h>
>  #include <linux/ktime.h>
>  #include <linux/seq_file.h>
> +#include <linux/pid_namespace.h>
>  #include <linux/user_namespace.h>
>  #include <linux/nsfs.h>
>  #include <linux/uaccess.h>
> +#include <linux/cleanup.h>
>
>  #include "internal.h"
>
> @@ -123,9 +125,12 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
>                         unsigned long arg)
>  {
>         struct user_namespace *user_ns;
> +       struct pid_namespace *pid_ns;
> +       struct task_struct *tsk;
>         struct ns_common *ns = get_proc_ns(file_inode(filp));
>         uid_t __user *argp;
>         uid_t uid;
> +       pid_t pid_nr;
>
>         switch (ioctl) {
>         case NS_GET_USERNS:
> @@ -143,6 +148,48 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
>                 argp = (uid_t __user *) arg;
>                 uid = from_kuid_munged(current_user_ns(), user_ns->owner);
>                 return put_user(uid, argp);
> +       case NS_GET_PID_FROM_PIDNS:
> +               fallthrough;
> +       case NS_GET_TGID_FROM_PIDNS:
> +               fallthrough;
> +       case NS_GET_PID_IN_PIDNS:
> +               fallthrough;
> +       case NS_GET_TGID_IN_PIDNS:
> +               if (ns->ops->type != CLONE_NEWPID)
> +                       return -EINVAL;
> +
> +               pid_ns = container_of(ns, struct pid_namespace, ns);
> +
> +               guard(rcu)();
> +               if (ioctl == NS_GET_PID_IN_PIDNS ||
> +                   ioctl == NS_GET_TGID_IN_PIDNS)
> +                       tsk = find_task_by_vpid(arg);
> +               else
> +                       tsk = find_task_by_pid_ns(arg, pid_ns);
> +               if (!tsk)
> +                       return -ESRCH;
> +
> +               switch (ioctl) {
> +               case NS_GET_PID_FROM_PIDNS:
> +                       pid_nr = task_pid_vnr(tsk);
> +                       break;
> +               case NS_GET_TGID_FROM_PIDNS:
> +                       pid_nr = task_tgid_vnr(tsk);
> +                       break;
> +               case NS_GET_PID_IN_PIDNS:
> +                       pid_nr = task_pid_nr_ns(tsk, pid_ns);
> +                       break;
> +               case NS_GET_TGID_IN_PIDNS:
> +                       pid_nr = task_tgid_nr_ns(tsk, pid_ns);
> +                       break;
> +               default:
> +                       pid_nr = 0;
> +                       break;
> +               }
> +               if (!pid_nr)
> +                       return -ESRCH;
> +
> +               return pid_nr;
>         default:
>                 return -ENOTTY;
>         }
> diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h
> index a0c8552b64ee..faeb9195da08 100644
> --- a/include/uapi/linux/nsfs.h
> +++ b/include/uapi/linux/nsfs.h
> @@ -15,5 +15,13 @@
>  #define NS_GET_NSTYPE          _IO(NSIO, 0x3)
>  /* Get owner UID (in the caller's user namespace) for a user namespace */
>  #define NS_GET_OWNER_UID       _IO(NSIO, 0x4)
> +/* Translate pid from target pid namespace into the caller's pid namespace. */
> +#define NS_GET_PID_FROM_PIDNS  _IOR(NSIO, 0x5, int)
> +/* Return thread-group leader id of pid in the callers pid namespace. */
> +#define NS_GET_TGID_FROM_PIDNS _IOR(NSIO, 0x7, int)
> +/* Translate pid from caller's pid namespace into a target pid namespace. */
> +#define NS_GET_PID_IN_PIDNS    _IOR(NSIO, 0x6, int)
> +/* Return thread-group leader id of pid in the target pid namespace. */
> +#define NS_GET_TGID_IN_PIDNS   _IOR(NSIO, 0x8, int)
>
>  #endif /* __LINUX_NSFS_H */
>
> ---
> base-commit: 1613e604df0cd359cf2a7fbd9be7a0bcfacfabd0
> change-id: 20240619-work-ns_ioctl-447979cf0820
>
diff mbox series

Patch

diff --git a/fs/nsfs.c b/fs/nsfs.c
index 07e22a15ef02..4a4d7b1eb38c 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -8,9 +8,11 @@ 
 #include <linux/magic.h>
 #include <linux/ktime.h>
 #include <linux/seq_file.h>
+#include <linux/pid_namespace.h>
 #include <linux/user_namespace.h>
 #include <linux/nsfs.h>
 #include <linux/uaccess.h>
+#include <linux/cleanup.h>
 
 #include "internal.h"
 
@@ -123,9 +125,12 @@  static long ns_ioctl(struct file *filp, unsigned int ioctl,
 			unsigned long arg)
 {
 	struct user_namespace *user_ns;
+	struct pid_namespace *pid_ns;
+	struct task_struct *tsk;
 	struct ns_common *ns = get_proc_ns(file_inode(filp));
 	uid_t __user *argp;
 	uid_t uid;
+	pid_t pid_nr;
 
 	switch (ioctl) {
 	case NS_GET_USERNS:
@@ -143,6 +148,48 @@  static long ns_ioctl(struct file *filp, unsigned int ioctl,
 		argp = (uid_t __user *) arg;
 		uid = from_kuid_munged(current_user_ns(), user_ns->owner);
 		return put_user(uid, argp);
+	case NS_GET_PID_FROM_PIDNS:
+		fallthrough;
+	case NS_GET_TGID_FROM_PIDNS:
+		fallthrough;
+	case NS_GET_PID_IN_PIDNS:
+		fallthrough;
+	case NS_GET_TGID_IN_PIDNS:
+		if (ns->ops->type != CLONE_NEWPID)
+			return -EINVAL;
+
+		pid_ns = container_of(ns, struct pid_namespace, ns);
+
+		guard(rcu)();
+		if (ioctl == NS_GET_PID_IN_PIDNS ||
+		    ioctl == NS_GET_TGID_IN_PIDNS)
+			tsk = find_task_by_vpid(arg);
+		else
+			tsk = find_task_by_pid_ns(arg, pid_ns);
+		if (!tsk)
+			return -ESRCH;
+
+		switch (ioctl) {
+		case NS_GET_PID_FROM_PIDNS:
+			pid_nr = task_pid_vnr(tsk);
+			break;
+		case NS_GET_TGID_FROM_PIDNS:
+			pid_nr = task_tgid_vnr(tsk);
+			break;
+		case NS_GET_PID_IN_PIDNS:
+			pid_nr = task_pid_nr_ns(tsk, pid_ns);
+			break;
+		case NS_GET_TGID_IN_PIDNS:
+			pid_nr = task_tgid_nr_ns(tsk, pid_ns);
+			break;
+		default:
+			pid_nr = 0;
+			break;
+		}
+		if (!pid_nr)
+			return -ESRCH;
+
+		return pid_nr;
 	default:
 		return -ENOTTY;
 	}
diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h
index a0c8552b64ee..faeb9195da08 100644
--- a/include/uapi/linux/nsfs.h
+++ b/include/uapi/linux/nsfs.h
@@ -15,5 +15,13 @@ 
 #define NS_GET_NSTYPE		_IO(NSIO, 0x3)
 /* Get owner UID (in the caller's user namespace) for a user namespace */
 #define NS_GET_OWNER_UID	_IO(NSIO, 0x4)
+/* Translate pid from target pid namespace into the caller's pid namespace. */
+#define NS_GET_PID_FROM_PIDNS	_IOR(NSIO, 0x5, int)
+/* Return thread-group leader id of pid in the callers pid namespace. */
+#define NS_GET_TGID_FROM_PIDNS	_IOR(NSIO, 0x7, int)
+/* Translate pid from caller's pid namespace into a target pid namespace. */
+#define NS_GET_PID_IN_PIDNS	_IOR(NSIO, 0x6, int)
+/* Return thread-group leader id of pid in the target pid namespace. */
+#define NS_GET_TGID_IN_PIDNS	_IOR(NSIO, 0x8, int)
 
 #endif /* __LINUX_NSFS_H */