capabilities: Introduce CAP_RESTORE
diff mbox series

Message ID 20200522055350.806609-1-areber@redhat.com
State New
Headers show
Series
  • capabilities: Introduce CAP_RESTORE
Related show

Commit Message

Adrian Reber May 22, 2020, 5:53 a.m. UTC
This enables CRIU to checkpoint and restore a process as non-root.

Over the last years CRIU upstream has been asked a couple of time if it
is possible to checkpoint and restore a process as non-root. The answer
usually was: 'almost'.

The main blocker to restore a process was that selecting the PID of the
restored process, which is necessary for CRIU, is guarded by CAP_SYS_ADMIN.

In the last two years the questions about checkpoint/restore as non-root
have increased and especially in the last few months we have seen
multiple people inventing workarounds.

The use-cases so far and their workarounds:

 * Checkpoint/Restore in an HPC environment in combination with
   a resource manager distributing jobs. Users are always running
   as non root, but there was the desire to provide a way to
   checkpoint and restore long running jobs.
   Workaround: setuid wrapper to start CRIU as root as non-root
   https://github.com/FredHutch/slurm-examples/blob/master/checkpointer/lib/checkpointer/checkpointer-suid.c
 * Another use case to checkpoint/restore processes as non-root
   uses as workaround a non privileged process which cycles through
   PIDs by calling fork() as fast as possible with a rate of
   100,000 pids/s instead of writing to ns_last_pid
   https://github.com/twosigma/set_ns_last_pid
 * Fast Java startup using checkpoint/restore.
   We have been in contact with JVM developers who are integrating
   CRIU into a JVM to decrease the startup time.
   Workaround so far: patch out CAP_SYS_ADMIN checks in the kernel
 * Container migration as non root. There are people already
   using CRIU to migrate containers as non-root. The solution
   there is to run it in a user namespace. So if you are able
   to carefully setup your environment with the namespaces
   it is already possible to restore a container/process as non-root.
   Unfortunately it is not always possible to setup an environment
   in such a way and for easier access to non-root based container
   migration this patch is also required.

There are probably a few more things guarded by CAP_SYS_ADMIN required
to run checkpoint/restore as non-root, but by applying this patch I can
already checkpoint and restore processes as non-root. As there are
already multiple workarounds I would prefer to do it correctly in the
kernel to avoid that CRIU users are starting to invent more workarounds.

I have used the following tests to verify that this change works as
expected by setting the new capability CAP_RESTORE on the two resulting
test binaries:

$ cat ns_last_pid.c
 // http://efiop-notes.blogspot.com/2014/06/how-to-set-pid-using-nslastpid.html
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/file.h>
 #include <sys/types.h>
 #include <unistd.h>

int main(int argc, char *argv[])
{
	pid_t pid, new_pid;
	char buf[32];
	int fd;

	if (argc != 2)
		return 1;

	printf("Opening ns_last_pid...\n");
	fd = open("/proc/sys/kernel/ns_last_pid", O_RDWR | O_CREAT, 0644);
	if (fd < 0) {
		perror("Cannot open ns_last_pid");
		return 1;
	}

	printf("Locking ns_last_pid...\n");
	if (flock(fd, LOCK_EX)) {
		close(fd);
		printf("Cannot lock ns_last_pid\n");
		return 1;
	}

	pid = atoi(argv[1]);
	snprintf(buf, sizeof(buf), "%d", pid - 1);
	printf("Writing pid-1 to ns_last_pid...\n");
	if (write(fd, buf, strlen(buf)) != strlen(buf)) {
		printf("Cannot write to buf\n");
		return 1;
	}

	printf("Forking...\n");
	new_pid = fork();
	if (new_pid == 0) {
		printf("I am the child!\n");
		exit(0);
	} else if (new_pid == pid)
		printf("I am the parent. My child got the pid %d!\n", new_pid);
	else
		printf("pid (%d) does not match expected pid (%d)\n", new_pid, pid);

	printf("Cleaning up...\n");
	if (flock(fd, LOCK_UN))
		printf("Cannot unlock\n");
	close(fd);
	return 0;
}
$ id -u; /home/libcap/ns_last_pid 300000
1001
Opening ns_last_pid...
Locking ns_last_pid...
Writing pid-1 to ns_last_pid...
Forking...
I am the parent. My child got the pid 300000!
I am the child!
Cleaning up...

For the clone3() based approach:
$ cat clone3_set_tid.c
 #define _GNU_SOURCE
 #include <linux/sched.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <sys/syscall.h>
 #include <unistd.h>

 #define ptr_to_u64(ptr) ((__u64)((uintptr_t)(ptr)))

int main(int argc, char *argv[])
{
	struct clone_args c_args = { };
	pid_t pid, new_pid;

	if (argc != 2)
		return 1;

	pid = atoi(argv[1]);
	c_args.set_tid = ptr_to_u64(&pid);
	c_args.set_tid_size = 1;

	printf("Forking...\n");
	new_pid = syscall(__NR_clone3, &c_args, sizeof(c_args));
	if (new_pid == 0) {
		printf("I am the child!\n");
		exit(0);
	} else if (new_pid == pid)
		printf("I am the parent. My child got the pid %d!\n", new_pid);
	else
		printf("pid (%d) does not match expected pid (%d)\n", new_pid, pid);
	printf("Done\n");

	return 0;
}
$ id -u; /home/libcap/clone3_set_tid 300000
1001
Forking...
I am the parent. My child got the pid 300000!
Done
I am the child!

Signed-off-by: Adrian Reber <areber@redhat.com>
---
 include/linux/capability.h          | 5 +++++
 include/uapi/linux/capability.h     | 9 ++++++++-
 kernel/pid.c                        | 2 +-
 kernel/pid_namespace.c              | 2 +-
 security/selinux/include/classmap.h | 5 +++--
 5 files changed, 18 insertions(+), 5 deletions(-)


base-commit: e8f3274774b45b5f4e9e3d5cad7ff9f43ae3add5

Comments

Christian Brauner May 22, 2020, 7:53 a.m. UTC | #1
On Fri, May 22, 2020 at 07:53:50AM +0200, Adrian Reber wrote:
> This enables CRIU to checkpoint and restore a process as non-root.
> 
> Over the last years CRIU upstream has been asked a couple of time if it
> is possible to checkpoint and restore a process as non-root. The answer
> usually was: 'almost'.
> 
> The main blocker to restore a process was that selecting the PID of the
> restored process, which is necessary for CRIU, is guarded by CAP_SYS_ADMIN.
> 
> In the last two years the questions about checkpoint/restore as non-root
> have increased and especially in the last few months we have seen
> multiple people inventing workarounds.
> 
> The use-cases so far and their workarounds:
> 
>  * Checkpoint/Restore in an HPC environment in combination with
>    a resource manager distributing jobs. Users are always running
>    as non root, but there was the desire to provide a way to
>    checkpoint and restore long running jobs.
>    Workaround: setuid wrapper to start CRIU as root as non-root
>    https://github.com/FredHutch/slurm-examples/blob/master/checkpointer/lib/checkpointer/checkpointer-suid.c
>  * Another use case to checkpoint/restore processes as non-root
>    uses as workaround a non privileged process which cycles through
>    PIDs by calling fork() as fast as possible with a rate of
>    100,000 pids/s instead of writing to ns_last_pid
>    https://github.com/twosigma/set_ns_last_pid
>  * Fast Java startup using checkpoint/restore.
>    We have been in contact with JVM developers who are integrating
>    CRIU into a JVM to decrease the startup time.
>    Workaround so far: patch out CAP_SYS_ADMIN checks in the kernel
>  * Container migration as non root. There are people already
>    using CRIU to migrate containers as non-root. The solution
>    there is to run it in a user namespace. So if you are able
>    to carefully setup your environment with the namespaces
>    it is already possible to restore a container/process as non-root.
>    Unfortunately it is not always possible to setup an environment
>    in such a way and for easier access to non-root based container
>    migration this patch is also required.
> 
> There are probably a few more things guarded by CAP_SYS_ADMIN required
> to run checkpoint/restore as non-root, but by applying this patch I can
> already checkpoint and restore processes as non-root. As there are
> already multiple workarounds I would prefer to do it correctly in the
> kernel to avoid that CRIU users are starting to invent more workarounds.

It sounds ok to me as long as this feature is guarded by any sensible
capability. I don't want users to be able to randomly choose their pid
without any capability required.

We've heard the plea for unprivileged checkpoint/restore through the
grapevine and a few times about CAP_RESTORE at plumbers but it's one of
those cases where nobody pushed for it so it's urgency was questionable.
This is 5.9 material though and could you please add selftests?

It also seems you have future changes planned that would make certain
things accessible via CAP_RESTORE that are currently guarded by other
capabilities. Any specific things in mind? It might be worth knowing
what we'd be getting ourselves into if you're planning on flipping
switches in other places.

Christian
Christian Brauner May 22, 2020, 1:41 p.m. UTC | #2
On Fri, May 22, 2020 at 07:53:50AM +0200, Adrian Reber wrote:
> This enables CRIU to checkpoint and restore a process as non-root.
> 
> Over the last years CRIU upstream has been asked a couple of time if it
> is possible to checkpoint and restore a process as non-root. The answer
> usually was: 'almost'.
> 
> The main blocker to restore a process was that selecting the PID of the
> restored process, which is necessary for CRIU, is guarded by CAP_SYS_ADMIN.
> 
> In the last two years the questions about checkpoint/restore as non-root
> have increased and especially in the last few months we have seen
> multiple people inventing workarounds.
> 
> The use-cases so far and their workarounds:
> 
>  * Checkpoint/Restore in an HPC environment in combination with
>    a resource manager distributing jobs. Users are always running
>    as non root, but there was the desire to provide a way to
>    checkpoint and restore long running jobs.
>    Workaround: setuid wrapper to start CRIU as root as non-root
>    https://github.com/FredHutch/slurm-examples/blob/master/checkpointer/lib/checkpointer/checkpointer-suid.c
>  * Another use case to checkpoint/restore processes as non-root
>    uses as workaround a non privileged process which cycles through
>    PIDs by calling fork() as fast as possible with a rate of
>    100,000 pids/s instead of writing to ns_last_pid
>    https://github.com/twosigma/set_ns_last_pid
>  * Fast Java startup using checkpoint/restore.
>    We have been in contact with JVM developers who are integrating
>    CRIU into a JVM to decrease the startup time.
>    Workaround so far: patch out CAP_SYS_ADMIN checks in the kernel
>  * Container migration as non root. There are people already
>    using CRIU to migrate containers as non-root. The solution
>    there is to run it in a user namespace. So if you are able
>    to carefully setup your environment with the namespaces
>    it is already possible to restore a container/process as non-root.
>    Unfortunately it is not always possible to setup an environment
>    in such a way and for easier access to non-root based container
>    migration this patch is also required.
> 
> There are probably a few more things guarded by CAP_SYS_ADMIN required
> to run checkpoint/restore as non-root, but by applying this patch I can
> already checkpoint and restore processes as non-root. As there are
> already multiple workarounds I would prefer to do it correctly in the
> kernel to avoid that CRIU users are starting to invent more workarounds.
> 
> I have used the following tests to verify that this change works as
> expected by setting the new capability CAP_RESTORE on the two resulting
> test binaries:
> 
> $ cat ns_last_pid.c
>  // http://efiop-notes.blogspot.com/2014/06/how-to-set-pid-using-nslastpid.html
>  #include <stdio.h>
>  #include <stdlib.h>
>  #include <string.h>
>  #include <sys/file.h>
>  #include <sys/types.h>
>  #include <unistd.h>
> 
> int main(int argc, char *argv[])
> {
> 	pid_t pid, new_pid;
> 	char buf[32];
> 	int fd;
> 
> 	if (argc != 2)
> 		return 1;
> 
> 	printf("Opening ns_last_pid...\n");
> 	fd = open("/proc/sys/kernel/ns_last_pid", O_RDWR | O_CREAT, 0644);
> 	if (fd < 0) {
> 		perror("Cannot open ns_last_pid");
> 		return 1;
> 	}
> 
> 	printf("Locking ns_last_pid...\n");
> 	if (flock(fd, LOCK_EX)) {
> 		close(fd);
> 		printf("Cannot lock ns_last_pid\n");
> 		return 1;
> 	}
> 
> 	pid = atoi(argv[1]);
> 	snprintf(buf, sizeof(buf), "%d", pid - 1);
> 	printf("Writing pid-1 to ns_last_pid...\n");
> 	if (write(fd, buf, strlen(buf)) != strlen(buf)) {
> 		printf("Cannot write to buf\n");
> 		return 1;
> 	}
> 
> 	printf("Forking...\n");
> 	new_pid = fork();
> 	if (new_pid == 0) {
> 		printf("I am the child!\n");
> 		exit(0);
> 	} else if (new_pid == pid)
> 		printf("I am the parent. My child got the pid %d!\n", new_pid);
> 	else
> 		printf("pid (%d) does not match expected pid (%d)\n", new_pid, pid);
> 
> 	printf("Cleaning up...\n");
> 	if (flock(fd, LOCK_UN))
> 		printf("Cannot unlock\n");
> 	close(fd);
> 	return 0;
> }
> $ id -u; /home/libcap/ns_last_pid 300000
> 1001
> Opening ns_last_pid...
> Locking ns_last_pid...
> Writing pid-1 to ns_last_pid...
> Forking...
> I am the parent. My child got the pid 300000!
> I am the child!
> Cleaning up...
> 
> For the clone3() based approach:
> $ cat clone3_set_tid.c
>  #define _GNU_SOURCE
>  #include <linux/sched.h>
>  #include <stdint.h>
>  #include <stdio.h>
>  #include <stdlib.h>
>  #include <string.h>
>  #include <sys/types.h>
>  #include <sys/stat.h>
>  #include <sys/syscall.h>
>  #include <unistd.h>
> 
>  #define ptr_to_u64(ptr) ((__u64)((uintptr_t)(ptr)))
> 
> int main(int argc, char *argv[])
> {
> 	struct clone_args c_args = { };
> 	pid_t pid, new_pid;
> 
> 	if (argc != 2)
> 		return 1;
> 
> 	pid = atoi(argv[1]);
> 	c_args.set_tid = ptr_to_u64(&pid);
> 	c_args.set_tid_size = 1;
> 
> 	printf("Forking...\n");
> 	new_pid = syscall(__NR_clone3, &c_args, sizeof(c_args));
> 	if (new_pid == 0) {
> 		printf("I am the child!\n");
> 		exit(0);
> 	} else if (new_pid == pid)
> 		printf("I am the parent. My child got the pid %d!\n", new_pid);
> 	else
> 		printf("pid (%d) does not match expected pid (%d)\n", new_pid, pid);
> 	printf("Done\n");
> 
> 	return 0;
> }
> $ id -u; /home/libcap/clone3_set_tid 300000
> 1001
> Forking...
> I am the parent. My child got the pid 300000!
> Done
> I am the child!
> 
> Signed-off-by: Adrian Reber <areber@redhat.com>
> ---
>  include/linux/capability.h          | 5 +++++
>  include/uapi/linux/capability.h     | 9 ++++++++-
>  kernel/pid.c                        | 2 +-
>  kernel/pid_namespace.c              | 2 +-
>  security/selinux/include/classmap.h | 5 +++--
>  5 files changed, 18 insertions(+), 5 deletions(-)
> 
> diff --git a/include/linux/capability.h b/include/linux/capability.h
> index b4345b38a6be..1278313cb2bc 100644
> --- a/include/linux/capability.h
> +++ b/include/linux/capability.h
> @@ -261,6 +261,11 @@ static inline bool bpf_capable(void)
>  	return capable(CAP_BPF) || capable(CAP_SYS_ADMIN);
>  }
>  
> +static inline bool restore_ns_capable(struct user_namespace *ns)
> +{
> +	return ns_capable(ns, CAP_RESTORE) || ns_capable(ns, CAP_SYS_ADMIN);
> +}
> +

When did everyone start putting their favorite little helpers in here?
I see both bpf_capable() and perform_capable() are going to be in
linux/capability.h if linux-next is to be believed. Seems they could
very well go in their own bpf headers.
Anyway, if everyone is fine with putting specific helpers in
capability.h then ok.


>  /* audit system wants to get cap info from files as well */
>  extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps);
>  
> diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
> index c7372180a0a9..4bcc4e3d41ff 100644
> --- a/include/uapi/linux/capability.h
> +++ b/include/uapi/linux/capability.h
> @@ -406,7 +406,14 @@ struct vfs_ns_cap_data {
>   */
>  #define CAP_BPF			39
>  
> -#define CAP_LAST_CAP         CAP_BPF
> +
> +/* Allow checkpoint/restore related operations */
> +/* Allow PID selection during clone3() */
> +/* Allow writing to ns_last_pid */
> +
> +#define CAP_RESTORE		40

Thinking about what Adrian said this should probably really be:

CAP_CHECKPOINT_RESTORE

I initially thought that might be too long but our config option is the
same name CONFIG_CHECKPOINT_RESTORE so seems ok and other capabilities
have similar long names (CAP_DAC_READ_SEARCH, CAP_NET_BIND_SERVICE come
to mind)

> +
> +#define CAP_LAST_CAP         CAP_RESTORE
>  
>  #define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP)
>  
> diff --git a/kernel/pid.c b/kernel/pid.c
> index 3122043fe364..bbc26f2bcff6 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -198,7 +198,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
>  			if (tid != 1 && !tmp->child_reaper)
>  				goto out_free;
>  			retval = -EPERM;
> -			if (!ns_capable(tmp->user_ns, CAP_SYS_ADMIN))
> +			if (!restore_ns_capable(tmp->user_ns))
>  				goto out_free;
>  			set_tid_size--;
>  		}
> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
> index 0e5ac162c3a8..f58186b31ce6 100644
> --- a/kernel/pid_namespace.c
> +++ b/kernel/pid_namespace.c
> @@ -269,7 +269,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
>  	struct ctl_table tmp = *table;
>  	int ret, next;
>  
> -	if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
> +	if (write && !restore_ns_capable(pid_ns->user_ns))
>  		return -EPERM;
>  
>  	/*
> diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h
> index 98e1513b608a..f8b8f12a6ebd 100644
> --- a/security/selinux/include/classmap.h
> +++ b/security/selinux/include/classmap.h
> @@ -27,9 +27,10 @@
>  	    "audit_control", "setfcap"
>  
>  #define COMMON_CAP2_PERMS  "mac_override", "mac_admin", "syslog", \
> -		"wake_alarm", "block_suspend", "audit_read", "perfmon", "bpf"
> +		"wake_alarm", "block_suspend", "audit_read", "perfmon", "bpf", \
> +		"restore"
>  
> -#if CAP_LAST_CAP > CAP_BPF
> +#if CAP_LAST_CAP > CAP_RESTORE
>  #error New capability defined, please update COMMON_CAP2_PERMS.
>  #endif
>  
> 
> base-commit: e8f3274774b45b5f4e9e3d5cad7ff9f43ae3add5
> -- 
> 2.26.2
>
Casey Schaufler May 22, 2020, 4:40 p.m. UTC | #3
On 5/21/2020 10:53 PM, Adrian Reber wrote:
> This enables CRIU to checkpoint and restore a process as non-root.

I know it sounds pedantic, but could you spell out CRIU once?
While I know that everyone who cares either knows or can guess
what you're talking about, it may be a mystery to some of the
newer kernel developers.

> Over the last years CRIU upstream has been asked a couple of time if it
> is possible to checkpoint and restore a process as non-root. The answer
> usually was: 'almost'.
>
> The main blocker to restore a process was that selecting the PID of the
> restored process, which is necessary for CRIU, is guarded by CAP_SYS_ADMIN.

What are the other blockers? Are you going to suggest additional new
capabilities to clear them?

> In the last two years the questions about checkpoint/restore as non-root
> have increased and especially in the last few months we have seen
> multiple people inventing workarounds.

Giving a process CAP_SYS_ADMIN is a non-root solution.

> The use-cases so far and their workarounds:
>
>  * Checkpoint/Restore in an HPC environment in combination with
>    a resource manager distributing jobs. Users are always running
>    as non root, but there was the desire to provide a way to
>    checkpoint and restore long running jobs.
>    Workaround: setuid wrapper to start CRIU as root as non-root
>    https://github.com/FredHutch/slurm-examples/blob/master/checkpointer/lib/checkpointer/checkpointer-suid.c

This is a classic and well understood mechanism for dealing with
this kind of situation. You could have checkpointer-filecap-sys_admin.c
instead, if you want to reduce use of the super-user.

> * Another use case to checkpoint/restore processes as non-root
>    uses as workaround a non privileged process which cycles through
>    PIDs by calling fork() as fast as possible with a rate of
>    100,000 pids/s instead of writing to ns_last_pid
>    https://github.com/twosigma/set_ns_last_pid

Oh dear.

>  * Fast Java startup using checkpoint/restore.
>    We have been in contact with JVM developers who are integrating
>    CRIU into a JVM to decrease the startup time.
>    Workaround so far: patch out CAP_SYS_ADMIN checks in the kernel

That's not a workaround, it's a policy violation.
Bad JVM! No biscuit!

>  * Container migration as non root. There are people already
>    using CRIU to migrate containers as non-root. The solution
>    there is to run it in a user namespace. So if you are able
>    to carefully setup your environment with the namespaces
>    it is already possible to restore a container/process as non-root.

This is exactly the kind of situation that user namespaces are
supposed to address.

>    Unfortunately it is not always possible to setup an environment
>    in such a way and for easier access to non-root based container
>    migration this patch is also required.

If a user namespace solution is impossible or (more likely) too
expensive, there's always the checkpointer-filecap-sys_admin option.

> There are probably a few more things guarded by CAP_SYS_ADMIN required
> to run checkpoint/restore as non-root,

If you need CAP_SYS_ADMIN anyway you're not gaining anything by
separating out CAP_RESTORE.

>  but by applying this patch I can
> already checkpoint and restore processes as non-root. As there are
> already multiple workarounds I would prefer to do it correctly in the
> kernel to avoid that CRIU users are starting to invent more workarounds.

You've presented a couple of really inappropriate implementations
that would qualify as workarounds. But the other two are completely
appropriate within the system security policy. They don't "get around"
the problem, they use existing mechanisms as they are intended.



> I have used the following tests to verify that this change works as
> expected by setting the new capability CAP_RESTORE on the two resulting
> test binaries:
>
> $ cat ns_last_pid.c
>  // http://efiop-notes.blogspot.com/2014/06/how-to-set-pid-using-nslastpid.html
>  #include <stdio.h>
>  #include <stdlib.h>
>  #include <string.h>
>  #include <sys/file.h>
>  #include <sys/types.h>
>  #include <unistd.h>
>
> int main(int argc, char *argv[])
> {
> 	pid_t pid, new_pid;
> 	char buf[32];
> 	int fd;
>
> 	if (argc != 2)
> 		return 1;
>
> 	printf("Opening ns_last_pid...\n");
> 	fd = open("/proc/sys/kernel/ns_last_pid", O_RDWR | O_CREAT, 0644);
> 	if (fd < 0) {
> 		perror("Cannot open ns_last_pid");
> 		return 1;
> 	}
>
> 	printf("Locking ns_last_pid...\n");
> 	if (flock(fd, LOCK_EX)) {
> 		close(fd);
> 		printf("Cannot lock ns_last_pid\n");
> 		return 1;
> 	}
>
> 	pid = atoi(argv[1]);
> 	snprintf(buf, sizeof(buf), "%d", pid - 1);
> 	printf("Writing pid-1 to ns_last_pid...\n");
> 	if (write(fd, buf, strlen(buf)) != strlen(buf)) {
> 		printf("Cannot write to buf\n");
> 		return 1;
> 	}
>
> 	printf("Forking...\n");
> 	new_pid = fork();
> 	if (new_pid == 0) {
> 		printf("I am the child!\n");
> 		exit(0);
> 	} else if (new_pid == pid)
> 		printf("I am the parent. My child got the pid %d!\n", new_pid);
> 	else
> 		printf("pid (%d) does not match expected pid (%d)\n", new_pid, pid);
>
> 	printf("Cleaning up...\n");
> 	if (flock(fd, LOCK_UN))
> 		printf("Cannot unlock\n");
> 	close(fd);
> 	return 0;
> }
> $ id -u; /home/libcap/ns_last_pid 300000
> 1001
> Opening ns_last_pid...
> Locking ns_last_pid...
> Writing pid-1 to ns_last_pid...
> Forking...
> I am the parent. My child got the pid 300000!
> I am the child!
> Cleaning up...
>
> For the clone3() based approach:
> $ cat clone3_set_tid.c
>  #define _GNU_SOURCE
>  #include <linux/sched.h>
>  #include <stdint.h>
>  #include <stdio.h>
>  #include <stdlib.h>
>  #include <string.h>
>  #include <sys/types.h>
>  #include <sys/stat.h>
>  #include <sys/syscall.h>
>  #include <unistd.h>
>
>  #define ptr_to_u64(ptr) ((__u64)((uintptr_t)(ptr)))
>
> int main(int argc, char *argv[])
> {
> 	struct clone_args c_args = { };
> 	pid_t pid, new_pid;
>
> 	if (argc != 2)
> 		return 1;
>
> 	pid = atoi(argv[1]);
> 	c_args.set_tid = ptr_to_u64(&pid);
> 	c_args.set_tid_size = 1;
>
> 	printf("Forking...\n");
> 	new_pid = syscall(__NR_clone3, &c_args, sizeof(c_args));
> 	if (new_pid == 0) {
> 		printf("I am the child!\n");
> 		exit(0);
> 	} else if (new_pid == pid)
> 		printf("I am the parent. My child got the pid %d!\n", new_pid);
> 	else
> 		printf("pid (%d) does not match expected pid (%d)\n", new_pid, pid);
> 	printf("Done\n");
>
> 	return 0;
> }
> $ id -u; /home/libcap/clone3_set_tid 300000
> 1001
> Forking...
> I am the parent. My child got the pid 300000!
> Done
> I am the child!
>
> Signed-off-by: Adrian Reber <areber@redhat.com>
> ---
>  include/linux/capability.h          | 5 +++++
>  include/uapi/linux/capability.h     | 9 ++++++++-
>  kernel/pid.c                        | 2 +-
>  kernel/pid_namespace.c              | 2 +-
>  security/selinux/include/classmap.h | 5 +++--
>  5 files changed, 18 insertions(+), 5 deletions(-)
>
> diff --git a/include/linux/capability.h b/include/linux/capability.h
> index b4345b38a6be..1278313cb2bc 100644
> --- a/include/linux/capability.h
> +++ b/include/linux/capability.h
> @@ -261,6 +261,11 @@ static inline bool bpf_capable(void)
>  	return capable(CAP_BPF) || capable(CAP_SYS_ADMIN);
>  }
>  
> +static inline bool restore_ns_capable(struct user_namespace *ns)
> +{
> +	return ns_capable(ns, CAP_RESTORE) || ns_capable(ns, CAP_SYS_ADMIN);
> +}
> +
>  /* audit system wants to get cap info from files as well */
>  extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps);
>  
> diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
> index c7372180a0a9..4bcc4e3d41ff 100644
> --- a/include/uapi/linux/capability.h
> +++ b/include/uapi/linux/capability.h
> @@ -406,7 +406,14 @@ struct vfs_ns_cap_data {
>   */
>  #define CAP_BPF			39
>  
> -#define CAP_LAST_CAP         CAP_BPF
> +
> +/* Allow checkpoint/restore related operations */
> +/* Allow PID selection during clone3() */
> +/* Allow writing to ns_last_pid */
> +
> +#define CAP_RESTORE		40
> +
> +#define CAP_LAST_CAP         CAP_RESTORE
>  
>  #define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP)
>  
> diff --git a/kernel/pid.c b/kernel/pid.c
> index 3122043fe364..bbc26f2bcff6 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -198,7 +198,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
>  			if (tid != 1 && !tmp->child_reaper)
>  				goto out_free;
>  			retval = -EPERM;
> -			if (!ns_capable(tmp->user_ns, CAP_SYS_ADMIN))
> +			if (!restore_ns_capable(tmp->user_ns))
>  				goto out_free;
>  			set_tid_size--;
>  		}
> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
> index 0e5ac162c3a8..f58186b31ce6 100644
> --- a/kernel/pid_namespace.c
> +++ b/kernel/pid_namespace.c
> @@ -269,7 +269,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
>  	struct ctl_table tmp = *table;
>  	int ret, next;
>  
> -	if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
> +	if (write && !restore_ns_capable(pid_ns->user_ns))
>  		return -EPERM;
>  
>  	/*
> diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h
> index 98e1513b608a..f8b8f12a6ebd 100644
> --- a/security/selinux/include/classmap.h
> +++ b/security/selinux/include/classmap.h
> @@ -27,9 +27,10 @@
>  	    "audit_control", "setfcap"
>  
>  #define COMMON_CAP2_PERMS  "mac_override", "mac_admin", "syslog", \
> -		"wake_alarm", "block_suspend", "audit_read", "perfmon", "bpf"
> +		"wake_alarm", "block_suspend", "audit_read", "perfmon", "bpf", \
> +		"restore"
>  
> -#if CAP_LAST_CAP > CAP_BPF
> +#if CAP_LAST_CAP > CAP_RESTORE
>  #error New capability defined, please update COMMON_CAP2_PERMS.
>  #endif
>  
>
> base-commit: e8f3274774b45b5f4e9e3d5cad7ff9f43ae3add5
Andrei Vagin May 22, 2020, 6:02 p.m. UTC | #4
On Fri, May 22, 2020 at 09:53:31AM +0200, Christian Brauner wrote:
> On Fri, May 22, 2020 at 07:53:50AM +0200, Adrian Reber wrote:
> > 
> > There are probably a few more things guarded by CAP_SYS_ADMIN required
> > to run checkpoint/restore as non-root, but by applying this patch I can
> > already checkpoint and restore processes as non-root. As there are
> > already multiple workarounds I would prefer to do it correctly in the
> > kernel to avoid that CRIU users are starting to invent more workarounds.
> 
> It sounds ok to me as long as this feature is guarded by any sensible
> capability. I don't want users to be able to randomly choose their pid
> without any capability required.
> 
> We've heard the plea for unprivileged checkpoint/restore through the
> grapevine and a few times about CAP_RESTORE at plumbers but it's one of
> those cases where nobody pushed for it so it's urgency was questionable.
> This is 5.9 material though and could you please add selftests?
> 
> It also seems you have future changes planned that would make certain
> things accessible via CAP_RESTORE that are currently guarded by other
> capabilities. Any specific things in mind? It might be worth knowing
> what we'd be getting ourselves into if you're planning on flipping
> switches in other places.

/proc/pid/map_files is one of the first candidate what we need to think
about. CRIU opens files from /proc/pid/map_files to dump file mappings,
shared memory mappings, memfd files.

Right now, it is impossible to open these files without CAP_SYS_ADMIN in
the root user-namespace (proc_map_files_get_link).
Andrei Vagin May 23, 2020, 4:27 a.m. UTC | #5
On Fri, May 22, 2020 at 09:40:37AM -0700, Casey Schaufler wrote:
> On 5/21/2020 10:53 PM, Adrian Reber wrote:
> > There are probably a few more things guarded by CAP_SYS_ADMIN required
> > to run checkpoint/restore as non-root,
> 
> If you need CAP_SYS_ADMIN anyway you're not gaining anything by
> separating out CAP_RESTORE.
> 
> >  but by applying this patch I can
> > already checkpoint and restore processes as non-root. As there are
> > already multiple workarounds I would prefer to do it correctly in the
> > kernel to avoid that CRIU users are starting to invent more workarounds.
> 
> You've presented a couple of really inappropriate implementations
> that would qualify as workarounds. But the other two are completely
> appropriate within the system security policy. They don't "get around"
> the problem, they use existing mechanisms as they are intended.
> 

With CAP_CHECKPOINT_RESTORE, we will need to use the same mechanisms.

The problem is that CAP_SYS_ADMIN is too wide. If a process has
CAP_SYS_ADMIN, it can do a lot of things and  the operation of forking a
process with a specified pid isn't the most dangerous one in this case.
Offten security policies don't allow to grant CAP_SYS_ADMIN to any
third-party tools even in non-root user namespaces.
Casey Schaufler May 25, 2020, 2:01 a.m. UTC | #6
On 5/22/2020 9:27 PM, Andrei Vagin wrote:
> On Fri, May 22, 2020 at 09:40:37AM -0700, Casey Schaufler wrote:
>> On 5/21/2020 10:53 PM, Adrian Reber wrote:
>>> There are probably a few more things guarded by CAP_SYS_ADMIN required
>>> to run checkpoint/restore as non-root,
>> If you need CAP_SYS_ADMIN anyway you're not gaining anything by
>> separating out CAP_RESTORE.
>>
>>>  but by applying this patch I can
>>> already checkpoint and restore processes as non-root. As there are
>>> already multiple workarounds I would prefer to do it correctly in the
>>> kernel to avoid that CRIU users are starting to invent more workarounds.
>> You've presented a couple of really inappropriate implementations
>> that would qualify as workarounds. But the other two are completely
>> appropriate within the system security policy. They don't "get around"
>> the problem, they use existing mechanisms as they are intended.
>>
> With CAP_CHECKPOINT_RESTORE, we will need to use the same mechanisms.

Then why call them out as objectionable "workarounds"?

> The problem is that CAP_SYS_ADMIN is too wide.

This is well understood, and irrelevant.

If we broke out CAP_SYS_ADMIN properly we'd have hundreds of
capabilities, and no one would be able to manage the capability
sets on anything. Just breaking out of CAP_SYS_ADMIN, especially
if the process is going to need other capabilities anyway, gains
you nothing.

>  If a process has
> CAP_SYS_ADMIN, it can do a lot of things and  the operation of forking a
> process with a specified pid isn't the most dangerous one in this case.
> Offten security policies don't allow to grant CAP_SYS_ADMIN to any
> third-party tools even in non-root user namespaces.
>
Adrian Reber May 25, 2020, 8:05 a.m. UTC | #7
On Fri, May 22, 2020 at 09:40:37AM -0700, Casey Schaufler wrote:
> On 5/21/2020 10:53 PM, Adrian Reber wrote:
> > This enables CRIU to checkpoint and restore a process as non-root.
> 
> I know it sounds pedantic, but could you spell out CRIU once?
> While I know that everyone who cares either knows or can guess
> what you're talking about, it may be a mystery to some of the
> newer kernel developers.

Sure. CRIU - Checkpoint/Restore In Userspace.

> > Over the last years CRIU upstream has been asked a couple of time if it
> > is possible to checkpoint and restore a process as non-root. The answer
> > usually was: 'almost'.
> >
> > The main blocker to restore a process was that selecting the PID of the
> > restored process, which is necessary for CRIU, is guarded by CAP_SYS_ADMIN.
> 
> What are the other blockers? Are you going to suggest additional new
> capabilities to clear them?

As mentioned somewhere else access to /proc/<pid>/map_files/ would be
helpful. Right now I am testing with a JVM and it works without root
just with the attached patch. Without access to /proc/<pid>/map_files/
not everything CRIU can do will actually work, but we are a lot closer
to what our users have been asking for.

> > In the last two years the questions about checkpoint/restore as non-root
> > have increased and especially in the last few months we have seen
> > multiple people inventing workarounds.
> 
> Giving a process CAP_SYS_ADMIN is a non-root solution.

Yes, but like mentioned somewhere else not a solution that actually
works, because CAP_SYS_ADMIN allows too much. Especially for the
checkpoint/restore case, we really need one (setting the PID of a new
process) and to make it complete a second (reading map_files).

Reading the comments in include/uapi/linux/capability.h concerning
CAP_SYS_ADMIN it allows the binary to do at least 35 things. The two
(three) I mentioned above (ns_last_pid (clone3) map_files) are not
mentioned in that list, so CAP_SYS_ADMIN allows probably much more.

To allow checkpoint/restore as non-root nobody will give CRIU
CAP_SYS_ADMIN because it is too wide.

> > The use-cases so far and their workarounds:
> >
> >  * Checkpoint/Restore in an HPC environment in combination with
> >    a resource manager distributing jobs. Users are always running
> >    as non root, but there was the desire to provide a way to
> >    checkpoint and restore long running jobs.
> >    Workaround: setuid wrapper to start CRIU as root as non-root
> >    https://github.com/FredHutch/slurm-examples/blob/master/checkpointer/lib/checkpointer/checkpointer-suid.c
> 
> This is a classic and well understood mechanism for dealing with
> this kind of situation. You could have checkpointer-filecap-sys_admin.c
> instead, if you want to reduce use of the super-user.
> 
> > * Another use case to checkpoint/restore processes as non-root
> >    uses as workaround a non privileged process which cycles through
> >    PIDs by calling fork() as fast as possible with a rate of
> >    100,000 pids/s instead of writing to ns_last_pid
> >    https://github.com/twosigma/set_ns_last_pid
> 
> Oh dear.
> 
> >  * Fast Java startup using checkpoint/restore.
> >    We have been in contact with JVM developers who are integrating
> >    CRIU into a JVM to decrease the startup time.
> >    Workaround so far: patch out CAP_SYS_ADMIN checks in the kernel
> 
> That's not a workaround, it's a policy violation.
> Bad JVM! No biscuit!

This was used as a proof of concept to see if we can checkpoint and
restore a JVM without root. Only the ns_last_pid check was removed to
see if it works and it does.

> >  * Container migration as non root. There are people already
> >    using CRIU to migrate containers as non-root. The solution
> >    there is to run it in a user namespace. So if you are able
> >    to carefully setup your environment with the namespaces
> >    it is already possible to restore a container/process as non-root.
> 
> This is exactly the kind of situation that user namespaces are
> supposed to address.
> 
> >    Unfortunately it is not always possible to setup an environment
> >    in such a way and for easier access to non-root based container
> >    migration this patch is also required.
> 
> If a user namespace solution is impossible or (more likely) too
> expensive, there's always the checkpointer-filecap-sys_admin option.

But then again we open up all of CAP_SYS_ADMIN, which is not necessary.

> > There are probably a few more things guarded by CAP_SYS_ADMIN required
> > to run checkpoint/restore as non-root,
> 
> If you need CAP_SYS_ADMIN anyway you're not gaining anything by
> separating out CAP_RESTORE.

No, as described we can checkpoint and restore a JVM with this patch and
it also solves the problem the set_ns_last_pid fork() loop daemon tries
to solve. It is not enough to support the full functionality of CRIU as
map_files is also important, but we do not need CAP_SYS_ADMIN and
CAP_RESTORE. Only CAP_RESTORE would be necessary.

With a new capability users can enable checkpoint/restore as non-root
without giving CRIU access to any of the other possibilities offered by
CAP_SYS_ADMIN. Setting a PID and map_files have been introduced for CRIU
and used to live behind CONFIG_CHECKPOINT_RESTORE. Having a capability
for checkpoint/restore would make it easier for CRIU users to run it as
non-root and make it very clear what is possible when giving CRIU the
new capability. No other things would be allowed than necessary for
checkpoint/restore. Setting a PID is most important for the restore part
and reading map_files would be helpful during checkpoint. So it actually
should be called CAP_CHECKPOINT_RESTORE as Christian mentioned in
another email.

> >  but by applying this patch I can
> > already checkpoint and restore processes as non-root. As there are
> > already multiple workarounds I would prefer to do it correctly in the
> > kernel to avoid that CRIU users are starting to invent more workarounds.
> 
> You've presented a couple of really inappropriate implementations
> that would qualify as workarounds. But the other two are completely
> appropriate within the system security policy. They don't "get around"
> the problem, they use existing mechanisms as they are intended.

I agree with the user namespace approach to be appropriate, but not the
CAP_SYS_ADMIN approach as CRIU only needs a tiny subset (2 things) of
what CAP_SYS_ADMIN allows.

> > I have used the following tests to verify that this change works as
> > expected by setting the new capability CAP_RESTORE on the two resulting
> > test binaries:
> >
> > $ cat ns_last_pid.c
> >  // http://efiop-notes.blogspot.com/2014/06/how-to-set-pid-using-nslastpid.html
> >  #include <stdio.h>
> >  #include <stdlib.h>
> >  #include <string.h>
> >  #include <sys/file.h>
> >  #include <sys/types.h>
> >  #include <unistd.h>
> >
> > int main(int argc, char *argv[])
> > {
> > 	pid_t pid, new_pid;
> > 	char buf[32];
> > 	int fd;
> >
> > 	if (argc != 2)
> > 		return 1;
> >
> > 	printf("Opening ns_last_pid...\n");
> > 	fd = open("/proc/sys/kernel/ns_last_pid", O_RDWR | O_CREAT, 0644);
> > 	if (fd < 0) {
> > 		perror("Cannot open ns_last_pid");
> > 		return 1;
> > 	}
> >
> > 	printf("Locking ns_last_pid...\n");
> > 	if (flock(fd, LOCK_EX)) {
> > 		close(fd);
> > 		printf("Cannot lock ns_last_pid\n");
> > 		return 1;
> > 	}
> >
> > 	pid = atoi(argv[1]);
> > 	snprintf(buf, sizeof(buf), "%d", pid - 1);
> > 	printf("Writing pid-1 to ns_last_pid...\n");
> > 	if (write(fd, buf, strlen(buf)) != strlen(buf)) {
> > 		printf("Cannot write to buf\n");
> > 		return 1;
> > 	}
> >
> > 	printf("Forking...\n");
> > 	new_pid = fork();
> > 	if (new_pid == 0) {
> > 		printf("I am the child!\n");
> > 		exit(0);
> > 	} else if (new_pid == pid)
> > 		printf("I am the parent. My child got the pid %d!\n", new_pid);
> > 	else
> > 		printf("pid (%d) does not match expected pid (%d)\n", new_pid, pid);
> >
> > 	printf("Cleaning up...\n");
> > 	if (flock(fd, LOCK_UN))
> > 		printf("Cannot unlock\n");
> > 	close(fd);
> > 	return 0;
> > }
> > $ id -u; /home/libcap/ns_last_pid 300000
> > 1001
> > Opening ns_last_pid...
> > Locking ns_last_pid...
> > Writing pid-1 to ns_last_pid...
> > Forking...
> > I am the parent. My child got the pid 300000!
> > I am the child!
> > Cleaning up...
> >
> > For the clone3() based approach:
> > $ cat clone3_set_tid.c
> >  #define _GNU_SOURCE
> >  #include <linux/sched.h>
> >  #include <stdint.h>
> >  #include <stdio.h>
> >  #include <stdlib.h>
> >  #include <string.h>
> >  #include <sys/types.h>
> >  #include <sys/stat.h>
> >  #include <sys/syscall.h>
> >  #include <unistd.h>
> >
> >  #define ptr_to_u64(ptr) ((__u64)((uintptr_t)(ptr)))
> >
> > int main(int argc, char *argv[])
> > {
> > 	struct clone_args c_args = { };
> > 	pid_t pid, new_pid;
> >
> > 	if (argc != 2)
> > 		return 1;
> >
> > 	pid = atoi(argv[1]);
> > 	c_args.set_tid = ptr_to_u64(&pid);
> > 	c_args.set_tid_size = 1;
> >
> > 	printf("Forking...\n");
> > 	new_pid = syscall(__NR_clone3, &c_args, sizeof(c_args));
> > 	if (new_pid == 0) {
> > 		printf("I am the child!\n");
> > 		exit(0);
> > 	} else if (new_pid == pid)
> > 		printf("I am the parent. My child got the pid %d!\n", new_pid);
> > 	else
> > 		printf("pid (%d) does not match expected pid (%d)\n", new_pid, pid);
> > 	printf("Done\n");
> >
> > 	return 0;
> > }
> > $ id -u; /home/libcap/clone3_set_tid 300000
> > 1001
> > Forking...
> > I am the parent. My child got the pid 300000!
> > Done
> > I am the child!
> >
> > Signed-off-by: Adrian Reber <areber@redhat.com>
> > ---
> >  include/linux/capability.h          | 5 +++++
> >  include/uapi/linux/capability.h     | 9 ++++++++-
> >  kernel/pid.c                        | 2 +-
> >  kernel/pid_namespace.c              | 2 +-
> >  security/selinux/include/classmap.h | 5 +++--
> >  5 files changed, 18 insertions(+), 5 deletions(-)
> >
> > diff --git a/include/linux/capability.h b/include/linux/capability.h
> > index b4345b38a6be..1278313cb2bc 100644
> > --- a/include/linux/capability.h
> > +++ b/include/linux/capability.h
> > @@ -261,6 +261,11 @@ static inline bool bpf_capable(void)
> >  	return capable(CAP_BPF) || capable(CAP_SYS_ADMIN);
> >  }
> >  
> > +static inline bool restore_ns_capable(struct user_namespace *ns)
> > +{
> > +	return ns_capable(ns, CAP_RESTORE) || ns_capable(ns, CAP_SYS_ADMIN);
> > +}
> > +
> >  /* audit system wants to get cap info from files as well */
> >  extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps);
> >  
> > diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
> > index c7372180a0a9..4bcc4e3d41ff 100644
> > --- a/include/uapi/linux/capability.h
> > +++ b/include/uapi/linux/capability.h
> > @@ -406,7 +406,14 @@ struct vfs_ns_cap_data {
> >   */
> >  #define CAP_BPF			39
> >  
> > -#define CAP_LAST_CAP         CAP_BPF
> > +
> > +/* Allow checkpoint/restore related operations */
> > +/* Allow PID selection during clone3() */
> > +/* Allow writing to ns_last_pid */
> > +
> > +#define CAP_RESTORE		40
> > +
> > +#define CAP_LAST_CAP         CAP_RESTORE
> >  
> >  #define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP)
> >  
> > diff --git a/kernel/pid.c b/kernel/pid.c
> > index 3122043fe364..bbc26f2bcff6 100644
> > --- a/kernel/pid.c
> > +++ b/kernel/pid.c
> > @@ -198,7 +198,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
> >  			if (tid != 1 && !tmp->child_reaper)
> >  				goto out_free;
> >  			retval = -EPERM;
> > -			if (!ns_capable(tmp->user_ns, CAP_SYS_ADMIN))
> > +			if (!restore_ns_capable(tmp->user_ns))
> >  				goto out_free;
> >  			set_tid_size--;
> >  		}
> > diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
> > index 0e5ac162c3a8..f58186b31ce6 100644
> > --- a/kernel/pid_namespace.c
> > +++ b/kernel/pid_namespace.c
> > @@ -269,7 +269,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
> >  	struct ctl_table tmp = *table;
> >  	int ret, next;
> >  
> > -	if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
> > +	if (write && !restore_ns_capable(pid_ns->user_ns))
> >  		return -EPERM;
> >  
> >  	/*
> > diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h
> > index 98e1513b608a..f8b8f12a6ebd 100644
> > --- a/security/selinux/include/classmap.h
> > +++ b/security/selinux/include/classmap.h
> > @@ -27,9 +27,10 @@
> >  	    "audit_control", "setfcap"
> >  
> >  #define COMMON_CAP2_PERMS  "mac_override", "mac_admin", "syslog", \
> > -		"wake_alarm", "block_suspend", "audit_read", "perfmon", "bpf"
> > +		"wake_alarm", "block_suspend", "audit_read", "perfmon", "bpf", \
> > +		"restore"
> >  
> > -#if CAP_LAST_CAP > CAP_BPF
> > +#if CAP_LAST_CAP > CAP_RESTORE
> >  #error New capability defined, please update COMMON_CAP2_PERMS.
> >  #endif
> >  
> >
> > base-commit: e8f3274774b45b5f4e9e3d5cad7ff9f43ae3add5
>
Casey Schaufler May 25, 2020, 6:55 p.m. UTC | #8
On 5/25/2020 1:05 AM, Adrian Reber wrote:
> On Fri, May 22, 2020 at 09:40:37AM -0700, Casey Schaufler wrote:
>> On 5/21/2020 10:53 PM, Adrian Reber wrote:
>>> This enables CRIU to checkpoint and restore a process as non-root.
>> I know it sounds pedantic, but could you spell out CRIU once?
>> While I know that everyone who cares either knows or can guess
>> what you're talking about, it may be a mystery to some of the
>> newer kernel developers.
> Sure. CRIU - Checkpoint/Restore In Userspace.

Thanks. I blew out my acronym processor in the 1990's while
working on trusted Unix system security evaluations.

>>> Over the last years CRIU upstream has been asked a couple of time if it
>>> is possible to checkpoint and restore a process as non-root. The answer
>>> usually was: 'almost'.
>>>
>>> The main blocker to restore a process was that selecting the PID of the
>>> restored process, which is necessary for CRIU, is guarded by CAP_SYS_ADMIN.
>> What are the other blockers? Are you going to suggest additional new
>> capabilities to clear them?
> As mentioned somewhere else access to /proc/<pid>/map_files/ would be
> helpful. Right now I am testing with a JVM and it works without root
> just with the attached patch. Without access to /proc/<pid>/map_files/
> not everything CRIU can do will actually work, but we are a lot closer
> to what our users have been asking for.

Are you talking about read access to map_files owned by other users
or write access to map_files for the current user?
 

>>> In the last two years the questions about checkpoint/restore as non-root
>>> have increased and especially in the last few months we have seen
>>> multiple people inventing workarounds.
>> Giving a process CAP_SYS_ADMIN is a non-root solution.
> Yes, but like mentioned somewhere else not a solution that actually
> works,

It's a solution that will execute and do what you're asking of it ...

>  because CAP_SYS_ADMIN allows too much.

... but apparently not one that your users find satisfactory.

>  Especially for the
> checkpoint/restore case, we really need one (setting the PID of a new
> process) and to make it complete a second (reading map_files).
>
> Reading the comments in include/uapi/linux/capability.h concerning
> CAP_SYS_ADMIN it allows the binary to do at least 35 things. The two
> (three) I mentioned above (ns_last_pid (clone3) map_files) are not
> mentioned in that list, so CAP_SYS_ADMIN allows probably much more.
>
> To allow checkpoint/restore as non-root nobody will give CRIU
> CAP_SYS_ADMIN because it is too wide.

CAP_SYS_ADMIN exists for system behaviors that are not policy enforcement,
but important to the system nonetheless. If you argue that checkpoint/restart
is system policy enforcement rather then an administrative task it would
be easier to sell.

Nobody likes CAP_SYS_ADMIN, but usually a process that does one of the
things it covers will do more (sometimes many more) of the things it
covers. The longstanding problem with breaking up CAP_SYS_ADMIN is that
most breakouts result in programs that still need CAP_SYS_ADMIN anyway.

>>> The use-cases so far and their workarounds:
>>>
>>>  * Checkpoint/Restore in an HPC environment in combination with
>>>    a resource manager distributing jobs. Users are always running
>>>    as non root, but there was the desire to provide a way to
>>>    checkpoint and restore long running jobs.
>>>    Workaround: setuid wrapper to start CRIU as root as non-root
>>>    https://github.com/FredHutch/slurm-examples/blob/master/checkpointer/lib/checkpointer/checkpointer-suid.c
>> This is a classic and well understood mechanism for dealing with
>> this kind of situation. You could have checkpointer-filecap-sys_admin.c
>> instead, if you want to reduce use of the super-user.
>>
>>> * Another use case to checkpoint/restore processes as non-root
>>>    uses as workaround a non privileged process which cycles through
>>>    PIDs by calling fork() as fast as possible with a rate of
>>>    100,000 pids/s instead of writing to ns_last_pid
>>>    https://github.com/twosigma/set_ns_last_pid
>> Oh dear.
>>
>>>  * Fast Java startup using checkpoint/restore.
>>>    We have been in contact with JVM developers who are integrating
>>>    CRIU into a JVM to decrease the startup time.
>>>    Workaround so far: patch out CAP_SYS_ADMIN checks in the kernel
>> That's not a workaround, it's a policy violation.
>> Bad JVM! No biscuit!
> This was used as a proof of concept to see if we can checkpoint and
> restore a JVM without root. Only the ns_last_pid check was removed to
> see if it works and it does.
>
>>>  * Container migration as non root. There are people already
>>>    using CRIU to migrate containers as non-root. The solution
>>>    there is to run it in a user namespace. So if you are able
>>>    to carefully setup your environment with the namespaces
>>>    it is already possible to restore a container/process as non-root.
>> This is exactly the kind of situation that user namespaces are
>> supposed to address.
>>
>>>    Unfortunately it is not always possible to setup an environment
>>>    in such a way and for easier access to non-root based container
>>>    migration this patch is also required.
>> If a user namespace solution is impossible or (more likely) too
>> expensive, there's always the checkpointer-filecap-sys_admin option.
> But then again we open up all of CAP_SYS_ADMIN, which is not necessary.

Right, I understand that.

>>> There are probably a few more things guarded by CAP_SYS_ADMIN required
>>> to run checkpoint/restore as non-root,
>> If you need CAP_SYS_ADMIN anyway you're not gaining anything by
>> separating out CAP_RESTORE.
> No, as described we can checkpoint and restore a JVM with this patch and
> it also solves the problem the set_ns_last_pid fork() loop daemon tries
> to solve. It is not enough to support the full functionality of CRIU as
> map_files is also important, but we do not need CAP_SYS_ADMIN and
> CAP_RESTORE. Only CAP_RESTORE would be necessary.

Excellent!

Now, is there any reason other than your program that a process would
use CAP_RESTORE? If a process has this capability what damage could it
do to the system?

>
> With a new capability users can enable checkpoint/restore as non-root
> without giving CRIU access to any of the other possibilities offered by
> CAP_SYS_ADMIN. Setting a PID and map_files have been introduced for CRIU
> and used to live behind CONFIG_CHECKPOINT_RESTORE. Having a capability
> for checkpoint/restore would make it easier for CRIU users to run it as
> non-root and make it very clear what is possible when giving CRIU the
> new capability. No other things would be allowed than necessary for
> checkpoint/restore. Setting a PID is most important for the restore part
> and reading map_files would be helpful during checkpoint. So it actually
> should be called CAP_CHECKPOINT_RESTORE as Christian mentioned in
> another email.
>
>>>  but by applying this patch I can
>>> already checkpoint and restore processes as non-root. As there are
>>> already multiple workarounds I would prefer to do it correctly in the
>>> kernel to avoid that CRIU users are starting to invent more workarounds.
>> You've presented a couple of really inappropriate implementations
>> that would qualify as workarounds. But the other two are completely
>> appropriate within the system security policy. They don't "get around"
>> the problem, they use existing mechanisms as they are intended.
> I agree with the user namespace approach to be appropriate, but not the
> CAP_SYS_ADMIN approach as CRIU only needs a tiny subset (2 things) of
> what CAP_SYS_ADMIN allows.
>
>>> I have used the following tests to verify that this change works as
>>> expected by setting the new capability CAP_RESTORE on the two resulting
>>> test binaries:
>>>
>>> $ cat ns_last_pid.c
>>>  // http://efiop-notes.blogspot.com/2014/06/how-to-set-pid-using-nslastpid.html
>>>  #include <stdio.h>
>>>  #include <stdlib.h>
>>>  #include <string.h>
>>>  #include <sys/file.h>
>>>  #include <sys/types.h>
>>>  #include <unistd.h>
>>>
>>> int main(int argc, char *argv[])
>>> {
>>> 	pid_t pid, new_pid;
>>> 	char buf[32];
>>> 	int fd;
>>>
>>> 	if (argc != 2)
>>> 		return 1;
>>>
>>> 	printf("Opening ns_last_pid...\n");
>>> 	fd = open("/proc/sys/kernel/ns_last_pid", O_RDWR | O_CREAT, 0644);
>>> 	if (fd < 0) {
>>> 		perror("Cannot open ns_last_pid");
>>> 		return 1;
>>> 	}
>>>
>>> 	printf("Locking ns_last_pid...\n");
>>> 	if (flock(fd, LOCK_EX)) {
>>> 		close(fd);
>>> 		printf("Cannot lock ns_last_pid\n");
>>> 		return 1;
>>> 	}
>>>
>>> 	pid = atoi(argv[1]);
>>> 	snprintf(buf, sizeof(buf), "%d", pid - 1);
>>> 	printf("Writing pid-1 to ns_last_pid...\n");
>>> 	if (write(fd, buf, strlen(buf)) != strlen(buf)) {
>>> 		printf("Cannot write to buf\n");
>>> 		return 1;
>>> 	}
>>>
>>> 	printf("Forking...\n");
>>> 	new_pid = fork();
>>> 	if (new_pid == 0) {
>>> 		printf("I am the child!\n");
>>> 		exit(0);
>>> 	} else if (new_pid == pid)
>>> 		printf("I am the parent. My child got the pid %d!\n", new_pid);
>>> 	else
>>> 		printf("pid (%d) does not match expected pid (%d)\n", new_pid, pid);
>>>
>>> 	printf("Cleaning up...\n");
>>> 	if (flock(fd, LOCK_UN))
>>> 		printf("Cannot unlock\n");
>>> 	close(fd);
>>> 	return 0;
>>> }
>>> $ id -u; /home/libcap/ns_last_pid 300000
>>> 1001
>>> Opening ns_last_pid...
>>> Locking ns_last_pid...
>>> Writing pid-1 to ns_last_pid...
>>> Forking...
>>> I am the parent. My child got the pid 300000!
>>> I am the child!
>>> Cleaning up...
>>>
>>> For the clone3() based approach:
>>> $ cat clone3_set_tid.c
>>>  #define _GNU_SOURCE
>>>  #include <linux/sched.h>
>>>  #include <stdint.h>
>>>  #include <stdio.h>
>>>  #include <stdlib.h>
>>>  #include <string.h>
>>>  #include <sys/types.h>
>>>  #include <sys/stat.h>
>>>  #include <sys/syscall.h>
>>>  #include <unistd.h>
>>>
>>>  #define ptr_to_u64(ptr) ((__u64)((uintptr_t)(ptr)))
>>>
>>> int main(int argc, char *argv[])
>>> {
>>> 	struct clone_args c_args = { };
>>> 	pid_t pid, new_pid;
>>>
>>> 	if (argc != 2)
>>> 		return 1;
>>>
>>> 	pid = atoi(argv[1]);
>>> 	c_args.set_tid = ptr_to_u64(&pid);
>>> 	c_args.set_tid_size = 1;
>>>
>>> 	printf("Forking...\n");
>>> 	new_pid = syscall(__NR_clone3, &c_args, sizeof(c_args));
>>> 	if (new_pid == 0) {
>>> 		printf("I am the child!\n");
>>> 		exit(0);
>>> 	} else if (new_pid == pid)
>>> 		printf("I am the parent. My child got the pid %d!\n", new_pid);
>>> 	else
>>> 		printf("pid (%d) does not match expected pid (%d)\n", new_pid, pid);
>>> 	printf("Done\n");
>>>
>>> 	return 0;
>>> }
>>> $ id -u; /home/libcap/clone3_set_tid 300000
>>> 1001
>>> Forking...
>>> I am the parent. My child got the pid 300000!
>>> Done
>>> I am the child!
>>>
>>> Signed-off-by: Adrian Reber <areber@redhat.com>
>>> ---
>>>  include/linux/capability.h          | 5 +++++
>>>  include/uapi/linux/capability.h     | 9 ++++++++-
>>>  kernel/pid.c                        | 2 +-
>>>  kernel/pid_namespace.c              | 2 +-
>>>  security/selinux/include/classmap.h | 5 +++--
>>>  5 files changed, 18 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/include/linux/capability.h b/include/linux/capability.h
>>> index b4345b38a6be..1278313cb2bc 100644
>>> --- a/include/linux/capability.h
>>> +++ b/include/linux/capability.h
>>> @@ -261,6 +261,11 @@ static inline bool bpf_capable(void)
>>>  	return capable(CAP_BPF) || capable(CAP_SYS_ADMIN);
>>>  }
>>>  
>>> +static inline bool restore_ns_capable(struct user_namespace *ns)
>>> +{
>>> +	return ns_capable(ns, CAP_RESTORE) || ns_capable(ns, CAP_SYS_ADMIN);
>>> +}
>>> +
>>>  /* audit system wants to get cap info from files as well */
>>>  extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps);
>>>  
>>> diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
>>> index c7372180a0a9..4bcc4e3d41ff 100644
>>> --- a/include/uapi/linux/capability.h
>>> +++ b/include/uapi/linux/capability.h
>>> @@ -406,7 +406,14 @@ struct vfs_ns_cap_data {
>>>   */
>>>  #define CAP_BPF			39
>>>  
>>> -#define CAP_LAST_CAP         CAP_BPF
>>> +
>>> +/* Allow checkpoint/restore related operations */
>>> +/* Allow PID selection during clone3() */
>>> +/* Allow writing to ns_last_pid */
>>> +
>>> +#define CAP_RESTORE		40
>>> +
>>> +#define CAP_LAST_CAP         CAP_RESTORE
>>>  
>>>  #define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP)
>>>  
>>> diff --git a/kernel/pid.c b/kernel/pid.c
>>> index 3122043fe364..bbc26f2bcff6 100644
>>> --- a/kernel/pid.c
>>> +++ b/kernel/pid.c
>>> @@ -198,7 +198,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
>>>  			if (tid != 1 && !tmp->child_reaper)
>>>  				goto out_free;
>>>  			retval = -EPERM;
>>> -			if (!ns_capable(tmp->user_ns, CAP_SYS_ADMIN))
>>> +			if (!restore_ns_capable(tmp->user_ns))
>>>  				goto out_free;
>>>  			set_tid_size--;
>>>  		}
>>> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
>>> index 0e5ac162c3a8..f58186b31ce6 100644
>>> --- a/kernel/pid_namespace.c
>>> +++ b/kernel/pid_namespace.c
>>> @@ -269,7 +269,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
>>>  	struct ctl_table tmp = *table;
>>>  	int ret, next;
>>>  
>>> -	if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
>>> +	if (write && !restore_ns_capable(pid_ns->user_ns))
>>>  		return -EPERM;
>>>  
>>>  	/*
>>> diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h
>>> index 98e1513b608a..f8b8f12a6ebd 100644
>>> --- a/security/selinux/include/classmap.h
>>> +++ b/security/selinux/include/classmap.h
>>> @@ -27,9 +27,10 @@
>>>  	    "audit_control", "setfcap"
>>>  
>>>  #define COMMON_CAP2_PERMS  "mac_override", "mac_admin", "syslog", \
>>> -		"wake_alarm", "block_suspend", "audit_read", "perfmon", "bpf"
>>> +		"wake_alarm", "block_suspend", "audit_read", "perfmon", "bpf", \
>>> +		"restore"
>>>  
>>> -#if CAP_LAST_CAP > CAP_BPF
>>> +#if CAP_LAST_CAP > CAP_RESTORE
>>>  #error New capability defined, please update COMMON_CAP2_PERMS.
>>>  #endif
>>>  
>>>
>>> base-commit: e8f3274774b45b5f4e9e3d5cad7ff9f43ae3add5
Jann Horn May 25, 2020, 9:53 p.m. UTC | #9
On Fri, May 22, 2020 at 7:55 AM Adrian Reber <areber@redhat.com> wrote:
> This enables CRIU to checkpoint and restore a process as non-root.
>
> Over the last years CRIU upstream has been asked a couple of time if it
> is possible to checkpoint and restore a process as non-root. The answer
> usually was: 'almost'.
>
> The main blocker to restore a process was that selecting the PID of the
> restored process, which is necessary for CRIU, is guarded by CAP_SYS_ADMIN.

And if you were restoring the process into your own PID namespace, so
that you actually have a guarantee that this isn't going to blow up in
your face because one of your PIDs is allocated for a different
process, this part of the problem could be simplified.

I don't get why your users are fine with a "oh it kinda works 99% of
the time but sometimes it randomly doesn't and then you have to go
reboot or whatever" model.
Radostin Stoyanov May 26, 2020, 9:09 a.m. UTC | #10
On 25/05/2020 22:53, Jann Horn wrote:
> On Fri, May 22, 2020 at 7:55 AM Adrian Reber <areber@redhat.com> wrote:
>> This enables CRIU to checkpoint and restore a process as non-root.
>>
>> Over the last years CRIU upstream has been asked a couple of time if it
>> is possible to checkpoint and restore a process as non-root. The answer
>> usually was: 'almost'.
>>
>> The main blocker to restore a process was that selecting the PID of the
>> restored process, which is necessary for CRIU, is guarded by CAP_SYS_ADMIN.
> And if you were restoring the process into your own PID namespace, so
> that you actually have a guarantee that this isn't going to blow up in
> your face because one of your PIDs is allocated for a different
> process, this part of the problem could be simplified.
>
> I don't get why your users are fine with a "oh it kinda works 99% of
> the time but sometimes it randomly doesn't and then you have to go
> reboot or whatever" model.
Transparent checkpoint and restore of a process tree is not simple, 
especially when it is done entirely in user-space. To best of my 
knowledge, CRIU is the only tool out there that is able to achieve this, 
it is actively being tested and maintained, and it has been integrated 
into several container runtimes. Like any other software, CRIU has 
limitations but, as said in the README file, contributions are welcome.
Eric W. Biederman May 26, 2020, 1:59 p.m. UTC | #11
Adrian Reber <areber@redhat.com> writes:

> On Fri, May 22, 2020 at 09:40:37AM -0700, Casey Schaufler wrote:

>> What are the other blockers? Are you going to suggest additional new
>> capabilities to clear them?
>
> As mentioned somewhere else access to /proc/<pid>/map_files/ would be
> helpful. Right now I am testing with a JVM and it works without root
> just with the attached patch. Without access to /proc/<pid>/map_files/
> not everything CRIU can do will actually work, but we are a lot closer
> to what our users have been asking for.

The current permission checks on /proc/<pid>/map_files/ are simply
someone being over-cautious.

Someone needs to think through the threat landscape and figure out what
permission checks are actually needed.

Making the permission check ns_capable instead of capable is a
no-brainer.  Figuring out which user_ns to test against might be a
we bit harder.

We could probably even allow the owner of the process to open the files
but that requires someone doing the work of thinking through how
being able to opening files that you have mmaped might be a problem.

>> > There are probably a few more things guarded by CAP_SYS_ADMIN required
>> > to run checkpoint/restore as non-root,
>> 
>> If you need CAP_SYS_ADMIN anyway you're not gaining anything by
>> separating out CAP_RESTORE.
>
> No, as described we can checkpoint and restore a JVM with this patch and
> it also solves the problem the set_ns_last_pid fork() loop daemon tries
> to solve. It is not enough to support the full functionality of CRIU as
> map_files is also important, but we do not need CAP_SYS_ADMIN and
> CAP_RESTORE. Only CAP_RESTORE would be necessary.
>
> With a new capability users can enable checkpoint/restore as non-root
> without giving CRIU access to any of the other possibilities offered by
> CAP_SYS_ADMIN. Setting a PID and map_files have been introduced for CRIU
> and used to live behind CONFIG_CHECKPOINT_RESTORE. Having a capability
> for checkpoint/restore would make it easier for CRIU users to run it as
> non-root and make it very clear what is possible when giving CRIU the
> new capability. No other things would be allowed than necessary for
> checkpoint/restore. Setting a PID is most important for the restore part
> and reading map_files would be helpful during checkpoint. So it actually
> should be called CAP_CHECKPOINT_RESTORE as Christian mentioned in
> another email.

Please if one is for checkpoint and one is for restore asking for a pair
of capabilities is probably more appropriate.

>> >  but by applying this patch I can
>> > already checkpoint and restore processes as non-root. As there are
>> > already multiple workarounds I would prefer to do it correctly in the
>> > kernel to avoid that CRIU users are starting to invent more workarounds.
>> 
>> You've presented a couple of really inappropriate implementations
>> that would qualify as workarounds. But the other two are completely
>> appropriate within the system security policy. They don't "get around"
>> the problem, they use existing mechanisms as they are intended.
>
> I agree with the user namespace approach to be appropriate, but not the
> CAP_SYS_ADMIN approach as CRIU only needs a tiny subset (2 things) of
> what CAP_SYS_ADMIN allows.


If we are only talking 2 things can you please include in your patchset
a patch enabling those 2 things?

But even more than this we need a request that asks not for the least
you can possibly ask for but asks for what you need to do a good job.

I am having visions of a recurring discussion that says can we add one
more permission check to CAP_RESTORE or CAP_CHECKPOINT when they are
things we could know today.

Eric
Casey Schaufler May 26, 2020, 7:19 p.m. UTC | #12
On 5/26/2020 12:01 PM, Christine Flood wrote:

Please do not top-post on this list.

> Java applications suffer from slow startup times due to dynamic class loading and warming up the Just In Time compilers.  Not all Java users have root access on their machines.  Enabling CRIU in user mode solves this problem for us.  We are about to release a user library that will allow check pointing Java from within Java.  Having to run this as root would severely limit its utility.

The performance of dynamic loading is a well understood issue.
Please don't conflate that with the security issues involved.
Security is *not* the basic problem. If you are having problems
with application start-up performance you really should be be
addressing that directly rather than implementing sophisticated
workarounds that require system security changes.

>
>
> Christine
>
> On Tue, May 26, 2020 at 10:05 AM Eric W. Biederman <ebiederm@xmission.com <mailto:ebiederm@xmission.com>> wrote:
>
>     Adrian Reber <areber@redhat.com <mailto:areber@redhat.com>> writes:
>
>     > On Fri, May 22, 2020 at 09:40:37AM -0700, Casey Schaufler wrote:
>
>     >> What are the other blockers? Are you going to suggest additional new
>     >> capabilities to clear them?
>     >
>     > As mentioned somewhere else access to /proc/<pid>/map_files/ would be
>     > helpful. Right now I am testing with a JVM and it works without root
>     > just with the attached patch. Without access to /proc/<pid>/map_files/
>     > not everything CRIU can do will actually work, but we are a lot closer
>     > to what our users have been asking for.
>
>     The current permission checks on /proc/<pid>/map_files/ are simply
>     someone being over-cautious.
>
>     Someone needs to think through the threat landscape and figure out what
>     permission checks are actually needed.
>
>     Making the permission check ns_capable instead of capable is a
>     no-brainer.  Figuring out which user_ns to test against might be a
>     we bit harder.
>
>     We could probably even allow the owner of the process to open the files
>     but that requires someone doing the work of thinking through how
>     being able to opening files that you have mmaped might be a problem.
>
>     >> > There are probably a few more things guarded by CAP_SYS_ADMIN required
>     >> > to run checkpoint/restore as non-root,
>     >>
>     >> If you need CAP_SYS_ADMIN anyway you're not gaining anything by
>     >> separating out CAP_RESTORE.
>     >
>     > No, as described we can checkpoint and restore a JVM with this patch and
>     > it also solves the problem the set_ns_last_pid fork() loop daemon tries
>     > to solve. It is not enough to support the full functionality of CRIU as
>     > map_files is also important, but we do not need CAP_SYS_ADMIN and
>     > CAP_RESTORE. Only CAP_RESTORE would be necessary.
>     >
>     > With a new capability users can enable checkpoint/restore as non-root
>     > without giving CRIU access to any of the other possibilities offered by
>     > CAP_SYS_ADMIN. Setting a PID and map_files have been introduced for CRIU
>     > and used to live behind CONFIG_CHECKPOINT_RESTORE. Having a capability
>     > for checkpoint/restore would make it easier for CRIU users to run it as
>     > non-root and make it very clear what is possible when giving CRIU the
>     > new capability. No other things would be allowed than necessary for
>     > checkpoint/restore. Setting a PID is most important for the restore part
>     > and reading map_files would be helpful during checkpoint. So it actually
>     > should be called CAP_CHECKPOINT_RESTORE as Christian mentioned in
>     > another email.
>
>     Please if one is for checkpoint and one is for restore asking for a pair
>     of capabilities is probably more appropriate.
>
>     >> >  but by applying this patch I can
>     >> > already checkpoint and restore processes as non-root. As there are
>     >> > already multiple workarounds I would prefer to do it correctly in the
>     >> > kernel to avoid that CRIU users are starting to invent more workarounds.
>     >>
>     >> You've presented a couple of really inappropriate implementations
>     >> that would qualify as workarounds. But the other two are completely
>     >> appropriate within the system security policy. They don't "get around"
>     >> the problem, they use existing mechanisms as they are intended.
>     >
>     > I agree with the user namespace approach to be appropriate, but not the
>     > CAP_SYS_ADMIN approach as CRIU only needs a tiny subset (2 things) of
>     > what CAP_SYS_ADMIN allows.
>
>
>     If we are only talking 2 things can you please include in your patchset
>     a patch enabling those 2 things?
>
>     But even more than this we need a request that asks not for the least
>     you can possibly ask for but asks for what you need to do a good job.
>
>     I am having visions of a recurring discussion that says can we add one
>     more permission check to CAP_RESTORE or CAP_CHECKPOINT when they are
>     things we could know today.
>
>     Eric
>
Jann Horn May 26, 2020, 7:51 p.m. UTC | #13
On Tue, May 26, 2020 at 9:01 PM Christine Flood <chf@redhat.com> wrote:
> Java applications suffer from slow startup times due to dynamic class loading and warming up the Just In Time compilers.  Not all Java users have root access on their machines.  Enabling CRIU in user mode solves this problem for us.  We are about to release a user library that will allow check pointing Java from within Java.  Having to run this as root would severely limit its utility.

Have you looked into whether it would be practical to restore the
saved process state with different PIDs, and then fix up all places
that might have stored the old PIDs? As long as all threads are
managed by the JVM, that might be doable, right?

If you did that, you would also solve the problem of not being able to
start two copies of the same image (because their PIDs would collide)
or randomly not being able to start processes (because their PIDs
collide with other existing things).
Adrian Reber May 27, 2020, 1:48 p.m. UTC | #14
On Mon, May 25, 2020 at 11:55:20AM -0700, Casey Schaufler wrote:
> On 5/25/2020 1:05 AM, Adrian Reber wrote:
> > On Fri, May 22, 2020 at 09:40:37AM -0700, Casey Schaufler wrote:
> >> On 5/21/2020 10:53 PM, Adrian Reber wrote:
> >>> This enables CRIU to checkpoint and restore a process as non-root.
> >> I know it sounds pedantic, but could you spell out CRIU once?
> >> While I know that everyone who cares either knows or can guess
> >> what you're talking about, it may be a mystery to some of the
> >> newer kernel developers.
> > Sure. CRIU - Checkpoint/Restore In Userspace.
> 
> Thanks. I blew out my acronym processor in the 1990's while
> working on trusted Unix system security evaluations.
> 
> >>> Over the last years CRIU upstream has been asked a couple of time if it
> >>> is possible to checkpoint and restore a process as non-root. The answer
> >>> usually was: 'almost'.
> >>>
> >>> The main blocker to restore a process was that selecting the PID of the
> >>> restored process, which is necessary for CRIU, is guarded by CAP_SYS_ADMIN.
> >> What are the other blockers? Are you going to suggest additional new
> >> capabilities to clear them?
> > As mentioned somewhere else access to /proc/<pid>/map_files/ would be
> > helpful. Right now I am testing with a JVM and it works without root
> > just with the attached patch. Without access to /proc/<pid>/map_files/
> > not everything CRIU can do will actually work, but we are a lot closer
> > to what our users have been asking for.
> 
> Are you talking about read access to map_files owned by other users
> or write access to map_files for the current user?

If I understand part of CRIU correctly, then we only need read-access
for the current user. I am sure Andrei, Pavel or Cyrill will correct me
if I am wrong concerning map_files.

> >>> In the last two years the questions about checkpoint/restore as non-root
> >>> have increased and especially in the last few months we have seen
> >>> multiple people inventing workarounds.
> >> Giving a process CAP_SYS_ADMIN is a non-root solution.
> > Yes, but like mentioned somewhere else not a solution that actually
> > works,
> 
> It's a solution that will execute and do what you're asking of it ...
> 
> >  because CAP_SYS_ADMIN allows too much.
> 
> ... but apparently not one that your users find satisfactory.
> 
> >  Especially for the
> > checkpoint/restore case, we really need one (setting the PID of a new
> > process) and to make it complete a second (reading map_files).
> >
> > Reading the comments in include/uapi/linux/capability.h concerning
> > CAP_SYS_ADMIN it allows the binary to do at least 35 things. The two
> > (three) I mentioned above (ns_last_pid (clone3) map_files) are not
> > mentioned in that list, so CAP_SYS_ADMIN allows probably much more.
> >
> > To allow checkpoint/restore as non-root nobody will give CRIU
> > CAP_SYS_ADMIN because it is too wide.
> 
> CAP_SYS_ADMIN exists for system behaviors that are not policy enforcement,
> but important to the system nonetheless. If you argue that checkpoint/restart
> is system policy enforcement rather then an administrative task it would
> be easier to sell.
> 
> Nobody likes CAP_SYS_ADMIN, but usually a process that does one of the
> things it covers will do more (sometimes many more) of the things it
> covers. The longstanding problem with breaking up CAP_SYS_ADMIN is that
> most breakouts result in programs that still need CAP_SYS_ADMIN anyway.
> 
> >>> The use-cases so far and their workarounds:
> >>>
> >>>  * Checkpoint/Restore in an HPC environment in combination with
> >>>    a resource manager distributing jobs. Users are always running
> >>>    as non root, but there was the desire to provide a way to
> >>>    checkpoint and restore long running jobs.
> >>>    Workaround: setuid wrapper to start CRIU as root as non-root
> >>>    https://github.com/FredHutch/slurm-examples/blob/master/checkpointer/lib/checkpointer/checkpointer-suid.c
> >> This is a classic and well understood mechanism for dealing with
> >> this kind of situation. You could have checkpointer-filecap-sys_admin.c
> >> instead, if you want to reduce use of the super-user.
> >>
> >>> * Another use case to checkpoint/restore processes as non-root
> >>>    uses as workaround a non privileged process which cycles through
> >>>    PIDs by calling fork() as fast as possible with a rate of
> >>>    100,000 pids/s instead of writing to ns_last_pid
> >>>    https://github.com/twosigma/set_ns_last_pid
> >> Oh dear.
> >>
> >>>  * Fast Java startup using checkpoint/restore.
> >>>    We have been in contact with JVM developers who are integrating
> >>>    CRIU into a JVM to decrease the startup time.
> >>>    Workaround so far: patch out CAP_SYS_ADMIN checks in the kernel
> >> That's not a workaround, it's a policy violation.
> >> Bad JVM! No biscuit!
> > This was used as a proof of concept to see if we can checkpoint and
> > restore a JVM without root. Only the ns_last_pid check was removed to
> > see if it works and it does.
> >
> >>>  * Container migration as non root. There are people already
> >>>    using CRIU to migrate containers as non-root. The solution
> >>>    there is to run it in a user namespace. So if you are able
> >>>    to carefully setup your environment with the namespaces
> >>>    it is already possible to restore a container/process as non-root.
> >> This is exactly the kind of situation that user namespaces are
> >> supposed to address.
> >>
> >>>    Unfortunately it is not always possible to setup an environment
> >>>    in such a way and for easier access to non-root based container
> >>>    migration this patch is also required.
> >> If a user namespace solution is impossible or (more likely) too
> >> expensive, there's always the checkpointer-filecap-sys_admin option.
> > But then again we open up all of CAP_SYS_ADMIN, which is not necessary.
> 
> Right, I understand that.
> 
> >>> There are probably a few more things guarded by CAP_SYS_ADMIN required
> >>> to run checkpoint/restore as non-root,
> >> If you need CAP_SYS_ADMIN anyway you're not gaining anything by
> >> separating out CAP_RESTORE.
> > No, as described we can checkpoint and restore a JVM with this patch and
> > it also solves the problem the set_ns_last_pid fork() loop daemon tries
> > to solve. It is not enough to support the full functionality of CRIU as
> > map_files is also important, but we do not need CAP_SYS_ADMIN and
> > CAP_RESTORE. Only CAP_RESTORE would be necessary.
> 
> Excellent!
> 
> Now, is there any reason other than your program that a process would
> use CAP_RESTORE? If a process has this capability what damage could it
> do to the system?

When I introduced clone3() with sett_tid to create process with a
certain PID I first did it without the CAP_SYS_ADMIN check and there
were concerns that this would make it too easy to re-use PIDs. If I
understood it correctly. So if you are only asking about the damage that
could be done to the system if any user can create a process with any
PID via ns_last_pid or clone3(), then the damage could be easy re-use of
PIDs and probably easier re-creation of re-use PID problems.

I cannot comment on what damage could be done by allowing read-access to
map_files. Eric commented in another part of the thread that he thinks
that it might not be necessary at all.

> > With a new capability users can enable checkpoint/restore as non-root
> > without giving CRIU access to any of the other possibilities offered by
> > CAP_SYS_ADMIN. Setting a PID and map_files have been introduced for CRIU
> > and used to live behind CONFIG_CHECKPOINT_RESTORE. Having a capability
> > for checkpoint/restore would make it easier for CRIU users to run it as
> > non-root and make it very clear what is possible when giving CRIU the
> > new capability. No other things would be allowed than necessary for
> > checkpoint/restore. Setting a PID is most important for the restore part
> > and reading map_files would be helpful during checkpoint. So it actually
> > should be called CAP_CHECKPOINT_RESTORE as Christian mentioned in
> > another email.
> >
> >>>  but by applying this patch I can
> >>> already checkpoint and restore processes as non-root. As there are
> >>> already multiple workarounds I would prefer to do it correctly in the
> >>> kernel to avoid that CRIU users are starting to invent more workarounds.
> >> You've presented a couple of really inappropriate implementations
> >> that would qualify as workarounds. But the other two are completely
> >> appropriate within the system security policy. They don't "get around"
> >> the problem, they use existing mechanisms as they are intended.
> > I agree with the user namespace approach to be appropriate, but not the
> > CAP_SYS_ADMIN approach as CRIU only needs a tiny subset (2 things) of
> > what CAP_SYS_ADMIN allows.
> >
> >>> I have used the following tests to verify that this change works as
> >>> expected by setting the new capability CAP_RESTORE on the two resulting
> >>> test binaries:
> >>>
> >>> $ cat ns_last_pid.c
> >>>  // http://efiop-notes.blogspot.com/2014/06/how-to-set-pid-using-nslastpid.html
> >>>  #include <stdio.h>
> >>>  #include <stdlib.h>
> >>>  #include <string.h>
> >>>  #include <sys/file.h>
> >>>  #include <sys/types.h>
> >>>  #include <unistd.h>
> >>>
> >>> int main(int argc, char *argv[])
> >>> {
> >>> 	pid_t pid, new_pid;
> >>> 	char buf[32];
> >>> 	int fd;
> >>>
> >>> 	if (argc != 2)
> >>> 		return 1;
> >>>
> >>> 	printf("Opening ns_last_pid...\n");
> >>> 	fd = open("/proc/sys/kernel/ns_last_pid", O_RDWR | O_CREAT, 0644);
> >>> 	if (fd < 0) {
> >>> 		perror("Cannot open ns_last_pid");
> >>> 		return 1;
> >>> 	}
> >>>
> >>> 	printf("Locking ns_last_pid...\n");
> >>> 	if (flock(fd, LOCK_EX)) {
> >>> 		close(fd);
> >>> 		printf("Cannot lock ns_last_pid\n");
> >>> 		return 1;
> >>> 	}
> >>>
> >>> 	pid = atoi(argv[1]);
> >>> 	snprintf(buf, sizeof(buf), "%d", pid - 1);
> >>> 	printf("Writing pid-1 to ns_last_pid...\n");
> >>> 	if (write(fd, buf, strlen(buf)) != strlen(buf)) {
> >>> 		printf("Cannot write to buf\n");
> >>> 		return 1;
> >>> 	}
> >>>
> >>> 	printf("Forking...\n");
> >>> 	new_pid = fork();
> >>> 	if (new_pid == 0) {
> >>> 		printf("I am the child!\n");
> >>> 		exit(0);
> >>> 	} else if (new_pid == pid)
> >>> 		printf("I am the parent. My child got the pid %d!\n", new_pid);
> >>> 	else
> >>> 		printf("pid (%d) does not match expected pid (%d)\n", new_pid, pid);
> >>>
> >>> 	printf("Cleaning up...\n");
> >>> 	if (flock(fd, LOCK_UN))
> >>> 		printf("Cannot unlock\n");
> >>> 	close(fd);
> >>> 	return 0;
> >>> }
> >>> $ id -u; /home/libcap/ns_last_pid 300000
> >>> 1001
> >>> Opening ns_last_pid...
> >>> Locking ns_last_pid...
> >>> Writing pid-1 to ns_last_pid...
> >>> Forking...
> >>> I am the parent. My child got the pid 300000!
> >>> I am the child!
> >>> Cleaning up...
> >>>
> >>> For the clone3() based approach:
> >>> $ cat clone3_set_tid.c
> >>>  #define _GNU_SOURCE
> >>>  #include <linux/sched.h>
> >>>  #include <stdint.h>
> >>>  #include <stdio.h>
> >>>  #include <stdlib.h>
> >>>  #include <string.h>
> >>>  #include <sys/types.h>
> >>>  #include <sys/stat.h>
> >>>  #include <sys/syscall.h>
> >>>  #include <unistd.h>
> >>>
> >>>  #define ptr_to_u64(ptr) ((__u64)((uintptr_t)(ptr)))
> >>>
> >>> int main(int argc, char *argv[])
> >>> {
> >>> 	struct clone_args c_args = { };
> >>> 	pid_t pid, new_pid;
> >>>
> >>> 	if (argc != 2)
> >>> 		return 1;
> >>>
> >>> 	pid = atoi(argv[1]);
> >>> 	c_args.set_tid = ptr_to_u64(&pid);
> >>> 	c_args.set_tid_size = 1;
> >>>
> >>> 	printf("Forking...\n");
> >>> 	new_pid = syscall(__NR_clone3, &c_args, sizeof(c_args));
> >>> 	if (new_pid == 0) {
> >>> 		printf("I am the child!\n");
> >>> 		exit(0);
> >>> 	} else if (new_pid == pid)
> >>> 		printf("I am the parent. My child got the pid %d!\n", new_pid);
> >>> 	else
> >>> 		printf("pid (%d) does not match expected pid (%d)\n", new_pid, pid);
> >>> 	printf("Done\n");
> >>>
> >>> 	return 0;
> >>> }
> >>> $ id -u; /home/libcap/clone3_set_tid 300000
> >>> 1001
> >>> Forking...
> >>> I am the parent. My child got the pid 300000!
> >>> Done
> >>> I am the child!
> >>>
> >>> Signed-off-by: Adrian Reber <areber@redhat.com>
> >>> ---
> >>>  include/linux/capability.h          | 5 +++++
> >>>  include/uapi/linux/capability.h     | 9 ++++++++-
> >>>  kernel/pid.c                        | 2 +-
> >>>  kernel/pid_namespace.c              | 2 +-
> >>>  security/selinux/include/classmap.h | 5 +++--
> >>>  5 files changed, 18 insertions(+), 5 deletions(-)
> >>>
> >>> diff --git a/include/linux/capability.h b/include/linux/capability.h
> >>> index b4345b38a6be..1278313cb2bc 100644
> >>> --- a/include/linux/capability.h
> >>> +++ b/include/linux/capability.h
> >>> @@ -261,6 +261,11 @@ static inline bool bpf_capable(void)
> >>>  	return capable(CAP_BPF) || capable(CAP_SYS_ADMIN);
> >>>  }
> >>>  
> >>> +static inline bool restore_ns_capable(struct user_namespace *ns)
> >>> +{
> >>> +	return ns_capable(ns, CAP_RESTORE) || ns_capable(ns, CAP_SYS_ADMIN);
> >>> +}
> >>> +
> >>>  /* audit system wants to get cap info from files as well */
> >>>  extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps);
> >>>  
> >>> diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
> >>> index c7372180a0a9..4bcc4e3d41ff 100644
> >>> --- a/include/uapi/linux/capability.h
> >>> +++ b/include/uapi/linux/capability.h
> >>> @@ -406,7 +406,14 @@ struct vfs_ns_cap_data {
> >>>   */
> >>>  #define CAP_BPF			39
> >>>  
> >>> -#define CAP_LAST_CAP         CAP_BPF
> >>> +
> >>> +/* Allow checkpoint/restore related operations */
> >>> +/* Allow PID selection during clone3() */
> >>> +/* Allow writing to ns_last_pid */
> >>> +
> >>> +#define CAP_RESTORE		40
> >>> +
> >>> +#define CAP_LAST_CAP         CAP_RESTORE
> >>>  
> >>>  #define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP)
> >>>  
> >>> diff --git a/kernel/pid.c b/kernel/pid.c
> >>> index 3122043fe364..bbc26f2bcff6 100644
> >>> --- a/kernel/pid.c
> >>> +++ b/kernel/pid.c
> >>> @@ -198,7 +198,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
> >>>  			if (tid != 1 && !tmp->child_reaper)
> >>>  				goto out_free;
> >>>  			retval = -EPERM;
> >>> -			if (!ns_capable(tmp->user_ns, CAP_SYS_ADMIN))
> >>> +			if (!restore_ns_capable(tmp->user_ns))
> >>>  				goto out_free;
> >>>  			set_tid_size--;
> >>>  		}
> >>> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
> >>> index 0e5ac162c3a8..f58186b31ce6 100644
> >>> --- a/kernel/pid_namespace.c
> >>> +++ b/kernel/pid_namespace.c
> >>> @@ -269,7 +269,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
> >>>  	struct ctl_table tmp = *table;
> >>>  	int ret, next;
> >>>  
> >>> -	if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
> >>> +	if (write && !restore_ns_capable(pid_ns->user_ns))
> >>>  		return -EPERM;
> >>>  
> >>>  	/*
> >>> diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h
> >>> index 98e1513b608a..f8b8f12a6ebd 100644
> >>> --- a/security/selinux/include/classmap.h
> >>> +++ b/security/selinux/include/classmap.h
> >>> @@ -27,9 +27,10 @@
> >>>  	    "audit_control", "setfcap"
> >>>  
> >>>  #define COMMON_CAP2_PERMS  "mac_override", "mac_admin", "syslog", \
> >>> -		"wake_alarm", "block_suspend", "audit_read", "perfmon", "bpf"
> >>> +		"wake_alarm", "block_suspend", "audit_read", "perfmon", "bpf", \
> >>> +		"restore"
> >>>  
> >>> -#if CAP_LAST_CAP > CAP_BPF
> >>> +#if CAP_LAST_CAP > CAP_RESTORE
> >>>  #error New capability defined, please update COMMON_CAP2_PERMS.
> >>>  #endif
> >>>  
> >>>
> >>> base-commit: e8f3274774b45b5f4e9e3d5cad7ff9f43ae3add5
>
Adrian Reber May 27, 2020, 2:14 p.m. UTC | #15
On Tue, May 26, 2020 at 08:59:29AM -0500, Eric W. Biederman wrote:
> Adrian Reber <areber@redhat.com> writes:
> 
> > On Fri, May 22, 2020 at 09:40:37AM -0700, Casey Schaufler wrote:
> 
> >> What are the other blockers? Are you going to suggest additional new
> >> capabilities to clear them?
> >
> > As mentioned somewhere else access to /proc/<pid>/map_files/ would be
> > helpful. Right now I am testing with a JVM and it works without root
> > just with the attached patch. Without access to /proc/<pid>/map_files/
> > not everything CRIU can do will actually work, but we are a lot closer
> > to what our users have been asking for.
> 
> The current permission checks on /proc/<pid>/map_files/ are simply
> someone being over-cautious.
> 
> Someone needs to think through the threat landscape and figure out what
> permission checks are actually needed.
> 
> Making the permission check ns_capable instead of capable is a
> no-brainer.  Figuring out which user_ns to test against might be a
> we bit harder.
> 
> We could probably even allow the owner of the process to open the files
> but that requires someone doing the work of thinking through how
> being able to opening files that you have mmaped might be a problem.

As mentioned in the other thread, CRIU can work with read access to
map_files.

> >> > There are probably a few more things guarded by CAP_SYS_ADMIN required
> >> > to run checkpoint/restore as non-root,
> >> 
> >> If you need CAP_SYS_ADMIN anyway you're not gaining anything by
> >> separating out CAP_RESTORE.
> >
> > No, as described we can checkpoint and restore a JVM with this patch and
> > it also solves the problem the set_ns_last_pid fork() loop daemon tries
> > to solve. It is not enough to support the full functionality of CRIU as
> > map_files is also important, but we do not need CAP_SYS_ADMIN and
> > CAP_RESTORE. Only CAP_RESTORE would be necessary.
> >
> > With a new capability users can enable checkpoint/restore as non-root
> > without giving CRIU access to any of the other possibilities offered by
> > CAP_SYS_ADMIN. Setting a PID and map_files have been introduced for CRIU
> > and used to live behind CONFIG_CHECKPOINT_RESTORE. Having a capability
> > for checkpoint/restore would make it easier for CRIU users to run it as
> > non-root and make it very clear what is possible when giving CRIU the
> > new capability. No other things would be allowed than necessary for
> > checkpoint/restore. Setting a PID is most important for the restore part
> > and reading map_files would be helpful during checkpoint. So it actually
> > should be called CAP_CHECKPOINT_RESTORE as Christian mentioned in
> > another email.
> 
> Please if one is for checkpoint and one is for restore asking for a pair
> of capabilities is probably more appropriate.

I will send out a v2 with a renamed capability soon and also include
map_files to be readable with that capability.

> >> >  but by applying this patch I can
> >> > already checkpoint and restore processes as non-root. As there are
> >> > already multiple workarounds I would prefer to do it correctly in the
> >> > kernel to avoid that CRIU users are starting to invent more workarounds.
> >> 
> >> You've presented a couple of really inappropriate implementations
> >> that would qualify as workarounds. But the other two are completely
> >> appropriate within the system security policy. They don't "get around"
> >> the problem, they use existing mechanisms as they are intended.
> >
> > I agree with the user namespace approach to be appropriate, but not the
> > CAP_SYS_ADMIN approach as CRIU only needs a tiny subset (2 things) of
> > what CAP_SYS_ADMIN allows.
> 
> 
> If we are only talking 2 things can you please include in your patchset
> a patch enabling those 2 things?

The two things are setting a PID via ns_last_pid/clone3() and reading
map_files.

> But even more than this we need a request that asks not for the least
> you can possibly ask for but asks for what you need to do a good job.

Also in this thread Kamil mentioned that they also need calling prctl
with PR_SET_MM during restore in their production setup.

> I am having visions of a recurring discussion that says can we add one
> more permission check to CAP_RESTORE or CAP_CHECKPOINT when they are
> things we could know today.

I will prepare a new version of this patch using CAP_CHECKPOINT_RESTORE
for ns_last_pid/clone3(), map_files, and prctl with PR_SET_MM.

		Adrian
Christian Brauner May 27, 2020, 3:29 p.m. UTC | #16
On Wed, May 27, 2020 at 04:14:03PM +0200, Adrian Reber wrote:
> On Tue, May 26, 2020 at 08:59:29AM -0500, Eric W. Biederman wrote:
> > Adrian Reber <areber@redhat.com> writes:
> > 
> > > On Fri, May 22, 2020 at 09:40:37AM -0700, Casey Schaufler wrote:
> > 
> > >> What are the other blockers? Are you going to suggest additional new
> > >> capabilities to clear them?
> > >
> > > As mentioned somewhere else access to /proc/<pid>/map_files/ would be
> > > helpful. Right now I am testing with a JVM and it works without root
> > > just with the attached patch. Without access to /proc/<pid>/map_files/
> > > not everything CRIU can do will actually work, but we are a lot closer
> > > to what our users have been asking for.
> > 
> > The current permission checks on /proc/<pid>/map_files/ are simply
> > someone being over-cautious.
> > 
> > Someone needs to think through the threat landscape and figure out what
> > permission checks are actually needed.
> > 
> > Making the permission check ns_capable instead of capable is a
> > no-brainer.  Figuring out which user_ns to test against might be a
> > we bit harder.
> > 
> > We could probably even allow the owner of the process to open the files
> > but that requires someone doing the work of thinking through how
> > being able to opening files that you have mmaped might be a problem.
> 
> As mentioned in the other thread, CRIU can work with read access to
> map_files.
> 
> > >> > There are probably a few more things guarded by CAP_SYS_ADMIN required
> > >> > to run checkpoint/restore as non-root,
> > >> 
> > >> If you need CAP_SYS_ADMIN anyway you're not gaining anything by
> > >> separating out CAP_RESTORE.
> > >
> > > No, as described we can checkpoint and restore a JVM with this patch and
> > > it also solves the problem the set_ns_last_pid fork() loop daemon tries
> > > to solve. It is not enough to support the full functionality of CRIU as
> > > map_files is also important, but we do not need CAP_SYS_ADMIN and
> > > CAP_RESTORE. Only CAP_RESTORE would be necessary.
> > >
> > > With a new capability users can enable checkpoint/restore as non-root
> > > without giving CRIU access to any of the other possibilities offered by
> > > CAP_SYS_ADMIN. Setting a PID and map_files have been introduced for CRIU
> > > and used to live behind CONFIG_CHECKPOINT_RESTORE. Having a capability
> > > for checkpoint/restore would make it easier for CRIU users to run it as
> > > non-root and make it very clear what is possible when giving CRIU the
> > > new capability. No other things would be allowed than necessary for
> > > checkpoint/restore. Setting a PID is most important for the restore part
> > > and reading map_files would be helpful during checkpoint. So it actually
> > > should be called CAP_CHECKPOINT_RESTORE as Christian mentioned in
> > > another email.
> > 
> > Please if one is for checkpoint and one is for restore asking for a pair
> > of capabilities is probably more appropriate.
> 
> I will send out a v2 with a renamed capability soon and also include
> map_files to be readable with that capability.
> 
> > >> >  but by applying this patch I can
> > >> > already checkpoint and restore processes as non-root. As there are
> > >> > already multiple workarounds I would prefer to do it correctly in the
> > >> > kernel to avoid that CRIU users are starting to invent more workarounds.
> > >> 
> > >> You've presented a couple of really inappropriate implementations
> > >> that would qualify as workarounds. But the other two are completely
> > >> appropriate within the system security policy. They don't "get around"
> > >> the problem, they use existing mechanisms as they are intended.
> > >
> > > I agree with the user namespace approach to be appropriate, but not the
> > > CAP_SYS_ADMIN approach as CRIU only needs a tiny subset (2 things) of
> > > what CAP_SYS_ADMIN allows.
> > 
> > 
> > If we are only talking 2 things can you please include in your patchset
> > a patch enabling those 2 things?
> 
> The two things are setting a PID via ns_last_pid/clone3() and reading
> map_files.
> 
> > But even more than this we need a request that asks not for the least
> > you can possibly ask for but asks for what you need to do a good job.
> 
> Also in this thread Kamil mentioned that they also need calling prctl
> with PR_SET_MM during restore in their production setup.

We're using that as well but it really feels like this:

	prctl_map = (struct prctl_mm_map){
	    .start_code = start_code,
	    .end_code = end_code,
	    .start_stack = start_stack,
	    .start_data = start_data,
	    .end_data = end_data,
	    .start_brk = start_brk,
	    .brk = brk_val,
	    .arg_start = arg_start,
	    .arg_end = arg_end,
	    .env_start = env_start,
	    .env_end = env_end,
	    .auxv = NULL,
	    .auxv_size = 0,
	    .exe_fd = -1,
	};

should belong under ns_capable(CAP_SYS_ADMIN). Why is that necessary to
relax?

> 
> > I am having visions of a recurring discussion that says can we add one
> > more permission check to CAP_RESTORE or CAP_CHECKPOINT when they are
> > things we could know today.
> 
> I will prepare a new version of this patch using CAP_CHECKPOINT_RESTORE
> for ns_last_pid/clone3(), map_files, and prctl with PR_SET_MM.
> 
> 		Adrian
>
Casey Schaufler May 27, 2020, 3:57 p.m. UTC | #17
On 5/27/2020 6:48 AM, Adrian Reber wrote:
> On Mon, May 25, 2020 at 11:55:20AM -0700, Casey Schaufler wrote:
>> On 5/25/2020 1:05 AM, Adrian Reber wrote:
>>> On Fri, May 22, 2020 at 09:40:37AM -0700, Casey Schaufler wrote:
>>>> On 5/21/2020 10:53 PM, Adrian Reber wrote:
>>>>> This enables CRIU to checkpoint and restore a process as non-root.
>>>> I know it sounds pedantic, but could you spell out CRIU once?
>>>> While I know that everyone who cares either knows or can guess
>>>> what you're talking about, it may be a mystery to some of the
>>>> newer kernel developers.
>>> Sure. CRIU - Checkpoint/Restore In Userspace.
>> Thanks. I blew out my acronym processor in the 1990's while
>> working on trusted Unix system security evaluations.
>>
>>>>> Over the last years CRIU upstream has been asked a couple of time if it
>>>>> is possible to checkpoint and restore a process as non-root. The answer
>>>>> usually was: 'almost'.
>>>>>
>>>>> The main blocker to restore a process was that selecting the PID of the
>>>>> restored process, which is necessary for CRIU, is guarded by CAP_SYS_ADMIN.
>>>> What are the other blockers? Are you going to suggest additional new
>>>> capabilities to clear them?
>>> As mentioned somewhere else access to /proc/<pid>/map_files/ would be
>>> helpful. Right now I am testing with a JVM and it works without root
>>> just with the attached patch. Without access to /proc/<pid>/map_files/
>>> not everything CRIU can do will actually work, but we are a lot closer
>>> to what our users have been asking for.
>> Are you talking about read access to map_files owned by other users
>> or write access to map_files for the current user?
> If I understand part of CRIU correctly, then we only need read-access
> for the current user. I am sure Andrei, Pavel or Cyrill will correct me
> if I am wrong concerning map_files.

If I do "ls -l /proc/self/map_files" I get the link name and link content.
While I can't open /proc/self/map_files/7fbde0c3200-7fbde0c3300 I can read
that it points to /usr/lib64/ld-2.30.so, which is something I can open 
and read. Sure, it's an extra step, but it's no big deal. It does raise the
question of what value comes from disallowing open via the symlink.
Nicolas Viennot May 27, 2020, 4:37 p.m. UTC | #18
> > If I understand part of CRIU correctly, then we only need read-access 
> > for the current user. I am sure Andrei, Pavel or Cyrill will correct 
> > me if I am wrong concerning map_files.
> If I do "ls -l /proc/self/map_files" I get the link name and link content.
> While I can't open /proc/self/map_files/7fbde0c3200-7fbde0c3300 I can read that it points to /usr/lib64/ld-2.30.so, which is something I can open and read. Sure, it's an extra step, but it's no big deal. It does raise the question of what value comes from disallowing open via the symlink.

Reading the symlink doesn't work in two cases:
1) The file has been deleted
2) The file is a memfd file
Casey Schaufler May 27, 2020, 4:46 p.m. UTC | #19
On 5/27/2020 9:37 AM, Nicolas Viennot wrote:
>>> If I understand part of CRIU correctly, then we only need read-access 
>>> for the current user. I am sure Andrei, Pavel or Cyrill will correct 
>>> me if I am wrong concerning map_files.
>> If I do "ls -l /proc/self/map_files" I get the link name and link content.
>> While I can't open /proc/self/map_files/7fbde0c3200-7fbde0c3300 I can read that it points to /usr/lib64/ld-2.30.so, which is something I can open and read. Sure, it's an extra step, but it's no big deal. It does raise the question of what value comes from disallowing open via the symlink.
> Reading the symlink doesn't work in two cases:
> 1) The file has been deleted

In which case you won't be able to read it directly from
the symlink, either.

> 2) The file is a memfd file

Ditto? Or is there some other problem?
Nicolas Viennot May 27, 2020, 6:05 p.m. UTC | #20
> > Also in this thread Kamil mentioned that they also need calling prctl 
> > with PR_SET_MM during restore in their production setup.
>
> We're using that as well but it really feels like this:
>
>	prctl_map = (struct prctl_mm_map){
>	    .start_code = start_code,
>	    .end_code = end_code,
>	    .start_stack = start_stack,
>	    .start_data = start_data,
>	    .end_data = end_data,
>	    .start_brk = start_brk,
>	    .brk = brk_val,
>	    .arg_start = arg_start,
>	    .arg_end = arg_end,
>	    .env_start = env_start,
>	    .env_end = env_end,
>	    .auxv = NULL,
>	    .auxv_size = 0,
>	    .exe_fd = -1,
>	};
>
> should belong under ns_capable(CAP_SYS_ADMIN). Why is that necessary to relax?

When the prctl(PR_SET_MM_MAP...), the only privileged operation is to change the symlink of /proc/self/exe via set_mm_exe_file().
See https://github.com/torvalds/linux/blob/444fc5cde64330661bf59944c43844e7d4c2ccd8/kernel/sys.c#L2001-L2004
It needs CAP_SYS_ADMIN of the current namespace.

I would argue that setting the current process exe file check should just be reduced to a "can you ptrace a children" check.
Here's why: any process can masquerade into another executable with ptrace.
One can fork a child, ptrace it, have the child execve("target_exe"), then replace its memory content with an arbitrary program.
With CRIU's libcompel parasite mechanism (https://criu.org/Compel) this is fairly easy to implement.
In fact, we could modify CRIU to do just that (but with a fair amount of efforts due to the way CRIU is written),
and not rely on being able to SET_MM_EXE_FILE via prctl(). In turn, that would give an easy way to masquerade any process
into another one, provided that one can ptrace a child.

When not using PR_SET_MM_MAP, but using SET_MM_EXE_FILE, the CAP_RESOURCES at the root namespace level is required:
https://github.com/torvalds/linux/blob/444fc5cde64330661bf59944c43844e7d4c2ccd8/kernel/sys.c#L2109
This seems inconsistent. Also for some reason changing auxv is not privileged if using prctl via the MM_MAP mechanism, but is privileged otherwise.
Christian Brauner May 28, 2020, 9:48 a.m. UTC | #21
On Wed, May 27, 2020 at 06:05:55PM +0000, Nicolas Viennot wrote:
> > > Also in this thread Kamil mentioned that they also need calling prctl 
> > > with PR_SET_MM during restore in their production setup.
> >
> > We're using that as well but it really feels like this:
> >
> >	prctl_map = (struct prctl_mm_map){
> >	    .start_code = start_code,
> >	    .end_code = end_code,
> >	    .start_stack = start_stack,
> >	    .start_data = start_data,
> >	    .end_data = end_data,
> >	    .start_brk = start_brk,
> >	    .brk = brk_val,
> >	    .arg_start = arg_start,
> >	    .arg_end = arg_end,
> >	    .env_start = env_start,
> >	    .env_end = env_end,
> >	    .auxv = NULL,
> >	    .auxv_size = 0,
> >	    .exe_fd = -1,
> >	};
> >
> > should belong under ns_capable(CAP_SYS_ADMIN). Why is that necessary to relax?
> 
> When the prctl(PR_SET_MM_MAP...), the only privileged operation is to change the symlink of /proc/self/exe via set_mm_exe_file().
> See https://github.com/torvalds/linux/blob/444fc5cde64330661bf59944c43844e7d4c2ccd8/kernel/sys.c#L2001-L2004
> It needs CAP_SYS_ADMIN of the current namespace.

This already has been relaxed before (see commit below) and why I'm at
least symbolically pushing back is that I'm getting worried that we're
just removing restrictions left and right and making kernel interfaces
available to fully unprivileged user that very much seem to belong into
the realm of local cap_sys_admin.

    prctl: Allow local CAP_SYS_ADMIN changing exe_file

    During checkpointing and restore of userspace tasks
    we bumped into the situation, that it's not possible
    to restore the tasks, which user namespace does not
    have uid 0 or gid 0 mapped.

    People create user namespace mappings like they want,
    and there is no a limitation on obligatory uid and gid
    "must be mapped". So, if there is no uid 0 or gid 0
    in the mapping, it's impossible to restore mm->exe_file
    of the processes belonging to this user namespace.

    Also, there is no a workaround. It's impossible
    to create a temporary uid/gid mapping, because
    only one write to /proc/[pid]/uid_map and gid_map
    is allowed during a namespace lifetime.
    If there is an entry, then no more mapings can't be
    written. If there isn't an entry, we can't write
    there too, otherwise user task won't be able
    to do that in the future.

    The patch changes the check, and looks for CAP_SYS_ADMIN
    instead of zero uid and gid. This allows to restore
    a task independently of its user namespace mappings.

> 
> I would argue that setting the current process exe file check should just be reduced to a "can you ptrace a children" check.
> Here's why: any process can masquerade into another executable with ptrace.
> One can fork a child, ptrace it, have the child execve("target_exe"), then replace its memory content with an arbitrary program.

Then it should probably be relaxed to CAP_SYS_PTRACE in the user
namespace and not CAP_CHECKPOINT_RESTORE. (But apparently you also have
a way of achieving what you want anyway. Imho, it's not necessarily
wrong to require a bit more work when you want something like fully
unprivileged c/r that's a rather special interest group.)

> With CRIU's libcompel parasite mechanism (https://criu.org/Compel) this is fairly easy to implement.
> In fact, we could modify CRIU to do just that (but with a fair amount of efforts due to the way CRIU is written),
> and not rely on being able to SET_MM_EXE_FILE via prctl(). In turn, that would give an easy way to masquerade any process
> into another one, provided that one can ptrace a child.
> 
> When not using PR_SET_MM_MAP, but using SET_MM_EXE_FILE, the CAP_RESOURCES at the root namespace level is required:
> https://github.com/torvalds/linux/blob/444fc5cde64330661bf59944c43844e7d4c2ccd8/kernel/sys.c#L2109
> This seems inconsistent. Also for some reason changing auxv is not privileged if using prctl via the MM_MAP mechanism, but is privileged otherwise.

Fwiw, it always helps if people take the time to dig through the history
of specifc changes. That usually helps explain why things ended up as
confusing as they are now:

	commit f606b77f1a9e362451aca8f81d8f36a3a112139e
	Author: Cyrill Gorcunov <gorcunov@openvz.org>
	Date:   Thu Oct 9 15:27:37 2014 -0700
	
	    prctl: PR_SET_MM -- introduce PR_SET_MM_MAP operation
	
	    During development of c/r we've noticed that in case if we need to support
	    user namespaces we face a problem with capabilities in prctl(PR_SET_MM,
	    ...) call, in particular once new user namespace is created
	    capable(CAP_SYS_RESOURCE) no longer passes.
	
	    A approach is to eliminate CAP_SYS_RESOURCE check but pass all new values
	    in one bundle, which would allow the kernel to make more intensive test

[snip]

	Still note that updating exe-file link now doesn't require sys-resource
    	capability anymore, after all there is no much profit in preventing setup
    	own file link (there are a number of ways to execute own code -- ptrace,
    	ld-preload, so that the only reliable way to find which exactly code is
    	executed is to inspect running program memory).  Still we require the
    	caller to be at least user-namespace root user.

    	I believe the old interface should be deprecated and ripped off in a
    	couple of kernel releases if no one against.

Sine you already have an interface that works and the argument for the
new interface is that it let's the kernel do better validation if all
arguments be passed at once I don't see the point in removing the
CAP_SYS_RESOURCE restricting from the old interface possible introducing
more maintenance burden or bugs when changing this. It would also
encourage users to use an interface that even c/r people seemed to have
viewed as being deprecated.

Christian
Andrei Vagin June 8, 2020, 2:09 a.m. UTC | #22
> >
> > I would argue that setting the current process exe file check should just be reduced to a "can you ptrace a children" check.
> > Here's why: any process can masquerade into another executable with ptrace.
> > One can fork a child, ptrace it, have the child execve("target_exe"), then replace its memory content with an arbitrary program.
>
> Then it should probably be relaxed to CAP_SYS_PTRACE in the user
> namespace and not CAP_CHECKPOINT_RESTORE. (But apparently you also have
> a way of achieving what you want anyway. Imho, it's not necessarily
> wrong to require a bit more work when you want something like fully
> unprivileged c/r that's a rather special interest group.)
>
> > With CRIU's libcompel parasite mechanism (https://criu.org/Compel) this is fairly easy to implement.
> > In fact, we could modify CRIU to do just that (but with a fair amount of efforts due to the way CRIU is written),
> > and not rely on being able to SET_MM_EXE_FILE via prctl(). In turn, that would give an easy way to masquerade any process
> > into another one, provided that one can ptrace a child.
> >

I think you misunderstand this. In the case of malicious processes,
when only one or two processes must be hidden, they can use this trick
with execve+ptrace and this is relatively simple. But in the case of
CRIU, where we need to restore a process tree with cow memory
mappings, shared mappings, file descriptors and etc, this trick with
execve+ptrace doesn't work at all. We are in a weird situation when
malicious processes can do some operations, but useful tools like CRIU
needed to be running with extra capabilities that actually reduces the
security of the entire system.
Matt Helsley June 12, 2020, 12:17 a.m. UTC | #23
On Fri, May 22, 2020 at 07:53:50AM +0200, Adrian Reber wrote:
> This enables CRIU to checkpoint and restore a process as non-root.
> 
> Over the last years CRIU upstream has been asked a couple of time if it
> is possible to checkpoint and restore a process as non-root. The answer
> usually was: 'almost'.
> 
> The main blocker to restore a process was that selecting the PID of the
> restored process, which is necessary for CRIU, is guarded by CAP_SYS_ADMIN.
> 
> In the last two years the questions about checkpoint/restore as non-root
> have increased and especially in the last few months we have seen
> multiple people inventing workarounds.
> 
> The use-cases so far and their workarounds:
> 
>  * Checkpoint/Restore in an HPC environment in combination with
>    a resource manager distributing jobs. Users are always running
>    as non root, but there was the desire to provide a way to
>    checkpoint and restore long running jobs.
>    Workaround: setuid wrapper to start CRIU as root as non-root
>    https://github.com/FredHutch/slurm-examples/blob/master/checkpointer/lib/checkpointer/checkpointer-suid.c
>  * Another use case to checkpoint/restore processes as non-root
>    uses as workaround a non privileged process which cycles through
>    PIDs by calling fork() as fast as possible with a rate of
>    100,000 pids/s instead of writing to ns_last_pid
>    https://github.com/twosigma/set_ns_last_pid
>  * Fast Java startup using checkpoint/restore.
>    We have been in contact with JVM developers who are integrating
>    CRIU into a JVM to decrease the startup time.
>    Workaround so far: patch out CAP_SYS_ADMIN checks in the kernel
>  * Container migration as non root. There are people already
>    using CRIU to migrate containers as non-root. The solution
>    there is to run it in a user namespace. So if you are able
>    to carefully setup your environment with the namespaces
>    it is already possible to restore a container/process as non-root.
>    Unfortunately it is not always possible to setup an environment
>    in such a way and for easier access to non-root based container
>    migration this patch is also required.
> 
> There are probably a few more things guarded by CAP_SYS_ADMIN required
> to run checkpoint/restore as non-root, but by applying this patch I can
> already checkpoint and restore processes as non-root. As there are
> already multiple workarounds I would prefer to do it correctly in the
> kernel to avoid that CRIU users are starting to invent more workarounds.
> 
> I have used the following tests to verify that this change works as
> expected by setting the new capability CAP_RESTORE on the two resulting
> test binaries:
> 
> $ cat ns_last_pid.c
>  // http://efiop-notes.blogspot.com/2014/06/how-to-set-pid-using-nslastpid.html
>  #include <stdio.h>
>  #include <stdlib.h>
>  #include <string.h>
>  #include <sys/file.h>
>  #include <sys/types.h>
>  #include <unistd.h>
> 
> int main(int argc, char *argv[])
> {
> 	pid_t pid, new_pid;
> 	char buf[32];
> 	int fd;
> 
> 	if (argc != 2)
> 		return 1;
> 
> 	printf("Opening ns_last_pid...\n");
> 	fd = open("/proc/sys/kernel/ns_last_pid", O_RDWR | O_CREAT, 0644);
> 	if (fd < 0) {
> 		perror("Cannot open ns_last_pid");
> 		return 1;
> 	}
> 
> 	printf("Locking ns_last_pid...\n");
> 	if (flock(fd, LOCK_EX)) {
> 		close(fd);
> 		printf("Cannot lock ns_last_pid\n");
> 		return 1;
> 	}
> 
> 	pid = atoi(argv[1]);
> 	snprintf(buf, sizeof(buf), "%d", pid - 1);
> 	printf("Writing pid-1 to ns_last_pid...\n");
> 	if (write(fd, buf, strlen(buf)) != strlen(buf)) {
> 		printf("Cannot write to buf\n");
> 		return 1;
> 	}
> 
> 	printf("Forking...\n");
> 	new_pid = fork();
> 	if (new_pid == 0) {
> 		printf("I am the child!\n");
> 		exit(0);
> 	} else if (new_pid == pid)
> 		printf("I am the parent. My child got the pid %d!\n", new_pid);
> 	else
> 		printf("pid (%d) does not match expected pid (%d)\n", new_pid, pid);
> 
> 	printf("Cleaning up...\n");
> 	if (flock(fd, LOCK_UN))
> 		printf("Cannot unlock\n");
> 	close(fd);
> 	return 0;
> }
> $ id -u; /home/libcap/ns_last_pid 300000
> 1001
> Opening ns_last_pid...
> Locking ns_last_pid...
> Writing pid-1 to ns_last_pid...
> Forking...
> I am the parent. My child got the pid 300000!
> I am the child!
> Cleaning up...
> 
> For the clone3() based approach:
> $ cat clone3_set_tid.c
>  #define _GNU_SOURCE
>  #include <linux/sched.h>
>  #include <stdint.h>
>  #include <stdio.h>
>  #include <stdlib.h>
>  #include <string.h>
>  #include <sys/types.h>
>  #include <sys/stat.h>
>  #include <sys/syscall.h>
>  #include <unistd.h>
> 
>  #define ptr_to_u64(ptr) ((__u64)((uintptr_t)(ptr)))
> 
> int main(int argc, char *argv[])
> {
> 	struct clone_args c_args = { };
> 	pid_t pid, new_pid;
> 
> 	if (argc != 2)
> 		return 1;
> 
> 	pid = atoi(argv[1]);
> 	c_args.set_tid = ptr_to_u64(&pid);
> 	c_args.set_tid_size = 1;
> 
> 	printf("Forking...\n");
> 	new_pid = syscall(__NR_clone3, &c_args, sizeof(c_args));

(Note: I'm going to call the capability CAP_RESTORE but I think this
applies regardless of whether the permissions stay with CAP_SYS_ADMIN..)

I haven't fully reviewed the discussion of the security consequences but
my sense is this would require retaining CAP_RESTORE down the entire tree
of processes being restored so each parent could call clone3() with the
correct pid value for its child(ren).

Ideally you would drop CAP_RESTORE sooner -- preferrably only one
process would need it. I think you could do that by changing what you pass
down; instead of passing down a capability and a pid number, pass down a
special "reservation" pidfd:

1. Have CAP_RESTORE enable opening a pidfd with the desired pid as a
   reservation for the pid (i.e. can't use it to signal, wait, ...
   perhaps these return -EBUSY, -EAGAIN or something...).

2. Only one process needs CAP_RESTORE -- it can drop CAP_RESTORE after
   reserving all of the pids but before kicking off the clone3() calls
   to recreate all of the tasks.

3. Pass the pidfd down the tree of restoring processes. Note how the
   set of specific pids to be created is limited at this point --
   the software cannot be tricked into recreating processes other pids
   using a capability, CAP_RESTORE, it doesn't have. You might even set a
   flag on the pidfd which closes the pidfd in the sender when it's passed
   over a socket so that only the appropriate processes retain the pidfd.

4. The parent can then pass the *reserved* pidfd into clone3(). The
   reserved pidfd seamlessly turns into a pidfd reference to the child if
   clone3() succeeds. If it fails the eservation is still consumed -- a
   given pidfd reserved for clone3() can only be passed to clone3() and
   succeed once.

Perhaps this scheme could concentrate the need for CAP_SYS_ADMIN
to one process so maybe it would obviate the need for CAP_RESTORE.
That said, perhaps there's something that prevents implementing such
a pidfd or perhaps I have misunderstood what CRIU is doing and this
pidfd idea isn't workable.

Cheers,
    -Matt Helsley
Christian Brauner June 12, 2020, 2:39 p.m. UTC | #24
On Thu, Jun 11, 2020 at 05:17:09PM -0700, Matt Helsley wrote:
> On Fri, May 22, 2020 at 07:53:50AM +0200, Adrian Reber wrote:
> > This enables CRIU to checkpoint and restore a process as non-root.
> > 
> > Over the last years CRIU upstream has been asked a couple of time if it
> > is possible to checkpoint and restore a process as non-root. The answer
> > usually was: 'almost'.
> > 
> > The main blocker to restore a process was that selecting the PID of the
> > restored process, which is necessary for CRIU, is guarded by CAP_SYS_ADMIN.
> > 
> > In the last two years the questions about checkpoint/restore as non-root
> > have increased and especially in the last few months we have seen
> > multiple people inventing workarounds.
> > 
> > The use-cases so far and their workarounds:
> > 
> >  * Checkpoint/Restore in an HPC environment in combination with
> >    a resource manager distributing jobs. Users are always running
> >    as non root, but there was the desire to provide a way to
> >    checkpoint and restore long running jobs.
> >    Workaround: setuid wrapper to start CRIU as root as non-root
> >    https://github.com/FredHutch/slurm-examples/blob/master/checkpointer/lib/checkpointer/checkpointer-suid.c
> >  * Another use case to checkpoint/restore processes as non-root
> >    uses as workaround a non privileged process which cycles through
> >    PIDs by calling fork() as fast as possible with a rate of
> >    100,000 pids/s instead of writing to ns_last_pid
> >    https://github.com/twosigma/set_ns_last_pid
> >  * Fast Java startup using checkpoint/restore.
> >    We have been in contact with JVM developers who are integrating
> >    CRIU into a JVM to decrease the startup time.
> >    Workaround so far: patch out CAP_SYS_ADMIN checks in the kernel
> >  * Container migration as non root. There are people already
> >    using CRIU to migrate containers as non-root. The solution
> >    there is to run it in a user namespace. So if you are able
> >    to carefully setup your environment with the namespaces
> >    it is already possible to restore a container/process as non-root.
> >    Unfortunately it is not always possible to setup an environment
> >    in such a way and for easier access to non-root based container
> >    migration this patch is also required.
> > 
> > There are probably a few more things guarded by CAP_SYS_ADMIN required
> > to run checkpoint/restore as non-root, but by applying this patch I can
> > already checkpoint and restore processes as non-root. As there are
> > already multiple workarounds I would prefer to do it correctly in the
> > kernel to avoid that CRIU users are starting to invent more workarounds.
> > 
> > I have used the following tests to verify that this change works as
> > expected by setting the new capability CAP_RESTORE on the two resulting
> > test binaries:
> > 
> > $ cat ns_last_pid.c
> >  // http://efiop-notes.blogspot.com/2014/06/how-to-set-pid-using-nslastpid.html
> >  #include <stdio.h>
> >  #include <stdlib.h>
> >  #include <string.h>
> >  #include <sys/file.h>
> >  #include <sys/types.h>
> >  #include <unistd.h>
> > 
> > int main(int argc, char *argv[])
> > {
> > 	pid_t pid, new_pid;
> > 	char buf[32];
> > 	int fd;
> > 
> > 	if (argc != 2)
> > 		return 1;
> > 
> > 	printf("Opening ns_last_pid...\n");
> > 	fd = open("/proc/sys/kernel/ns_last_pid", O_RDWR | O_CREAT, 0644);
> > 	if (fd < 0) {
> > 		perror("Cannot open ns_last_pid");
> > 		return 1;
> > 	}
> > 
> > 	printf("Locking ns_last_pid...\n");
> > 	if (flock(fd, LOCK_EX)) {
> > 		close(fd);
> > 		printf("Cannot lock ns_last_pid\n");
> > 		return 1;
> > 	}
> > 
> > 	pid = atoi(argv[1]);
> > 	snprintf(buf, sizeof(buf), "%d", pid - 1);
> > 	printf("Writing pid-1 to ns_last_pid...\n");
> > 	if (write(fd, buf, strlen(buf)) != strlen(buf)) {
> > 		printf("Cannot write to buf\n");
> > 		return 1;
> > 	}
> > 
> > 	printf("Forking...\n");
> > 	new_pid = fork();
> > 	if (new_pid == 0) {
> > 		printf("I am the child!\n");
> > 		exit(0);
> > 	} else if (new_pid == pid)
> > 		printf("I am the parent. My child got the pid %d!\n", new_pid);
> > 	else
> > 		printf("pid (%d) does not match expected pid (%d)\n", new_pid, pid);
> > 
> > 	printf("Cleaning up...\n");
> > 	if (flock(fd, LOCK_UN))
> > 		printf("Cannot unlock\n");
> > 	close(fd);
> > 	return 0;
> > }
> > $ id -u; /home/libcap/ns_last_pid 300000
> > 1001
> > Opening ns_last_pid...
> > Locking ns_last_pid...
> > Writing pid-1 to ns_last_pid...
> > Forking...
> > I am the parent. My child got the pid 300000!
> > I am the child!
> > Cleaning up...
> > 
> > For the clone3() based approach:
> > $ cat clone3_set_tid.c
> >  #define _GNU_SOURCE
> >  #include <linux/sched.h>
> >  #include <stdint.h>
> >  #include <stdio.h>
> >  #include <stdlib.h>
> >  #include <string.h>
> >  #include <sys/types.h>
> >  #include <sys/stat.h>
> >  #include <sys/syscall.h>
> >  #include <unistd.h>
> > 
> >  #define ptr_to_u64(ptr) ((__u64)((uintptr_t)(ptr)))
> > 
> > int main(int argc, char *argv[])
> > {
> > 	struct clone_args c_args = { };
> > 	pid_t pid, new_pid;
> > 
> > 	if (argc != 2)
> > 		return 1;
> > 
> > 	pid = atoi(argv[1]);
> > 	c_args.set_tid = ptr_to_u64(&pid);
> > 	c_args.set_tid_size = 1;
> > 
> > 	printf("Forking...\n");
> > 	new_pid = syscall(__NR_clone3, &c_args, sizeof(c_args));
> 
> (Note: I'm going to call the capability CAP_RESTORE but I think this
> applies regardless of whether the permissions stay with CAP_SYS_ADMIN..)
> 
> I haven't fully reviewed the discussion of the security consequences but
> my sense is this would require retaining CAP_RESTORE down the entire tree
> of processes being restored so each parent could call clone3() with the
> correct pid value for its child(ren).

If criu restores a process tree you need to have the required capability
in the starting user namespace and by design this means you also have it
automatically in all child user namespaces. Let alone that if you/criu
created the user namespace you have full privileges over it anway.
While criu restores a process tree it needs to retain its privilege
level. The individual tasks in the process tree that criu restores will
obviously be restored with the creds they were checkpointed with so it's
not like any of them inherit any capabilities they didn't already have
after criu finishes. So I don't see why any of this would be a problem.
Restoring into a (pre-existing) pid namespace hierarchy is another
problem but how this can be solved using the set_tid array was outlined
during Plumbers last year; one of the reasons this is an array.

> 
> Ideally you would drop CAP_RESTORE sooner -- preferrably only one
> process would need it. I think you could do that by changing what you pass
> down; instead of passing down a capability and a pid number, pass down a
> special "reservation" pidfd:

I'm sorry, but nack.
We're not going to do shenaningans like that with pidfds and clone3()
just to cater to this specific use-case that is solvable in less hacky
ways.

Thanks!
Christian

Patch
diff mbox series

diff --git a/include/linux/capability.h b/include/linux/capability.h
index b4345b38a6be..1278313cb2bc 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -261,6 +261,11 @@  static inline bool bpf_capable(void)
 	return capable(CAP_BPF) || capable(CAP_SYS_ADMIN);
 }
 
+static inline bool restore_ns_capable(struct user_namespace *ns)
+{
+	return ns_capable(ns, CAP_RESTORE) || ns_capable(ns, CAP_SYS_ADMIN);
+}
+
 /* audit system wants to get cap info from files as well */
 extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps);
 
diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
index c7372180a0a9..4bcc4e3d41ff 100644
--- a/include/uapi/linux/capability.h
+++ b/include/uapi/linux/capability.h
@@ -406,7 +406,14 @@  struct vfs_ns_cap_data {
  */
 #define CAP_BPF			39
 
-#define CAP_LAST_CAP         CAP_BPF
+
+/* Allow checkpoint/restore related operations */
+/* Allow PID selection during clone3() */
+/* Allow writing to ns_last_pid */
+
+#define CAP_RESTORE		40
+
+#define CAP_LAST_CAP         CAP_RESTORE
 
 #define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP)
 
diff --git a/kernel/pid.c b/kernel/pid.c
index 3122043fe364..bbc26f2bcff6 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -198,7 +198,7 @@  struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
 			if (tid != 1 && !tmp->child_reaper)
 				goto out_free;
 			retval = -EPERM;
-			if (!ns_capable(tmp->user_ns, CAP_SYS_ADMIN))
+			if (!restore_ns_capable(tmp->user_ns))
 				goto out_free;
 			set_tid_size--;
 		}
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 0e5ac162c3a8..f58186b31ce6 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -269,7 +269,7 @@  static int pid_ns_ctl_handler(struct ctl_table *table, int write,
 	struct ctl_table tmp = *table;
 	int ret, next;
 
-	if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
+	if (write && !restore_ns_capable(pid_ns->user_ns))
 		return -EPERM;
 
 	/*
diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h
index 98e1513b608a..f8b8f12a6ebd 100644
--- a/security/selinux/include/classmap.h
+++ b/security/selinux/include/classmap.h
@@ -27,9 +27,10 @@ 
 	    "audit_control", "setfcap"
 
 #define COMMON_CAP2_PERMS  "mac_override", "mac_admin", "syslog", \
-		"wake_alarm", "block_suspend", "audit_read", "perfmon", "bpf"
+		"wake_alarm", "block_suspend", "audit_read", "perfmon", "bpf", \
+		"restore"
 
-#if CAP_LAST_CAP > CAP_BPF
+#if CAP_LAST_CAP > CAP_RESTORE
 #error New capability defined, please update COMMON_CAP2_PERMS.
 #endif