diff mbox series

[v2,1/1] userfaultfd/sysctl: add vm.unprivileged_userfaultfd

Message ID 20190319030722.12441-2-peterx@redhat.com (mailing list archive)
State New, archived
Headers show
Series userfaultfd: allow to forbid unprivileged users | expand

Commit Message

Peter Xu March 19, 2019, 3:07 a.m. UTC
Add a global sysctl knob "vm.unprivileged_userfaultfd" to control
whether userfaultfd is allowed by unprivileged users.  When this is
set to zero, only privileged users (root user, or users with the
CAP_SYS_PTRACE capability) will be able to use the userfaultfd
syscalls.

Suggested-by: Andrea Arcangeli <aarcange@redhat.com>
Suggested-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
---
 Documentation/sysctl/vm.txt   | 12 ++++++++++++
 fs/userfaultfd.c              |  5 +++++
 include/linux/userfaultfd_k.h |  2 ++
 kernel/sysctl.c               | 12 ++++++++++++
 4 files changed, 31 insertions(+)

Comments

Mike Rapoport March 19, 2019, 7:11 a.m. UTC | #1
Hi Peter,

On Tue, Mar 19, 2019 at 11:07:22AM +0800, Peter Xu wrote:
> Add a global sysctl knob "vm.unprivileged_userfaultfd" to control
> whether userfaultfd is allowed by unprivileged users.  When this is
> set to zero, only privileged users (root user, or users with the
> CAP_SYS_PTRACE capability) will be able to use the userfaultfd
> syscalls.
> 
> Suggested-by: Andrea Arcangeli <aarcange@redhat.com>
> Suggested-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
> Signed-off-by: Peter Xu <peterx@redhat.com>

Reviewed-by: Mike Rapoport <rppt@linux.ibm.com>

Just one minor note below

> ---
>  Documentation/sysctl/vm.txt   | 12 ++++++++++++
>  fs/userfaultfd.c              |  5 +++++
>  include/linux/userfaultfd_k.h |  2 ++
>  kernel/sysctl.c               | 12 ++++++++++++
>  4 files changed, 31 insertions(+)
> 
> diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
> index 187ce4f599a2..f146712f67bb 100644
> --- a/Documentation/sysctl/vm.txt
> +++ b/Documentation/sysctl/vm.txt
> @@ -61,6 +61,7 @@ Currently, these files are in /proc/sys/vm:
>  - stat_refresh
>  - numa_stat
>  - swappiness
> +- unprivileged_userfaultfd
>  - user_reserve_kbytes
>  - vfs_cache_pressure
>  - watermark_boost_factor
> @@ -818,6 +819,17 @@ The default value is 60.
> 
>  ==============================================================
> 
> +unprivileged_userfaultfd
> +
> +This flag controls whether unprivileged users can use the userfaultfd
> +syscalls.  Set this to 1 to allow unprivileged users to use the
> +userfaultfd syscalls, or set this to 0 to restrict userfaultfd to only
> +privileged users (with SYS_CAP_PTRACE capability).

Can you please fully spell "system call"?

> +
> +The default value is 1.
> +
> +==============================================================
> +
>  - user_reserve_kbytes
> 
>  When overcommit_memory is set to 2, "never overcommit" mode, reserve
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index 89800fc7dc9d..7e856a25cc2f 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -30,6 +30,8 @@
>  #include <linux/security.h>
>  #include <linux/hugetlb.h>
> 
> +int sysctl_unprivileged_userfaultfd __read_mostly = 1;
> +
>  static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
> 
>  enum userfaultfd_state {
> @@ -1921,6 +1923,9 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
>  	struct userfaultfd_ctx *ctx;
>  	int fd;
> 
> +	if (!sysctl_unprivileged_userfaultfd && !capable(CAP_SYS_PTRACE))
> +		return -EPERM;
> +
>  	BUG_ON(!current->mm);
> 
>  	/* Check the UFFD_* constants for consistency.  */
> diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
> index 37c9eba75c98..ac9d71e24b81 100644
> --- a/include/linux/userfaultfd_k.h
> +++ b/include/linux/userfaultfd_k.h
> @@ -28,6 +28,8 @@
>  #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
>  #define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS)
> 
> +extern int sysctl_unprivileged_userfaultfd;
> +
>  extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);
> 
>  extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 7578e21a711b..9b8ff1881df9 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -66,6 +66,7 @@
>  #include <linux/kexec.h>
>  #include <linux/bpf.h>
>  #include <linux/mount.h>
> +#include <linux/userfaultfd_k.h>
> 
>  #include <linux/uaccess.h>
>  #include <asm/processor.h>
> @@ -1704,6 +1705,17 @@ static struct ctl_table vm_table[] = {
>  		.extra1		= (void *)&mmap_rnd_compat_bits_min,
>  		.extra2		= (void *)&mmap_rnd_compat_bits_max,
>  	},
> +#endif
> +#ifdef CONFIG_USERFAULTFD
> +	{
> +		.procname	= "unprivileged_userfaultfd",
> +		.data		= &sysctl_unprivileged_userfaultfd,
> +		.maxlen		= sizeof(sysctl_unprivileged_userfaultfd),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec_minmax,
> +		.extra1		= &zero,
> +		.extra2		= &one,
> +	},
>  #endif
>  	{ }
>  };
> -- 
> 2.17.1
>
Andrew Morton March 19, 2019, 6:02 p.m. UTC | #2
On Tue, 19 Mar 2019 11:07:22 +0800 Peter Xu <peterx@redhat.com> wrote:

> Add a global sysctl knob "vm.unprivileged_userfaultfd" to control
> whether userfaultfd is allowed by unprivileged users.  When this is
> set to zero, only privileged users (root user, or users with the
> CAP_SYS_PTRACE capability) will be able to use the userfaultfd
> syscalls.

Please send along a full description of why you believe Linux needs
this feature, for me to add to the changelog.  What is the benefit to
our users?  How will it be used?

etcetera.  As it was presented I'm seeing no justification for adding
the patch!
Andrea Arcangeli March 19, 2019, 6:07 p.m. UTC | #3
Hello,

On Tue, Mar 19, 2019 at 09:11:04AM +0200, Mike Rapoport wrote:
> Hi Peter,
> 
> On Tue, Mar 19, 2019 at 11:07:22AM +0800, Peter Xu wrote:
> > Add a global sysctl knob "vm.unprivileged_userfaultfd" to control
> > whether userfaultfd is allowed by unprivileged users.  When this is
> > set to zero, only privileged users (root user, or users with the
> > CAP_SYS_PTRACE capability) will be able to use the userfaultfd
> > syscalls.
> > 
> > Suggested-by: Andrea Arcangeli <aarcange@redhat.com>
> > Suggested-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> 
> Reviewed-by: Mike Rapoport <rppt@linux.ibm.com>
> 
> Just one minor note below

This looks fine with me too.

> > +	if (!sysctl_unprivileged_userfaultfd && !capable(CAP_SYS_PTRACE))
> > +		return -EPERM;

The only difference between the bpf sysctl and the userfaultfd sysctl
this way is that the bpf sysctl adds the CAP_SYS_ADMIN capability
requirement, while userfaultfd adds the CAP_SYS_PTRACE requirement,
because the userfaultfd monitor is more likely to need CAP_SYS_PTRACE
already if it's doing other kind of tracking on processes runtime, in
addition of userfaultfd. In other words both syscalls works only for
root, when the two sysctl are opt-in set to 1.

Reviewed-by: Andrea Arcangeli <aarcange@redhat.com>
Dr. David Alan Gilbert March 19, 2019, 6:28 p.m. UTC | #4
* Andrew Morton (akpm@linux-foundation.org) wrote:
> On Tue, 19 Mar 2019 11:07:22 +0800 Peter Xu <peterx@redhat.com> wrote:
> 
> > Add a global sysctl knob "vm.unprivileged_userfaultfd" to control
> > whether userfaultfd is allowed by unprivileged users.  When this is
> > set to zero, only privileged users (root user, or users with the
> > CAP_SYS_PTRACE capability) will be able to use the userfaultfd
> > syscalls.
> 
> Please send along a full description of why you believe Linux needs
> this feature, for me to add to the changelog.  What is the benefit to
> our users?  How will it be used?
> 
> etcetera.  As it was presented I'm seeing no justification for adding
> the patch!

How about:

---
Userfaultfd can be misued to make it easier to exploit existing use-after-free
(and similar) bugs that might otherwise only make a short window
or race condition available.  By using userfaultfd to stall a kernel
thread, a malicious program can keep some state, that it wrote, stable
for an extended period, which it can then access using an existing
exploit.   While it doesn't cause the exploit itself, and while it's not
the only thing that can stall a kernel thread when accessing a memory location,
it's one of the few that never needs priviledge.

Add a flag, allowing userfaultfd to be restricted, so that in general 
it won't be useable by arbitrary user programs, but in environments that
require userfaultfd it can be turned back on.

---

Dave

--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Peter Xu March 20, 2019, 12:20 a.m. UTC | #5
On Tue, Mar 19, 2019 at 06:28:23PM +0000, Dr. David Alan Gilbert wrote:
> * Andrew Morton (akpm@linux-foundation.org) wrote:
> > On Tue, 19 Mar 2019 11:07:22 +0800 Peter Xu <peterx@redhat.com> wrote:
> > 
> > > Add a global sysctl knob "vm.unprivileged_userfaultfd" to control
> > > whether userfaultfd is allowed by unprivileged users.  When this is
> > > set to zero, only privileged users (root user, or users with the
> > > CAP_SYS_PTRACE capability) will be able to use the userfaultfd
> > > syscalls.
> > 
> > Please send along a full description of why you believe Linux needs
> > this feature, for me to add to the changelog.  What is the benefit to
> > our users?  How will it be used?
> > 
> > etcetera.  As it was presented I'm seeing no justification for adding
> > the patch!
> 
> How about:
> 
> ---
> Userfaultfd can be misued to make it easier to exploit existing use-after-free
> (and similar) bugs that might otherwise only make a short window
> or race condition available.  By using userfaultfd to stall a kernel
> thread, a malicious program can keep some state, that it wrote, stable
> for an extended period, which it can then access using an existing
> exploit.   While it doesn't cause the exploit itself, and while it's not
> the only thing that can stall a kernel thread when accessing a memory location,
> it's one of the few that never needs priviledge.
> 
> Add a flag, allowing userfaultfd to be restricted, so that in general 
> it won't be useable by arbitrary user programs, but in environments that
> require userfaultfd it can be turned back on.

Thanks for the quick write up, Dave!  I definitely should have some
justification in the cover letter and carry it until the last version.
Sorry to be unclear at the first glance.
Andrea Arcangeli March 20, 2019, 7:01 p.m. UTC | #6
Hello,

On Tue, Mar 19, 2019 at 06:28:23PM +0000, Dr. David Alan Gilbert wrote:
> ---
> Userfaultfd can be misued to make it easier to exploit existing use-after-free
> (and similar) bugs that might otherwise only make a short window
> or race condition available.  By using userfaultfd to stall a kernel
> thread, a malicious program can keep some state, that it wrote, stable
> for an extended period, which it can then access using an existing
> exploit.   While it doesn't cause the exploit itself, and while it's not
> the only thing that can stall a kernel thread when accessing a memory location,
> it's one of the few that never needs priviledge.
> 
> Add a flag, allowing userfaultfd to be restricted, so that in general 
> it won't be useable by arbitrary user programs, but in environments that
> require userfaultfd it can be turned back on.

The default in the patch leaves userfaultfd enabled to all users, so
it may be clearer to reverse the last sentence to "in hardened
environments it allows to restrict userfaultfd to privileged processes.".

We can also make example that 'While this is not a kernel issue, in
practice unless you also "chmod u-s /usr/bin/fusermount" there's no
tangible benefit in removing privileges for userfaultfd, other than
probabilistic ones by decreasig the attack surface of the kernel, but
that would be better be achieved through SECCOMP and not globally.'.
Luis Chamberlain March 21, 2019, 1:43 p.m. UTC | #7
On Wed, Mar 20, 2019 at 03:01:12PM -0400, Andrea Arcangeli wrote:
> but
> that would be better be achieved through SECCOMP and not globally.'.

That begs the question why not use seccomp for this? What if everyone
decided to add a knob for all syscalls to do the same? For the commit
log, why is it OK then to justify a knob for this syscall?

 Luis
Andrea Arcangeli March 21, 2019, 9:06 p.m. UTC | #8
Hello,

On Thu, Mar 21, 2019 at 01:43:35PM +0000, Luis Chamberlain wrote:
> On Wed, Mar 20, 2019 at 03:01:12PM -0400, Andrea Arcangeli wrote:
> > but
> > that would be better be achieved through SECCOMP and not globally.'.
> 
> That begs the question why not use seccomp for this? What if everyone
> decided to add a knob for all syscalls to do the same? For the commit
> log, why is it OK then to justify a knob for this syscall?

That's a good point and it's obviously more secure because you can
block a lot more than just bpf and userfaultfd: however not all
syscalls have CONFIG_USERFAULTFD=n or CONFIG_BPF_SYSCALL=n that you
can set to =n at build time, then they'll return -ENOSYS (implemented
as sys_ni_syscall in the =n case).

The point of the bpf (already included upstream) and userfaultfd
(proposed) sysctl is to avoid users having to rebuild the kernel if
they want to harden their setup without being forced to run all
containers under seccomp, just like they could by setting those two
config options "=n" at build time.

So you can see it like allowing a runtime selection of
CONFIG_USERFAULTFD and CONFIG_BPF_SYSCALL without the kernel build
time config forcing the decision on behalf of the end user.

Thanks,
Andrea
Kees Cook April 23, 2019, 10:19 p.m. UTC | #9
On Tue, Mar 19, 2019 at 11:02 AM Andrew Morton
<akpm@linux-foundation.org> wrote:
>
> On Tue, 19 Mar 2019 11:07:22 +0800 Peter Xu <peterx@redhat.com> wrote:
>
> > Add a global sysctl knob "vm.unprivileged_userfaultfd" to control
> > whether userfaultfd is allowed by unprivileged users.  When this is
> > set to zero, only privileged users (root user, or users with the
> > CAP_SYS_PTRACE capability) will be able to use the userfaultfd
> > syscalls.
>
> Please send along a full description of why you believe Linux needs
> this feature, for me to add to the changelog.  What is the benefit to
> our users?  How will it be used?
>
> etcetera.  As it was presented I'm seeing no justification for adding
> the patch!

Was there a v3 of this patch? I'd still really like to have this knob added. :)

Thanks!
diff mbox series

Patch

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 187ce4f599a2..f146712f67bb 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -61,6 +61,7 @@  Currently, these files are in /proc/sys/vm:
 - stat_refresh
 - numa_stat
 - swappiness
+- unprivileged_userfaultfd
 - user_reserve_kbytes
 - vfs_cache_pressure
 - watermark_boost_factor
@@ -818,6 +819,17 @@  The default value is 60.
 
 ==============================================================
 
+unprivileged_userfaultfd
+
+This flag controls whether unprivileged users can use the userfaultfd
+syscalls.  Set this to 1 to allow unprivileged users to use the
+userfaultfd syscalls, or set this to 0 to restrict userfaultfd to only
+privileged users (with SYS_CAP_PTRACE capability).
+
+The default value is 1.
+
+==============================================================
+
 - user_reserve_kbytes
 
 When overcommit_memory is set to 2, "never overcommit" mode, reserve
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 89800fc7dc9d..7e856a25cc2f 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -30,6 +30,8 @@ 
 #include <linux/security.h>
 #include <linux/hugetlb.h>
 
+int sysctl_unprivileged_userfaultfd __read_mostly = 1;
+
 static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
 
 enum userfaultfd_state {
@@ -1921,6 +1923,9 @@  SYSCALL_DEFINE1(userfaultfd, int, flags)
 	struct userfaultfd_ctx *ctx;
 	int fd;
 
+	if (!sysctl_unprivileged_userfaultfd && !capable(CAP_SYS_PTRACE))
+		return -EPERM;
+
 	BUG_ON(!current->mm);
 
 	/* Check the UFFD_* constants for consistency.  */
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 37c9eba75c98..ac9d71e24b81 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -28,6 +28,8 @@ 
 #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
 #define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS)
 
+extern int sysctl_unprivileged_userfaultfd;
+
 extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);
 
 extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 7578e21a711b..9b8ff1881df9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -66,6 +66,7 @@ 
 #include <linux/kexec.h>
 #include <linux/bpf.h>
 #include <linux/mount.h>
+#include <linux/userfaultfd_k.h>
 
 #include <linux/uaccess.h>
 #include <asm/processor.h>
@@ -1704,6 +1705,17 @@  static struct ctl_table vm_table[] = {
 		.extra1		= (void *)&mmap_rnd_compat_bits_min,
 		.extra2		= (void *)&mmap_rnd_compat_bits_max,
 	},
+#endif
+#ifdef CONFIG_USERFAULTFD
+	{
+		.procname	= "unprivileged_userfaultfd",
+		.data		= &sysctl_unprivileged_userfaultfd,
+		.maxlen		= sizeof(sysctl_unprivileged_userfaultfd),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
 #endif
 	{ }
 };