Message ID | 20180927151119.9989-4-tycho@tycho.ws (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | seccomp trap to userspace | expand |
On Thu, Sep 27, 2018 at 5:11 PM Tycho Andersen <tycho@tycho.ws> wrote: > As an alternative to SECCOMP_FILTER_FLAG_GET_LISTENER, perhaps a ptrace() > version which can acquire filters is useful. There are at least two reasons > this is preferable, even though it uses ptrace: > > 1. You can control tasks that aren't cooperating with you > 2. You can control tasks whose filters block sendmsg() and socket(); if the > task installs a filter which blocks these calls, there's no way with > SECCOMP_FILTER_FLAG_GET_LISTENER to get the fd out to the privileged task. > > v2: fix a bug where listener mode was not unset when an unused fd was not > available > v3: fix refcounting bug (Oleg) > v4: * change the listener's fd flags to be 0 > * rename GET_LISTENER to NEW_LISTENER (Matthew) > v5: * add capable(CAP_SYS_ADMIN) requirement > v7: * point the new listener at the right filter (Jann) > > Signed-off-by: Tycho Andersen <tycho@tycho.ws> > CC: Kees Cook <keescook@chromium.org> > CC: Andy Lutomirski <luto@amacapital.net> > CC: Oleg Nesterov <oleg@redhat.com> > CC: Eric W. Biederman <ebiederm@xmission.com> > CC: "Serge E. Hallyn" <serge@hallyn.com> > CC: Christian Brauner <christian.brauner@ubuntu.com> > CC: Tyler Hicks <tyhicks@canonical.com> > CC: Akihiro Suda <suda.akihiro@lab.ntt.co.jp> If you address the two nits below, you can add: Reviewed-by: Jann Horn <jannh@google.com> > include/linux/seccomp.h | 7 ++ > include/uapi/linux/ptrace.h | 2 + > kernel/ptrace.c | 4 ++ > kernel/seccomp.c | 31 +++++++++ > tools/testing/selftests/seccomp/seccomp_bpf.c | 68 +++++++++++++++++++ > 5 files changed, 112 insertions(+) > > diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h > index 017444b5efed..234c61b37405 100644 > --- a/include/linux/seccomp.h > +++ b/include/linux/seccomp.h > @@ -83,6 +83,8 @@ static inline int seccomp_mode(struct seccomp *s) > #ifdef CONFIG_SECCOMP_FILTER > extern void put_seccomp_filter(struct task_struct *tsk); > extern void get_seccomp_filter(struct task_struct *tsk); > +extern long seccomp_new_listener(struct task_struct *task, > + unsigned long filter_off); Nit: Sorry, I only noticed this just now, but this should have return type int, not long. ptrace_request() returns an int, and an fd is also normally represented as an int, not a long. > #else /* CONFIG_SECCOMP_FILTER */ > static inline void put_seccomp_filter(struct task_struct *tsk) > { > @@ -92,6 +94,11 @@ static inline void get_seccomp_filter(struct task_struct *tsk) > { > return; > } > +static inline long seccomp_new_listener(struct task_struct *task, > + unsigned long filter_off) > +{ > + return -EINVAL; > +} > #endif /* CONFIG_SECCOMP_FILTER */ > > #if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE) > diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h > index d5a1b8a492b9..e80ecb1bd427 100644 > --- a/include/uapi/linux/ptrace.h > +++ b/include/uapi/linux/ptrace.h > @@ -73,6 +73,8 @@ struct seccomp_metadata { > __u64 flags; /* Output: filter's flags */ > }; > > +#define PTRACE_SECCOMP_NEW_LISTENER 0x420e > + > /* Read signals from a shared (process wide) queue */ > #define PTRACE_PEEKSIGINFO_SHARED (1 << 0) > > diff --git a/kernel/ptrace.c b/kernel/ptrace.c > index 21fec73d45d4..289960ac181b 100644 > --- a/kernel/ptrace.c > +++ b/kernel/ptrace.c > @@ -1096,6 +1096,10 @@ int ptrace_request(struct task_struct *child, long request, > ret = seccomp_get_metadata(child, addr, datavp); > break; > > + case PTRACE_SECCOMP_NEW_LISTENER: > + ret = seccomp_new_listener(child, addr); > + break; > + > default: > break; > } > diff --git a/kernel/seccomp.c b/kernel/seccomp.c > index 44a31ac8373a..17685803a2af 100644 > --- a/kernel/seccomp.c > +++ b/kernel/seccomp.c > @@ -1777,4 +1777,35 @@ static struct file *init_listener(struct task_struct *task, > > return ret; > } > + > +long seccomp_new_listener(struct task_struct *task, > + unsigned long filter_off) > +{ > + struct seccomp_filter *filter; > + struct file *listener; > + int fd; > + > + if (!capable(CAP_SYS_ADMIN)) > + return -EACCES; > + > + filter = get_nth_filter(task, filter_off); > + if (IS_ERR(filter)) > + return PTR_ERR(filter); > + > + fd = get_unused_fd_flags(0); s/0/O_CLOEXEC/ ? If userspace needs a non-cloexec fd, userspace can easily unset O_CLOEXEC; but the reverse isn't true, because it'd be racy. > + if (fd < 0) { > + __put_seccomp_filter(filter); > + return fd; > + } > + > + listener = init_listener(task, filter); > + __put_seccomp_filter(filter); > + if (IS_ERR(listener)) { > + put_unused_fd(fd); > + return PTR_ERR(listener); > + } > + > + fd_install(fd, listener); > + return fd; > +} > #endif > diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c > index 5f4b836a6792..c6ba3ed5392e 100644 > --- a/tools/testing/selftests/seccomp/seccomp_bpf.c > +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c > @@ -193,6 +193,10 @@ int seccomp(unsigned int op, unsigned int flags, void *args) > } > #endif > > +#ifndef PTRACE_SECCOMP_NEW_LISTENER > +#define PTRACE_SECCOMP_NEW_LISTENER 0x420e > +#endif > + > #if __BYTE_ORDER == __LITTLE_ENDIAN > #define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n])) > #elif __BYTE_ORDER == __BIG_ENDIAN > @@ -3175,6 +3179,70 @@ TEST(get_user_notification_syscall) > EXPECT_EQ(0, WEXITSTATUS(status)); > } > > +TEST(get_user_notification_ptrace) > +{ > + pid_t pid; > + int status, listener; > + int sk_pair[2]; > + char c; > + struct seccomp_notif req = {}; > + struct seccomp_notif_resp resp = {}; > + > + ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0); > + > + pid = fork(); > + ASSERT_GE(pid, 0); > + > + if (pid == 0) { > + EXPECT_EQ(user_trap_syscall(__NR_getpid, 0), 0); > + > + /* Test that we get ENOSYS while not attached */ > + EXPECT_EQ(syscall(__NR_getpid), -1); > + EXPECT_EQ(errno, ENOSYS); > + > + /* Signal we're ready and have installed the filter. */ > + EXPECT_EQ(write(sk_pair[1], "J", 1), 1); > + > + EXPECT_EQ(read(sk_pair[1], &c, 1), 1); > + EXPECT_EQ(c, 'H'); > + > + exit(syscall(__NR_getpid) != USER_NOTIF_MAGIC); > + } > + > + EXPECT_EQ(read(sk_pair[0], &c, 1), 1); > + EXPECT_EQ(c, 'J'); > + > + EXPECT_EQ(ptrace(PTRACE_ATTACH, pid), 0); > + EXPECT_EQ(waitpid(pid, NULL, 0), pid); > + listener = ptrace(PTRACE_SECCOMP_NEW_LISTENER, pid, 0); > + EXPECT_GE(listener, 0); > + > + /* EBUSY for second listener */ > + EXPECT_EQ(ptrace(PTRACE_SECCOMP_NEW_LISTENER, pid, 0), -1); > + EXPECT_EQ(errno, EBUSY); > + > + EXPECT_EQ(ptrace(PTRACE_DETACH, pid, NULL, 0), 0); > + > + /* Now signal we are done and respond with magic */ > + EXPECT_EQ(write(sk_pair[0], "H", 1), 1); > + > + req.len = sizeof(req); > + EXPECT_EQ(ioctl(listener, SECCOMP_NOTIF_RECV, &req), sizeof(req)); > + > + resp.len = sizeof(resp); > + resp.id = req.id; > + resp.error = 0; > + resp.val = USER_NOTIF_MAGIC; > + > + EXPECT_EQ(ioctl(listener, SECCOMP_NOTIF_SEND, &resp), sizeof(resp)); > + > + EXPECT_EQ(waitpid(pid, &status, 0), pid); > + EXPECT_EQ(true, WIFEXITED(status)); > + EXPECT_EQ(0, WEXITSTATUS(status)); > + > + close(listener); > +} > + > /* > * Check that a pid in a child namespace still shows up as valid in ours. > */ > -- > 2.17.1 >
On Thu, Sep 27, 2018 at 06:20:23PM +0200, Jann Horn wrote: > On Thu, Sep 27, 2018 at 5:11 PM Tycho Andersen <tycho@tycho.ws> wrote: > > As an alternative to SECCOMP_FILTER_FLAG_GET_LISTENER, perhaps a ptrace() > > version which can acquire filters is useful. There are at least two reasons > > this is preferable, even though it uses ptrace: > > > > 1. You can control tasks that aren't cooperating with you > > 2. You can control tasks whose filters block sendmsg() and socket(); if the > > task installs a filter which blocks these calls, there's no way with > > SECCOMP_FILTER_FLAG_GET_LISTENER to get the fd out to the privileged task. > > > > v2: fix a bug where listener mode was not unset when an unused fd was not > > available > > v3: fix refcounting bug (Oleg) > > v4: * change the listener's fd flags to be 0 > > * rename GET_LISTENER to NEW_LISTENER (Matthew) > > v5: * add capable(CAP_SYS_ADMIN) requirement > > v7: * point the new listener at the right filter (Jann) > > > > Signed-off-by: Tycho Andersen <tycho@tycho.ws> > > CC: Kees Cook <keescook@chromium.org> > > CC: Andy Lutomirski <luto@amacapital.net> > > CC: Oleg Nesterov <oleg@redhat.com> > > CC: Eric W. Biederman <ebiederm@xmission.com> > > CC: "Serge E. Hallyn" <serge@hallyn.com> > > CC: Christian Brauner <christian.brauner@ubuntu.com> > > CC: Tyler Hicks <tyhicks@canonical.com> > > CC: Akihiro Suda <suda.akihiro@lab.ntt.co.jp> > > If you address the two nits below, you can add: > Reviewed-by: Jann Horn <jannh@google.com> Thanks! > > include/linux/seccomp.h | 7 ++ > > include/uapi/linux/ptrace.h | 2 + > > kernel/ptrace.c | 4 ++ > > kernel/seccomp.c | 31 +++++++++ > > tools/testing/selftests/seccomp/seccomp_bpf.c | 68 +++++++++++++++++++ > > 5 files changed, 112 insertions(+) > > > > diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h > > index 017444b5efed..234c61b37405 100644 > > --- a/include/linux/seccomp.h > > +++ b/include/linux/seccomp.h > > @@ -83,6 +83,8 @@ static inline int seccomp_mode(struct seccomp *s) > > #ifdef CONFIG_SECCOMP_FILTER > > extern void put_seccomp_filter(struct task_struct *tsk); > > extern void get_seccomp_filter(struct task_struct *tsk); > > +extern long seccomp_new_listener(struct task_struct *task, > > + unsigned long filter_off); > > Nit: Sorry, I only noticed this just now, but this should have return > type int, not long. ptrace_request() returns an int, and an fd is also > normally represented as an int, not a long. Ugh, I could have sworn I checked this. In particular because the other seccomp code that's called from ptrace returns a long too :) I'll fix that for the next version, and send a different patch for the other two. > > #else /* CONFIG_SECCOMP_FILTER */ > > static inline void put_seccomp_filter(struct task_struct *tsk) > > { > > @@ -92,6 +94,11 @@ static inline void get_seccomp_filter(struct task_struct *tsk) > > { > > return; > > } > > +static inline long seccomp_new_listener(struct task_struct *task, > > + unsigned long filter_off) > > +{ > > + return -EINVAL; > > +} > > #endif /* CONFIG_SECCOMP_FILTER */ > > > > #if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE) > > diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h > > index d5a1b8a492b9..e80ecb1bd427 100644 > > --- a/include/uapi/linux/ptrace.h > > +++ b/include/uapi/linux/ptrace.h > > @@ -73,6 +73,8 @@ struct seccomp_metadata { > > __u64 flags; /* Output: filter's flags */ > > }; > > > > +#define PTRACE_SECCOMP_NEW_LISTENER 0x420e > > + > > /* Read signals from a shared (process wide) queue */ > > #define PTRACE_PEEKSIGINFO_SHARED (1 << 0) > > > > diff --git a/kernel/ptrace.c b/kernel/ptrace.c > > index 21fec73d45d4..289960ac181b 100644 > > --- a/kernel/ptrace.c > > +++ b/kernel/ptrace.c > > @@ -1096,6 +1096,10 @@ int ptrace_request(struct task_struct *child, long request, > > ret = seccomp_get_metadata(child, addr, datavp); > > break; > > > > + case PTRACE_SECCOMP_NEW_LISTENER: > > + ret = seccomp_new_listener(child, addr); > > + break; > > + > > default: > > break; > > } > > diff --git a/kernel/seccomp.c b/kernel/seccomp.c > > index 44a31ac8373a..17685803a2af 100644 > > --- a/kernel/seccomp.c > > +++ b/kernel/seccomp.c > > @@ -1777,4 +1777,35 @@ static struct file *init_listener(struct task_struct *task, > > > > return ret; > > } > > + > > +long seccomp_new_listener(struct task_struct *task, > > + unsigned long filter_off) > > +{ > > + struct seccomp_filter *filter; > > + struct file *listener; > > + int fd; > > + > > + if (!capable(CAP_SYS_ADMIN)) > > + return -EACCES; > > + > > + filter = get_nth_filter(task, filter_off); > > + if (IS_ERR(filter)) > > + return PTR_ERR(filter); > > + > > + fd = get_unused_fd_flags(0); > > s/0/O_CLOEXEC/ ? If userspace needs a non-cloexec fd, userspace can > easily unset O_CLOEXEC; but the reverse isn't true, because it'd be > racy. Sure, will do. Tycho
On Thu, Sep 27, 2018 at 5:11 PM Tycho Andersen <tycho@tycho.ws> wrote: > > As an alternative to SECCOMP_FILTER_FLAG_GET_LISTENER, perhaps a ptrace() > version which can acquire filters is useful. There are at least two reasons > this is preferable, even though it uses ptrace: > > 1. You can control tasks that aren't cooperating with you > 2. You can control tasks whose filters block sendmsg() and socket(); if the > task installs a filter which blocks these calls, there's no way with > SECCOMP_FILTER_FLAG_GET_LISTENER to get the fd out to the privileged task. [...] > diff --git a/kernel/ptrace.c b/kernel/ptrace.c > index 21fec73d45d4..289960ac181b 100644 > --- a/kernel/ptrace.c > +++ b/kernel/ptrace.c > @@ -1096,6 +1096,10 @@ int ptrace_request(struct task_struct *child, long request, > ret = seccomp_get_metadata(child, addr, datavp); > break; > > + case PTRACE_SECCOMP_NEW_LISTENER: > + ret = seccomp_new_listener(child, addr); > + break; Actually, could you amend this to also ensure that `data == 0` and return -EINVAL otherwise? Then if we want to abuse `data` for passing flags in the future, we don't have to worry about what happens if someone passes in garbage as `data`.
On Thu, Sep 27, 2018 at 07:35:06PM +0200, Jann Horn wrote: > On Thu, Sep 27, 2018 at 5:11 PM Tycho Andersen <tycho@tycho.ws> wrote: > > > > As an alternative to SECCOMP_FILTER_FLAG_GET_LISTENER, perhaps a ptrace() > > version which can acquire filters is useful. There are at least two reasons > > this is preferable, even though it uses ptrace: > > > > 1. You can control tasks that aren't cooperating with you > > 2. You can control tasks whose filters block sendmsg() and socket(); if the > > task installs a filter which blocks these calls, there's no way with > > SECCOMP_FILTER_FLAG_GET_LISTENER to get the fd out to the privileged task. > [...] > > diff --git a/kernel/ptrace.c b/kernel/ptrace.c > > index 21fec73d45d4..289960ac181b 100644 > > --- a/kernel/ptrace.c > > +++ b/kernel/ptrace.c > > @@ -1096,6 +1096,10 @@ int ptrace_request(struct task_struct *child, long request, > > ret = seccomp_get_metadata(child, addr, datavp); > > break; > > > > + case PTRACE_SECCOMP_NEW_LISTENER: > > + ret = seccomp_new_listener(child, addr); > > + break; > > Actually, could you amend this to also ensure that `data == 0` and > return -EINVAL otherwise? Then if we want to abuse `data` for passing > flags in the future, we don't have to worry about what happens if > someone passes in garbage as `data`. Yes, good idea. Thanks! Tycho
On Thu, Sep 27, 2018 at 8:11 AM, Tycho Andersen <tycho@tycho.ws> wrote: > As an alternative to SECCOMP_FILTER_FLAG_GET_LISTENER, perhaps a ptrace() > version which can acquire filters is useful. There are at least two reasons > this is preferable, even though it uses ptrace: > > 1. You can control tasks that aren't cooperating with you > 2. You can control tasks whose filters block sendmsg() and socket(); if the > task installs a filter which blocks these calls, there's no way with > SECCOMP_FILTER_FLAG_GET_LISTENER to get the fd out to the privileged task. > > v2: fix a bug where listener mode was not unset when an unused fd was not > available > v3: fix refcounting bug (Oleg) > v4: * change the listener's fd flags to be 0 > * rename GET_LISTENER to NEW_LISTENER (Matthew) > v5: * add capable(CAP_SYS_ADMIN) requirement > v7: * point the new listener at the right filter (Jann) > > Signed-off-by: Tycho Andersen <tycho@tycho.ws> > CC: Kees Cook <keescook@chromium.org> > CC: Andy Lutomirski <luto@amacapital.net> > CC: Oleg Nesterov <oleg@redhat.com> > CC: Eric W. Biederman <ebiederm@xmission.com> > CC: "Serge E. Hallyn" <serge@hallyn.com> > CC: Christian Brauner <christian.brauner@ubuntu.com> > CC: Tyler Hicks <tyhicks@canonical.com> > CC: Akihiro Suda <suda.akihiro@lab.ntt.co.jp> > --- > include/linux/seccomp.h | 7 ++ > include/uapi/linux/ptrace.h | 2 + > kernel/ptrace.c | 4 ++ > kernel/seccomp.c | 31 +++++++++ > tools/testing/selftests/seccomp/seccomp_bpf.c | 68 +++++++++++++++++++ > 5 files changed, 112 insertions(+) > > diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h > index 017444b5efed..234c61b37405 100644 > --- a/include/linux/seccomp.h > +++ b/include/linux/seccomp.h > @@ -83,6 +83,8 @@ static inline int seccomp_mode(struct seccomp *s) > #ifdef CONFIG_SECCOMP_FILTER > extern void put_seccomp_filter(struct task_struct *tsk); > extern void get_seccomp_filter(struct task_struct *tsk); > +extern long seccomp_new_listener(struct task_struct *task, > + unsigned long filter_off); > #else /* CONFIG_SECCOMP_FILTER */ > static inline void put_seccomp_filter(struct task_struct *tsk) > { > @@ -92,6 +94,11 @@ static inline void get_seccomp_filter(struct task_struct *tsk) > { > return; > } > +static inline long seccomp_new_listener(struct task_struct *task, > + unsigned long filter_off) > +{ > + return -EINVAL; > +} > #endif /* CONFIG_SECCOMP_FILTER */ > > #if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE) > diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h > index d5a1b8a492b9..e80ecb1bd427 100644 > --- a/include/uapi/linux/ptrace.h > +++ b/include/uapi/linux/ptrace.h > @@ -73,6 +73,8 @@ struct seccomp_metadata { > __u64 flags; /* Output: filter's flags */ > }; > > +#define PTRACE_SECCOMP_NEW_LISTENER 0x420e > + > /* Read signals from a shared (process wide) queue */ > #define PTRACE_PEEKSIGINFO_SHARED (1 << 0) > > diff --git a/kernel/ptrace.c b/kernel/ptrace.c > index 21fec73d45d4..289960ac181b 100644 > --- a/kernel/ptrace.c > +++ b/kernel/ptrace.c > @@ -1096,6 +1096,10 @@ int ptrace_request(struct task_struct *child, long request, > ret = seccomp_get_metadata(child, addr, datavp); > break; > > + case PTRACE_SECCOMP_NEW_LISTENER: > + ret = seccomp_new_listener(child, addr); > + break; > + > default: > break; > } > diff --git a/kernel/seccomp.c b/kernel/seccomp.c > index 44a31ac8373a..17685803a2af 100644 > --- a/kernel/seccomp.c > +++ b/kernel/seccomp.c > @@ -1777,4 +1777,35 @@ static struct file *init_listener(struct task_struct *task, > > return ret; > } > + > +long seccomp_new_listener(struct task_struct *task, > + unsigned long filter_off) > +{ > + struct seccomp_filter *filter; > + struct file *listener; > + int fd; > + > + if (!capable(CAP_SYS_ADMIN)) > + return -EACCES; > + > + filter = get_nth_filter(task, filter_off); > + if (IS_ERR(filter)) > + return PTR_ERR(filter); > + > + fd = get_unused_fd_flags(0); > + if (fd < 0) { > + __put_seccomp_filter(filter); > + return fd; > + } > + > + listener = init_listener(task, filter); > + __put_seccomp_filter(filter); > + if (IS_ERR(listener)) { > + put_unused_fd(fd); > + return PTR_ERR(listener); > + } > + > + fd_install(fd, listener); > + return fd; > +} Observation both here and with SECCOMP_FILTER_FLAG_NEW_LISTENER: nothing actually checks that there is a RET_USER_NOTIF bpf rule in the filter. *shrug* Not a problem, just a weird state. > #endif > diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c > index 5f4b836a6792..c6ba3ed5392e 100644 > --- a/tools/testing/selftests/seccomp/seccomp_bpf.c > +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c > @@ -193,6 +193,10 @@ int seccomp(unsigned int op, unsigned int flags, void *args) > } > #endif > > +#ifndef PTRACE_SECCOMP_NEW_LISTENER > +#define PTRACE_SECCOMP_NEW_LISTENER 0x420e > +#endif > + > #if __BYTE_ORDER == __LITTLE_ENDIAN > #define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n])) > #elif __BYTE_ORDER == __BIG_ENDIAN > @@ -3175,6 +3179,70 @@ TEST(get_user_notification_syscall) > EXPECT_EQ(0, WEXITSTATUS(status)); > } > > +TEST(get_user_notification_ptrace) > +{ > + pid_t pid; > + int status, listener; > + int sk_pair[2]; > + char c; > + struct seccomp_notif req = {}; > + struct seccomp_notif_resp resp = {}; > + > + ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0); > + > + pid = fork(); > + ASSERT_GE(pid, 0); > + > + if (pid == 0) { > + EXPECT_EQ(user_trap_syscall(__NR_getpid, 0), 0); > + > + /* Test that we get ENOSYS while not attached */ > + EXPECT_EQ(syscall(__NR_getpid), -1); > + EXPECT_EQ(errno, ENOSYS); > + > + /* Signal we're ready and have installed the filter. */ > + EXPECT_EQ(write(sk_pair[1], "J", 1), 1); > + > + EXPECT_EQ(read(sk_pair[1], &c, 1), 1); > + EXPECT_EQ(c, 'H'); > + > + exit(syscall(__NR_getpid) != USER_NOTIF_MAGIC); > + } > + > + EXPECT_EQ(read(sk_pair[0], &c, 1), 1); > + EXPECT_EQ(c, 'J'); > + > + EXPECT_EQ(ptrace(PTRACE_ATTACH, pid), 0); > + EXPECT_EQ(waitpid(pid, NULL, 0), pid); > + listener = ptrace(PTRACE_SECCOMP_NEW_LISTENER, pid, 0); > + EXPECT_GE(listener, 0); > + > + /* EBUSY for second listener */ > + EXPECT_EQ(ptrace(PTRACE_SECCOMP_NEW_LISTENER, pid, 0), -1); > + EXPECT_EQ(errno, EBUSY); > + > + EXPECT_EQ(ptrace(PTRACE_DETACH, pid, NULL, 0), 0); > + > + /* Now signal we are done and respond with magic */ > + EXPECT_EQ(write(sk_pair[0], "H", 1), 1); > + > + req.len = sizeof(req); > + EXPECT_EQ(ioctl(listener, SECCOMP_NOTIF_RECV, &req), sizeof(req)); > + > + resp.len = sizeof(resp); > + resp.id = req.id; > + resp.error = 0; > + resp.val = USER_NOTIF_MAGIC; > + > + EXPECT_EQ(ioctl(listener, SECCOMP_NOTIF_SEND, &resp), sizeof(resp)); > + > + EXPECT_EQ(waitpid(pid, &status, 0), pid); > + EXPECT_EQ(true, WIFEXITED(status)); > + EXPECT_EQ(0, WEXITSTATUS(status)); > + > + close(listener); > +} > + > /* > * Check that a pid in a child namespace still shows up as valid in ours. > */ > -- > 2.17.1 > And FWIW, I agree with Jann's review notes here too. :) Looks good! -Kees
On Thu, Sep 27, 2018 at 09:11:16AM -0600, Tycho Andersen wrote: > As an alternative to SECCOMP_FILTER_FLAG_GET_LISTENER, perhaps a ptrace() > version which can acquire filters is useful. There are at least two reasons > this is preferable, even though it uses ptrace: > > 1. You can control tasks that aren't cooperating with you > 2. You can control tasks whose filters block sendmsg() and socket(); if the > task installs a filter which blocks these calls, there's no way with > SECCOMP_FILTER_FLAG_GET_LISTENER to get the fd out to the privileged task. So for the slow of mind aka me: I'm not sure I completely understand this problem. Can you outline how sendmsg() and socket() are involved in this? I'm also not sure that this holds (but I might misunderstand the problem) afaict, you could do try to get the fd out via CLONE_FILES and other means so something like: // let's pretend the libc wrapper for clone actually has sane semantics pid = clone(CLONE_FILES); if (pid == 0) { fd = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_NEW_LISTENER, &prog); // Now this fd will be valid in both parent and child. // If you haven't blocked it you can inform the parent what // the fd number is via pipe2(). If you have blocked it you can // use dup2() and dup to a known fd number. } > > v2: fix a bug where listener mode was not unset when an unused fd was not > available > v3: fix refcounting bug (Oleg) > v4: * change the listener's fd flags to be 0 > * rename GET_LISTENER to NEW_LISTENER (Matthew) > v5: * add capable(CAP_SYS_ADMIN) requirement > v7: * point the new listener at the right filter (Jann) > > Signed-off-by: Tycho Andersen <tycho@tycho.ws> > CC: Kees Cook <keescook@chromium.org> > CC: Andy Lutomirski <luto@amacapital.net> > CC: Oleg Nesterov <oleg@redhat.com> > CC: Eric W. Biederman <ebiederm@xmission.com> > CC: "Serge E. Hallyn" <serge@hallyn.com> > CC: Christian Brauner <christian.brauner@ubuntu.com> > CC: Tyler Hicks <tyhicks@canonical.com> > CC: Akihiro Suda <suda.akihiro@lab.ntt.co.jp> > --- > include/linux/seccomp.h | 7 ++ > include/uapi/linux/ptrace.h | 2 + > kernel/ptrace.c | 4 ++ > kernel/seccomp.c | 31 +++++++++ > tools/testing/selftests/seccomp/seccomp_bpf.c | 68 +++++++++++++++++++ > 5 files changed, 112 insertions(+) > > diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h > index 017444b5efed..234c61b37405 100644 > --- a/include/linux/seccomp.h > +++ b/include/linux/seccomp.h > @@ -83,6 +83,8 @@ static inline int seccomp_mode(struct seccomp *s) > #ifdef CONFIG_SECCOMP_FILTER > extern void put_seccomp_filter(struct task_struct *tsk); > extern void get_seccomp_filter(struct task_struct *tsk); > +extern long seccomp_new_listener(struct task_struct *task, > + unsigned long filter_off); > #else /* CONFIG_SECCOMP_FILTER */ > static inline void put_seccomp_filter(struct task_struct *tsk) > { > @@ -92,6 +94,11 @@ static inline void get_seccomp_filter(struct task_struct *tsk) > { > return; > } > +static inline long seccomp_new_listener(struct task_struct *task, > + unsigned long filter_off) > +{ > + return -EINVAL; > +} > #endif /* CONFIG_SECCOMP_FILTER */ > > #if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE) > diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h > index d5a1b8a492b9..e80ecb1bd427 100644 > --- a/include/uapi/linux/ptrace.h > +++ b/include/uapi/linux/ptrace.h > @@ -73,6 +73,8 @@ struct seccomp_metadata { > __u64 flags; /* Output: filter's flags */ > }; > > +#define PTRACE_SECCOMP_NEW_LISTENER 0x420e > + > /* Read signals from a shared (process wide) queue */ > #define PTRACE_PEEKSIGINFO_SHARED (1 << 0) > > diff --git a/kernel/ptrace.c b/kernel/ptrace.c > index 21fec73d45d4..289960ac181b 100644 > --- a/kernel/ptrace.c > +++ b/kernel/ptrace.c > @@ -1096,6 +1096,10 @@ int ptrace_request(struct task_struct *child, long request, > ret = seccomp_get_metadata(child, addr, datavp); > break; > > + case PTRACE_SECCOMP_NEW_LISTENER: > + ret = seccomp_new_listener(child, addr); > + break; > + > default: > break; > } > diff --git a/kernel/seccomp.c b/kernel/seccomp.c > index 44a31ac8373a..17685803a2af 100644 > --- a/kernel/seccomp.c > +++ b/kernel/seccomp.c > @@ -1777,4 +1777,35 @@ static struct file *init_listener(struct task_struct *task, > > return ret; > } > + > +long seccomp_new_listener(struct task_struct *task, > + unsigned long filter_off) > +{ > + struct seccomp_filter *filter; > + struct file *listener; > + int fd; > + > + if (!capable(CAP_SYS_ADMIN)) > + return -EACCES; I know this might have been discussed a while back but why exactly do we require CAP_SYS_ADMIN in init_userns and not in the target userns? What if I want to do a setns()fd, CLONE_NEWUSER) to the target process and use ptrace from in there? > + > + filter = get_nth_filter(task, filter_off); > + if (IS_ERR(filter)) > + return PTR_ERR(filter); > + > + fd = get_unused_fd_flags(0); > + if (fd < 0) { > + __put_seccomp_filter(filter); > + return fd; > + } > + > + listener = init_listener(task, filter); > + __put_seccomp_filter(filter); > + if (IS_ERR(listener)) { > + put_unused_fd(fd); > + return PTR_ERR(listener); > + } > + > + fd_install(fd, listener); > + return fd; > +} > #endif > diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c > index 5f4b836a6792..c6ba3ed5392e 100644 > --- a/tools/testing/selftests/seccomp/seccomp_bpf.c > +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c > @@ -193,6 +193,10 @@ int seccomp(unsigned int op, unsigned int flags, void *args) > } > #endif > > +#ifndef PTRACE_SECCOMP_NEW_LISTENER > +#define PTRACE_SECCOMP_NEW_LISTENER 0x420e > +#endif > + > #if __BYTE_ORDER == __LITTLE_ENDIAN > #define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n])) > #elif __BYTE_ORDER == __BIG_ENDIAN > @@ -3175,6 +3179,70 @@ TEST(get_user_notification_syscall) > EXPECT_EQ(0, WEXITSTATUS(status)); > } > > +TEST(get_user_notification_ptrace) > +{ > + pid_t pid; > + int status, listener; > + int sk_pair[2]; > + char c; > + struct seccomp_notif req = {}; > + struct seccomp_notif_resp resp = {}; > + > + ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0); > + > + pid = fork(); > + ASSERT_GE(pid, 0); > + > + if (pid == 0) { > + EXPECT_EQ(user_trap_syscall(__NR_getpid, 0), 0); > + > + /* Test that we get ENOSYS while not attached */ > + EXPECT_EQ(syscall(__NR_getpid), -1); > + EXPECT_EQ(errno, ENOSYS); > + > + /* Signal we're ready and have installed the filter. */ > + EXPECT_EQ(write(sk_pair[1], "J", 1), 1); > + > + EXPECT_EQ(read(sk_pair[1], &c, 1), 1); > + EXPECT_EQ(c, 'H'); > + > + exit(syscall(__NR_getpid) != USER_NOTIF_MAGIC); > + } > + > + EXPECT_EQ(read(sk_pair[0], &c, 1), 1); > + EXPECT_EQ(c, 'J'); > + > + EXPECT_EQ(ptrace(PTRACE_ATTACH, pid), 0); > + EXPECT_EQ(waitpid(pid, NULL, 0), pid); > + listener = ptrace(PTRACE_SECCOMP_NEW_LISTENER, pid, 0); > + EXPECT_GE(listener, 0); > + > + /* EBUSY for second listener */ > + EXPECT_EQ(ptrace(PTRACE_SECCOMP_NEW_LISTENER, pid, 0), -1); > + EXPECT_EQ(errno, EBUSY); > + > + EXPECT_EQ(ptrace(PTRACE_DETACH, pid, NULL, 0), 0); > + > + /* Now signal we are done and respond with magic */ > + EXPECT_EQ(write(sk_pair[0], "H", 1), 1); > + > + req.len = sizeof(req); > + EXPECT_EQ(ioctl(listener, SECCOMP_NOTIF_RECV, &req), sizeof(req)); > + > + resp.len = sizeof(resp); > + resp.id = req.id; > + resp.error = 0; > + resp.val = USER_NOTIF_MAGIC; > + > + EXPECT_EQ(ioctl(listener, SECCOMP_NOTIF_SEND, &resp), sizeof(resp)); > + > + EXPECT_EQ(waitpid(pid, &status, 0), pid); > + EXPECT_EQ(true, WIFEXITED(status)); > + EXPECT_EQ(0, WEXITSTATUS(status)); > + > + close(listener); > +} > + > /* > * Check that a pid in a child namespace still shows up as valid in ours. > */ > -- > 2.17.1 > > _______________________________________________ > Containers mailing list > Containers@lists.linux-foundation.org > https://lists.linuxfoundation.org/mailman/listinfo/containers
On Mon, Oct 8, 2018 at 5:16 PM Christian Brauner <christian@brauner.io> wrote: > > On Thu, Sep 27, 2018 at 09:11:16AM -0600, Tycho Andersen wrote: > > As an alternative to SECCOMP_FILTER_FLAG_GET_LISTENER, perhaps a ptrace() > > version which can acquire filters is useful. There are at least two reasons > > this is preferable, even though it uses ptrace: > > > > 1. You can control tasks that aren't cooperating with you > > 2. You can control tasks whose filters block sendmsg() and socket(); if the > > task installs a filter which blocks these calls, there's no way with > > SECCOMP_FILTER_FLAG_GET_LISTENER to get the fd out to the privileged task. > > So for the slow of mind aka me: > I'm not sure I completely understand this problem. Can you outline how > sendmsg() and socket() are involved in this? > > I'm also not sure that this holds (but I might misunderstand the > problem) afaict, you could do try to get the fd out via CLONE_FILES and > other means so something like: > > // let's pretend the libc wrapper for clone actually has sane semantics > pid = clone(CLONE_FILES); > if (pid == 0) { > fd = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_NEW_LISTENER, &prog); > > // Now this fd will be valid in both parent and child. > // If you haven't blocked it you can inform the parent what > // the fd number is via pipe2(). If you have blocked it you can > // use dup2() and dup to a known fd number. > } > > > > > v2: fix a bug where listener mode was not unset when an unused fd was not > > available > > v3: fix refcounting bug (Oleg) > > v4: * change the listener's fd flags to be 0 > > * rename GET_LISTENER to NEW_LISTENER (Matthew) > > v5: * add capable(CAP_SYS_ADMIN) requirement > > v7: * point the new listener at the right filter (Jann) > > > > Signed-off-by: Tycho Andersen <tycho@tycho.ws> > > CC: Kees Cook <keescook@chromium.org> > > CC: Andy Lutomirski <luto@amacapital.net> > > CC: Oleg Nesterov <oleg@redhat.com> > > CC: Eric W. Biederman <ebiederm@xmission.com> > > CC: "Serge E. Hallyn" <serge@hallyn.com> > > CC: Christian Brauner <christian.brauner@ubuntu.com> > > CC: Tyler Hicks <tyhicks@canonical.com> > > CC: Akihiro Suda <suda.akihiro@lab.ntt.co.jp> > > --- > > include/linux/seccomp.h | 7 ++ > > include/uapi/linux/ptrace.h | 2 + > > kernel/ptrace.c | 4 ++ > > kernel/seccomp.c | 31 +++++++++ > > tools/testing/selftests/seccomp/seccomp_bpf.c | 68 +++++++++++++++++++ > > 5 files changed, 112 insertions(+) > > > > diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h > > index 017444b5efed..234c61b37405 100644 > > --- a/include/linux/seccomp.h > > +++ b/include/linux/seccomp.h > > @@ -83,6 +83,8 @@ static inline int seccomp_mode(struct seccomp *s) > > #ifdef CONFIG_SECCOMP_FILTER > > extern void put_seccomp_filter(struct task_struct *tsk); > > extern void get_seccomp_filter(struct task_struct *tsk); > > +extern long seccomp_new_listener(struct task_struct *task, > > + unsigned long filter_off); > > #else /* CONFIG_SECCOMP_FILTER */ > > static inline void put_seccomp_filter(struct task_struct *tsk) > > { > > @@ -92,6 +94,11 @@ static inline void get_seccomp_filter(struct task_struct *tsk) > > { > > return; > > } > > +static inline long seccomp_new_listener(struct task_struct *task, > > + unsigned long filter_off) > > +{ > > + return -EINVAL; > > +} > > #endif /* CONFIG_SECCOMP_FILTER */ > > > > #if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE) > > diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h > > index d5a1b8a492b9..e80ecb1bd427 100644 > > --- a/include/uapi/linux/ptrace.h > > +++ b/include/uapi/linux/ptrace.h > > @@ -73,6 +73,8 @@ struct seccomp_metadata { > > __u64 flags; /* Output: filter's flags */ > > }; > > > > +#define PTRACE_SECCOMP_NEW_LISTENER 0x420e > > + > > /* Read signals from a shared (process wide) queue */ > > #define PTRACE_PEEKSIGINFO_SHARED (1 << 0) > > > > diff --git a/kernel/ptrace.c b/kernel/ptrace.c > > index 21fec73d45d4..289960ac181b 100644 > > --- a/kernel/ptrace.c > > +++ b/kernel/ptrace.c > > @@ -1096,6 +1096,10 @@ int ptrace_request(struct task_struct *child, long request, > > ret = seccomp_get_metadata(child, addr, datavp); > > break; > > > > + case PTRACE_SECCOMP_NEW_LISTENER: > > + ret = seccomp_new_listener(child, addr); > > + break; > > + > > default: > > break; > > } > > diff --git a/kernel/seccomp.c b/kernel/seccomp.c > > index 44a31ac8373a..17685803a2af 100644 > > --- a/kernel/seccomp.c > > +++ b/kernel/seccomp.c > > @@ -1777,4 +1777,35 @@ static struct file *init_listener(struct task_struct *task, > > > > return ret; > > } > > + > > +long seccomp_new_listener(struct task_struct *task, > > + unsigned long filter_off) > > +{ > > + struct seccomp_filter *filter; > > + struct file *listener; > > + int fd; > > + > > + if (!capable(CAP_SYS_ADMIN)) > > + return -EACCES; > > I know this might have been discussed a while back but why exactly do we > require CAP_SYS_ADMIN in init_userns and not in the target userns? What > if I want to do a setns()fd, CLONE_NEWUSER) to the target process and > use ptrace from in there? See https://lore.kernel.org/lkml/CAG48ez3R+ZJ1vwGkDfGzKX2mz6f=jjJWsO5pCvnH68P+RKO8Ow@mail.gmail.com/ . Basically, the problem is that this doesn't just give you capability over the target task, but also over every other task that has the same filter installed; you need some sort of "is the caller capable over the filter and anyone who uses it" check.
On Mon, Oct 08, 2018 at 05:33:22PM +0200, Jann Horn wrote: > On Mon, Oct 8, 2018 at 5:16 PM Christian Brauner <christian@brauner.io> wrote: > > > > On Thu, Sep 27, 2018 at 09:11:16AM -0600, Tycho Andersen wrote: > > > As an alternative to SECCOMP_FILTER_FLAG_GET_LISTENER, perhaps a ptrace() > > > version which can acquire filters is useful. There are at least two reasons > > > this is preferable, even though it uses ptrace: > > > > > > 1. You can control tasks that aren't cooperating with you > > > 2. You can control tasks whose filters block sendmsg() and socket(); if the > > > task installs a filter which blocks these calls, there's no way with > > > SECCOMP_FILTER_FLAG_GET_LISTENER to get the fd out to the privileged task. > > > > So for the slow of mind aka me: > > I'm not sure I completely understand this problem. Can you outline how > > sendmsg() and socket() are involved in this? > > > > I'm also not sure that this holds (but I might misunderstand the > > problem) afaict, you could do try to get the fd out via CLONE_FILES and > > other means so something like: > > > > // let's pretend the libc wrapper for clone actually has sane semantics > > pid = clone(CLONE_FILES); > > if (pid == 0) { > > fd = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_NEW_LISTENER, &prog); > > > > // Now this fd will be valid in both parent and child. > > // If you haven't blocked it you can inform the parent what > > // the fd number is via pipe2(). If you have blocked it you can > > // use dup2() and dup to a known fd number. > > } > > > > > > > > v2: fix a bug where listener mode was not unset when an unused fd was not > > > available > > > v3: fix refcounting bug (Oleg) > > > v4: * change the listener's fd flags to be 0 > > > * rename GET_LISTENER to NEW_LISTENER (Matthew) > > > v5: * add capable(CAP_SYS_ADMIN) requirement > > > v7: * point the new listener at the right filter (Jann) > > > > > > Signed-off-by: Tycho Andersen <tycho@tycho.ws> > > > CC: Kees Cook <keescook@chromium.org> > > > CC: Andy Lutomirski <luto@amacapital.net> > > > CC: Oleg Nesterov <oleg@redhat.com> > > > CC: Eric W. Biederman <ebiederm@xmission.com> > > > CC: "Serge E. Hallyn" <serge@hallyn.com> > > > CC: Christian Brauner <christian.brauner@ubuntu.com> > > > CC: Tyler Hicks <tyhicks@canonical.com> > > > CC: Akihiro Suda <suda.akihiro@lab.ntt.co.jp> > > > --- > > > include/linux/seccomp.h | 7 ++ > > > include/uapi/linux/ptrace.h | 2 + > > > kernel/ptrace.c | 4 ++ > > > kernel/seccomp.c | 31 +++++++++ > > > tools/testing/selftests/seccomp/seccomp_bpf.c | 68 +++++++++++++++++++ > > > 5 files changed, 112 insertions(+) > > > > > > diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h > > > index 017444b5efed..234c61b37405 100644 > > > --- a/include/linux/seccomp.h > > > +++ b/include/linux/seccomp.h > > > @@ -83,6 +83,8 @@ static inline int seccomp_mode(struct seccomp *s) > > > #ifdef CONFIG_SECCOMP_FILTER > > > extern void put_seccomp_filter(struct task_struct *tsk); > > > extern void get_seccomp_filter(struct task_struct *tsk); > > > +extern long seccomp_new_listener(struct task_struct *task, > > > + unsigned long filter_off); > > > #else /* CONFIG_SECCOMP_FILTER */ > > > static inline void put_seccomp_filter(struct task_struct *tsk) > > > { > > > @@ -92,6 +94,11 @@ static inline void get_seccomp_filter(struct task_struct *tsk) > > > { > > > return; > > > } > > > +static inline long seccomp_new_listener(struct task_struct *task, > > > + unsigned long filter_off) > > > +{ > > > + return -EINVAL; > > > +} > > > #endif /* CONFIG_SECCOMP_FILTER */ > > > > > > #if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE) > > > diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h > > > index d5a1b8a492b9..e80ecb1bd427 100644 > > > --- a/include/uapi/linux/ptrace.h > > > +++ b/include/uapi/linux/ptrace.h > > > @@ -73,6 +73,8 @@ struct seccomp_metadata { > > > __u64 flags; /* Output: filter's flags */ > > > }; > > > > > > +#define PTRACE_SECCOMP_NEW_LISTENER 0x420e > > > + > > > /* Read signals from a shared (process wide) queue */ > > > #define PTRACE_PEEKSIGINFO_SHARED (1 << 0) > > > > > > diff --git a/kernel/ptrace.c b/kernel/ptrace.c > > > index 21fec73d45d4..289960ac181b 100644 > > > --- a/kernel/ptrace.c > > > +++ b/kernel/ptrace.c > > > @@ -1096,6 +1096,10 @@ int ptrace_request(struct task_struct *child, long request, > > > ret = seccomp_get_metadata(child, addr, datavp); > > > break; > > > > > > + case PTRACE_SECCOMP_NEW_LISTENER: > > > + ret = seccomp_new_listener(child, addr); > > > + break; > > > + > > > default: > > > break; > > > } > > > diff --git a/kernel/seccomp.c b/kernel/seccomp.c > > > index 44a31ac8373a..17685803a2af 100644 > > > --- a/kernel/seccomp.c > > > +++ b/kernel/seccomp.c > > > @@ -1777,4 +1777,35 @@ static struct file *init_listener(struct task_struct *task, > > > > > > return ret; > > > } > > > + > > > +long seccomp_new_listener(struct task_struct *task, > > > + unsigned long filter_off) > > > +{ > > > + struct seccomp_filter *filter; > > > + struct file *listener; > > > + int fd; > > > + > > > + if (!capable(CAP_SYS_ADMIN)) > > > + return -EACCES; > > > > I know this might have been discussed a while back but why exactly do we > > require CAP_SYS_ADMIN in init_userns and not in the target userns? What > > if I want to do a setns()fd, CLONE_NEWUSER) to the target process and > > use ptrace from in there? > > See https://lore.kernel.org/lkml/CAG48ez3R+ZJ1vwGkDfGzKX2mz6f=jjJWsO5pCvnH68P+RKO8Ow@mail.gmail.com/ > . Basically, the problem is that this doesn't just give you capability > over the target task, but also over every other task that has the same > filter installed; you need some sort of "is the caller capable over > the filter and anyone who uses it" check. Thanks. But then this new ptrace feature as it stands is imho currently broken. If you can install a seccomp filter with SECCOMP_RET_USER_NOTIF if you are ns_cpabable(CAP_SYS_ADMIN) and also get an fd via seccomp() itself if you are ns_cpabable(CAP_SYS_ADMIN) then either the new ptrace() api extension should be fixed to allow for this too or the seccomp() way of retrieving the pid - which I really think we want - needs to be fixed to require capable(CAP_SYS_ADMIN) too. The solution where both require ns_capable(CAP_SYS_ADMIN) is - imho - the preferred way to solve this. Everything else will just be confusing.
On Mon, Oct 8, 2018 at 6:21 PM Christian Brauner <christian@brauner.io> wrote: > On Mon, Oct 08, 2018 at 05:33:22PM +0200, Jann Horn wrote: > > On Mon, Oct 8, 2018 at 5:16 PM Christian Brauner <christian@brauner.io> wrote: > > > > > > On Thu, Sep 27, 2018 at 09:11:16AM -0600, Tycho Andersen wrote: > > > > As an alternative to SECCOMP_FILTER_FLAG_GET_LISTENER, perhaps a ptrace() > > > > version which can acquire filters is useful. There are at least two reasons > > > > this is preferable, even though it uses ptrace: > > > > > > > > 1. You can control tasks that aren't cooperating with you > > > > 2. You can control tasks whose filters block sendmsg() and socket(); if the > > > > task installs a filter which blocks these calls, there's no way with > > > > SECCOMP_FILTER_FLAG_GET_LISTENER to get the fd out to the privileged task. > > > > > > So for the slow of mind aka me: > > > I'm not sure I completely understand this problem. Can you outline how > > > sendmsg() and socket() are involved in this? > > > > > > I'm also not sure that this holds (but I might misunderstand the > > > problem) afaict, you could do try to get the fd out via CLONE_FILES and > > > other means so something like: > > > > > > // let's pretend the libc wrapper for clone actually has sane semantics > > > pid = clone(CLONE_FILES); > > > if (pid == 0) { > > > fd = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_NEW_LISTENER, &prog); > > > > > > // Now this fd will be valid in both parent and child. > > > // If you haven't blocked it you can inform the parent what > > > // the fd number is via pipe2(). If you have blocked it you can > > > // use dup2() and dup to a known fd number. > > > } > > > > > > > > > > > v2: fix a bug where listener mode was not unset when an unused fd was not > > > > available > > > > v3: fix refcounting bug (Oleg) > > > > v4: * change the listener's fd flags to be 0 > > > > * rename GET_LISTENER to NEW_LISTENER (Matthew) > > > > v5: * add capable(CAP_SYS_ADMIN) requirement > > > > v7: * point the new listener at the right filter (Jann) > > > > > > > > Signed-off-by: Tycho Andersen <tycho@tycho.ws> > > > > CC: Kees Cook <keescook@chromium.org> > > > > CC: Andy Lutomirski <luto@amacapital.net> > > > > CC: Oleg Nesterov <oleg@redhat.com> > > > > CC: Eric W. Biederman <ebiederm@xmission.com> > > > > CC: "Serge E. Hallyn" <serge@hallyn.com> > > > > CC: Christian Brauner <christian.brauner@ubuntu.com> > > > > CC: Tyler Hicks <tyhicks@canonical.com> > > > > CC: Akihiro Suda <suda.akihiro@lab.ntt.co.jp> > > > > --- > > > > include/linux/seccomp.h | 7 ++ > > > > include/uapi/linux/ptrace.h | 2 + > > > > kernel/ptrace.c | 4 ++ > > > > kernel/seccomp.c | 31 +++++++++ > > > > tools/testing/selftests/seccomp/seccomp_bpf.c | 68 +++++++++++++++++++ > > > > 5 files changed, 112 insertions(+) > > > > > > > > diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h > > > > index 017444b5efed..234c61b37405 100644 > > > > --- a/include/linux/seccomp.h > > > > +++ b/include/linux/seccomp.h > > > > @@ -83,6 +83,8 @@ static inline int seccomp_mode(struct seccomp *s) > > > > #ifdef CONFIG_SECCOMP_FILTER > > > > extern void put_seccomp_filter(struct task_struct *tsk); > > > > extern void get_seccomp_filter(struct task_struct *tsk); > > > > +extern long seccomp_new_listener(struct task_struct *task, > > > > + unsigned long filter_off); > > > > #else /* CONFIG_SECCOMP_FILTER */ > > > > static inline void put_seccomp_filter(struct task_struct *tsk) > > > > { > > > > @@ -92,6 +94,11 @@ static inline void get_seccomp_filter(struct task_struct *tsk) > > > > { > > > > return; > > > > } > > > > +static inline long seccomp_new_listener(struct task_struct *task, > > > > + unsigned long filter_off) > > > > +{ > > > > + return -EINVAL; > > > > +} > > > > #endif /* CONFIG_SECCOMP_FILTER */ > > > > > > > > #if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE) > > > > diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h > > > > index d5a1b8a492b9..e80ecb1bd427 100644 > > > > --- a/include/uapi/linux/ptrace.h > > > > +++ b/include/uapi/linux/ptrace.h > > > > @@ -73,6 +73,8 @@ struct seccomp_metadata { > > > > __u64 flags; /* Output: filter's flags */ > > > > }; > > > > > > > > +#define PTRACE_SECCOMP_NEW_LISTENER 0x420e > > > > + > > > > /* Read signals from a shared (process wide) queue */ > > > > #define PTRACE_PEEKSIGINFO_SHARED (1 << 0) > > > > > > > > diff --git a/kernel/ptrace.c b/kernel/ptrace.c > > > > index 21fec73d45d4..289960ac181b 100644 > > > > --- a/kernel/ptrace.c > > > > +++ b/kernel/ptrace.c > > > > @@ -1096,6 +1096,10 @@ int ptrace_request(struct task_struct *child, long request, > > > > ret = seccomp_get_metadata(child, addr, datavp); > > > > break; > > > > > > > > + case PTRACE_SECCOMP_NEW_LISTENER: > > > > + ret = seccomp_new_listener(child, addr); > > > > + break; > > > > + > > > > default: > > > > break; > > > > } > > > > diff --git a/kernel/seccomp.c b/kernel/seccomp.c > > > > index 44a31ac8373a..17685803a2af 100644 > > > > --- a/kernel/seccomp.c > > > > +++ b/kernel/seccomp.c > > > > @@ -1777,4 +1777,35 @@ static struct file *init_listener(struct task_struct *task, > > > > > > > > return ret; > > > > } > > > > + > > > > +long seccomp_new_listener(struct task_struct *task, > > > > + unsigned long filter_off) > > > > +{ > > > > + struct seccomp_filter *filter; > > > > + struct file *listener; > > > > + int fd; > > > > + > > > > + if (!capable(CAP_SYS_ADMIN)) > > > > + return -EACCES; > > > > > > I know this might have been discussed a while back but why exactly do we > > > require CAP_SYS_ADMIN in init_userns and not in the target userns? What > > > if I want to do a setns()fd, CLONE_NEWUSER) to the target process and > > > use ptrace from in there? > > > > See https://lore.kernel.org/lkml/CAG48ez3R+ZJ1vwGkDfGzKX2mz6f=jjJWsO5pCvnH68P+RKO8Ow@mail.gmail.com/ > > . Basically, the problem is that this doesn't just give you capability > > over the target task, but also over every other task that has the same > > filter installed; you need some sort of "is the caller capable over > > the filter and anyone who uses it" check. > > Thanks. > But then this new ptrace feature as it stands is imho currently broken. > If you can install a seccomp filter with SECCOMP_RET_USER_NOTIF if you > are ns_cpabable(CAP_SYS_ADMIN) and also get an fd via seccomp() itself > if you are ns_cpabable(CAP_SYS_ADMIN) then either the new ptrace() api > extension should be fixed to allow for this too or the seccomp() way of > retrieving the pid - which I really think we want - needs to be fixed to > require capable(CAP_SYS_ADMIN) too. > The solution where both require ns_capable(CAP_SYS_ADMIN) is - imho - > the preferred way to solve this. > Everything else will just be confusing. First you say "broken", then you say "confusing". Which one do you mean? Regarding requiring ns_capable() for ptrace: That means that you'll have to stash namespace information in the seccomp filter. You'd also potentially be eliding the LSM check that would normally have to occur between the tracer and the tracee; but I guess that's probably fine? CAP_SYS_ADMIN in the init namespace already has some abilities that LSMs can't observe; you could argue that CAP_SYS_ADMIN in another namespace should have similar semantics, but I'm not sure whether that matches what the LSM people want as semantics.
On Mon, Oct 08, 2018 at 05:16:30PM +0200, Christian Brauner wrote: > On Thu, Sep 27, 2018 at 09:11:16AM -0600, Tycho Andersen wrote: > > As an alternative to SECCOMP_FILTER_FLAG_GET_LISTENER, perhaps a ptrace() > > version which can acquire filters is useful. There are at least two reasons > > this is preferable, even though it uses ptrace: > > > > 1. You can control tasks that aren't cooperating with you > > 2. You can control tasks whose filters block sendmsg() and socket(); if the > > task installs a filter which blocks these calls, there's no way with > > SECCOMP_FILTER_FLAG_GET_LISTENER to get the fd out to the privileged task. > > So for the slow of mind aka me: > I'm not sure I completely understand this problem. Can you outline how > sendmsg() and socket() are involved in this? > > I'm also not sure that this holds (but I might misunderstand the > problem) afaict, you could do try to get the fd out via CLONE_FILES and > other means so something like: > > // let's pretend the libc wrapper for clone actually has sane semantics > pid = clone(CLONE_FILES); > if (pid == 0) { > fd = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_NEW_LISTENER, &prog); > > // Now this fd will be valid in both parent and child. > // If you haven't blocked it you can inform the parent what > // the fd number is via pipe2(). If you have blocked it you can > // use dup2() and dup to a known fd number. > } But what if your seccomp filter wants to block both pipe2() and dup2()? Whatever syscall you want to use to do this could be blocked by some seccomp policy, which means you might not be able to use this feature in some cases. Perhaps it's unlikely, and we can just go forward knowing this. But it seems like it is worth at least acknowledging that you can wedge yourself into a corner. Tycho
On Mon, Oct 08, 2018 at 06:42:00PM +0200, Jann Horn wrote: > On Mon, Oct 8, 2018 at 6:21 PM Christian Brauner <christian@brauner.io> wrote: > > On Mon, Oct 08, 2018 at 05:33:22PM +0200, Jann Horn wrote: > > > On Mon, Oct 8, 2018 at 5:16 PM Christian Brauner <christian@brauner.io> wrote: > > > > > > > > On Thu, Sep 27, 2018 at 09:11:16AM -0600, Tycho Andersen wrote: > > > > > As an alternative to SECCOMP_FILTER_FLAG_GET_LISTENER, perhaps a ptrace() > > > > > version which can acquire filters is useful. There are at least two reasons > > > > > this is preferable, even though it uses ptrace: > > > > > > > > > > 1. You can control tasks that aren't cooperating with you > > > > > 2. You can control tasks whose filters block sendmsg() and socket(); if the > > > > > task installs a filter which blocks these calls, there's no way with > > > > > SECCOMP_FILTER_FLAG_GET_LISTENER to get the fd out to the privileged task. > > > > > > > > So for the slow of mind aka me: > > > > I'm not sure I completely understand this problem. Can you outline how > > > > sendmsg() and socket() are involved in this? > > > > > > > > I'm also not sure that this holds (but I might misunderstand the > > > > problem) afaict, you could do try to get the fd out via CLONE_FILES and > > > > other means so something like: > > > > > > > > // let's pretend the libc wrapper for clone actually has sane semantics > > > > pid = clone(CLONE_FILES); > > > > if (pid == 0) { > > > > fd = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_NEW_LISTENER, &prog); > > > > > > > > // Now this fd will be valid in both parent and child. > > > > // If you haven't blocked it you can inform the parent what > > > > // the fd number is via pipe2(). If you have blocked it you can > > > > // use dup2() and dup to a known fd number. > > > > } > > > > > > > > > > > > > > v2: fix a bug where listener mode was not unset when an unused fd was not > > > > > available > > > > > v3: fix refcounting bug (Oleg) > > > > > v4: * change the listener's fd flags to be 0 > > > > > * rename GET_LISTENER to NEW_LISTENER (Matthew) > > > > > v5: * add capable(CAP_SYS_ADMIN) requirement > > > > > v7: * point the new listener at the right filter (Jann) > > > > > > > > > > Signed-off-by: Tycho Andersen <tycho@tycho.ws> > > > > > CC: Kees Cook <keescook@chromium.org> > > > > > CC: Andy Lutomirski <luto@amacapital.net> > > > > > CC: Oleg Nesterov <oleg@redhat.com> > > > > > CC: Eric W. Biederman <ebiederm@xmission.com> > > > > > CC: "Serge E. Hallyn" <serge@hallyn.com> > > > > > CC: Christian Brauner <christian.brauner@ubuntu.com> > > > > > CC: Tyler Hicks <tyhicks@canonical.com> > > > > > CC: Akihiro Suda <suda.akihiro@lab.ntt.co.jp> > > > > > --- > > > > > include/linux/seccomp.h | 7 ++ > > > > > include/uapi/linux/ptrace.h | 2 + > > > > > kernel/ptrace.c | 4 ++ > > > > > kernel/seccomp.c | 31 +++++++++ > > > > > tools/testing/selftests/seccomp/seccomp_bpf.c | 68 +++++++++++++++++++ > > > > > 5 files changed, 112 insertions(+) > > > > > > > > > > diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h > > > > > index 017444b5efed..234c61b37405 100644 > > > > > --- a/include/linux/seccomp.h > > > > > +++ b/include/linux/seccomp.h > > > > > @@ -83,6 +83,8 @@ static inline int seccomp_mode(struct seccomp *s) > > > > > #ifdef CONFIG_SECCOMP_FILTER > > > > > extern void put_seccomp_filter(struct task_struct *tsk); > > > > > extern void get_seccomp_filter(struct task_struct *tsk); > > > > > +extern long seccomp_new_listener(struct task_struct *task, > > > > > + unsigned long filter_off); > > > > > #else /* CONFIG_SECCOMP_FILTER */ > > > > > static inline void put_seccomp_filter(struct task_struct *tsk) > > > > > { > > > > > @@ -92,6 +94,11 @@ static inline void get_seccomp_filter(struct task_struct *tsk) > > > > > { > > > > > return; > > > > > } > > > > > +static inline long seccomp_new_listener(struct task_struct *task, > > > > > + unsigned long filter_off) > > > > > +{ > > > > > + return -EINVAL; > > > > > +} > > > > > #endif /* CONFIG_SECCOMP_FILTER */ > > > > > > > > > > #if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE) > > > > > diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h > > > > > index d5a1b8a492b9..e80ecb1bd427 100644 > > > > > --- a/include/uapi/linux/ptrace.h > > > > > +++ b/include/uapi/linux/ptrace.h > > > > > @@ -73,6 +73,8 @@ struct seccomp_metadata { > > > > > __u64 flags; /* Output: filter's flags */ > > > > > }; > > > > > > > > > > +#define PTRACE_SECCOMP_NEW_LISTENER 0x420e > > > > > + > > > > > /* Read signals from a shared (process wide) queue */ > > > > > #define PTRACE_PEEKSIGINFO_SHARED (1 << 0) > > > > > > > > > > diff --git a/kernel/ptrace.c b/kernel/ptrace.c > > > > > index 21fec73d45d4..289960ac181b 100644 > > > > > --- a/kernel/ptrace.c > > > > > +++ b/kernel/ptrace.c > > > > > @@ -1096,6 +1096,10 @@ int ptrace_request(struct task_struct *child, long request, > > > > > ret = seccomp_get_metadata(child, addr, datavp); > > > > > break; > > > > > > > > > > + case PTRACE_SECCOMP_NEW_LISTENER: > > > > > + ret = seccomp_new_listener(child, addr); > > > > > + break; > > > > > + > > > > > default: > > > > > break; > > > > > } > > > > > diff --git a/kernel/seccomp.c b/kernel/seccomp.c > > > > > index 44a31ac8373a..17685803a2af 100644 > > > > > --- a/kernel/seccomp.c > > > > > +++ b/kernel/seccomp.c > > > > > @@ -1777,4 +1777,35 @@ static struct file *init_listener(struct task_struct *task, > > > > > > > > > > return ret; > > > > > } > > > > > + > > > > > +long seccomp_new_listener(struct task_struct *task, > > > > > + unsigned long filter_off) > > > > > +{ > > > > > + struct seccomp_filter *filter; > > > > > + struct file *listener; > > > > > + int fd; > > > > > + > > > > > + if (!capable(CAP_SYS_ADMIN)) > > > > > + return -EACCES; > > > > > > > > I know this might have been discussed a while back but why exactly do we > > > > require CAP_SYS_ADMIN in init_userns and not in the target userns? What > > > > if I want to do a setns()fd, CLONE_NEWUSER) to the target process and > > > > use ptrace from in there? > > > > > > See https://lore.kernel.org/lkml/CAG48ez3R+ZJ1vwGkDfGzKX2mz6f=jjJWsO5pCvnH68P+RKO8Ow@mail.gmail.com/ > > > . Basically, the problem is that this doesn't just give you capability > > > over the target task, but also over every other task that has the same > > > filter installed; you need some sort of "is the caller capable over > > > the filter and anyone who uses it" check. > > > > Thanks. > > But then this new ptrace feature as it stands is imho currently broken. > > If you can install a seccomp filter with SECCOMP_RET_USER_NOTIF if you > > are ns_cpabable(CAP_SYS_ADMIN) and also get an fd via seccomp() itself > > if you are ns_cpabable(CAP_SYS_ADMIN) then either the new ptrace() api > > extension should be fixed to allow for this too or the seccomp() way of > > retrieving the pid - which I really think we want - needs to be fixed to > > require capable(CAP_SYS_ADMIN) too. > > The solution where both require ns_capable(CAP_SYS_ADMIN) is - imho - > > the preferred way to solve this. > > Everything else will just be confusing. > > First you say "broken", then you say "confusing". Which one do you mean? Both. It's broken in so far as it places a seemingly unnecessary restriction that could be fixed. You outlined one possible fix yourself in the link you provided. And it's confusing in so far as there is a way via seccomp() to get the fd without said requirement. > > Regarding requiring ns_capable() for ptrace: That means that you'll > have to stash namespace information in the seccomp filter. You'd also > potentially be eliding the LSM check that would normally have to occur > between the tracer and the tracee; but I guess that's probably fine? > CAP_SYS_ADMIN in the init namespace already has some abilities that > LSMs can't observe; you could argue that CAP_SYS_ADMIN in another > namespace should have similar semantics, but I'm not sure whether that > matches what the LSM people want as semantics.
On Mon, Oct 08, 2018 at 12:00:43PM -0600, Tycho Andersen wrote: > On Mon, Oct 08, 2018 at 05:16:30PM +0200, Christian Brauner wrote: > > On Thu, Sep 27, 2018 at 09:11:16AM -0600, Tycho Andersen wrote: > > > As an alternative to SECCOMP_FILTER_FLAG_GET_LISTENER, perhaps a ptrace() > > > version which can acquire filters is useful. There are at least two reasons > > > this is preferable, even though it uses ptrace: > > > > > > 1. You can control tasks that aren't cooperating with you > > > 2. You can control tasks whose filters block sendmsg() and socket(); if the > > > task installs a filter which blocks these calls, there's no way with > > > SECCOMP_FILTER_FLAG_GET_LISTENER to get the fd out to the privileged task. > > > > So for the slow of mind aka me: > > I'm not sure I completely understand this problem. Can you outline how > > sendmsg() and socket() are involved in this? > > > > I'm also not sure that this holds (but I might misunderstand the > > problem) afaict, you could do try to get the fd out via CLONE_FILES and > > other means so something like: > > > > // let's pretend the libc wrapper for clone actually has sane semantics > > pid = clone(CLONE_FILES); > > if (pid == 0) { > > fd = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_NEW_LISTENER, &prog); > > > > // Now this fd will be valid in both parent and child. > > // If you haven't blocked it you can inform the parent what > > // the fd number is via pipe2(). If you have blocked it you can > > // use dup2() and dup to a known fd number. > > } > > But what if your seccomp filter wants to block both pipe2() and > dup2()? Whatever syscall you want to use to do this could be blocked (Fwiw, setup shared memory before the clone(CLONE_FILES) and write the fd in the shared memory. :)) > by some seccomp policy, which means you might not be able to use this > feature in some cases. > > Perhaps it's unlikely, and we can just go forward knowing this. But it > seems like it is worth at least acknowledging that you can wedge > yourself into a corner. Sure, if you try really really hard to shoot yourself in the foot you'll always be able to. From the top of my hat I'd say you can at least probably filter the seccomp() syscall with the listener argument. Once you've loaded the policy you're out of luck. You also might be seccomp confided and not be able to use the ptrace() syscall. AppArmor might prevent you from using ptrace()ing etc. pp. So I think we really want both ways but the seccomp interface is way cleaner. To get the fd from ptrace() you need three syscalls. With seccomp() you need one.
On Mon, Oct 8, 2018 at 8:18 PM Christian Brauner <christian@brauner.io> wrote: > On Mon, Oct 08, 2018 at 06:42:00PM +0200, Jann Horn wrote: > > On Mon, Oct 8, 2018 at 6:21 PM Christian Brauner <christian@brauner.io> wrote: > > > On Mon, Oct 08, 2018 at 05:33:22PM +0200, Jann Horn wrote: > > > > On Mon, Oct 8, 2018 at 5:16 PM Christian Brauner <christian@brauner.io> wrote: > > > > > On Thu, Sep 27, 2018 at 09:11:16AM -0600, Tycho Andersen wrote: > > > > > > diff --git a/kernel/seccomp.c b/kernel/seccomp.c > > > > > > index 44a31ac8373a..17685803a2af 100644 > > > > > > --- a/kernel/seccomp.c > > > > > > +++ b/kernel/seccomp.c > > > > > > @@ -1777,4 +1777,35 @@ static struct file *init_listener(struct task_struct *task, > > > > > > > > > > > > return ret; > > > > > > } > > > > > > + > > > > > > +long seccomp_new_listener(struct task_struct *task, > > > > > > + unsigned long filter_off) > > > > > > +{ > > > > > > + struct seccomp_filter *filter; > > > > > > + struct file *listener; > > > > > > + int fd; > > > > > > + > > > > > > + if (!capable(CAP_SYS_ADMIN)) > > > > > > + return -EACCES; > > > > > > > > > > I know this might have been discussed a while back but why exactly do we > > > > > require CAP_SYS_ADMIN in init_userns and not in the target userns? What > > > > > if I want to do a setns()fd, CLONE_NEWUSER) to the target process and > > > > > use ptrace from in there? > > > > > > > > See https://lore.kernel.org/lkml/CAG48ez3R+ZJ1vwGkDfGzKX2mz6f=jjJWsO5pCvnH68P+RKO8Ow@mail.gmail.com/ > > > > . Basically, the problem is that this doesn't just give you capability > > > > over the target task, but also over every other task that has the same > > > > filter installed; you need some sort of "is the caller capable over > > > > the filter and anyone who uses it" check. > > > > > > Thanks. > > > But then this new ptrace feature as it stands is imho currently broken. > > > If you can install a seccomp filter with SECCOMP_RET_USER_NOTIF if you > > > are ns_cpabable(CAP_SYS_ADMIN) and also get an fd via seccomp() itself > > > if you are ns_cpabable(CAP_SYS_ADMIN) Actually, you don't need CAP_SYS_ADMIN for seccomp() at all as long as you enable the NNP flag, I think? > > > then either the new ptrace() api > > > extension should be fixed to allow for this too or the seccomp() way of > > > retrieving the pid - which I really think we want - needs to be fixed to > > > require capable(CAP_SYS_ADMIN) too. > > > The solution where both require ns_capable(CAP_SYS_ADMIN) is - imho - > > > the preferred way to solve this. > > > Everything else will just be confusing. > > > > First you say "broken", then you say "confusing". Which one do you mean? > > Both. It's broken in so far as it places a seemingly unnecessary > restriction that could be fixed. You outlined one possible fix yourself > in the link you provided. If by "possible fix" you mean "check whether the seccomp filter is only attached to a single task": That wouldn't fundamentally change the situation, it would only add an additional special case. > And it's confusing in so far as there is a way > via seccomp() to get the fd without said requirement. I don't find it confusing at all. seccomp() and ptrace() are very different situations: When you use seccomp(), infrastructure is already in place for ensuring that your filter is only applied to processes over which you are capable, and propagation is limited by inheritance from your task down. When you use ptrace(), you need a pretty different sort of access check that checks whether you're privileged over ancestors, siblings and so on of the target task. But thinking about it more, I think that CAP_SYS_ADMIN over the saved current->mm->user_ns of the task that installed the filter (stored as a "struct user_namespace *" in the filter) should be acceptable.
On Tue, Oct 09, 2018 at 02:39:53PM +0200, Jann Horn wrote: > On Mon, Oct 8, 2018 at 8:18 PM Christian Brauner <christian@brauner.io> wrote: > > On Mon, Oct 08, 2018 at 06:42:00PM +0200, Jann Horn wrote: > > > On Mon, Oct 8, 2018 at 6:21 PM Christian Brauner <christian@brauner.io> wrote: > > > > On Mon, Oct 08, 2018 at 05:33:22PM +0200, Jann Horn wrote: > > > > > On Mon, Oct 8, 2018 at 5:16 PM Christian Brauner <christian@brauner.io> wrote: > > > > > > On Thu, Sep 27, 2018 at 09:11:16AM -0600, Tycho Andersen wrote: > > > > > > > diff --git a/kernel/seccomp.c b/kernel/seccomp.c > > > > > > > index 44a31ac8373a..17685803a2af 100644 > > > > > > > --- a/kernel/seccomp.c > > > > > > > +++ b/kernel/seccomp.c > > > > > > > @@ -1777,4 +1777,35 @@ static struct file *init_listener(struct task_struct *task, > > > > > > > > > > > > > > return ret; > > > > > > > } > > > > > > > + > > > > > > > +long seccomp_new_listener(struct task_struct *task, > > > > > > > + unsigned long filter_off) > > > > > > > +{ > > > > > > > + struct seccomp_filter *filter; > > > > > > > + struct file *listener; > > > > > > > + int fd; > > > > > > > + > > > > > > > + if (!capable(CAP_SYS_ADMIN)) > > > > > > > + return -EACCES; > > > > > > > > > > > > I know this might have been discussed a while back but why exactly do we > > > > > > require CAP_SYS_ADMIN in init_userns and not in the target userns? What > > > > > > if I want to do a setns()fd, CLONE_NEWUSER) to the target process and > > > > > > use ptrace from in there? > > > > > > > > > > See https://lore.kernel.org/lkml/CAG48ez3R+ZJ1vwGkDfGzKX2mz6f=jjJWsO5pCvnH68P+RKO8Ow@mail.gmail.com/ > > > > > . Basically, the problem is that this doesn't just give you capability > > > > > over the target task, but also over every other task that has the same > > > > > filter installed; you need some sort of "is the caller capable over > > > > > the filter and anyone who uses it" check. > > > > > > > > Thanks. > > > > But then this new ptrace feature as it stands is imho currently broken. > > > > If you can install a seccomp filter with SECCOMP_RET_USER_NOTIF if you > > > > are ns_cpabable(CAP_SYS_ADMIN) and also get an fd via seccomp() itself > > > > if you are ns_cpabable(CAP_SYS_ADMIN) > > Actually, you don't need CAP_SYS_ADMIN for seccomp() at all as long as > you enable the NNP flag, I think? Yes, if you turn on NNP you don't even need sys_admin. > > > > > then either the new ptrace() api > > > > extension should be fixed to allow for this too or the seccomp() way of > > > > retrieving the pid - which I really think we want - needs to be fixed to > > > > require capable(CAP_SYS_ADMIN) too. > > > > The solution where both require ns_capable(CAP_SYS_ADMIN) is - imho - > > > > the preferred way to solve this. > > > > Everything else will just be confusing. > > > > > > First you say "broken", then you say "confusing". Which one do you mean? > > > > Both. It's broken in so far as it places a seemingly unnecessary > > restriction that could be fixed. You outlined one possible fix yourself > > in the link you provided. > > If by "possible fix" you mean "check whether the seccomp filter is > only attached to a single task": That wouldn't fundamentally change > the situation, it would only add an additional special case. > > > And it's confusing in so far as there is a way > > via seccomp() to get the fd without said requirement. > > I don't find it confusing at all. seccomp() and ptrace() are very Fine, then that's a matter of opinion. I find it counterintuitive that you can get an fd without privileges via one interface but not via another. > different situations: When you use seccomp(), infrastructure is Sure. Note, that this is _one_ of the reasons why I want to make sure we keep the native seccomp() only based way of getting an fd without forcing userspace to switching to a differnet kernel api. > already in place for ensuring that your filter is only applied to > processes over which you are capable, and propagation is limited by > inheritance from your task down. When you use ptrace(), you need a > pretty different sort of access check that checks whether you're > privileged over ancestors, siblings and so on of the target task. So, don't get me wrong I'm not arguing against the ptrace() interface in general. If this is something that people find useful, fine. But, I would like to have a simple single-syscall pure-seccomp() based way of getting an fd, i.e. what we have in patch 1 of this series. > > But thinking about it more, I think that CAP_SYS_ADMIN over the saved > current->mm->user_ns of the task that installed the filter (stored as > a "struct user_namespace *" in the filter) should be acceptable. Hm... Why not CAP_SYS_PTRACE? One more thing. Citing from [1] > I think there's a security problem here. Imagine the following scenario: > > 1. task A (uid==0) sets up a seccomp filter that uses SECCOMP_RET_USER_NOTIF > 2. task A forks off a child B > 3. task B uses setuid(1) to drop its privileges > 4. task B becomes dumpable again, either via prctl(PR_SET_DUMPABLE, 1) > or via execve() > 5. task C (the attacker, uid==1) attaches to task B via ptrace > 6. task C uses PTRACE_SECCOMP_NEW_LISTENER on task B Sorry, to be late to the party but would this really pass __ptrace_may_access() in ptrace_attach()? It doesn't seem obvious to me that it would... Doesn't look like it would get past: tcred = __task_cred(task); if (uid_eq(caller_uid, tcred->euid) && uid_eq(caller_uid, tcred->suid) && uid_eq(caller_uid, tcred->uid) && gid_eq(caller_gid, tcred->egid) && gid_eq(caller_gid, tcred->sgid) && gid_eq(caller_gid, tcred->gid)) goto ok; if (ptrace_has_cap(tcred->user_ns, mode)) goto ok; rcu_read_unlock(); return -EPERM; ok: rcu_read_unlock(); mm = task->mm; if (mm && ((get_dumpable(mm) != SUID_DUMP_USER) && !ptrace_has_cap(mm->user_ns, mode))) return -EPERM; > 7. because the seccomp filter is shared by task A and task B, task C > is now able to influence syscall results for syscalls performed by > task A [1]: https://lore.kernel.org/lkml/CAG48ez3R+ZJ1vwGkDfGzKX2mz6f=jjJWsO5pCvnH68P+RKO8Ow@mail.gmail.com/
+cc selinux people explicitly, since they probably have opinions on this On Tue, Oct 9, 2018 at 3:29 PM Christian Brauner <christian@brauner.io> wrote: > On Tue, Oct 09, 2018 at 02:39:53PM +0200, Jann Horn wrote: > > On Mon, Oct 8, 2018 at 8:18 PM Christian Brauner <christian@brauner.io> wrote: > > > On Mon, Oct 08, 2018 at 06:42:00PM +0200, Jann Horn wrote: > > > > On Mon, Oct 8, 2018 at 6:21 PM Christian Brauner <christian@brauner.io> wrote: > > > > > On Mon, Oct 08, 2018 at 05:33:22PM +0200, Jann Horn wrote: > > > > > > On Mon, Oct 8, 2018 at 5:16 PM Christian Brauner <christian@brauner.io> wrote: > > > > > > > On Thu, Sep 27, 2018 at 09:11:16AM -0600, Tycho Andersen wrote: > > > > > > > > diff --git a/kernel/seccomp.c b/kernel/seccomp.c > > > > > > > > index 44a31ac8373a..17685803a2af 100644 > > > > > > > > --- a/kernel/seccomp.c > > > > > > > > +++ b/kernel/seccomp.c > > > > > > > > @@ -1777,4 +1777,35 @@ static struct file *init_listener(struct task_struct *task, > > > > > > > > > > > > > > > > return ret; > > > > > > > > } > > > > > > > > + > > > > > > > > +long seccomp_new_listener(struct task_struct *task, > > > > > > > > + unsigned long filter_off) > > > > > > > > +{ > > > > > > > > + struct seccomp_filter *filter; > > > > > > > > + struct file *listener; > > > > > > > > + int fd; > > > > > > > > + > > > > > > > > + if (!capable(CAP_SYS_ADMIN)) > > > > > > > > + return -EACCES; > > > > > > > > > > > > > > I know this might have been discussed a while back but why exactly do we > > > > > > > require CAP_SYS_ADMIN in init_userns and not in the target userns? What > > > > > > > if I want to do a setns()fd, CLONE_NEWUSER) to the target process and > > > > > > > use ptrace from in there? > > > > > > > > > > > > See https://lore.kernel.org/lkml/CAG48ez3R+ZJ1vwGkDfGzKX2mz6f=jjJWsO5pCvnH68P+RKO8Ow@mail.gmail.com/ > > > > > > . Basically, the problem is that this doesn't just give you capability > > > > > > over the target task, but also over every other task that has the same > > > > > > filter installed; you need some sort of "is the caller capable over > > > > > > the filter and anyone who uses it" check. > > > > > > > > > > Thanks. > > > > > But then this new ptrace feature as it stands is imho currently broken. > > > > > If you can install a seccomp filter with SECCOMP_RET_USER_NOTIF if you > > > > > are ns_cpabable(CAP_SYS_ADMIN) and also get an fd via seccomp() itself > > > > > if you are ns_cpabable(CAP_SYS_ADMIN) > > > > Actually, you don't need CAP_SYS_ADMIN for seccomp() at all as long as > > you enable the NNP flag, I think? > > Yes, if you turn on NNP you don't even need sys_admin. > > > > > > > > then either the new ptrace() api > > > > > extension should be fixed to allow for this too or the seccomp() way of > > > > > retrieving the pid - which I really think we want - needs to be fixed to > > > > > require capable(CAP_SYS_ADMIN) too. > > > > > The solution where both require ns_capable(CAP_SYS_ADMIN) is - imho - > > > > > the preferred way to solve this. > > > > > Everything else will just be confusing. > > > > > > > > First you say "broken", then you say "confusing". Which one do you mean? > > > > > > Both. It's broken in so far as it places a seemingly unnecessary > > > restriction that could be fixed. You outlined one possible fix yourself > > > in the link you provided. > > > > If by "possible fix" you mean "check whether the seccomp filter is > > only attached to a single task": That wouldn't fundamentally change > > the situation, it would only add an additional special case. > > > > > And it's confusing in so far as there is a way > > > via seccomp() to get the fd without said requirement. > > > > I don't find it confusing at all. seccomp() and ptrace() are very > > Fine, then that's a matter of opinion. I find it counterintuitive that > you can get an fd without privileges via one interface but not via > another. > > > different situations: When you use seccomp(), infrastructure is > > Sure. Note, that this is _one_ of the reasons why I want to make sure we > keep the native seccomp() only based way of getting an fd without > forcing userspace to switching to a differnet kernel api. > > > already in place for ensuring that your filter is only applied to > > processes over which you are capable, and propagation is limited by > > inheritance from your task down. When you use ptrace(), you need a > > pretty different sort of access check that checks whether you're > > privileged over ancestors, siblings and so on of the target task. > > So, don't get me wrong I'm not arguing against the ptrace() interface in > general. If this is something that people find useful, fine. But, I > would like to have a simple single-syscall pure-seccomp() based way of > getting an fd, i.e. what we have in patch 1 of this series. Yeah, I also prefer the seccomp() one. > > But thinking about it more, I think that CAP_SYS_ADMIN over the saved > > current->mm->user_ns of the task that installed the filter (stored as > > a "struct user_namespace *" in the filter) should be acceptable. > > Hm... Why not CAP_SYS_PTRACE? Because LSMs like SELinux add extra checks that apply even if you have CAP_SYS_PTRACE, and this would subvert those. The only capability I know of that lets you bypass LSM checks by design (if no LSM blocks the capability itself) is CAP_SYS_ADMIN. > One more thing. Citing from [1] > > > I think there's a security problem here. Imagine the following scenario: > > > > 1. task A (uid==0) sets up a seccomp filter that uses SECCOMP_RET_USER_NOTIF > > 2. task A forks off a child B > > 3. task B uses setuid(1) to drop its privileges > > 4. task B becomes dumpable again, either via prctl(PR_SET_DUMPABLE, 1) > > or via execve() > > 5. task C (the attacker, uid==1) attaches to task B via ptrace > > 6. task C uses PTRACE_SECCOMP_NEW_LISTENER on task B > > Sorry, to be late to the party but would this really pass > __ptrace_may_access() in ptrace_attach()? It doesn't seem obvious to me > that it would... Doesn't look like it would get past: > > tcred = __task_cred(task); > if (uid_eq(caller_uid, tcred->euid) && > uid_eq(caller_uid, tcred->suid) && > uid_eq(caller_uid, tcred->uid) && > gid_eq(caller_gid, tcred->egid) && > gid_eq(caller_gid, tcred->sgid) && > gid_eq(caller_gid, tcred->gid)) > goto ok; > if (ptrace_has_cap(tcred->user_ns, mode)) > goto ok; > rcu_read_unlock(); > return -EPERM; > ok: > rcu_read_unlock(); > mm = task->mm; > if (mm && > ((get_dumpable(mm) != SUID_DUMP_USER) && > !ptrace_has_cap(mm->user_ns, mode))) > return -EPERM; Which specific check would prevent task C from attaching to task B? If the UIDs match, the first "goto ok" executes; and you're dumpable, so you don't trigger the second "return -EPERM". > > 7. because the seccomp filter is shared by task A and task B, task C > > is now able to influence syscall results for syscalls performed by > > task A > > [1]: https://lore.kernel.org/lkml/CAG48ez3R+ZJ1vwGkDfGzKX2mz6f=jjJWsO5pCvnH68P+RKO8Ow@mail.gmail.com/
On Tue, Oct 09, 2018 at 03:36:04PM +0200, Jann Horn wrote: > +cc selinux people explicitly, since they probably have opinions on this > > On Tue, Oct 9, 2018 at 3:29 PM Christian Brauner <christian@brauner.io> wrote: > > On Tue, Oct 09, 2018 at 02:39:53PM +0200, Jann Horn wrote: > > > On Mon, Oct 8, 2018 at 8:18 PM Christian Brauner <christian@brauner.io> wrote: > > > > On Mon, Oct 08, 2018 at 06:42:00PM +0200, Jann Horn wrote: > > > > > On Mon, Oct 8, 2018 at 6:21 PM Christian Brauner <christian@brauner.io> wrote: > > > > > > On Mon, Oct 08, 2018 at 05:33:22PM +0200, Jann Horn wrote: > > > > > > > On Mon, Oct 8, 2018 at 5:16 PM Christian Brauner <christian@brauner.io> wrote: > > > > > > > > On Thu, Sep 27, 2018 at 09:11:16AM -0600, Tycho Andersen wrote: > > > > > > > > > diff --git a/kernel/seccomp.c b/kernel/seccomp.c > > > > > > > > > index 44a31ac8373a..17685803a2af 100644 > > > > > > > > > --- a/kernel/seccomp.c > > > > > > > > > +++ b/kernel/seccomp.c > > > > > > > > > @@ -1777,4 +1777,35 @@ static struct file *init_listener(struct task_struct *task, > > > > > > > > > > > > > > > > > > return ret; > > > > > > > > > } > > > > > > > > > + > > > > > > > > > +long seccomp_new_listener(struct task_struct *task, > > > > > > > > > + unsigned long filter_off) > > > > > > > > > +{ > > > > > > > > > + struct seccomp_filter *filter; > > > > > > > > > + struct file *listener; > > > > > > > > > + int fd; > > > > > > > > > + > > > > > > > > > + if (!capable(CAP_SYS_ADMIN)) > > > > > > > > > + return -EACCES; > > > > > > > > > > > > > > > > I know this might have been discussed a while back but why exactly do we > > > > > > > > require CAP_SYS_ADMIN in init_userns and not in the target userns? What > > > > > > > > if I want to do a setns()fd, CLONE_NEWUSER) to the target process and > > > > > > > > use ptrace from in there? > > > > > > > > > > > > > > See https://lore.kernel.org/lkml/CAG48ez3R+ZJ1vwGkDfGzKX2mz6f=jjJWsO5pCvnH68P+RKO8Ow@mail.gmail.com/ > > > > > > > . Basically, the problem is that this doesn't just give you capability > > > > > > > over the target task, but also over every other task that has the same > > > > > > > filter installed; you need some sort of "is the caller capable over > > > > > > > the filter and anyone who uses it" check. > > > > > > > > > > > > Thanks. > > > > > > But then this new ptrace feature as it stands is imho currently broken. > > > > > > If you can install a seccomp filter with SECCOMP_RET_USER_NOTIF if you > > > > > > are ns_cpabable(CAP_SYS_ADMIN) and also get an fd via seccomp() itself > > > > > > if you are ns_cpabable(CAP_SYS_ADMIN) > > > > > > Actually, you don't need CAP_SYS_ADMIN for seccomp() at all as long as > > > you enable the NNP flag, I think? > > > > Yes, if you turn on NNP you don't even need sys_admin. > > > > > > > > > > > then either the new ptrace() api > > > > > > extension should be fixed to allow for this too or the seccomp() way of > > > > > > retrieving the pid - which I really think we want - needs to be fixed to > > > > > > require capable(CAP_SYS_ADMIN) too. > > > > > > The solution where both require ns_capable(CAP_SYS_ADMIN) is - imho - > > > > > > the preferred way to solve this. > > > > > > Everything else will just be confusing. > > > > > > > > > > First you say "broken", then you say "confusing". Which one do you mean? > > > > > > > > Both. It's broken in so far as it places a seemingly unnecessary > > > > restriction that could be fixed. You outlined one possible fix yourself > > > > in the link you provided. > > > > > > If by "possible fix" you mean "check whether the seccomp filter is > > > only attached to a single task": That wouldn't fundamentally change > > > the situation, it would only add an additional special case. > > > > > > > And it's confusing in so far as there is a way > > > > via seccomp() to get the fd without said requirement. > > > > > > I don't find it confusing at all. seccomp() and ptrace() are very > > > > Fine, then that's a matter of opinion. I find it counterintuitive that > > you can get an fd without privileges via one interface but not via > > another. > > > > > different situations: When you use seccomp(), infrastructure is > > > > Sure. Note, that this is _one_ of the reasons why I want to make sure we > > keep the native seccomp() only based way of getting an fd without > > forcing userspace to switching to a differnet kernel api. > > > > > already in place for ensuring that your filter is only applied to > > > processes over which you are capable, and propagation is limited by > > > inheritance from your task down. When you use ptrace(), you need a > > > pretty different sort of access check that checks whether you're > > > privileged over ancestors, siblings and so on of the target task. > > > > So, don't get me wrong I'm not arguing against the ptrace() interface in > > general. If this is something that people find useful, fine. But, I > > would like to have a simple single-syscall pure-seccomp() based way of > > getting an fd, i.e. what we have in patch 1 of this series. > > Yeah, I also prefer the seccomp() one. > > > > But thinking about it more, I think that CAP_SYS_ADMIN over the saved > > > current->mm->user_ns of the task that installed the filter (stored as > > > a "struct user_namespace *" in the filter) should be acceptable. > > > > Hm... Why not CAP_SYS_PTRACE? > > Because LSMs like SELinux add extra checks that apply even if you have > CAP_SYS_PTRACE, and this would subvert those. The only capability I > know of that lets you bypass LSM checks by design (if no LSM blocks > the capability itself) is CAP_SYS_ADMIN. > > > One more thing. Citing from [1] > > > > > I think there's a security problem here. Imagine the following scenario: > > > > > > 1. task A (uid==0) sets up a seccomp filter that uses SECCOMP_RET_USER_NOTIF > > > 2. task A forks off a child B > > > 3. task B uses setuid(1) to drop its privileges > > > 4. task B becomes dumpable again, either via prctl(PR_SET_DUMPABLE, 1) > > > or via execve() > > > 5. task C (the attacker, uid==1) attaches to task B via ptrace > > > 6. task C uses PTRACE_SECCOMP_NEW_LISTENER on task B > > > > Sorry, to be late to the party but would this really pass > > __ptrace_may_access() in ptrace_attach()? It doesn't seem obvious to me > > that it would... Doesn't look like it would get past: > > > > tcred = __task_cred(task); > > if (uid_eq(caller_uid, tcred->euid) && > > uid_eq(caller_uid, tcred->suid) && > > uid_eq(caller_uid, tcred->uid) && > > gid_eq(caller_gid, tcred->egid) && > > gid_eq(caller_gid, tcred->sgid) && > > gid_eq(caller_gid, tcred->gid)) > > goto ok; > > if (ptrace_has_cap(tcred->user_ns, mode)) > > goto ok; > > rcu_read_unlock(); > > return -EPERM; > > ok: > > rcu_read_unlock(); > > mm = task->mm; > > if (mm && > > ((get_dumpable(mm) != SUID_DUMP_USER) && > > !ptrace_has_cap(mm->user_ns, mode))) > > return -EPERM; > > Which specific check would prevent task C from attaching to task B? If > the UIDs match, the first "goto ok" executes; and you're dumpable, so > you don't trigger the second "return -EPERM". You'd also need CAP_SYS_PTRACE in the mm->user_ns which you shouldn't have if you did a setuid to an unpriv user. (But I always find that code confusing.) > > > > 7. because the seccomp filter is shared by task A and task B, task C > > > is now able to influence syscall results for syscalls performed by > > > task A > > > > [1]: https://lore.kernel.org/lkml/CAG48ez3R+ZJ1vwGkDfGzKX2mz6f=jjJWsO5pCvnH68P+RKO8Ow@mail.gmail.com/
On Tue, Oct 9, 2018 at 3:49 PM Christian Brauner <christian@brauner.io> wrote: > > On Tue, Oct 09, 2018 at 03:36:04PM +0200, Jann Horn wrote: > > +cc selinux people explicitly, since they probably have opinions on this > > > > On Tue, Oct 9, 2018 at 3:29 PM Christian Brauner <christian@brauner.io> wrote: > > > On Tue, Oct 09, 2018 at 02:39:53PM +0200, Jann Horn wrote: > > > > On Mon, Oct 8, 2018 at 8:18 PM Christian Brauner <christian@brauner.io> wrote: > > > > > On Mon, Oct 08, 2018 at 06:42:00PM +0200, Jann Horn wrote: > > > > > > On Mon, Oct 8, 2018 at 6:21 PM Christian Brauner <christian@brauner.io> wrote: > > > > > > > On Mon, Oct 08, 2018 at 05:33:22PM +0200, Jann Horn wrote: > > > > > > > > On Mon, Oct 8, 2018 at 5:16 PM Christian Brauner <christian@brauner.io> wrote: > > > > > > > > > On Thu, Sep 27, 2018 at 09:11:16AM -0600, Tycho Andersen wrote: > > > > > > > > > > diff --git a/kernel/seccomp.c b/kernel/seccomp.c > > > > > > > > > > index 44a31ac8373a..17685803a2af 100644 > > > > > > > > > > --- a/kernel/seccomp.c > > > > > > > > > > +++ b/kernel/seccomp.c > > > > > > > > > > @@ -1777,4 +1777,35 @@ static struct file *init_listener(struct task_struct *task, > > > > > > > > > > > > > > > > > > > > return ret; > > > > > > > > > > } > > > > > > > > > > + > > > > > > > > > > +long seccomp_new_listener(struct task_struct *task, > > > > > > > > > > + unsigned long filter_off) > > > > > > > > > > +{ > > > > > > > > > > + struct seccomp_filter *filter; > > > > > > > > > > + struct file *listener; > > > > > > > > > > + int fd; > > > > > > > > > > + > > > > > > > > > > + if (!capable(CAP_SYS_ADMIN)) > > > > > > > > > > + return -EACCES; > > > > > > > > > > > > > > > > > > I know this might have been discussed a while back but why exactly do we > > > > > > > > > require CAP_SYS_ADMIN in init_userns and not in the target userns? What > > > > > > > > > if I want to do a setns()fd, CLONE_NEWUSER) to the target process and > > > > > > > > > use ptrace from in there? > > > > > > > > > > > > > > > > See https://lore.kernel.org/lkml/CAG48ez3R+ZJ1vwGkDfGzKX2mz6f=jjJWsO5pCvnH68P+RKO8Ow@mail.gmail.com/ > > > > > > > > . Basically, the problem is that this doesn't just give you capability > > > > > > > > over the target task, but also over every other task that has the same > > > > > > > > filter installed; you need some sort of "is the caller capable over > > > > > > > > the filter and anyone who uses it" check. > > > > > > > > > > > > > > Thanks. > > > > > > > But then this new ptrace feature as it stands is imho currently broken. > > > > > > > If you can install a seccomp filter with SECCOMP_RET_USER_NOTIF if you > > > > > > > are ns_cpabable(CAP_SYS_ADMIN) and also get an fd via seccomp() itself > > > > > > > if you are ns_cpabable(CAP_SYS_ADMIN) > > > > > > > > Actually, you don't need CAP_SYS_ADMIN for seccomp() at all as long as > > > > you enable the NNP flag, I think? > > > > > > Yes, if you turn on NNP you don't even need sys_admin. > > > > > > > > > > > > > > then either the new ptrace() api > > > > > > > extension should be fixed to allow for this too or the seccomp() way of > > > > > > > retrieving the pid - which I really think we want - needs to be fixed to > > > > > > > require capable(CAP_SYS_ADMIN) too. > > > > > > > The solution where both require ns_capable(CAP_SYS_ADMIN) is - imho - > > > > > > > the preferred way to solve this. > > > > > > > Everything else will just be confusing. > > > > > > > > > > > > First you say "broken", then you say "confusing". Which one do you mean? > > > > > > > > > > Both. It's broken in so far as it places a seemingly unnecessary > > > > > restriction that could be fixed. You outlined one possible fix yourself > > > > > in the link you provided. > > > > > > > > If by "possible fix" you mean "check whether the seccomp filter is > > > > only attached to a single task": That wouldn't fundamentally change > > > > the situation, it would only add an additional special case. > > > > > > > > > And it's confusing in so far as there is a way > > > > > via seccomp() to get the fd without said requirement. > > > > > > > > I don't find it confusing at all. seccomp() and ptrace() are very > > > > > > Fine, then that's a matter of opinion. I find it counterintuitive that > > > you can get an fd without privileges via one interface but not via > > > another. > > > > > > > different situations: When you use seccomp(), infrastructure is > > > > > > Sure. Note, that this is _one_ of the reasons why I want to make sure we > > > keep the native seccomp() only based way of getting an fd without > > > forcing userspace to switching to a differnet kernel api. > > > > > > > already in place for ensuring that your filter is only applied to > > > > processes over which you are capable, and propagation is limited by > > > > inheritance from your task down. When you use ptrace(), you need a > > > > pretty different sort of access check that checks whether you're > > > > privileged over ancestors, siblings and so on of the target task. > > > > > > So, don't get me wrong I'm not arguing against the ptrace() interface in > > > general. If this is something that people find useful, fine. But, I > > > would like to have a simple single-syscall pure-seccomp() based way of > > > getting an fd, i.e. what we have in patch 1 of this series. > > > > Yeah, I also prefer the seccomp() one. > > > > > > But thinking about it more, I think that CAP_SYS_ADMIN over the saved > > > > current->mm->user_ns of the task that installed the filter (stored as > > > > a "struct user_namespace *" in the filter) should be acceptable. > > > > > > Hm... Why not CAP_SYS_PTRACE? > > > > Because LSMs like SELinux add extra checks that apply even if you have > > CAP_SYS_PTRACE, and this would subvert those. The only capability I > > know of that lets you bypass LSM checks by design (if no LSM blocks > > the capability itself) is CAP_SYS_ADMIN. > > > > > One more thing. Citing from [1] > > > > > > > I think there's a security problem here. Imagine the following scenario: > > > > > > > > 1. task A (uid==0) sets up a seccomp filter that uses SECCOMP_RET_USER_NOTIF > > > > 2. task A forks off a child B > > > > 3. task B uses setuid(1) to drop its privileges > > > > 4. task B becomes dumpable again, either via prctl(PR_SET_DUMPABLE, 1) > > > > or via execve() > > > > 5. task C (the attacker, uid==1) attaches to task B via ptrace > > > > 6. task C uses PTRACE_SECCOMP_NEW_LISTENER on task B > > > > > > Sorry, to be late to the party but would this really pass > > > __ptrace_may_access() in ptrace_attach()? It doesn't seem obvious to me > > > that it would... Doesn't look like it would get past: > > > > > > tcred = __task_cred(task); > > > if (uid_eq(caller_uid, tcred->euid) && > > > uid_eq(caller_uid, tcred->suid) && > > > uid_eq(caller_uid, tcred->uid) && > > > gid_eq(caller_gid, tcred->egid) && > > > gid_eq(caller_gid, tcred->sgid) && > > > gid_eq(caller_gid, tcred->gid)) > > > goto ok; > > > if (ptrace_has_cap(tcred->user_ns, mode)) > > > goto ok; > > > rcu_read_unlock(); > > > return -EPERM; > > > ok: > > > rcu_read_unlock(); > > > mm = task->mm; > > > if (mm && > > > ((get_dumpable(mm) != SUID_DUMP_USER) && > > > !ptrace_has_cap(mm->user_ns, mode))) > > > return -EPERM; > > > > Which specific check would prevent task C from attaching to task B? If > > the UIDs match, the first "goto ok" executes; and you're dumpable, so > > you don't trigger the second "return -EPERM". > > You'd also need CAP_SYS_PTRACE in the mm->user_ns which you shouldn't > have if you did a setuid to an unpriv user. (But I always find that code > confusing.) Only if the target hasn't gone through execve() since setuid().
On Tue, Oct 09, 2018 at 03:50:53PM +0200, Jann Horn wrote: > On Tue, Oct 9, 2018 at 3:49 PM Christian Brauner <christian@brauner.io> wrote: > > > > On Tue, Oct 09, 2018 at 03:36:04PM +0200, Jann Horn wrote: > > > +cc selinux people explicitly, since they probably have opinions on this > > > > > > On Tue, Oct 9, 2018 at 3:29 PM Christian Brauner <christian@brauner.io> wrote: > > > > On Tue, Oct 09, 2018 at 02:39:53PM +0200, Jann Horn wrote: > > > > > On Mon, Oct 8, 2018 at 8:18 PM Christian Brauner <christian@brauner.io> wrote: > > > > > > On Mon, Oct 08, 2018 at 06:42:00PM +0200, Jann Horn wrote: > > > > > > > On Mon, Oct 8, 2018 at 6:21 PM Christian Brauner <christian@brauner.io> wrote: > > > > > > > > On Mon, Oct 08, 2018 at 05:33:22PM +0200, Jann Horn wrote: > > > > > > > > > On Mon, Oct 8, 2018 at 5:16 PM Christian Brauner <christian@brauner.io> wrote: > > > > > > > > > > On Thu, Sep 27, 2018 at 09:11:16AM -0600, Tycho Andersen wrote: > > > > > > > > > > > diff --git a/kernel/seccomp.c b/kernel/seccomp.c > > > > > > > > > > > index 44a31ac8373a..17685803a2af 100644 > > > > > > > > > > > --- a/kernel/seccomp.c > > > > > > > > > > > +++ b/kernel/seccomp.c > > > > > > > > > > > @@ -1777,4 +1777,35 @@ static struct file *init_listener(struct task_struct *task, > > > > > > > > > > > > > > > > > > > > > > return ret; > > > > > > > > > > > } > > > > > > > > > > > + > > > > > > > > > > > +long seccomp_new_listener(struct task_struct *task, > > > > > > > > > > > + unsigned long filter_off) > > > > > > > > > > > +{ > > > > > > > > > > > + struct seccomp_filter *filter; > > > > > > > > > > > + struct file *listener; > > > > > > > > > > > + int fd; > > > > > > > > > > > + > > > > > > > > > > > + if (!capable(CAP_SYS_ADMIN)) > > > > > > > > > > > + return -EACCES; > > > > > > > > > > > > > > > > > > > > I know this might have been discussed a while back but why exactly do we > > > > > > > > > > require CAP_SYS_ADMIN in init_userns and not in the target userns? What > > > > > > > > > > if I want to do a setns()fd, CLONE_NEWUSER) to the target process and > > > > > > > > > > use ptrace from in there? > > > > > > > > > > > > > > > > > > See https://lore.kernel.org/lkml/CAG48ez3R+ZJ1vwGkDfGzKX2mz6f=jjJWsO5pCvnH68P+RKO8Ow@mail.gmail.com/ > > > > > > > > > . Basically, the problem is that this doesn't just give you capability > > > > > > > > > over the target task, but also over every other task that has the same > > > > > > > > > filter installed; you need some sort of "is the caller capable over > > > > > > > > > the filter and anyone who uses it" check. > > > > > > > > > > > > > > > > Thanks. > > > > > > > > But then this new ptrace feature as it stands is imho currently broken. > > > > > > > > If you can install a seccomp filter with SECCOMP_RET_USER_NOTIF if you > > > > > > > > are ns_cpabable(CAP_SYS_ADMIN) and also get an fd via seccomp() itself > > > > > > > > if you are ns_cpabable(CAP_SYS_ADMIN) > > > > > > > > > > Actually, you don't need CAP_SYS_ADMIN for seccomp() at all as long as > > > > > you enable the NNP flag, I think? > > > > > > > > Yes, if you turn on NNP you don't even need sys_admin. > > > > > > > > > > > > > > > > > then either the new ptrace() api > > > > > > > > extension should be fixed to allow for this too or the seccomp() way of > > > > > > > > retrieving the pid - which I really think we want - needs to be fixed to > > > > > > > > require capable(CAP_SYS_ADMIN) too. > > > > > > > > The solution where both require ns_capable(CAP_SYS_ADMIN) is - imho - > > > > > > > > the preferred way to solve this. > > > > > > > > Everything else will just be confusing. > > > > > > > > > > > > > > First you say "broken", then you say "confusing". Which one do you mean? > > > > > > > > > > > > Both. It's broken in so far as it places a seemingly unnecessary > > > > > > restriction that could be fixed. You outlined one possible fix yourself > > > > > > in the link you provided. > > > > > > > > > > If by "possible fix" you mean "check whether the seccomp filter is > > > > > only attached to a single task": That wouldn't fundamentally change > > > > > the situation, it would only add an additional special case. > > > > > > > > > > > And it's confusing in so far as there is a way > > > > > > via seccomp() to get the fd without said requirement. > > > > > > > > > > I don't find it confusing at all. seccomp() and ptrace() are very > > > > > > > > Fine, then that's a matter of opinion. I find it counterintuitive that > > > > you can get an fd without privileges via one interface but not via > > > > another. > > > > > > > > > different situations: When you use seccomp(), infrastructure is > > > > > > > > Sure. Note, that this is _one_ of the reasons why I want to make sure we > > > > keep the native seccomp() only based way of getting an fd without > > > > forcing userspace to switching to a differnet kernel api. > > > > > > > > > already in place for ensuring that your filter is only applied to > > > > > processes over which you are capable, and propagation is limited by > > > > > inheritance from your task down. When you use ptrace(), you need a > > > > > pretty different sort of access check that checks whether you're > > > > > privileged over ancestors, siblings and so on of the target task. > > > > > > > > So, don't get me wrong I'm not arguing against the ptrace() interface in > > > > general. If this is something that people find useful, fine. But, I > > > > would like to have a simple single-syscall pure-seccomp() based way of > > > > getting an fd, i.e. what we have in patch 1 of this series. > > > > > > Yeah, I also prefer the seccomp() one. > > > > > > > > But thinking about it more, I think that CAP_SYS_ADMIN over the saved > > > > > current->mm->user_ns of the task that installed the filter (stored as > > > > > a "struct user_namespace *" in the filter) should be acceptable. > > > > > > > > Hm... Why not CAP_SYS_PTRACE? > > > > > > Because LSMs like SELinux add extra checks that apply even if you have > > > CAP_SYS_PTRACE, and this would subvert those. The only capability I > > > know of that lets you bypass LSM checks by design (if no LSM blocks > > > the capability itself) is CAP_SYS_ADMIN. > > > > > > > One more thing. Citing from [1] > > > > > > > > > I think there's a security problem here. Imagine the following scenario: > > > > > > > > > > 1. task A (uid==0) sets up a seccomp filter that uses SECCOMP_RET_USER_NOTIF > > > > > 2. task A forks off a child B > > > > > 3. task B uses setuid(1) to drop its privileges > > > > > 4. task B becomes dumpable again, either via prctl(PR_SET_DUMPABLE, 1) > > > > > or via execve() > > > > > 5. task C (the attacker, uid==1) attaches to task B via ptrace > > > > > 6. task C uses PTRACE_SECCOMP_NEW_LISTENER on task B > > > > > > > > Sorry, to be late to the party but would this really pass > > > > __ptrace_may_access() in ptrace_attach()? It doesn't seem obvious to me > > > > that it would... Doesn't look like it would get past: > > > > > > > > tcred = __task_cred(task); > > > > if (uid_eq(caller_uid, tcred->euid) && > > > > uid_eq(caller_uid, tcred->suid) && > > > > uid_eq(caller_uid, tcred->uid) && > > > > gid_eq(caller_gid, tcred->egid) && > > > > gid_eq(caller_gid, tcred->sgid) && > > > > gid_eq(caller_gid, tcred->gid)) > > > > goto ok; > > > > if (ptrace_has_cap(tcred->user_ns, mode)) > > > > goto ok; > > > > rcu_read_unlock(); > > > > return -EPERM; > > > > ok: > > > > rcu_read_unlock(); > > > > mm = task->mm; > > > > if (mm && > > > > ((get_dumpable(mm) != SUID_DUMP_USER) && > > > > !ptrace_has_cap(mm->user_ns, mode))) > > > > return -EPERM; > > > > > > Which specific check would prevent task C from attaching to task B? If > > > the UIDs match, the first "goto ok" executes; and you're dumpable, so > > > you don't trigger the second "return -EPERM". > > > > You'd also need CAP_SYS_PTRACE in the mm->user_ns which you shouldn't > > have if you did a setuid to an unpriv user. (But I always find that code > > confusing.) > > Only if the target hasn't gone through execve() since setuid(). Sorry if I want to know this in excessive detail but I'd like to understand this properly so bear with me :) - If task B has setuid()ed and prctl(PR_SET_DUMPABLE, 1)ed but not execve()ed then C won't pass ptrace_has_cap(mm->user_ns, mode). - If task B has setuid()ed, exeved()ed it will get its dumpable flag set to /proc/sys/fs/suid_dumpable which by default is 0. So C won't pass (get_dumpable(mm) != SUID_DUMP_USER). In both cases PTRACE_ATTACH shouldn't work. Now, if /proc/sys/fs/suid_dumpable is 1 I'd find it acceptable for this to work. This is an administrator choice.
On Tue, Oct 9, 2018 at 4:09 PM Christian Brauner <christian@brauner.io> wrote: > On Tue, Oct 09, 2018 at 03:50:53PM +0200, Jann Horn wrote: > > On Tue, Oct 9, 2018 at 3:49 PM Christian Brauner <christian@brauner.io> wrote: > > > On Tue, Oct 09, 2018 at 03:36:04PM +0200, Jann Horn wrote: > > > > On Tue, Oct 9, 2018 at 3:29 PM Christian Brauner <christian@brauner.io> wrote: > > > > > One more thing. Citing from [1] > > > > > > > > > > > I think there's a security problem here. Imagine the following scenario: > > > > > > > > > > > > 1. task A (uid==0) sets up a seccomp filter that uses SECCOMP_RET_USER_NOTIF > > > > > > 2. task A forks off a child B > > > > > > 3. task B uses setuid(1) to drop its privileges > > > > > > 4. task B becomes dumpable again, either via prctl(PR_SET_DUMPABLE, 1) > > > > > > or via execve() > > > > > > 5. task C (the attacker, uid==1) attaches to task B via ptrace > > > > > > 6. task C uses PTRACE_SECCOMP_NEW_LISTENER on task B > > > > > > > > > > Sorry, to be late to the party but would this really pass > > > > > __ptrace_may_access() in ptrace_attach()? It doesn't seem obvious to me > > > > > that it would... Doesn't look like it would get past: > > > > > > > > > > tcred = __task_cred(task); > > > > > if (uid_eq(caller_uid, tcred->euid) && > > > > > uid_eq(caller_uid, tcred->suid) && > > > > > uid_eq(caller_uid, tcred->uid) && > > > > > gid_eq(caller_gid, tcred->egid) && > > > > > gid_eq(caller_gid, tcred->sgid) && > > > > > gid_eq(caller_gid, tcred->gid)) > > > > > goto ok; > > > > > if (ptrace_has_cap(tcred->user_ns, mode)) > > > > > goto ok; > > > > > rcu_read_unlock(); > > > > > return -EPERM; > > > > > ok: > > > > > rcu_read_unlock(); > > > > > mm = task->mm; > > > > > if (mm && > > > > > ((get_dumpable(mm) != SUID_DUMP_USER) && > > > > > !ptrace_has_cap(mm->user_ns, mode))) > > > > > return -EPERM; > > > > > > > > Which specific check would prevent task C from attaching to task B? If > > > > the UIDs match, the first "goto ok" executes; and you're dumpable, so > > > > you don't trigger the second "return -EPERM". > > > > > > You'd also need CAP_SYS_PTRACE in the mm->user_ns which you shouldn't > > > have if you did a setuid to an unpriv user. (But I always find that code > > > confusing.) > > > > Only if the target hasn't gone through execve() since setuid(). > > Sorry if I want to know this in excessive detail but I'd like to > understand this properly so bear with me :) > - If task B has setuid()ed and prctl(PR_SET_DUMPABLE, 1)ed but not > execve()ed then C won't pass ptrace_has_cap(mm->user_ns, mode). Yeah. > - If task B has setuid()ed, exeved()ed it will get its dumpable flag set > to /proc/sys/fs/suid_dumpable Not if you changed all UIDs (e.g. by calling setuid() as root). In that case, setup_new_exec() calls "set_dumpable(current->mm, SUID_DUMP_USER)". > which by default is 0. So C won't pass > (get_dumpable(mm) != SUID_DUMP_USER). > In both cases PTRACE_ATTACH shouldn't work. Now, if > /proc/sys/fs/suid_dumpable is 1 I'd find it acceptable for this to work. > This is an administrator choice.
On Tue, Oct 09, 2018 at 05:26:26PM +0200, Jann Horn wrote: > On Tue, Oct 9, 2018 at 4:09 PM Christian Brauner <christian@brauner.io> wrote: > > On Tue, Oct 09, 2018 at 03:50:53PM +0200, Jann Horn wrote: > > > On Tue, Oct 9, 2018 at 3:49 PM Christian Brauner <christian@brauner.io> wrote: > > > > On Tue, Oct 09, 2018 at 03:36:04PM +0200, Jann Horn wrote: > > > > > On Tue, Oct 9, 2018 at 3:29 PM Christian Brauner <christian@brauner.io> wrote: > > > > > > One more thing. Citing from [1] > > > > > > > > > > > > > I think there's a security problem here. Imagine the following scenario: > > > > > > > > > > > > > > 1. task A (uid==0) sets up a seccomp filter that uses SECCOMP_RET_USER_NOTIF > > > > > > > 2. task A forks off a child B > > > > > > > 3. task B uses setuid(1) to drop its privileges > > > > > > > 4. task B becomes dumpable again, either via prctl(PR_SET_DUMPABLE, 1) > > > > > > > or via execve() > > > > > > > 5. task C (the attacker, uid==1) attaches to task B via ptrace > > > > > > > 6. task C uses PTRACE_SECCOMP_NEW_LISTENER on task B > > > > > > > > > > > > Sorry, to be late to the party but would this really pass > > > > > > __ptrace_may_access() in ptrace_attach()? It doesn't seem obvious to me > > > > > > that it would... Doesn't look like it would get past: > > > > > > > > > > > > tcred = __task_cred(task); > > > > > > if (uid_eq(caller_uid, tcred->euid) && > > > > > > uid_eq(caller_uid, tcred->suid) && > > > > > > uid_eq(caller_uid, tcred->uid) && > > > > > > gid_eq(caller_gid, tcred->egid) && > > > > > > gid_eq(caller_gid, tcred->sgid) && > > > > > > gid_eq(caller_gid, tcred->gid)) > > > > > > goto ok; > > > > > > if (ptrace_has_cap(tcred->user_ns, mode)) > > > > > > goto ok; > > > > > > rcu_read_unlock(); > > > > > > return -EPERM; > > > > > > ok: > > > > > > rcu_read_unlock(); > > > > > > mm = task->mm; > > > > > > if (mm && > > > > > > ((get_dumpable(mm) != SUID_DUMP_USER) && > > > > > > !ptrace_has_cap(mm->user_ns, mode))) > > > > > > return -EPERM; > > > > > > > > > > Which specific check would prevent task C from attaching to task B? If > > > > > the UIDs match, the first "goto ok" executes; and you're dumpable, so > > > > > you don't trigger the second "return -EPERM". > > > > > > > > You'd also need CAP_SYS_PTRACE in the mm->user_ns which you shouldn't > > > > have if you did a setuid to an unpriv user. (But I always find that code > > > > confusing.) > > > > > > Only if the target hasn't gone through execve() since setuid(). > > > > Sorry if I want to know this in excessive detail but I'd like to > > understand this properly so bear with me :) > > - If task B has setuid()ed and prctl(PR_SET_DUMPABLE, 1)ed but not > > execve()ed then C won't pass ptrace_has_cap(mm->user_ns, mode). > > Yeah. > > > - If task B has setuid()ed, exeved()ed it will get its dumpable flag set > > to /proc/sys/fs/suid_dumpable > > Not if you changed all UIDs (e.g. by calling setuid() as root). In > that case, setup_new_exec() calls "set_dumpable(current->mm, > SUID_DUMP_USER)". Actually, looking at this when C is trying to PTRACE_ATTACH to B as an unprivileged user even if B execve()ed and it is dumpable C still wouldn't have CAP_SYS_PTRACE in the mm->user_ns unless it already is privileged over mm->user_ns which means it must be in an ancestor user_ns. > > > which by default is 0. So C won't pass > > (get_dumpable(mm) != SUID_DUMP_USER). > > In both cases PTRACE_ATTACH shouldn't work. Now, if > > /proc/sys/fs/suid_dumpable is 1 I'd find it acceptable for this to work. > > This is an administrator choice.
On Tue, Oct 9, 2018 at 6:20 PM Christian Brauner <christian@brauner.io> wrote: > On Tue, Oct 09, 2018 at 05:26:26PM +0200, Jann Horn wrote: > > On Tue, Oct 9, 2018 at 4:09 PM Christian Brauner <christian@brauner.io> wrote: > > > On Tue, Oct 09, 2018 at 03:50:53PM +0200, Jann Horn wrote: > > > > On Tue, Oct 9, 2018 at 3:49 PM Christian Brauner <christian@brauner.io> wrote: > > > > > On Tue, Oct 09, 2018 at 03:36:04PM +0200, Jann Horn wrote: > > > > > > On Tue, Oct 9, 2018 at 3:29 PM Christian Brauner <christian@brauner.io> wrote: > > > > > > > One more thing. Citing from [1] > > > > > > > > > > > > > > > I think there's a security problem here. Imagine the following scenario: > > > > > > > > > > > > > > > > 1. task A (uid==0) sets up a seccomp filter that uses SECCOMP_RET_USER_NOTIF > > > > > > > > 2. task A forks off a child B > > > > > > > > 3. task B uses setuid(1) to drop its privileges > > > > > > > > 4. task B becomes dumpable again, either via prctl(PR_SET_DUMPABLE, 1) > > > > > > > > or via execve() > > > > > > > > 5. task C (the attacker, uid==1) attaches to task B via ptrace > > > > > > > > 6. task C uses PTRACE_SECCOMP_NEW_LISTENER on task B > > > > > > > > > > > > > > Sorry, to be late to the party but would this really pass > > > > > > > __ptrace_may_access() in ptrace_attach()? It doesn't seem obvious to me > > > > > > > that it would... Doesn't look like it would get past: > > > > > > > > > > > > > > tcred = __task_cred(task); > > > > > > > if (uid_eq(caller_uid, tcred->euid) && > > > > > > > uid_eq(caller_uid, tcred->suid) && > > > > > > > uid_eq(caller_uid, tcred->uid) && > > > > > > > gid_eq(caller_gid, tcred->egid) && > > > > > > > gid_eq(caller_gid, tcred->sgid) && > > > > > > > gid_eq(caller_gid, tcred->gid)) > > > > > > > goto ok; > > > > > > > if (ptrace_has_cap(tcred->user_ns, mode)) > > > > > > > goto ok; > > > > > > > rcu_read_unlock(); > > > > > > > return -EPERM; > > > > > > > ok: > > > > > > > rcu_read_unlock(); > > > > > > > mm = task->mm; > > > > > > > if (mm && > > > > > > > ((get_dumpable(mm) != SUID_DUMP_USER) && > > > > > > > !ptrace_has_cap(mm->user_ns, mode))) > > > > > > > return -EPERM; > > > > > > > > > > > > Which specific check would prevent task C from attaching to task B? If > > > > > > the UIDs match, the first "goto ok" executes; and you're dumpable, so > > > > > > you don't trigger the second "return -EPERM". > > > > > > > > > > You'd also need CAP_SYS_PTRACE in the mm->user_ns which you shouldn't > > > > > have if you did a setuid to an unpriv user. (But I always find that code > > > > > confusing.) > > > > > > > > Only if the target hasn't gone through execve() since setuid(). > > > > > > Sorry if I want to know this in excessive detail but I'd like to > > > understand this properly so bear with me :) > > > - If task B has setuid()ed and prctl(PR_SET_DUMPABLE, 1)ed but not > > > execve()ed then C won't pass ptrace_has_cap(mm->user_ns, mode). > > > > Yeah. > > > > > - If task B has setuid()ed, exeved()ed it will get its dumpable flag set > > > to /proc/sys/fs/suid_dumpable > > > > Not if you changed all UIDs (e.g. by calling setuid() as root). In > > that case, setup_new_exec() calls "set_dumpable(current->mm, > > SUID_DUMP_USER)". > > Actually, looking at this when C is trying to PTRACE_ATTACH to B as an > unprivileged user even if B execve()ed and it is dumpable C still > wouldn't have CAP_SYS_PTRACE in the mm->user_ns unless it already is > privileged over mm->user_ns which means it must be in an ancestor > user_ns. Huh? Why would you need CAP_SYS_PTRACE for anything here? You can ptrace another process running under your UID just fine, no matter what the namespaces are. I'm not sure what you're saying.
On Tue, Oct 09, 2018 at 06:26:47PM +0200, Jann Horn wrote: > On Tue, Oct 9, 2018 at 6:20 PM Christian Brauner <christian@brauner.io> wrote: > > On Tue, Oct 09, 2018 at 05:26:26PM +0200, Jann Horn wrote: > > > On Tue, Oct 9, 2018 at 4:09 PM Christian Brauner <christian@brauner.io> wrote: > > > > On Tue, Oct 09, 2018 at 03:50:53PM +0200, Jann Horn wrote: > > > > > On Tue, Oct 9, 2018 at 3:49 PM Christian Brauner <christian@brauner.io> wrote: > > > > > > On Tue, Oct 09, 2018 at 03:36:04PM +0200, Jann Horn wrote: > > > > > > > On Tue, Oct 9, 2018 at 3:29 PM Christian Brauner <christian@brauner.io> wrote: > > > > > > > > One more thing. Citing from [1] > > > > > > > > > > > > > > > > > I think there's a security problem here. Imagine the following scenario: > > > > > > > > > > > > > > > > > > 1. task A (uid==0) sets up a seccomp filter that uses SECCOMP_RET_USER_NOTIF > > > > > > > > > 2. task A forks off a child B > > > > > > > > > 3. task B uses setuid(1) to drop its privileges > > > > > > > > > 4. task B becomes dumpable again, either via prctl(PR_SET_DUMPABLE, 1) > > > > > > > > > or via execve() > > > > > > > > > 5. task C (the attacker, uid==1) attaches to task B via ptrace > > > > > > > > > 6. task C uses PTRACE_SECCOMP_NEW_LISTENER on task B > > > > > > > > > > > > > > > > Sorry, to be late to the party but would this really pass > > > > > > > > __ptrace_may_access() in ptrace_attach()? It doesn't seem obvious to me > > > > > > > > that it would... Doesn't look like it would get past: > > > > > > > > > > > > > > > > tcred = __task_cred(task); > > > > > > > > if (uid_eq(caller_uid, tcred->euid) && > > > > > > > > uid_eq(caller_uid, tcred->suid) && > > > > > > > > uid_eq(caller_uid, tcred->uid) && > > > > > > > > gid_eq(caller_gid, tcred->egid) && > > > > > > > > gid_eq(caller_gid, tcred->sgid) && > > > > > > > > gid_eq(caller_gid, tcred->gid)) > > > > > > > > goto ok; > > > > > > > > if (ptrace_has_cap(tcred->user_ns, mode)) > > > > > > > > goto ok; > > > > > > > > rcu_read_unlock(); > > > > > > > > return -EPERM; > > > > > > > > ok: > > > > > > > > rcu_read_unlock(); > > > > > > > > mm = task->mm; > > > > > > > > if (mm && > > > > > > > > ((get_dumpable(mm) != SUID_DUMP_USER) && > > > > > > > > !ptrace_has_cap(mm->user_ns, mode))) > > > > > > > > return -EPERM; > > > > > > > > > > > > > > Which specific check would prevent task C from attaching to task B? If > > > > > > > the UIDs match, the first "goto ok" executes; and you're dumpable, so > > > > > > > you don't trigger the second "return -EPERM". > > > > > > > > > > > > You'd also need CAP_SYS_PTRACE in the mm->user_ns which you shouldn't > > > > > > have if you did a setuid to an unpriv user. (But I always find that code > > > > > > confusing.) > > > > > > > > > > Only if the target hasn't gone through execve() since setuid(). > > > > > > > > Sorry if I want to know this in excessive detail but I'd like to > > > > understand this properly so bear with me :) > > > > - If task B has setuid()ed and prctl(PR_SET_DUMPABLE, 1)ed but not > > > > execve()ed then C won't pass ptrace_has_cap(mm->user_ns, mode). > > > > > > Yeah. > > > > > > > - If task B has setuid()ed, exeved()ed it will get its dumpable flag set > > > > to /proc/sys/fs/suid_dumpable > > > > > > Not if you changed all UIDs (e.g. by calling setuid() as root). In > > > that case, setup_new_exec() calls "set_dumpable(current->mm, > > > SUID_DUMP_USER)". > > > > Actually, looking at this when C is trying to PTRACE_ATTACH to B as an > > unprivileged user even if B execve()ed and it is dumpable C still > > wouldn't have CAP_SYS_PTRACE in the mm->user_ns unless it already is > > privileged over mm->user_ns which means it must be in an ancestor > > user_ns. > > Huh? Why would you need CAP_SYS_PTRACE for anything here? You can > ptrace another process running under your UID just fine, no matter > what the namespaces are. I'm not sure what you're saying. Sorry, I was out the door yesterday when answering this and was too brief. I forgot to mention: /proc/sys/kernel/yama/ptrace_scope. It should be enabled by default on nearly all distros and even if not - which is an administrators choice - you can usually easily enable it via sysctl. 1 ("restricted ptrace") [default value] When performing an operation that requires a PTRACE_MODE_ATTACH check, the calling process must either have the CAP_SYS_PTRACE capability in the user namespace of the target process or it must have a prede‐ fined relationship with the target process. By default, the predefined relationship is that the target process must be a descendant of the caller. If you don't have it set you're already susceptible to all kinds of other attacks and I'm still not convinced we need to bring out the big capable(CAP_SYS_ADMIN) gun here.
On Wed, Oct 10, 2018 at 02:54:22PM +0200, Christian Brauner wrote: > On Tue, Oct 09, 2018 at 06:26:47PM +0200, Jann Horn wrote: > > On Tue, Oct 9, 2018 at 6:20 PM Christian Brauner <christian@brauner.io> wrote: > > > On Tue, Oct 09, 2018 at 05:26:26PM +0200, Jann Horn wrote: > > > > On Tue, Oct 9, 2018 at 4:09 PM Christian Brauner <christian@brauner.io> wrote: > > > > > On Tue, Oct 09, 2018 at 03:50:53PM +0200, Jann Horn wrote: > > > > > > On Tue, Oct 9, 2018 at 3:49 PM Christian Brauner <christian@brauner.io> wrote: > > > > > > > On Tue, Oct 09, 2018 at 03:36:04PM +0200, Jann Horn wrote: > > > > > > > > On Tue, Oct 9, 2018 at 3:29 PM Christian Brauner <christian@brauner.io> wrote: > > > > > > > > > One more thing. Citing from [1] > > > > > > > > > > > > > > > > > > > I think there's a security problem here. Imagine the following scenario: > > > > > > > > > > > > > > > > > > > > 1. task A (uid==0) sets up a seccomp filter that uses SECCOMP_RET_USER_NOTIF > > > > > > > > > > 2. task A forks off a child B > > > > > > > > > > 3. task B uses setuid(1) to drop its privileges > > > > > > > > > > 4. task B becomes dumpable again, either via prctl(PR_SET_DUMPABLE, 1) > > > > > > > > > > or via execve() > > > > > > > > > > 5. task C (the attacker, uid==1) attaches to task B via ptrace > > > > > > > > > > 6. task C uses PTRACE_SECCOMP_NEW_LISTENER on task B > > > > > > > > > > > > > > > > > > Sorry, to be late to the party but would this really pass > > > > > > > > > __ptrace_may_access() in ptrace_attach()? It doesn't seem obvious to me > > > > > > > > > that it would... Doesn't look like it would get past: > > > > > > > > > > > > > > > > > > tcred = __task_cred(task); > > > > > > > > > if (uid_eq(caller_uid, tcred->euid) && > > > > > > > > > uid_eq(caller_uid, tcred->suid) && > > > > > > > > > uid_eq(caller_uid, tcred->uid) && > > > > > > > > > gid_eq(caller_gid, tcred->egid) && > > > > > > > > > gid_eq(caller_gid, tcred->sgid) && > > > > > > > > > gid_eq(caller_gid, tcred->gid)) > > > > > > > > > goto ok; > > > > > > > > > if (ptrace_has_cap(tcred->user_ns, mode)) > > > > > > > > > goto ok; > > > > > > > > > rcu_read_unlock(); > > > > > > > > > return -EPERM; > > > > > > > > > ok: > > > > > > > > > rcu_read_unlock(); > > > > > > > > > mm = task->mm; > > > > > > > > > if (mm && > > > > > > > > > ((get_dumpable(mm) != SUID_DUMP_USER) && > > > > > > > > > !ptrace_has_cap(mm->user_ns, mode))) > > > > > > > > > return -EPERM; > > > > > > > > > > > > > > > > Which specific check would prevent task C from attaching to task B? If > > > > > > > > the UIDs match, the first "goto ok" executes; and you're dumpable, so > > > > > > > > you don't trigger the second "return -EPERM". > > > > > > > > > > > > > > You'd also need CAP_SYS_PTRACE in the mm->user_ns which you shouldn't > > > > > > > have if you did a setuid to an unpriv user. (But I always find that code > > > > > > > confusing.) > > > > > > > > > > > > Only if the target hasn't gone through execve() since setuid(). > > > > > > > > > > Sorry if I want to know this in excessive detail but I'd like to > > > > > understand this properly so bear with me :) > > > > > - If task B has setuid()ed and prctl(PR_SET_DUMPABLE, 1)ed but not > > > > > execve()ed then C won't pass ptrace_has_cap(mm->user_ns, mode). > > > > > > > > Yeah. > > > > > > > > > - If task B has setuid()ed, exeved()ed it will get its dumpable flag set > > > > > to /proc/sys/fs/suid_dumpable > > > > > > > > Not if you changed all UIDs (e.g. by calling setuid() as root). In > > > > that case, setup_new_exec() calls "set_dumpable(current->mm, > > > > SUID_DUMP_USER)". > > > > > > Actually, looking at this when C is trying to PTRACE_ATTACH to B as an > > > unprivileged user even if B execve()ed and it is dumpable C still > > > wouldn't have CAP_SYS_PTRACE in the mm->user_ns unless it already is > > > privileged over mm->user_ns which means it must be in an ancestor > > > user_ns. > > > > Huh? Why would you need CAP_SYS_PTRACE for anything here? You can > > ptrace another process running under your UID just fine, no matter > > what the namespaces are. I'm not sure what you're saying. > > Sorry, I was out the door yesterday when answering this and was too > brief. I forgot to mention: /proc/sys/kernel/yama/ptrace_scope. It > should be enabled by default on nearly all distros and even if not - > which is an administrators choice - you can usually easily enable it via > sysctl. > > 1 ("restricted ptrace") [default value] > When performing an operation that requires a PTRACE_MODE_ATTACH check, > the calling process must either have the CAP_SYS_PTRACE capability in > the user namespace of the target process or it must have a prede‐ fined > relationship with the target process. By default, the predefined > relationship is that the target process must be a descendant of the > caller. > > If you don't have it set you're already susceptible to all kinds of > other attacks and I'm still not convinced we need to bring out the big > capable(CAP_SYS_ADMIN) gun here. That being said, given that Tycho agreed to leave in the native seccomp() way of retrieving an fd from the task without the sys_admin restriction [1] which we prefer and if we merge it with aforementioned feature I care way less about whether or not the restriction is upheld for ptrace() or not. [1]: https://lists.linuxfoundation.org/pipermail/containers/2018-October/039553.html
On Wed, Oct 10, 2018 at 2:54 PM Christian Brauner <christian@brauner.io> wrote: > On Tue, Oct 09, 2018 at 06:26:47PM +0200, Jann Horn wrote: > > On Tue, Oct 9, 2018 at 6:20 PM Christian Brauner <christian@brauner.io> wrote: > > > On Tue, Oct 09, 2018 at 05:26:26PM +0200, Jann Horn wrote: > > > > On Tue, Oct 9, 2018 at 4:09 PM Christian Brauner <christian@brauner.io> wrote: > > > > > On Tue, Oct 09, 2018 at 03:50:53PM +0200, Jann Horn wrote: > > > > > > On Tue, Oct 9, 2018 at 3:49 PM Christian Brauner <christian@brauner.io> wrote: > > > > > > > On Tue, Oct 09, 2018 at 03:36:04PM +0200, Jann Horn wrote: > > > > > > > > On Tue, Oct 9, 2018 at 3:29 PM Christian Brauner <christian@brauner.io> wrote: > > > > > > > > > One more thing. Citing from [1] > > > > > > > > > > > > > > > > > > > I think there's a security problem here. Imagine the following scenario: > > > > > > > > > > > > > > > > > > > > 1. task A (uid==0) sets up a seccomp filter that uses SECCOMP_RET_USER_NOTIF > > > > > > > > > > 2. task A forks off a child B > > > > > > > > > > 3. task B uses setuid(1) to drop its privileges > > > > > > > > > > 4. task B becomes dumpable again, either via prctl(PR_SET_DUMPABLE, 1) > > > > > > > > > > or via execve() > > > > > > > > > > 5. task C (the attacker, uid==1) attaches to task B via ptrace > > > > > > > > > > 6. task C uses PTRACE_SECCOMP_NEW_LISTENER on task B > > > > > > > > > > > > > > > > > > Sorry, to be late to the party but would this really pass > > > > > > > > > __ptrace_may_access() in ptrace_attach()? It doesn't seem obvious to me > > > > > > > > > that it would... Doesn't look like it would get past: > > > > > > > > > > > > > > > > > > tcred = __task_cred(task); > > > > > > > > > if (uid_eq(caller_uid, tcred->euid) && > > > > > > > > > uid_eq(caller_uid, tcred->suid) && > > > > > > > > > uid_eq(caller_uid, tcred->uid) && > > > > > > > > > gid_eq(caller_gid, tcred->egid) && > > > > > > > > > gid_eq(caller_gid, tcred->sgid) && > > > > > > > > > gid_eq(caller_gid, tcred->gid)) > > > > > > > > > goto ok; > > > > > > > > > if (ptrace_has_cap(tcred->user_ns, mode)) > > > > > > > > > goto ok; > > > > > > > > > rcu_read_unlock(); > > > > > > > > > return -EPERM; > > > > > > > > > ok: > > > > > > > > > rcu_read_unlock(); > > > > > > > > > mm = task->mm; > > > > > > > > > if (mm && > > > > > > > > > ((get_dumpable(mm) != SUID_DUMP_USER) && > > > > > > > > > !ptrace_has_cap(mm->user_ns, mode))) > > > > > > > > > return -EPERM; > > > > > > > > > > > > > > > > Which specific check would prevent task C from attaching to task B? If > > > > > > > > the UIDs match, the first "goto ok" executes; and you're dumpable, so > > > > > > > > you don't trigger the second "return -EPERM". > > > > > > > > > > > > > > You'd also need CAP_SYS_PTRACE in the mm->user_ns which you shouldn't > > > > > > > have if you did a setuid to an unpriv user. (But I always find that code > > > > > > > confusing.) > > > > > > > > > > > > Only if the target hasn't gone through execve() since setuid(). > > > > > > > > > > Sorry if I want to know this in excessive detail but I'd like to > > > > > understand this properly so bear with me :) > > > > > - If task B has setuid()ed and prctl(PR_SET_DUMPABLE, 1)ed but not > > > > > execve()ed then C won't pass ptrace_has_cap(mm->user_ns, mode). > > > > > > > > Yeah. > > > > > > > > > - If task B has setuid()ed, exeved()ed it will get its dumpable flag set > > > > > to /proc/sys/fs/suid_dumpable > > > > > > > > Not if you changed all UIDs (e.g. by calling setuid() as root). In > > > > that case, setup_new_exec() calls "set_dumpable(current->mm, > > > > SUID_DUMP_USER)". > > > > > > Actually, looking at this when C is trying to PTRACE_ATTACH to B as an > > > unprivileged user even if B execve()ed and it is dumpable C still > > > wouldn't have CAP_SYS_PTRACE in the mm->user_ns unless it already is > > > privileged over mm->user_ns which means it must be in an ancestor > > > user_ns. > > > > Huh? Why would you need CAP_SYS_PTRACE for anything here? You can > > ptrace another process running under your UID just fine, no matter > > what the namespaces are. I'm not sure what you're saying. > > Sorry, I was out the door yesterday when answering this and was too > brief. I forgot to mention: /proc/sys/kernel/yama/ptrace_scope. It > should be enabled by default on nearly all distros "nearly all distros"? AFAIK it's off on Debian, for starters. And Yama still doesn't help you if one of the tasks enters a new user namespace or whatever. Yama is a little bit of extra, heuristic, **opt-in** hardening enabled in some configurations. It is **not** a fundamental building block you can rely on. > and even if not - > which is an administrators choice - you can usually easily enable it via > sysctl. Opt-in security isn't good enough. Kernel interfaces should still be safe to use even on a system that has all the LSM stuff disabled in the kernel config. > 1 ("restricted ptrace") [default value] > When performing an operation that requires a PTRACE_MODE_ATTACH check, > the calling process must either have the CAP_SYS_PTRACE capability in > the user namespace of the target process or it must have a prede‐ fined > relationship with the target process. By default, the predefined > relationship is that the target process must be a descendant of the > caller. > > If you don't have it set you're already susceptible to all kinds of > other attacks Oh? Can you be more specific, please? > and I'm still not convinced we need to bring out the big > capable(CAP_SYS_ADMIN) gun here.
On Wed, Oct 10, 2018 at 03:10:11PM +0200, Jann Horn wrote: > On Wed, Oct 10, 2018 at 2:54 PM Christian Brauner <christian@brauner.io> wrote: > > On Tue, Oct 09, 2018 at 06:26:47PM +0200, Jann Horn wrote: > > > On Tue, Oct 9, 2018 at 6:20 PM Christian Brauner <christian@brauner.io> wrote: > > > > On Tue, Oct 09, 2018 at 05:26:26PM +0200, Jann Horn wrote: > > > > > On Tue, Oct 9, 2018 at 4:09 PM Christian Brauner <christian@brauner.io> wrote: > > > > > > On Tue, Oct 09, 2018 at 03:50:53PM +0200, Jann Horn wrote: > > > > > > > On Tue, Oct 9, 2018 at 3:49 PM Christian Brauner <christian@brauner.io> wrote: > > > > > > > > On Tue, Oct 09, 2018 at 03:36:04PM +0200, Jann Horn wrote: > > > > > > > > > On Tue, Oct 9, 2018 at 3:29 PM Christian Brauner <christian@brauner.io> wrote: > > > > > > > > > > One more thing. Citing from [1] > > > > > > > > > > > > > > > > > > > > > I think there's a security problem here. Imagine the following scenario: > > > > > > > > > > > > > > > > > > > > > > 1. task A (uid==0) sets up a seccomp filter that uses SECCOMP_RET_USER_NOTIF > > > > > > > > > > > 2. task A forks off a child B > > > > > > > > > > > 3. task B uses setuid(1) to drop its privileges > > > > > > > > > > > 4. task B becomes dumpable again, either via prctl(PR_SET_DUMPABLE, 1) > > > > > > > > > > > or via execve() > > > > > > > > > > > 5. task C (the attacker, uid==1) attaches to task B via ptrace > > > > > > > > > > > 6. task C uses PTRACE_SECCOMP_NEW_LISTENER on task B > > > > > > > > > > > > > > > > > > > > Sorry, to be late to the party but would this really pass > > > > > > > > > > __ptrace_may_access() in ptrace_attach()? It doesn't seem obvious to me > > > > > > > > > > that it would... Doesn't look like it would get past: > > > > > > > > > > > > > > > > > > > > tcred = __task_cred(task); > > > > > > > > > > if (uid_eq(caller_uid, tcred->euid) && > > > > > > > > > > uid_eq(caller_uid, tcred->suid) && > > > > > > > > > > uid_eq(caller_uid, tcred->uid) && > > > > > > > > > > gid_eq(caller_gid, tcred->egid) && > > > > > > > > > > gid_eq(caller_gid, tcred->sgid) && > > > > > > > > > > gid_eq(caller_gid, tcred->gid)) > > > > > > > > > > goto ok; > > > > > > > > > > if (ptrace_has_cap(tcred->user_ns, mode)) > > > > > > > > > > goto ok; > > > > > > > > > > rcu_read_unlock(); > > > > > > > > > > return -EPERM; > > > > > > > > > > ok: > > > > > > > > > > rcu_read_unlock(); > > > > > > > > > > mm = task->mm; > > > > > > > > > > if (mm && > > > > > > > > > > ((get_dumpable(mm) != SUID_DUMP_USER) && > > > > > > > > > > !ptrace_has_cap(mm->user_ns, mode))) > > > > > > > > > > return -EPERM; > > > > > > > > > > > > > > > > > > Which specific check would prevent task C from attaching to task B? If > > > > > > > > > the UIDs match, the first "goto ok" executes; and you're dumpable, so > > > > > > > > > you don't trigger the second "return -EPERM". > > > > > > > > > > > > > > > > You'd also need CAP_SYS_PTRACE in the mm->user_ns which you shouldn't > > > > > > > > have if you did a setuid to an unpriv user. (But I always find that code > > > > > > > > confusing.) > > > > > > > > > > > > > > Only if the target hasn't gone through execve() since setuid(). > > > > > > > > > > > > Sorry if I want to know this in excessive detail but I'd like to > > > > > > understand this properly so bear with me :) > > > > > > - If task B has setuid()ed and prctl(PR_SET_DUMPABLE, 1)ed but not > > > > > > execve()ed then C won't pass ptrace_has_cap(mm->user_ns, mode). > > > > > > > > > > Yeah. > > > > > > > > > > > - If task B has setuid()ed, exeved()ed it will get its dumpable flag set > > > > > > to /proc/sys/fs/suid_dumpable > > > > > > > > > > Not if you changed all UIDs (e.g. by calling setuid() as root). In > > > > > that case, setup_new_exec() calls "set_dumpable(current->mm, > > > > > SUID_DUMP_USER)". > > > > > > > > Actually, looking at this when C is trying to PTRACE_ATTACH to B as an > > > > unprivileged user even if B execve()ed and it is dumpable C still > > > > wouldn't have CAP_SYS_PTRACE in the mm->user_ns unless it already is > > > > privileged over mm->user_ns which means it must be in an ancestor > > > > user_ns. > > > > > > Huh? Why would you need CAP_SYS_PTRACE for anything here? You can > > > ptrace another process running under your UID just fine, no matter > > > what the namespaces are. I'm not sure what you're saying. > > > > Sorry, I was out the door yesterday when answering this and was too > > brief. I forgot to mention: /proc/sys/kernel/yama/ptrace_scope. It > > should be enabled by default on nearly all distros > > "nearly all distros"? AFAIK it's off on Debian, for starters. And Yama > still doesn't help you if one of the tasks enters a new user namespace > or whatever. > > Yama is a little bit of extra, heuristic, **opt-in** hardening enabled > in some configurations. It is **not** a fundamental building block you > can rely on. > > > and even if not - > > which is an administrators choice - you can usually easily enable it via > > sysctl. > > Opt-in security isn't good enough. Kernel interfaces should still be > safe to use even on a system that has all the LSM stuff disabled in > the kernel config. Then ptrace() isn't, I guess? But see https://lists.linuxfoundation.org/pipermail/containers/2018-October/039567.html I don't care as long as we have a way of getting the fd without the CAP_SYS_ADMIN requirement throught seccomp(). > > > 1 ("restricted ptrace") [default value] > > When performing an operation that requires a PTRACE_MODE_ATTACH check, > > the calling process must either have the CAP_SYS_PTRACE capability in > > the user namespace of the target process or it must have a prede‐ fined > > relationship with the target process. By default, the predefined > > relationship is that the target process must be a descendant of the > > caller. > > > > If you don't have it set you're already susceptible to all kinds of > > other attacks > > Oh? Can you be more specific, please? I was referring to attacks where you attach to processes that run as your user but might expose in-memory credentials or other sensitive information, (essentially what the manpage is outlining). > > > and I'm still not convinced we need to bring out the big > > capable(CAP_SYS_ADMIN) gun here.
On Tue, Oct 9, 2018 at 9:36 AM Jann Horn <jannh@google.com> wrote: > +cc selinux people explicitly, since they probably have opinions on this I just spent about twenty minutes working my way through this thread, and digging through the containers archive trying to get a good understanding of what you guys are trying to do, and I'm not quite sure I understand it all. However, from what I have seen, this approach looks very ptrace-y to me (I imagine to others as well based on the comments) and because of this I think ensuring the usual ptrace access controls are evaluated, including the ptrace LSM hooks, is the right thing to do. If I've missed something, or I'm thinking about this wrong, please educate me; just a heads-up that I'm largely offline for most of this week so responses on my end are going to be delayed much more than usual. > On Tue, Oct 9, 2018 at 3:29 PM Christian Brauner <christian@brauner.io> wrote: > > On Tue, Oct 09, 2018 at 02:39:53PM +0200, Jann Horn wrote: > > > On Mon, Oct 8, 2018 at 8:18 PM Christian Brauner <christian@brauner.io> wrote: > > > > On Mon, Oct 08, 2018 at 06:42:00PM +0200, Jann Horn wrote: > > > > > On Mon, Oct 8, 2018 at 6:21 PM Christian Brauner <christian@brauner.io> wrote: > > > > > > On Mon, Oct 08, 2018 at 05:33:22PM +0200, Jann Horn wrote: > > > > > > > On Mon, Oct 8, 2018 at 5:16 PM Christian Brauner <christian@brauner.io> wrote: > > > > > > > > On Thu, Sep 27, 2018 at 09:11:16AM -0600, Tycho Andersen wrote: > > > > > > > > > diff --git a/kernel/seccomp.c b/kernel/seccomp.c > > > > > > > > > index 44a31ac8373a..17685803a2af 100644 > > > > > > > > > --- a/kernel/seccomp.c > > > > > > > > > +++ b/kernel/seccomp.c > > > > > > > > > @@ -1777,4 +1777,35 @@ static struct file *init_listener(struct task_struct *task, > > > > > > > > > > > > > > > > > > return ret; > > > > > > > > > } > > > > > > > > > + > > > > > > > > > +long seccomp_new_listener(struct task_struct *task, > > > > > > > > > + unsigned long filter_off) > > > > > > > > > +{ > > > > > > > > > + struct seccomp_filter *filter; > > > > > > > > > + struct file *listener; > > > > > > > > > + int fd; > > > > > > > > > + > > > > > > > > > + if (!capable(CAP_SYS_ADMIN)) > > > > > > > > > + return -EACCES; > > > > > > > > > > > > > > > > I know this might have been discussed a while back but why exactly do we > > > > > > > > require CAP_SYS_ADMIN in init_userns and not in the target userns? What > > > > > > > > if I want to do a setns()fd, CLONE_NEWUSER) to the target process and > > > > > > > > use ptrace from in there? > > > > > > > > > > > > > > See https://lore.kernel.org/lkml/CAG48ez3R+ZJ1vwGkDfGzKX2mz6f=jjJWsO5pCvnH68P+RKO8Ow@mail.gmail.com/ > > > > > > > . Basically, the problem is that this doesn't just give you capability > > > > > > > over the target task, but also over every other task that has the same > > > > > > > filter installed; you need some sort of "is the caller capable over > > > > > > > the filter and anyone who uses it" check. > > > > > > > > > > > > Thanks. > > > > > > But then this new ptrace feature as it stands is imho currently broken. > > > > > > If you can install a seccomp filter with SECCOMP_RET_USER_NOTIF if you > > > > > > are ns_cpabable(CAP_SYS_ADMIN) and also get an fd via seccomp() itself > > > > > > if you are ns_cpabable(CAP_SYS_ADMIN) > > > > > > Actually, you don't need CAP_SYS_ADMIN for seccomp() at all as long as > > > you enable the NNP flag, I think? > > > > Yes, if you turn on NNP you don't even need sys_admin. > > > > > > > > > > > then either the new ptrace() api > > > > > > extension should be fixed to allow for this too or the seccomp() way of > > > > > > retrieving the pid - which I really think we want - needs to be fixed to > > > > > > require capable(CAP_SYS_ADMIN) too. > > > > > > The solution where both require ns_capable(CAP_SYS_ADMIN) is - imho - > > > > > > the preferred way to solve this. > > > > > > Everything else will just be confusing. > > > > > > > > > > First you say "broken", then you say "confusing". Which one do you mean? > > > > > > > > Both. It's broken in so far as it places a seemingly unnecessary > > > > restriction that could be fixed. You outlined one possible fix yourself > > > > in the link you provided. > > > > > > If by "possible fix" you mean "check whether the seccomp filter is > > > only attached to a single task": That wouldn't fundamentally change > > > the situation, it would only add an additional special case. > > > > > > > And it's confusing in so far as there is a way > > > > via seccomp() to get the fd without said requirement. > > > > > > I don't find it confusing at all. seccomp() and ptrace() are very > > > > Fine, then that's a matter of opinion. I find it counterintuitive that > > you can get an fd without privileges via one interface but not via > > another. > > > > > different situations: When you use seccomp(), infrastructure is > > > > Sure. Note, that this is _one_ of the reasons why I want to make sure we > > keep the native seccomp() only based way of getting an fd without > > forcing userspace to switching to a differnet kernel api. > > > > > already in place for ensuring that your filter is only applied to > > > processes over which you are capable, and propagation is limited by > > > inheritance from your task down. When you use ptrace(), you need a > > > pretty different sort of access check that checks whether you're > > > privileged over ancestors, siblings and so on of the target task. > > > > So, don't get me wrong I'm not arguing against the ptrace() interface in > > general. If this is something that people find useful, fine. But, I > > would like to have a simple single-syscall pure-seccomp() based way of > > getting an fd, i.e. what we have in patch 1 of this series. > > Yeah, I also prefer the seccomp() one. > > > > But thinking about it more, I think that CAP_SYS_ADMIN over the saved > > > current->mm->user_ns of the task that installed the filter (stored as > > > a "struct user_namespace *" in the filter) should be acceptable. > > > > Hm... Why not CAP_SYS_PTRACE? > > Because LSMs like SELinux add extra checks that apply even if you have > CAP_SYS_PTRACE, and this would subvert those. The only capability I > know of that lets you bypass LSM checks by design (if no LSM blocks > the capability itself) is CAP_SYS_ADMIN. > > > One more thing. Citing from [1] > > > > > I think there's a security problem here. Imagine the following scenario: > > > > > > 1. task A (uid==0) sets up a seccomp filter that uses SECCOMP_RET_USER_NOTIF > > > 2. task A forks off a child B > > > 3. task B uses setuid(1) to drop its privileges > > > 4. task B becomes dumpable again, either via prctl(PR_SET_DUMPABLE, 1) > > > or via execve() > > > 5. task C (the attacker, uid==1) attaches to task B via ptrace > > > 6. task C uses PTRACE_SECCOMP_NEW_LISTENER on task B > > > > Sorry, to be late to the party but would this really pass > > __ptrace_may_access() in ptrace_attach()? It doesn't seem obvious to me > > that it would... Doesn't look like it would get past: > > > > tcred = __task_cred(task); > > if (uid_eq(caller_uid, tcred->euid) && > > uid_eq(caller_uid, tcred->suid) && > > uid_eq(caller_uid, tcred->uid) && > > gid_eq(caller_gid, tcred->egid) && > > gid_eq(caller_gid, tcred->sgid) && > > gid_eq(caller_gid, tcred->gid)) > > goto ok; > > if (ptrace_has_cap(tcred->user_ns, mode)) > > goto ok; > > rcu_read_unlock(); > > return -EPERM; > > ok: > > rcu_read_unlock(); > > mm = task->mm; > > if (mm && > > ((get_dumpable(mm) != SUID_DUMP_USER) && > > !ptrace_has_cap(mm->user_ns, mode))) > > return -EPERM; > > Which specific check would prevent task C from attaching to task B? If > the UIDs match, the first "goto ok" executes; and you're dumpable, so > you don't trigger the second "return -EPERM". > > > > 7. because the seccomp filter is shared by task A and task B, task C > > > is now able to influence syscall results for syscalls performed by > > > task A > > > > [1]: https://lore.kernel.org/lkml/CAG48ez3R+ZJ1vwGkDfGzKX2mz6f=jjJWsO5pCvnH68P+RKO8Ow@mail.gmail.com/
On Wed, Oct 10, 2018 at 5:32 PM Paul Moore <paul@paul-moore.com> wrote: > On Tue, Oct 9, 2018 at 9:36 AM Jann Horn <jannh@google.com> wrote: > > +cc selinux people explicitly, since they probably have opinions on this > > I just spent about twenty minutes working my way through this thread, > and digging through the containers archive trying to get a good > understanding of what you guys are trying to do, and I'm not quite > sure I understand it all. However, from what I have seen, this > approach looks very ptrace-y to me (I imagine to others as well based > on the comments) and because of this I think ensuring the usual ptrace > access controls are evaluated, including the ptrace LSM hooks, is the > right thing to do. Basically the problem is that this new ptrace() API does something that doesn't just influence the target task, but also every other task that has the same seccomp filter. So the classic ptrace check doesn't work here. > If I've missed something, or I'm thinking about this wrong, please > educate me; just a heads-up that I'm largely offline for most of this > week so responses on my end are going to be delayed much more than > usual. > > > On Tue, Oct 9, 2018 at 3:29 PM Christian Brauner <christian@brauner.io> wrote: > > > On Tue, Oct 09, 2018 at 02:39:53PM +0200, Jann Horn wrote: > > > > On Mon, Oct 8, 2018 at 8:18 PM Christian Brauner <christian@brauner.io> wrote: > > > > > On Mon, Oct 08, 2018 at 06:42:00PM +0200, Jann Horn wrote: > > > > > > On Mon, Oct 8, 2018 at 6:21 PM Christian Brauner <christian@brauner.io> wrote: > > > > > > > On Mon, Oct 08, 2018 at 05:33:22PM +0200, Jann Horn wrote: > > > > > > > > On Mon, Oct 8, 2018 at 5:16 PM Christian Brauner <christian@brauner.io> wrote: > > > > > > > > > On Thu, Sep 27, 2018 at 09:11:16AM -0600, Tycho Andersen wrote: > > > > > > > > > > diff --git a/kernel/seccomp.c b/kernel/seccomp.c > > > > > > > > > > index 44a31ac8373a..17685803a2af 100644 > > > > > > > > > > --- a/kernel/seccomp.c > > > > > > > > > > +++ b/kernel/seccomp.c > > > > > > > > > > @@ -1777,4 +1777,35 @@ static struct file *init_listener(struct task_struct *task, > > > > > > > > > > > > > > > > > > > > return ret; > > > > > > > > > > } > > > > > > > > > > + > > > > > > > > > > +long seccomp_new_listener(struct task_struct *task, > > > > > > > > > > + unsigned long filter_off) > > > > > > > > > > +{ > > > > > > > > > > + struct seccomp_filter *filter; > > > > > > > > > > + struct file *listener; > > > > > > > > > > + int fd; > > > > > > > > > > + > > > > > > > > > > + if (!capable(CAP_SYS_ADMIN)) > > > > > > > > > > + return -EACCES; > > > > > > > > > > > > > > > > > > I know this might have been discussed a while back but why exactly do we > > > > > > > > > require CAP_SYS_ADMIN in init_userns and not in the target userns? What > > > > > > > > > if I want to do a setns()fd, CLONE_NEWUSER) to the target process and > > > > > > > > > use ptrace from in there? > > > > > > > > > > > > > > > > See https://lore.kernel.org/lkml/CAG48ez3R+ZJ1vwGkDfGzKX2mz6f=jjJWsO5pCvnH68P+RKO8Ow@mail.gmail.com/ > > > > > > > > . Basically, the problem is that this doesn't just give you capability > > > > > > > > over the target task, but also over every other task that has the same > > > > > > > > filter installed; you need some sort of "is the caller capable over > > > > > > > > the filter and anyone who uses it" check. > > > > > > > > > > > > > > Thanks. > > > > > > > But then this new ptrace feature as it stands is imho currently broken. > > > > > > > If you can install a seccomp filter with SECCOMP_RET_USER_NOTIF if you > > > > > > > are ns_cpabable(CAP_SYS_ADMIN) and also get an fd via seccomp() itself > > > > > > > if you are ns_cpabable(CAP_SYS_ADMIN) > > > > > > > > Actually, you don't need CAP_SYS_ADMIN for seccomp() at all as long as > > > > you enable the NNP flag, I think? > > > > > > Yes, if you turn on NNP you don't even need sys_admin. > > > > > > > > > > > > > > then either the new ptrace() api > > > > > > > extension should be fixed to allow for this too or the seccomp() way of > > > > > > > retrieving the pid - which I really think we want - needs to be fixed to > > > > > > > require capable(CAP_SYS_ADMIN) too. > > > > > > > The solution where both require ns_capable(CAP_SYS_ADMIN) is - imho - > > > > > > > the preferred way to solve this. > > > > > > > Everything else will just be confusing. > > > > > > > > > > > > First you say "broken", then you say "confusing". Which one do you mean? > > > > > > > > > > Both. It's broken in so far as it places a seemingly unnecessary > > > > > restriction that could be fixed. You outlined one possible fix yourself > > > > > in the link you provided. > > > > > > > > If by "possible fix" you mean "check whether the seccomp filter is > > > > only attached to a single task": That wouldn't fundamentally change > > > > the situation, it would only add an additional special case. > > > > > > > > > And it's confusing in so far as there is a way > > > > > via seccomp() to get the fd without said requirement. > > > > > > > > I don't find it confusing at all. seccomp() and ptrace() are very > > > > > > Fine, then that's a matter of opinion. I find it counterintuitive that > > > you can get an fd without privileges via one interface but not via > > > another. > > > > > > > different situations: When you use seccomp(), infrastructure is > > > > > > Sure. Note, that this is _one_ of the reasons why I want to make sure we > > > keep the native seccomp() only based way of getting an fd without > > > forcing userspace to switching to a differnet kernel api. > > > > > > > already in place for ensuring that your filter is only applied to > > > > processes over which you are capable, and propagation is limited by > > > > inheritance from your task down. When you use ptrace(), you need a > > > > pretty different sort of access check that checks whether you're > > > > privileged over ancestors, siblings and so on of the target task. > > > > > > So, don't get me wrong I'm not arguing against the ptrace() interface in > > > general. If this is something that people find useful, fine. But, I > > > would like to have a simple single-syscall pure-seccomp() based way of > > > getting an fd, i.e. what we have in patch 1 of this series. > > > > Yeah, I also prefer the seccomp() one. > > > > > > But thinking about it more, I think that CAP_SYS_ADMIN over the saved > > > > current->mm->user_ns of the task that installed the filter (stored as > > > > a "struct user_namespace *" in the filter) should be acceptable. > > > > > > Hm... Why not CAP_SYS_PTRACE? > > > > Because LSMs like SELinux add extra checks that apply even if you have > > CAP_SYS_PTRACE, and this would subvert those. The only capability I > > know of that lets you bypass LSM checks by design (if no LSM blocks > > the capability itself) is CAP_SYS_ADMIN. > > > > > One more thing. Citing from [1] > > > > > > > I think there's a security problem here. Imagine the following scenario: > > > > > > > > 1. task A (uid==0) sets up a seccomp filter that uses SECCOMP_RET_USER_NOTIF > > > > 2. task A forks off a child B > > > > 3. task B uses setuid(1) to drop its privileges > > > > 4. task B becomes dumpable again, either via prctl(PR_SET_DUMPABLE, 1) > > > > or via execve() > > > > 5. task C (the attacker, uid==1) attaches to task B via ptrace > > > > 6. task C uses PTRACE_SECCOMP_NEW_LISTENER on task B > > > > > > Sorry, to be late to the party but would this really pass > > > __ptrace_may_access() in ptrace_attach()? It doesn't seem obvious to me > > > that it would... Doesn't look like it would get past: > > > > > > tcred = __task_cred(task); > > > if (uid_eq(caller_uid, tcred->euid) && > > > uid_eq(caller_uid, tcred->suid) && > > > uid_eq(caller_uid, tcred->uid) && > > > gid_eq(caller_gid, tcred->egid) && > > > gid_eq(caller_gid, tcred->sgid) && > > > gid_eq(caller_gid, tcred->gid)) > > > goto ok; > > > if (ptrace_has_cap(tcred->user_ns, mode)) > > > goto ok; > > > rcu_read_unlock(); > > > return -EPERM; > > > ok: > > > rcu_read_unlock(); > > > mm = task->mm; > > > if (mm && > > > ((get_dumpable(mm) != SUID_DUMP_USER) && > > > !ptrace_has_cap(mm->user_ns, mode))) > > > return -EPERM; > > > > Which specific check would prevent task C from attaching to task B? If > > the UIDs match, the first "goto ok" executes; and you're dumpable, so > > you don't trigger the second "return -EPERM". > > > > > > 7. because the seccomp filter is shared by task A and task B, task C > > > > is now able to influence syscall results for syscalls performed by > > > > task A > > > > > > [1]: https://lore.kernel.org/lkml/CAG48ez3R+ZJ1vwGkDfGzKX2mz6f=jjJWsO5pCvnH68P+RKO8Ow@mail.gmail.com/ > > > > -- > paul moore > www.paul-moore.com
On Wed, Oct 10, 2018 at 05:33:43PM +0200, Jann Horn wrote: > On Wed, Oct 10, 2018 at 5:32 PM Paul Moore <paul@paul-moore.com> wrote: > > On Tue, Oct 9, 2018 at 9:36 AM Jann Horn <jannh@google.com> wrote: > > > +cc selinux people explicitly, since they probably have opinions on this > > > > I just spent about twenty minutes working my way through this thread, > > and digging through the containers archive trying to get a good > > understanding of what you guys are trying to do, and I'm not quite > > sure I understand it all. However, from what I have seen, this > > approach looks very ptrace-y to me (I imagine to others as well based > > on the comments) and because of this I think ensuring the usual ptrace > > access controls are evaluated, including the ptrace LSM hooks, is the > > right thing to do. > > Basically the problem is that this new ptrace() API does something > that doesn't just influence the target task, but also every other task > that has the same seccomp filter. So the classic ptrace check doesn't > work here. Just to throw this into the mix: then maybe ptrace() isn't the right interface and we should just go with the native seccomp() approach for now. > > > If I've missed something, or I'm thinking about this wrong, please > > educate me; just a heads-up that I'm largely offline for most of this > > week so responses on my end are going to be delayed much more than > > usual. > > > > > On Tue, Oct 9, 2018 at 3:29 PM Christian Brauner <christian@brauner.io> wrote: > > > > On Tue, Oct 09, 2018 at 02:39:53PM +0200, Jann Horn wrote: > > > > > On Mon, Oct 8, 2018 at 8:18 PM Christian Brauner <christian@brauner.io> wrote: > > > > > > On Mon, Oct 08, 2018 at 06:42:00PM +0200, Jann Horn wrote: > > > > > > > On Mon, Oct 8, 2018 at 6:21 PM Christian Brauner <christian@brauner.io> wrote: > > > > > > > > On Mon, Oct 08, 2018 at 05:33:22PM +0200, Jann Horn wrote: > > > > > > > > > On Mon, Oct 8, 2018 at 5:16 PM Christian Brauner <christian@brauner.io> wrote: > > > > > > > > > > On Thu, Sep 27, 2018 at 09:11:16AM -0600, Tycho Andersen wrote: > > > > > > > > > > > diff --git a/kernel/seccomp.c b/kernel/seccomp.c > > > > > > > > > > > index 44a31ac8373a..17685803a2af 100644 > > > > > > > > > > > --- a/kernel/seccomp.c > > > > > > > > > > > +++ b/kernel/seccomp.c > > > > > > > > > > > @@ -1777,4 +1777,35 @@ static struct file *init_listener(struct task_struct *task, > > > > > > > > > > > > > > > > > > > > > > return ret; > > > > > > > > > > > } > > > > > > > > > > > + > > > > > > > > > > > +long seccomp_new_listener(struct task_struct *task, > > > > > > > > > > > + unsigned long filter_off) > > > > > > > > > > > +{ > > > > > > > > > > > + struct seccomp_filter *filter; > > > > > > > > > > > + struct file *listener; > > > > > > > > > > > + int fd; > > > > > > > > > > > + > > > > > > > > > > > + if (!capable(CAP_SYS_ADMIN)) > > > > > > > > > > > + return -EACCES; > > > > > > > > > > > > > > > > > > > > I know this might have been discussed a while back but why exactly do we > > > > > > > > > > require CAP_SYS_ADMIN in init_userns and not in the target userns? What > > > > > > > > > > if I want to do a setns()fd, CLONE_NEWUSER) to the target process and > > > > > > > > > > use ptrace from in there? > > > > > > > > > > > > > > > > > > See https://lore.kernel.org/lkml/CAG48ez3R+ZJ1vwGkDfGzKX2mz6f=jjJWsO5pCvnH68P+RKO8Ow@mail.gmail.com/ > > > > > > > > > . Basically, the problem is that this doesn't just give you capability > > > > > > > > > over the target task, but also over every other task that has the same > > > > > > > > > filter installed; you need some sort of "is the caller capable over > > > > > > > > > the filter and anyone who uses it" check. > > > > > > > > > > > > > > > > Thanks. > > > > > > > > But then this new ptrace feature as it stands is imho currently broken. > > > > > > > > If you can install a seccomp filter with SECCOMP_RET_USER_NOTIF if you > > > > > > > > are ns_cpabable(CAP_SYS_ADMIN) and also get an fd via seccomp() itself > > > > > > > > if you are ns_cpabable(CAP_SYS_ADMIN) > > > > > > > > > > Actually, you don't need CAP_SYS_ADMIN for seccomp() at all as long as > > > > > you enable the NNP flag, I think? > > > > > > > > Yes, if you turn on NNP you don't even need sys_admin. > > > > > > > > > > > > > > > > > then either the new ptrace() api > > > > > > > > extension should be fixed to allow for this too or the seccomp() way of > > > > > > > > retrieving the pid - which I really think we want - needs to be fixed to > > > > > > > > require capable(CAP_SYS_ADMIN) too. > > > > > > > > The solution where both require ns_capable(CAP_SYS_ADMIN) is - imho - > > > > > > > > the preferred way to solve this. > > > > > > > > Everything else will just be confusing. > > > > > > > > > > > > > > First you say "broken", then you say "confusing". Which one do you mean? > > > > > > > > > > > > Both. It's broken in so far as it places a seemingly unnecessary > > > > > > restriction that could be fixed. You outlined one possible fix yourself > > > > > > in the link you provided. > > > > > > > > > > If by "possible fix" you mean "check whether the seccomp filter is > > > > > only attached to a single task": That wouldn't fundamentally change > > > > > the situation, it would only add an additional special case. > > > > > > > > > > > And it's confusing in so far as there is a way > > > > > > via seccomp() to get the fd without said requirement. > > > > > > > > > > I don't find it confusing at all. seccomp() and ptrace() are very > > > > > > > > Fine, then that's a matter of opinion. I find it counterintuitive that > > > > you can get an fd without privileges via one interface but not via > > > > another. > > > > > > > > > different situations: When you use seccomp(), infrastructure is > > > > > > > > Sure. Note, that this is _one_ of the reasons why I want to make sure we > > > > keep the native seccomp() only based way of getting an fd without > > > > forcing userspace to switching to a differnet kernel api. > > > > > > > > > already in place for ensuring that your filter is only applied to > > > > > processes over which you are capable, and propagation is limited by > > > > > inheritance from your task down. When you use ptrace(), you need a > > > > > pretty different sort of access check that checks whether you're > > > > > privileged over ancestors, siblings and so on of the target task. > > > > > > > > So, don't get me wrong I'm not arguing against the ptrace() interface in > > > > general. If this is something that people find useful, fine. But, I > > > > would like to have a simple single-syscall pure-seccomp() based way of > > > > getting an fd, i.e. what we have in patch 1 of this series. > > > > > > Yeah, I also prefer the seccomp() one. > > > > > > > > But thinking about it more, I think that CAP_SYS_ADMIN over the saved > > > > > current->mm->user_ns of the task that installed the filter (stored as > > > > > a "struct user_namespace *" in the filter) should be acceptable. > > > > > > > > Hm... Why not CAP_SYS_PTRACE? > > > > > > Because LSMs like SELinux add extra checks that apply even if you have > > > CAP_SYS_PTRACE, and this would subvert those. The only capability I > > > know of that lets you bypass LSM checks by design (if no LSM blocks > > > the capability itself) is CAP_SYS_ADMIN. > > > > > > > One more thing. Citing from [1] > > > > > > > > > I think there's a security problem here. Imagine the following scenario: > > > > > > > > > > 1. task A (uid==0) sets up a seccomp filter that uses SECCOMP_RET_USER_NOTIF > > > > > 2. task A forks off a child B > > > > > 3. task B uses setuid(1) to drop its privileges > > > > > 4. task B becomes dumpable again, either via prctl(PR_SET_DUMPABLE, 1) > > > > > or via execve() > > > > > 5. task C (the attacker, uid==1) attaches to task B via ptrace > > > > > 6. task C uses PTRACE_SECCOMP_NEW_LISTENER on task B > > > > > > > > Sorry, to be late to the party but would this really pass > > > > __ptrace_may_access() in ptrace_attach()? It doesn't seem obvious to me > > > > that it would... Doesn't look like it would get past: > > > > > > > > tcred = __task_cred(task); > > > > if (uid_eq(caller_uid, tcred->euid) && > > > > uid_eq(caller_uid, tcred->suid) && > > > > uid_eq(caller_uid, tcred->uid) && > > > > gid_eq(caller_gid, tcred->egid) && > > > > gid_eq(caller_gid, tcred->sgid) && > > > > gid_eq(caller_gid, tcred->gid)) > > > > goto ok; > > > > if (ptrace_has_cap(tcred->user_ns, mode)) > > > > goto ok; > > > > rcu_read_unlock(); > > > > return -EPERM; > > > > ok: > > > > rcu_read_unlock(); > > > > mm = task->mm; > > > > if (mm && > > > > ((get_dumpable(mm) != SUID_DUMP_USER) && > > > > !ptrace_has_cap(mm->user_ns, mode))) > > > > return -EPERM; > > > > > > Which specific check would prevent task C from attaching to task B? If > > > the UIDs match, the first "goto ok" executes; and you're dumpable, so > > > you don't trigger the second "return -EPERM". > > > > > > > > 7. because the seccomp filter is shared by task A and task B, task C > > > > > is now able to influence syscall results for syscalls performed by > > > > > task A > > > > > > > > [1]: https://lore.kernel.org/lkml/CAG48ez3R+ZJ1vwGkDfGzKX2mz6f=jjJWsO5pCvnH68P+RKO8Ow@mail.gmail.com/ > > > > > > > > -- > > paul moore > > www.paul-moore.com
On Wed, Oct 10, 2018 at 05:39:57PM +0200, Christian Brauner wrote: > On Wed, Oct 10, 2018 at 05:33:43PM +0200, Jann Horn wrote: > > On Wed, Oct 10, 2018 at 5:32 PM Paul Moore <paul@paul-moore.com> wrote: > > > On Tue, Oct 9, 2018 at 9:36 AM Jann Horn <jannh@google.com> wrote: > > > > +cc selinux people explicitly, since they probably have opinions on this > > > > > > I just spent about twenty minutes working my way through this thread, > > > and digging through the containers archive trying to get a good > > > understanding of what you guys are trying to do, and I'm not quite > > > sure I understand it all. However, from what I have seen, this > > > approach looks very ptrace-y to me (I imagine to others as well based > > > on the comments) and because of this I think ensuring the usual ptrace > > > access controls are evaluated, including the ptrace LSM hooks, is the > > > right thing to do. > > > > Basically the problem is that this new ptrace() API does something > > that doesn't just influence the target task, but also every other task > > that has the same seccomp filter. So the classic ptrace check doesn't > > work here. > > Just to throw this into the mix: then maybe ptrace() isn't the right > interface and we should just go with the native seccomp() approach for > now. Please no :). I don't buy your arguments that 3-syscalls vs. one is better. If I'm doing this setup with a new container, I have to do clone(CLONE_FILES), do this seccomp thing, so that my parent can pick it up again, then do another clone without CLONE_FILES, because in the general case I don't want to share my fd table with the container, wait on the middle task for errors, etc. So we're still doing a bunch of setup, and it feels more awkward than ptrace, with at least as many syscalls, and it only works for your children. I don't mind leaving capable(CAP_SYS_ADMIN) for the ptrace() part, though. So if that's ok, then I think we can agree :) Tycho
On Wed, Oct 10, 2018 at 09:54:58AM -0700, Tycho Andersen wrote: > On Wed, Oct 10, 2018 at 05:39:57PM +0200, Christian Brauner wrote: > > On Wed, Oct 10, 2018 at 05:33:43PM +0200, Jann Horn wrote: > > > On Wed, Oct 10, 2018 at 5:32 PM Paul Moore <paul@paul-moore.com> wrote: > > > > On Tue, Oct 9, 2018 at 9:36 AM Jann Horn <jannh@google.com> wrote: > > > > > +cc selinux people explicitly, since they probably have opinions on this > > > > > > > > I just spent about twenty minutes working my way through this thread, > > > > and digging through the containers archive trying to get a good > > > > understanding of what you guys are trying to do, and I'm not quite > > > > sure I understand it all. However, from what I have seen, this > > > > approach looks very ptrace-y to me (I imagine to others as well based > > > > on the comments) and because of this I think ensuring the usual ptrace > > > > access controls are evaluated, including the ptrace LSM hooks, is the > > > > right thing to do. > > > > > > Basically the problem is that this new ptrace() API does something > > > that doesn't just influence the target task, but also every other task > > > that has the same seccomp filter. So the classic ptrace check doesn't > > > work here. > > > > Just to throw this into the mix: then maybe ptrace() isn't the right > > interface and we should just go with the native seccomp() approach for > > now. > > Please no :). > > I don't buy your arguments that 3-syscalls vs. one is better. If I'm > doing this setup with a new container, I have to do > clone(CLONE_FILES), do this seccomp thing, so that my parent can pick > it up again, then do another clone without CLONE_FILES, because in the > general case I don't want to share my fd table with the container, > wait on the middle task for errors, etc. So we're still doing a bunch > of setup, and it feels more awkward than ptrace, with at least as many > syscalls, and it only works for your children. You're talking about the case where you already have shot yourself in the foot by blocking basically all other sensible ways of getting the fd out. Also, this was meant to show that parts of your initial justification for implementing the ptrace() way of getting an fd doesn't really stand. And it doesn't really. Even with ptrace() you can get into situations where you're not able to get an fd. (see prior threads) > > I don't mind leaving capable(CAP_SYS_ADMIN) for the ptrace() part, Again, (prior threads) ptrace() or no ptrace() is something I do not particularly care about as long as we have the non-capable(CAP_SYS_ADMIN) seccomp() way of getting an fd out. > though. So if that's ok, then I think we can agree :) > > Tycho
On Wed, Oct 10, 2018 at 07:15:02PM +0200, Christian Brauner wrote: > On Wed, Oct 10, 2018 at 09:54:58AM -0700, Tycho Andersen wrote: > > On Wed, Oct 10, 2018 at 05:39:57PM +0200, Christian Brauner wrote: > > > On Wed, Oct 10, 2018 at 05:33:43PM +0200, Jann Horn wrote: > > > > On Wed, Oct 10, 2018 at 5:32 PM Paul Moore <paul@paul-moore.com> wrote: > > > > > On Tue, Oct 9, 2018 at 9:36 AM Jann Horn <jannh@google.com> wrote: > > > > > > +cc selinux people explicitly, since they probably have opinions on this > > > > > > > > > > I just spent about twenty minutes working my way through this thread, > > > > > and digging through the containers archive trying to get a good > > > > > understanding of what you guys are trying to do, and I'm not quite > > > > > sure I understand it all. However, from what I have seen, this > > > > > approach looks very ptrace-y to me (I imagine to others as well based > > > > > on the comments) and because of this I think ensuring the usual ptrace > > > > > access controls are evaluated, including the ptrace LSM hooks, is the > > > > > right thing to do. > > > > > > > > Basically the problem is that this new ptrace() API does something > > > > that doesn't just influence the target task, but also every other task > > > > that has the same seccomp filter. So the classic ptrace check doesn't > > > > work here. > > > > > > Just to throw this into the mix: then maybe ptrace() isn't the right > > > interface and we should just go with the native seccomp() approach for > > > now. > > > > Please no :). > > > > I don't buy your arguments that 3-syscalls vs. one is better. If I'm > > doing this setup with a new container, I have to do > > clone(CLONE_FILES), do this seccomp thing, so that my parent can pick > > it up again, then do another clone without CLONE_FILES, because in the > > general case I don't want to share my fd table with the container, > > wait on the middle task for errors, etc. So we're still doing a bunch > > of setup, and it feels more awkward than ptrace, with at least as many > > syscalls, and it only works for your children. > > You're talking about the case where you already have shot yourself in > the foot by blocking basically all other sensible ways of getting the fd > out. Ok, but these other ways involve syscalls too (sendmsg() or whatever). And if you're going to allow arbitrary policy from your users, you have to be maximally flexible. > Also, this was meant to show that parts of your initial justification > for implementing the ptrace() way of getting an fd doesn't really stand. > And it doesn't really. Even with ptrace() you can get into situations > where you're not able to get an fd. (see prior threads) Of course. I guess my point was that we shouldn't design an API that's impossible to use. I'll drop the notes about sendmsg() from the commit message. Tycho
On Mon, Oct 8, 2018 at 11:00 AM Tycho Andersen <tycho@tycho.ws> wrote: > > On Mon, Oct 08, 2018 at 05:16:30PM +0200, Christian Brauner wrote: > > On Thu, Sep 27, 2018 at 09:11:16AM -0600, Tycho Andersen wrote: > > > As an alternative to SECCOMP_FILTER_FLAG_GET_LISTENER, perhaps a ptrace() > > > version which can acquire filters is useful. There are at least two reasons > > > this is preferable, even though it uses ptrace: > > > > > > 1. You can control tasks that aren't cooperating with you > > > 2. You can control tasks whose filters block sendmsg() and socket(); if the > > > task installs a filter which blocks these calls, there's no way with > > > SECCOMP_FILTER_FLAG_GET_LISTENER to get the fd out to the privileged task. > > > > So for the slow of mind aka me: > > I'm not sure I completely understand this problem. Can you outline how > > sendmsg() and socket() are involved in this? > > > > I'm also not sure that this holds (but I might misunderstand the > > problem) afaict, you could do try to get the fd out via CLONE_FILES and > > other means so something like: > > > > // let's pretend the libc wrapper for clone actually has sane semantics > > pid = clone(CLONE_FILES); > > if (pid == 0) { > > fd = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_NEW_LISTENER, &prog); > > > > // Now this fd will be valid in both parent and child. > > // If you haven't blocked it you can inform the parent what > > // the fd number is via pipe2(). If you have blocked it you can > > // use dup2() and dup to a known fd number. > > } > > But what if your seccomp filter wants to block both pipe2() and > dup2()? Whatever syscall you want to use to do this could be blocked > by some seccomp policy, which means you might not be able to use this > feature in some cases. You don't need a syscall at all. You can use shared memory. > > Perhaps it's unlikely, and we can just go forward knowing this. But it > seems like it is worth at least acknowledging that you can wedge > yourself into a corner. > I think that what we *really* want is a way to create a seccomp fitter and activate it later (on execve or via another call to seccomp(), perhaps). And we already sort of have that using ptrace() but a better interface would be nice when a real use case gets figured out.
On Wed, Oct 10, 2018 at 10:45:29AM -0700, Andy Lutomirski wrote: > On Mon, Oct 8, 2018 at 11:00 AM Tycho Andersen <tycho@tycho.ws> wrote: > > > > On Mon, Oct 08, 2018 at 05:16:30PM +0200, Christian Brauner wrote: > > > On Thu, Sep 27, 2018 at 09:11:16AM -0600, Tycho Andersen wrote: > > > > As an alternative to SECCOMP_FILTER_FLAG_GET_LISTENER, perhaps a ptrace() > > > > version which can acquire filters is useful. There are at least two reasons > > > > this is preferable, even though it uses ptrace: > > > > > > > > 1. You can control tasks that aren't cooperating with you > > > > 2. You can control tasks whose filters block sendmsg() and socket(); if the > > > > task installs a filter which blocks these calls, there's no way with > > > > SECCOMP_FILTER_FLAG_GET_LISTENER to get the fd out to the privileged task. > > > > > > So for the slow of mind aka me: > > > I'm not sure I completely understand this problem. Can you outline how > > > sendmsg() and socket() are involved in this? > > > > > > I'm also not sure that this holds (but I might misunderstand the > > > problem) afaict, you could do try to get the fd out via CLONE_FILES and > > > other means so something like: > > > > > > // let's pretend the libc wrapper for clone actually has sane semantics > > > pid = clone(CLONE_FILES); > > > if (pid == 0) { > > > fd = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_NEW_LISTENER, &prog); > > > > > > // Now this fd will be valid in both parent and child. > > > // If you haven't blocked it you can inform the parent what > > > // the fd number is via pipe2(). If you have blocked it you can > > > // use dup2() and dup to a known fd number. > > > } > > > > But what if your seccomp filter wants to block both pipe2() and > > dup2()? Whatever syscall you want to use to do this could be blocked > > by some seccomp policy, which means you might not be able to use this > > feature in some cases. > > You don't need a syscall at all. You can use shared memory. Yeah, I pointed that out too in the next mail. :) > > > > > Perhaps it's unlikely, and we can just go forward knowing this. But it > > seems like it is worth at least acknowledging that you can wedge > > yourself into a corner. > > > > I think that what we *really* want is a way to create a seccomp fitter I thought about this exact thing when discussing my reservations about ptrace() but I didn't want to defer this patchset any longer. But I really like this idea of being able to get an fd *before* the filter is loaded. > and activate it later (on execve or via another call to seccomp(), > perhaps). And we already sort of have that using ptrace() but a > better interface would be nice when a real use case gets figured out.
On Wed, Oct 10, 2018 at 10:26:22AM -0700, Tycho Andersen wrote: > On Wed, Oct 10, 2018 at 07:15:02PM +0200, Christian Brauner wrote: > > On Wed, Oct 10, 2018 at 09:54:58AM -0700, Tycho Andersen wrote: > > > On Wed, Oct 10, 2018 at 05:39:57PM +0200, Christian Brauner wrote: > > > > On Wed, Oct 10, 2018 at 05:33:43PM +0200, Jann Horn wrote: > > > > > On Wed, Oct 10, 2018 at 5:32 PM Paul Moore <paul@paul-moore.com> wrote: > > > > > > On Tue, Oct 9, 2018 at 9:36 AM Jann Horn <jannh@google.com> wrote: > > > > > > > +cc selinux people explicitly, since they probably have opinions on this > > > > > > > > > > > > I just spent about twenty minutes working my way through this thread, > > > > > > and digging through the containers archive trying to get a good > > > > > > understanding of what you guys are trying to do, and I'm not quite > > > > > > sure I understand it all. However, from what I have seen, this > > > > > > approach looks very ptrace-y to me (I imagine to others as well based > > > > > > on the comments) and because of this I think ensuring the usual ptrace > > > > > > access controls are evaluated, including the ptrace LSM hooks, is the > > > > > > right thing to do. > > > > > > > > > > Basically the problem is that this new ptrace() API does something > > > > > that doesn't just influence the target task, but also every other task > > > > > that has the same seccomp filter. So the classic ptrace check doesn't > > > > > work here. > > > > > > > > Just to throw this into the mix: then maybe ptrace() isn't the right > > > > interface and we should just go with the native seccomp() approach for > > > > now. > > > > > > Please no :). > > > > > > I don't buy your arguments that 3-syscalls vs. one is better. If I'm > > > doing this setup with a new container, I have to do > > > clone(CLONE_FILES), do this seccomp thing, so that my parent can pick > > > it up again, then do another clone without CLONE_FILES, because in the > > > general case I don't want to share my fd table with the container, > > > wait on the middle task for errors, etc. So we're still doing a bunch > > > of setup, and it feels more awkward than ptrace, with at least as many > > > syscalls, and it only works for your children. > > > > You're talking about the case where you already have shot yourself in > > the foot by blocking basically all other sensible ways of getting the fd > > out. > > Ok, but these other ways involve syscalls too (sendmsg() or whatever). > And if you're going to allow arbitrary policy from your users, you > have to be maximally flexible. So, I totally like the idea of being able to get an fd before the filter is active. If this could be done in seccomp()-only it would be A+ (See Andy's mail in the other thread.) But I really don't want to keep you working on this forever. :) > > > Also, this was meant to show that parts of your initial justification > > for implementing the ptrace() way of getting an fd doesn't really stand. > > And it doesn't really. Even with ptrace() you can get into situations > > where you're not able to get an fd. (see prior threads) > > Of course. I guess my point was that we shouldn't design an API that's > impossible to use. I'll drop the notes about sendmsg() from the commit > message. > > Tycho
On October 10, 2018 11:34:11 AM Jann Horn <jannh@google.com> wrote: > On Wed, Oct 10, 2018 at 5:32 PM Paul Moore <paul@paul-moore.com> wrote: >> On Tue, Oct 9, 2018 at 9:36 AM Jann Horn <jannh@google.com> wrote: >>> +cc selinux people explicitly, since they probably have opinions on this >> >> I just spent about twenty minutes working my way through this thread, >> and digging through the containers archive trying to get a good >> understanding of what you guys are trying to do, and I'm not quite >> sure I understand it all. However, from what I have seen, this >> approach looks very ptrace-y to me (I imagine to others as well based >> on the comments) and because of this I think ensuring the usual ptrace >> access controls are evaluated, including the ptrace LSM hooks, is the >> right thing to do. > > Basically the problem is that this new ptrace() API does something > that doesn't just influence the target task, but also every other task > that has the same seccomp filter. So the classic ptrace check doesn't > work here. Due to some rather unfortunate events today I'm suddenly without easy access to the kernel code, but would it be possible to run the LSM ptrace access control checks against all of the affected tasks? If it is possible, how painful would it be? > >> If I've missed something, or I'm thinking about this wrong, please >> educate me; just a heads-up that I'm largely offline for most of this >> week so responses on my end are going to be delayed much more than >> usual. >> >>> On Tue, Oct 9, 2018 at 3:29 PM Christian Brauner <christian@brauner.io> wrote: >>>> On Tue, Oct 09, 2018 at 02:39:53PM +0200, Jann Horn wrote: >>>>> On Mon, Oct 8, 2018 at 8:18 PM Christian Brauner <christian@brauner.io> wrote: >>>>>> On Mon, Oct 08, 2018 at 06:42:00PM +0200, Jann Horn wrote: >>>>>>> On Mon, Oct 8, 2018 at 6:21 PM Christian Brauner <christian@brauner.io> wrote: >>>>>>> > On Mon, Oct 08, 2018 at 05:33:22PM +0200, Jann Horn wrote: >>>>>>> > > On Mon, Oct 8, 2018 at 5:16 PM Christian Brauner <christian@brauner.io> wrote: >>>>>>> > > > On Thu, Sep 27, 2018 at 09:11:16AM -0600, Tycho Andersen wrote: >>>>>>> > > > > diff --git a/kernel/seccomp.c b/kernel/seccomp.c >>>>>>> > > > > index 44a31ac8373a..17685803a2af 100644 >>>>>>> > > > > --- a/kernel/seccomp.c >>>>>>> > > > > +++ b/kernel/seccomp.c >>>>>>> > > > > @@ -1777,4 +1777,35 @@ static struct file *init_listener(struct task_struct *task, >>>>>>> > > > > >>>>>>> > > > > return ret; >>>>>>> > > > > } >>>>>>> > > > > + >>>>>>> > > > > +long seccomp_new_listener(struct task_struct *task, >>>>>>> > > > > + unsigned long filter_off) >>>>>>> > > > > +{ >>>>>>> > > > > + struct seccomp_filter *filter; >>>>>>> > > > > + struct file *listener; >>>>>>> > > > > + int fd; >>>>>>> > > > > + >>>>>>> > > > > + if (!capable(CAP_SYS_ADMIN)) >>>>>>> > > > > + return -EACCES; >>>>>>> > > > >>>>>>> > > > I know this might have been discussed a while back but why exactly do we >>>>>>> > > > require CAP_SYS_ADMIN in init_userns and not in the target userns? What >>>>>>> > > > if I want to do a setns()fd, CLONE_NEWUSER) to the target process and >>>>>>> > > > use ptrace from in there? >>>>>>> > > >>>>>>> > > See https://lore.kernel.org/lkml/CAG48ez3R+ZJ1vwGkDfGzKX2mz6f=jjJWsO5pCvnH68P+RKO8Ow@mail.gmail.com/ >>>>>>> > > . Basically, the problem is that this doesn't just give you capability >>>>>>> > > over the target task, but also over every other task that has the same >>>>>>> > > filter installed; you need some sort of "is the caller capable over >>>>>>> > > the filter and anyone who uses it" check. >>>>>>> > >>>>>>> > Thanks. >>>>>>> > But then this new ptrace feature as it stands is imho currently broken. >>>>>>> > If you can install a seccomp filter with SECCOMP_RET_USER_NOTIF if you >>>>>>> > are ns_cpabable(CAP_SYS_ADMIN) and also get an fd via seccomp() itself >>>>>>> > if you are ns_cpabable(CAP_SYS_ADMIN) >>>>> >>>>> Actually, you don't need CAP_SYS_ADMIN for seccomp() at all as long as >>>>> you enable the NNP flag, I think? >>>> >>>> Yes, if you turn on NNP you don't even need sys_admin. >>>> >>>>> >>>>>>> > then either the new ptrace() api >>>>>>> > extension should be fixed to allow for this too or the seccomp() way of >>>>>>> > retrieving the pid - which I really think we want - needs to be fixed to >>>>>>> > require capable(CAP_SYS_ADMIN) too. >>>>>>> > The solution where both require ns_capable(CAP_SYS_ADMIN) is - imho - >>>>>>> > the preferred way to solve this. >>>>>>> > Everything else will just be confusing. >>>>>>> >>>>>>> First you say "broken", then you say "confusing". Which one do you mean? >>>>>> >>>>>> Both. It's broken in so far as it places a seemingly unnecessary >>>>>> restriction that could be fixed. You outlined one possible fix yourself >>>>>> in the link you provided. >>>>> >>>>> If by "possible fix" you mean "check whether the seccomp filter is >>>>> only attached to a single task": That wouldn't fundamentally change >>>>> the situation, it would only add an additional special case. >>>>> >>>>>> And it's confusing in so far as there is a way >>>>>> via seccomp() to get the fd without said requirement. >>>>> >>>>> I don't find it confusing at all. seccomp() and ptrace() are very >>>> >>>> Fine, then that's a matter of opinion. I find it counterintuitive that >>>> you can get an fd without privileges via one interface but not via >>>> another. >>>> >>>>> different situations: When you use seccomp(), infrastructure is >>>> >>>> Sure. Note, that this is _one_ of the reasons why I want to make sure we >>>> keep the native seccomp() only based way of getting an fd without >>>> forcing userspace to switching to a differnet kernel api. >>>> >>>>> already in place for ensuring that your filter is only applied to >>>>> processes over which you are capable, and propagation is limited by >>>>> inheritance from your task down. When you use ptrace(), you need a >>>>> pretty different sort of access check that checks whether you're >>>>> privileged over ancestors, siblings and so on of the target task. >>>> >>>> So, don't get me wrong I'm not arguing against the ptrace() interface in >>>> general. If this is something that people find useful, fine. But, I >>>> would like to have a simple single-syscall pure-seccomp() based way of >>>> getting an fd, i.e. what we have in patch 1 of this series. >>> >>> Yeah, I also prefer the seccomp() one. >>> >>>>> But thinking about it more, I think that CAP_SYS_ADMIN over the saved >>>>> current->mm->user_ns of the task that installed the filter (stored as >>>>> a "struct user_namespace *" in the filter) should be acceptable. >>>> >>>> Hm... Why not CAP_SYS_PTRACE? >>> >>> Because LSMs like SELinux add extra checks that apply even if you have >>> CAP_SYS_PTRACE, and this would subvert those. The only capability I >>> know of that lets you bypass LSM checks by design (if no LSM blocks >>> the capability itself) is CAP_SYS_ADMIN. >>> >>>> One more thing. Citing from [1] >>>> >>>>> I think there's a security problem here. Imagine the following scenario: >>>>> >>>>> 1. task A (uid==0) sets up a seccomp filter that uses SECCOMP_RET_USER_NOTIF >>>>> 2. task A forks off a child B >>>>> 3. task B uses setuid(1) to drop its privileges >>>>> 4. task B becomes dumpable again, either via prctl(PR_SET_DUMPABLE, 1) >>>>> or via execve() >>>>> 5. task C (the attacker, uid==1) attaches to task B via ptrace >>>>> 6. task C uses PTRACE_SECCOMP_NEW_LISTENER on task B >>>> >>>> Sorry, to be late to the party but would this really pass >>>> __ptrace_may_access() in ptrace_attach()? It doesn't seem obvious to me >>>> that it would... Doesn't look like it would get past: >>>> >>>> tcred = __task_cred(task); >>>> if (uid_eq(caller_uid, tcred->euid) && >>>> uid_eq(caller_uid, tcred->suid) && >>>> uid_eq(caller_uid, tcred->uid) && >>>> gid_eq(caller_gid, tcred->egid) && >>>> gid_eq(caller_gid, tcred->sgid) && >>>> gid_eq(caller_gid, tcred->gid)) >>>> goto ok; >>>> if (ptrace_has_cap(tcred->user_ns, mode)) >>>> goto ok; >>>> rcu_read_unlock(); >>>> return -EPERM; >>>> ok: >>>> rcu_read_unlock(); >>>> mm = task->mm; >>>> if (mm && >>>> ((get_dumpable(mm) != SUID_DUMP_USER) && >>>> !ptrace_has_cap(mm->user_ns, mode))) >>>> return -EPERM; >>> >>> Which specific check would prevent task C from attaching to task B? If >>> the UIDs match, the first "goto ok" executes; and you're dumpable, so >>> you don't trigger the second "return -EPERM". >>> >>>>> 7. because the seccomp filter is shared by task A and task B, task C >>>>> is now able to influence syscall results for syscalls performed by >>>>> task A >>>> >>>> [1]: https://lore.kernel.org/lkml/CAG48ez3R+ZJ1vwGkDfGzKX2mz6f=jjJWsO5pCvnH68P+RKO8Ow@mail.gmail.com/ >> >> >> >> -- >> paul moore >> www.paul-moore.com
On Thu, Oct 11, 2018 at 9:24 AM Paul Moore <paul@paul-moore.com> wrote: > On October 10, 2018 11:34:11 AM Jann Horn <jannh@google.com> wrote: > > On Wed, Oct 10, 2018 at 5:32 PM Paul Moore <paul@paul-moore.com> wrote: > >> On Tue, Oct 9, 2018 at 9:36 AM Jann Horn <jannh@google.com> wrote: > >>> +cc selinux people explicitly, since they probably have opinions on this > >> > >> I just spent about twenty minutes working my way through this thread, > >> and digging through the containers archive trying to get a good > >> understanding of what you guys are trying to do, and I'm not quite > >> sure I understand it all. However, from what I have seen, this > >> approach looks very ptrace-y to me (I imagine to others as well based > >> on the comments) and because of this I think ensuring the usual ptrace > >> access controls are evaluated, including the ptrace LSM hooks, is the > >> right thing to do. > > > > Basically the problem is that this new ptrace() API does something > > that doesn't just influence the target task, but also every other task > > that has the same seccomp filter. So the classic ptrace check doesn't > > work here. > > Due to some rather unfortunate events today I'm suddenly without easy access to the kernel code, but would it be possible to run the LSM ptrace access control checks against all of the affected tasks? If it is possible, how painful would it be? There are currently no backlinks from seccomp filters to the tasks that use them; the only thing you have is a refcount. If the refcount is 1, and the target task uses the filter directly (it is the last installed one), you'd be able to infer that the ptrace target is the only task with a reference to the filter, and you could just do the direct check; but if the refcount is >1, you might end up having to take some spinlock and then iterate over all tasks' filters with that spinlock held, or something like that.
On October 11, 2018 9:40:06 AM Jann Horn <jannh@google.com> wrote: > On Thu, Oct 11, 2018 at 9:24 AM Paul Moore <paul@paul-moore.com> wrote: >> On October 10, 2018 11:34:11 AM Jann Horn <jannh@google.com> wrote: >>> On Wed, Oct 10, 2018 at 5:32 PM Paul Moore <paul@paul-moore.com> wrote: >>>> On Tue, Oct 9, 2018 at 9:36 AM Jann Horn <jannh@google.com> wrote: >>>>> +cc selinux people explicitly, since they probably have opinions on this >>>> >>>> I just spent about twenty minutes working my way through this thread, >>>> and digging through the containers archive trying to get a good >>>> understanding of what you guys are trying to do, and I'm not quite >>>> sure I understand it all. However, from what I have seen, this >>>> approach looks very ptrace-y to me (I imagine to others as well based >>>> on the comments) and because of this I think ensuring the usual ptrace >>>> access controls are evaluated, including the ptrace LSM hooks, is the >>>> right thing to do. >>> >>> Basically the problem is that this new ptrace() API does something >>> that doesn't just influence the target task, but also every other task >>> that has the same seccomp filter. So the classic ptrace check doesn't >>> work here. >> >> Due to some rather unfortunate events today I'm suddenly without easy access to the kernel code, but would it be possible to run the LSM ptrace access control checks against all of the affected tasks? If it is possible, how painful would it be? > > There are currently no backlinks from seccomp filters to the tasks > that use them; the only thing you have is a refcount. If the refcount > is 1, and the target task uses the filter directly (it is the last > installed one), you'd be able to infer that the ptrace target is the > only task with a reference to the filter, and you could just do the > direct check; but if the refcount is >1, you might end up having to > take some spinlock and then iterate over all tasks' filters with that > spinlock held, or something like that. That's what I was afraid of. Unfortunately, I stand by my previous statements that we still probably want a LSM access check similar to what we currently do for ptrace. -- paul moore www.paul-moore.com
On Thu, Oct 11, 2018 at 4:10 PM Paul Moore <paul@paul-moore.com> wrote: > > On October 11, 2018 9:40:06 AM Jann Horn <jannh@google.com> wrote: > > On Thu, Oct 11, 2018 at 9:24 AM Paul Moore <paul@paul-moore.com> wrote: > >> On October 10, 2018 11:34:11 AM Jann Horn <jannh@google.com> wrote: > >>> On Wed, Oct 10, 2018 at 5:32 PM Paul Moore <paul@paul-moore.com> wrote: > >>>> On Tue, Oct 9, 2018 at 9:36 AM Jann Horn <jannh@google.com> wrote: > >>>>> +cc selinux people explicitly, since they probably have opinions on this > >>>> > >>>> I just spent about twenty minutes working my way through this thread, > >>>> and digging through the containers archive trying to get a good > >>>> understanding of what you guys are trying to do, and I'm not quite > >>>> sure I understand it all. However, from what I have seen, this > >>>> approach looks very ptrace-y to me (I imagine to others as well based > >>>> on the comments) and because of this I think ensuring the usual ptrace > >>>> access controls are evaluated, including the ptrace LSM hooks, is the > >>>> right thing to do. > >>> > >>> Basically the problem is that this new ptrace() API does something > >>> that doesn't just influence the target task, but also every other task > >>> that has the same seccomp filter. So the classic ptrace check doesn't > >>> work here. > >> > >> Due to some rather unfortunate events today I'm suddenly without easy access to the kernel code, but would it be possible to run the LSM ptrace access control checks against all of the affected tasks? If it is possible, how painful would it be? > > > > There are currently no backlinks from seccomp filters to the tasks > > that use them; the only thing you have is a refcount. If the refcount > > is 1, and the target task uses the filter directly (it is the last > > installed one), you'd be able to infer that the ptrace target is the > > only task with a reference to the filter, and you could just do the > > direct check; but if the refcount is >1, you might end up having to > > take some spinlock and then iterate over all tasks' filters with that > > spinlock held, or something like that. > > That's what I was afraid of. > > Unfortunately, I stand by my previous statements that we still probably want a LSM access check similar to what we currently do for ptrace. > I would argue that once "LSM" enters this conversation, it just means we're on the wrong track. Let's try to make this work without ptrace if possible :) The whole seccomp() mechanism is very carefully designed so that it's perfectly safe to install seccomp filters without involving LSM or even involving credentials at all. --Andy
On Thu, Oct 11, 2018 at 06:02:06PM -0700, Andy Lutomirski wrote: > On Thu, Oct 11, 2018 at 4:10 PM Paul Moore <paul@paul-moore.com> wrote: > > > > On October 11, 2018 9:40:06 AM Jann Horn <jannh@google.com> wrote: > > > On Thu, Oct 11, 2018 at 9:24 AM Paul Moore <paul@paul-moore.com> wrote: > > >> On October 10, 2018 11:34:11 AM Jann Horn <jannh@google.com> wrote: > > >>> On Wed, Oct 10, 2018 at 5:32 PM Paul Moore <paul@paul-moore.com> wrote: > > >>>> On Tue, Oct 9, 2018 at 9:36 AM Jann Horn <jannh@google.com> wrote: > > >>>>> +cc selinux people explicitly, since they probably have opinions on this > > >>>> > > >>>> I just spent about twenty minutes working my way through this thread, > > >>>> and digging through the containers archive trying to get a good > > >>>> understanding of what you guys are trying to do, and I'm not quite > > >>>> sure I understand it all. However, from what I have seen, this > > >>>> approach looks very ptrace-y to me (I imagine to others as well based > > >>>> on the comments) and because of this I think ensuring the usual ptrace > > >>>> access controls are evaluated, including the ptrace LSM hooks, is the > > >>>> right thing to do. > > >>> > > >>> Basically the problem is that this new ptrace() API does something > > >>> that doesn't just influence the target task, but also every other task > > >>> that has the same seccomp filter. So the classic ptrace check doesn't > > >>> work here. > > >> > > >> Due to some rather unfortunate events today I'm suddenly without easy access to the kernel code, but would it be possible to run the LSM ptrace access control checks against all of the affected tasks? If it is possible, how painful would it be? > > > > > > There are currently no backlinks from seccomp filters to the tasks > > > that use them; the only thing you have is a refcount. If the refcount > > > is 1, and the target task uses the filter directly (it is the last > > > installed one), you'd be able to infer that the ptrace target is the > > > only task with a reference to the filter, and you could just do the > > > direct check; but if the refcount is >1, you might end up having to > > > take some spinlock and then iterate over all tasks' filters with that > > > spinlock held, or something like that. > > > > That's what I was afraid of. > > > > Unfortunately, I stand by my previous statements that we still probably want a LSM access check similar to what we currently do for ptrace. > > > > I would argue that once "LSM" enters this conversation, it just means > we're on the wrong track. Let's try to make this work without ptrace > if possible :) The whole seccomp() mechanism is very carefully > designed so that it's perfectly safe to install seccomp filters > without involving LSM or even involving credentials at all. In a last ditch effort to save the ptrace() interface: can we just only allow it when refcount_read(filter->usage) == 1? Tycho
On Fri, Oct 12, 2018 at 10:02 PM Tycho Andersen <tycho@tycho.ws> wrote: > On Thu, Oct 11, 2018 at 06:02:06PM -0700, Andy Lutomirski wrote: > > On Thu, Oct 11, 2018 at 4:10 PM Paul Moore <paul@paul-moore.com> wrote: > > > > > > On October 11, 2018 9:40:06 AM Jann Horn <jannh@google.com> wrote: > > > > On Thu, Oct 11, 2018 at 9:24 AM Paul Moore <paul@paul-moore.com> wrote: > > > >> On October 10, 2018 11:34:11 AM Jann Horn <jannh@google.com> wrote: > > > >>> On Wed, Oct 10, 2018 at 5:32 PM Paul Moore <paul@paul-moore.com> wrote: > > > >>>> On Tue, Oct 9, 2018 at 9:36 AM Jann Horn <jannh@google.com> wrote: > > > >>>>> +cc selinux people explicitly, since they probably have opinions on this > > > >>>> > > > >>>> I just spent about twenty minutes working my way through this thread, > > > >>>> and digging through the containers archive trying to get a good > > > >>>> understanding of what you guys are trying to do, and I'm not quite > > > >>>> sure I understand it all. However, from what I have seen, this > > > >>>> approach looks very ptrace-y to me (I imagine to others as well based > > > >>>> on the comments) and because of this I think ensuring the usual ptrace > > > >>>> access controls are evaluated, including the ptrace LSM hooks, is the > > > >>>> right thing to do. > > > >>> > > > >>> Basically the problem is that this new ptrace() API does something > > > >>> that doesn't just influence the target task, but also every other task > > > >>> that has the same seccomp filter. So the classic ptrace check doesn't > > > >>> work here. > > > >> > > > >> Due to some rather unfortunate events today I'm suddenly without easy access to the kernel code, but would it be possible to run the LSM ptrace access control checks against all of the affected tasks? If it is possible, how painful would it be? > > > > > > > > There are currently no backlinks from seccomp filters to the tasks > > > > that use them; the only thing you have is a refcount. If the refcount > > > > is 1, and the target task uses the filter directly (it is the last > > > > installed one), you'd be able to infer that the ptrace target is the > > > > only task with a reference to the filter, and you could just do the > > > > direct check; but if the refcount is >1, you might end up having to > > > > take some spinlock and then iterate over all tasks' filters with that > > > > spinlock held, or something like that. > > > > > > That's what I was afraid of. > > > > > > Unfortunately, I stand by my previous statements that we still probably want a LSM access check similar to what we currently do for ptrace. > > > > > > > I would argue that once "LSM" enters this conversation, it just means > > we're on the wrong track. Let's try to make this work without ptrace > > if possible :) The whole seccomp() mechanism is very carefully > > designed so that it's perfectly safe to install seccomp filters > > without involving LSM or even involving credentials at all. > > In a last ditch effort to save the ptrace() interface: can we just > only allow it when refcount_read(filter->usage) == 1? From a security perspective, I think that would be fine, assuming that we know that the target task is stopped. (But note that if the target process e.g. uses the filter on multiple threads, the refcount will be higher.)
On Fri, Oct 12, 2018 at 01:02:20PM -0700, Tycho Andersen wrote: > On Thu, Oct 11, 2018 at 06:02:06PM -0700, Andy Lutomirski wrote: > > On Thu, Oct 11, 2018 at 4:10 PM Paul Moore <paul@paul-moore.com> wrote: > > > > > > On October 11, 2018 9:40:06 AM Jann Horn <jannh@google.com> wrote: > > > > On Thu, Oct 11, 2018 at 9:24 AM Paul Moore <paul@paul-moore.com> wrote: > > > >> On October 10, 2018 11:34:11 AM Jann Horn <jannh@google.com> wrote: > > > >>> On Wed, Oct 10, 2018 at 5:32 PM Paul Moore <paul@paul-moore.com> wrote: > > > >>>> On Tue, Oct 9, 2018 at 9:36 AM Jann Horn <jannh@google.com> wrote: > > > >>>>> +cc selinux people explicitly, since they probably have opinions on this > > > >>>> > > > >>>> I just spent about twenty minutes working my way through this thread, > > > >>>> and digging through the containers archive trying to get a good > > > >>>> understanding of what you guys are trying to do, and I'm not quite > > > >>>> sure I understand it all. However, from what I have seen, this > > > >>>> approach looks very ptrace-y to me (I imagine to others as well based > > > >>>> on the comments) and because of this I think ensuring the usual ptrace > > > >>>> access controls are evaluated, including the ptrace LSM hooks, is the > > > >>>> right thing to do. > > > >>> > > > >>> Basically the problem is that this new ptrace() API does something > > > >>> that doesn't just influence the target task, but also every other task > > > >>> that has the same seccomp filter. So the classic ptrace check doesn't > > > >>> work here. > > > >> > > > >> Due to some rather unfortunate events today I'm suddenly without easy access to the kernel code, but would it be possible to run the LSM ptrace access control checks against all of the affected tasks? If it is possible, how painful would it be? > > > > > > > > There are currently no backlinks from seccomp filters to the tasks > > > > that use them; the only thing you have is a refcount. If the refcount > > > > is 1, and the target task uses the filter directly (it is the last > > > > installed one), you'd be able to infer that the ptrace target is the > > > > only task with a reference to the filter, and you could just do the > > > > direct check; but if the refcount is >1, you might end up having to > > > > take some spinlock and then iterate over all tasks' filters with that > > > > spinlock held, or something like that. > > > > > > That's what I was afraid of. > > > > > > Unfortunately, I stand by my previous statements that we still probably want a LSM access check similar to what we currently do for ptrace. > > > > > > > I would argue that once "LSM" enters this conversation, it just means > > we're on the wrong track. Let's try to make this work without ptrace > > if possible :) The whole seccomp() mechanism is very carefully > > designed so that it's perfectly safe to install seccomp filters > > without involving LSM or even involving credentials at all. > > In a last ditch effort to save the ptrace() interface: can we just > only allow it when refcount_read(filter->usage) == 1? I mean, the filter->usage == 1 means lets us get rid of capable(CAP_SYS_ADMIN) making the ptrace() way of getting an fd useable in nesting scenarios and from within user namespaces. That makes it a whole lot more useful and aligns it with the seccomp() way of getting the fd. So I wouldn't argue against it. I guess it comes down to (for me) whether you consider this a necessary part of this patchset aka meaning without it it wouldn't be useable. Christian
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index 017444b5efed..234c61b37405 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -83,6 +83,8 @@ static inline int seccomp_mode(struct seccomp *s) #ifdef CONFIG_SECCOMP_FILTER extern void put_seccomp_filter(struct task_struct *tsk); extern void get_seccomp_filter(struct task_struct *tsk); +extern long seccomp_new_listener(struct task_struct *task, + unsigned long filter_off); #else /* CONFIG_SECCOMP_FILTER */ static inline void put_seccomp_filter(struct task_struct *tsk) { @@ -92,6 +94,11 @@ static inline void get_seccomp_filter(struct task_struct *tsk) { return; } +static inline long seccomp_new_listener(struct task_struct *task, + unsigned long filter_off) +{ + return -EINVAL; +} #endif /* CONFIG_SECCOMP_FILTER */ #if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE) diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h index d5a1b8a492b9..e80ecb1bd427 100644 --- a/include/uapi/linux/ptrace.h +++ b/include/uapi/linux/ptrace.h @@ -73,6 +73,8 @@ struct seccomp_metadata { __u64 flags; /* Output: filter's flags */ }; +#define PTRACE_SECCOMP_NEW_LISTENER 0x420e + /* Read signals from a shared (process wide) queue */ #define PTRACE_PEEKSIGINFO_SHARED (1 << 0) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 21fec73d45d4..289960ac181b 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -1096,6 +1096,10 @@ int ptrace_request(struct task_struct *child, long request, ret = seccomp_get_metadata(child, addr, datavp); break; + case PTRACE_SECCOMP_NEW_LISTENER: + ret = seccomp_new_listener(child, addr); + break; + default: break; } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 44a31ac8373a..17685803a2af 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1777,4 +1777,35 @@ static struct file *init_listener(struct task_struct *task, return ret; } + +long seccomp_new_listener(struct task_struct *task, + unsigned long filter_off) +{ + struct seccomp_filter *filter; + struct file *listener; + int fd; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + filter = get_nth_filter(task, filter_off); + if (IS_ERR(filter)) + return PTR_ERR(filter); + + fd = get_unused_fd_flags(0); + if (fd < 0) { + __put_seccomp_filter(filter); + return fd; + } + + listener = init_listener(task, filter); + __put_seccomp_filter(filter); + if (IS_ERR(listener)) { + put_unused_fd(fd); + return PTR_ERR(listener); + } + + fd_install(fd, listener); + return fd; +} #endif diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 5f4b836a6792..c6ba3ed5392e 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -193,6 +193,10 @@ int seccomp(unsigned int op, unsigned int flags, void *args) } #endif +#ifndef PTRACE_SECCOMP_NEW_LISTENER +#define PTRACE_SECCOMP_NEW_LISTENER 0x420e +#endif + #if __BYTE_ORDER == __LITTLE_ENDIAN #define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n])) #elif __BYTE_ORDER == __BIG_ENDIAN @@ -3175,6 +3179,70 @@ TEST(get_user_notification_syscall) EXPECT_EQ(0, WEXITSTATUS(status)); } +TEST(get_user_notification_ptrace) +{ + pid_t pid; + int status, listener; + int sk_pair[2]; + char c; + struct seccomp_notif req = {}; + struct seccomp_notif_resp resp = {}; + + ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + EXPECT_EQ(user_trap_syscall(__NR_getpid, 0), 0); + + /* Test that we get ENOSYS while not attached */ + EXPECT_EQ(syscall(__NR_getpid), -1); + EXPECT_EQ(errno, ENOSYS); + + /* Signal we're ready and have installed the filter. */ + EXPECT_EQ(write(sk_pair[1], "J", 1), 1); + + EXPECT_EQ(read(sk_pair[1], &c, 1), 1); + EXPECT_EQ(c, 'H'); + + exit(syscall(__NR_getpid) != USER_NOTIF_MAGIC); + } + + EXPECT_EQ(read(sk_pair[0], &c, 1), 1); + EXPECT_EQ(c, 'J'); + + EXPECT_EQ(ptrace(PTRACE_ATTACH, pid), 0); + EXPECT_EQ(waitpid(pid, NULL, 0), pid); + listener = ptrace(PTRACE_SECCOMP_NEW_LISTENER, pid, 0); + EXPECT_GE(listener, 0); + + /* EBUSY for second listener */ + EXPECT_EQ(ptrace(PTRACE_SECCOMP_NEW_LISTENER, pid, 0), -1); + EXPECT_EQ(errno, EBUSY); + + EXPECT_EQ(ptrace(PTRACE_DETACH, pid, NULL, 0), 0); + + /* Now signal we are done and respond with magic */ + EXPECT_EQ(write(sk_pair[0], "H", 1), 1); + + req.len = sizeof(req); + EXPECT_EQ(ioctl(listener, SECCOMP_NOTIF_RECV, &req), sizeof(req)); + + resp.len = sizeof(resp); + resp.id = req.id; + resp.error = 0; + resp.val = USER_NOTIF_MAGIC; + + EXPECT_EQ(ioctl(listener, SECCOMP_NOTIF_SEND, &resp), sizeof(resp)); + + EXPECT_EQ(waitpid(pid, &status, 0), pid); + EXPECT_EQ(true, WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); + + close(listener); +} + /* * Check that a pid in a child namespace still shows up as valid in ours. */
As an alternative to SECCOMP_FILTER_FLAG_GET_LISTENER, perhaps a ptrace() version which can acquire filters is useful. There are at least two reasons this is preferable, even though it uses ptrace: 1. You can control tasks that aren't cooperating with you 2. You can control tasks whose filters block sendmsg() and socket(); if the task installs a filter which blocks these calls, there's no way with SECCOMP_FILTER_FLAG_GET_LISTENER to get the fd out to the privileged task. v2: fix a bug where listener mode was not unset when an unused fd was not available v3: fix refcounting bug (Oleg) v4: * change the listener's fd flags to be 0 * rename GET_LISTENER to NEW_LISTENER (Matthew) v5: * add capable(CAP_SYS_ADMIN) requirement v7: * point the new listener at the right filter (Jann) Signed-off-by: Tycho Andersen <tycho@tycho.ws> CC: Kees Cook <keescook@chromium.org> CC: Andy Lutomirski <luto@amacapital.net> CC: Oleg Nesterov <oleg@redhat.com> CC: Eric W. Biederman <ebiederm@xmission.com> CC: "Serge E. Hallyn" <serge@hallyn.com> CC: Christian Brauner <christian.brauner@ubuntu.com> CC: Tyler Hicks <tyhicks@canonical.com> CC: Akihiro Suda <suda.akihiro@lab.ntt.co.jp> --- include/linux/seccomp.h | 7 ++ include/uapi/linux/ptrace.h | 2 + kernel/ptrace.c | 4 ++ kernel/seccomp.c | 31 +++++++++ tools/testing/selftests/seccomp/seccomp_bpf.c | 68 +++++++++++++++++++ 5 files changed, 112 insertions(+)