diff mbox

[v6,6/9] seccomp: add "seccomp" syscall

Message ID 1402457121-8410-7-git-send-email-keescook@chromium.org (mailing list archive)
State New, archived
Headers show

Commit Message

Kees Cook June 11, 2014, 3:25 a.m. UTC
This adds the new "seccomp" syscall with both an "operation" and "flags"
parameter for future expansion. The third argument is a pointer value,
used with the SECCOMP_SET_MODE_FILTER operation. Currently, flags must
be 0. This is functionally equivalent to prctl(PR_SET_SECCOMP, ...).

Signed-off-by: Kees Cook <keescook@chromium.org>
Cc: linux-api@vger.kernel.org
---
 arch/x86/syscalls/syscall_32.tbl  |    1 +
 arch/x86/syscalls/syscall_64.tbl  |    1 +
 include/linux/syscalls.h          |    2 ++
 include/uapi/asm-generic/unistd.h |    4 ++-
 include/uapi/linux/seccomp.h      |    4 +++
 kernel/seccomp.c                  |   63 ++++++++++++++++++++++++++++++++-----
 kernel/sys_ni.c                   |    3 ++
 7 files changed, 69 insertions(+), 9 deletions(-)

Comments

Andy Lutomirski June 13, 2014, 8:41 p.m. UTC | #1
On Tue, Jun 10, 2014 at 8:25 PM, Kees Cook <keescook@chromium.org> wrote:
> This adds the new "seccomp" syscall with both an "operation" and "flags"
> parameter for future expansion. The third argument is a pointer value,
> used with the SECCOMP_SET_MODE_FILTER operation. Currently, flags must
> be 0. This is functionally equivalent to prctl(PR_SET_SECCOMP, ...).

Question for the linux-abi people:

What's the preferred way to do this these days?  This syscall is a
general purpose "adjust the seccomp state" thing.  The alternative
would be a specific new syscall to add a filter with a flags argument.

--Andy
Alexei Starovoitov June 13, 2014, 9:22 p.m. UTC | #2
On Tue, Jun 10, 2014 at 8:25 PM, Kees Cook <keescook@chromium.org> wrote:
> This adds the new "seccomp" syscall with both an "operation" and "flags"
> parameter for future expansion. The third argument is a pointer value,
> used with the SECCOMP_SET_MODE_FILTER operation. Currently, flags must
> be 0. This is functionally equivalent to prctl(PR_SET_SECCOMP, ...).
>
> Signed-off-by: Kees Cook <keescook@chromium.org>
> Cc: linux-api@vger.kernel.org
> ---
>  arch/x86/syscalls/syscall_32.tbl  |    1 +
>  arch/x86/syscalls/syscall_64.tbl  |    1 +
>  include/linux/syscalls.h          |    2 ++
>  include/uapi/asm-generic/unistd.h |    4 ++-
>  include/uapi/linux/seccomp.h      |    4 +++
>  kernel/seccomp.c                  |   63 ++++++++++++++++++++++++++++++++-----
>  kernel/sys_ni.c                   |    3 ++
>  7 files changed, 69 insertions(+), 9 deletions(-)
>
> diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
> index d6b867921612..7527eac24122 100644
> --- a/arch/x86/syscalls/syscall_32.tbl
> +++ b/arch/x86/syscalls/syscall_32.tbl
> @@ -360,3 +360,4 @@
>  351    i386    sched_setattr           sys_sched_setattr
>  352    i386    sched_getattr           sys_sched_getattr
>  353    i386    renameat2               sys_renameat2
> +354    i386    seccomp                 sys_seccomp
> diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
> index ec255a1646d2..16272a6c12b7 100644
> --- a/arch/x86/syscalls/syscall_64.tbl
> +++ b/arch/x86/syscalls/syscall_64.tbl
> @@ -323,6 +323,7 @@
>  314    common  sched_setattr           sys_sched_setattr
>  315    common  sched_getattr           sys_sched_getattr
>  316    common  renameat2               sys_renameat2
> +317    common  seccomp                 sys_seccomp
>
>  #
>  # x32-specific system call numbers start at 512 to avoid cache impact
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index b0881a0ed322..1713977ee26f 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -866,4 +866,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
>  asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
>                          unsigned long idx1, unsigned long idx2);
>  asmlinkage long sys_finit_module(int fd, const char __user *uargs, int flags);
> +asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
> +                           const char __user *uargs);

It looks odd to add 'flags' argument to syscall that is not even used.
It don't think it will be extensible this way.
'uargs' is used only in 2nd command as well and it's not 'char __user *'
but rather 'struct sock_fprog __user *'
I think it makes more sense to define only first argument as 'int op' and the
rest as variable length array.
Something like:
long sys_seccomp(unsigned int op, struct nlattr *attrs, int len);
then different commands can interpret 'attrs' differently.
if op == mode_strict, then attrs == NULL, len == 0
if op == mode_filter, then attrs->nla_type == seccomp_bpf_filter
and nla_data(attrs) is 'struct sock_fprog'
If we decide to add new types of filters or new commands, the syscall prototype
won't need to change. New commands can be added preserving backward
compatibility.
The basic TLV concept has been around forever in netlink world. imo makes
sense to use it with new syscalls. Passing 'struct xxx' into syscalls
is the thing
of the past. TLV style is more extensible. Fields of structures can become
optional in the future, new fields added, etc.
'struct nlattr' brings the same benefits to kernel api as protobuf did
to user land.

>  #endif
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index 333640608087..65acbf0e2867 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -699,9 +699,11 @@ __SYSCALL(__NR_sched_setattr, sys_sched_setattr)
>  __SYSCALL(__NR_sched_getattr, sys_sched_getattr)
>  #define __NR_renameat2 276
>  __SYSCALL(__NR_renameat2, sys_renameat2)
> +#define __NR_seccomp 277
> +__SYSCALL(__NR_seccomp, sys_seccomp)
>
>  #undef __NR_syscalls
> -#define __NR_syscalls 277
> +#define __NR_syscalls 278
>
>  /*
>   * All syscalls below here should go away really,
> diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
> index ac2dc9f72973..b258878ba754 100644
> --- a/include/uapi/linux/seccomp.h
> +++ b/include/uapi/linux/seccomp.h
> @@ -10,6 +10,10 @@
>  #define SECCOMP_MODE_STRICT    1 /* uses hard-coded filter. */
>  #define SECCOMP_MODE_FILTER    2 /* uses user-supplied filter. */
>
> +/* Valid operations for seccomp syscall. */
> +#define SECCOMP_SET_MODE_STRICT        0
> +#define SECCOMP_SET_MODE_FILTER        1
> +
>  /*
>   * All BPF programs must return a 32-bit value.
>   * The bottom 16-bits are for optional return data.
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index 39d32c2904fc..c0cafa9e84af 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -19,6 +19,7 @@
>  #include <linux/sched.h>
>  #include <linux/seccomp.h>
>  #include <linux/slab.h>
> +#include <linux/syscalls.h>
>
>  /* #define SECCOMP_DEBUG 1 */
>
> @@ -301,8 +302,8 @@ free_prog:
>   *
>   * Returns filter on success and ERR_PTR otherwise.
>   */
> -static
> -struct seccomp_filter *seccomp_prepare_user_filter(char __user *user_filter)
> +static struct seccomp_filter *
> +seccomp_prepare_user_filter(const char __user *user_filter)
>  {
>         struct sock_fprog fprog;
>         struct seccomp_filter *filter = ERR_PTR(-EFAULT);
> @@ -325,19 +326,25 @@ out:
>
>  /**
>   * seccomp_attach_filter: validate and attach filter
> + * @flags:  flags to change filter behavior
>   * @filter: seccomp filter to add to the current process
>   *
>   * Caller must be holding current->sighand->siglock lock.
>   *
>   * Returns 0 on success, -ve on error.
>   */
> -static long seccomp_attach_filter(struct seccomp_filter *filter)
> +static long seccomp_attach_filter(unsigned int flags,
> +                                 struct seccomp_filter *filter)
>  {
>         unsigned long total_insns;
>         struct seccomp_filter *walker;
>
>         BUG_ON(!spin_is_locked(&current->sighand->siglock));
>
> +       /* Validate flags. */
> +       if (flags != 0)
> +               return -EINVAL;
> +
>         /* Validate resulting filter length. */
>         total_insns = filter->len;
>         for (walker = current->seccomp.filter; walker; walker = filter->prev)
> @@ -541,6 +548,7 @@ out:
>  #ifdef CONFIG_SECCOMP_FILTER
>  /**
>   * seccomp_set_mode_filter: internal function for setting seccomp filter
> + * @flags:  flags to change filter behavior
>   * @filter: struct sock_fprog containing filter
>   *
>   * This function may be called repeatedly to install additional filters.
> @@ -551,7 +559,8 @@ out:
>   *
>   * Returns 0 on success or -EINVAL on failure.
>   */
> -static long seccomp_set_mode_filter(char __user *filter)
> +static long seccomp_set_mode_filter(unsigned int flags,
> +                                   const char __user *filter)
>  {
>         const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
>         struct seccomp_filter *prepared = NULL;
> @@ -569,7 +578,7 @@ static long seccomp_set_mode_filter(char __user *filter)
>         if (!seccomp_check_mode(current, seccomp_mode))
>                 goto out;
>
> -       ret = seccomp_attach_filter(prepared);
> +       ret = seccomp_attach_filter(flags, prepared);
>         if (ret)
>                 goto out;
>         /* Do not free the successfully attached filter. */
> @@ -583,12 +592,35 @@ out_free:
>         return ret;
>  }
>  #else
> -static inline long seccomp_set_mode_filter(char __user *filter)
> +static inline long seccomp_set_mode_filter(unsigned int flags,
> +                                          const char __user *filter)
>  {
>         return -EINVAL;
>  }
>  #endif
>
> +/* Common entry point for both prctl and syscall. */
> +static long do_seccomp(unsigned int op, unsigned int flags,
> +                      const char __user *uargs)
> +{
> +       switch (op) {
> +       case SECCOMP_SET_MODE_STRICT:
> +               if (flags != 0 || uargs != NULL)
> +                       return -EINVAL;
> +               return seccomp_set_mode_strict();
> +       case SECCOMP_SET_MODE_FILTER:
> +               return seccomp_set_mode_filter(flags, uargs);
> +       default:
> +               return -EINVAL;
> +       }
> +}
> +
> +SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags,
> +                        const char __user *, uargs)
> +{
> +       return do_seccomp(op, flags, uargs);
> +}
> +
>  /**
>   * prctl_set_seccomp: configures current->seccomp.mode
>   * @seccomp_mode: requested mode to use
> @@ -598,12 +630,27 @@ static inline long seccomp_set_mode_filter(char __user *filter)
>   */
>  long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
>  {
> +       unsigned int op;
> +       char __user *uargs;
> +
>         switch (seccomp_mode) {
>         case SECCOMP_MODE_STRICT:
> -               return seccomp_set_mode_strict();
> +               op = SECCOMP_SET_MODE_STRICT;
> +               /*
> +                * Setting strict mode through prctl always ignored filter,
> +                * so make sure it is always NULL here to pass the internal
> +                * check in do_seccomp().
> +                */
> +               uargs = NULL;
> +               break;
>         case SECCOMP_MODE_FILTER:
> -               return seccomp_set_mode_filter(filter);
> +               op = SECCOMP_SET_MODE_FILTER;
> +               uargs = filter;
> +               break;
>         default:
>                 return -EINVAL;
>         }
> +
> +       /* prctl interface doesn't have flags, so they are always zero. */
> +       return do_seccomp(op, 0, uargs);
>  }
> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> index 36441b51b5df..2904a2105914 100644
> --- a/kernel/sys_ni.c
> +++ b/kernel/sys_ni.c
> @@ -213,3 +213,6 @@ cond_syscall(compat_sys_open_by_handle_at);
>
>  /* compare kernel pointers */
>  cond_syscall(sys_kcmp);
> +
> +/* operate on Secure Computing state */
> +cond_syscall(sys_seccomp);
> --
> 1.7.9.5
>
Andy Lutomirski June 13, 2014, 9:25 p.m. UTC | #3
On Fri, Jun 13, 2014 at 2:22 PM, Alexei Starovoitov <ast@plumgrid.com> wrote:
> On Tue, Jun 10, 2014 at 8:25 PM, Kees Cook <keescook@chromium.org> wrote:
>> This adds the new "seccomp" syscall with both an "operation" and "flags"
>> parameter for future expansion. The third argument is a pointer value,
>> used with the SECCOMP_SET_MODE_FILTER operation. Currently, flags must
>> be 0. This is functionally equivalent to prctl(PR_SET_SECCOMP, ...).
>>
>> Signed-off-by: Kees Cook <keescook@chromium.org>
>> Cc: linux-api@vger.kernel.org
>> ---
>>  arch/x86/syscalls/syscall_32.tbl  |    1 +
>>  arch/x86/syscalls/syscall_64.tbl  |    1 +
>>  include/linux/syscalls.h          |    2 ++
>>  include/uapi/asm-generic/unistd.h |    4 ++-
>>  include/uapi/linux/seccomp.h      |    4 +++
>>  kernel/seccomp.c                  |   63 ++++++++++++++++++++++++++++++++-----
>>  kernel/sys_ni.c                   |    3 ++
>>  7 files changed, 69 insertions(+), 9 deletions(-)
>>
>> diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
>> index d6b867921612..7527eac24122 100644
>> --- a/arch/x86/syscalls/syscall_32.tbl
>> +++ b/arch/x86/syscalls/syscall_32.tbl
>> @@ -360,3 +360,4 @@
>>  351    i386    sched_setattr           sys_sched_setattr
>>  352    i386    sched_getattr           sys_sched_getattr
>>  353    i386    renameat2               sys_renameat2
>> +354    i386    seccomp                 sys_seccomp
>> diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
>> index ec255a1646d2..16272a6c12b7 100644
>> --- a/arch/x86/syscalls/syscall_64.tbl
>> +++ b/arch/x86/syscalls/syscall_64.tbl
>> @@ -323,6 +323,7 @@
>>  314    common  sched_setattr           sys_sched_setattr
>>  315    common  sched_getattr           sys_sched_getattr
>>  316    common  renameat2               sys_renameat2
>> +317    common  seccomp                 sys_seccomp
>>
>>  #
>>  # x32-specific system call numbers start at 512 to avoid cache impact
>> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
>> index b0881a0ed322..1713977ee26f 100644
>> --- a/include/linux/syscalls.h
>> +++ b/include/linux/syscalls.h
>> @@ -866,4 +866,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
>>  asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
>>                          unsigned long idx1, unsigned long idx2);
>>  asmlinkage long sys_finit_module(int fd, const char __user *uargs, int flags);
>> +asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
>> +                           const char __user *uargs);
>
> It looks odd to add 'flags' argument to syscall that is not even used.
> It don't think it will be extensible this way.
> 'uargs' is used only in 2nd command as well and it's not 'char __user *'
> but rather 'struct sock_fprog __user *'
> I think it makes more sense to define only first argument as 'int op' and the
> rest as variable length array.
> Something like:
> long sys_seccomp(unsigned int op, struct nlattr *attrs, int len);
> then different commands can interpret 'attrs' differently.
> if op == mode_strict, then attrs == NULL, len == 0
> if op == mode_filter, then attrs->nla_type == seccomp_bpf_filter
> and nla_data(attrs) is 'struct sock_fprog'

Eww.  If the operation doesn't imply the type, then I think we've
totally screwed up.

> If we decide to add new types of filters or new commands, the syscall prototype
> won't need to change. New commands can be added preserving backward
> compatibility.
> The basic TLV concept has been around forever in netlink world. imo makes
> sense to use it with new syscalls. Passing 'struct xxx' into syscalls
> is the thing
> of the past. TLV style is more extensible. Fields of structures can become
> optional in the future, new fields added, etc.
> 'struct nlattr' brings the same benefits to kernel api as protobuf did
> to user land.

I see no reason to bring nl_attr into this.

Admittedly, I've never dealt with nl_attr, but everything
netlink-related I've even been involved in has involved some sort of
API atrocity.

--Andy
Alexei Starovoitov June 13, 2014, 9:37 p.m. UTC | #4
On Fri, Jun 13, 2014 at 2:25 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Fri, Jun 13, 2014 at 2:22 PM, Alexei Starovoitov <ast@plumgrid.com> wrote:
>> On Tue, Jun 10, 2014 at 8:25 PM, Kees Cook <keescook@chromium.org> wrote:
>>> This adds the new "seccomp" syscall with both an "operation" and "flags"
>>> parameter for future expansion. The third argument is a pointer value,
>>> used with the SECCOMP_SET_MODE_FILTER operation. Currently, flags must
>>> be 0. This is functionally equivalent to prctl(PR_SET_SECCOMP, ...).
>>>
>>> Signed-off-by: Kees Cook <keescook@chromium.org>
>>> Cc: linux-api@vger.kernel.org
>>> ---
>>>  arch/x86/syscalls/syscall_32.tbl  |    1 +
>>>  arch/x86/syscalls/syscall_64.tbl  |    1 +
>>>  include/linux/syscalls.h          |    2 ++
>>>  include/uapi/asm-generic/unistd.h |    4 ++-
>>>  include/uapi/linux/seccomp.h      |    4 +++
>>>  kernel/seccomp.c                  |   63 ++++++++++++++++++++++++++++++++-----
>>>  kernel/sys_ni.c                   |    3 ++
>>>  7 files changed, 69 insertions(+), 9 deletions(-)
>>>
>>> diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
>>> index d6b867921612..7527eac24122 100644
>>> --- a/arch/x86/syscalls/syscall_32.tbl
>>> +++ b/arch/x86/syscalls/syscall_32.tbl
>>> @@ -360,3 +360,4 @@
>>>  351    i386    sched_setattr           sys_sched_setattr
>>>  352    i386    sched_getattr           sys_sched_getattr
>>>  353    i386    renameat2               sys_renameat2
>>> +354    i386    seccomp                 sys_seccomp
>>> diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
>>> index ec255a1646d2..16272a6c12b7 100644
>>> --- a/arch/x86/syscalls/syscall_64.tbl
>>> +++ b/arch/x86/syscalls/syscall_64.tbl
>>> @@ -323,6 +323,7 @@
>>>  314    common  sched_setattr           sys_sched_setattr
>>>  315    common  sched_getattr           sys_sched_getattr
>>>  316    common  renameat2               sys_renameat2
>>> +317    common  seccomp                 sys_seccomp
>>>
>>>  #
>>>  # x32-specific system call numbers start at 512 to avoid cache impact
>>> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
>>> index b0881a0ed322..1713977ee26f 100644
>>> --- a/include/linux/syscalls.h
>>> +++ b/include/linux/syscalls.h
>>> @@ -866,4 +866,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
>>>  asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
>>>                          unsigned long idx1, unsigned long idx2);
>>>  asmlinkage long sys_finit_module(int fd, const char __user *uargs, int flags);
>>> +asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
>>> +                           const char __user *uargs);
>>
>> It looks odd to add 'flags' argument to syscall that is not even used.
>> It don't think it will be extensible this way.
>> 'uargs' is used only in 2nd command as well and it's not 'char __user *'
>> but rather 'struct sock_fprog __user *'
>> I think it makes more sense to define only first argument as 'int op' and the
>> rest as variable length array.
>> Something like:
>> long sys_seccomp(unsigned int op, struct nlattr *attrs, int len);
>> then different commands can interpret 'attrs' differently.
>> if op == mode_strict, then attrs == NULL, len == 0
>> if op == mode_filter, then attrs->nla_type == seccomp_bpf_filter
>> and nla_data(attrs) is 'struct sock_fprog'
>
> Eww.  If the operation doesn't imply the type, then I think we've
> totally screwed up.
>
>> If we decide to add new types of filters or new commands, the syscall prototype
>> won't need to change. New commands can be added preserving backward
>> compatibility.
>> The basic TLV concept has been around forever in netlink world. imo makes
>> sense to use it with new syscalls. Passing 'struct xxx' into syscalls
>> is the thing
>> of the past. TLV style is more extensible. Fields of structures can become
>> optional in the future, new fields added, etc.
>> 'struct nlattr' brings the same benefits to kernel api as protobuf did
>> to user land.
>
> I see no reason to bring nl_attr into this.
>
> Admittedly, I've never dealt with nl_attr, but everything
> netlink-related I've even been involved in has involved some sort of
> API atrocity.

netlink has a lot of legacy and there is genetlink which is not pretty
either because of extra socket creation, binding, dealing with packet
loss issues, but the key concept of variable length encoding is sound.
Right now seccomp has two commands and they already don't fit
into single syscall neatly. Are you saying there should be two syscalls
here? What about another seccomp related command? Another syscall?
imo all seccomp related commands needs to be mux/demux-ed under
one syscall. What is the way to mux/demux potentially very different
commands under one syscall? I cannot think of anything better than
TLV style. 'struct nlattr' is what we have today and I think it works fine.
I'm not suggesting to bring the whole netlink into the picture, but rather
TLV style of encoding different arguments for different commands.
Andy Lutomirski June 13, 2014, 9:42 p.m. UTC | #5
On Fri, Jun 13, 2014 at 2:37 PM, Alexei Starovoitov <ast@plumgrid.com> wrote:
> On Fri, Jun 13, 2014 at 2:25 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>> On Fri, Jun 13, 2014 at 2:22 PM, Alexei Starovoitov <ast@plumgrid.com> wrote:
>>> On Tue, Jun 10, 2014 at 8:25 PM, Kees Cook <keescook@chromium.org> wrote:
>>>> This adds the new "seccomp" syscall with both an "operation" and "flags"
>>>> parameter for future expansion. The third argument is a pointer value,
>>>> used with the SECCOMP_SET_MODE_FILTER operation. Currently, flags must
>>>> be 0. This is functionally equivalent to prctl(PR_SET_SECCOMP, ...).
>>>>
>>>> Signed-off-by: Kees Cook <keescook@chromium.org>
>>>> Cc: linux-api@vger.kernel.org
>>>> ---
>>>>  arch/x86/syscalls/syscall_32.tbl  |    1 +
>>>>  arch/x86/syscalls/syscall_64.tbl  |    1 +
>>>>  include/linux/syscalls.h          |    2 ++
>>>>  include/uapi/asm-generic/unistd.h |    4 ++-
>>>>  include/uapi/linux/seccomp.h      |    4 +++
>>>>  kernel/seccomp.c                  |   63 ++++++++++++++++++++++++++++++++-----
>>>>  kernel/sys_ni.c                   |    3 ++
>>>>  7 files changed, 69 insertions(+), 9 deletions(-)
>>>>
>>>> diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
>>>> index d6b867921612..7527eac24122 100644
>>>> --- a/arch/x86/syscalls/syscall_32.tbl
>>>> +++ b/arch/x86/syscalls/syscall_32.tbl
>>>> @@ -360,3 +360,4 @@
>>>>  351    i386    sched_setattr           sys_sched_setattr
>>>>  352    i386    sched_getattr           sys_sched_getattr
>>>>  353    i386    renameat2               sys_renameat2
>>>> +354    i386    seccomp                 sys_seccomp
>>>> diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
>>>> index ec255a1646d2..16272a6c12b7 100644
>>>> --- a/arch/x86/syscalls/syscall_64.tbl
>>>> +++ b/arch/x86/syscalls/syscall_64.tbl
>>>> @@ -323,6 +323,7 @@
>>>>  314    common  sched_setattr           sys_sched_setattr
>>>>  315    common  sched_getattr           sys_sched_getattr
>>>>  316    common  renameat2               sys_renameat2
>>>> +317    common  seccomp                 sys_seccomp
>>>>
>>>>  #
>>>>  # x32-specific system call numbers start at 512 to avoid cache impact
>>>> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
>>>> index b0881a0ed322..1713977ee26f 100644
>>>> --- a/include/linux/syscalls.h
>>>> +++ b/include/linux/syscalls.h
>>>> @@ -866,4 +866,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
>>>>  asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
>>>>                          unsigned long idx1, unsigned long idx2);
>>>>  asmlinkage long sys_finit_module(int fd, const char __user *uargs, int flags);
>>>> +asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
>>>> +                           const char __user *uargs);
>>>
>>> It looks odd to add 'flags' argument to syscall that is not even used.
>>> It don't think it will be extensible this way.
>>> 'uargs' is used only in 2nd command as well and it's not 'char __user *'
>>> but rather 'struct sock_fprog __user *'
>>> I think it makes more sense to define only first argument as 'int op' and the
>>> rest as variable length array.
>>> Something like:
>>> long sys_seccomp(unsigned int op, struct nlattr *attrs, int len);
>>> then different commands can interpret 'attrs' differently.
>>> if op == mode_strict, then attrs == NULL, len == 0
>>> if op == mode_filter, then attrs->nla_type == seccomp_bpf_filter
>>> and nla_data(attrs) is 'struct sock_fprog'
>>
>> Eww.  If the operation doesn't imply the type, then I think we've
>> totally screwed up.
>>
>>> If we decide to add new types of filters or new commands, the syscall prototype
>>> won't need to change. New commands can be added preserving backward
>>> compatibility.
>>> The basic TLV concept has been around forever in netlink world. imo makes
>>> sense to use it with new syscalls. Passing 'struct xxx' into syscalls
>>> is the thing
>>> of the past. TLV style is more extensible. Fields of structures can become
>>> optional in the future, new fields added, etc.
>>> 'struct nlattr' brings the same benefits to kernel api as protobuf did
>>> to user land.
>>
>> I see no reason to bring nl_attr into this.
>>
>> Admittedly, I've never dealt with nl_attr, but everything
>> netlink-related I've even been involved in has involved some sort of
>> API atrocity.
>
> netlink has a lot of legacy and there is genetlink which is not pretty
> either because of extra socket creation, binding, dealing with packet
> loss issues, but the key concept of variable length encoding is sound.
> Right now seccomp has two commands and they already don't fit
> into single syscall neatly. Are you saying there should be two syscalls
> here? What about another seccomp related command? Another syscall?
> imo all seccomp related commands needs to be mux/demux-ed under
> one syscall. What is the way to mux/demux potentially very different
> commands under one syscall? I cannot think of anything better than
> TLV style. 'struct nlattr' is what we have today and I think it works fine.
> I'm not suggesting to bring the whole netlink into the picture, but rather
> TLV style of encoding different arguments for different commands.

I'm unconvinced.  These are simple commands, and I think the interface
should be simple.  Syscalls are cheap.

As an example, the interface could be:

int seccomp_add_filter(const struct sock_fprog *filter, unsigned int flags);

The "tsync" operation would be seccomp_add_filter(NULL,
SECCOMP_ADD_FILTER_TSYNC) -- it's equivalent to adding an
always-accept filter and syncing threads.

But, frankly, this kind of stuff should probably be "do operation X".
IIUC nl_attr is more like "do something, with these tags and values",
which results in oddities like whatever should happen of more than one
tag is set.

--Andy
Kees Cook June 13, 2014, 9:53 p.m. UTC | #6
On Fri, Jun 13, 2014 at 2:22 PM, Alexei Starovoitov <ast@plumgrid.com> wrote:
> On Tue, Jun 10, 2014 at 8:25 PM, Kees Cook <keescook@chromium.org> wrote:
>> This adds the new "seccomp" syscall with both an "operation" and "flags"
>> parameter for future expansion. The third argument is a pointer value,
>> used with the SECCOMP_SET_MODE_FILTER operation. Currently, flags must
>> be 0. This is functionally equivalent to prctl(PR_SET_SECCOMP, ...).
>>
>> Signed-off-by: Kees Cook <keescook@chromium.org>
>> Cc: linux-api@vger.kernel.org
>> ---
>> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
>> index b0881a0ed322..1713977ee26f 100644
>> --- a/include/linux/syscalls.h
>> +++ b/include/linux/syscalls.h
>> @@ -866,4 +866,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
>>  asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
>>                          unsigned long idx1, unsigned long idx2);
>>  asmlinkage long sys_finit_module(int fd, const char __user *uargs, int flags);
>> +asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
>> +                           const char __user *uargs);
>
> It looks odd to add 'flags' argument to syscall that is not even used.

FWIW, "flags" is given use in the next patch to support the tsync option.

-Kees
Kees Cook June 13, 2014, 10:01 p.m. UTC | #7
On Fri, Jun 13, 2014 at 2:42 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Fri, Jun 13, 2014 at 2:37 PM, Alexei Starovoitov <ast@plumgrid.com> wrote:
>> On Fri, Jun 13, 2014 at 2:25 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>>> On Fri, Jun 13, 2014 at 2:22 PM, Alexei Starovoitov <ast@plumgrid.com> wrote:
>>>> On Tue, Jun 10, 2014 at 8:25 PM, Kees Cook <keescook@chromium.org> wrote:
>>>>> This adds the new "seccomp" syscall with both an "operation" and "flags"
>>>>> parameter for future expansion. The third argument is a pointer value,
>>>>> used with the SECCOMP_SET_MODE_FILTER operation. Currently, flags must
>>>>> be 0. This is functionally equivalent to prctl(PR_SET_SECCOMP, ...).
>>>>>
>>>>> Signed-off-by: Kees Cook <keescook@chromium.org>
>>>>> Cc: linux-api@vger.kernel.org
>>>>> ---
>>>>>  arch/x86/syscalls/syscall_32.tbl  |    1 +
>>>>>  arch/x86/syscalls/syscall_64.tbl  |    1 +
>>>>>  include/linux/syscalls.h          |    2 ++
>>>>>  include/uapi/asm-generic/unistd.h |    4 ++-
>>>>>  include/uapi/linux/seccomp.h      |    4 +++
>>>>>  kernel/seccomp.c                  |   63 ++++++++++++++++++++++++++++++++-----
>>>>>  kernel/sys_ni.c                   |    3 ++
>>>>>  7 files changed, 69 insertions(+), 9 deletions(-)
>>>>>
>>>>> diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
>>>>> index d6b867921612..7527eac24122 100644
>>>>> --- a/arch/x86/syscalls/syscall_32.tbl
>>>>> +++ b/arch/x86/syscalls/syscall_32.tbl
>>>>> @@ -360,3 +360,4 @@
>>>>>  351    i386    sched_setattr           sys_sched_setattr
>>>>>  352    i386    sched_getattr           sys_sched_getattr
>>>>>  353    i386    renameat2               sys_renameat2
>>>>> +354    i386    seccomp                 sys_seccomp
>>>>> diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
>>>>> index ec255a1646d2..16272a6c12b7 100644
>>>>> --- a/arch/x86/syscalls/syscall_64.tbl
>>>>> +++ b/arch/x86/syscalls/syscall_64.tbl
>>>>> @@ -323,6 +323,7 @@
>>>>>  314    common  sched_setattr           sys_sched_setattr
>>>>>  315    common  sched_getattr           sys_sched_getattr
>>>>>  316    common  renameat2               sys_renameat2
>>>>> +317    common  seccomp                 sys_seccomp
>>>>>
>>>>>  #
>>>>>  # x32-specific system call numbers start at 512 to avoid cache impact
>>>>> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
>>>>> index b0881a0ed322..1713977ee26f 100644
>>>>> --- a/include/linux/syscalls.h
>>>>> +++ b/include/linux/syscalls.h
>>>>> @@ -866,4 +866,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
>>>>>  asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
>>>>>                          unsigned long idx1, unsigned long idx2);
>>>>>  asmlinkage long sys_finit_module(int fd, const char __user *uargs, int flags);
>>>>> +asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
>>>>> +                           const char __user *uargs);
>>>>
>>>> It looks odd to add 'flags' argument to syscall that is not even used.
>>>> It don't think it will be extensible this way.
>>>> 'uargs' is used only in 2nd command as well and it's not 'char __user *'
>>>> but rather 'struct sock_fprog __user *'
>>>> I think it makes more sense to define only first argument as 'int op' and the
>>>> rest as variable length array.
>>>> Something like:
>>>> long sys_seccomp(unsigned int op, struct nlattr *attrs, int len);
>>>> then different commands can interpret 'attrs' differently.
>>>> if op == mode_strict, then attrs == NULL, len == 0
>>>> if op == mode_filter, then attrs->nla_type == seccomp_bpf_filter
>>>> and nla_data(attrs) is 'struct sock_fprog'
>>>
>>> Eww.  If the operation doesn't imply the type, then I think we've
>>> totally screwed up.
>>>
>>>> If we decide to add new types of filters or new commands, the syscall prototype
>>>> won't need to change. New commands can be added preserving backward
>>>> compatibility.
>>>> The basic TLV concept has been around forever in netlink world. imo makes
>>>> sense to use it with new syscalls. Passing 'struct xxx' into syscalls
>>>> is the thing
>>>> of the past. TLV style is more extensible. Fields of structures can become
>>>> optional in the future, new fields added, etc.
>>>> 'struct nlattr' brings the same benefits to kernel api as protobuf did
>>>> to user land.
>>>
>>> I see no reason to bring nl_attr into this.
>>>
>>> Admittedly, I've never dealt with nl_attr, but everything
>>> netlink-related I've even been involved in has involved some sort of
>>> API atrocity.
>>
>> netlink has a lot of legacy and there is genetlink which is not pretty
>> either because of extra socket creation, binding, dealing with packet
>> loss issues, but the key concept of variable length encoding is sound.
>> Right now seccomp has two commands and they already don't fit
>> into single syscall neatly. Are you saying there should be two syscalls
>> here? What about another seccomp related command? Another syscall?
>> imo all seccomp related commands needs to be mux/demux-ed under
>> one syscall. What is the way to mux/demux potentially very different
>> commands under one syscall? I cannot think of anything better than
>> TLV style. 'struct nlattr' is what we have today and I think it works fine.
>> I'm not suggesting to bring the whole netlink into the picture, but rather
>> TLV style of encoding different arguments for different commands.
>
> I'm unconvinced.  These are simple commands, and I think the interface
> should be simple.  Syscalls are cheap.
>
> As an example, the interface could be:
>
> int seccomp_add_filter(const struct sock_fprog *filter, unsigned int flags);
>
> The "tsync" operation would be seccomp_add_filter(NULL,
> SECCOMP_ADD_FILTER_TSYNC) -- it's equivalent to adding an
> always-accept filter and syncing threads.

I think you convinced me that tsync should be part of adding a filter
(since now there are no failure side-effects), so this specific
example I would expect EFAULT from. But ...

>
> But, frankly, this kind of stuff should probably be "do operation X".
> IIUC nl_attr is more like "do something, with these tags and values",
> which results in oddities like whatever should happen of more than one
> tag is set.

I have no objection to eliminating seccomp-strict from the syscall,
and just making this the "add seccomp filter" syscall. My only
hesitation would be that if we need something besides adding a filter
in the future, we'd be back to extending this awkwardly or adding
another syscall. That's why I went with the "operation" argument. I'm
not opposed to passing attrs and len, but seccomp_add_filter does feel
cleaner.

-Kees
Alexei Starovoitov June 13, 2014, 10:26 p.m. UTC | #8
On Fri, Jun 13, 2014 at 2:42 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Fri, Jun 13, 2014 at 2:37 PM, Alexei Starovoitov <ast@plumgrid.com> wrote:
>> On Fri, Jun 13, 2014 at 2:25 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>>> On Fri, Jun 13, 2014 at 2:22 PM, Alexei Starovoitov <ast@plumgrid.com> wrote:
>>>> On Tue, Jun 10, 2014 at 8:25 PM, Kees Cook <keescook@chromium.org> wrote:
>>>>> This adds the new "seccomp" syscall with both an "operation" and "flags"
>>>>> parameter for future expansion. The third argument is a pointer value,
>>>>> used with the SECCOMP_SET_MODE_FILTER operation. Currently, flags must
>>>>> be 0. This is functionally equivalent to prctl(PR_SET_SECCOMP, ...).
>>>>>
>>>>> Signed-off-by: Kees Cook <keescook@chromium.org>
>>>>> Cc: linux-api@vger.kernel.org
>>>>> ---
>>>>>  arch/x86/syscalls/syscall_32.tbl  |    1 +
>>>>>  arch/x86/syscalls/syscall_64.tbl  |    1 +
>>>>>  include/linux/syscalls.h          |    2 ++
>>>>>  include/uapi/asm-generic/unistd.h |    4 ++-
>>>>>  include/uapi/linux/seccomp.h      |    4 +++
>>>>>  kernel/seccomp.c                  |   63 ++++++++++++++++++++++++++++++++-----
>>>>>  kernel/sys_ni.c                   |    3 ++
>>>>>  7 files changed, 69 insertions(+), 9 deletions(-)
>>>>>
>>>>> diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
>>>>> index d6b867921612..7527eac24122 100644
>>>>> --- a/arch/x86/syscalls/syscall_32.tbl
>>>>> +++ b/arch/x86/syscalls/syscall_32.tbl
>>>>> @@ -360,3 +360,4 @@
>>>>>  351    i386    sched_setattr           sys_sched_setattr
>>>>>  352    i386    sched_getattr           sys_sched_getattr
>>>>>  353    i386    renameat2               sys_renameat2
>>>>> +354    i386    seccomp                 sys_seccomp
>>>>> diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
>>>>> index ec255a1646d2..16272a6c12b7 100644
>>>>> --- a/arch/x86/syscalls/syscall_64.tbl
>>>>> +++ b/arch/x86/syscalls/syscall_64.tbl
>>>>> @@ -323,6 +323,7 @@
>>>>>  314    common  sched_setattr           sys_sched_setattr
>>>>>  315    common  sched_getattr           sys_sched_getattr
>>>>>  316    common  renameat2               sys_renameat2
>>>>> +317    common  seccomp                 sys_seccomp
>>>>>
>>>>>  #
>>>>>  # x32-specific system call numbers start at 512 to avoid cache impact
>>>>> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
>>>>> index b0881a0ed322..1713977ee26f 100644
>>>>> --- a/include/linux/syscalls.h
>>>>> +++ b/include/linux/syscalls.h
>>>>> @@ -866,4 +866,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
>>>>>  asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
>>>>>                          unsigned long idx1, unsigned long idx2);
>>>>>  asmlinkage long sys_finit_module(int fd, const char __user *uargs, int flags);
>>>>> +asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
>>>>> +                           const char __user *uargs);
>>>>
>>>> It looks odd to add 'flags' argument to syscall that is not even used.
>>>> It don't think it will be extensible this way.
>>>> 'uargs' is used only in 2nd command as well and it's not 'char __user *'
>>>> but rather 'struct sock_fprog __user *'
>>>> I think it makes more sense to define only first argument as 'int op' and the
>>>> rest as variable length array.
>>>> Something like:
>>>> long sys_seccomp(unsigned int op, struct nlattr *attrs, int len);
>>>> then different commands can interpret 'attrs' differently.
>>>> if op == mode_strict, then attrs == NULL, len == 0
>>>> if op == mode_filter, then attrs->nla_type == seccomp_bpf_filter
>>>> and nla_data(attrs) is 'struct sock_fprog'
>>>
>>> Eww.  If the operation doesn't imply the type, then I think we've
>>> totally screwed up.
>>>
>>>> If we decide to add new types of filters or new commands, the syscall prototype
>>>> won't need to change. New commands can be added preserving backward
>>>> compatibility.
>>>> The basic TLV concept has been around forever in netlink world. imo makes
>>>> sense to use it with new syscalls. Passing 'struct xxx' into syscalls
>>>> is the thing
>>>> of the past. TLV style is more extensible. Fields of structures can become
>>>> optional in the future, new fields added, etc.
>>>> 'struct nlattr' brings the same benefits to kernel api as protobuf did
>>>> to user land.
>>>
>>> I see no reason to bring nl_attr into this.
>>>
>>> Admittedly, I've never dealt with nl_attr, but everything
>>> netlink-related I've even been involved in has involved some sort of
>>> API atrocity.
>>
>> netlink has a lot of legacy and there is genetlink which is not pretty
>> either because of extra socket creation, binding, dealing with packet
>> loss issues, but the key concept of variable length encoding is sound.
>> Right now seccomp has two commands and they already don't fit
>> into single syscall neatly. Are you saying there should be two syscalls
>> here? What about another seccomp related command? Another syscall?
>> imo all seccomp related commands needs to be mux/demux-ed under
>> one syscall. What is the way to mux/demux potentially very different
>> commands under one syscall? I cannot think of anything better than
>> TLV style. 'struct nlattr' is what we have today and I think it works fine.
>> I'm not suggesting to bring the whole netlink into the picture, but rather
>> TLV style of encoding different arguments for different commands.
>
> I'm unconvinced.  These are simple commands, and I think the interface
> should be simple.  Syscalls are cheap.
>
> As an example, the interface could be:
>
> int seccomp_add_filter(const struct sock_fprog *filter, unsigned int flags);
>
> The "tsync" operation would be seccomp_add_filter(NULL,
> SECCOMP_ADD_FILTER_TSYNC) -- it's equivalent to adding an
> always-accept filter and syncing threads.
>
> But, frankly, this kind of stuff should probably be "do operation X".
> IIUC nl_attr is more like "do something, with these tags and values",
> which results in oddities like whatever should happen of more than one
> tag is set.

TLV is a price of extensibility vs simplicity.
Say we have a syscall_foo(struct XX __user *x); that takes
struct XX {
  int flag;
  int var1;
};
now we want to add another variable to the structure that will be used
only when certain flag is set. You cannot do it easily without
breaking old user binaries, since new structure will be:
struct XX2 {
  int flag;
  int var1;
  int var2;
};
if we do copy_from_user(,, sizeof(struct XX2)) it will brake old programs.
Potentially we can do it the ugly way:
copy_from_user(,, sizeof(struct XX)), then check flag and do another
copy_from_user(,, sizeof(struct XX2)) just to fetch extra argument.
but then another day passes and yet another new flag means that both
var1 and var2 are unused and it needs 'u64 var3' instead.
Now kernel looks very ugly by doing multiple copy_from_user() of different
structures.
It would be much cleaner with nlattr:
syscall_foo(struct nlattr *attrs, int len);
kernel fetches 'len' bytes onces and then picks var[123] fields from nlattr
array. In other words nlattr array is a way to represent flexible structure
where fields can come and go in the future.
This was a simple example. Consider the case where var1 and var2
are arrays of things.
imo the old:
struct sock_fprog {
        unsigned short          len;    /* Number of filter blocks */
        struct sock_filter __user *filter;
};
is the example of inflexible user interface.
It could have been single 'struct nlattr'
nlattr->nla_len == length of filter program in bytes
nlattr->nla_type == ARRAY_OF_SOCK_FILTER constant
nla_data(nlattr) - variable length array of 'struct sock_filter' bpf
instructions.
Right now I'd like to extend it for the work I'm doing around eBPF, but
I cannot, since it's rigid. If it was TLV, I could have easily added new flags,
new types, new sections to bpf programs.
For user space it would have been just as easy to populate such
'struct nlattr' as to populate 'struct sock_fprog'.
Michael Kerrisk (man-pages) June 16, 2014, 6:08 a.m. UTC | #9
Hi Kees,

On Wed, Jun 11, 2014 at 5:25 AM, Kees Cook <keescook@chromium.org> wrote:
> This adds the new "seccomp" syscall with both an "operation" and "flags"
> parameter for future expansion. The third argument is a pointer value,
> used with the SECCOMP_SET_MODE_FILTER operation. Currently, flags must
> be 0. This is functionally equivalent to prctl(PR_SET_SECCOMP, ...).

I assume there'll be another iteration of these patches. With that
next iteration, could you write a man page (or at least free text
structured like a man page) that comprehensively documents the
user-space API?

Thanks,

Michael


> Signed-off-by: Kees Cook <keescook@chromium.org>
> Cc: linux-api@vger.kernel.org
> ---
>  arch/x86/syscalls/syscall_32.tbl  |    1 +
>  arch/x86/syscalls/syscall_64.tbl  |    1 +
>  include/linux/syscalls.h          |    2 ++
>  include/uapi/asm-generic/unistd.h |    4 ++-
>  include/uapi/linux/seccomp.h      |    4 +++
>  kernel/seccomp.c                  |   63 ++++++++++++++++++++++++++++++++-----
>  kernel/sys_ni.c                   |    3 ++
>  7 files changed, 69 insertions(+), 9 deletions(-)
>
> diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
> index d6b867921612..7527eac24122 100644
> --- a/arch/x86/syscalls/syscall_32.tbl
> +++ b/arch/x86/syscalls/syscall_32.tbl
> @@ -360,3 +360,4 @@
>  351    i386    sched_setattr           sys_sched_setattr
>  352    i386    sched_getattr           sys_sched_getattr
>  353    i386    renameat2               sys_renameat2
> +354    i386    seccomp                 sys_seccomp
> diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
> index ec255a1646d2..16272a6c12b7 100644
> --- a/arch/x86/syscalls/syscall_64.tbl
> +++ b/arch/x86/syscalls/syscall_64.tbl
> @@ -323,6 +323,7 @@
>  314    common  sched_setattr           sys_sched_setattr
>  315    common  sched_getattr           sys_sched_getattr
>  316    common  renameat2               sys_renameat2
> +317    common  seccomp                 sys_seccomp
>
>  #
>  # x32-specific system call numbers start at 512 to avoid cache impact
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index b0881a0ed322..1713977ee26f 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -866,4 +866,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
>  asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
>                          unsigned long idx1, unsigned long idx2);
>  asmlinkage long sys_finit_module(int fd, const char __user *uargs, int flags);
> +asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
> +                           const char __user *uargs);
>  #endif
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index 333640608087..65acbf0e2867 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -699,9 +699,11 @@ __SYSCALL(__NR_sched_setattr, sys_sched_setattr)
>  __SYSCALL(__NR_sched_getattr, sys_sched_getattr)
>  #define __NR_renameat2 276
>  __SYSCALL(__NR_renameat2, sys_renameat2)
> +#define __NR_seccomp 277
> +__SYSCALL(__NR_seccomp, sys_seccomp)
>
>  #undef __NR_syscalls
> -#define __NR_syscalls 277
> +#define __NR_syscalls 278
>
>  /*
>   * All syscalls below here should go away really,
> diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
> index ac2dc9f72973..b258878ba754 100644
> --- a/include/uapi/linux/seccomp.h
> +++ b/include/uapi/linux/seccomp.h
> @@ -10,6 +10,10 @@
>  #define SECCOMP_MODE_STRICT    1 /* uses hard-coded filter. */
>  #define SECCOMP_MODE_FILTER    2 /* uses user-supplied filter. */
>
> +/* Valid operations for seccomp syscall. */
> +#define SECCOMP_SET_MODE_STRICT        0
> +#define SECCOMP_SET_MODE_FILTER        1
> +
>  /*
>   * All BPF programs must return a 32-bit value.
>   * The bottom 16-bits are for optional return data.
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index 39d32c2904fc..c0cafa9e84af 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -19,6 +19,7 @@
>  #include <linux/sched.h>
>  #include <linux/seccomp.h>
>  #include <linux/slab.h>
> +#include <linux/syscalls.h>
>
>  /* #define SECCOMP_DEBUG 1 */
>
> @@ -301,8 +302,8 @@ free_prog:
>   *
>   * Returns filter on success and ERR_PTR otherwise.
>   */
> -static
> -struct seccomp_filter *seccomp_prepare_user_filter(char __user *user_filter)
> +static struct seccomp_filter *
> +seccomp_prepare_user_filter(const char __user *user_filter)
>  {
>         struct sock_fprog fprog;
>         struct seccomp_filter *filter = ERR_PTR(-EFAULT);
> @@ -325,19 +326,25 @@ out:
>
>  /**
>   * seccomp_attach_filter: validate and attach filter
> + * @flags:  flags to change filter behavior
>   * @filter: seccomp filter to add to the current process
>   *
>   * Caller must be holding current->sighand->siglock lock.
>   *
>   * Returns 0 on success, -ve on error.
>   */
> -static long seccomp_attach_filter(struct seccomp_filter *filter)
> +static long seccomp_attach_filter(unsigned int flags,
> +                                 struct seccomp_filter *filter)
>  {
>         unsigned long total_insns;
>         struct seccomp_filter *walker;
>
>         BUG_ON(!spin_is_locked(&current->sighand->siglock));
>
> +       /* Validate flags. */
> +       if (flags != 0)
> +               return -EINVAL;
> +
>         /* Validate resulting filter length. */
>         total_insns = filter->len;
>         for (walker = current->seccomp.filter; walker; walker = filter->prev)
> @@ -541,6 +548,7 @@ out:
>  #ifdef CONFIG_SECCOMP_FILTER
>  /**
>   * seccomp_set_mode_filter: internal function for setting seccomp filter
> + * @flags:  flags to change filter behavior
>   * @filter: struct sock_fprog containing filter
>   *
>   * This function may be called repeatedly to install additional filters.
> @@ -551,7 +559,8 @@ out:
>   *
>   * Returns 0 on success or -EINVAL on failure.
>   */
> -static long seccomp_set_mode_filter(char __user *filter)
> +static long seccomp_set_mode_filter(unsigned int flags,
> +                                   const char __user *filter)
>  {
>         const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
>         struct seccomp_filter *prepared = NULL;
> @@ -569,7 +578,7 @@ static long seccomp_set_mode_filter(char __user *filter)
>         if (!seccomp_check_mode(current, seccomp_mode))
>                 goto out;
>
> -       ret = seccomp_attach_filter(prepared);
> +       ret = seccomp_attach_filter(flags, prepared);
>         if (ret)
>                 goto out;
>         /* Do not free the successfully attached filter. */
> @@ -583,12 +592,35 @@ out_free:
>         return ret;
>  }
>  #else
> -static inline long seccomp_set_mode_filter(char __user *filter)
> +static inline long seccomp_set_mode_filter(unsigned int flags,
> +                                          const char __user *filter)
>  {
>         return -EINVAL;
>  }
>  #endif
>
> +/* Common entry point for both prctl and syscall. */
> +static long do_seccomp(unsigned int op, unsigned int flags,
> +                      const char __user *uargs)
> +{
> +       switch (op) {
> +       case SECCOMP_SET_MODE_STRICT:
> +               if (flags != 0 || uargs != NULL)
> +                       return -EINVAL;
> +               return seccomp_set_mode_strict();
> +       case SECCOMP_SET_MODE_FILTER:
> +               return seccomp_set_mode_filter(flags, uargs);
> +       default:
> +               return -EINVAL;
> +       }
> +}
> +
> +SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags,
> +                        const char __user *, uargs)
> +{
> +       return do_seccomp(op, flags, uargs);
> +}
> +
>  /**
>   * prctl_set_seccomp: configures current->seccomp.mode
>   * @seccomp_mode: requested mode to use
> @@ -598,12 +630,27 @@ static inline long seccomp_set_mode_filter(char __user *filter)
>   */
>  long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
>  {
> +       unsigned int op;
> +       char __user *uargs;
> +
>         switch (seccomp_mode) {
>         case SECCOMP_MODE_STRICT:
> -               return seccomp_set_mode_strict();
> +               op = SECCOMP_SET_MODE_STRICT;
> +               /*
> +                * Setting strict mode through prctl always ignored filter,
> +                * so make sure it is always NULL here to pass the internal
> +                * check in do_seccomp().
> +                */
> +               uargs = NULL;
> +               break;
>         case SECCOMP_MODE_FILTER:
> -               return seccomp_set_mode_filter(filter);
> +               op = SECCOMP_SET_MODE_FILTER;
> +               uargs = filter;
> +               break;
>         default:
>                 return -EINVAL;
>         }
> +
> +       /* prctl interface doesn't have flags, so they are always zero. */
> +       return do_seccomp(op, 0, uargs);
>  }
> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> index 36441b51b5df..2904a2105914 100644
> --- a/kernel/sys_ni.c
> +++ b/kernel/sys_ni.c
> @@ -213,3 +213,6 @@ cond_syscall(compat_sys_open_by_handle_at);
>
>  /* compare kernel pointers */
>  cond_syscall(sys_kcmp);
> +
> +/* operate on Secure Computing state */
> +cond_syscall(sys_seccomp);
> --
> 1.7.9.5
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-api" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index d6b867921612..7527eac24122 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -360,3 +360,4 @@ 
 351	i386	sched_setattr		sys_sched_setattr
 352	i386	sched_getattr		sys_sched_getattr
 353	i386	renameat2		sys_renameat2
+354	i386	seccomp			sys_seccomp
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index ec255a1646d2..16272a6c12b7 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -323,6 +323,7 @@ 
 314	common	sched_setattr		sys_sched_setattr
 315	common	sched_getattr		sys_sched_getattr
 316	common	renameat2		sys_renameat2
+317	common	seccomp			sys_seccomp
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index b0881a0ed322..1713977ee26f 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -866,4 +866,6 @@  asmlinkage long sys_process_vm_writev(pid_t pid,
 asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
 			 unsigned long idx1, unsigned long idx2);
 asmlinkage long sys_finit_module(int fd, const char __user *uargs, int flags);
+asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
+			    const char __user *uargs);
 #endif
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 333640608087..65acbf0e2867 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -699,9 +699,11 @@  __SYSCALL(__NR_sched_setattr, sys_sched_setattr)
 __SYSCALL(__NR_sched_getattr, sys_sched_getattr)
 #define __NR_renameat2 276
 __SYSCALL(__NR_renameat2, sys_renameat2)
+#define __NR_seccomp 277
+__SYSCALL(__NR_seccomp, sys_seccomp)
 
 #undef __NR_syscalls
-#define __NR_syscalls 277
+#define __NR_syscalls 278
 
 /*
  * All syscalls below here should go away really,
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index ac2dc9f72973..b258878ba754 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -10,6 +10,10 @@ 
 #define SECCOMP_MODE_STRICT	1 /* uses hard-coded filter. */
 #define SECCOMP_MODE_FILTER	2 /* uses user-supplied filter. */
 
+/* Valid operations for seccomp syscall. */
+#define SECCOMP_SET_MODE_STRICT	0
+#define SECCOMP_SET_MODE_FILTER	1
+
 /*
  * All BPF programs must return a 32-bit value.
  * The bottom 16-bits are for optional return data.
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 39d32c2904fc..c0cafa9e84af 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -19,6 +19,7 @@ 
 #include <linux/sched.h>
 #include <linux/seccomp.h>
 #include <linux/slab.h>
+#include <linux/syscalls.h>
 
 /* #define SECCOMP_DEBUG 1 */
 
@@ -301,8 +302,8 @@  free_prog:
  *
  * Returns filter on success and ERR_PTR otherwise.
  */
-static
-struct seccomp_filter *seccomp_prepare_user_filter(char __user *user_filter)
+static struct seccomp_filter *
+seccomp_prepare_user_filter(const char __user *user_filter)
 {
 	struct sock_fprog fprog;
 	struct seccomp_filter *filter = ERR_PTR(-EFAULT);
@@ -325,19 +326,25 @@  out:
 
 /**
  * seccomp_attach_filter: validate and attach filter
+ * @flags:  flags to change filter behavior
  * @filter: seccomp filter to add to the current process
  *
  * Caller must be holding current->sighand->siglock lock.
  *
  * Returns 0 on success, -ve on error.
  */
-static long seccomp_attach_filter(struct seccomp_filter *filter)
+static long seccomp_attach_filter(unsigned int flags,
+				  struct seccomp_filter *filter)
 {
 	unsigned long total_insns;
 	struct seccomp_filter *walker;
 
 	BUG_ON(!spin_is_locked(&current->sighand->siglock));
 
+	/* Validate flags. */
+	if (flags != 0)
+		return -EINVAL;
+
 	/* Validate resulting filter length. */
 	total_insns = filter->len;
 	for (walker = current->seccomp.filter; walker; walker = filter->prev)
@@ -541,6 +548,7 @@  out:
 #ifdef CONFIG_SECCOMP_FILTER
 /**
  * seccomp_set_mode_filter: internal function for setting seccomp filter
+ * @flags:  flags to change filter behavior
  * @filter: struct sock_fprog containing filter
  *
  * This function may be called repeatedly to install additional filters.
@@ -551,7 +559,8 @@  out:
  *
  * Returns 0 on success or -EINVAL on failure.
  */
-static long seccomp_set_mode_filter(char __user *filter)
+static long seccomp_set_mode_filter(unsigned int flags,
+				    const char __user *filter)
 {
 	const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
 	struct seccomp_filter *prepared = NULL;
@@ -569,7 +578,7 @@  static long seccomp_set_mode_filter(char __user *filter)
 	if (!seccomp_check_mode(current, seccomp_mode))
 		goto out;
 
-	ret = seccomp_attach_filter(prepared);
+	ret = seccomp_attach_filter(flags, prepared);
 	if (ret)
 		goto out;
 	/* Do not free the successfully attached filter. */
@@ -583,12 +592,35 @@  out_free:
 	return ret;
 }
 #else
-static inline long seccomp_set_mode_filter(char __user *filter)
+static inline long seccomp_set_mode_filter(unsigned int flags,
+					   const char __user *filter)
 {
 	return -EINVAL;
 }
 #endif
 
+/* Common entry point for both prctl and syscall. */
+static long do_seccomp(unsigned int op, unsigned int flags,
+		       const char __user *uargs)
+{
+	switch (op) {
+	case SECCOMP_SET_MODE_STRICT:
+		if (flags != 0 || uargs != NULL)
+			return -EINVAL;
+		return seccomp_set_mode_strict();
+	case SECCOMP_SET_MODE_FILTER:
+		return seccomp_set_mode_filter(flags, uargs);
+	default:
+		return -EINVAL;
+	}
+}
+
+SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags,
+			 const char __user *, uargs)
+{
+	return do_seccomp(op, flags, uargs);
+}
+
 /**
  * prctl_set_seccomp: configures current->seccomp.mode
  * @seccomp_mode: requested mode to use
@@ -598,12 +630,27 @@  static inline long seccomp_set_mode_filter(char __user *filter)
  */
 long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
 {
+	unsigned int op;
+	char __user *uargs;
+
 	switch (seccomp_mode) {
 	case SECCOMP_MODE_STRICT:
-		return seccomp_set_mode_strict();
+		op = SECCOMP_SET_MODE_STRICT;
+		/*
+		 * Setting strict mode through prctl always ignored filter,
+		 * so make sure it is always NULL here to pass the internal
+		 * check in do_seccomp().
+		 */
+		uargs = NULL;
+		break;
 	case SECCOMP_MODE_FILTER:
-		return seccomp_set_mode_filter(filter);
+		op = SECCOMP_SET_MODE_FILTER;
+		uargs = filter;
+		break;
 	default:
 		return -EINVAL;
 	}
+
+	/* prctl interface doesn't have flags, so they are always zero. */
+	return do_seccomp(op, 0, uargs);
 }
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 36441b51b5df..2904a2105914 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -213,3 +213,6 @@  cond_syscall(compat_sys_open_by_handle_at);
 
 /* compare kernel pointers */
 cond_syscall(sys_kcmp);
+
+/* operate on Secure Computing state */
+cond_syscall(sys_seccomp);