diff mbox series

[1/2] riscv: add support for SECCOMP incl. filters

Message ID 20181206150156.28210-2-david.abdurachmanov@gmail.com (mailing list archive)
State New, archived
Headers show
Series riscv: add support for SECCOMP and SECCOMP_FILTER | expand

Commit Message

David Abdurachmanov Dec. 6, 2018, 3:01 p.m. UTC
The patch adds support for SECCOMP and SECCOMP_FILTER (BPF).

Signed-off-by: David Abdurachmanov <david.abdurachmanov@gmail.com>
---
 arch/riscv/Kconfig                   | 14 ++++++++++++++
 arch/riscv/include/asm/thread_info.h |  5 ++++-
 arch/riscv/kernel/entry.S            | 27 +++++++++++++++++++++++++--
 arch/riscv/kernel/ptrace.c           |  8 ++++++++
 4 files changed, 51 insertions(+), 3 deletions(-)

Comments

Kees Cook Dec. 6, 2018, 4:47 p.m. UTC | #1
On Thu, Dec 6, 2018 at 7:02 AM David Abdurachmanov
<david.abdurachmanov@gmail.com> wrote:
>
> The patch adds support for SECCOMP and SECCOMP_FILTER (BPF).
>
> Signed-off-by: David Abdurachmanov <david.abdurachmanov@gmail.com>
> ---
>  arch/riscv/Kconfig                   | 14 ++++++++++++++
>  arch/riscv/include/asm/thread_info.h |  5 ++++-
>  arch/riscv/kernel/entry.S            | 27 +++++++++++++++++++++++++--
>  arch/riscv/kernel/ptrace.c           |  8 ++++++++
>  4 files changed, 51 insertions(+), 3 deletions(-)
>
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index a4f48f757204..49cd8e251547 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -29,6 +29,7 @@ config RISCV
>         select GENERIC_SMP_IDLE_THREAD
>         select GENERIC_ATOMIC64 if !64BIT || !RISCV_ISA_A
>         select HAVE_ARCH_AUDITSYSCALL
> +       select HAVE_ARCH_SECCOMP_FILTER
>         select HAVE_MEMBLOCK_NODE_MAP
>         select HAVE_DMA_CONTIGUOUS
>         select HAVE_FUTEX_CMPXCHG if FUTEX
> @@ -228,6 +229,19 @@ menu "Kernel features"
>
>  source "kernel/Kconfig.hz"
>
> +config SECCOMP
> +       bool "Enable seccomp to safely compute untrusted bytecode"
> +       help
> +         This kernel feature is useful for number crunching applications
> +         that may need to compute untrusted bytecode during their
> +         execution. By using pipes or other transports made available to
> +         the process as file descriptors supporting the read/write
> +         syscalls, it's possible to isolate those applications in
> +         their own address space using seccomp. Once seccomp is
> +         enabled via prctl(PR_SET_SECCOMP), it cannot be disabled
> +         and the task is only allowed to execute a few safe syscalls
> +         defined by each seccomp mode.
> +
>  endmenu
>
>  menu "Boot options"
> diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h
> index 1c9cc8389928..1fd6e4130cab 100644
> --- a/arch/riscv/include/asm/thread_info.h
> +++ b/arch/riscv/include/asm/thread_info.h
> @@ -81,6 +81,7 @@ struct thread_info {
>  #define TIF_MEMDIE             5       /* is terminating due to OOM killer */
>  #define TIF_SYSCALL_TRACEPOINT  6       /* syscall tracepoint instrumentation */
>  #define TIF_SYSCALL_AUDIT      7       /* syscall auditing */
> +#define TIF_SECCOMP                    8       /* syscall secure computing */
>
>  #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
>  #define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
> @@ -88,11 +89,13 @@ struct thread_info {
>  #define _TIF_NEED_RESCHED      (1 << TIF_NEED_RESCHED)
>  #define _TIF_SYSCALL_TRACEPOINT        (1 << TIF_SYSCALL_TRACEPOINT)
>  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
> +#define _TIF_SECCOMP           (1 << TIF_SECCOMP)
>
>  #define _TIF_WORK_MASK \
>         (_TIF_NOTIFY_RESUME | _TIF_SIGPENDING | _TIF_NEED_RESCHED)
>
>  #define _TIF_SYSCALL_WORK \
> -       (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT)
> +       (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT \
> +        _TIF_SECCOMP )
>
>  #endif /* _ASM_RISCV_THREAD_INFO_H */
> diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
> index 355166f57205..e88ccbfa61ee 100644
> --- a/arch/riscv/kernel/entry.S
> +++ b/arch/riscv/kernel/entry.S
> @@ -207,8 +207,25 @@ check_syscall_nr:
>         /* Check to make sure we don't jump to a bogus syscall number. */
>         li t0, __NR_syscalls
>         la s0, sys_ni_syscall
> -       /* Syscall number held in a7 */
> -       bgeu a7, t0, 1f
> +       /*
> +        * The tracer can change syscall number to valid/invalid value.
> +        * We use syscall_set_nr helper in syscall_trace_enter thus we
> +        * cannot trust the current value in a7 and have to reload from
> +        * the current task pt_regs.
> +        */
> +       REG_L a7, PT_A7(sp)
> +       /*
> +        * Syscall number held in a7.
> +        * If syscall number is above allowed value, redirect to ni_syscall.
> +        */
> +       bge a7, t0, 1f
> +       /*
> +        * Check if syscall is rejected by tracer or seccomp, i.e., a7 == -1.
> +        * If yes, we pretend it was executed.
> +        */
> +       li t1, -1
> +       beq a7, t1, ret_from_syscall_rejected
> +       /* Call syscall */
>         la s0, sys_call_table
>         slli t0, a7, RISCV_LGPTR
>         add s0, s0, t0
> @@ -219,6 +236,12 @@ check_syscall_nr:
>  ret_from_syscall:
>         /* Set user a0 to kernel a0 */
>         REG_S a0, PT_A0(sp)
> +       /*
> +        * We didn't execute the actual syscall.
> +        * Seccomp already set return value for the current task pt_regs.
> +        * (If it was configured with SECCOMP_RET_ERRNO/TRACE)
> +        */
> +ret_from_syscall_rejected:
>         /* Trace syscalls, but only if requested by the user. */
>         REG_L t0, TASK_TI_FLAGS(tp)
>         andi t0, t0, _TIF_SYSCALL_WORK
> diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c
> index c1b51539c3e2..598e48b8ca2b 100644
> --- a/arch/riscv/kernel/ptrace.c
> +++ b/arch/riscv/kernel/ptrace.c
> @@ -160,6 +160,14 @@ void do_syscall_trace_enter(struct pt_regs *regs)
>                 if (tracehook_report_syscall_entry(regs))
>                         syscall_set_nr(current, regs, -1);
>
> +       /*
> +        * Do the secure computing after ptrace; failures should be fast.
> +        * If this fails we might have return value in a0 from seccomp
> +        * (via SECCOMP_RET_ERRNO/TRACE).
> +        */
> +       if (secure_computing(NULL) == -1)
> +               syscall_set_nr(current, regs, -1);

On a -1 return, this should return immediately -- it should not
continue to process trace_sys_enter(), etc.

-Kees

> +
>  #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
>         if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
>                 trace_sys_enter(regs, syscall_get_nr(current, regs));
> --
> 2.19.2
>
Kees Cook Dec. 6, 2018, 4:51 p.m. UTC | #2
On Thu, Dec 6, 2018 at 7:02 AM David Abdurachmanov
<david.abdurachmanov@gmail.com> wrote:
> The patch adds support for SECCOMP and SECCOMP_FILTER (BPF).

Can you add support to tools/testing/selftests/seccomp/seccomp_bpf.c
as well? That selftest finds a lot of weird corner-cases...

> diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h
> index 1c9cc8389928..1fd6e4130cab 100644
> --- a/arch/riscv/include/asm/thread_info.h
> +++ b/arch/riscv/include/asm/thread_info.h
> @@ -81,6 +81,7 @@ struct thread_info {
>  #define TIF_MEMDIE             5       /* is terminating due to OOM killer */
>  #define TIF_SYSCALL_TRACEPOINT  6       /* syscall tracepoint instrumentation */
>  #define TIF_SYSCALL_AUDIT      7       /* syscall auditing */
> +#define TIF_SECCOMP                    8       /* syscall secure computing */

Nit: extra tab needs to be removed.
Kees Cook Dec. 6, 2018, 5:06 p.m. UTC | #3
On Thu, Dec 6, 2018 at 7:02 AM David Abdurachmanov
<david.abdurachmanov@gmail.com> wrote:
> The patch adds support for SECCOMP and SECCOMP_FILTER (BPF).

I built this against linux-next but it's missing seccomp.h. Was that
accidentally left out of the commit?


  CC      arch/riscv/kernel/asm-offsets.s
In file included from ./include/linux/sched.h:21:0,
                 from arch/riscv/kernel/asm-offsets.c:18:
./include/linux/seccomp.h:14:10: fatal error: asm/seccomp.h: No such
file or directory
 #include <asm/seccomp.h>
          ^~~~~~~~~~~~~~~
David Abdurachmanov Dec. 6, 2018, 5:10 p.m. UTC | #4
On Thu, Dec 6, 2018 at 5:47 PM Kees Cook <keescook@chromium.org> wrote:
>
> On Thu, Dec 6, 2018 at 7:02 AM David Abdurachmanov
> <david.abdurachmanov@gmail.com> wrote:
> >
> > The patch adds support for SECCOMP and SECCOMP_FILTER (BPF).
> >
> > Signed-off-by: David Abdurachmanov <david.abdurachmanov@gmail.com>
> > ---
> >  arch/riscv/Kconfig                   | 14 ++++++++++++++
> >  arch/riscv/include/asm/thread_info.h |  5 ++++-
> >  arch/riscv/kernel/entry.S            | 27 +++++++++++++++++++++++++--
> >  arch/riscv/kernel/ptrace.c           |  8 ++++++++
> >  4 files changed, 51 insertions(+), 3 deletions(-)
> >
> > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > index a4f48f757204..49cd8e251547 100644
> > --- a/arch/riscv/Kconfig
> > +++ b/arch/riscv/Kconfig
> > @@ -29,6 +29,7 @@ config RISCV
> >         select GENERIC_SMP_IDLE_THREAD
> >         select GENERIC_ATOMIC64 if !64BIT || !RISCV_ISA_A
> >         select HAVE_ARCH_AUDITSYSCALL
> > +       select HAVE_ARCH_SECCOMP_FILTER
> >         select HAVE_MEMBLOCK_NODE_MAP
> >         select HAVE_DMA_CONTIGUOUS
> >         select HAVE_FUTEX_CMPXCHG if FUTEX
> > @@ -228,6 +229,19 @@ menu "Kernel features"
> >
> >  source "kernel/Kconfig.hz"
> >
> > +config SECCOMP
> > +       bool "Enable seccomp to safely compute untrusted bytecode"
> > +       help
> > +         This kernel feature is useful for number crunching applications
> > +         that may need to compute untrusted bytecode during their
> > +         execution. By using pipes or other transports made available to
> > +         the process as file descriptors supporting the read/write
> > +         syscalls, it's possible to isolate those applications in
> > +         their own address space using seccomp. Once seccomp is
> > +         enabled via prctl(PR_SET_SECCOMP), it cannot be disabled
> > +         and the task is only allowed to execute a few safe syscalls
> > +         defined by each seccomp mode.
> > +
> >  endmenu
> >
> >  menu "Boot options"
> > diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h
> > index 1c9cc8389928..1fd6e4130cab 100644
> > --- a/arch/riscv/include/asm/thread_info.h
> > +++ b/arch/riscv/include/asm/thread_info.h
> > @@ -81,6 +81,7 @@ struct thread_info {
> >  #define TIF_MEMDIE             5       /* is terminating due to OOM killer */
> >  #define TIF_SYSCALL_TRACEPOINT  6       /* syscall tracepoint instrumentation */
> >  #define TIF_SYSCALL_AUDIT      7       /* syscall auditing */
> > +#define TIF_SECCOMP                    8       /* syscall secure computing */
> >
> >  #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
> >  #define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
> > @@ -88,11 +89,13 @@ struct thread_info {
> >  #define _TIF_NEED_RESCHED      (1 << TIF_NEED_RESCHED)
> >  #define _TIF_SYSCALL_TRACEPOINT        (1 << TIF_SYSCALL_TRACEPOINT)
> >  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
> > +#define _TIF_SECCOMP           (1 << TIF_SECCOMP)
> >
> >  #define _TIF_WORK_MASK \
> >         (_TIF_NOTIFY_RESUME | _TIF_SIGPENDING | _TIF_NEED_RESCHED)
> >
> >  #define _TIF_SYSCALL_WORK \
> > -       (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT)
> > +       (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT \
> > +        _TIF_SECCOMP )
> >
> >  #endif /* _ASM_RISCV_THREAD_INFO_H */
> > diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
> > index 355166f57205..e88ccbfa61ee 100644
> > --- a/arch/riscv/kernel/entry.S
> > +++ b/arch/riscv/kernel/entry.S
> > @@ -207,8 +207,25 @@ check_syscall_nr:
> >         /* Check to make sure we don't jump to a bogus syscall number. */
> >         li t0, __NR_syscalls
> >         la s0, sys_ni_syscall
> > -       /* Syscall number held in a7 */
> > -       bgeu a7, t0, 1f
> > +       /*
> > +        * The tracer can change syscall number to valid/invalid value.
> > +        * We use syscall_set_nr helper in syscall_trace_enter thus we
> > +        * cannot trust the current value in a7 and have to reload from
> > +        * the current task pt_regs.
> > +        */
> > +       REG_L a7, PT_A7(sp)
> > +       /*
> > +        * Syscall number held in a7.
> > +        * If syscall number is above allowed value, redirect to ni_syscall.
> > +        */
> > +       bge a7, t0, 1f
> > +       /*
> > +        * Check if syscall is rejected by tracer or seccomp, i.e., a7 == -1.
> > +        * If yes, we pretend it was executed.
> > +        */
> > +       li t1, -1
> > +       beq a7, t1, ret_from_syscall_rejected
> > +       /* Call syscall */
> >         la s0, sys_call_table
> >         slli t0, a7, RISCV_LGPTR
> >         add s0, s0, t0
> > @@ -219,6 +236,12 @@ check_syscall_nr:
> >  ret_from_syscall:
> >         /* Set user a0 to kernel a0 */
> >         REG_S a0, PT_A0(sp)
> > +       /*
> > +        * We didn't execute the actual syscall.
> > +        * Seccomp already set return value for the current task pt_regs.
> > +        * (If it was configured with SECCOMP_RET_ERRNO/TRACE)
> > +        */
> > +ret_from_syscall_rejected:
> >         /* Trace syscalls, but only if requested by the user. */
> >         REG_L t0, TASK_TI_FLAGS(tp)
> >         andi t0, t0, _TIF_SYSCALL_WORK
> > diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c
> > index c1b51539c3e2..598e48b8ca2b 100644
> > --- a/arch/riscv/kernel/ptrace.c
> > +++ b/arch/riscv/kernel/ptrace.c
> > @@ -160,6 +160,14 @@ void do_syscall_trace_enter(struct pt_regs *regs)
> >                 if (tracehook_report_syscall_entry(regs))
> >                         syscall_set_nr(current, regs, -1);
> >
> > +       /*
> > +        * Do the secure computing after ptrace; failures should be fast.
> > +        * If this fails we might have return value in a0 from seccomp
> > +        * (via SECCOMP_RET_ERRNO/TRACE).
> > +        */
> > +       if (secure_computing(NULL) == -1)
> > +               syscall_set_nr(current, regs, -1);
>
> On a -1 return, this should return immediately -- it should not
> continue to process trace_sys_enter(), etc.

Ops! No idea how I missed that.
Will fix it.

> -Kees
>
> > +
> >  #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
> >         if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
> >                 trace_sys_enter(regs, syscall_get_nr(current, regs));
> > --
> > 2.19.2
> >
>
>
> --
> Kees Cook
David Abdurachmanov Dec. 6, 2018, 5:11 p.m. UTC | #5
On Thu, Dec 6, 2018 at 5:52 PM Kees Cook <keescook@chromium.org> wrote:
>
> On Thu, Dec 6, 2018 at 7:02 AM David Abdurachmanov
> <david.abdurachmanov@gmail.com> wrote:
> > The patch adds support for SECCOMP and SECCOMP_FILTER (BPF).
>
> Can you add support to tools/testing/selftests/seccomp/seccomp_bpf.c
> as well? That selftest finds a lot of weird corner-cases...
>
> > diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h
> > index 1c9cc8389928..1fd6e4130cab 100644
> > --- a/arch/riscv/include/asm/thread_info.h
> > +++ b/arch/riscv/include/asm/thread_info.h
> > @@ -81,6 +81,7 @@ struct thread_info {
> >  #define TIF_MEMDIE             5       /* is terminating due to OOM killer */
> >  #define TIF_SYSCALL_TRACEPOINT  6       /* syscall tracepoint instrumentation */
> >  #define TIF_SYSCALL_AUDIT      7       /* syscall auditing */
> > +#define TIF_SECCOMP                    8       /* syscall secure computing */
>
> Nit: extra tab needs to be removed.

Will fix it.
I need to cleanup my vim configuration for tab with.

david
David Abdurachmanov Dec. 6, 2018, 5:12 p.m. UTC | #6
On Thu, Dec 6, 2018 at 6:07 PM Kees Cook <keescook@chromium.org> wrote:
>
> On Thu, Dec 6, 2018 at 7:02 AM David Abdurachmanov
> <david.abdurachmanov@gmail.com> wrote:
> > The patch adds support for SECCOMP and SECCOMP_FILTER (BPF).
>
> I built this against linux-next but it's missing seccomp.h. Was that
> accidentally left out of the commit?
>
>
>   CC      arch/riscv/kernel/asm-offsets.s
> In file included from ./include/linux/sched.h:21:0,
>                  from arch/riscv/kernel/asm-offsets.c:18:
> ./include/linux/seccomp.h:14:10: fatal error: asm/seccomp.h: No such
> file or directory
>  #include <asm/seccomp.h>
>           ^~~~~~~~~~~~~~~
>

I forgot to add it...
Will fix it.

david
David Abdurachmanov Dec. 6, 2018, 6:25 p.m. UTC | #7
On Thu, Dec 6, 2018 at 5:52 PM Kees Cook <keescook@chromium.org> wrote:
>
> On Thu, Dec 6, 2018 at 7:02 AM David Abdurachmanov
> <david.abdurachmanov@gmail.com> wrote:
> > The patch adds support for SECCOMP and SECCOMP_FILTER (BPF).
>
> Can you add support to tools/testing/selftests/seccomp/seccomp_bpf.c
> as well? That selftest finds a lot of weird corner-cases...

I hate it locally and will include in v2.

The results see fine (tested in QEMU).

TAP version 13
selftests: seccomp: seccomp_bpf
========================================
[==========] Running 64 tests from 1 test cases.
[ RUN      ] global.mode_strict_support
[       OK ] global.mode_strict_support
[ RUN      ] global.mode_strict_cannot_call_prctl
[       OK ] global.mode_strict_cannot_call_prctl
[ RUN      ] global.no_new_privs_support
[       OK ] global.no_new_privs_support
[ RUN      ] global.mode_filter_support
[       OK ] global.mode_filter_support
[ RUN      ] global.mode_filter_without_nnp
[       OK ] global.mode_filter_without_nnp
[ RUN      ] global.filter_size_limits
[       OK ] global.filter_size_limits
[ RUN      ] global.filter_chain_limits
[       OK ] global.filter_chain_limits
[ RUN      ] global.mode_filter_cannot_move_to_strict
[       OK ] global.mode_filter_cannot_move_to_strict
[ RUN      ] global.mode_filter_get_seccomp
[       OK ] global.mode_filter_get_seccomp
[ RUN      ] global.ALLOW_all
[       OK ] global.ALLOW_all
[ RUN      ] global.empty_prog
[       OK ] global.empty_prog
[ RUN      ] global.log_all
[       OK ] global.log_all
[ RUN      ] global.unknown_ret_is_kill_inside
[       OK ] global.unknown_ret_is_kill_inside
[ RUN      ] global.unknown_ret_is_kill_above_allow
[       OK ] global.unknown_ret_is_kill_above_allow
[ RUN      ] global.KILL_all
[       OK ] global.KILL_all
[ RUN      ] global.KILL_one
[       OK ] global.KILL_one
[ RUN      ] global.KILL_one_arg_one
[       OK ] global.KILL_one_arg_one
[ RUN      ] global.KILL_one_arg_six
[       OK ] global.KILL_one_arg_six
[ RUN      ] global.KILL_thread
[       OK ] global.KILL_thread
[ RUN      ] global.KILL_process
[       OK ] global.KILL_process
[ RUN      ] global.arg_out_of_range
[       OK ] global.arg_out_of_range
[ RUN      ] global.ERRNO_valid
[       OK ] global.ERRNO_valid
[ RUN      ] global.ERRNO_zero
[       OK ] global.ERRNO_zero
[ RUN      ] global.ERRNO_capped
[       OK ] global.ERRNO_capped
[ RUN      ] global.ERRNO_order
[       OK ] global.ERRNO_order
[ RUN      ] TRAP.dfl
[       OK ] TRAP.dfl
[ RUN      ] TRAP.ign
[       OK ] TRAP.ign
[ RUN      ] TRAP.handler
[       OK ] TRAP.handler
[ RUN      ] precedence.allow_ok
[       OK ] precedence.allow_ok
[ RUN      ] precedence.kill_is_highest
[       OK ] precedence.kill_is_highest
[ RUN      ] precedence.kill_is_highest_in_any_order
[       OK ] precedence.kill_is_highest_in_any_order
[ RUN      ] precedence.trap_is_second
[       OK ] precedence.trap_is_second
[ RUN      ] precedence.trap_is_second_in_any_order
[       OK ] precedence.trap_is_second_in_any_order
[ RUN      ] precedence.errno_is_third
[       OK ] precedence.errno_is_third
[ RUN      ] precedence.errno_is_third_in_any_order
[       OK ] precedence.errno_is_third_in_any_order
[ RUN      ] precedence.trace_is_fourth
[       OK ] precedence.trace_is_fourth
[ RUN      ] precedence.trace_is_fourth_in_any_order
[       OK ] precedence.trace_is_fourth_in_any_order
[ RUN      ] precedence.log_is_fifth
[       OK ] precedence.log_is_fifth
[ RUN      ] precedence.log_is_fifth_in_any_order
[       OK ] precedence.log_is_fifth_in_any_order
[ RUN      ] TRACE_poke.read_has_side_effects
[       OK ] TRACE_poke.read_has_side_effects
[ RUN      ] TRACE_poke.getpid_runs_normally
[       OK ] TRACE_poke.getpid_runs_normally
[ RUN      ] TRACE_syscall.ptrace_syscall_redirected
[       OK ] TRACE_syscall.ptrace_syscall_redirected
[ RUN      ] TRACE_syscall.ptrace_syscall_dropped
[       OK ] TRACE_syscall.ptrace_syscall_dropped
[ RUN      ] TRACE_syscall.syscall_allowed
[       OK ] TRACE_syscall.syscall_allowed
[ RUN      ] TRACE_syscall.syscall_redirected
[       OK ] TRACE_syscall.syscall_redirected
[ RUN      ] TRACE_syscall.syscall_dropped
[       OK ] TRACE_syscall.syscall_dropped
[ RUN      ] TRACE_syscall.skip_after_RET_TRACE
[       OK ] TRACE_syscall.skip_after_RET_TRACE
[ RUN      ] TRACE_syscall.kill_after_RET_TRACE
[       OK ] TRACE_syscall.kill_after_RET_TRACE
[ RUN      ] TRACE_syscall.skip_after_ptrace
[       OK ] TRACE_syscall.skip_after_ptrace
[ RUN      ] TRACE_syscall.kill_after_ptrace
[       OK ] TRACE_syscall.kill_after_ptrace
[ RUN      ] global.seccomp_syscall
[       OK ] global.seccomp_syscall
[ RUN      ] global.seccomp_syscall_mode_lock
[       OK ] global.seccomp_syscall_mode_lock
[ RUN      ] global.detect_seccomp_filter_flags
[       OK ] global.detect_seccomp_filter_flags
[ RUN      ] global.TSYNC_first
[       OK ] global.TSYNC_first
[ RUN      ] TSYNC.siblings_fail_prctl
[       OK ] TSYNC.siblings_fail_prctl
[ RUN      ] TSYNC.two_siblings_with_ancestor
[       OK ] TSYNC.two_siblings_with_ancestor
[ RUN      ] TSYNC.two_sibling_want_nnp
[       OK ] TSYNC.two_sibling_want_nnp
[ RUN      ] TSYNC.two_siblings_with_no_filter
[       OK ] TSYNC.two_siblings_with_no_filter
[ RUN      ] TSYNC.two_siblings_with_one_divergence
[       OK ] TSYNC.two_siblings_with_one_divergence
[ RUN      ] TSYNC.two_siblings_not_under_filter
[       OK ] TSYNC.two_siblings_not_under_filter
[ RUN      ] global.syscall_restart
[       OK ] global.syscall_restart
[ RUN      ] global.filter_flag_log
[       OK ] global.filter_flag_log
[ RUN      ] global.get_action_avail
[       OK ] global.get_action_avail
[ RUN      ] global.get_metadata
[       OK ] global.get_metadata
[==========] 64 / 64 tests passed.
[  PASSED  ]
ok 1..1 selftests: seccomp: seccomp_bpf [PASS]
selftests: seccomp: seccomp_benchmark
========================================
Calibrating reasonable sample size...
1544120467.383132905 - 1544120467.382814604 = 318301
1544120467.384111505 - 1544120467.383931405 = 180100
1544120467.385728706 - 1544120467.384510905 = 1217801
1544120467.386858006 - 1544120467.386096506 = 761500
1544120467.388563407 - 1544120467.387171006 = 1392401
1544120467.392465908 - 1544120467.390143107 = 2322801
1544120467.397988410 - 1544120467.393666109 = 4322301
1544120467.406494614 - 1544120467.398347511 = 8147103
1544120467.427372522 - 1544120467.406955414 = 20417108
1544120467.467600338 - 1544120467.427772222 = 39828116
1544120467.542484667 - 1544120467.467954738 = 74529929
1544120467.693806026 - 1544120467.543004867 = 150801159
1544120467.970921334 - 1544120467.694244026 = 276677308
1544120468.522149049 - 1544120467.971549534 = 550599515
1544120469.637696984 - 1544120468.522606749 = 1115090235
1544120471.829467338 - 1544120469.638147084 = 2191320254
1544120476.263601568 - 1544120471.829850239 = 4433751329
1544120485.135465027 - 1544120476.263980268 = 8871484759
Benchmarking 4194304 samples...
26.716000000 - 17.812000000 = 8904000000
getpid native: 2122 ns
46.548000000 - 26.716000000 = 19832000000
getpid RET_ALLOW: 4728 ns
Estimated seccomp overhead per syscall: 2606 ns
ok 1..2 selftests: seccomp: seccomp_benchmark [PASS]

>
> > diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h
> > index 1c9cc8389928..1fd6e4130cab 100644
> > --- a/arch/riscv/include/asm/thread_info.h
> > +++ b/arch/riscv/include/asm/thread_info.h
> > @@ -81,6 +81,7 @@ struct thread_info {
> >  #define TIF_MEMDIE             5       /* is terminating due to OOM killer */
> >  #define TIF_SYSCALL_TRACEPOINT  6       /* syscall tracepoint instrumentation */
> >  #define TIF_SYSCALL_AUDIT      7       /* syscall auditing */
> > +#define TIF_SECCOMP                    8       /* syscall secure computing */
>
> Nit: extra tab needs to be removed.
>
> --
> Kees Cook
Kees Cook Dec. 6, 2018, 6:32 p.m. UTC | #8
On Thu, Dec 6, 2018 at 10:26 AM David Abdurachmanov
<david.abdurachmanov@gmail.com> wrote:
>
> On Thu, Dec 6, 2018 at 5:52 PM Kees Cook <keescook@chromium.org> wrote:
> >
> > On Thu, Dec 6, 2018 at 7:02 AM David Abdurachmanov
> > <david.abdurachmanov@gmail.com> wrote:
> > > The patch adds support for SECCOMP and SECCOMP_FILTER (BPF).
> >
> > Can you add support to tools/testing/selftests/seccomp/seccomp_bpf.c
> > as well? That selftest finds a lot of weird corner-cases...
>
> I hate it locally and will include in v2.

I hate it too. ;) But seriously (reading through the "have" typo),
that's awesome. Thanks!

> The results see fine (tested in QEMU).

Great!
diff mbox series

Patch

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index a4f48f757204..49cd8e251547 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -29,6 +29,7 @@  config RISCV
 	select GENERIC_SMP_IDLE_THREAD
 	select GENERIC_ATOMIC64 if !64BIT || !RISCV_ISA_A
 	select HAVE_ARCH_AUDITSYSCALL
+	select HAVE_ARCH_SECCOMP_FILTER
 	select HAVE_MEMBLOCK_NODE_MAP
 	select HAVE_DMA_CONTIGUOUS
 	select HAVE_FUTEX_CMPXCHG if FUTEX
@@ -228,6 +229,19 @@  menu "Kernel features"
 
 source "kernel/Kconfig.hz"
 
+config SECCOMP
+	bool "Enable seccomp to safely compute untrusted bytecode"
+	help
+	  This kernel feature is useful for number crunching applications
+	  that may need to compute untrusted bytecode during their
+	  execution. By using pipes or other transports made available to
+	  the process as file descriptors supporting the read/write
+	  syscalls, it's possible to isolate those applications in
+	  their own address space using seccomp. Once seccomp is
+	  enabled via prctl(PR_SET_SECCOMP), it cannot be disabled
+	  and the task is only allowed to execute a few safe syscalls
+	  defined by each seccomp mode.
+
 endmenu
 
 menu "Boot options"
diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h
index 1c9cc8389928..1fd6e4130cab 100644
--- a/arch/riscv/include/asm/thread_info.h
+++ b/arch/riscv/include/asm/thread_info.h
@@ -81,6 +81,7 @@  struct thread_info {
 #define TIF_MEMDIE		5	/* is terminating due to OOM killer */
 #define TIF_SYSCALL_TRACEPOINT  6       /* syscall tracepoint instrumentation */
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing */
+#define TIF_SECCOMP			8	/* syscall secure computing */
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
@@ -88,11 +89,13 @@  struct thread_info {
 #define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
 #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
+#define _TIF_SECCOMP		(1 << TIF_SECCOMP)
 
 #define _TIF_WORK_MASK \
 	(_TIF_NOTIFY_RESUME | _TIF_SIGPENDING | _TIF_NEED_RESCHED)
 
 #define _TIF_SYSCALL_WORK \
-	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT)
+	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT \
+	 _TIF_SECCOMP )
 
 #endif /* _ASM_RISCV_THREAD_INFO_H */
diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
index 355166f57205..e88ccbfa61ee 100644
--- a/arch/riscv/kernel/entry.S
+++ b/arch/riscv/kernel/entry.S
@@ -207,8 +207,25 @@  check_syscall_nr:
 	/* Check to make sure we don't jump to a bogus syscall number. */
 	li t0, __NR_syscalls
 	la s0, sys_ni_syscall
-	/* Syscall number held in a7 */
-	bgeu a7, t0, 1f
+	/*
+	 * The tracer can change syscall number to valid/invalid value.
+	 * We use syscall_set_nr helper in syscall_trace_enter thus we
+	 * cannot trust the current value in a7 and have to reload from
+	 * the current task pt_regs.
+	 */
+	REG_L a7, PT_A7(sp)
+	/*
+	 * Syscall number held in a7.
+	 * If syscall number is above allowed value, redirect to ni_syscall.
+	 */
+	bge a7, t0, 1f
+	/*
+	 * Check if syscall is rejected by tracer or seccomp, i.e., a7 == -1.
+	 * If yes, we pretend it was executed.
+	 */
+	li t1, -1
+	beq a7, t1, ret_from_syscall_rejected
+	/* Call syscall */
 	la s0, sys_call_table
 	slli t0, a7, RISCV_LGPTR
 	add s0, s0, t0
@@ -219,6 +236,12 @@  check_syscall_nr:
 ret_from_syscall:
 	/* Set user a0 to kernel a0 */
 	REG_S a0, PT_A0(sp)
+	/*
+	 * We didn't execute the actual syscall.
+	 * Seccomp already set return value for the current task pt_regs.
+	 * (If it was configured with SECCOMP_RET_ERRNO/TRACE)
+	 */
+ret_from_syscall_rejected:
 	/* Trace syscalls, but only if requested by the user. */
 	REG_L t0, TASK_TI_FLAGS(tp)
 	andi t0, t0, _TIF_SYSCALL_WORK
diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c
index c1b51539c3e2..598e48b8ca2b 100644
--- a/arch/riscv/kernel/ptrace.c
+++ b/arch/riscv/kernel/ptrace.c
@@ -160,6 +160,14 @@  void do_syscall_trace_enter(struct pt_regs *regs)
 		if (tracehook_report_syscall_entry(regs))
 			syscall_set_nr(current, regs, -1);
 
+	/*
+	 * Do the secure computing after ptrace; failures should be fast.
+	 * If this fails we might have return value in a0 from seccomp
+	 * (via SECCOMP_RET_ERRNO/TRACE).
+	 */
+	if (secure_computing(NULL) == -1)
+		syscall_set_nr(current, regs, -1);
+
 #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
 	if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
 		trace_sys_enter(regs, syscall_get_nr(current, regs));