diff mbox series

[v2] riscv: add support for SECCOMP and SECCOMP_FILTER

Message ID 20190822205533.4877-1-david.abdurachmanov@sifive.com (mailing list archive)
State New
Headers show
Series [v2] riscv: add support for SECCOMP and SECCOMP_FILTER | expand

Commit Message

David Abdurachmanov Aug. 22, 2019, 8:55 p.m. UTC
This patch was extensively tested on Fedora/RISCV (applied by default on
top of 5.2-rc7 kernel for <2 months). The patch was also tested with 5.3-rc
on QEMU and SiFive Unleashed board.

libseccomp (userspace) was rebased:
https://github.com/seccomp/libseccomp/pull/134

Fully passes libseccomp regression testing (simulation and live).

There is one failing kernel selftest: global.user_notification_signal

v1 -> v2:
  - return immediatly if secure_computing(NULL) returns -1
  - fixed whitespace issues
  - add missing seccomp.h
  - remove patch #2 (solved now)
  - add riscv to seccomp kernel selftest

Cc: keescook@chromium.org
Cc: me@carlosedp.com

Signed-off-by: David Abdurachmanov <david.abdurachmanov@sifive.com>
---
 arch/riscv/Kconfig                            | 14 ++++++++++
 arch/riscv/include/asm/seccomp.h              | 10 +++++++
 arch/riscv/include/asm/thread_info.h          |  5 +++-
 arch/riscv/kernel/entry.S                     | 27 +++++++++++++++++--
 arch/riscv/kernel/ptrace.c                    | 10 +++++++
 tools/testing/selftests/seccomp/seccomp_bpf.c |  8 +++++-
 6 files changed, 70 insertions(+), 4 deletions(-)
 create mode 100644 arch/riscv/include/asm/seccomp.h

Comments

Carlos de Paula Aug. 23, 2019, 10:54 p.m. UTC | #1
On Thu, Aug 22, 2019 at 5:56 PM David Abdurachmanov
<david.abdurachmanov@gmail.com> wrote:
>
> This patch was extensively tested on Fedora/RISCV (applied by default on
> top of 5.2-rc7 kernel for <2 months). The patch was also tested with 5.3-rc
> on QEMU and SiFive Unleashed board.
>
> libseccomp (userspace) was rebased:
> https://github.com/seccomp/libseccomp/pull/134
>
> Fully passes libseccomp regression testing (simulation and live).
>
> There is one failing kernel selftest: global.user_notification_signal
>
> v1 -> v2:
>   - return immediatly if secure_computing(NULL) returns -1
>   - fixed whitespace issues
>   - add missing seccomp.h
>   - remove patch #2 (solved now)
>   - add riscv to seccomp kernel selftest
>
> Cc: keescook@chromium.org
> Cc: me@carlosedp.com
>
> Signed-off-by: David Abdurachmanov <david.abdurachmanov@sifive.com>
> ---
>  arch/riscv/Kconfig                            | 14 ++++++++++
>  arch/riscv/include/asm/seccomp.h              | 10 +++++++
>  arch/riscv/include/asm/thread_info.h          |  5 +++-
>  arch/riscv/kernel/entry.S                     | 27 +++++++++++++++++--
>  arch/riscv/kernel/ptrace.c                    | 10 +++++++
>  tools/testing/selftests/seccomp/seccomp_bpf.c |  8 +++++-
>  6 files changed, 70 insertions(+), 4 deletions(-)
>  create mode 100644 arch/riscv/include/asm/seccomp.h
>
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index 59a4727ecd6c..441e63ff5adc 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -31,6 +31,7 @@ config RISCV
>         select GENERIC_SMP_IDLE_THREAD
>         select GENERIC_ATOMIC64 if !64BIT
>         select HAVE_ARCH_AUDITSYSCALL
> +       select HAVE_ARCH_SECCOMP_FILTER
>         select HAVE_MEMBLOCK_NODE_MAP
>         select HAVE_DMA_CONTIGUOUS
>         select HAVE_FUTEX_CMPXCHG if FUTEX
> @@ -235,6 +236,19 @@ menu "Kernel features"
>
>  source "kernel/Kconfig.hz"
>
> +config SECCOMP
> +       bool "Enable seccomp to safely compute untrusted bytecode"
> +       help
> +         This kernel feature is useful for number crunching applications
> +         that may need to compute untrusted bytecode during their
> +         execution. By using pipes or other transports made available to
> +         the process as file descriptors supporting the read/write
> +         syscalls, it's possible to isolate those applications in
> +         their own address space using seccomp. Once seccomp is
> +         enabled via prctl(PR_SET_SECCOMP), it cannot be disabled
> +         and the task is only allowed to execute a few safe syscalls
> +         defined by each seccomp mode.
> +
>  endmenu
>
>  menu "Boot options"
> diff --git a/arch/riscv/include/asm/seccomp.h b/arch/riscv/include/asm/seccomp.h
> new file mode 100644
> index 000000000000..bf7744ee3b3d
> --- /dev/null
> +++ b/arch/riscv/include/asm/seccomp.h
> @@ -0,0 +1,10 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#ifndef _ASM_SECCOMP_H
> +#define _ASM_SECCOMP_H
> +
> +#include <asm/unistd.h>
> +
> +#include <asm-generic/seccomp.h>
> +
> +#endif /* _ASM_SECCOMP_H */
> diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h
> index 905372d7eeb8..a0b2a29a0da1 100644
> --- a/arch/riscv/include/asm/thread_info.h
> +++ b/arch/riscv/include/asm/thread_info.h
> @@ -75,6 +75,7 @@ struct thread_info {
>  #define TIF_MEMDIE             5       /* is terminating due to OOM killer */
>  #define TIF_SYSCALL_TRACEPOINT  6       /* syscall tracepoint instrumentation */
>  #define TIF_SYSCALL_AUDIT      7       /* syscall auditing */
> +#define TIF_SECCOMP            8       /* syscall secure computing */
>
>  #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
>  #define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
> @@ -82,11 +83,13 @@ struct thread_info {
>  #define _TIF_NEED_RESCHED      (1 << TIF_NEED_RESCHED)
>  #define _TIF_SYSCALL_TRACEPOINT        (1 << TIF_SYSCALL_TRACEPOINT)
>  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
> +#define _TIF_SECCOMP           (1 << TIF_SECCOMP)
>
>  #define _TIF_WORK_MASK \
>         (_TIF_NOTIFY_RESUME | _TIF_SIGPENDING | _TIF_NEED_RESCHED)
>
>  #define _TIF_SYSCALL_WORK \
> -       (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT)
> +       (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT | \
> +        _TIF_SECCOMP )
>
>  #endif /* _ASM_RISCV_THREAD_INFO_H */
> diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
> index bc7a56e1ca6f..0bbedfa3e47d 100644
> --- a/arch/riscv/kernel/entry.S
> +++ b/arch/riscv/kernel/entry.S
> @@ -203,8 +203,25 @@ check_syscall_nr:
>         /* Check to make sure we don't jump to a bogus syscall number. */
>         li t0, __NR_syscalls
>         la s0, sys_ni_syscall
> -       /* Syscall number held in a7 */
> -       bgeu a7, t0, 1f
> +       /*
> +        * The tracer can change syscall number to valid/invalid value.
> +        * We use syscall_set_nr helper in syscall_trace_enter thus we
> +        * cannot trust the current value in a7 and have to reload from
> +        * the current task pt_regs.
> +        */
> +       REG_L a7, PT_A7(sp)
> +       /*
> +        * Syscall number held in a7.
> +        * If syscall number is above allowed value, redirect to ni_syscall.
> +        */
> +       bge a7, t0, 1f
> +       /*
> +        * Check if syscall is rejected by tracer or seccomp, i.e., a7 == -1.
> +        * If yes, we pretend it was executed.
> +        */
> +       li t1, -1
> +       beq a7, t1, ret_from_syscall_rejected
> +       /* Call syscall */
>         la s0, sys_call_table
>         slli t0, a7, RISCV_LGPTR
>         add s0, s0, t0
> @@ -215,6 +232,12 @@ check_syscall_nr:
>  ret_from_syscall:
>         /* Set user a0 to kernel a0 */
>         REG_S a0, PT_A0(sp)
> +       /*
> +        * We didn't execute the actual syscall.
> +        * Seccomp already set return value for the current task pt_regs.
> +        * (If it was configured with SECCOMP_RET_ERRNO/TRACE)
> +        */
> +ret_from_syscall_rejected:
>         /* Trace syscalls, but only if requested by the user. */
>         REG_L t0, TASK_TI_FLAGS(tp)
>         andi t0, t0, _TIF_SYSCALL_WORK
> diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c
> index 368751438366..63e47c9f85f0 100644
> --- a/arch/riscv/kernel/ptrace.c
> +++ b/arch/riscv/kernel/ptrace.c
> @@ -154,6 +154,16 @@ void do_syscall_trace_enter(struct pt_regs *regs)
>                 if (tracehook_report_syscall_entry(regs))
>                         syscall_set_nr(current, regs, -1);
>
> +       /*
> +        * Do the secure computing after ptrace; failures should be fast.
> +        * If this fails we might have return value in a0 from seccomp
> +        * (via SECCOMP_RET_ERRNO/TRACE).
> +        */
> +       if (secure_computing(NULL) == -1) {
> +               syscall_set_nr(current, regs, -1);
> +               return;
> +       }
> +
>  #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
>         if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
>                 trace_sys_enter(regs, syscall_get_nr(current, regs));
> diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
> index 6ef7f16c4cf5..492e0adad9d3 100644
> --- a/tools/testing/selftests/seccomp/seccomp_bpf.c
> +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
> @@ -112,6 +112,8 @@ struct seccomp_data {
>  #  define __NR_seccomp 383
>  # elif defined(__aarch64__)
>  #  define __NR_seccomp 277
> +# elif defined(__riscv)
> +#  define __NR_seccomp 277
>  # elif defined(__hppa__)
>  #  define __NR_seccomp 338
>  # elif defined(__powerpc__)
> @@ -1582,6 +1584,10 @@ TEST_F(TRACE_poke, getpid_runs_normally)
>  # define ARCH_REGS     struct user_pt_regs
>  # define SYSCALL_NUM   regs[8]
>  # define SYSCALL_RET   regs[0]
> +#elif defined(__riscv) && __riscv_xlen == 64
> +# define ARCH_REGS     struct user_regs_struct
> +# define SYSCALL_NUM   a7
> +# define SYSCALL_RET   a0
>  #elif defined(__hppa__)
>  # define ARCH_REGS     struct user_regs_struct
>  # define SYSCALL_NUM   gr[20]
> @@ -1671,7 +1677,7 @@ void change_syscall(struct __test_metadata *_metadata,
>         EXPECT_EQ(0, ret) {}
>
>  #if defined(__x86_64__) || defined(__i386__) || defined(__powerpc__) || \
> -    defined(__s390__) || defined(__hppa__)
> +    defined(__s390__) || defined(__hppa__) || defined(__riscv)
>         {
>                 regs.SYSCALL_NUM = syscall;
>         }
> --
> 2.21.0
>

Tested-by: Carlos de Paula <me@carlosedp.com>
Carlos de Paula Aug. 23, 2019, 11:01 p.m. UTC | #2
On Thu, Aug 22, 2019 at 5:56 PM David Abdurachmanov
<david.abdurachmanov@gmail.com> wrote:
>
> This patch was extensively tested on Fedora/RISCV (applied by default on
> top of 5.2-rc7 kernel for <2 months). The patch was also tested with 5.3-rc
> on QEMU and SiFive Unleashed board.
>
> libseccomp (userspace) was rebased:
> https://github.com/seccomp/libseccomp/pull/134
>
> Fully passes libseccomp regression testing (simulation and live).
>
> There is one failing kernel selftest: global.user_notification_signal
>
> v1 -> v2:
>   - return immediatly if secure_computing(NULL) returns -1
>   - fixed whitespace issues
>   - add missing seccomp.h
>   - remove patch #2 (solved now)
>   - add riscv to seccomp kernel selftest
>
> Cc: keescook@chromium.org
> Cc: me@carlosedp.com
>
> Signed-off-by: David Abdurachmanov <david.abdurachmanov@sifive.com>
> Tested-by: Carlos de Paula <me@carlosedp.com>
> ---
>  arch/riscv/Kconfig                            | 14 ++++++++++
>  arch/riscv/include/asm/seccomp.h              | 10 +++++++
>  arch/riscv/include/asm/thread_info.h          |  5 +++-
>  arch/riscv/kernel/entry.S                     | 27 +++++++++++++++++--
>  arch/riscv/kernel/ptrace.c                    | 10 +++++++
>  tools/testing/selftests/seccomp/seccomp_bpf.c |  8 +++++-
>  6 files changed, 70 insertions(+), 4 deletions(-)
>  create mode 100644 arch/riscv/include/asm/seccomp.h
>
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index 59a4727ecd6c..441e63ff5adc 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -31,6 +31,7 @@ config RISCV
>         select GENERIC_SMP_IDLE_THREAD
>         select GENERIC_ATOMIC64 if !64BIT
>         select HAVE_ARCH_AUDITSYSCALL
> +       select HAVE_ARCH_SECCOMP_FILTER
>         select HAVE_MEMBLOCK_NODE_MAP
>         select HAVE_DMA_CONTIGUOUS
>         select HAVE_FUTEX_CMPXCHG if FUTEX
> @@ -235,6 +236,19 @@ menu "Kernel features"
>
>  source "kernel/Kconfig.hz"
>
> +config SECCOMP
> +       bool "Enable seccomp to safely compute untrusted bytecode"
> +       help
> +         This kernel feature is useful for number crunching applications
> +         that may need to compute untrusted bytecode during their
> +         execution. By using pipes or other transports made available to
> +         the process as file descriptors supporting the read/write
> +         syscalls, it's possible to isolate those applications in
> +         their own address space using seccomp. Once seccomp is
> +         enabled via prctl(PR_SET_SECCOMP), it cannot be disabled
> +         and the task is only allowed to execute a few safe syscalls
> +         defined by each seccomp mode.
> +
>  endmenu
>
>  menu "Boot options"
> diff --git a/arch/riscv/include/asm/seccomp.h b/arch/riscv/include/asm/seccomp.h
> new file mode 100644
> index 000000000000..bf7744ee3b3d
> --- /dev/null
> +++ b/arch/riscv/include/asm/seccomp.h
> @@ -0,0 +1,10 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#ifndef _ASM_SECCOMP_H
> +#define _ASM_SECCOMP_H
> +
> +#include <asm/unistd.h>
> +
> +#include <asm-generic/seccomp.h>
> +
> +#endif /* _ASM_SECCOMP_H */
> diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h
> index 905372d7eeb8..a0b2a29a0da1 100644
> --- a/arch/riscv/include/asm/thread_info.h
> +++ b/arch/riscv/include/asm/thread_info.h
> @@ -75,6 +75,7 @@ struct thread_info {
>  #define TIF_MEMDIE             5       /* is terminating due to OOM killer */
>  #define TIF_SYSCALL_TRACEPOINT  6       /* syscall tracepoint instrumentation */
>  #define TIF_SYSCALL_AUDIT      7       /* syscall auditing */
> +#define TIF_SECCOMP            8       /* syscall secure computing */
>
>  #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
>  #define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
> @@ -82,11 +83,13 @@ struct thread_info {
>  #define _TIF_NEED_RESCHED      (1 << TIF_NEED_RESCHED)
>  #define _TIF_SYSCALL_TRACEPOINT        (1 << TIF_SYSCALL_TRACEPOINT)
>  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
> +#define _TIF_SECCOMP           (1 << TIF_SECCOMP)
>
>  #define _TIF_WORK_MASK \
>         (_TIF_NOTIFY_RESUME | _TIF_SIGPENDING | _TIF_NEED_RESCHED)
>
>  #define _TIF_SYSCALL_WORK \
> -       (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT)
> +       (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT | \
> +        _TIF_SECCOMP )
>
>  #endif /* _ASM_RISCV_THREAD_INFO_H */
> diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
> index bc7a56e1ca6f..0bbedfa3e47d 100644
> --- a/arch/riscv/kernel/entry.S
> +++ b/arch/riscv/kernel/entry.S
> @@ -203,8 +203,25 @@ check_syscall_nr:
>         /* Check to make sure we don't jump to a bogus syscall number. */
>         li t0, __NR_syscalls
>         la s0, sys_ni_syscall
> -       /* Syscall number held in a7 */
> -       bgeu a7, t0, 1f
> +       /*
> +        * The tracer can change syscall number to valid/invalid value.
> +        * We use syscall_set_nr helper in syscall_trace_enter thus we
> +        * cannot trust the current value in a7 and have to reload from
> +        * the current task pt_regs.
> +        */
> +       REG_L a7, PT_A7(sp)
> +       /*
> +        * Syscall number held in a7.
> +        * If syscall number is above allowed value, redirect to ni_syscall.
> +        */
> +       bge a7, t0, 1f
> +       /*
> +        * Check if syscall is rejected by tracer or seccomp, i.e., a7 == -1.
> +        * If yes, we pretend it was executed.
> +        */
> +       li t1, -1
> +       beq a7, t1, ret_from_syscall_rejected
> +       /* Call syscall */
>         la s0, sys_call_table
>         slli t0, a7, RISCV_LGPTR
>         add s0, s0, t0
> @@ -215,6 +232,12 @@ check_syscall_nr:
>  ret_from_syscall:
>         /* Set user a0 to kernel a0 */
>         REG_S a0, PT_A0(sp)
> +       /*
> +        * We didn't execute the actual syscall.
> +        * Seccomp already set return value for the current task pt_regs.
> +        * (If it was configured with SECCOMP_RET_ERRNO/TRACE)
> +        */
> +ret_from_syscall_rejected:
>         /* Trace syscalls, but only if requested by the user. */
>         REG_L t0, TASK_TI_FLAGS(tp)
>         andi t0, t0, _TIF_SYSCALL_WORK
> diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c
> index 368751438366..63e47c9f85f0 100644
> --- a/arch/riscv/kernel/ptrace.c
> +++ b/arch/riscv/kernel/ptrace.c
> @@ -154,6 +154,16 @@ void do_syscall_trace_enter(struct pt_regs *regs)
>                 if (tracehook_report_syscall_entry(regs))
>                         syscall_set_nr(current, regs, -1);
>
> +       /*
> +        * Do the secure computing after ptrace; failures should be fast.
> +        * If this fails we might have return value in a0 from seccomp
> +        * (via SECCOMP_RET_ERRNO/TRACE).
> +        */
> +       if (secure_computing(NULL) == -1) {
> +               syscall_set_nr(current, regs, -1);
> +               return;
> +       }
> +
>  #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
>         if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
>                 trace_sys_enter(regs, syscall_get_nr(current, regs));
> diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
> index 6ef7f16c4cf5..492e0adad9d3 100644
> --- a/tools/testing/selftests/seccomp/seccomp_bpf.c
> +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
> @@ -112,6 +112,8 @@ struct seccomp_data {
>  #  define __NR_seccomp 383
>  # elif defined(__aarch64__)
>  #  define __NR_seccomp 277
> +# elif defined(__riscv)
> +#  define __NR_seccomp 277
>  # elif defined(__hppa__)
>  #  define __NR_seccomp 338
>  # elif defined(__powerpc__)
> @@ -1582,6 +1584,10 @@ TEST_F(TRACE_poke, getpid_runs_normally)
>  # define ARCH_REGS     struct user_pt_regs
>  # define SYSCALL_NUM   regs[8]
>  # define SYSCALL_RET   regs[0]
> +#elif defined(__riscv) && __riscv_xlen == 64
> +# define ARCH_REGS     struct user_regs_struct
> +# define SYSCALL_NUM   a7
> +# define SYSCALL_RET   a0
>  #elif defined(__hppa__)
>  # define ARCH_REGS     struct user_regs_struct
>  # define SYSCALL_NUM   gr[20]
> @@ -1671,7 +1677,7 @@ void change_syscall(struct __test_metadata *_metadata,
>         EXPECT_EQ(0, ret) {}
>
>  #if defined(__x86_64__) || defined(__i386__) || defined(__powerpc__) || \
> -    defined(__s390__) || defined(__hppa__)
> +    defined(__s390__) || defined(__hppa__) || defined(__riscv)
>         {
>                 regs.SYSCALL_NUM = syscall;
>         }
> --
> 2.21.0
>

Kernel selftests results:

➜ uname -a
Linux fedora-unleashed 5.2.0-rc7-30159-g2d072d4-dirty #3 SMP Thu Jul 4
20:18:21 -03 2019 riscv64 riscv64 riscv64 GNU/Linux

➜ sudo ./seccomp_bpf
[==========] Running 74 tests from 1 test cases.
[ RUN      ] global.mode_strict_support
[       OK ] global.mode_strict_support
[ RUN      ] global.mode_strict_cannot_call_prctl
[       OK ] global.mode_strict_cannot_call_prctl
[ RUN      ] global.no_new_privs_support
[       OK ] global.no_new_privs_support
[ RUN      ] global.mode_filter_support
[       OK ] global.mode_filter_support
[ RUN      ] global.mode_filter_without_nnp
[       OK ] global.mode_filter_without_nnp
[ RUN      ] global.filter_size_limits
[       OK ] global.filter_size_limits
[ RUN      ] global.filter_chain_limits
[       OK ] global.filter_chain_limits
[ RUN      ] global.mode_filter_cannot_move_to_strict
[       OK ] global.mode_filter_cannot_move_to_strict
[ RUN      ] global.mode_filter_get_seccomp
[       OK ] global.mode_filter_get_seccomp
[ RUN      ] global.ALLOW_all
[       OK ] global.ALLOW_all
[ RUN      ] global.empty_prog
[       OK ] global.empty_prog
[ RUN      ] global.log_all
[       OK ] global.log_all
[ RUN      ] global.unknown_ret_is_kill_inside
[       OK ] global.unknown_ret_is_kill_inside
[ RUN      ] global.unknown_ret_is_kill_above_allow
[       OK ] global.unknown_ret_is_kill_above_allow
[ RUN      ] global.KILL_all
[       OK ] global.KILL_all
[ RUN      ] global.KILL_one
[       OK ] global.KILL_one
[ RUN      ] global.KILL_one_arg_one
[       OK ] global.KILL_one_arg_one
[ RUN      ] global.KILL_one_arg_six
[       OK ] global.KILL_one_arg_six
[ RUN      ] global.KILL_thread
[       OK ] global.KILL_thread
[ RUN      ] global.KILL_process
[       OK ] global.KILL_process
[ RUN      ] global.arg_out_of_range
[       OK ] global.arg_out_of_range
[ RUN      ] global.ERRNO_valid
[       OK ] global.ERRNO_valid
[ RUN      ] global.ERRNO_zero
[       OK ] global.ERRNO_zero
[ RUN      ] global.ERRNO_capped
[       OK ] global.ERRNO_capped
[ RUN      ] global.ERRNO_order
[       OK ] global.ERRNO_order
[ RUN      ] TRAP.dfl
[       OK ] TRAP.dfl
[ RUN      ] TRAP.ign
[       OK ] TRAP.ign
[ RUN      ] TRAP.handler
[       OK ] TRAP.handler
[ RUN      ] precedence.allow_ok
[       OK ] precedence.allow_ok
[ RUN      ] precedence.kill_is_highest
[       OK ] precedence.kill_is_highest
[ RUN      ] precedence.kill_is_highest_in_any_order
[       OK ] precedence.kill_is_highest_in_any_order
[ RUN      ] precedence.trap_is_second
[       OK ] precedence.trap_is_second
[ RUN      ] precedence.trap_is_second_in_any_order
[       OK ] precedence.trap_is_second_in_any_order
[ RUN      ] precedence.errno_is_third
[       OK ] precedence.errno_is_third
[ RUN      ] precedence.errno_is_third_in_any_order
[       OK ] precedence.errno_is_third_in_any_order
[ RUN      ] precedence.trace_is_fourth
[       OK ] precedence.trace_is_fourth
[ RUN      ] precedence.trace_is_fourth_in_any_order
[       OK ] precedence.trace_is_fourth_in_any_order
[ RUN      ] precedence.log_is_fifth
[       OK ] precedence.log_is_fifth
[ RUN      ] precedence.log_is_fifth_in_any_order
[       OK ] precedence.log_is_fifth_in_any_order
[ RUN      ] TRACE_poke.read_has_side_effects
[       OK ] TRACE_poke.read_has_side_effects
[ RUN      ] TRACE_poke.getpid_runs_normally
[       OK ] TRACE_poke.getpid_runs_normally
[ RUN      ] TRACE_syscall.ptrace_syscall_redirected
[       OK ] TRACE_syscall.ptrace_syscall_redirected
[ RUN      ] TRACE_syscall.ptrace_syscall_errno
[       OK ] TRACE_syscall.ptrace_syscall_errno
[ RUN      ] TRACE_syscall.ptrace_syscall_faked
[       OK ] TRACE_syscall.ptrace_syscall_faked
[ RUN      ] TRACE_syscall.syscall_allowed
[       OK ] TRACE_syscall.syscall_allowed
[ RUN      ] TRACE_syscall.syscall_redirected
[       OK ] TRACE_syscall.syscall_redirected
[ RUN      ] TRACE_syscall.syscall_errno
[       OK ] TRACE_syscall.syscall_errno
[ RUN      ] TRACE_syscall.syscall_faked
[       OK ] TRACE_syscall.syscall_faked
[ RUN      ] TRACE_syscall.skip_after_RET_TRACE
[       OK ] TRACE_syscall.skip_after_RET_TRACE
[ RUN      ] TRACE_syscall.kill_after_RET_TRACE
[       OK ] TRACE_syscall.kill_after_RET_TRACE
[ RUN      ] TRACE_syscall.skip_after_ptrace
[       OK ] TRACE_syscall.skip_after_ptrace
[ RUN      ] TRACE_syscall.kill_after_ptrace
[       OK ] TRACE_syscall.kill_after_ptrace
[ RUN      ] global.seccomp_syscall
[       OK ] global.seccomp_syscall
[ RUN      ] global.seccomp_syscall_mode_lock
[       OK ] global.seccomp_syscall_mode_lock
[ RUN      ] global.detect_seccomp_filter_flags
[       OK ] global.detect_seccomp_filter_flags
[ RUN      ] global.TSYNC_first
[       OK ] global.TSYNC_first
[ RUN      ] TSYNC.siblings_fail_prctl
[       OK ] TSYNC.siblings_fail_prctl
[ RUN      ] TSYNC.two_siblings_with_ancestor
[       OK ] TSYNC.two_siblings_with_ancestor
[ RUN      ] TSYNC.two_sibling_want_nnp
[       OK ] TSYNC.two_sibling_want_nnp
[ RUN      ] TSYNC.two_siblings_with_no_filter
[       OK ] TSYNC.two_siblings_with_no_filter
[ RUN      ] TSYNC.two_siblings_with_one_divergence
[       OK ] TSYNC.two_siblings_with_one_divergence
[ RUN      ] TSYNC.two_siblings_not_under_filter
[       OK ] TSYNC.two_siblings_not_under_filter
[ RUN      ] global.syscall_restart
[       OK ] global.syscall_restart
[ RUN      ] global.filter_flag_log
[       OK ] global.filter_flag_log
[ RUN      ] global.get_action_avail
[       OK ] global.get_action_avail
[ RUN      ] global.get_metadata
[       OK ] global.get_metadata
[ RUN      ] global.user_notification_basic
[       OK ] global.user_notification_basic
[ RUN      ] global.user_notification_kill_in_middle
[       OK ] global.user_notification_kill_in_middle
[ RUN      ] global.user_notification_signal
[1]    5951 alarm      sudo ./seccomp_bpf

carlosedp in ~ at fedora-unleashed
➜ sudo ./seccomp_benchmark
Calibrating reasonable sample size...
1564584448.964538790 - 1564584448.964529687 = 9103
1564584448.964588859 - 1564584448.964575204 = 13655
1564584448.964631342 - 1564584448.964604790 = 26552
1564584448.964710239 - 1564584448.964644997 = 65242
1564584448.964842239 - 1564584448.964726928 = 115311
1564584448.965072859 - 1564584448.964857411 = 215448
1564584448.965513618 - 1564584448.965089549 = 424069
1564584448.966417894 - 1564584448.965532584 = 885310
1564584448.968286377 - 1564584448.966443687 = 1842690
1564584448.971667549 - 1564584448.968314446 = 3353103
1564584448.978288790 - 1564584448.971694101 = 6594689
1564584448.991803618 - 1564584448.978313066 = 13490552
1564584449.017692308 - 1564584448.991836239 = 25856069
1564584449.069651756 - 1564584449.017713549 = 51938207
1564584449.173110928 - 1564584449.069673756 = 103437172
1564584449.380001204 - 1564584449.173132928 = 206868276
1564584449.793857618 - 1564584449.380041411 = 413816207
1564584450.625367342 - 1564584449.793898584 = 831468758
1564584452.299529411 - 1564584450.625426514 = 1674102897
1564584455.665938307 - 1564584452.299592376 = 3366345931
1564584462.331777479 - 1564584455.665973962 = 6665803517
Benchmarking 33554432 samples...
18.107882743 - 12.075641371 = 6032241372
getpid native: 179 ns
34.720410331 - 18.107978605 = 16612431726
getpid RET_ALLOW: 495 ns
Estimated seccomp overhead per syscall: 316 n
Paul Walmsley Aug. 24, 2019, 12:30 a.m. UTC | #3
On Thu, 22 Aug 2019, David Abdurachmanov wrote:

> There is one failing kernel selftest: global.user_notification_signal

Is this the only failing test?  Or are the rest of the selftests skipped 
when this test fails, and no further tests are run, as seems to be shown 
here:

  https://lore.kernel.org/linux-riscv/CADnnUqcmDMRe1f+3jG8SPR6jRrnBsY8VVD70VbKEm0NqYeoicA@mail.gmail.com/

For example, looking at the source, I'd naively expect to see the 
user_notification_closed_listener test result -- which follows right 
after the failing test in the selftest source.  But there aren't any 
results?

Also - could you follow up with the author of this failing test to see if 
we can get some more clarity about what might be going wrong here?  It 
appears that the failing test was added in commit 6a21cc50f0c7f ("seccomp: 
add a return code to trap to userspace") by Tycho Andersen 
<tycho@tycho.ws>.


- Paul
David Abdurachmanov Aug. 24, 2019, 1:04 a.m. UTC | #4
On Fri, Aug 23, 2019 at 5:30 PM Paul Walmsley <paul.walmsley@sifive.com> wrote:
>
> On Thu, 22 Aug 2019, David Abdurachmanov wrote:
>
> > There is one failing kernel selftest: global.user_notification_signal
>
> Is this the only failing test?  Or are the rest of the selftests skipped
> when this test fails, and no further tests are run, as seems to be shown
> here:
>
>   https://lore.kernel.org/linux-riscv/CADnnUqcmDMRe1f+3jG8SPR6jRrnBsY8VVD70VbKEm0NqYeoicA@mail.gmail.com/

Yes, it's a single test failing. After removing global.user_notification_signal
test everything else pass and you get the results printed.

>
> For example, looking at the source, I'd naively expect to see the
> user_notification_closed_listener test result -- which follows right
> after the failing test in the selftest source.  But there aren't any
> results?

Yes, it hangs at this point. You have to manually terminate it.

>
> Also - could you follow up with the author of this failing test to see if
> we can get some more clarity about what might be going wrong here?  It
> appears that the failing test was added in commit 6a21cc50f0c7f ("seccomp:
> add a return code to trap to userspace") by Tycho Andersen
> <tycho@tycho.ws>.

Well the code states ".. and hope that it doesn't break when there
is actually a signal :)". Maybe we are just unlucky. I don't have results
from other architectures to compare.

I found that Linaro is running selftests, but SECCOMP is disabled
and thus it's failing. Is there another CI which tracks selftests?

https://qa-reports.linaro.org/lkft/linux-next-oe/tests/kselftest/seccomp_seccomp_bpf?top=next-20190823

>
>
> - Paul
David Abdurachmanov Aug. 24, 2019, 1:10 a.m. UTC | #5
On Fri, Aug 23, 2019 at 6:04 PM David Abdurachmanov
<david.abdurachmanov@gmail.com> wrote:
>
> On Fri, Aug 23, 2019 at 5:30 PM Paul Walmsley <paul.walmsley@sifive.com> wrote:
> >
> > On Thu, 22 Aug 2019, David Abdurachmanov wrote:
> >
> > > There is one failing kernel selftest: global.user_notification_signal
> >
> > Is this the only failing test?  Or are the rest of the selftests skipped
> > when this test fails, and no further tests are run, as seems to be shown
> > here:
> >
> >   https://lore.kernel.org/linux-riscv/CADnnUqcmDMRe1f+3jG8SPR6jRrnBsY8VVD70VbKEm0NqYeoicA@mail.gmail.com/
>
> Yes, it's a single test failing. After removing global.user_notification_signal
> test everything else pass and you get the results printed.
>
> >
> > For example, looking at the source, I'd naively expect to see the
> > user_notification_closed_listener test result -- which follows right
> > after the failing test in the selftest source.  But there aren't any
> > results?
>
> Yes, it hangs at this point. You have to manually terminate it.
>
> >
> > Also - could you follow up with the author of this failing test to see if
> > we can get some more clarity about what might be going wrong here?  It
> > appears that the failing test was added in commit 6a21cc50f0c7f ("seccomp:
> > add a return code to trap to userspace") by Tycho Andersen
> > <tycho@tycho.ws>.
>
> Well the code states ".. and hope that it doesn't break when there
> is actually a signal :)". Maybe we are just unlucky. I don't have results
> from other architectures to compare.
>
> I found that Linaro is running selftests, but SECCOMP is disabled
> and thus it's failing. Is there another CI which tracks selftests?
>
> https://qa-reports.linaro.org/lkft/linux-next-oe/tests/kselftest/seccomp_seccomp_bpf?top=next-20190823

Actually it seems that seccomp is enabled in kernel, but not in
systemd, and somehow seccomp_bpf is missing on all arches thus
causing automatic failure.

> >
> >
> > - Paul
Paul Walmsley Aug. 24, 2019, 1:18 a.m. UTC | #6
On Fri, 23 Aug 2019, David Abdurachmanov wrote:

> On Fri, Aug 23, 2019 at 5:30 PM Paul Walmsley <paul.walmsley@sifive.com> wrote:
> >
> > On Thu, 22 Aug 2019, David Abdurachmanov wrote:
> >
> > > There is one failing kernel selftest: global.user_notification_signal
> >
> > Is this the only failing test?  Or are the rest of the selftests skipped
> > when this test fails, and no further tests are run, as seems to be shown
> > here:
> >
> >   https://lore.kernel.org/linux-riscv/CADnnUqcmDMRe1f+3jG8SPR6jRrnBsY8VVD70VbKEm0NqYeoicA@mail.gmail.com/
> 
> Yes, it's a single test failing. After removing global.user_notification_signal
> test everything else pass and you get the results printed.

OK.

> Well the code states ".. and hope that it doesn't break when there
> is actually a signal :)". Maybe we are just unlucky. I don't have results
> from other architectures to compare.
> 
> I found that Linaro is running selftests, but SECCOMP is disabled
> and thus it's failing. Is there another CI which tracks selftests?

0day runs the kselftests, and at least on some architectures/Kconfigs, 
it's succeeding:

https://lore.kernel.org/lkml/20190726083740.GG22106@shao2-debian/

https://lore.kernel.org/lkml/20190712064850.GC20848@shao2-debian/

https://lore.kernel.org/lkml/20190311074115.GC10839@shao2-debian/

etc.


- Paul
Kees Cook Aug. 25, 2019, 9:51 p.m. UTC | #7
On Fri, Aug 23, 2019 at 05:30:53PM -0700, Paul Walmsley wrote:
> On Thu, 22 Aug 2019, David Abdurachmanov wrote:
> 
> > There is one failing kernel selftest: global.user_notification_signal
> 
> Is this the only failing test?  Or are the rest of the selftests skipped 
> when this test fails, and no further tests are run, as seems to be shown 
> here:
> 
>   https://lore.kernel.org/linux-riscv/CADnnUqcmDMRe1f+3jG8SPR6jRrnBsY8VVD70VbKEm0NqYeoicA@mail.gmail.com/
> 
> For example, looking at the source, I'd naively expect to see the 
> user_notification_closed_listener test result -- which follows right 
> after the failing test in the selftest source.  But there aren't any 
> results?
> 
> Also - could you follow up with the author of this failing test to see if 
> we can get some more clarity about what might be going wrong here?  It 
> appears that the failing test was added in commit 6a21cc50f0c7f ("seccomp: 
> add a return code to trap to userspace") by Tycho Andersen 
> <tycho@tycho.ws>.

So, the original email says the riscv series is tested on top of 5.2-rc7,
but just for fun, can you confirm that you're building a tree that includes
9dd3fcb0ab73 ("selftests/seccomp: Handle namespace failures gracefully")? I
assume it does, but I suspect something similar is happening, where the
environment is slightly different than expected and the test stalls.

Does it behave the same way under emulation (i.e. can I hope to
reproduce this myself?)
Kees Cook Aug. 25, 2019, 9:59 p.m. UTC | #8
On Thu, Aug 22, 2019 at 01:55:22PM -0700, David Abdurachmanov wrote:
> This patch was extensively tested on Fedora/RISCV (applied by default on
> top of 5.2-rc7 kernel for <2 months). The patch was also tested with 5.3-rc
> on QEMU and SiFive Unleashed board.

Oops, I see the mention of QEMU here. Where's the best place to find
instructions on creating a qemu riscv image/environment?

> There is one failing kernel selftest: global.user_notification_signal

This test has been fragile (and is not arch-specific), so as long as
everything else is passing, I would call this patch ready to go. :)

Reviewed-by: Kees Cook <keescook@chromium.org>
Tycho Andersen Aug. 26, 2019, 2:57 p.m. UTC | #9
Hi,

On Fri, Aug 23, 2019 at 05:30:53PM -0700, Paul Walmsley wrote:
> On Thu, 22 Aug 2019, David Abdurachmanov wrote:
> 
> > There is one failing kernel selftest: global.user_notification_signal
> 
> Also - could you follow up with the author of this failing test to see if 
> we can get some more clarity about what might be going wrong here?  It 
> appears that the failing test was added in commit 6a21cc50f0c7f ("seccomp: 
> add a return code to trap to userspace") by Tycho Andersen 
> <tycho@tycho.ws>.

Can you post an strace and a cat of /proc/$pid/stack for both tasks
where it gets stuck? I don't have any riscv hardware, and it "works
for me" on x86 and arm64 with 100 tries.

Thanks,

Tycho
David Abdurachmanov Aug. 26, 2019, 4:39 p.m. UTC | #10
On Mon, Aug 26, 2019 at 7:57 AM Tycho Andersen <tycho@tycho.ws> wrote:
>
> Hi,
>
> On Fri, Aug 23, 2019 at 05:30:53PM -0700, Paul Walmsley wrote:
> > On Thu, 22 Aug 2019, David Abdurachmanov wrote:
> >
> > > There is one failing kernel selftest: global.user_notification_signal
> >
> > Also - could you follow up with the author of this failing test to see if
> > we can get some more clarity about what might be going wrong here?  It
> > appears that the failing test was added in commit 6a21cc50f0c7f ("seccomp:
> > add a return code to trap to userspace") by Tycho Andersen
> > <tycho@tycho.ws>.
>
> Can you post an strace and a cat of /proc/$pid/stack for both tasks
> where it gets stuck? I don't have any riscv hardware, and it "works
> for me" on x86 and arm64 with 100 tries.

I don't have the a build with SECCOMP for the board right now, so it
will have to wait. I just finished a new kernel (almost rc6) for Fedora,
but it will take time to assemble new repositories and a disk image.

There is older disk image available (5.2.0-rc7 kernel with v2 SECCOMP)
for QEMU or libvirt/QEMU:

https://dl.fedoraproject.org/pub/alt/risc-v/disk-images/fedora/rawhide/20190703.n.0/Developer/
https://fedoraproject.org/wiki/Architectures/RISC-V/Installing#Boot_with_libvirt

(If you are interesting trying it locally.)

IIRC I attempted to connected with strace, but it quickly returns and fails
properly. Simply put strace unblocks whatever is stuck.

david
Kees Cook Aug. 26, 2019, 5:48 p.m. UTC | #11
On Mon, Aug 26, 2019 at 09:39:50AM -0700, David Abdurachmanov wrote:
> I don't have the a build with SECCOMP for the board right now, so it
> will have to wait. I just finished a new kernel (almost rc6) for Fedora,

FWIW, I don't think this should block landing the code: all the tests
fail without seccomp support. ;) So this patch is an improvement!
Andy Lutomirski Aug. 28, 2019, 5:52 p.m. UTC | #12
> On Aug 25, 2019, at 2:59 PM, Kees Cook <keescook@chromium.org> wrote:
> 
>> On Thu, Aug 22, 2019 at 01:55:22PM -0700, David Abdurachmanov wrote:
>> This patch was extensively tested on Fedora/RISCV (applied by default on
>> top of 5.2-rc7 kernel for <2 months). The patch was also tested with 5.3-rc
>> on QEMU and SiFive Unleashed board.
> 
> Oops, I see the mention of QEMU here. Where's the best place to find
> instructions on creating a qemu riscv image/environment?

I don’t suppose one of you riscv folks would like to contribute riscv support to virtme?  virtme-run —arch=riscv would be quite nice, and the total patch should be just a couple lines.  Unfortunately, it helps a lot to understand the subtleties of booting the architecture to write those couple lines :)
Kees Cook Aug. 28, 2019, 6:01 p.m. UTC | #13
On Wed, Aug 28, 2019 at 10:52:05AM -0700, Andy Lutomirski wrote:
> 
> 
> > On Aug 25, 2019, at 2:59 PM, Kees Cook <keescook@chromium.org> wrote:
> > 
> >> On Thu, Aug 22, 2019 at 01:55:22PM -0700, David Abdurachmanov wrote:
> >> This patch was extensively tested on Fedora/RISCV (applied by default on
> >> top of 5.2-rc7 kernel for <2 months). The patch was also tested with 5.3-rc
> >> on QEMU and SiFive Unleashed board.
> > 
> > Oops, I see the mention of QEMU here. Where's the best place to find
> > instructions on creating a qemu riscv image/environment?
> 
> I don’t suppose one of you riscv folks would like to contribute riscv support to virtme?  virtme-run —arch=riscv would be quite nice, and the total patch should be just a couple lines.  Unfortunately, it helps a lot to understand the subtleties of booting the architecture to write those couple lines :)

As it turns out, this is where I'm stuck. All the instructions I can
find are about booting a kernel off a disk image. :(
David Abdurachmanov Aug. 28, 2019, 9:37 p.m. UTC | #14
On Wed, Aug 28, 2019 at 10:36 AM Kees Cook <keescook@chromium.org> wrote:
>
> On Thu, Aug 22, 2019 at 01:55:22PM -0700, David Abdurachmanov wrote:
> > This patch was extensively tested on Fedora/RISCV (applied by default on
> > top of 5.2-rc7 kernel for <2 months). The patch was also tested with 5.3-rc
> > on QEMU and SiFive Unleashed board.
>
> Oops, I see the mention of QEMU here. Where's the best place to find
> instructions on creating a qemu riscv image/environment?

Examples from what I personally use:
https://github.com/riscv/meta-riscv
https://fedoraproject.org/wiki/Architectures/RISC-V/Installing#Boot_with_libvirt
(might be outdated)

If you are running machine with a properly working libvirt/QEMU setup:

VIRTBUILDER_IMAGE=fedora-rawhide-developer-20190703n0
FIRMWARE=fw_payload-uboot-qemu-virt-smode.elf
wget https://dl.fedoraproject.org/pub/alt/risc-v/disk-images/fedora/rawhide/20190703.n.0/Developer/$FIRMWARE
echo riscv > /tmp/rootpw
virt-builder \
    --verbose \
    --source https://dl.fedoraproject.org/pub/alt/risc-v/repo/virt-builder-images/images/index
\
    --no-check-signature \
    --arch riscv64 \
    --size 10G \
    --format raw \
    --hostname fedora-riscv \
    -o disk \
    --root-password file:/tmp/rootpw \
    ${VIRTBUILDER_IMAGE}

sudo virt-install \
    --name fedora-riscv \
    --arch riscv64 \
    --vcpus 4 \
    --memory 3048 \
    --import \
    --disk path=$PWD/disk \
    --boot kernel=$PWD/${FIRMWARE} \
    --network network=default \
    --graphics none \
    --serial log.file=/tmp/fedora-riscv.serial.log \
    --noautoconsole

The following does incl. SECCOMP v2 patch on top of 5.2-rc7 kernel.

>
> > There is one failing kernel selftest: global.user_notification_signal
>
> This test has been fragile (and is not arch-specific), so as long as
> everything else is passing, I would call this patch ready to go. :)
>
> Reviewed-by: Kees Cook <keescook@chromium.org>
>
> --
> Kees Cook
David Abdurachmanov Aug. 28, 2019, 9:39 p.m. UTC | #15
On Wed, Aug 28, 2019 at 10:36 AM Kees Cook <keescook@chromium.org> wrote:
>
> On Fri, Aug 23, 2019 at 05:30:53PM -0700, Paul Walmsley wrote:
> > On Thu, 22 Aug 2019, David Abdurachmanov wrote:
> >
> > > There is one failing kernel selftest: global.user_notification_signal
> >
> > Is this the only failing test?  Or are the rest of the selftests skipped
> > when this test fails, and no further tests are run, as seems to be shown
> > here:
> >
> >   https://lore.kernel.org/linux-riscv/CADnnUqcmDMRe1f+3jG8SPR6jRrnBsY8VVD70VbKEm0NqYeoicA@mail.gmail.com/
> >
> > For example, looking at the source, I'd naively expect to see the
> > user_notification_closed_listener test result -- which follows right
> > after the failing test in the selftest source.  But there aren't any
> > results?
> >
> > Also - could you follow up with the author of this failing test to see if
> > we can get some more clarity about what might be going wrong here?  It
> > appears that the failing test was added in commit 6a21cc50f0c7f ("seccomp:
> > add a return code to trap to userspace") by Tycho Andersen
> > <tycho@tycho.ws>.
>
> So, the original email says the riscv series is tested on top of 5.2-rc7,
> but just for fun, can you confirm that you're building a tree that includes
> 9dd3fcb0ab73 ("selftests/seccomp: Handle namespace failures gracefully")? I
> assume it does, but I suspect something similar is happening, where the
> environment is slightly different than expected and the test stalls.
>
> Does it behave the same way under emulation (i.e. can I hope to
> reproduce this myself?)

This was tested in 5.2-rc7 and later in 5.3-rc with the same behavior.
Also VM or physical HW doesn't matter, same result.

>
> --
> Kees Cook
Kees Cook Aug. 28, 2019, 11:44 p.m. UTC | #16
On Wed, Aug 28, 2019 at 02:37:34PM -0700, David Abdurachmanov wrote:
>     --disk path=$PWD/disk \
>     --boot kernel=$PWD/${FIRMWARE} \

This is where I tripped over things. How do I specify the kernel to boot
from OUTSIDE the disk image?
Paul Walmsley Aug. 29, 2019, 1:30 a.m. UTC | #17
Hi Kees,

On Mon, 26 Aug 2019, Kees Cook wrote:

> On Mon, Aug 26, 2019 at 09:39:50AM -0700, David Abdurachmanov wrote:
> > I don't have the a build with SECCOMP for the board right now, so it
> > will have to wait. I just finished a new kernel (almost rc6) for Fedora,
> 
> FWIW, I don't think this should block landing the code: all the tests
> fail without seccomp support. ;) So this patch is an improvement!

Am sympathetic to this -- we did it with the hugetlb patches for RISC-V -- 
but it would be good to understand a little bit more about why the test 
fails before we merge it.

Once we merge the patch, it will probably reduce the motivation for others 
to either understand and fix the underlying problem with the RISC-V code 
-- or, if it truly is a flaky test, to drop (or fix) the test in the 
seccomp_bpf kselftests.

Thanks for helping to take a closer look at this,

- Paul
Palmer Dabbelt Sept. 3, 2019, 10:27 p.m. UTC | #18
On Wed, 28 Aug 2019 10:52:05 PDT (-0700), luto@amacapital.net wrote:
>
>
>> On Aug 25, 2019, at 2:59 PM, Kees Cook <keescook@chromium.org> wrote:
>> 
>>> On Thu, Aug 22, 2019 at 01:55:22PM -0700, David Abdurachmanov wrote:
>>> This patch was extensively tested on Fedora/RISCV (applied by default on
>>> top of 5.2-rc7 kernel for <2 months). The patch was also tested with 5.3-rc
>>> on QEMU and SiFive Unleashed board.
>> 
>> Oops, I see the mention of QEMU here. Where's the best place to find
>> instructions on creating a qemu riscv image/environment?
>
> I don’t suppose one of you riscv folks would like to contribute riscv support to virtme?  virtme-run —arch=riscv would be quite nice, and the total patch should be just a couple lines.  Unfortunately, it helps a lot to understand the subtleties of booting the architecture to write those couple lines :)

What mailing list should I sent this to?  You need to use the "virtme" branch 
of kernel.org/palmer/linux.git until I send the defconfig patches.

commit a8bd7b318691891991caea298f9a5ed0f815c322
gpg: Signature made Tue 03 Sep 2019 03:22:45 PM PDT
gpg:                using RSA key 00CE76D1834960DFCE886DF8EF4CA1502CCBAB41
gpg:                issuer "palmer@dabbelt.com"
gpg: Good signature from "Palmer Dabbelt <palmer@dabbelt.com>" [ultimate]
gpg:                 aka "Palmer Dabbelt <palmer@sifive.com>" [ultimate]
Author: Palmer Dabbelt <palmer@sifive.com>
Date:   Tue Sep 3 14:39:39 2019 -0700

    Add RISC-V support

    This expects a kernel with the plan 9 stuff supported (not yet in
    defconfig) and a new QEMU (as described in the README).  I'm also not
    100% sure it's working, as I'm getting

        /bin/sh: exec: line 1: /run/virtme/guesttools/virtme-init: not found

    Signed-off-by: Palmer Dabbelt <palmer@sifive.com>

diff --git a/README.md b/README.md
index 51b6583..d53a456 100644
--- a/README.md
+++ b/README.md
@@ -112,6 +112,14 @@ PPC64

 PPC64 appears to be reasonably functional.

+RISC-V
+------
+
+riscv64 works out of the box, but you'll neet at least QEMU-4.1.0 to be
+able to run `vmlinux`-style kernels.  riscv32 is not supported because
+there are no existing userspace images for it.  Support is provided via
+QEMU's `virt` machine with OpenSBI for firmware.
+
 Others
 ------

diff --git a/virtme/architectures.py b/virtme/architectures.py
index 9871ea4..ee84494 100644
--- a/virtme/architectures.py
+++ b/virtme/architectures.py
@@ -207,6 +207,30 @@ class Arch_ppc64(Arch):
         # Apparently SLOF (QEMU's bundled firmware?) can't boot a zImage.
         return 'vmlinux'

+class Arch_riscv64(Arch):
+    def __init__(self, name):
+        Arch.__init__(self, name)
+
+        self.defconfig_target = 'riscv64_defconfig'
+        self.qemuname = 'riscv64'
+        self.linuxname = 'riscv'
+        self.gccname = 'riscv64'
+
+    def qemuargs(self, is_native):
+        ret = Arch.qemuargs(is_native)
+
+        ret.extend(['-machine', 'virt'])
+        ret.extend(['-bios', 'default'])
+
+        return ret
+
+    @staticmethod
+    def serial_console_args():
+        return ['console=ttyS0']
+
+    def kimg_path(self):
+        return 'arch/riscv/boot/Image'
+
 class Arch_sparc64(Arch):
     def __init__(self, name):
         Arch.__init__(self, name)
@@ -264,6 +288,7 @@ ARCHES = {
     'arm': Arch_arm,
     'aarch64': Arch_aarch64,
     'ppc64': Arch_ppc64,
+    'riscv64': Arch_riscv64,
     'sparc64': Arch_sparc64,
     's390x': Arch_s390x,
 }
Kees Cook Sept. 27, 2019, 5:20 p.m. UTC | #19
On Wed, Aug 28, 2019 at 6:30 PM Paul Walmsley <paul.walmsley@sifive.com> wrote:
> On Mon, 26 Aug 2019, Kees Cook wrote:
>
> > On Mon, Aug 26, 2019 at 09:39:50AM -0700, David Abdurachmanov wrote:
> > > I don't have the a build with SECCOMP for the board right now, so it
> > > will have to wait. I just finished a new kernel (almost rc6) for Fedora,
> >
> > FWIW, I don't think this should block landing the code: all the tests
> > fail without seccomp support. ;) So this patch is an improvement!
>
> Am sympathetic to this -- we did it with the hugetlb patches for RISC-V --
> but it would be good to understand a little bit more about why the test
> fails before we merge it.

The test is almost certainly failing due to the environmental
requirements (i.e. namespaces, user ids, etc). There are some corner
cases in there that we've had to fix in the past. If the other tests
are passing, then I would expect all the seccomp internals are fine --
it's just the case being weird. It's just a matter of figuring out
what state the test environment is in so we can cover that corner case
too.

> Once we merge the patch, it will probably reduce the motivation for others
> to either understand and fix the underlying problem with the RISC-V code
> -- or, if it truly is a flaky test, to drop (or fix) the test in the
> seccomp_bpf kselftests.

Sure, I get that point -- but I don't want to block seccomp landing
for riscv for that. I suggested to David offlist that the test could
just be marked with a FIXME XFAIL on riscv and once someone's in a
better position to reproduce it we can fix it. (I think the test bug
is almost certainly not riscv specific, but just some missing
requirement that we aren't handling correctly.)

How does that sound?

-Kees
Andy Lutomirski Sept. 27, 2019, 8:58 p.m. UTC | #20
On Tue, Sep 3, 2019 at 3:27 PM Palmer Dabbelt <palmer@sifive.com> wrote:
>
> On Wed, 28 Aug 2019 10:52:05 PDT (-0700), luto@amacapital.net wrote:
> >
> >
> >> On Aug 25, 2019, at 2:59 PM, Kees Cook <keescook@chromium.org> wrote:
> >>
> >>> On Thu, Aug 22, 2019 at 01:55:22PM -0700, David Abdurachmanov wrote:
> >>> This patch was extensively tested on Fedora/RISCV (applied by default on
> >>> top of 5.2-rc7 kernel for <2 months). The patch was also tested with 5.3-rc
> >>> on QEMU and SiFive Unleashed board.
> >>
> >> Oops, I see the mention of QEMU here. Where's the best place to find
> >> instructions on creating a qemu riscv image/environment?
> >
> > I don’t suppose one of you riscv folks would like to contribute riscv support to virtme?  virtme-run —arch=riscv would be quite nice, and the total patch should be just a couple lines.  Unfortunately, it helps a lot to understand the subtleties of booting the architecture to write those couple lines :)
>
> What mailing list should I sent this to?  You need to use the "virtme" branch
> of kernel.org/palmer/linux.git until I send the defconfig patches.
>
> commit a8bd7b318691891991caea298f9a5ed0f815c322
> gpg: Signature made Tue 03 Sep 2019 03:22:45 PM PDT
> gpg:                using RSA key 00CE76D1834960DFCE886DF8EF4CA1502CCBAB41
> gpg:                issuer "palmer@dabbelt.com"
> gpg: Good signature from "Palmer Dabbelt <palmer@dabbelt.com>" [ultimate]
> gpg:                 aka "Palmer Dabbelt <palmer@sifive.com>" [ultimate]
> Author: Palmer Dabbelt <palmer@sifive.com>
> Date:   Tue Sep 3 14:39:39 2019 -0700
>
>     Add RISC-V support

Could you rebase onto virtme master and resend in some format that
isn't corrupt?  git am really doesn't like your patch and, even if I
fix it up manually, your gpg: lines are bogus.  You could also send a
PR at https://github.com/amluto/virtme
Paul Walmsley Oct. 5, 2019, 1:20 a.m. UTC | #21
Hello Shuah,

On Thu, 22 Aug 2019, David Abdurachmanov wrote:

> This patch was extensively tested on Fedora/RISCV (applied by default on
> top of 5.2-rc7 kernel for <2 months). The patch was also tested with 5.3-rc
> on QEMU and SiFive Unleashed board.
> 
> libseccomp (userspace) was rebased:
> https://github.com/seccomp/libseccomp/pull/134
> 
> Fully passes libseccomp regression testing (simulation and live).
> 
> There is one failing kernel selftest: global.user_notification_signal
> 
> v1 -> v2:
>   - return immediatly if secure_computing(NULL) returns -1
>   - fixed whitespace issues
>   - add missing seccomp.h
>   - remove patch #2 (solved now)
>   - add riscv to seccomp kernel selftest
> 
> Cc: keescook@chromium.org
> Cc: me@carlosedp.com
> 
> Signed-off-by: David Abdurachmanov <david.abdurachmanov@sifive.com>

We'd like to merge this patch through the RISC-V tree.
Care to ack the change to tools/testing/selftests/seccomp/seccomp_bpf.c ?  

Kees has already reviewed it:

https://lore.kernel.org/linux-riscv/CAJr-aD=UnCN9E_mdVJ2H5nt=6juRSWikZnA5HxDLQxXLbsRz-w@mail.gmail.com/


- Paul


> ---
>  arch/riscv/Kconfig                            | 14 ++++++++++
>  arch/riscv/include/asm/seccomp.h              | 10 +++++++
>  arch/riscv/include/asm/thread_info.h          |  5 +++-
>  arch/riscv/kernel/entry.S                     | 27 +++++++++++++++++--
>  arch/riscv/kernel/ptrace.c                    | 10 +++++++
>  tools/testing/selftests/seccomp/seccomp_bpf.c |  8 +++++-
>  6 files changed, 70 insertions(+), 4 deletions(-)
>  create mode 100644 arch/riscv/include/asm/seccomp.h
> 
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index 59a4727ecd6c..441e63ff5adc 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -31,6 +31,7 @@ config RISCV
>  	select GENERIC_SMP_IDLE_THREAD
>  	select GENERIC_ATOMIC64 if !64BIT
>  	select HAVE_ARCH_AUDITSYSCALL
> +	select HAVE_ARCH_SECCOMP_FILTER
>  	select HAVE_MEMBLOCK_NODE_MAP
>  	select HAVE_DMA_CONTIGUOUS
>  	select HAVE_FUTEX_CMPXCHG if FUTEX
> @@ -235,6 +236,19 @@ menu "Kernel features"
>  
>  source "kernel/Kconfig.hz"
>  
> +config SECCOMP
> +	bool "Enable seccomp to safely compute untrusted bytecode"
> +	help
> +	  This kernel feature is useful for number crunching applications
> +	  that may need to compute untrusted bytecode during their
> +	  execution. By using pipes or other transports made available to
> +	  the process as file descriptors supporting the read/write
> +	  syscalls, it's possible to isolate those applications in
> +	  their own address space using seccomp. Once seccomp is
> +	  enabled via prctl(PR_SET_SECCOMP), it cannot be disabled
> +	  and the task is only allowed to execute a few safe syscalls
> +	  defined by each seccomp mode.
> +
>  endmenu
>  
>  menu "Boot options"
> diff --git a/arch/riscv/include/asm/seccomp.h b/arch/riscv/include/asm/seccomp.h
> new file mode 100644
> index 000000000000..bf7744ee3b3d
> --- /dev/null
> +++ b/arch/riscv/include/asm/seccomp.h
> @@ -0,0 +1,10 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#ifndef _ASM_SECCOMP_H
> +#define _ASM_SECCOMP_H
> +
> +#include <asm/unistd.h>
> +
> +#include <asm-generic/seccomp.h>
> +
> +#endif /* _ASM_SECCOMP_H */
> diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h
> index 905372d7eeb8..a0b2a29a0da1 100644
> --- a/arch/riscv/include/asm/thread_info.h
> +++ b/arch/riscv/include/asm/thread_info.h
> @@ -75,6 +75,7 @@ struct thread_info {
>  #define TIF_MEMDIE		5	/* is terminating due to OOM killer */
>  #define TIF_SYSCALL_TRACEPOINT  6       /* syscall tracepoint instrumentation */
>  #define TIF_SYSCALL_AUDIT	7	/* syscall auditing */
> +#define TIF_SECCOMP		8	/* syscall secure computing */
>  
>  #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
>  #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
> @@ -82,11 +83,13 @@ struct thread_info {
>  #define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
>  #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
>  #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
> +#define _TIF_SECCOMP		(1 << TIF_SECCOMP)
>  
>  #define _TIF_WORK_MASK \
>  	(_TIF_NOTIFY_RESUME | _TIF_SIGPENDING | _TIF_NEED_RESCHED)
>  
>  #define _TIF_SYSCALL_WORK \
> -	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT)
> +	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT | \
> +	 _TIF_SECCOMP )
>  
>  #endif /* _ASM_RISCV_THREAD_INFO_H */
> diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
> index bc7a56e1ca6f..0bbedfa3e47d 100644
> --- a/arch/riscv/kernel/entry.S
> +++ b/arch/riscv/kernel/entry.S
> @@ -203,8 +203,25 @@ check_syscall_nr:
>  	/* Check to make sure we don't jump to a bogus syscall number. */
>  	li t0, __NR_syscalls
>  	la s0, sys_ni_syscall
> -	/* Syscall number held in a7 */
> -	bgeu a7, t0, 1f
> +	/*
> +	 * The tracer can change syscall number to valid/invalid value.
> +	 * We use syscall_set_nr helper in syscall_trace_enter thus we
> +	 * cannot trust the current value in a7 and have to reload from
> +	 * the current task pt_regs.
> +	 */
> +	REG_L a7, PT_A7(sp)
> +	/*
> +	 * Syscall number held in a7.
> +	 * If syscall number is above allowed value, redirect to ni_syscall.
> +	 */
> +	bge a7, t0, 1f
> +	/*
> +	 * Check if syscall is rejected by tracer or seccomp, i.e., a7 == -1.
> +	 * If yes, we pretend it was executed.
> +	 */
> +	li t1, -1
> +	beq a7, t1, ret_from_syscall_rejected
> +	/* Call syscall */
>  	la s0, sys_call_table
>  	slli t0, a7, RISCV_LGPTR
>  	add s0, s0, t0
> @@ -215,6 +232,12 @@ check_syscall_nr:
>  ret_from_syscall:
>  	/* Set user a0 to kernel a0 */
>  	REG_S a0, PT_A0(sp)
> +	/*
> +	 * We didn't execute the actual syscall.
> +	 * Seccomp already set return value for the current task pt_regs.
> +	 * (If it was configured with SECCOMP_RET_ERRNO/TRACE)
> +	 */
> +ret_from_syscall_rejected:
>  	/* Trace syscalls, but only if requested by the user. */
>  	REG_L t0, TASK_TI_FLAGS(tp)
>  	andi t0, t0, _TIF_SYSCALL_WORK
> diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c
> index 368751438366..63e47c9f85f0 100644
> --- a/arch/riscv/kernel/ptrace.c
> +++ b/arch/riscv/kernel/ptrace.c
> @@ -154,6 +154,16 @@ void do_syscall_trace_enter(struct pt_regs *regs)
>  		if (tracehook_report_syscall_entry(regs))
>  			syscall_set_nr(current, regs, -1);
>  
> +	/*
> +	 * Do the secure computing after ptrace; failures should be fast.
> +	 * If this fails we might have return value in a0 from seccomp
> +	 * (via SECCOMP_RET_ERRNO/TRACE).
> +	 */
> +	if (secure_computing(NULL) == -1) {
> +		syscall_set_nr(current, regs, -1);
> +		return;
> +	}
> +
>  #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
>  	if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
>  		trace_sys_enter(regs, syscall_get_nr(current, regs));
> diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
> index 6ef7f16c4cf5..492e0adad9d3 100644
> --- a/tools/testing/selftests/seccomp/seccomp_bpf.c
> +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
> @@ -112,6 +112,8 @@ struct seccomp_data {
>  #  define __NR_seccomp 383
>  # elif defined(__aarch64__)
>  #  define __NR_seccomp 277
> +# elif defined(__riscv)
> +#  define __NR_seccomp 277
>  # elif defined(__hppa__)
>  #  define __NR_seccomp 338
>  # elif defined(__powerpc__)
> @@ -1582,6 +1584,10 @@ TEST_F(TRACE_poke, getpid_runs_normally)
>  # define ARCH_REGS	struct user_pt_regs
>  # define SYSCALL_NUM	regs[8]
>  # define SYSCALL_RET	regs[0]
> +#elif defined(__riscv) && __riscv_xlen == 64
> +# define ARCH_REGS	struct user_regs_struct
> +# define SYSCALL_NUM	a7
> +# define SYSCALL_RET	a0
>  #elif defined(__hppa__)
>  # define ARCH_REGS	struct user_regs_struct
>  # define SYSCALL_NUM	gr[20]
> @@ -1671,7 +1677,7 @@ void change_syscall(struct __test_metadata *_metadata,
>  	EXPECT_EQ(0, ret) {}
>  
>  #if defined(__x86_64__) || defined(__i386__) || defined(__powerpc__) || \
> -    defined(__s390__) || defined(__hppa__)
> +    defined(__s390__) || defined(__hppa__) || defined(__riscv)
>  	{
>  		regs.SYSCALL_NUM = syscall;
>  	}
> -- 
> 2.21.0
> 
> 


- Paul
Paul Walmsley Oct. 5, 2019, 1:24 a.m. UTC | #22
On Fri, 27 Sep 2019, Kees Cook wrote:

> On Wed, Aug 28, 2019 at 6:30 PM Paul Walmsley <paul.walmsley@sifive.com> wrote:
> > On Mon, 26 Aug 2019, Kees Cook wrote:
> >
> > > On Mon, Aug 26, 2019 at 09:39:50AM -0700, David Abdurachmanov wrote:
> > > > I don't have the a build with SECCOMP for the board right now, so it
> > > > will have to wait. I just finished a new kernel (almost rc6) for Fedora,
> > >
> > > FWIW, I don't think this should block landing the code: all the tests
> > > fail without seccomp support. ;) So this patch is an improvement!
> >
> > Am sympathetic to this -- we did it with the hugetlb patches for RISC-V --
> > but it would be good to understand a little bit more about why the test
> > fails before we merge it.
> 
> The test is almost certainly failing due to the environmental
> requirements (i.e. namespaces, user ids, etc). There are some corner
> cases in there that we've had to fix in the past. If the other tests
> are passing, then I would expect all the seccomp internals are fine --
> it's just the case being weird. It's just a matter of figuring out
> what state the test environment is in so we can cover that corner case
> too.
> 
> > Once we merge the patch, it will probably reduce the motivation for others
> > to either understand and fix the underlying problem with the RISC-V code
> > -- or, if it truly is a flaky test, to drop (or fix) the test in the
> > seccomp_bpf kselftests.
> 
> Sure, I get that point -- but I don't want to block seccomp landing
> for riscv for that. I suggested to David offlist that the test could
> just be marked with a FIXME XFAIL on riscv and once someone's in a
> better position to reproduce it we can fix it. (I think the test bug
> is almost certainly not riscv specific, but just some missing
> requirement that we aren't handling correctly.)

OK.  It might be nice to mark the seccomp_bpf.c test as flaky in the 
comments for the test.

> How does that sound?

Let's follow your plan.  Thanks for your review and feedback.


- Paul
Andy Lutomirski Oct. 6, 2019, 7:18 p.m. UTC | #23
On Fri, Sep 27, 2019 at 1:58 PM Andy Lutomirski <luto@kernel.org> wrote:
>
> On Tue, Sep 3, 2019 at 3:27 PM Palmer Dabbelt <palmer@sifive.com> wrote:
> >
> > On Wed, 28 Aug 2019 10:52:05 PDT (-0700), luto@amacapital.net wrote:
> > >
> > >
> > >> On Aug 25, 2019, at 2:59 PM, Kees Cook <keescook@chromium.org> wrote:
> > >>
> > >>> On Thu, Aug 22, 2019 at 01:55:22PM -0700, David Abdurachmanov wrote:
> > >>> This patch was extensively tested on Fedora/RISCV (applied by default on
> > >>> top of 5.2-rc7 kernel for <2 months). The patch was also tested with 5.3-rc
> > >>> on QEMU and SiFive Unleashed board.
> > >>
> > >> Oops, I see the mention of QEMU here. Where's the best place to find
> > >> instructions on creating a qemu riscv image/environment?
> > >
> > > I don’t suppose one of you riscv folks would like to contribute riscv support to virtme?  virtme-run —arch=riscv would be quite nice, and the total patch should be just a couple lines.  Unfortunately, it helps a lot to understand the subtleties of booting the architecture to write those couple lines :)
> >

FYI, it works now:

$ virtme-configkernel --arch=riscv --defconfig
  GEN     Makefile
[...]
Configured.  Build with 'make ARCH=riscv CROSS_COMPILE=riscv64-linux-gnu- -j4'

$ make ARCH=riscv CROSS_COMPILE=riscv64-linux-gnu- -j4
[...]

$ virtme-run --kdir=. --arch=riscv64 --mods=auto --root [path to a
riscv filesystem]

This is with virtme master and a qemu-system-riscv64 from qemu git on
my path.  It does *not* work with Fedora 30's qemu.

So now you can all jump on the virtme bandwagon and have an easy way
to test riscv kernels. :)  Although, if you want to run kernel
selftests, you may find the process of actually running them to be
more fun if you use --rodir or --rwdir to map the kernel selftests
directory into the guest.
Paul Walmsley Oct. 14, 2019, 9:06 p.m. UTC | #24
Shuah,

Could you please take a quick look at this and ack it if you're OK with 
the tools/testing change?  We'd like to get this merged soon.

- Paul


On Fri, 4 Oct 2019, Paul Walmsley wrote:

> Hello Shuah,
> 
> On Thu, 22 Aug 2019, David Abdurachmanov wrote:
> 
> > This patch was extensively tested on Fedora/RISCV (applied by default on
> > top of 5.2-rc7 kernel for <2 months). The patch was also tested with 5.3-rc
> > on QEMU and SiFive Unleashed board.
> > 
> > libseccomp (userspace) was rebased:
> > https://github.com/seccomp/libseccomp/pull/134
> > 
> > Fully passes libseccomp regression testing (simulation and live).
> > 
> > There is one failing kernel selftest: global.user_notification_signal
> > 
> > v1 -> v2:
> >   - return immediatly if secure_computing(NULL) returns -1
> >   - fixed whitespace issues
> >   - add missing seccomp.h
> >   - remove patch #2 (solved now)
> >   - add riscv to seccomp kernel selftest
> > 
> > Cc: keescook@chromium.org
> > Cc: me@carlosedp.com
> > 
> > Signed-off-by: David Abdurachmanov <david.abdurachmanov@sifive.com>
> 
> We'd like to merge this patch through the RISC-V tree.
> Care to ack the change to tools/testing/selftests/seccomp/seccomp_bpf.c ?  
> 
> Kees has already reviewed it:
> 
> https://lore.kernel.org/linux-riscv/CAJr-aD=UnCN9E_mdVJ2H5nt=6juRSWikZnA5HxDLQxXLbsRz-w@mail.gmail.com/
> 
> 
> - Paul
> 
> 
> > ---
> >  arch/riscv/Kconfig                            | 14 ++++++++++
> >  arch/riscv/include/asm/seccomp.h              | 10 +++++++
> >  arch/riscv/include/asm/thread_info.h          |  5 +++-
> >  arch/riscv/kernel/entry.S                     | 27 +++++++++++++++++--
> >  arch/riscv/kernel/ptrace.c                    | 10 +++++++
> >  tools/testing/selftests/seccomp/seccomp_bpf.c |  8 +++++-
> >  6 files changed, 70 insertions(+), 4 deletions(-)
> >  create mode 100644 arch/riscv/include/asm/seccomp.h
> > 
> > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > index 59a4727ecd6c..441e63ff5adc 100644
> > --- a/arch/riscv/Kconfig
> > +++ b/arch/riscv/Kconfig
> > @@ -31,6 +31,7 @@ config RISCV
> >  	select GENERIC_SMP_IDLE_THREAD
> >  	select GENERIC_ATOMIC64 if !64BIT
> >  	select HAVE_ARCH_AUDITSYSCALL
> > +	select HAVE_ARCH_SECCOMP_FILTER
> >  	select HAVE_MEMBLOCK_NODE_MAP
> >  	select HAVE_DMA_CONTIGUOUS
> >  	select HAVE_FUTEX_CMPXCHG if FUTEX
> > @@ -235,6 +236,19 @@ menu "Kernel features"
> >  
> >  source "kernel/Kconfig.hz"
> >  
> > +config SECCOMP
> > +	bool "Enable seccomp to safely compute untrusted bytecode"
> > +	help
> > +	  This kernel feature is useful for number crunching applications
> > +	  that may need to compute untrusted bytecode during their
> > +	  execution. By using pipes or other transports made available to
> > +	  the process as file descriptors supporting the read/write
> > +	  syscalls, it's possible to isolate those applications in
> > +	  their own address space using seccomp. Once seccomp is
> > +	  enabled via prctl(PR_SET_SECCOMP), it cannot be disabled
> > +	  and the task is only allowed to execute a few safe syscalls
> > +	  defined by each seccomp mode.
> > +
> >  endmenu
> >  
> >  menu "Boot options"
> > diff --git a/arch/riscv/include/asm/seccomp.h b/arch/riscv/include/asm/seccomp.h
> > new file mode 100644
> > index 000000000000..bf7744ee3b3d
> > --- /dev/null
> > +++ b/arch/riscv/include/asm/seccomp.h
> > @@ -0,0 +1,10 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +
> > +#ifndef _ASM_SECCOMP_H
> > +#define _ASM_SECCOMP_H
> > +
> > +#include <asm/unistd.h>
> > +
> > +#include <asm-generic/seccomp.h>
> > +
> > +#endif /* _ASM_SECCOMP_H */
> > diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h
> > index 905372d7eeb8..a0b2a29a0da1 100644
> > --- a/arch/riscv/include/asm/thread_info.h
> > +++ b/arch/riscv/include/asm/thread_info.h
> > @@ -75,6 +75,7 @@ struct thread_info {
> >  #define TIF_MEMDIE		5	/* is terminating due to OOM killer */
> >  #define TIF_SYSCALL_TRACEPOINT  6       /* syscall tracepoint instrumentation */
> >  #define TIF_SYSCALL_AUDIT	7	/* syscall auditing */
> > +#define TIF_SECCOMP		8	/* syscall secure computing */
> >  
> >  #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
> >  #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
> > @@ -82,11 +83,13 @@ struct thread_info {
> >  #define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
> >  #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
> >  #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
> > +#define _TIF_SECCOMP		(1 << TIF_SECCOMP)
> >  
> >  #define _TIF_WORK_MASK \
> >  	(_TIF_NOTIFY_RESUME | _TIF_SIGPENDING | _TIF_NEED_RESCHED)
> >  
> >  #define _TIF_SYSCALL_WORK \
> > -	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT)
> > +	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT | \
> > +	 _TIF_SECCOMP )
> >  
> >  #endif /* _ASM_RISCV_THREAD_INFO_H */
> > diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
> > index bc7a56e1ca6f..0bbedfa3e47d 100644
> > --- a/arch/riscv/kernel/entry.S
> > +++ b/arch/riscv/kernel/entry.S
> > @@ -203,8 +203,25 @@ check_syscall_nr:
> >  	/* Check to make sure we don't jump to a bogus syscall number. */
> >  	li t0, __NR_syscalls
> >  	la s0, sys_ni_syscall
> > -	/* Syscall number held in a7 */
> > -	bgeu a7, t0, 1f
> > +	/*
> > +	 * The tracer can change syscall number to valid/invalid value.
> > +	 * We use syscall_set_nr helper in syscall_trace_enter thus we
> > +	 * cannot trust the current value in a7 and have to reload from
> > +	 * the current task pt_regs.
> > +	 */
> > +	REG_L a7, PT_A7(sp)
> > +	/*
> > +	 * Syscall number held in a7.
> > +	 * If syscall number is above allowed value, redirect to ni_syscall.
> > +	 */
> > +	bge a7, t0, 1f
> > +	/*
> > +	 * Check if syscall is rejected by tracer or seccomp, i.e., a7 == -1.
> > +	 * If yes, we pretend it was executed.
> > +	 */
> > +	li t1, -1
> > +	beq a7, t1, ret_from_syscall_rejected
> > +	/* Call syscall */
> >  	la s0, sys_call_table
> >  	slli t0, a7, RISCV_LGPTR
> >  	add s0, s0, t0
> > @@ -215,6 +232,12 @@ check_syscall_nr:
> >  ret_from_syscall:
> >  	/* Set user a0 to kernel a0 */
> >  	REG_S a0, PT_A0(sp)
> > +	/*
> > +	 * We didn't execute the actual syscall.
> > +	 * Seccomp already set return value for the current task pt_regs.
> > +	 * (If it was configured with SECCOMP_RET_ERRNO/TRACE)
> > +	 */
> > +ret_from_syscall_rejected:
> >  	/* Trace syscalls, but only if requested by the user. */
> >  	REG_L t0, TASK_TI_FLAGS(tp)
> >  	andi t0, t0, _TIF_SYSCALL_WORK
> > diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c
> > index 368751438366..63e47c9f85f0 100644
> > --- a/arch/riscv/kernel/ptrace.c
> > +++ b/arch/riscv/kernel/ptrace.c
> > @@ -154,6 +154,16 @@ void do_syscall_trace_enter(struct pt_regs *regs)
> >  		if (tracehook_report_syscall_entry(regs))
> >  			syscall_set_nr(current, regs, -1);
> >  
> > +	/*
> > +	 * Do the secure computing after ptrace; failures should be fast.
> > +	 * If this fails we might have return value in a0 from seccomp
> > +	 * (via SECCOMP_RET_ERRNO/TRACE).
> > +	 */
> > +	if (secure_computing(NULL) == -1) {
> > +		syscall_set_nr(current, regs, -1);
> > +		return;
> > +	}
> > +
> >  #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
> >  	if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
> >  		trace_sys_enter(regs, syscall_get_nr(current, regs));
> > diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
> > index 6ef7f16c4cf5..492e0adad9d3 100644
> > --- a/tools/testing/selftests/seccomp/seccomp_bpf.c
> > +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
> > @@ -112,6 +112,8 @@ struct seccomp_data {
> >  #  define __NR_seccomp 383
> >  # elif defined(__aarch64__)
> >  #  define __NR_seccomp 277
> > +# elif defined(__riscv)
> > +#  define __NR_seccomp 277
> >  # elif defined(__hppa__)
> >  #  define __NR_seccomp 338
> >  # elif defined(__powerpc__)
> > @@ -1582,6 +1584,10 @@ TEST_F(TRACE_poke, getpid_runs_normally)
> >  # define ARCH_REGS	struct user_pt_regs
> >  # define SYSCALL_NUM	regs[8]
> >  # define SYSCALL_RET	regs[0]
> > +#elif defined(__riscv) && __riscv_xlen == 64
> > +# define ARCH_REGS	struct user_regs_struct
> > +# define SYSCALL_NUM	a7
> > +# define SYSCALL_RET	a0
> >  #elif defined(__hppa__)
> >  # define ARCH_REGS	struct user_regs_struct
> >  # define SYSCALL_NUM	gr[20]
> > @@ -1671,7 +1677,7 @@ void change_syscall(struct __test_metadata *_metadata,
> >  	EXPECT_EQ(0, ret) {}
> >  
> >  #if defined(__x86_64__) || defined(__i386__) || defined(__powerpc__) || \
> > -    defined(__s390__) || defined(__hppa__)
> > +    defined(__s390__) || defined(__hppa__) || defined(__riscv)
> >  	{
> >  		regs.SYSCALL_NUM = syscall;
> >  	}
> > -- 
> > 2.21.0
> > 
> > 
> 
> 
> - Paul
>
Kees Cook Oct. 15, 2019, 4:27 p.m. UTC | #25
On Mon, Oct 14, 2019 at 02:06:07PM -0700, Paul Walmsley wrote:
> Shuah,
> 
> Could you please take a quick look at this and ack it if you're OK with 
> the tools/testing change?  We'd like to get this merged soon.

FWIW, I regularly carry these kinds of selftest changes via my seccomp
tree, so if Shuah is busy, I think it'll be fine to take this in
riscv. If not, I'll take responsibility of apologizing to Shuah! :) :)

-Kees

> 
> - Paul
> 
> 
> On Fri, 4 Oct 2019, Paul Walmsley wrote:
> 
> > Hello Shuah,
> > 
> > On Thu, 22 Aug 2019, David Abdurachmanov wrote:
> > 
> > > This patch was extensively tested on Fedora/RISCV (applied by default on
> > > top of 5.2-rc7 kernel for <2 months). The patch was also tested with 5.3-rc
> > > on QEMU and SiFive Unleashed board.
> > > 
> > > libseccomp (userspace) was rebased:
> > > https://github.com/seccomp/libseccomp/pull/134
> > > 
> > > Fully passes libseccomp regression testing (simulation and live).
> > > 
> > > There is one failing kernel selftest: global.user_notification_signal
> > > 
> > > v1 -> v2:
> > >   - return immediatly if secure_computing(NULL) returns -1
> > >   - fixed whitespace issues
> > >   - add missing seccomp.h
> > >   - remove patch #2 (solved now)
> > >   - add riscv to seccomp kernel selftest
> > > 
> > > Cc: keescook@chromium.org
> > > Cc: me@carlosedp.com
> > > 
> > > Signed-off-by: David Abdurachmanov <david.abdurachmanov@sifive.com>
> > 
> > We'd like to merge this patch through the RISC-V tree.
> > Care to ack the change to tools/testing/selftests/seccomp/seccomp_bpf.c ?  
> > 
> > Kees has already reviewed it:
> > 
> > https://lore.kernel.org/linux-riscv/CAJr-aD=UnCN9E_mdVJ2H5nt=6juRSWikZnA5HxDLQxXLbsRz-w@mail.gmail.com/
> > 
> > 
> > - Paul
> > 
> > 
> > > ---
> > >  arch/riscv/Kconfig                            | 14 ++++++++++
> > >  arch/riscv/include/asm/seccomp.h              | 10 +++++++
> > >  arch/riscv/include/asm/thread_info.h          |  5 +++-
> > >  arch/riscv/kernel/entry.S                     | 27 +++++++++++++++++--
> > >  arch/riscv/kernel/ptrace.c                    | 10 +++++++
> > >  tools/testing/selftests/seccomp/seccomp_bpf.c |  8 +++++-
> > >  6 files changed, 70 insertions(+), 4 deletions(-)
> > >  create mode 100644 arch/riscv/include/asm/seccomp.h
> > > 
> > > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > > index 59a4727ecd6c..441e63ff5adc 100644
> > > --- a/arch/riscv/Kconfig
> > > +++ b/arch/riscv/Kconfig
> > > @@ -31,6 +31,7 @@ config RISCV
> > >  	select GENERIC_SMP_IDLE_THREAD
> > >  	select GENERIC_ATOMIC64 if !64BIT
> > >  	select HAVE_ARCH_AUDITSYSCALL
> > > +	select HAVE_ARCH_SECCOMP_FILTER
> > >  	select HAVE_MEMBLOCK_NODE_MAP
> > >  	select HAVE_DMA_CONTIGUOUS
> > >  	select HAVE_FUTEX_CMPXCHG if FUTEX
> > > @@ -235,6 +236,19 @@ menu "Kernel features"
> > >  
> > >  source "kernel/Kconfig.hz"
> > >  
> > > +config SECCOMP
> > > +	bool "Enable seccomp to safely compute untrusted bytecode"
> > > +	help
> > > +	  This kernel feature is useful for number crunching applications
> > > +	  that may need to compute untrusted bytecode during their
> > > +	  execution. By using pipes or other transports made available to
> > > +	  the process as file descriptors supporting the read/write
> > > +	  syscalls, it's possible to isolate those applications in
> > > +	  their own address space using seccomp. Once seccomp is
> > > +	  enabled via prctl(PR_SET_SECCOMP), it cannot be disabled
> > > +	  and the task is only allowed to execute a few safe syscalls
> > > +	  defined by each seccomp mode.
> > > +
> > >  endmenu
> > >  
> > >  menu "Boot options"
> > > diff --git a/arch/riscv/include/asm/seccomp.h b/arch/riscv/include/asm/seccomp.h
> > > new file mode 100644
> > > index 000000000000..bf7744ee3b3d
> > > --- /dev/null
> > > +++ b/arch/riscv/include/asm/seccomp.h
> > > @@ -0,0 +1,10 @@
> > > +/* SPDX-License-Identifier: GPL-2.0 */
> > > +
> > > +#ifndef _ASM_SECCOMP_H
> > > +#define _ASM_SECCOMP_H
> > > +
> > > +#include <asm/unistd.h>
> > > +
> > > +#include <asm-generic/seccomp.h>
> > > +
> > > +#endif /* _ASM_SECCOMP_H */
> > > diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h
> > > index 905372d7eeb8..a0b2a29a0da1 100644
> > > --- a/arch/riscv/include/asm/thread_info.h
> > > +++ b/arch/riscv/include/asm/thread_info.h
> > > @@ -75,6 +75,7 @@ struct thread_info {
> > >  #define TIF_MEMDIE		5	/* is terminating due to OOM killer */
> > >  #define TIF_SYSCALL_TRACEPOINT  6       /* syscall tracepoint instrumentation */
> > >  #define TIF_SYSCALL_AUDIT	7	/* syscall auditing */
> > > +#define TIF_SECCOMP		8	/* syscall secure computing */
> > >  
> > >  #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
> > >  #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
> > > @@ -82,11 +83,13 @@ struct thread_info {
> > >  #define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
> > >  #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
> > >  #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
> > > +#define _TIF_SECCOMP		(1 << TIF_SECCOMP)
> > >  
> > >  #define _TIF_WORK_MASK \
> > >  	(_TIF_NOTIFY_RESUME | _TIF_SIGPENDING | _TIF_NEED_RESCHED)
> > >  
> > >  #define _TIF_SYSCALL_WORK \
> > > -	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT)
> > > +	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT | \
> > > +	 _TIF_SECCOMP )
> > >  
> > >  #endif /* _ASM_RISCV_THREAD_INFO_H */
> > > diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
> > > index bc7a56e1ca6f..0bbedfa3e47d 100644
> > > --- a/arch/riscv/kernel/entry.S
> > > +++ b/arch/riscv/kernel/entry.S
> > > @@ -203,8 +203,25 @@ check_syscall_nr:
> > >  	/* Check to make sure we don't jump to a bogus syscall number. */
> > >  	li t0, __NR_syscalls
> > >  	la s0, sys_ni_syscall
> > > -	/* Syscall number held in a7 */
> > > -	bgeu a7, t0, 1f
> > > +	/*
> > > +	 * The tracer can change syscall number to valid/invalid value.
> > > +	 * We use syscall_set_nr helper in syscall_trace_enter thus we
> > > +	 * cannot trust the current value in a7 and have to reload from
> > > +	 * the current task pt_regs.
> > > +	 */
> > > +	REG_L a7, PT_A7(sp)
> > > +	/*
> > > +	 * Syscall number held in a7.
> > > +	 * If syscall number is above allowed value, redirect to ni_syscall.
> > > +	 */
> > > +	bge a7, t0, 1f
> > > +	/*
> > > +	 * Check if syscall is rejected by tracer or seccomp, i.e., a7 == -1.
> > > +	 * If yes, we pretend it was executed.
> > > +	 */
> > > +	li t1, -1
> > > +	beq a7, t1, ret_from_syscall_rejected
> > > +	/* Call syscall */
> > >  	la s0, sys_call_table
> > >  	slli t0, a7, RISCV_LGPTR
> > >  	add s0, s0, t0
> > > @@ -215,6 +232,12 @@ check_syscall_nr:
> > >  ret_from_syscall:
> > >  	/* Set user a0 to kernel a0 */
> > >  	REG_S a0, PT_A0(sp)
> > > +	/*
> > > +	 * We didn't execute the actual syscall.
> > > +	 * Seccomp already set return value for the current task pt_regs.
> > > +	 * (If it was configured with SECCOMP_RET_ERRNO/TRACE)
> > > +	 */
> > > +ret_from_syscall_rejected:
> > >  	/* Trace syscalls, but only if requested by the user. */
> > >  	REG_L t0, TASK_TI_FLAGS(tp)
> > >  	andi t0, t0, _TIF_SYSCALL_WORK
> > > diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c
> > > index 368751438366..63e47c9f85f0 100644
> > > --- a/arch/riscv/kernel/ptrace.c
> > > +++ b/arch/riscv/kernel/ptrace.c
> > > @@ -154,6 +154,16 @@ void do_syscall_trace_enter(struct pt_regs *regs)
> > >  		if (tracehook_report_syscall_entry(regs))
> > >  			syscall_set_nr(current, regs, -1);
> > >  
> > > +	/*
> > > +	 * Do the secure computing after ptrace; failures should be fast.
> > > +	 * If this fails we might have return value in a0 from seccomp
> > > +	 * (via SECCOMP_RET_ERRNO/TRACE).
> > > +	 */
> > > +	if (secure_computing(NULL) == -1) {
> > > +		syscall_set_nr(current, regs, -1);
> > > +		return;
> > > +	}
> > > +
> > >  #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
> > >  	if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
> > >  		trace_sys_enter(regs, syscall_get_nr(current, regs));
> > > diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
> > > index 6ef7f16c4cf5..492e0adad9d3 100644
> > > --- a/tools/testing/selftests/seccomp/seccomp_bpf.c
> > > +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
> > > @@ -112,6 +112,8 @@ struct seccomp_data {
> > >  #  define __NR_seccomp 383
> > >  # elif defined(__aarch64__)
> > >  #  define __NR_seccomp 277
> > > +# elif defined(__riscv)
> > > +#  define __NR_seccomp 277
> > >  # elif defined(__hppa__)
> > >  #  define __NR_seccomp 338
> > >  # elif defined(__powerpc__)
> > > @@ -1582,6 +1584,10 @@ TEST_F(TRACE_poke, getpid_runs_normally)
> > >  # define ARCH_REGS	struct user_pt_regs
> > >  # define SYSCALL_NUM	regs[8]
> > >  # define SYSCALL_RET	regs[0]
> > > +#elif defined(__riscv) && __riscv_xlen == 64
> > > +# define ARCH_REGS	struct user_regs_struct
> > > +# define SYSCALL_NUM	a7
> > > +# define SYSCALL_RET	a0
> > >  #elif defined(__hppa__)
> > >  # define ARCH_REGS	struct user_regs_struct
> > >  # define SYSCALL_NUM	gr[20]
> > > @@ -1671,7 +1677,7 @@ void change_syscall(struct __test_metadata *_metadata,
> > >  	EXPECT_EQ(0, ret) {}
> > >  
> > >  #if defined(__x86_64__) || defined(__i386__) || defined(__powerpc__) || \
> > > -    defined(__s390__) || defined(__hppa__)
> > > +    defined(__s390__) || defined(__hppa__) || defined(__riscv)
> > >  	{
> > >  		regs.SYSCALL_NUM = syscall;
> > >  	}
> > > -- 
> > > 2.21.0
> > > 
> > > 
> > 
> > 
> > - Paul
> > 
> 
>
diff mbox series

Patch

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 59a4727ecd6c..441e63ff5adc 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -31,6 +31,7 @@  config RISCV
 	select GENERIC_SMP_IDLE_THREAD
 	select GENERIC_ATOMIC64 if !64BIT
 	select HAVE_ARCH_AUDITSYSCALL
+	select HAVE_ARCH_SECCOMP_FILTER
 	select HAVE_MEMBLOCK_NODE_MAP
 	select HAVE_DMA_CONTIGUOUS
 	select HAVE_FUTEX_CMPXCHG if FUTEX
@@ -235,6 +236,19 @@  menu "Kernel features"
 
 source "kernel/Kconfig.hz"
 
+config SECCOMP
+	bool "Enable seccomp to safely compute untrusted bytecode"
+	help
+	  This kernel feature is useful for number crunching applications
+	  that may need to compute untrusted bytecode during their
+	  execution. By using pipes or other transports made available to
+	  the process as file descriptors supporting the read/write
+	  syscalls, it's possible to isolate those applications in
+	  their own address space using seccomp. Once seccomp is
+	  enabled via prctl(PR_SET_SECCOMP), it cannot be disabled
+	  and the task is only allowed to execute a few safe syscalls
+	  defined by each seccomp mode.
+
 endmenu
 
 menu "Boot options"
diff --git a/arch/riscv/include/asm/seccomp.h b/arch/riscv/include/asm/seccomp.h
new file mode 100644
index 000000000000..bf7744ee3b3d
--- /dev/null
+++ b/arch/riscv/include/asm/seccomp.h
@@ -0,0 +1,10 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _ASM_SECCOMP_H
+#define _ASM_SECCOMP_H
+
+#include <asm/unistd.h>
+
+#include <asm-generic/seccomp.h>
+
+#endif /* _ASM_SECCOMP_H */
diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h
index 905372d7eeb8..a0b2a29a0da1 100644
--- a/arch/riscv/include/asm/thread_info.h
+++ b/arch/riscv/include/asm/thread_info.h
@@ -75,6 +75,7 @@  struct thread_info {
 #define TIF_MEMDIE		5	/* is terminating due to OOM killer */
 #define TIF_SYSCALL_TRACEPOINT  6       /* syscall tracepoint instrumentation */
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing */
+#define TIF_SECCOMP		8	/* syscall secure computing */
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
@@ -82,11 +83,13 @@  struct thread_info {
 #define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
 #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
+#define _TIF_SECCOMP		(1 << TIF_SECCOMP)
 
 #define _TIF_WORK_MASK \
 	(_TIF_NOTIFY_RESUME | _TIF_SIGPENDING | _TIF_NEED_RESCHED)
 
 #define _TIF_SYSCALL_WORK \
-	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT)
+	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT | \
+	 _TIF_SECCOMP )
 
 #endif /* _ASM_RISCV_THREAD_INFO_H */
diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
index bc7a56e1ca6f..0bbedfa3e47d 100644
--- a/arch/riscv/kernel/entry.S
+++ b/arch/riscv/kernel/entry.S
@@ -203,8 +203,25 @@  check_syscall_nr:
 	/* Check to make sure we don't jump to a bogus syscall number. */
 	li t0, __NR_syscalls
 	la s0, sys_ni_syscall
-	/* Syscall number held in a7 */
-	bgeu a7, t0, 1f
+	/*
+	 * The tracer can change syscall number to valid/invalid value.
+	 * We use syscall_set_nr helper in syscall_trace_enter thus we
+	 * cannot trust the current value in a7 and have to reload from
+	 * the current task pt_regs.
+	 */
+	REG_L a7, PT_A7(sp)
+	/*
+	 * Syscall number held in a7.
+	 * If syscall number is above allowed value, redirect to ni_syscall.
+	 */
+	bge a7, t0, 1f
+	/*
+	 * Check if syscall is rejected by tracer or seccomp, i.e., a7 == -1.
+	 * If yes, we pretend it was executed.
+	 */
+	li t1, -1
+	beq a7, t1, ret_from_syscall_rejected
+	/* Call syscall */
 	la s0, sys_call_table
 	slli t0, a7, RISCV_LGPTR
 	add s0, s0, t0
@@ -215,6 +232,12 @@  check_syscall_nr:
 ret_from_syscall:
 	/* Set user a0 to kernel a0 */
 	REG_S a0, PT_A0(sp)
+	/*
+	 * We didn't execute the actual syscall.
+	 * Seccomp already set return value for the current task pt_regs.
+	 * (If it was configured with SECCOMP_RET_ERRNO/TRACE)
+	 */
+ret_from_syscall_rejected:
 	/* Trace syscalls, but only if requested by the user. */
 	REG_L t0, TASK_TI_FLAGS(tp)
 	andi t0, t0, _TIF_SYSCALL_WORK
diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c
index 368751438366..63e47c9f85f0 100644
--- a/arch/riscv/kernel/ptrace.c
+++ b/arch/riscv/kernel/ptrace.c
@@ -154,6 +154,16 @@  void do_syscall_trace_enter(struct pt_regs *regs)
 		if (tracehook_report_syscall_entry(regs))
 			syscall_set_nr(current, regs, -1);
 
+	/*
+	 * Do the secure computing after ptrace; failures should be fast.
+	 * If this fails we might have return value in a0 from seccomp
+	 * (via SECCOMP_RET_ERRNO/TRACE).
+	 */
+	if (secure_computing(NULL) == -1) {
+		syscall_set_nr(current, regs, -1);
+		return;
+	}
+
 #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
 	if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
 		trace_sys_enter(regs, syscall_get_nr(current, regs));
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index 6ef7f16c4cf5..492e0adad9d3 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -112,6 +112,8 @@  struct seccomp_data {
 #  define __NR_seccomp 383
 # elif defined(__aarch64__)
 #  define __NR_seccomp 277
+# elif defined(__riscv)
+#  define __NR_seccomp 277
 # elif defined(__hppa__)
 #  define __NR_seccomp 338
 # elif defined(__powerpc__)
@@ -1582,6 +1584,10 @@  TEST_F(TRACE_poke, getpid_runs_normally)
 # define ARCH_REGS	struct user_pt_regs
 # define SYSCALL_NUM	regs[8]
 # define SYSCALL_RET	regs[0]
+#elif defined(__riscv) && __riscv_xlen == 64
+# define ARCH_REGS	struct user_regs_struct
+# define SYSCALL_NUM	a7
+# define SYSCALL_RET	a0
 #elif defined(__hppa__)
 # define ARCH_REGS	struct user_regs_struct
 # define SYSCALL_NUM	gr[20]
@@ -1671,7 +1677,7 @@  void change_syscall(struct __test_metadata *_metadata,
 	EXPECT_EQ(0, ret) {}
 
 #if defined(__x86_64__) || defined(__i386__) || defined(__powerpc__) || \
-    defined(__s390__) || defined(__hppa__)
+    defined(__s390__) || defined(__hppa__) || defined(__riscv)
 	{
 		regs.SYSCALL_NUM = syscall;
 	}