diff mbox series

[v8,4/7] entry: Support Syscall User Dispatch on common syscall entry

Message ID 20201127193238.821364-5-krisman@collabora.com (mailing list archive)
State New
Headers show
Series Syscall User Dispatch | expand

Commit Message

Gabriel Krisman Bertazi Nov. 27, 2020, 7:32 p.m. UTC
Syscall User Dispatch (SUD) must take precedence over seccomp and
ptrace, since the use case is emulation (it can be invoked with a
different ABI) such that seccomp filtering by syscall number doesn't
make sense in the first place.  In addition, either the syscall is
dispatched back to userspace, in which case there is no resource for to
trace, or the syscall will be executed, and seccomp/ptrace will execute
next.

Since SUD runs before tracepoints, it needs to be a SYSCALL_WORK_EXIT as
well, just to prevent a trace exit event when dispatch was triggered.
For that, the on_syscall_dispatch() examines context to skip the
tracepoint, audit and other work.

Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
Changes since v6:
  - Update do_syscall_intercept signature (Christian Brauner)
  - Move it to before tracepoints
  - Use SYSCALL_WORK flags
---
 include/linux/entry-common.h |  2 ++
 kernel/entry/common.c        | 17 +++++++++++++++++
 2 files changed, 19 insertions(+)

Comments

Kees Cook Dec. 1, 2020, 10:57 p.m. UTC | #1
On Fri, Nov 27, 2020 at 02:32:35PM -0500, Gabriel Krisman Bertazi wrote:
> Syscall User Dispatch (SUD) must take precedence over seccomp and
> ptrace, since the use case is emulation (it can be invoked with a
> different ABI) such that seccomp filtering by syscall number doesn't
> make sense in the first place.  In addition, either the syscall is
> dispatched back to userspace, in which case there is no resource for to
> trace, or the syscall will be executed, and seccomp/ptrace will execute
> next.
> 
> Since SUD runs before tracepoints, it needs to be a SYSCALL_WORK_EXIT as
> well, just to prevent a trace exit event when dispatch was triggered.
> For that, the on_syscall_dispatch() examines context to skip the
> tracepoint, audit and other work.
> 
> Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>

Acked-by: Kees Cook <keescook@chromium.org>
Andy Lutomirski Dec. 2, 2020, 12:04 a.m. UTC | #2
On Fri, Nov 27, 2020 at 11:33 AM Gabriel Krisman Bertazi
<krisman@collabora.com> wrote:
>
> Syscall User Dispatch (SUD) must take precedence over seccomp and
> ptrace, since the use case is emulation (it can be invoked with a
> different ABI) such that seccomp filtering by syscall number doesn't
> make sense in the first place.  In addition, either the syscall is
> dispatched back to userspace, in which case there is no resource for to
> trace, or the syscall will be executed, and seccomp/ptrace will execute
> next.
>
> Since SUD runs before tracepoints, it needs to be a SYSCALL_WORK_EXIT as
> well, just to prevent a trace exit event when dispatch was triggered.
> For that, the on_syscall_dispatch() examines context to skip the
> tracepoint, audit and other work.
>
> Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
> Changes since v6:
>   - Update do_syscall_intercept signature (Christian Brauner)
>   - Move it to before tracepoints
>   - Use SYSCALL_WORK flags
> ---
>  include/linux/entry-common.h |  2 ++
>  kernel/entry/common.c        | 17 +++++++++++++++++
>  2 files changed, 19 insertions(+)
>
> diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
> index 49b26b216e4e..a6e98b4ba8e9 100644
> --- a/include/linux/entry-common.h
> +++ b/include/linux/entry-common.h
> @@ -44,10 +44,12 @@
>                                  SYSCALL_WORK_SYSCALL_TRACE |           \
>                                  SYSCALL_WORK_SYSCALL_EMU |             \
>                                  SYSCALL_WORK_SYSCALL_AUDIT |           \
> +                                SYSCALL_WORK_SYSCALL_USER_DISPATCH |   \
>                                  ARCH_SYSCALL_WORK_ENTER)
>  #define SYSCALL_WORK_EXIT      (SYSCALL_WORK_SYSCALL_TRACEPOINT |      \
>                                  SYSCALL_WORK_SYSCALL_TRACE |           \
>                                  SYSCALL_WORK_SYSCALL_AUDIT |           \
> +                                SYSCALL_WORK_SYSCALL_USER_DISPATCH |   \
>                                  ARCH_SYSCALL_WORK_EXIT)
>
>  /*
> diff --git a/kernel/entry/common.c b/kernel/entry/common.c
> index f1b12dc32ff4..ec20aba3b890 100644
> --- a/kernel/entry/common.c
> +++ b/kernel/entry/common.c
> @@ -6,6 +6,8 @@
>  #include <linux/livepatch.h>
>  #include <linux/audit.h>
>
> +#include "common.h"
> +
>  #define CREATE_TRACE_POINTS
>  #include <trace/events/syscalls.h>
>
> @@ -47,6 +49,16 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall,
>  {
>         long ret = 0;
>
> +       /*
> +        * Handle Syscall User Dispatch.  This must comes first, since
> +        * the ABI here can be something that doesn't make sense for
> +        * other syscall_work features.
> +        */
> +       if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
> +               if (do_syscall_user_dispatch(regs))
> +                       return -1L;
> +       }
> +
>         /* Handle ptrace */
>         if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
>                 ret = arch_syscall_enter_tracehook(regs);
> @@ -232,6 +244,11 @@ static void syscall_exit_work(struct pt_regs *regs, unsigned long work)
>  {
>         bool step;
>
> +       if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
> +               if (on_syscall_dispatch())
> +                       return;
> +       }

I think this would be less confusing if you just open-coded the body
of on_syscall_dispatch here and got rid of the helper.

--Andy
diff mbox series

Patch

diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index 49b26b216e4e..a6e98b4ba8e9 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -44,10 +44,12 @@ 
 				 SYSCALL_WORK_SYSCALL_TRACE |		\
 				 SYSCALL_WORK_SYSCALL_EMU |		\
 				 SYSCALL_WORK_SYSCALL_AUDIT |		\
+				 SYSCALL_WORK_SYSCALL_USER_DISPATCH |	\
 				 ARCH_SYSCALL_WORK_ENTER)
 #define SYSCALL_WORK_EXIT	(SYSCALL_WORK_SYSCALL_TRACEPOINT |	\
 				 SYSCALL_WORK_SYSCALL_TRACE |		\
 				 SYSCALL_WORK_SYSCALL_AUDIT |		\
+				 SYSCALL_WORK_SYSCALL_USER_DISPATCH |	\
 				 ARCH_SYSCALL_WORK_EXIT)
 
 /*
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index f1b12dc32ff4..ec20aba3b890 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -6,6 +6,8 @@ 
 #include <linux/livepatch.h>
 #include <linux/audit.h>
 
+#include "common.h"
+
 #define CREATE_TRACE_POINTS
 #include <trace/events/syscalls.h>
 
@@ -47,6 +49,16 @@  static long syscall_trace_enter(struct pt_regs *regs, long syscall,
 {
 	long ret = 0;
 
+	/*
+	 * Handle Syscall User Dispatch.  This must comes first, since
+	 * the ABI here can be something that doesn't make sense for
+	 * other syscall_work features.
+	 */
+	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
+		if (do_syscall_user_dispatch(regs))
+			return -1L;
+	}
+
 	/* Handle ptrace */
 	if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
 		ret = arch_syscall_enter_tracehook(regs);
@@ -232,6 +244,11 @@  static void syscall_exit_work(struct pt_regs *regs, unsigned long work)
 {
 	bool step;
 
+	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
+		if (on_syscall_dispatch())
+			return;
+	}
+
 	audit_syscall_exit(regs);
 
 	if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)