[RFC,5/5] x86,seccomp: Add a seccomp fastpath

Message ID 9e11cd988a0f120606e37b5e275019754e2774da.1402517933.git.luto@amacapital.net (mailing list archive)
State New, archived

Commit Message

Andy Lutomirski June 11, 2014, 8:23 p.m. UTC
On my VM, getpid takes about 70ns.  Before this patch, adding a
single-instruction always-accept seccomp filter added about 134ns of
overhead to getpid.  With this patch, the overhead is down to about
13ns.
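
For reference, one way to get numbers like these is a tight loop
around the bare syscall.  A sketch (not necessarily the exact
benchmark used for the figures above; note that glibc may cache
getpid(), so go through syscall() to force a real kernel entry):

#include <stdio.h>
#include <time.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
    const long iters = 10000000;
    struct timespec t0, t1;
    double ns;
    long i;

    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (i = 0; i < iters; i++)
        syscall(SYS_getpid);            /* force a real syscall */
    clock_gettime(CLOCK_MONOTONIC, &t1);

    ns = (t1.tv_sec - t0.tv_sec) * 1e9 + (t1.tv_nsec - t0.tv_nsec);
    printf("%.1f ns per getpid\n", ns / iters);
    return 0;
}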

I'm not really thrilled by this patch.  It has two main issues:

1. Calling into code in kernel/seccomp.c from assembly feels ugly.

2. The x86 64-bit syscall entry now has four separate code paths:
fast, seccomp only, audit only, and slow.  This kind of sucks.
Would it be worth trying to rewrite the whole thing in C with a
two-phase slow path approach like I'm using here for seccomp?

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
---
 arch/x86/kernel/entry_64.S | 45 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/seccomp.h    |  4 ++--
 2 files changed, 47 insertions(+), 2 deletions(-)

Comments

Alexei Starovoitov June 11, 2014, 9:29 p.m. UTC | #1
On Wed, Jun 11, 2014 at 1:23 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> On my VM, getpid takes about 70ns.  Before this patch, adding a
> single-instruction always-accept seccomp filter added about 134ns of
> overhead to getpid.  With this patch, the overhead is down to about
> 13ns.

Interesting.
Is this the gain from patch 4 to patch 5, or from patch 0 to patch 5?
Is the 13ns still with seccomp enabled, but without filters?

> I'm not really thrilled by this patch.  It has two main issues:
>
> 1. Calling into code in kernel/seccomp.c from assembly feels ugly.
>
> 2. The x86 64-bit syscall entry now has four separate code paths:
> fast, seccomp only, audit only, and slow.  This kind of sucks.
> Would it be worth trying to rewrite the whole thing in C with a
> two-phase slow path approach like I'm using here for seccomp?
>
> Signed-off-by: Andy Lutomirski <luto@amacapital.net>
> ---
>  arch/x86/kernel/entry_64.S | 45 +++++++++++++++++++++++++++++++++++++++++++++
>  include/linux/seccomp.h    |  4 ++--
>  2 files changed, 47 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
> index f9e713a..feb32b2 100644
> --- a/arch/x86/kernel/entry_64.S
> +++ b/arch/x86/kernel/entry_64.S
> @@ -683,6 +683,45 @@ sysret_signal:
>         FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
>         jmp int_check_syscall_exit_work
>
> +#ifdef CONFIG_SECCOMP
> +       /*
> +        * Fast path for seccomp without any other slow path triggers.
> +        */
> +seccomp_fastpath:
> +       /* Build seccomp_data */
> +       pushq %r9                               /* args[5] */
> +       pushq %r8                               /* args[4] */
> +       pushq %r10                              /* args[3] */
> +       pushq %rdx                              /* args[2] */
> +       pushq %rsi                              /* args[1] */
> +       pushq %rdi                              /* args[0] */
> +       pushq RIP-ARGOFFSET+6*8(%rsp)           /* rip */
> +       pushq %rax                              /* nr and junk */
> +       movl $AUDIT_ARCH_X86_64, 4(%rsp)        /* arch */
> +       movq %rsp, %rdi
> +       call seccomp_phase1

The assembler code is pretty much repeating what the C does in
populate_seccomp_data(). Assuming the whole gain came from
patch 5, why is the asm version so much faster than the C?
It skips SAVE/RESTORE_REST... what else?
If most of the gain is from all the patches combined (mainly from
the two-phase approach), then why bother with asm?

Somehow it feels like the gain is due to better branch prediction
in the asm version. If you change a few 'unlikely' hints in the C to
'likely', it may reach the same performance...

BTW, patches #1-3 look good to me. Especially #1 is nice.

> +       addq $8*8, %rsp
> +       cmpq $1, %rax
> +       ja seccomp_invoke_phase2
> +       LOAD_ARGS 0  /* restore clobbered regs */
> +       jb system_call_fastpath
> +       jmp ret_from_sys_call
> +
> +seccomp_invoke_phase2:
> +       SAVE_REST
> +       FIXUP_TOP_OF_STACK %rdi
> +       movq %rax,%rdi
> +       call seccomp_phase2
> +
> +       /* if seccomp says to skip, then set orig_ax to -1 and skip */
> +       test %eax,%eax
> +       jz 1f
> +       movq $-1, ORIG_RAX(%rsp)
> +1:
> +       mov ORIG_RAX(%rsp), %rax                /* reload rax */
> +       jmp system_call_post_trace              /* and maybe do the syscall */
> +#endif
> +
>  #ifdef CONFIG_AUDITSYSCALL
>         /*
>          * Fast path for syscall audit without full syscall trace.
> @@ -717,6 +756,10 @@ sysret_audit:
>
>         /* Do syscall tracing */
>  tracesys:
> +#ifdef CONFIG_SECCOMP
> +       testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SECCOMP),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
> +       jz seccomp_fastpath
> +#endif
>  #ifdef CONFIG_AUDITSYSCALL
>         testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
>         jz auditsys
> @@ -725,6 +768,8 @@ tracesys:
>         FIXUP_TOP_OF_STACK %rdi
>         movq %rsp,%rdi
>         call syscall_trace_enter
> +
> +system_call_post_trace:
>         /*
>          * Reload arg registers from stack in case ptrace changed them.
>          * We don't reload %rax because syscall_trace_enter() returned
> diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
> index 4fc7a84..d3d4c52 100644
> --- a/include/linux/seccomp.h
> +++ b/include/linux/seccomp.h
> @@ -37,8 +37,8 @@ static inline int secure_computing(void)
>  #define SECCOMP_PHASE1_OK      0
>  #define SECCOMP_PHASE1_SKIP    1
>
> -extern u32 seccomp_phase1(struct seccomp_data *sd);
> -int seccomp_phase2(u32 phase1_result);
> +asmlinkage __visible extern u32 seccomp_phase1(struct seccomp_data *sd);
> +asmlinkage __visible int seccomp_phase2(u32 phase1_result);
>  #else
>  extern void secure_computing_strict(int this_syscall);
>  #endif
> --
> 1.9.3
>
Andy Lutomirski June 11, 2014, 9:56 p.m. UTC | #2
On Wed, Jun 11, 2014 at 2:29 PM, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
> On Wed, Jun 11, 2014 at 1:23 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>> On my VM, getpid takes about 70ns.  Before this patch, adding a
>> single-instruction always-accept seccomp filter added about 134ns of
>> overhead to getpid.  With this patch, the overhead is down to about
>> 13ns.
>
> Interesting.
> Is this the gain from patch 4 to patch 5, or from patch 0 to patch 5?
> Is the 13ns still with seccomp enabled, but without filters?

13ns is with the simplest nonempty filter.  I hope that empty filters
don't work.

>
>> I'm not really thrilled by this patch.  It has two main issues:
>>
>> 1. Calling into code in kernel/seccomp.c from assembly feels ugly.
>>
>> 2. The x86 64-bit syscall entry now has four separate code paths:
>> fast, seccomp only, audit only, and slow.  This kind of sucks.
>> Would it be worth trying to rewrite the whole thing in C with a
>> two-phase slow path approach like I'm using here for seccomp?
>>
>> Signed-off-by: Andy Lutomirski <luto@amacapital.net>
>> ---
>>  arch/x86/kernel/entry_64.S | 45 +++++++++++++++++++++++++++++++++++++++++++++
>>  include/linux/seccomp.h    |  4 ++--
>>  2 files changed, 47 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
>> index f9e713a..feb32b2 100644
>> --- a/arch/x86/kernel/entry_64.S
>> +++ b/arch/x86/kernel/entry_64.S
>> @@ -683,6 +683,45 @@ sysret_signal:
>>         FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
>>         jmp int_check_syscall_exit_work
>>
>> +#ifdef CONFIG_SECCOMP
>> +       /*
>> +        * Fast path for seccomp without any other slow path triggers.
>> +        */
>> +seccomp_fastpath:
>> +       /* Build seccomp_data */
>> +       pushq %r9                               /* args[5] */
>> +       pushq %r8                               /* args[4] */
>> +       pushq %r10                              /* args[3] */
>> +       pushq %rdx                              /* args[2] */
>> +       pushq %rsi                              /* args[1] */
>> +       pushq %rdi                              /* args[0] */
>> +       pushq RIP-ARGOFFSET+6*8(%rsp)           /* rip */

>> +       pushq %rax                              /* nr and junk */
>> +       movl $AUDIT_ARCH_X86_64, 4(%rsp)        /* arch */

It wouldn't shock me if this pair of instructions were
microarchitecturally bad.  Maybe I can save a few more cycles by using
bitwise arithmetic or a pair of movls and explicit rsp manipulation
here.  I haven't tried.
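
For concreteness, the movl variant would be something like this
(untested):

        subq $8, %rsp
        movl %eax, 0(%rsp)                      /* nr */
        movl $AUDIT_ARCH_X86_64, 4(%rsp)        /* arch */

i.e. two same-width stores instead of a 64-bit push partially
overwritten by a 32-bit store.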

>> +       movq %rsp, %rdi
>> +       call seccomp_phase1
>
> The assembler code is pretty much repeating what the C does in
> populate_seccomp_data(). Assuming the whole gain came from
> patch 5, why is the asm version so much faster than the C?
>
> It skips SAVE/RESTORE_REST... what else?
> If most of the gain is from all the patches combined (mainly from
> the two-phase approach), then why bother with asm?

The whole gain should be from patch 5, but there are three things going on here.

The biggest win is skipping int_ret_from_sys_call.  IRET sucks.
There's some extra win from skipping SAVE/RESTORE_REST, but I haven't
benchmarked that.  I would guess it's on the order of 5ns.  In theory
a one-pass implementation could skip int_ret_from_sys_call, but that
will be awkward to implement correctly.
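
To spell out the contract the asm relies on: seccomp_phase1() returns
0 (SECCOMP_PHASE1_OK) to run the syscall on the fast path, 1
(SECCOMP_PHASE1_SKIP) to skip it, and anything else is a value that
must be handed to seccomp_phase2() on the slow path.  Roughly, in C:

    struct seccomp_data sd;     /* built on the stack, as in the asm */
    u32 ret;

    /* ... fill sd ... */
    ret = seccomp_phase1(&sd);

    if (ret == SECCOMP_PHASE1_OK)
        ;                       /* jb system_call_fastpath */
    else if (ret == SECCOMP_PHASE1_SKIP)
        ;                       /* jmp ret_from_sys_call */
    else
        seccomp_phase2(ret);    /* ja seccomp_invoke_phase2 */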

The other benefit is generating seccomp_data in assembly.  It saves
about 7ns.  This is likely due to avoiding all the indirection in the
syscall_get_xyz() helpers and to avoiding prodding at flags to figure
out the arch token.
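
For comparison, the C path builds the same structure through the
generic syscall accessors.  Paraphrasing populate_seccomp_data() from
memory (so don't trust the exact signatures):

    static void populate_seccomp_data(struct seccomp_data *sd)
    {
        struct pt_regs *regs = task_pt_regs(current);
        unsigned long args[6];

        sd->nr = syscall_get_nr(current, regs);
        sd->arch = syscall_get_arch();  /* prods at thread flags */
        syscall_get_arguments(current, regs, 0, 6, args);
        memcpy(sd->args, args, sizeof(args));   /* roughly */
        sd->instruction_pointer = KSTK_EIP(current);
    }

Each of those goes through a per-arch helper, and that's the
indirection the asm version skips.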

Fiddling with branch prediction made no difference that I could measure.

>
> Somehow it feels like the gain is due to better branch prediction
> in the asm version. If you change a few 'unlikely' hints in the C to
> 'likely', it may reach the same performance...
>
> BTW, patches #1-3 look good to me. Especially #1 is nice.

Thanks :)

--Andy
H. Peter Anvin June 11, 2014, 10:18 p.m. UTC | #3
On 06/11/2014 02:56 PM, Andy Lutomirski wrote:
> 
> 13ns is with the simplest nonempty filter.  I hope that empty filters
> don't work.
> 

Why wouldn't they?

	-hpa
Andy Lutomirski June 11, 2014, 10:22 p.m. UTC | #4
On Wed, Jun 11, 2014 at 3:18 PM, H. Peter Anvin <hpa@zytor.com> wrote:
> On 06/11/2014 02:56 PM, Andy Lutomirski wrote:
>>
>> 13ns is with the simplest nonempty filter.  I hope that empty filters
>> don't work.
>>
>
> Why wouldn't they?

Is it permissible to fall off the end of a BPF program?  I'm getting
EINVAL trying to install an actual empty filter.  The filter I tested
with was:

#include <unistd.h>
#include <linux/filter.h>
#include <linux/seccomp.h>
#include <sys/syscall.h>
#include <err.h>
#include <sys/prctl.h>
#include <stddef.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int rc;

    struct sock_filter filter[] = {
        BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
    };

    struct sock_fprog prog = {
        .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])),
        .filter = filter,
    };

    if (argc < 2) {
        printf("Usage: null_seccomp PATH ARGS...\n");
        return 1;
    }

    if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
        err(1, "PR_SET_NO_NEW_PRIVS");
    if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog))
        err(1, "PR_SET_SECCOMP");

    execv(argv[1], argv + 1);
    err(1, argv[1]);
}


--Andy
H. Peter Anvin June 11, 2014, 10:27 p.m. UTC | #5
On 06/11/2014 03:22 PM, Andy Lutomirski wrote:
> On Wed, Jun 11, 2014 at 3:18 PM, H. Peter Anvin <hpa@zytor.com> wrote:
>> On 06/11/2014 02:56 PM, Andy Lutomirski wrote:
>>>
>>> 13ns is with the simplest nonempty filter.  I hope that empty filters
>>> don't work.
>>>
>>
>> Why wouldn't they?
> 
> Is it permissible to fall off the end of a BPF program?  I'm getting
> EINVAL trying to install an actual empty filter.  The filter I tested
> with was:
> 

What I meant was that there has to be a well-defined behavior for the
program falling off the end anyway, and that that should be preserved.

I guess it is possible to require that all code paths must provably
reach a termination point.

	-hpa
Andy Lutomirski June 11, 2014, 10:28 p.m. UTC | #6
On Wed, Jun 11, 2014 at 3:27 PM, H. Peter Anvin <hpa@zytor.com> wrote:
> On 06/11/2014 03:22 PM, Andy Lutomirski wrote:
>> On Wed, Jun 11, 2014 at 3:18 PM, H. Peter Anvin <hpa@zytor.com> wrote:
>>> On 06/11/2014 02:56 PM, Andy Lutomirski wrote:
>>>>
>>>> 13ns is with the simplest nonempty filter.  I hope that empty filters
>>>> don't work.
>>>>
>>>
>>> Why wouldn't they?
>>
>> Is it permissible to fall off the end of a BPF program?  I'm getting
>> EINVAL trying to install an actual empty filter.  The filter I tested
>> with was:
>>
>
> What I meant was that there has to be a well-defined behavior for the
> program falling off the end anyway, and that that should be preserved.
>
> I guess it is possible to require that all code paths must provably
> reach a termination point.
>

Dunno.  I haven't ever touched any of the actual BPF code.  This whole
patchset only changes the code that invokes the BPF evaluator.

--Andy
Kees Cook June 11, 2014, 10:32 p.m. UTC | #7
On Wed, Jun 11, 2014 at 3:28 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Wed, Jun 11, 2014 at 3:27 PM, H. Peter Anvin <hpa@zytor.com> wrote:
>> On 06/11/2014 03:22 PM, Andy Lutomirski wrote:
>>> On Wed, Jun 11, 2014 at 3:18 PM, H. Peter Anvin <hpa@zytor.com> wrote:
>>>> On 06/11/2014 02:56 PM, Andy Lutomirski wrote:
>>>>>
>>>>> 13ns is with the simplest nonempty filter.  I hope that empty filters
>>>>> don't work.
>>>>>
>>>>
>>>> Why wouldn't they?
>>>
>>> Is it permissible to fall off the end of a BPF program?  I'm getting
>>> EINVAL trying to install an actual empty filter.  The filter I tested
>>> with was:
>>>
>>
>> What I meant was that there has to be a well-defined behavior for the
>> program falling off the end anyway, and that that should be preserved.
>>
>> I guess it is possible to require that all code paths must provably
>> reach a termination point.
>>
>
> Dunno.  I haven't ever touched any of the actual BPF code.  This whole
> patchset only changes the code that invokes the BPF evaluator.

Yes, this is how BPF works: it runs to the end or exits early. With
seccomp BPF specifically, the return value defaults to killing the
process. If a filter was missing (NULL), or empty, or didn't
explicitly return a new value, the default (kill) should be taken.
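
A paraphrase of that policy (not the kernel's actual loop, and
run_filter() here is just a stand-in):

    u32 action = SECCOMP_RET_KILL;      /* no filter at all: fail closed */

    if (filter)
        action = run_filter(filter, &sd);       /* validator guarantees a RET */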

-Kees
Will Drewry June 13, 2014, 4:29 p.m. UTC | #8
On Wed, Jun 11, 2014 at 5:32 PM, Kees Cook <keescook@chromium.org> wrote:
> On Wed, Jun 11, 2014 at 3:28 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>> On Wed, Jun 11, 2014 at 3:27 PM, H. Peter Anvin <hpa@zytor.com> wrote:
>>> On 06/11/2014 03:22 PM, Andy Lutomirski wrote:
>>>> On Wed, Jun 11, 2014 at 3:18 PM, H. Peter Anvin <hpa@zytor.com> wrote:
>>>>> On 06/11/2014 02:56 PM, Andy Lutomirski wrote:
>>>>>>
>>>>>> 13ns is with the simplest nonempty filter.  I hope that empty filters
>>>>>> don't work.
>>>>>>
>>>>>
>>>>> Why wouldn't they?
>>>>
>>>> Is it permissible to fall off the end of a BPF program?  I'm getting
>>>> EINVAL trying to install an actual empty filter.  The filter I tested
>>>> with was:
>>>>
>>>
>>> What I meant was that there has to be a well-defined behavior for the
>>> program falling off the end anyway, and that that should be preserved.
>>>
>>> I guess it is possible to require that all code paths must provably
>>> reach a termination point.
>>>
>>
>> Dunno.  I haven't ever touched any of the actual BPF code.  This whole
>> patchset only changes the code that invokes the BPF evaluator.
>
> Yes, this is how BPF works: it runs to the end or exits early. With
> seccomp BPF specifically, the return value defaults to killing the
> process. If a filter was missing (NULL), or empty, or didn't
> explicitly return a new value, the default (kill) should be taken.

Yup - this is just a property of BPF (and a nice one :)

In seccomp_attach_filter(), this check fires:
  if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
    return -EINVAL;

As well as in sk_chk_filter:
  if (flen == 0 || flen > BPF_MAXINSNS)
    return -EINVAL;

And:
  /* last instruction must be a RET code */
  switch (filter[flen - 1].code) {
    case BPF_S_RET_K:
    case BPF_S_RET_A:
      return check_load_and_stores(filter, flen);
  }

cheers!
will

Patch

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index f9e713a..feb32b2 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -683,6 +683,45 @@  sysret_signal:
 	FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
 	jmp int_check_syscall_exit_work
 
+#ifdef CONFIG_SECCOMP
+	/*
+	 * Fast path for seccomp without any other slow path triggers.
+	 */
+seccomp_fastpath:
+	/* Build seccomp_data */
+	pushq %r9				/* args[5] */
+	pushq %r8				/* args[4] */
+	pushq %r10				/* args[3] */
+	pushq %rdx				/* args[2] */
+	pushq %rsi				/* args[1] */
+	pushq %rdi				/* args[0] */
+	pushq RIP-ARGOFFSET+6*8(%rsp)		/* rip */
+	pushq %rax 				/* nr and junk */
+	movl $AUDIT_ARCH_X86_64, 4(%rsp)	/* arch */
+	movq %rsp, %rdi
+	call seccomp_phase1
+	addq $8*8, %rsp
+	cmpq $1, %rax
+	ja seccomp_invoke_phase2
+	LOAD_ARGS 0  /* restore clobbered regs */
+	jb system_call_fastpath
+	jmp ret_from_sys_call
+
+seccomp_invoke_phase2:
+	SAVE_REST
+	FIXUP_TOP_OF_STACK %rdi
+	movq %rax,%rdi
+	call seccomp_phase2
+
+	/* if seccomp says to skip, then set orig_ax to -1 and skip */
+	test %eax,%eax
+	jz 1f
+	movq $-1, ORIG_RAX(%rsp)
+1:
+	mov ORIG_RAX(%rsp), %rax		/* reload rax */
+	jmp system_call_post_trace		/* and maybe do the syscall */
+#endif
+
 #ifdef CONFIG_AUDITSYSCALL
 	/*
 	 * Fast path for syscall audit without full syscall trace.
@@ -717,6 +756,10 @@  sysret_audit:
 
 	/* Do syscall tracing */
 tracesys:
+#ifdef CONFIG_SECCOMP
+	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SECCOMP),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	jz seccomp_fastpath
+#endif
 #ifdef CONFIG_AUDITSYSCALL
 	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
 	jz auditsys
@@ -725,6 +768,8 @@  tracesys:
 	FIXUP_TOP_OF_STACK %rdi
 	movq %rsp,%rdi
 	call syscall_trace_enter
+
+system_call_post_trace:
 	/*
 	 * Reload arg registers from stack in case ptrace changed them.
 	 * We don't reload %rax because syscall_trace_enter() returned
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 4fc7a84..d3d4c52 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -37,8 +37,8 @@  static inline int secure_computing(void)
 #define SECCOMP_PHASE1_OK	0
 #define SECCOMP_PHASE1_SKIP	1
 
-extern u32 seccomp_phase1(struct seccomp_data *sd);
-int seccomp_phase2(u32 phase1_result);
+asmlinkage __visible extern u32 seccomp_phase1(struct seccomp_data *sd);
+asmlinkage __visible int seccomp_phase2(u32 phase1_result);
 #else
 extern void secure_computing_strict(int this_syscall);
 #endif