diff mbox

[v5,1/3] arm64: ptrace: reload a syscall number after ptrace operations

Message ID 1406020499-5537-2-git-send-email-takahiro.akashi@linaro.org (mailing list archive)
State New, archived
Headers show

Commit Message

AKASHI Takahiro July 22, 2014, 9:14 a.m. UTC
Arm64 holds a syscall number in w8(x8) register. Ptrace tracer may change
its value either to:
  * any valid syscall number to alter a system call, or
  * -1 to skip a system call

This patch implements this behavior by reloading that value into syscallno
in struct pt_regs after tracehook_report_syscall_entry() or
secure_computing(). In case of '-1', a return value of system call can also
be changed by the tracer setting the value to x0 register, and so
sys_ni_nosyscall() should not be called.

See also:
    42309ab4, ARM: 8087/1: ptrace: reload syscall number after
	      secure_computing() check

Signed-off-by: AKASHI Takahiro <takahiro.akashi@linaro.org>
---
 arch/arm64/kernel/entry.S  |    2 ++
 arch/arm64/kernel/ptrace.c |   13 +++++++++++++
 2 files changed, 15 insertions(+)

Comments

Kees Cook July 22, 2014, 8:15 p.m. UTC | #1
On Tue, Jul 22, 2014 at 2:14 AM, AKASHI Takahiro
<takahiro.akashi@linaro.org> wrote:
> Arm64 holds a syscall number in w8(x8) register. Ptrace tracer may change
> its value either to:
>   * any valid syscall number to alter a system call, or
>   * -1 to skip a system call
>
> This patch implements this behavior by reloading that value into syscallno
> in struct pt_regs after tracehook_report_syscall_entry() or
> secure_computing(). In case of '-1', a return value of system call can also
> be changed by the tracer setting the value to x0 register, and so
> sys_ni_nosyscall() should not be called.
>
> See also:
>     42309ab4, ARM: 8087/1: ptrace: reload syscall number after
>               secure_computing() check
>
> Signed-off-by: AKASHI Takahiro <takahiro.akashi@linaro.org>
> ---
>  arch/arm64/kernel/entry.S  |    2 ++
>  arch/arm64/kernel/ptrace.c |   13 +++++++++++++
>  2 files changed, 15 insertions(+)
>
> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
> index 5141e79..de8bdbc 100644
> --- a/arch/arm64/kernel/entry.S
> +++ b/arch/arm64/kernel/entry.S
> @@ -628,6 +628,8 @@ ENDPROC(el0_svc)
>  __sys_trace:
>         mov     x0, sp
>         bl      syscall_trace_enter
> +       cmp     w0, #-1                         // skip syscall?
> +       b.eq    ret_to_user
>         adr     lr, __sys_trace_return          // return address
>         uxtw    scno, w0                        // syscall number (possibly new)
>         mov     x1, sp                          // pointer to regs
> diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
> index 70526cf..100d7d1 100644
> --- a/arch/arm64/kernel/ptrace.c
> +++ b/arch/arm64/kernel/ptrace.c
> @@ -21,6 +21,7 @@
>
>  #include <linux/audit.h>
>  #include <linux/compat.h>
> +#include <linux/errno.h>
>  #include <linux/kernel.h>
>  #include <linux/sched.h>
>  #include <linux/mm.h>
> @@ -1109,9 +1110,21 @@ static void tracehook_report_syscall(struct pt_regs *regs,
>
>  asmlinkage int syscall_trace_enter(struct pt_regs *regs)
>  {
> +       unsigned long saved_x0, saved_x8;
> +
> +       saved_x0 = regs->regs[0];
> +       saved_x8 = regs->regs[8];
> +
>         if (test_thread_flag(TIF_SYSCALL_TRACE))
>                 tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER);
>
> +       regs->syscallno = regs->regs[8];
> +       if ((long)regs->syscallno == ~0UL) { /* skip this syscall */
> +               regs->regs[8] = saved_x8;
> +               if (regs->regs[0] == saved_x0) /* not changed by user */
> +                       regs->regs[0] = -ENOSYS;

I'm not sure this is right compared to other architectures. Generally
when a tracer performs a syscall skip, it's up to them to also adjust
the return value. They may want to be faking a syscall, and what if
the value they want to return happens to be what x0 was going into the
tracer? It would have no way to avoid this -ENOSYS case. I think
things are fine without this test.

-Kees

> +       }
> +
>         if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
>                 trace_sys_enter(regs, regs->syscallno);
>
> --
> 1.7.9.5
>
AKASHI Takahiro July 23, 2014, 7:03 a.m. UTC | #2
On 07/23/2014 05:15 AM, Kees Cook wrote:
> On Tue, Jul 22, 2014 at 2:14 AM, AKASHI Takahiro
> <takahiro.akashi@linaro.org> wrote:
>> Arm64 holds a syscall number in w8(x8) register. Ptrace tracer may change
>> its value either to:
>>    * any valid syscall number to alter a system call, or
>>    * -1 to skip a system call
>>
>> This patch implements this behavior by reloading that value into syscallno
>> in struct pt_regs after tracehook_report_syscall_entry() or
>> secure_computing(). In case of '-1', a return value of system call can also
>> be changed by the tracer setting the value to x0 register, and so
>> sys_ni_nosyscall() should not be called.
>>
>> See also:
>>      42309ab4, ARM: 8087/1: ptrace: reload syscall number after
>>                secure_computing() check
>>
>> Signed-off-by: AKASHI Takahiro <takahiro.akashi@linaro.org>
>> ---
>>   arch/arm64/kernel/entry.S  |    2 ++
>>   arch/arm64/kernel/ptrace.c |   13 +++++++++++++
>>   2 files changed, 15 insertions(+)
>>
>> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
>> index 5141e79..de8bdbc 100644
>> --- a/arch/arm64/kernel/entry.S
>> +++ b/arch/arm64/kernel/entry.S
>> @@ -628,6 +628,8 @@ ENDPROC(el0_svc)
>>   __sys_trace:
>>          mov     x0, sp
>>          bl      syscall_trace_enter
>> +       cmp     w0, #-1                         // skip syscall?
>> +       b.eq    ret_to_user
>>          adr     lr, __sys_trace_return          // return address
>>          uxtw    scno, w0                        // syscall number (possibly new)
>>          mov     x1, sp                          // pointer to regs
>> diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
>> index 70526cf..100d7d1 100644
>> --- a/arch/arm64/kernel/ptrace.c
>> +++ b/arch/arm64/kernel/ptrace.c
>> @@ -21,6 +21,7 @@
>>
>>   #include <linux/audit.h>
>>   #include <linux/compat.h>
>> +#include <linux/errno.h>
>>   #include <linux/kernel.h>
>>   #include <linux/sched.h>
>>   #include <linux/mm.h>
>> @@ -1109,9 +1110,21 @@ static void tracehook_report_syscall(struct pt_regs *regs,
>>
>>   asmlinkage int syscall_trace_enter(struct pt_regs *regs)
>>   {
>> +       unsigned long saved_x0, saved_x8;
>> +
>> +       saved_x0 = regs->regs[0];
>> +       saved_x8 = regs->regs[8];
>> +
>>          if (test_thread_flag(TIF_SYSCALL_TRACE))
>>                  tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER);
>>
>> +       regs->syscallno = regs->regs[8];
>> +       if ((long)regs->syscallno == ~0UL) { /* skip this syscall */
>> +               regs->regs[8] = saved_x8;
>> +               if (regs->regs[0] == saved_x0) /* not changed by user */
>> +                       regs->regs[0] = -ENOSYS;
>
> I'm not sure this is right compared to other architectures. Generally
> when a tracer performs a syscall skip, it's up to them to also adjust
> the return value. They may want to be faking a syscall, and what if
> the value they want to return happens to be what x0 was going into the
> tracer? It would have no way to avoid this -ENOSYS case. I think
> things are fine without this test.

Yeah, I know this issue, but was not sure that setting a return value
is mandatory. (x86 seems to return -ENOSYS by default if not explicitly
specified.)
Is "fake a system call" a more appropriate word than "skip"?

I will defer to Will.

Thanks,
-Takahiro AKASHI

> -Kees
>
>> +       }
>> +
>>          if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
>>                  trace_sys_enter(regs, regs->syscallno);
>>
>> --
>> 1.7.9.5
>>
>
>
>
Will Deacon July 23, 2014, 8:25 a.m. UTC | #3
On Wed, Jul 23, 2014 at 08:03:47AM +0100, AKASHI Takahiro wrote:
> On 07/23/2014 05:15 AM, Kees Cook wrote:
> > On Tue, Jul 22, 2014 at 2:14 AM, AKASHI Takahiro
> > <takahiro.akashi@linaro.org> wrote:
> >>   asmlinkage int syscall_trace_enter(struct pt_regs *regs)
> >>   {
> >> +       unsigned long saved_x0, saved_x8;
> >> +
> >> +       saved_x0 = regs->regs[0];
> >> +       saved_x8 = regs->regs[8];
> >> +
> >>          if (test_thread_flag(TIF_SYSCALL_TRACE))
> >>                  tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER);
> >>
> >> +       regs->syscallno = regs->regs[8];
> >> +       if ((long)regs->syscallno == ~0UL) { /* skip this syscall */
> >> +               regs->regs[8] = saved_x8;
> >> +               if (regs->regs[0] == saved_x0) /* not changed by user */
> >> +                       regs->regs[0] = -ENOSYS;
> >
> > I'm not sure this is right compared to other architectures. Generally
> > when a tracer performs a syscall skip, it's up to them to also adjust
> > the return value. They may want to be faking a syscall, and what if
> > the value they want to return happens to be what x0 was going into the
> > tracer? It would have no way to avoid this -ENOSYS case. I think
> > things are fine without this test.
> 
> Yeah, I know this issue, but was not sure that setting a return value
> is mandatory. (x86 seems to return -ENOSYS by default if not explicitly
> specified.)
> Is "fake a system call" a more appropriate word than "skip"?
> 
> I will defer to Will.

I agree with Kees -- iirc, I only suggested restoring x8.

Will
AKASHI Takahiro July 23, 2014, 9:09 a.m. UTC | #4
On 07/23/2014 05:25 PM, Will Deacon wrote:
> On Wed, Jul 23, 2014 at 08:03:47AM +0100, AKASHI Takahiro wrote:
>> On 07/23/2014 05:15 AM, Kees Cook wrote:
>>> On Tue, Jul 22, 2014 at 2:14 AM, AKASHI Takahiro
>>> <takahiro.akashi@linaro.org> wrote:
>>>>    asmlinkage int syscall_trace_enter(struct pt_regs *regs)
>>>>    {
>>>> +       unsigned long saved_x0, saved_x8;
>>>> +
>>>> +       saved_x0 = regs->regs[0];
>>>> +       saved_x8 = regs->regs[8];
>>>> +
>>>>           if (test_thread_flag(TIF_SYSCALL_TRACE))
>>>>                   tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER);
>>>>
>>>> +       regs->syscallno = regs->regs[8];
>>>> +       if ((long)regs->syscallno == ~0UL) { /* skip this syscall */
>>>> +               regs->regs[8] = saved_x8;
>>>> +               if (regs->regs[0] == saved_x0) /* not changed by user */
>>>> +                       regs->regs[0] = -ENOSYS;
>>>
>>> I'm not sure this is right compared to other architectures. Generally
>>> when a tracer performs a syscall skip, it's up to them to also adjust
>>> the return value. They may want to be faking a syscall, and what if
>>> the value they want to return happens to be what x0 was going into the
>>> tracer? It would have no way to avoid this -ENOSYS case. I think
>>> things are fine without this test.
>>
>> Yeah, I know this issue, but was not sure that setting a return value
>> is mandatory. (x86 seems to return -ENOSYS by default if not explicitly
>> specified.)
>> Is "fake a system call" a more appropriate word than "skip"?
>>
>> I will defer to Will.
>
> I agree with Kees -- iirc, I only suggested restoring x8.

OK.

-Takahiro AKASHI

> Will
>
Kees Cook July 23, 2014, 3:13 p.m. UTC | #5
On Wed, Jul 23, 2014 at 12:03 AM, AKASHI Takahiro
<takahiro.akashi@linaro.org> wrote:
> On 07/23/2014 05:15 AM, Kees Cook wrote:
>>
>> On Tue, Jul 22, 2014 at 2:14 AM, AKASHI Takahiro
>> <takahiro.akashi@linaro.org> wrote:
>>>
>>> Arm64 holds a syscall number in w8(x8) register. Ptrace tracer may change
>>> its value either to:
>>>    * any valid syscall number to alter a system call, or
>>>    * -1 to skip a system call
>>>
>>> This patch implements this behavior by reloading that value into
>>> syscallno
>>> in struct pt_regs after tracehook_report_syscall_entry() or
>>> secure_computing(). In case of '-1', a return value of system call can
>>> also
>>> be changed by the tracer setting the value to x0 register, and so
>>> sys_ni_nosyscall() should not be called.
>>>
>>> See also:
>>>      42309ab4, ARM: 8087/1: ptrace: reload syscall number after
>>>                secure_computing() check
>>>
>>> Signed-off-by: AKASHI Takahiro <takahiro.akashi@linaro.org>
>>> ---
>>>   arch/arm64/kernel/entry.S  |    2 ++
>>>   arch/arm64/kernel/ptrace.c |   13 +++++++++++++
>>>   2 files changed, 15 insertions(+)
>>>
>>> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
>>> index 5141e79..de8bdbc 100644
>>> --- a/arch/arm64/kernel/entry.S
>>> +++ b/arch/arm64/kernel/entry.S
>>> @@ -628,6 +628,8 @@ ENDPROC(el0_svc)
>>>   __sys_trace:
>>>          mov     x0, sp
>>>          bl      syscall_trace_enter
>>> +       cmp     w0, #-1                         // skip syscall?
>>> +       b.eq    ret_to_user
>>>          adr     lr, __sys_trace_return          // return address
>>>          uxtw    scno, w0                        // syscall number
>>> (possibly new)
>>>          mov     x1, sp                          // pointer to regs
>>> diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
>>> index 70526cf..100d7d1 100644
>>> --- a/arch/arm64/kernel/ptrace.c
>>> +++ b/arch/arm64/kernel/ptrace.c
>>> @@ -21,6 +21,7 @@
>>>
>>>   #include <linux/audit.h>
>>>   #include <linux/compat.h>
>>> +#include <linux/errno.h>
>>>   #include <linux/kernel.h>
>>>   #include <linux/sched.h>
>>>   #include <linux/mm.h>
>>> @@ -1109,9 +1110,21 @@ static void tracehook_report_syscall(struct
>>> pt_regs *regs,
>>>
>>>   asmlinkage int syscall_trace_enter(struct pt_regs *regs)
>>>   {
>>> +       unsigned long saved_x0, saved_x8;
>>> +
>>> +       saved_x0 = regs->regs[0];
>>> +       saved_x8 = regs->regs[8];
>>> +
>>>          if (test_thread_flag(TIF_SYSCALL_TRACE))
>>>                  tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER);
>>>
>>> +       regs->syscallno = regs->regs[8];
>>> +       if ((long)regs->syscallno == ~0UL) { /* skip this syscall */
>>> +               regs->regs[8] = saved_x8;
>>> +               if (regs->regs[0] == saved_x0) /* not changed by user */
>>> +                       regs->regs[0] = -ENOSYS;
>>
>>
>> I'm not sure this is right compared to other architectures. Generally
>> when a tracer performs a syscall skip, it's up to them to also adjust
>> the return value. They may want to be faking a syscall, and what if
>> the value they want to return happens to be what x0 was going into the
>> tracer? It would have no way to avoid this -ENOSYS case. I think
>> things are fine without this test.
>
>
> Yeah, I know this issue, but was not sure that setting a return value
> is mandatory. (x86 seems to return -ENOSYS by default if not explicitly
> specified.)
> Is "fake a system call" a more appropriate word than "skip"?

I think this is just a matter of semantics and perspective. From the
kernel's perspective, it's always a "skip" since the syscall is never
actually executed. But from the perspective of userspace, it's really
up to the tracer to decide how it should be seen: the tracer could
return -ENOSYS, or a fake return value, etc. But generally, I think
"skip" is the most accurate term for this.

-Kees
Andy Lutomirski July 24, 2014, 3:54 a.m. UTC | #6
On 07/22/2014 02:14 AM, AKASHI Takahiro wrote:
> Arm64 holds a syscall number in w8(x8) register. Ptrace tracer may change
> its value either to:
>    * any valid syscall number to alter a system call, or
>    * -1 to skip a system call
>
> This patch implements this behavior by reloading that value into syscallno
> in struct pt_regs after tracehook_report_syscall_entry() or
> secure_computing(). In case of '-1', a return value of system call can also
> be changed by the tracer setting the value to x0 register, and so
> sys_ni_nosyscall() should not be called.
>
> See also:
>      42309ab4, ARM: 8087/1: ptrace: reload syscall number after
> 	      secure_computing() check
>
> Signed-off-by: AKASHI Takahiro <takahiro.akashi@linaro.org>
> ---
>   arch/arm64/kernel/entry.S  |    2 ++
>   arch/arm64/kernel/ptrace.c |   13 +++++++++++++
>   2 files changed, 15 insertions(+)
>
> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
> index 5141e79..de8bdbc 100644
> --- a/arch/arm64/kernel/entry.S
> +++ b/arch/arm64/kernel/entry.S
> @@ -628,6 +628,8 @@ ENDPROC(el0_svc)
>   __sys_trace:
>   	mov	x0, sp
>   	bl	syscall_trace_enter
> +	cmp	w0, #-1				// skip syscall?
> +	b.eq	ret_to_user

Does this mean that skipped syscalls will cause exit tracing to be 
skipped?  If so, then you risk (at least) introducing a nice 
user-triggerable OOPS if audit is enabled.  This bug existed for *years* 
on x86_32, and it amazes me that no one ever triggered it by accident. 
(Grr, audit.)

--Andy
AKASHI Takahiro July 24, 2014, 5:57 a.m. UTC | #7
On 07/24/2014 12:54 PM, Andy Lutomirski wrote:
> On 07/22/2014 02:14 AM, AKASHI Takahiro wrote:
>> Arm64 holds a syscall number in w8(x8) register. Ptrace tracer may change
>> its value either to:
>>    * any valid syscall number to alter a system call, or
>>    * -1 to skip a system call
>>
>> This patch implements this behavior by reloading that value into syscallno
>> in struct pt_regs after tracehook_report_syscall_entry() or
>> secure_computing(). In case of '-1', a return value of system call can also
>> be changed by the tracer setting the value to x0 register, and so
>> sys_ni_nosyscall() should not be called.
>>
>> See also:
>>      42309ab4, ARM: 8087/1: ptrace: reload syscall number after
>>           secure_computing() check
>>
>> Signed-off-by: AKASHI Takahiro <takahiro.akashi@linaro.org>
>> ---
>>   arch/arm64/kernel/entry.S  |    2 ++
>>   arch/arm64/kernel/ptrace.c |   13 +++++++++++++
>>   2 files changed, 15 insertions(+)
>>
>> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
>> index 5141e79..de8bdbc 100644
>> --- a/arch/arm64/kernel/entry.S
>> +++ b/arch/arm64/kernel/entry.S
>> @@ -628,6 +628,8 @@ ENDPROC(el0_svc)
>>   __sys_trace:
>>       mov    x0, sp
>>       bl    syscall_trace_enter
>> +    cmp    w0, #-1                // skip syscall?
>> +    b.eq    ret_to_user
>
> Does this mean that skipped syscalls will cause exit tracing to be skipped?

Yes. (and I guess yes on arm, too)

 > If so, then you risk (at least) introducing
> a nice user-triggerable OOPS if audit is enabled.

Can you please elaborate this?
Since I didn't find any definition of audit's behavior when syscall is
rewritten to -1, I thought it is reasonable to skip "exit tracing" of
"skipped" syscall.
(otherwise, "fake" seems to be more appropriate :)

-Takahiro AKASHI

> This bug existed for *years* on x86_32, and it amazes me that no one
> ever triggered it by accident. (Grr, audit.)
>
> --Andy
Andy Lutomirski July 24, 2014, 3:01 p.m. UTC | #8
On Jul 23, 2014 10:57 PM, "AKASHI Takahiro" <takahiro.akashi@linaro.org> wrote:
>
> On 07/24/2014 12:54 PM, Andy Lutomirski wrote:
>>
>> On 07/22/2014 02:14 AM, AKASHI Takahiro wrote:
>>>
>>> Arm64 holds a syscall number in w8(x8) register. Ptrace tracer may change
>>> its value either to:
>>>    * any valid syscall number to alter a system call, or
>>>    * -1 to skip a system call
>>>
>>> This patch implements this behavior by reloading that value into syscallno
>>> in struct pt_regs after tracehook_report_syscall_entry() or
>>> secure_computing(). In case of '-1', a return value of system call can also
>>> be changed by the tracer setting the value to x0 register, and so
>>> sys_ni_nosyscall() should not be called.
>>>
>>> See also:
>>>      42309ab4, ARM: 8087/1: ptrace: reload syscall number after
>>>           secure_computing() check
>>>
>>> Signed-off-by: AKASHI Takahiro <takahiro.akashi@linaro.org>
>>> ---
>>>   arch/arm64/kernel/entry.S  |    2 ++
>>>   arch/arm64/kernel/ptrace.c |   13 +++++++++++++
>>>   2 files changed, 15 insertions(+)
>>>
>>> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
>>> index 5141e79..de8bdbc 100644
>>> --- a/arch/arm64/kernel/entry.S
>>> +++ b/arch/arm64/kernel/entry.S
>>> @@ -628,6 +628,8 @@ ENDPROC(el0_svc)
>>>   __sys_trace:
>>>       mov    x0, sp
>>>       bl    syscall_trace_enter
>>> +    cmp    w0, #-1                // skip syscall?
>>> +    b.eq    ret_to_user
>>
>>
>> Does this mean that skipped syscalls will cause exit tracing to be skipped?
>
>
> Yes. (and I guess yes on arm, too)
>
>
> > If so, then you risk (at least) introducing
>>
>> a nice user-triggerable OOPS if audit is enabled.
>
>
> Can you please elaborate this?
> Since I didn't find any definition of audit's behavior when syscall is
> rewritten to -1, I thought it is reasonable to skip "exit tracing" of
> "skipped" syscall.
> (otherwise, "fake" seems to be more appropriate :)

The audit entry hook will oops if you call it twice in a row without
calling the exit hook in between.  I can also imagine ptracers getting
confused if ptrace entry and exit don't line up.

What happens if user code directly issues syscall ~0?  Does the return
value register get set?  Is the behavior different between traced and
untraced syscalls?  The current approach seems a bit scary.

--Andy
AKASHI Takahiro July 25, 2014, 10:36 a.m. UTC | #9
On 07/25/2014 12:01 AM, Andy Lutomirski wrote:
> On Jul 23, 2014 10:57 PM, "AKASHI Takahiro" <takahiro.akashi@linaro.org> wrote:
>>
>> On 07/24/2014 12:54 PM, Andy Lutomirski wrote:
>>>
>>> On 07/22/2014 02:14 AM, AKASHI Takahiro wrote:
>>>>
>>>> Arm64 holds a syscall number in w8(x8) register. Ptrace tracer may change
>>>> its value either to:
>>>>     * any valid syscall number to alter a system call, or
>>>>     * -1 to skip a system call
>>>>
>>>> This patch implements this behavior by reloading that value into syscallno
>>>> in struct pt_regs after tracehook_report_syscall_entry() or
>>>> secure_computing(). In case of '-1', a return value of system call can also
>>>> be changed by the tracer setting the value to x0 register, and so
>>>> sys_ni_nosyscall() should not be called.
>>>>
>>>> See also:
>>>>       42309ab4, ARM: 8087/1: ptrace: reload syscall number after
>>>>            secure_computing() check
>>>>
>>>> Signed-off-by: AKASHI Takahiro <takahiro.akashi@linaro.org>
>>>> ---
>>>>    arch/arm64/kernel/entry.S  |    2 ++
>>>>    arch/arm64/kernel/ptrace.c |   13 +++++++++++++
>>>>    2 files changed, 15 insertions(+)
>>>>
>>>> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
>>>> index 5141e79..de8bdbc 100644
>>>> --- a/arch/arm64/kernel/entry.S
>>>> +++ b/arch/arm64/kernel/entry.S
>>>> @@ -628,6 +628,8 @@ ENDPROC(el0_svc)
>>>>    __sys_trace:
>>>>        mov    x0, sp
>>>>        bl    syscall_trace_enter
>>>> +    cmp    w0, #-1                // skip syscall?
>>>> +    b.eq    ret_to_user
>>>
>>>
>>> Does this mean that skipped syscalls will cause exit tracing to be skipped?
>>
>>
>> Yes. (and I guess yes on arm, too)
>>
>>
>>> If so, then you risk (at least) introducing
>>>
>>> a nice user-triggerable OOPS if audit is enabled.
>>
>>
>> Can you please elaborate this?
>> Since I didn't find any definition of audit's behavior when syscall is
>> rewritten to -1, I thought it is reasonable to skip "exit tracing" of
>> "skipped" syscall.
>> (otherwise, "fake" seems to be more appropriate :)
>
> The audit entry hook will oops if you call it twice in a row without
> calling the exit hook in between.

Thank you, I could reproduce this problem which hits BUG(in_syscall) in
audit_syscall_entry(). Really bad, and I fixed it in my next version and
now a "skipped" system call is also traced by audit.

I ran libseccomp test and Kees' test under auditd running with a rule,
	auditctl -a exit,always -S all
and all the tests seemed to pass.

   I can also imagine ptracers getting
> confused if ptrace entry and exit don't line up.

FYI, on arm64, we can distinguish syscall enter/exit with x12 register.

> What happens if user code directly issues syscall ~0?  Does the return
> value register get set?  Is the behavior different between traced and
> untraced syscalls?

Interesting cases. Let me think about it.

Thanks,
-Takahiro AKASHI

>  The current approach seems a bit scary.
>
> --Andy
>
Will Deacon July 25, 2014, 11:03 a.m. UTC | #10
On Fri, Jul 25, 2014 at 11:36:49AM +0100, AKASHI Takahiro wrote:
> On 07/25/2014 12:01 AM, Andy Lutomirski wrote:
> >>> If so, then you risk (at least) introducing
> >>>
> >>> a nice user-triggerable OOPS if audit is enabled.
> >>
> >>
> >> Can you please elaborate this?
> >> Since I didn't find any definition of audit's behavior when syscall is
> >> rewritten to -1, I thought it is reasonable to skip "exit tracing" of
> >> "skipped" syscall.
> >> (otherwise, "fake" seems to be more appropriate :)
> >
> > The audit entry hook will oops if you call it twice in a row without
> > calling the exit hook in between.
> 
> Thank you, I could reproduce this problem which hits BUG(in_syscall) in
> audit_syscall_entry(). Really bad, and I fixed it in my next version and
> now a "skipped" system call is also traced by audit.

Can you reproduce this on arch/arm/ too? If so, we should also fix the code
there.

Will
AKASHI Takahiro July 29, 2014, 6:49 a.m. UTC | #11
On 07/25/2014 08:03 PM, Will Deacon wrote:
> On Fri, Jul 25, 2014 at 11:36:49AM +0100, AKASHI Takahiro wrote:
>> On 07/25/2014 12:01 AM, Andy Lutomirski wrote:
>>>>> If so, then you risk (at least) introducing
>>>>>
>>>>> a nice user-triggerable OOPS if audit is enabled.
>>>>
>>>>
>>>> Can you please elaborate this?
>>>> Since I didn't find any definition of audit's behavior when syscall is
>>>> rewritten to -1, I thought it is reasonable to skip "exit tracing" of
>>>> "skipped" syscall.
>>>> (otherwise, "fake" seems to be more appropriate :)
>>>
>>> The audit entry hook will oops if you call it twice in a row without
>>> calling the exit hook in between.
>>
>> Thank you, I could reproduce this problem which hits BUG(in_syscall) in
>> audit_syscall_entry(). Really bad, and I fixed it in my next version and
>> now a "skipped" system call is also traced by audit.
>
> Can you reproduce this on arch/arm/ too? If so, we should also fix the code
> there.

As far as I tried on arm with syscall auditing enabled,

1) Changing a syscall number to -1 under seccomp doesn't hit BUG_ON(in_syscall).
2) But, in fact, audit_syscall_entry() is NOT called in this case because
    __secure_computing() returns -1 and then it causes the succeeding tracing
    in syscall_trace_enter(), including audit_syscall_entry(), skipped.
3) On the other hand, calling syscall(-1) from userspace hits BUG_ON because
    the return path, ret_slow_syscall, doesn't contain syscall_trace_exit().
4) When we re-write a syscall number to -1 without seccomp, we will also see
    BUG_ON hit, although I didn't try yet.

Fixing case 3 is easy, but should we also fix case 2?

Please note that, even if we call audit_syscall_exit() in case 2 or 3, no log against
syscall -1 will be recorded because audit_filter_syscall() doesn't allow logging
for any syscall number which is greater than 2048.
This behavior was introduced by Andy's patch, a3c54931, in v3.16-rc.
If the intention of "-1" is to fake a system call, this behavior seems to be a bit odd.

Thanks,
-Takahiro AKASHI


> Will
>
Will Deacon July 29, 2014, 1:26 p.m. UTC | #12
On Tue, Jul 29, 2014 at 07:49:47AM +0100, AKASHI Takahiro wrote:
> On 07/25/2014 08:03 PM, Will Deacon wrote:
> > On Fri, Jul 25, 2014 at 11:36:49AM +0100, AKASHI Takahiro wrote:
> >> On 07/25/2014 12:01 AM, Andy Lutomirski wrote:
> >>>>> If so, then you risk (at least) introducing
> >>>>>
> >>>>> a nice user-triggerable OOPS if audit is enabled.
> >>>>
> >>>>
> >>>> Can you please elaborate this?
> >>>> Since I didn't find any definition of audit's behavior when syscall is
> >>>> rewritten to -1, I thought it is reasonable to skip "exit tracing" of
> >>>> "skipped" syscall.
> >>>> (otherwise, "fake" seems to be more appropriate :)
> >>>
> >>> The audit entry hook will oops if you call it twice in a row without
> >>> calling the exit hook in between.
> >>
> >> Thank you, I could reproduce this problem which hits BUG(in_syscall) in
> >> audit_syscall_entry(). Really bad, and I fixed it in my next version and
> >> now a "skipped" system call is also traced by audit.
> >
> > Can you reproduce this on arch/arm/ too? If so, we should also fix the code
> > there.
> 
> As far as I tried on arm with syscall auditing enabled,
> 
> 1) Changing a syscall number to -1 under seccomp doesn't hit BUG_ON(in_syscall).
> 2) But, in fact, audit_syscall_entry() is NOT called in this case because
>     __secure_computing() returns -1 and then it causes the succeeding tracing
>     in syscall_trace_enter(), including audit_syscall_entry(), skipped.

What happens if CONFIG_SECCOMP=n?

> 3) On the other hand, calling syscall(-1) from userspace hits BUG_ON because
>     the return path, ret_slow_syscall, doesn't contain syscall_trace_exit().
> 4) When we re-write a syscall number to -1 without seccomp, we will also see
>     BUG_ON hit, although I didn't try yet.
> 
> Fixing case 3 is easy, but should we also fix case 2?

I think so.

Will
diff mbox

Patch

diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 5141e79..de8bdbc 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -628,6 +628,8 @@  ENDPROC(el0_svc)
 __sys_trace:
 	mov	x0, sp
 	bl	syscall_trace_enter
+	cmp	w0, #-1				// skip syscall?
+	b.eq	ret_to_user
 	adr	lr, __sys_trace_return		// return address
 	uxtw	scno, w0			// syscall number (possibly new)
 	mov	x1, sp				// pointer to regs
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index 70526cf..100d7d1 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -21,6 +21,7 @@ 
 
 #include <linux/audit.h>
 #include <linux/compat.h>
+#include <linux/errno.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
@@ -1109,9 +1110,21 @@  static void tracehook_report_syscall(struct pt_regs *regs,
 
 asmlinkage int syscall_trace_enter(struct pt_regs *regs)
 {
+	unsigned long saved_x0, saved_x8;
+
+	saved_x0 = regs->regs[0];
+	saved_x8 = regs->regs[8];
+
 	if (test_thread_flag(TIF_SYSCALL_TRACE))
 		tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER);
 
+	regs->syscallno = regs->regs[8];
+	if ((long)regs->syscallno == ~0UL) { /* skip this syscall */
+		regs->regs[8] = saved_x8;
+		if (regs->regs[0] == saved_x0) /* not changed by user */
+			regs->regs[0] = -ENOSYS;
+	}
+
 	if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
 		trace_sys_enter(regs, regs->syscallno);