diff mbox

Re: [PATCH v10 2/3] arm/syscalls: Check address limit on user-mode return

Message ID CAJcbSZEr8HPBwH1oVaHqPzAY4MS_=yqMoqPhcauuKu3cikB3uQ@mail.gmail.com
State New, archived
Headers show

Commit Message

Thomas Garnier July 18, 2017, 4:04 p.m. UTC
On Tue, Jul 18, 2017 at 7:36 AM, Leonard Crestez
<leonard.crestez@nxp.com> wrote:
> On Wed, 2017-06-14 at 18:12 -0700, Thomas Garnier wrote:
>> Ensure the address limit is a user-mode segment before returning to
>> user-mode. Otherwise a process can corrupt kernel-mode memory and
>> elevate privileges [1].
>>
>> The set_fs function sets the TIF_SETFS flag to force a slow path on
>> return. In the slow path, the address limit is checked to be USER_DS if
>> needed.
>>
>> The TIF_SETFS flag is added to _TIF_WORK_MASK shifting _TIF_SYSCALL_WORK
>> for arm instruction immediate support. The global work mask is too big
>> to used on a single instruction so adapt ret_fast_syscall.
>>
>> [1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990
>>
>> Signed-off-by: Thomas Garnier <thgarnie@google.com>
>> ---
>> v10 redesigns the change to use work flags on set_fs as recommended by
>> Linus and agreed by others.
>>
>> Based on next-20170609
>> ---
>>  arch/arm/include/asm/thread_info.h | 15 +++++++++------
>>  arch/arm/include/asm/uaccess.h     |  2 ++
>>  arch/arm/kernel/entry-common.S     |  9 +++++++--
>>  arch/arm/kernel/signal.c           |  5 +++++
>>  4 files changed, 23 insertions(+), 8 deletions(-)
>>
>> diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
>> index 776757d1604a..1d468b527b7b 100644
>> --- a/arch/arm/include/asm/thread_info.h
>> +++ b/arch/arm/include/asm/thread_info.h
>> @@ -139,10 +139,11 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
>>  #define TIF_NEED_RESCHED     1       /* rescheduling necessary */
>>  #define TIF_NOTIFY_RESUME    2       /* callback before returning to user */
>>  #define TIF_UPROBE           3       /* breakpointed or singlestepping */
>> -#define TIF_SYSCALL_TRACE    4       /* syscall trace active */
>> -#define TIF_SYSCALL_AUDIT    5       /* syscall auditing active */
>> -#define TIF_SYSCALL_TRACEPOINT       6       /* syscall tracepoint instrumentation */
>> -#define TIF_SECCOMP          7       /* seccomp syscall filtering active */
>> +#define TIF_FSCHECK          4       /* Check FS is USER_DS on return */
>> +#define TIF_SYSCALL_TRACE    5       /* syscall trace active */
>> +#define TIF_SYSCALL_AUDIT    6       /* syscall auditing active */
>> +#define TIF_SYSCALL_TRACEPOINT       7       /* syscall tracepoint instrumentation */
>> +#define TIF_SECCOMP          8       /* seccomp syscall filtering active */
>>
>>  #define TIF_NOHZ             12      /* in adaptive nohz mode */
>>  #define TIF_USING_IWMMXT     17
>> @@ -153,6 +154,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
>>  #define _TIF_NEED_RESCHED    (1 << TIF_NEED_RESCHED)
>>  #define _TIF_NOTIFY_RESUME   (1 << TIF_NOTIFY_RESUME)
>>  #define _TIF_UPROBE          (1 << TIF_UPROBE)
>> +#define _TIF_FSCHECK         (1 << TIF_FSCHECK)
>>  #define _TIF_SYSCALL_TRACE   (1 << TIF_SYSCALL_TRACE)
>>  #define _TIF_SYSCALL_AUDIT   (1 << TIF_SYSCALL_AUDIT)
>>  #define _TIF_SYSCALL_TRACEPOINT      (1 << TIF_SYSCALL_TRACEPOINT)
>> @@ -166,8 +168,9 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
>>  /*
>>   * Change these and you break ASM code in entry-common.S
>>   */
>> -#define _TIF_WORK_MASK               (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
>> -                              _TIF_NOTIFY_RESUME | _TIF_UPROBE)
>> +#define _TIF_WORK_MASK               (_TIF_NEED_RESCHED | _TIF_SIGPENDING |  \
>> +                              _TIF_NOTIFY_RESUME | _TIF_UPROBE |     \
>> +                              _TIF_FSCHECK)
>>
>>  #endif /* __KERNEL__ */
>>  #endif /* __ASM_ARM_THREAD_INFO_H */
>> diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h
>> index 2577405d082d..6cc882223e34 100644
>> --- a/arch/arm/include/asm/uaccess.h
>> +++ b/arch/arm/include/asm/uaccess.h
>> @@ -77,6 +77,8 @@ static inline void set_fs(mm_segment_t fs)
>>  {
>>       current_thread_info()->addr_limit = fs;
>>       modify_domain(DOMAIN_KERNEL, fs ? DOMAIN_CLIENT : DOMAIN_MANAGER);
>> +     /* On user-mode return, check fs is correct */
>> +     set_thread_flag(TIF_FSCHECK);
>>  }
>>
>>  #define segment_eq(a, b)     ((a) == (b))
>> diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
>> index eb5cd77bf1d8..e33c32d56193 100644
>> --- a/arch/arm/kernel/entry-common.S
>> +++ b/arch/arm/kernel/entry-common.S
>> @@ -41,7 +41,9 @@ ret_fast_syscall:
>>   UNWIND(.cantunwind  )
>>       disable_irq_notrace                     @ disable interrupts
>>       ldr     r1, [tsk, #TI_FLAGS]            @ re-check for syscall tracing
>> -     tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
>> +     tst     r1, #_TIF_SYSCALL_WORK
>> +     bne     fast_work_pending
>> +     tst     r1, #_TIF_WORK_MASK
>>       bne     fast_work_pending
>>
>>       /* perform architecture specific actions before user return */
>> @@ -67,12 +69,15 @@ ret_fast_syscall:
>>       str     r0, [sp, #S_R0 + S_OFF]!        @ save returned r0
>>       disable_irq_notrace                     @ disable interrupts
>>       ldr     r1, [tsk, #TI_FLAGS]            @ re-check for syscall tracing
>> -     tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
>> +     tst     r1, #_TIF_SYSCALL_WORK
>> +     bne     fast_work_pending
>> +     tst     r1, #_TIF_WORK_MASK
>>       beq     no_work_pending
>>   UNWIND(.fnend               )
>>  ENDPROC(ret_fast_syscall)
>>
>>       /* Slower path - fall through to work_pending */
>> +fast_work_pending:
>>  #endif
>>
>>       tst     r1, #_TIF_SYSCALL_WORK
>> diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
>> index 7b8f2141427b..3a48b54c6405 100644
>> --- a/arch/arm/kernel/signal.c
>> +++ b/arch/arm/kernel/signal.c
>> @@ -14,6 +14,7 @@
>>  #include
>>  #include
>>  #include
>> +#include
>>
>>  #include
>>  #include
>> @@ -571,6 +572,10 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
>>        * Update the trace code with the current status.
>>        */
>>       trace_hardirqs_off();
>> +
>> +     /* Check valid user FS if needed */
>> +     addr_limit_user_check();
>> +
>>       do {
>>               if (likely(thread_flags & _TIF_NEED_RESCHED)) {
>>                       schedule();
>
> This patch made it's way into linux-next next-20170717 and it seems to
> cause hangs when booting some boards over NFS (found via bisection). I
> don't know exactly what determines the issue but I can reproduce hangs
> if even if I just boot with init=/bin/bash and do stuff like
>
> # sleep 1 & sleep 1 & sleep 1 & wait; wait; wait; echo done!
>
> When this happens sysrq-t shows a sleep task hung in the 'R' state
> spinning in do_work_pending, so maybe there is a potential infinite
> loop here?
>
> The addr_limit_user_check at the start of do_work_pending will check
> for TIF_FSCHECK once and clear it but the function loops while
> (thread_flags & _TIF_WORK_MASK), so it if TIF_FSCHECK is set again then
> the loop will never terminate. Does this make sense?

Yes, it does. Thanks for looking into this.

>
> I added some instrumentation to check if TIF_FSCHECK can show up during
> the do_work_pending loop and the answer seems to be yes. I also tried
> to get a stack with a set_fs call from inside do_work_pending and got
> the following:
>
> [  227.582402] CPU: 0 PID: 829 Comm: sleep Not tainted 4.12.0-01057-g93af8f7-dirty #332
> [  227.590171] Hardware name: Freescale i.MX6 SoloLite (Device Tree)
> [  227.596275] Backtrace:
> [  227.598754] [<c010cbb4>] (dump_backtrace) from [<c010ce60>] (show_stack+0x18/0x1c)
> [  227.606339]  r7:00000000 r6:60070113 r5:00000000 r4:c105a958
> [  227.612016] [<c010ce48>] (show_stack) from [<c0493498>] (dump_stack+0xb4/0xe8)
> [  227.619258] [<c04933e4>] (dump_stack) from [<c010c350>] (mydbg_set_fs+0x40/0x48)
> [  227.626671]  r9:c08cf35c r8:ee1cda7c r7:ee1e3dce r6:bf000000 r5:00000000 r4:ffffe000
> [  227.634433] [<c010c310>] (mydbg_set_fs) from [<c021f0b8>] (__probe_kernel_read+0x44/0xd0)
> [  227.642629] [<c021f074>] (__probe_kernel_read) from [<c011b8d8>] (do_alignment+0x8c/0x75c)
> [  227.650909]  r10:ef085000 r9:c08cf35c r8:00000001 r7:ee1e3dce r6:c011b84c r5:ee1cdbe0
> [  227.658748]  r4:00000000 r3:00000000
> [  227.662338] [<c011b84c>] (do_alignment) from [<c0101394>] (do_DataAbort+0x40/0xc0)
> [  227.669921]  r10:ef085000 r9:ee1cc000 r8:ee1cdbe0 r7:ee1e3dce r6:c011b84c r5:00000001
> [  227.677760]  r4:c100dd3c
> [  227.680308] [<c0101354>] (do_DataAbort) from [<c010da44>] (__dabt_svc+0x64/0xa0)
> [  227.687714] Exception stack(0xee1cdbe0 to 0xee1cdc28)
> [  227.692780] dbe0: 9064a8c0 ee1e3de2 d82727d8 00000000 ee1b20c0 ee1e3dce 00000000 ef08572c
> [  227.700971] dc00: c0bb2034 c10c75ea ef085000 ee1cdc74 ee1cdc00 ee1cdc30 c01761a8 c08cf35c
> [  227.709158] dc20: 40070113 ffffffff
> [  227.712661]  r8:c0bb2034 r7:ee1cdc14 r6:ffffffff r5:40070113 r4:c08cf35c
> [  227.719382] [<c08cf16c>] (inet_gro_receive) from [<c084a8ec>] (dev_gro_receive+0x2f0/0x618)
> [  227.727746]  r10:ef085000 r9:00000001 r8:00000000 r7:ef085710 r6:c1008b88 r5:ee1b20c0
> [  227.735585]  r4:c1009f78
> [  227.738132] [<c084a5fc>] (dev_gro_receive) from [<c084ac8c>] (napi_gro_receive+0x78/0x1f4)
> [  227.746410]  r10:ef085000 r9:00000001 r8:c10d15ec r7:c100792c r6:ef085710 r5:c10c744e
> [  227.754249]  r4:ee1b20c0
> [  227.756801] [<c084ac14>] (napi_gro_receive) from [<c06a2784>] (fec_enet_rx_napi+0x39c/0x988)
> [  227.765253]  r9:00000001 r8:f0c8a960 r7:00000000 r6:00000000 r5:ef086000 r4:ee1b20c0
> [  227.773010] [<c06a23e8>] (fec_enet_rx_napi) from [<c084a3a4>] (net_rx_action+0x21c/0x474)
> [  227.781201]  r10:ee1cdd78 r9:c0fa7b80 r8:ef7dab80 r7:0000012c r6:00000040 r5:00000001
> [  227.789039]  r4:ef085710
> [  227.791593] [<c084a188>] (net_rx_action) from [<c012f2d4>] (__do_softirq+0x158/0x534)
> [  227.799437]  r10:00000008 r9:ee1cc000 r8:c10ce568 r7:c100792c r6:c10247bd r5:00000003
> [  227.807275]  r4:c100208c
> [  227.809824] [<c012f17c>] (__do_softirq) from [<c012fa68>] (irq_exit+0xec/0x168)
> [  227.817147]  r10:c1007ea0 r9:ef010400 r8:00000001 r7:00000000 r6:c1007d3c r5:00000000
> [  227.824984]  r4:c0fa534c
> [  227.827534] [<c012f97c>] (irq_exit) from [<c01883f4>] (__handle_domain_irq+0x74/0xe8)
> [  227.835377] [<c0188380>] (__handle_domain_irq) from [<c01015fc>] (gic_handle_irq+0x58/0xbc)
> [  227.843742]  r9:f080b100 r8:c105ae80 r7:ee1cde80 r6:000003ff r5:000003eb r4:f080b10c
> [  227.851498] [<c01015a4>] (gic_handle_irq) from [<c010daf0>] (__irq_svc+0x70/0x98)
> [  227.858990] Exception stack(0xee1cde80 to 0xee1cdec8)
> [  227.864056] de80: ee7a1140 00000001 00000000 000012a9 ee7a1140 ee9d9f10 ee76edc0 ee9d9f60
> [  227.872248] dea0: 00000000 ee9d9f10 00000010 ee1cdeec ee1cdeb8 ee1cded0 c038a77c c0389688
> [  227.880434] dec0: 60070013 ffffffff
> [  227.883937]  r10:00000010 r9:ee1cc000 r8:00000000 r7:ee1cdeb4 r6:ffffffff r5:60070013
> [  227.891775]  r4:c0389688
> [  227.894327] [<c038a6f8>] (nfs_file_clear_open_context) from [<c03860e8>] (nfs_file_release+0x54/0x60)
> [  227.903558]  r7:ee9a78a0 r6:ee68f010 r5:ee9d9f10 r4:ee76edc0
> [  227.909235] [<c0386094>] (nfs_file_release) from [<c0276cb4>] (__fput+0x94/0x1e0)
> [  227.916734] [<c0276c20>] (__fput) from [<c0276e60>] (____fput+0x10/0x14)
> [  227.923448]  r10:c10d4298 r9:00000000 r8:00000000 r7:ef2ed780 r6:ef2edc00 r5:c10d5180
> [  227.931286]  r4:ef2edbd4
> [  227.933839] [<c0276e50>] (____fput) from [<c014c534>] (task_work_run+0xc8/0xec)
> [  227.941166] [<c014c46c>] (task_work_run) from [<c010c484>] (do_work_pending+0x12c/0x1c4)
> [  227.949271]  r9:ee1cdfb0 r8:00000000 r7:00000000 r6:ee1cc000 r5:00000000 r4:00000000
> [  227.957029] [<c010c358>] (do_work_pending) from [<c0107c90>] (slow_work_pending+0xc/0x20)
> [  227.965219]  r10:00000000 r9:ee1cc000 r8:c0107e24 r7:0000005b r6:b6f76568 r5:b6f741f0
> [  227.973058]  r4:b6f76904
>
> Maybe the reason this reproduces easily in this particular setup is
> that ethernet causes lots of alignment faults?

Can you try this change?

  return 0;

>
> --
> Regards,
> Leonard

Comments

Leonard Crestez July 18, 2017, 5:18 p.m. UTC | #1
On Tue, 2017-07-18 at 09:04 -0700, Thomas Garnier wrote:
> On Tue, Jul 18, 2017 at 7:36 AM, Leonard Crestez <leonard.crestez@nxp.com> wrote:
> > 
> > On Wed, 2017-06-14 at 18:12 -0700, Thomas Garnier wrote:
> > > 
> > > Ensure the address limit is a user-mode segment before returning to
> > > user-mode. Otherwise a process can corrupt kernel-mode memory and
> > > elevate privileges [1].
> > > 
> > > The set_fs function sets the TIF_SETFS flag to force a slow path on
> > > return. In the slow path, the address limit is checked to be USER_DS if
> > > needed.
> > > 
> > > The TIF_SETFS flag is added to _TIF_WORK_MASK shifting _TIF_SYSCALL_WORK
> > > for arm instruction immediate support. The global work mask is too big
> > > to used on a single instruction so adapt ret_fast_syscall.
> > > 
> > > @@ -571,6 +572,10 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
> > >        * Update the trace code with the current status.
> > >        */
> > >       trace_hardirqs_off();
> > > +
> > > +     /* Check valid user FS if needed */
> > > +     addr_limit_user_check();
> > > +
> > >       do {
> > >               if (likely(thread_flags & _TIF_NEED_RESCHED)) {
> > >                       schedule();
> > This patch made it's way into linux-next next-20170717 and it seems to
> > cause hangs when booting some boards over NFS (found via bisection). I
> > don't know exactly what determines the issue but I can reproduce hangs
> > if even if I just boot with init=/bin/bash and do stuff like
> > 
> > # sleep 1 & sleep 1 & sleep 1 & wait; wait; wait; echo done!
> > 
> > When this happens sysrq-t shows a sleep task hung in the 'R' state
> > spinning in do_work_pending, so maybe there is a potential infinite
> > loop here?
> > 
> > The addr_limit_user_check at the start of do_work_pending will check
> > for TIF_FSCHECK once and clear it but the function loops while
> > (thread_flags & _TIF_WORK_MASK), so it if TIF_FSCHECK is set again then
> > the loop will never terminate. Does this make sense?

> Yes, it does. Thanks for looking into this.

> > I added some instrumentation to check if TIF_FSCHECK can show up during
> > the do_work_pending loop and the answer seems to be yes. I also tried
> > to get a stack with a set_fs call from inside do_work_pending and got
> > the following:
> > 
> > [  227.582402] CPU: 0 PID: 829 Comm: sleep Not tainted 4.12.0-01057-g93af8f7-dirty #332
> > [  227.590171] Hardware name: Freescale i.MX6 SoloLite (Device Tree)
> > [  227.596275] Backtrace:
> > [  227.598754] [] (dump_backtrace) from [] (show_stack+0x18/0x1c)
> > [  227.606339]  r7:00000000 r6:60070113 r5:00000000 r4:c105a958
> > [  227.612016] [] (show_stack) from [] (dump_stack+0xb4/0xe8)
> > [  227.619258] [] (dump_stack) from [] (mydbg_set_fs+0x40/0x48)
> > [  227.626671]  r9:c08cf35c r8:ee1cda7c r7:ee1e3dce r6:bf000000 r5:00000000 r4:ffffe000
> > [  227.634433] [] (mydbg_set_fs) from [] (__probe_kernel_read+0x44/0xd0)
> > [  227.642629] [] (__probe_kernel_read) from [] (do_alignment+0x8c/0x75c)
> > [  227.650909]  r10:ef085000 r9:c08cf35c r8:00000001 r7:ee1e3dce r6:c011b84c r5:ee1cdbe0
> > [  227.658748]  r4:00000000 r3:00000000
> > [  227.662338] [] (do_alignment) from [] (do_DataAbort+0x40/0xc0)
> > [  227.669921]  r10:ef085000 r9:ee1cc000 r8:ee1cdbe0 r7:ee1e3dce r6:c011b84c r5:00000001
> > [  227.677760]  r4:c100dd3c
> > [  227.680308] [] (do_DataAbort) from [] (__dabt_svc+0x64/0xa0)
> > [  227.687714] Exception stack(0xee1cdbe0 to 0xee1cdc28)
> > [  227.692780] dbe0: 9064a8c0 ee1e3de2 d82727d8 00000000 ee1b20c0 ee1e3dce 00000000 ef08572c
> > [  227.700971] dc00: c0bb2034 c10c75ea ef085000 ee1cdc74 ee1cdc00 ee1cdc30 c01761a8 c08cf35c
> > [  227.709158] dc20: 40070113 ffffffff
> > [  227.712661]  r8:c0bb2034 r7:ee1cdc14 r6:ffffffff r5:40070113 r4:c08cf35c
> > [  227.719382] [] (inet_gro_receive) from [] (dev_gro_receive+0x2f0/0x618)
> > [  227.727746]  r10:ef085000 r9:00000001 r8:00000000 r7:ef085710 r6:c1008b88 r5:ee1b20c0
> > [  227.735585]  r4:c1009f78
> > [  227.738132] [] (dev_gro_receive) from [] (napi_gro_receive+0x78/0x1f4)
> > [  227.746410]  r10:ef085000 r9:00000001 r8:c10d15ec r7:c100792c r6:ef085710 r5:c10c744e
> > [  227.754249]  r4:ee1b20c0
> > [  227.756801] [] (napi_gro_receive) from [] (fec_enet_rx_napi+0x39c/0x988)
> > [  227.765253]  r9:00000001 r8:f0c8a960 r7:00000000 r6:00000000 r5:ef086000 r4:ee1b20c0
> > [  227.773010] [] (fec_enet_rx_napi) from [] (net_rx_action+0x21c/0x474)
> > [  227.781201]  r10:ee1cdd78 r9:c0fa7b80 r8:ef7dab80 r7:0000012c r6:00000040 r5:00000001
> > [  227.789039]  r4:ef085710
> > [  227.791593] [] (net_rx_action) from [] (__do_softirq+0x158/0x534)
> > [  227.799437]  r10:00000008 r9:ee1cc000 r8:c10ce568 r7:c100792c r6:c10247bd r5:00000003
> > [  227.807275]  r4:c100208c
> > [  227.809824] [] (__do_softirq) from [] (irq_exit+0xec/0x168)
> > [  227.817147]  r10:c1007ea0 r9:ef010400 r8:00000001 r7:00000000 r6:c1007d3c r5:00000000
> > [  227.824984]  r4:c0fa534c
> > [  227.827534] [] (irq_exit) from [] (__handle_domain_irq+0x74/0xe8)
> > [  227.835377] [] (__handle_domain_irq) from [] (gic_handle_irq+0x58/0xbc)
> > [  227.843742]  r9:f080b100 r8:c105ae80 r7:ee1cde80 r6:000003ff r5:000003eb r4:f080b10c
> > [  227.851498] [] (gic_handle_irq) from [] (__irq_svc+0x70/0x98)
> > [  227.858990] Exception stack(0xee1cde80 to 0xee1cdec8)
> > [  227.864056] de80: ee7a1140 00000001 00000000 000012a9 ee7a1140 ee9d9f10 ee76edc0 ee9d9f60
> > [  227.872248] dea0: 00000000 ee9d9f10 00000010 ee1cdeec ee1cdeb8 ee1cded0 c038a77c c0389688
> > [  227.880434] dec0: 60070013 ffffffff
> > [  227.883937]  r10:00000010 r9:ee1cc000 r8:00000000 r7:ee1cdeb4 r6:ffffffff r5:60070013
> > [  227.891775]  r4:c0389688
> > [  227.894327] [] (nfs_file_clear_open_context) from [] (nfs_file_release+0x54/0x60)
> > [  227.903558]  r7:ee9a78a0 r6:ee68f010 r5:ee9d9f10 r4:ee76edc0
> > [  227.909235] [] (nfs_file_release) from [] (__fput+0x94/0x1e0)
> > [  227.916734] [] (__fput) from [] (____fput+0x10/0x14)
> > [  227.923448]  r10:c10d4298 r9:00000000 r8:00000000 r7:ef2ed780 r6:ef2edc00 r5:c10d5180
> > [  227.931286]  r4:ef2edbd4
> > [  227.933839] [] (____fput) from [] (task_work_run+0xc8/0xec)
> > [  227.941166] [] (task_work_run) from [] (do_work_pending+0x12c/0x1c4)
> > [  227.949271]  r9:ee1cdfb0 r8:00000000 r7:00000000 r6:ee1cc000 r5:00000000 r4:00000000
> > [  227.957029] [] (do_work_pending) from [] (slow_work_pending+0xc/0x20)
> > [  227.965219]  r10:00000000 r9:ee1cc000 r8:c0107e24 r7:0000005b r6:b6f76568 r5:b6f741f0
> > [  227.973058]  r4:b6f76904
> > 
> > Maybe the reason this reproduces easily in this particular setup is
> > that ethernet causes lots of alignment faults?
> Can you try this change?
> 
> diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
> index 3a48b54c6405..bc6ad7789568 100644
> --- a/arch/arm/kernel/signal.c
> +++ b/arch/arm/kernel/signal.c
> @@ -573,12 +573,11 @@ do_work_pending(struct pt_regs *regs, unsigned
> int thread_flags, int syscall)
>   */
>   trace_hardirqs_off();
> 
> - /* Check valid user FS if needed */
> - addr_limit_user_check();
> -
>   do {
>   if (likely(thread_flags & _TIF_NEED_RESCHED)) {
>   schedule();
> + } else if (thread_flags & _TIF_FSCHECK) {
> + addr_limit_user_check();
>   } else {
>   if (unlikely(!user_mode(regs)))
>   return 0;

This does seem to work, it no longer hangs on boot in my setup. This is
obviously only a very superficial test.

The new location of this check seems weird, it's not clear why it
should be on an else path. Perhaps it should be moved to right before
where current_thread_info()->flags is fetched again?

The issue seems like it would affect arm64 as well.

If the purpose is hardening against buggy kernel code doing bad set_fs
calls shouldn't this flag also be checked before looking at
TIF_NEED_RESCHED and calling schedule()?

--
Regards,
Leonard
Thomas Garnier July 18, 2017, 7:04 p.m. UTC | #2
On Tue, Jul 18, 2017 at 10:18 AM, Leonard Crestez
<leonard.crestez@nxp.com> wrote:
>
> On Tue, 2017-07-18 at 09:04 -0700, Thomas Garnier wrote:
> > On Tue, Jul 18, 2017 at 7:36 AM, Leonard Crestez <leonard.crestez@nxp.com> wrote:
> > >
> > > On Wed, 2017-06-14 at 18:12 -0700, Thomas Garnier wrote:
> > > >
> > > > Ensure the address limit is a user-mode segment before returning to
> > > > user-mode. Otherwise a process can corrupt kernel-mode memory and
> > > > elevate privileges [1].
> > > >
> > > > The set_fs function sets the TIF_SETFS flag to force a slow path on
> > > > return. In the slow path, the address limit is checked to be USER_DS if
> > > > needed.
> > > >
> > > > The TIF_SETFS flag is added to _TIF_WORK_MASK shifting _TIF_SYSCALL_WORK
> > > > for arm instruction immediate support. The global work mask is too big
> > > > to used on a single instruction so adapt ret_fast_syscall.
> > > >
> > > > @@ -571,6 +572,10 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
> > > >        * Update the trace code with the current status.
> > > >        */
> > > >       trace_hardirqs_off();
> > > > +
> > > > +     /* Check valid user FS if needed */
> > > > +     addr_limit_user_check();
> > > > +
> > > >       do {
> > > >               if (likely(thread_flags & _TIF_NEED_RESCHED)) {
> > > >                       schedule();
> > > This patch made it's way into linux-next next-20170717 and it seems to
> > > cause hangs when booting some boards over NFS (found via bisection). I
> > > don't know exactly what determines the issue but I can reproduce hangs
> > > if even if I just boot with init=/bin/bash and do stuff like
> > >
> > > # sleep 1 & sleep 1 & sleep 1 & wait; wait; wait; echo done!
> > >
> > > When this happens sysrq-t shows a sleep task hung in the 'R' state
> > > spinning in do_work_pending, so maybe there is a potential infinite
> > > loop here?
> > >
> > > The addr_limit_user_check at the start of do_work_pending will check
> > > for TIF_FSCHECK once and clear it but the function loops while
> > > (thread_flags & _TIF_WORK_MASK), so it if TIF_FSCHECK is set again then
> > > the loop will never terminate. Does this make sense?
>
> > Yes, it does. Thanks for looking into this.
>
> > > I added some instrumentation to check if TIF_FSCHECK can show up during
> > > the do_work_pending loop and the answer seems to be yes. I also tried
> > > to get a stack with a set_fs call from inside do_work_pending and got
> > > the following:
> > >
> > > [  227.582402] CPU: 0 PID: 829 Comm: sleep Not tainted 4.12.0-01057-g93af8f7-dirty #332
> > > [  227.590171] Hardware name: Freescale i.MX6 SoloLite (Device Tree)
> > > [  227.596275] Backtrace:
> > > [  227.598754] [] (dump_backtrace) from [] (show_stack+0x18/0x1c)
> > > [  227.606339]  r7:00000000 r6:60070113 r5:00000000 r4:c105a958
> > > [  227.612016] [] (show_stack) from [] (dump_stack+0xb4/0xe8)
> > > [  227.619258] [] (dump_stack) from [] (mydbg_set_fs+0x40/0x48)
> > > [  227.626671]  r9:c08cf35c r8:ee1cda7c r7:ee1e3dce r6:bf000000 r5:00000000 r4:ffffe000
> > > [  227.634433] [] (mydbg_set_fs) from [] (__probe_kernel_read+0x44/0xd0)
> > > [  227.642629] [] (__probe_kernel_read) from [] (do_alignment+0x8c/0x75c)
> > > [  227.650909]  r10:ef085000 r9:c08cf35c r8:00000001 r7:ee1e3dce r6:c011b84c r5:ee1cdbe0
> > > [  227.658748]  r4:00000000 r3:00000000
> > > [  227.662338] [] (do_alignment) from [] (do_DataAbort+0x40/0xc0)
> > > [  227.669921]  r10:ef085000 r9:ee1cc000 r8:ee1cdbe0 r7:ee1e3dce r6:c011b84c r5:00000001
> > > [  227.677760]  r4:c100dd3c
> > > [  227.680308] [] (do_DataAbort) from [] (__dabt_svc+0x64/0xa0)
> > > [  227.687714] Exception stack(0xee1cdbe0 to 0xee1cdc28)
> > > [  227.692780] dbe0: 9064a8c0 ee1e3de2 d82727d8 00000000 ee1b20c0 ee1e3dce 00000000 ef08572c
> > > [  227.700971] dc00: c0bb2034 c10c75ea ef085000 ee1cdc74 ee1cdc00 ee1cdc30 c01761a8 c08cf35c
> > > [  227.709158] dc20: 40070113 ffffffff
> > > [  227.712661]  r8:c0bb2034 r7:ee1cdc14 r6:ffffffff r5:40070113 r4:c08cf35c
> > > [  227.719382] [] (inet_gro_receive) from [] (dev_gro_receive+0x2f0/0x618)
> > > [  227.727746]  r10:ef085000 r9:00000001 r8:00000000 r7:ef085710 r6:c1008b88 r5:ee1b20c0
> > > [  227.735585]  r4:c1009f78
> > > [  227.738132] [] (dev_gro_receive) from [] (napi_gro_receive+0x78/0x1f4)
> > > [  227.746410]  r10:ef085000 r9:00000001 r8:c10d15ec r7:c100792c r6:ef085710 r5:c10c744e
> > > [  227.754249]  r4:ee1b20c0
> > > [  227.756801] [] (napi_gro_receive) from [] (fec_enet_rx_napi+0x39c/0x988)
> > > [  227.765253]  r9:00000001 r8:f0c8a960 r7:00000000 r6:00000000 r5:ef086000 r4:ee1b20c0
> > > [  227.773010] [] (fec_enet_rx_napi) from [] (net_rx_action+0x21c/0x474)
> > > [  227.781201]  r10:ee1cdd78 r9:c0fa7b80 r8:ef7dab80 r7:0000012c r6:00000040 r5:00000001
> > > [  227.789039]  r4:ef085710
> > > [  227.791593] [] (net_rx_action) from [] (__do_softirq+0x158/0x534)
> > > [  227.799437]  r10:00000008 r9:ee1cc000 r8:c10ce568 r7:c100792c r6:c10247bd r5:00000003
> > > [  227.807275]  r4:c100208c
> > > [  227.809824] [] (__do_softirq) from [] (irq_exit+0xec/0x168)
> > > [  227.817147]  r10:c1007ea0 r9:ef010400 r8:00000001 r7:00000000 r6:c1007d3c r5:00000000
> > > [  227.824984]  r4:c0fa534c
> > > [  227.827534] [] (irq_exit) from [] (__handle_domain_irq+0x74/0xe8)
> > > [  227.835377] [] (__handle_domain_irq) from [] (gic_handle_irq+0x58/0xbc)
> > > [  227.843742]  r9:f080b100 r8:c105ae80 r7:ee1cde80 r6:000003ff r5:000003eb r4:f080b10c
> > > [  227.851498] [] (gic_handle_irq) from [] (__irq_svc+0x70/0x98)
> > > [  227.858990] Exception stack(0xee1cde80 to 0xee1cdec8)
> > > [  227.864056] de80: ee7a1140 00000001 00000000 000012a9 ee7a1140 ee9d9f10 ee76edc0 ee9d9f60
> > > [  227.872248] dea0: 00000000 ee9d9f10 00000010 ee1cdeec ee1cdeb8 ee1cded0 c038a77c c0389688
> > > [  227.880434] dec0: 60070013 ffffffff
> > > [  227.883937]  r10:00000010 r9:ee1cc000 r8:00000000 r7:ee1cdeb4 r6:ffffffff r5:60070013
> > > [  227.891775]  r4:c0389688
> > > [  227.894327] [] (nfs_file_clear_open_context) from [] (nfs_file_release+0x54/0x60)
> > > [  227.903558]  r7:ee9a78a0 r6:ee68f010 r5:ee9d9f10 r4:ee76edc0
> > > [  227.909235] [] (nfs_file_release) from [] (__fput+0x94/0x1e0)
> > > [  227.916734] [] (__fput) from [] (____fput+0x10/0x14)
> > > [  227.923448]  r10:c10d4298 r9:00000000 r8:00000000 r7:ef2ed780 r6:ef2edc00 r5:c10d5180
> > > [  227.931286]  r4:ef2edbd4
> > > [  227.933839] [] (____fput) from [] (task_work_run+0xc8/0xec)
> > > [  227.941166] [] (task_work_run) from [] (do_work_pending+0x12c/0x1c4)
> > > [  227.949271]  r9:ee1cdfb0 r8:00000000 r7:00000000 r6:ee1cc000 r5:00000000 r4:00000000
> > > [  227.957029] [] (do_work_pending) from [] (slow_work_pending+0xc/0x20)
> > > [  227.965219]  r10:00000000 r9:ee1cc000 r8:c0107e24 r7:0000005b r6:b6f76568 r5:b6f741f0
> > > [  227.973058]  r4:b6f76904
> > >
> > > Maybe the reason this reproduces easily in this particular setup is
> > > that ethernet causes lots of alignment faults?
> > Can you try this change?
> >
> > diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
> > index 3a48b54c6405..bc6ad7789568 100644
> > --- a/arch/arm/kernel/signal.c
> > +++ b/arch/arm/kernel/signal.c
> > @@ -573,12 +573,11 @@ do_work_pending(struct pt_regs *regs, unsigned
> > int thread_flags, int syscall)
> >   */
> >   trace_hardirqs_off();
> >
> > - /* Check valid user FS if needed */
> > - addr_limit_user_check();
> > -
> >   do {
> >   if (likely(thread_flags & _TIF_NEED_RESCHED)) {
> >   schedule();
> > + } else if (thread_flags & _TIF_FSCHECK) {
> > + addr_limit_user_check();
> >   } else {
> >   if (unlikely(!user_mode(regs)))
> >   return 0;
>
> This does seem to work, it no longer hangs on boot in my setup. This is
> obviously only a very superficial test.
>
> The new location of this check seems weird, it's not clear why it
> should be on an else path. Perhaps it should be moved to right before
> where current_thread_info()->flags is fetched again?

I was hitting bug when I tried that.I think that's because you
basically let the signal handler do pending work before you check the
flag, that's not a good idea.

>
> The issue seems like it would affect arm64 as well.

Yes, I will propose a fix on each architecture.

>
> If the purpose is hardening against buggy kernel code doing bad set_fs
> calls shouldn't this flag also be checked before looking at
> TIF_NEED_RESCHED and calling schedule()?

I am not sure to be honest. I expected schedule to only schedule the
processor to another task which would be fine given only the current
task have a bogus fs. I will put it first in case there is an edge
case scenario I missed.

What do you think? Let me know and I will look at changes all
architectures and testing them.

Thanks!

>
> --
> Regards,
> Leonard
Leonard Crestez July 19, 2017, 2:58 p.m. UTC | #3
On Tue, 2017-07-18 at 12:04 -0700, Thomas Garnier wrote:
> On Tue, Jul 18, 2017 at 10:18 AM, Leonard Crestez <leonard.crestez@nxp.com> wrote:
> > On Tue, 2017-07-18 at 09:04 -0700, Thomas Garnier wrote:
> > > On Tue, Jul 18, 2017 at 7:36 AM, Leonard Crestez <leonard.crestez@nxp.com> wrote:
> > > > On Wed, 2017-06-14 at 18:12 -0700, Thomas Garnier wrote:
> > > > > 
> > > > > Ensure the address limit is a user-mode segment before returning to
> > > > > user-mode. Otherwise a process can corrupt kernel-mode memory and
> > > > > elevate privileges [1].
> > > > > 
> > > > > The set_fs function sets the TIF_SETFS flag to force a slow path on
> > > > > return. In the slow path, the address limit is checked to be USER_DS if
> > > > > needed.
> > > > > 
> > > > > The TIF_SETFS flag is added to _TIF_WORK_MASK shifting _TIF_SYSCALL_WORK
> > > > > for arm instruction immediate support. The global work mask is too big
> > > > > to used on a single instruction so adapt ret_fast_syscall.
> > > > > 
> > > > > @@ -571,6 +572,10 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
> > > > >        * Update the trace code with the current status.
> > > > >        */
> > > > >       trace_hardirqs_off();
> > > > > +
> > > > > +     /* Check valid user FS if needed */
> > > > > +     addr_limit_user_check();
> > > > > +
> > > > >       do {
> > > > >               if (likely(thread_flags & _TIF_NEED_RESCHED)) {
> > > > >                       schedule();
> > > > This patch made it's way into linux-next next-20170717 and it seems to
> > > > cause hangs when booting some boards over NFS (found via bisection). I
> > > > don't know exactly what determines the issue but I can reproduce hangs
> > > > if even if I just boot with init=/bin/bash and do stuff like
> > > > 
> > > > # sleep 1 & sleep 1 & sleep 1 & wait; wait; wait; echo done!
> > > > 
> > > > When this happens sysrq-t shows a sleep task hung in the 'R' state
> > > > spinning in do_work_pending, so maybe there is a potential infinite
> > > > loop here?
> > > > 
> > > > The addr_limit_user_check at the start of do_work_pending will check
> > > > for TIF_FSCHECK once and clear it but the function loops while
> > > > (thread_flags & _TIF_WORK_MASK), so it if TIF_FSCHECK is set again then
> > > > the loop will never terminate. Does this make sense?
> > > 
> > > Yes, it does. Thanks for looking into this.
> > > 
> > > Can you try this change?
> > > 
> > > diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
> > > index 3a48b54c6405..bc6ad7789568 100644
> > > --- a/arch/arm/kernel/signal.c
> > > +++ b/arch/arm/kernel/signal.c
> > > @@ -573,12 +573,11 @@ do_work_pending(struct pt_regs *regs, unsigned
> > > int thread_flags, int syscall)
> > >   */
> > >   trace_hardirqs_off();
> > > 
> > > - /* Check valid user FS if needed */
> > > - addr_limit_user_check();
> > > -
> > >   do {
> > >   if (likely(thread_flags & _TIF_NEED_RESCHED)) {
> > >   schedule();
> > > + } else if (thread_flags & _TIF_FSCHECK) {
> > > + addr_limit_user_check();
> > >   } else {
> > >   if (unlikely(!user_mode(regs)))
> > >   return 0;
> > This does seem to work, it no longer hangs on boot in my setup. This is
> > obviously only a very superficial test.
> > 
> > The new location of this check seems weird, it's not clear why it
> > should be on an else path. Perhaps it should be moved to right before
> > where current_thread_info()->flags is fetched again?

> I was hitting bug when I tried that.I think that's because you
> basically let the signal handler do pending work before you check the
> flag, that's not a good idea.

> > If the purpose is hardening against buggy kernel code doing bad set_fs
> > calls shouldn't this flag also be checked before looking at
> > TIF_NEED_RESCHED and calling schedule()?
> I am not sure to be honest. I expected schedule to only schedule the
> processor to another task which would be fine given only the current
> task have a bogus fs. I will put it first in case there is an edge
> case scenario I missed.
> 
> What do you think? Let me know and I will look at changes all
> architectures and testing them.

I don't know and I'd rather not guess on security issues. It's better
if someone else reviews the code.

Unless there is a very quick fix maybe this series should be removed or
reverted from linux-next? A diagnosis of "system calls can sometimes
hang on return" seems serious even for linux-next. Since it happens
very rarely in most setups I can easily imagine somebody spending a lot
of time digging at this.

--
Regards,
Leonard
Thomas Garnier July 19, 2017, 4:51 p.m. UTC | #4
On Wed, Jul 19, 2017 at 7:58 AM, Leonard Crestez
<leonard.crestez@nxp.com> wrote:
> On Tue, 2017-07-18 at 12:04 -0700, Thomas Garnier wrote:
>> On Tue, Jul 18, 2017 at 10:18 AM, Leonard Crestez <leonard.crestez@nxp.com> wrote:
>> > On Tue, 2017-07-18 at 09:04 -0700, Thomas Garnier wrote:
>> > > On Tue, Jul 18, 2017 at 7:36 AM, Leonard Crestez <leonard.crestez@nxp.com> wrote:
>> > > > On Wed, 2017-06-14 at 18:12 -0700, Thomas Garnier wrote:
>> > > > >
>> > > > > Ensure the address limit is a user-mode segment before returning to
>> > > > > user-mode. Otherwise a process can corrupt kernel-mode memory and
>> > > > > elevate privileges [1].
>> > > > >
>> > > > > The set_fs function sets the TIF_SETFS flag to force a slow path on
>> > > > > return. In the slow path, the address limit is checked to be USER_DS if
>> > > > > needed.
>> > > > >
>> > > > > The TIF_SETFS flag is added to _TIF_WORK_MASK shifting _TIF_SYSCALL_WORK
>> > > > > for arm instruction immediate support. The global work mask is too big
>> > > > > to used on a single instruction so adapt ret_fast_syscall.
>> > > > >
>> > > > > @@ -571,6 +572,10 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
>> > > > >        * Update the trace code with the current status.
>> > > > >        */
>> > > > >       trace_hardirqs_off();
>> > > > > +
>> > > > > +     /* Check valid user FS if needed */
>> > > > > +     addr_limit_user_check();
>> > > > > +
>> > > > >       do {
>> > > > >               if (likely(thread_flags & _TIF_NEED_RESCHED)) {
>> > > > >                       schedule();
>> > > > This patch made it's way into linux-next next-20170717 and it seems to
>> > > > cause hangs when booting some boards over NFS (found via bisection). I
>> > > > don't know exactly what determines the issue but I can reproduce hangs
>> > > > if even if I just boot with init=/bin/bash and do stuff like
>> > > >
>> > > > # sleep 1 & sleep 1 & sleep 1 & wait; wait; wait; echo done!
>> > > >
>> > > > When this happens sysrq-t shows a sleep task hung in the 'R' state
>> > > > spinning in do_work_pending, so maybe there is a potential infinite
>> > > > loop here?
>> > > >
>> > > > The addr_limit_user_check at the start of do_work_pending will check
>> > > > for TIF_FSCHECK once and clear it but the function loops while
>> > > > (thread_flags & _TIF_WORK_MASK), so it if TIF_FSCHECK is set again then
>> > > > the loop will never terminate. Does this make sense?
>> > >
>> > > Yes, it does. Thanks for looking into this.
>> > >
>> > > Can you try this change?
>> > >
>> > > diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
>> > > index 3a48b54c6405..bc6ad7789568 100644
>> > > --- a/arch/arm/kernel/signal.c
>> > > +++ b/arch/arm/kernel/signal.c
>> > > @@ -573,12 +573,11 @@ do_work_pending(struct pt_regs *regs, unsigned
>> > > int thread_flags, int syscall)
>> > >   */
>> > >   trace_hardirqs_off();
>> > >
>> > > - /* Check valid user FS if needed */
>> > > - addr_limit_user_check();
>> > > -
>> > >   do {
>> > >   if (likely(thread_flags & _TIF_NEED_RESCHED)) {
>> > >   schedule();
>> > > + } else if (thread_flags & _TIF_FSCHECK) {
>> > > + addr_limit_user_check();
>> > >   } else {
>> > >   if (unlikely(!user_mode(regs)))
>> > >   return 0;
>> > This does seem to work, it no longer hangs on boot in my setup. This is
>> > obviously only a very superficial test.
>> >
>> > The new location of this check seems weird, it's not clear why it
>> > should be on an else path. Perhaps it should be moved to right before
>> > where current_thread_info()->flags is fetched again?
>
>> I was hitting bug when I tried that.I think that's because you
>> basically let the signal handler do pending work before you check the
>> flag, that's not a good idea.
>
>> > If the purpose is hardening against buggy kernel code doing bad set_fs
>> > calls shouldn't this flag also be checked before looking at
>> > TIF_NEED_RESCHED and calling schedule()?
>> I am not sure to be honest. I expected schedule to only schedule the
>> processor to another task which would be fine given only the current
>> task have a bogus fs. I will put it first in case there is an edge
>> case scenario I missed.
>>
>> What do you think? Let me know and I will look at changes all
>> architectures and testing them.
>
> I don't know and I'd rather not guess on security issues. It's better
> if someone else reviews the code.
>
> Unless there is a very quick fix maybe this series should be removed or
> reverted from linux-next? A diagnosis of "system calls can sometimes
> hang on return" seems serious even for linux-next. Since it happens
> very rarely in most setups I can easily imagine somebody spending a lot
> of time digging at this.

I will send fixes for each architecture in the meantime.

>
> --
> Regards,
> Leonard
Russell King - ARM Linux admin July 19, 2017, 5:06 p.m. UTC | #5
On Wed, Jul 19, 2017 at 05:58:20PM +0300, Leonard Crestez wrote:
> On Tue, 2017-07-18 at 12:04 -0700, Thomas Garnier wrote:
> > On Tue, Jul 18, 2017 at 10:18 AM, Leonard Crestez <leonard.crestez@nxp.com> wrote:
> > > On Tue, 2017-07-18 at 09:04 -0700, Thomas Garnier wrote:
> > > > On Tue, Jul 18, 2017 at 7:36 AM, Leonard Crestez <leonard.crestez@nxp.com> wrote:
> > > > > On Wed, 2017-06-14 at 18:12 -0700, Thomas Garnier wrote:
> > > > > > 
> > > > > > Ensure the address limit is a user-mode segment before returning to
> > > > > > user-mode. Otherwise a process can corrupt kernel-mode memory and
> > > > > > elevate privileges [1].
> > > > > > 
> > > > > > The set_fs function sets the TIF_SETFS flag to force a slow path on
> > > > > > return. In the slow path, the address limit is checked to be USER_DS if
> > > > > > needed.
> > > > > > 
> > > > > > The TIF_SETFS flag is added to _TIF_WORK_MASK shifting _TIF_SYSCALL_WORK
> > > > > > for arm instruction immediate support. The global work mask is too big
> > > > > > to used on a single instruction so adapt ret_fast_syscall.
> > > > > > 
> > > > > > @@ -571,6 +572,10 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
> > > > > >        * Update the trace code with the current status.
> > > > > >        */
> > > > > >       trace_hardirqs_off();
> > > > > > +
> > > > > > +     /* Check valid user FS if needed */
> > > > > > +     addr_limit_user_check();
> > > > > > +
> > > > > >       do {
> > > > > >               if (likely(thread_flags & _TIF_NEED_RESCHED)) {
> > > > > >                       schedule();
> > > > > This patch made it's way into linux-next next-20170717 and it seems to
> > > > > cause hangs when booting some boards over NFS (found via bisection). I
> > > > > don't know exactly what determines the issue but I can reproduce hangs
> > > > > if even if I just boot with init=/bin/bash and do stuff like
> > > > > 
> > > > > # sleep 1 & sleep 1 & sleep 1 & wait; wait; wait; echo done!
> > > > > 
> > > > > When this happens sysrq-t shows a sleep task hung in the 'R' state
> > > > > spinning in do_work_pending, so maybe there is a potential infinite
> > > > > loop here?
> > > > > 
> > > > > The addr_limit_user_check at the start of do_work_pending will check
> > > > > for TIF_FSCHECK once and clear it but the function loops while
> > > > > (thread_flags & _TIF_WORK_MASK), so it if TIF_FSCHECK is set again then
> > > > > the loop will never terminate. Does this make sense?
> > > > 
> > > > Yes, it does. Thanks for looking into this.
> > > > 
> > > > Can you try this change?
> > > > 
> > > > diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
> > > > index 3a48b54c6405..bc6ad7789568 100644
> > > > --- a/arch/arm/kernel/signal.c
> > > > +++ b/arch/arm/kernel/signal.c
> > > > @@ -573,12 +573,11 @@ do_work_pending(struct pt_regs *regs, unsigned
> > > > int thread_flags, int syscall)
> > > >   */
> > > >   trace_hardirqs_off();
> > > > 
> > > > - /* Check valid user FS if needed */
> > > > - addr_limit_user_check();
> > > > -
> > > >   do {
> > > >   if (likely(thread_flags & _TIF_NEED_RESCHED)) {
> > > >   schedule();
> > > > + } else if (thread_flags & _TIF_FSCHECK) {
> > > > + addr_limit_user_check();
> > > >   } else {
> > > >   if (unlikely(!user_mode(regs)))
> > > >   return 0;
> > > This does seem to work, it no longer hangs on boot in my setup. This is
> > > obviously only a very superficial test.
> > > 
> > > The new location of this check seems weird, it's not clear why it
> > > should be on an else path. Perhaps it should be moved to right before
> > > where current_thread_info()->flags is fetched again?
> 
> > I was hitting bug when I tried that.I think that's because you
> > basically let the signal handler do pending work before you check the
> > flag, that's not a good idea.
> 
> > > If the purpose is hardening against buggy kernel code doing bad set_fs
> > > calls shouldn't this flag also be checked before looking at
> > > TIF_NEED_RESCHED and calling schedule()?
> > I am not sure to be honest. I expected schedule to only schedule the
> > processor to another task which would be fine given only the current
> > task have a bogus fs. I will put it first in case there is an edge
> > case scenario I missed.
> > 
> > What do you think? Let me know and I will look at changes all
> > architectures and testing them.
> 
> I don't know and I'd rather not guess on security issues. It's better
> if someone else reviews the code.
> 
> Unless there is a very quick fix maybe this series should be removed or
> reverted from linux-next? A diagnosis of "system calls can sometimes
> hang on return" seems serious even for linux-next. Since it happens
> very rarely in most setups I can easily imagine somebody spending a lot
> of time digging at this.

Probably best to revert.  I stopped looking at these patches during
the discussion, as the discussion seemed to be mainly around other
architectures, and I thought we had ARM settled.

Looking at this patch now, there's several things I'm not happy with.

The effect of adding a the new TIF flag for FSCHECK amongst the other
flags is that we end up overflowing the 8-bit constant, and have to
split the tests, meaning more instructions in the return path.  Eg:

-       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
+       tst     r1, #_TIF_SYSCALL_WORK
+       bne     fast_work_pending
+       tst     r1, #_TIF_WORK_MASK
        bne     fast_work_pending

should be written:

	tst	r1, #_TIF_SYSCALL_WORK
	tsteq	r1, #_TIF_WORK_MASK
	bne	fast_work_pending

and:

-       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
+       tst     r1, #_TIF_SYSCALL_WORK
+       bne     fast_work_pending
+       tst     r1, #_TIF_WORK_MASK

should be:

	tst	r1, #_TIF_SYSCALL_WORK
	tsteq	r1, #_TIF_WORK_MASK

There's no need for extra branches.

Now, the next issue is that I don't think this TIF-flag approach is
good for ARM - alignment faults can happen any time due to misaligned
packets in the networking code, and we really don't want to be doing
this check in a place that we can loop.

My original suggestion for ARM was to do the address limit check after
all work had been processed, with interrupts disabled (so no
possibility of this kind of loop happening.)  However, that seems to
have been replaced with this TIF approach, which is going to cause
loops - I suspect if the probes code is enabled, this will suffer
the same problem.  Remember, the various probes stuff can walk
userspace stacks, which means they'll be using set_fs().

I don't see why we've ended up with this (imho) sub-standard TIF-flag
approach, and I think it's going to be very problematical.

Can we please go back to the approach I suggested back in March for
ARM that doesn't suffer from this problem?
Thomas Garnier July 19, 2017, 5:20 p.m. UTC | #6
On Wed, Jul 19, 2017 at 10:06 AM, Russell King - ARM Linux
<linux@armlinux.org.uk> wrote:
> On Wed, Jul 19, 2017 at 05:58:20PM +0300, Leonard Crestez wrote:
>> On Tue, 2017-07-18 at 12:04 -0700, Thomas Garnier wrote:
>> > On Tue, Jul 18, 2017 at 10:18 AM, Leonard Crestez <leonard.crestez@nxp.com> wrote:
>> > > On Tue, 2017-07-18 at 09:04 -0700, Thomas Garnier wrote:
>> > > > On Tue, Jul 18, 2017 at 7:36 AM, Leonard Crestez <leonard.crestez@nxp.com> wrote:
>> > > > > On Wed, 2017-06-14 at 18:12 -0700, Thomas Garnier wrote:
>> > > > > >
>> > > > > > Ensure the address limit is a user-mode segment before returning to
>> > > > > > user-mode. Otherwise a process can corrupt kernel-mode memory and
>> > > > > > elevate privileges [1].
>> > > > > >
>> > > > > > The set_fs function sets the TIF_SETFS flag to force a slow path on
>> > > > > > return. In the slow path, the address limit is checked to be USER_DS if
>> > > > > > needed.
>> > > > > >
>> > > > > > The TIF_SETFS flag is added to _TIF_WORK_MASK shifting _TIF_SYSCALL_WORK
>> > > > > > for arm instruction immediate support. The global work mask is too big
>> > > > > > to used on a single instruction so adapt ret_fast_syscall.
>> > > > > >
>> > > > > > @@ -571,6 +572,10 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
>> > > > > >        * Update the trace code with the current status.
>> > > > > >        */
>> > > > > >       trace_hardirqs_off();
>> > > > > > +
>> > > > > > +     /* Check valid user FS if needed */
>> > > > > > +     addr_limit_user_check();
>> > > > > > +
>> > > > > >       do {
>> > > > > >               if (likely(thread_flags & _TIF_NEED_RESCHED)) {
>> > > > > >                       schedule();
>> > > > > This patch made it's way into linux-next next-20170717 and it seems to
>> > > > > cause hangs when booting some boards over NFS (found via bisection). I
>> > > > > don't know exactly what determines the issue but I can reproduce hangs
>> > > > > if even if I just boot with init=/bin/bash and do stuff like
>> > > > >
>> > > > > # sleep 1 & sleep 1 & sleep 1 & wait; wait; wait; echo done!
>> > > > >
>> > > > > When this happens sysrq-t shows a sleep task hung in the 'R' state
>> > > > > spinning in do_work_pending, so maybe there is a potential infinite
>> > > > > loop here?
>> > > > >
>> > > > > The addr_limit_user_check at the start of do_work_pending will check
>> > > > > for TIF_FSCHECK once and clear it but the function loops while
>> > > > > (thread_flags & _TIF_WORK_MASK), so it if TIF_FSCHECK is set again then
>> > > > > the loop will never terminate. Does this make sense?
>> > > >
>> > > > Yes, it does. Thanks for looking into this.
>> > > >
>> > > > Can you try this change?
>> > > >
>> > > > diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
>> > > > index 3a48b54c6405..bc6ad7789568 100644
>> > > > --- a/arch/arm/kernel/signal.c
>> > > > +++ b/arch/arm/kernel/signal.c
>> > > > @@ -573,12 +573,11 @@ do_work_pending(struct pt_regs *regs, unsigned
>> > > > int thread_flags, int syscall)
>> > > >   */
>> > > >   trace_hardirqs_off();
>> > > >
>> > > > - /* Check valid user FS if needed */
>> > > > - addr_limit_user_check();
>> > > > -
>> > > >   do {
>> > > >   if (likely(thread_flags & _TIF_NEED_RESCHED)) {
>> > > >   schedule();
>> > > > + } else if (thread_flags & _TIF_FSCHECK) {
>> > > > + addr_limit_user_check();
>> > > >   } else {
>> > > >   if (unlikely(!user_mode(regs)))
>> > > >   return 0;
>> > > This does seem to work, it no longer hangs on boot in my setup. This is
>> > > obviously only a very superficial test.
>> > >
>> > > The new location of this check seems weird, it's not clear why it
>> > > should be on an else path. Perhaps it should be moved to right before
>> > > where current_thread_info()->flags is fetched again?
>>
>> > I was hitting bug when I tried that.I think that's because you
>> > basically let the signal handler do pending work before you check the
>> > flag, that's not a good idea.
>>
>> > > If the purpose is hardening against buggy kernel code doing bad set_fs
>> > > calls shouldn't this flag also be checked before looking at
>> > > TIF_NEED_RESCHED and calling schedule()?
>> > I am not sure to be honest. I expected schedule to only schedule the
>> > processor to another task which would be fine given only the current
>> > task have a bogus fs. I will put it first in case there is an edge
>> > case scenario I missed.
>> >
>> > What do you think? Let me know and I will look at changes all
>> > architectures and testing them.
>>
>> I don't know and I'd rather not guess on security issues. It's better
>> if someone else reviews the code.
>>
>> Unless there is a very quick fix maybe this series should be removed or
>> reverted from linux-next? A diagnosis of "system calls can sometimes
>> hang on return" seems serious even for linux-next. Since it happens
>> very rarely in most setups I can easily imagine somebody spending a lot
>> of time digging at this.
>
> Probably best to revert.  I stopped looking at these patches during
> the discussion, as the discussion seemed to be mainly around other
> architectures, and I thought we had ARM settled.
>
> Looking at this patch now, there's several things I'm not happy with.
>
> The effect of adding a the new TIF flag for FSCHECK amongst the other
> flags is that we end up overflowing the 8-bit constant, and have to
> split the tests, meaning more instructions in the return path.  Eg:
>
> -       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
> +       tst     r1, #_TIF_SYSCALL_WORK
> +       bne     fast_work_pending
> +       tst     r1, #_TIF_WORK_MASK
>         bne     fast_work_pending
>
> should be written:
>
>         tst     r1, #_TIF_SYSCALL_WORK
>         tsteq   r1, #_TIF_WORK_MASK
>         bne     fast_work_pending
>
> and:
>
> -       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
> +       tst     r1, #_TIF_SYSCALL_WORK
> +       bne     fast_work_pending
> +       tst     r1, #_TIF_WORK_MASK
>
> should be:
>
>         tst     r1, #_TIF_SYSCALL_WORK
>         tsteq   r1, #_TIF_WORK_MASK
>
> There's no need for extra branches.
>
> Now, the next issue is that I don't think this TIF-flag approach is
> good for ARM - alignment faults can happen any time due to misaligned
> packets in the networking code, and we really don't want to be doing
> this check in a place that we can loop.
>
> My original suggestion for ARM was to do the address limit check after
> all work had been processed, with interrupts disabled (so no
> possibility of this kind of loop happening.)  However, that seems to
> have been replaced with this TIF approach, which is going to cause
> loops - I suspect if the probes code is enabled, this will suffer
> the same problem.  Remember, the various probes stuff can walk
> userspace stacks, which means they'll be using set_fs().
>
> I don't see why we've ended up with this (imho) sub-standard TIF-flag
> approach, and I think it's going to be very problematical.
>
> Can we please go back to the approach I suggested back in March for
> ARM that doesn't suffer from this problem?

During the extensive thread discussion, Linus asked to move away from
architecture specific changes to this work flag system. I am glad to
fix the assembly as you asked on a separate patch.

>
> --
> RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
> FTTC broadband for 0.8mile line: currently at 9.6Mbps down 400kbps up
> according to speedtest.net.
Russell King - ARM Linux admin July 19, 2017, 6:35 p.m. UTC | #7
On Wed, Jul 19, 2017 at 10:20:35AM -0700, Thomas Garnier wrote:
> On Wed, Jul 19, 2017 at 10:06 AM, Russell King - ARM Linux
> <linux@armlinux.org.uk> wrote:
> > On Wed, Jul 19, 2017 at 05:58:20PM +0300, Leonard Crestez wrote:
> > Probably best to revert.  I stopped looking at these patches during
> > the discussion, as the discussion seemed to be mainly around other
> > architectures, and I thought we had ARM settled.
> >
> > Looking at this patch now, there's several things I'm not happy with.
> >
> > The effect of adding a the new TIF flag for FSCHECK amongst the other
> > flags is that we end up overflowing the 8-bit constant, and have to
> > split the tests, meaning more instructions in the return path.  Eg:
> >
> > -       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
> > +       tst     r1, #_TIF_SYSCALL_WORK
> > +       bne     fast_work_pending
> > +       tst     r1, #_TIF_WORK_MASK
> >         bne     fast_work_pending
> >
> > should be written:
> >
> >         tst     r1, #_TIF_SYSCALL_WORK
> >         tsteq   r1, #_TIF_WORK_MASK
> >         bne     fast_work_pending
> >
> > and:
> >
> > -       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
> > +       tst     r1, #_TIF_SYSCALL_WORK
> > +       bne     fast_work_pending
> > +       tst     r1, #_TIF_WORK_MASK
> >
> > should be:
> >
> >         tst     r1, #_TIF_SYSCALL_WORK
> >         tsteq   r1, #_TIF_WORK_MASK
> >
> > There's no need for extra branches.
> >
> > Now, the next issue is that I don't think this TIF-flag approach is
> > good for ARM - alignment faults can happen any time due to misaligned
> > packets in the networking code, and we really don't want to be doing
> > this check in a place that we can loop.
> >
> > My original suggestion for ARM was to do the address limit check after
> > all work had been processed, with interrupts disabled (so no
> > possibility of this kind of loop happening.)  However, that seems to
> > have been replaced with this TIF approach, which is going to cause
> > loops - I suspect if the probes code is enabled, this will suffer
> > the same problem.  Remember, the various probes stuff can walk
> > userspace stacks, which means they'll be using set_fs().
> >
> > I don't see why we've ended up with this (imho) sub-standard TIF-flag
> > approach, and I think it's going to be very problematical.
> >
> > Can we please go back to the approach I suggested back in March for
> > ARM that doesn't suffer from this problem?
> 
> During the extensive thread discussion, Linus asked to move away from
> architecture specific changes to this work flag system. I am glad to
> fix the assembly as you asked on a separate patch.

Well, for the record, I don't think you've got to the bottom of the
"infinite loop" potential of Linus' approach.

Eg, perf will likely trigger this same issue.  Eg, perf record -a -g
will attempt to record the callchain both in kernel space and userspace
each time a perf interrupt happens.  If the perf interrupt frequency is
sufficiently high that we have multiple interrupts during the execution
of do_work_pending() and its called functions, then that will turn this
into an infinite loop yet again.
Thomas Garnier July 19, 2017, 6:50 p.m. UTC | #8
On Wed, Jul 19, 2017 at 11:35 AM, Russell King - ARM Linux
<linux@armlinux.org.uk> wrote:
> On Wed, Jul 19, 2017 at 10:20:35AM -0700, Thomas Garnier wrote:
>> On Wed, Jul 19, 2017 at 10:06 AM, Russell King - ARM Linux
>> <linux@armlinux.org.uk> wrote:
>> > On Wed, Jul 19, 2017 at 05:58:20PM +0300, Leonard Crestez wrote:
>> > Probably best to revert.  I stopped looking at these patches during
>> > the discussion, as the discussion seemed to be mainly around other
>> > architectures, and I thought we had ARM settled.
>> >
>> > Looking at this patch now, there's several things I'm not happy with.
>> >
>> > The effect of adding a the new TIF flag for FSCHECK amongst the other
>> > flags is that we end up overflowing the 8-bit constant, and have to
>> > split the tests, meaning more instructions in the return path.  Eg:
>> >
>> > -       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
>> > +       tst     r1, #_TIF_SYSCALL_WORK
>> > +       bne     fast_work_pending
>> > +       tst     r1, #_TIF_WORK_MASK
>> >         bne     fast_work_pending
>> >
>> > should be written:
>> >
>> >         tst     r1, #_TIF_SYSCALL_WORK
>> >         tsteq   r1, #_TIF_WORK_MASK
>> >         bne     fast_work_pending
>> >
>> > and:
>> >
>> > -       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
>> > +       tst     r1, #_TIF_SYSCALL_WORK
>> > +       bne     fast_work_pending
>> > +       tst     r1, #_TIF_WORK_MASK
>> >
>> > should be:
>> >
>> >         tst     r1, #_TIF_SYSCALL_WORK
>> >         tsteq   r1, #_TIF_WORK_MASK
>> >
>> > There's no need for extra branches.
>> >
>> > Now, the next issue is that I don't think this TIF-flag approach is
>> > good for ARM - alignment faults can happen any time due to misaligned
>> > packets in the networking code, and we really don't want to be doing
>> > this check in a place that we can loop.
>> >
>> > My original suggestion for ARM was to do the address limit check after
>> > all work had been processed, with interrupts disabled (so no
>> > possibility of this kind of loop happening.)  However, that seems to
>> > have been replaced with this TIF approach, which is going to cause
>> > loops - I suspect if the probes code is enabled, this will suffer
>> > the same problem.  Remember, the various probes stuff can walk
>> > userspace stacks, which means they'll be using set_fs().
>> >
>> > I don't see why we've ended up with this (imho) sub-standard TIF-flag
>> > approach, and I think it's going to be very problematical.
>> >
>> > Can we please go back to the approach I suggested back in March for
>> > ARM that doesn't suffer from this problem?
>>
>> During the extensive thread discussion, Linus asked to move away from
>> architecture specific changes to this work flag system. I am glad to
>> fix the assembly as you asked on a separate patch.
>
> Well, for the record, I don't think you've got to the bottom of the
> "infinite loop" potential of Linus' approach.
>
> Eg, perf will likely trigger this same issue.  Eg, perf record -a -g
> will attempt to record the callchain both in kernel space and userspace
> each time a perf interrupt happens.  If the perf interrupt frequency is
> sufficiently high that we have multiple interrupts during the execution
> of do_work_pending() and its called functions, then that will turn this
> into an infinite loop yet again.

Do you think it applies to the patch I just sent? The other approach
is to check at the entrance, ignore _TIF_FSCHECK on the loop and clear
it on exit.

>
> --
> RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
> FTTC broadband for 0.8mile line: currently at 9.6Mbps down 400kbps up
> according to speedtest.net.
diff mbox

Patch

diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index 3a48b54c6405..bc6ad7789568 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -573,12 +573,11 @@  do_work_pending(struct pt_regs *regs, unsigned
int thread_flags, int syscall)
  */
  trace_hardirqs_off();

- /* Check valid user FS if needed */
- addr_limit_user_check();
-
  do {
  if (likely(thread_flags & _TIF_NEED_RESCHED)) {
  schedule();
+ } else if (thread_flags & _TIF_FSCHECK) {
+ addr_limit_user_check();
  } else {
  if (unlikely(!user_mode(regs)))