Message ID | 1542653726-5655-8-git-send-email-longman@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | locking/lockdep: Add a new class of terminal locks | expand |
On 11/19/2018 01:55 PM, Waiman Long wrote: > The db->lock is a raw spinlock and so the lock hold time is supposed > to be short. This will not be the case when printk() is being involved > in some of the critical sections. In order to avoid the long hold time, > in case some messages need to be printed, the debug_object_is_on_stack() > and debug_print_object() calls are now moved out of those critical > sections. > > Signed-off-by: Waiman Long <longman@redhat.com> > --- > lib/debugobjects.c | 61 +++++++++++++++++++++++++++++++++++++----------------- > 1 file changed, 42 insertions(+), 19 deletions(-) > > diff --git a/lib/debugobjects.c b/lib/debugobjects.c > index 403dd95..4216d3d 100644 > --- a/lib/debugobjects.c > +++ b/lib/debugobjects.c > @@ -376,6 +376,8 @@ static void debug_object_is_on_stack(void *addr, int onstack) > struct debug_bucket *db; > struct debug_obj *obj; > unsigned long flags; > + bool debug_printobj = false; > + bool debug_chkstack = false; > > fill_pool(); > > @@ -392,7 +394,7 @@ static void debug_object_is_on_stack(void *addr, int onstack) > debug_objects_oom(); > return; > } > - debug_object_is_on_stack(addr, onstack); > + debug_chkstack = true; > } > > switch (obj->state) { > @@ -403,20 +405,25 @@ static void debug_object_is_on_stack(void *addr, int onstack) > break; > > case ODEBUG_STATE_ACTIVE: > - debug_print_object(obj, "init"); > state = obj->state; > raw_spin_unlock_irqrestore(&db->lock, flags); > + debug_print_object(obj, "init"); > debug_object_fixup(descr->fixup_init, addr, state); > return; > > case ODEBUG_STATE_DESTROYED: > - debug_print_object(obj, "init"); > + debug_printobj = true; > break; > default: > break; > } > > raw_spin_unlock_irqrestore(&db->lock, flags); > + if (debug_chkstack) > + debug_object_is_on_stack(addr, onstack); > + if (debug_printobj) > + debug_print_object(obj, "init"); > + > } > > /** > @@ -474,6 +481,8 @@ int debug_object_activate(void *addr, struct debug_obj_descr *descr) > > obj = lookup_object(addr, db); > if (obj) { > + bool debug_printobj = false; > + > switch (obj->state) { > case ODEBUG_STATE_INIT: > case ODEBUG_STATE_INACTIVE: > @@ -482,14 +491,14 @@ int debug_object_activate(void *addr, struct debug_obj_descr *descr) > break; > > case ODEBUG_STATE_ACTIVE: > - debug_print_object(obj, "activate"); > state = obj->state; > raw_spin_unlock_irqrestore(&db->lock, flags); > + debug_print_object(obj, "activate"); > ret = debug_object_fixup(descr->fixup_activate, addr, state); > return ret ? 0 : -EINVAL; > > case ODEBUG_STATE_DESTROYED: > - debug_print_object(obj, "activate"); > + debug_printobj = true; > ret = -EINVAL; > break; > default: > @@ -497,10 +506,13 @@ int debug_object_activate(void *addr, struct debug_obj_descr *descr) > break; > } > raw_spin_unlock_irqrestore(&db->lock, flags); > + if (debug_printobj) > + debug_print_object(obj, "activate"); > return ret; > } > > raw_spin_unlock_irqrestore(&db->lock, flags); > + > /* > * We are here when a static object is activated. We > * let the type specific code confirm whether this is > @@ -532,6 +544,7 @@ void debug_object_deactivate(void *addr, struct debug_obj_descr *descr) > struct debug_bucket *db; > struct debug_obj *obj; > unsigned long flags; > + bool debug_printobj = false; > > if (!debug_objects_enabled) > return; > @@ -549,24 +562,27 @@ void debug_object_deactivate(void *addr, struct debug_obj_descr *descr) > if (!obj->astate) > obj->state = ODEBUG_STATE_INACTIVE; > else > - debug_print_object(obj, "deactivate"); > + debug_printobj = true; > break; > > case ODEBUG_STATE_DESTROYED: > - debug_print_object(obj, "deactivate"); > + debug_printobj = true; > break; > default: > break; > } > - } else { > + } > + > + raw_spin_unlock_irqrestore(&db->lock, flags); > + if (!obj) { > struct debug_obj o = { .object = addr, > .state = ODEBUG_STATE_NOTAVAILABLE, > .descr = descr }; > > debug_print_object(&o, "deactivate"); > + } else if (debug_printobj) { > + debug_print_object(obj, "deactivate"); > } > - > - raw_spin_unlock_irqrestore(&db->lock, flags); > } > EXPORT_SYMBOL_GPL(debug_object_deactivate); > > @@ -581,6 +597,7 @@ void debug_object_destroy(void *addr, struct debug_obj_descr *descr) > struct debug_bucket *db; > struct debug_obj *obj; > unsigned long flags; > + bool debug_printobj = false; > > if (!debug_objects_enabled) > return; > @@ -600,20 +617,22 @@ void debug_object_destroy(void *addr, struct debug_obj_descr *descr) > obj->state = ODEBUG_STATE_DESTROYED; > break; > case ODEBUG_STATE_ACTIVE: > - debug_print_object(obj, "destroy"); > state = obj->state; > raw_spin_unlock_irqrestore(&db->lock, flags); > + debug_print_object(obj, "destroy"); > debug_object_fixup(descr->fixup_destroy, addr, state); > return; > > case ODEBUG_STATE_DESTROYED: > - debug_print_object(obj, "destroy"); > + debug_printobj = true; > break; > default: > break; > } > out_unlock: > raw_spin_unlock_irqrestore(&db->lock, flags); > + if (debug_printobj) > + debug_print_object(obj, "destroy"); > } > EXPORT_SYMBOL_GPL(debug_object_destroy); > > @@ -642,9 +661,9 @@ void debug_object_free(void *addr, struct debug_obj_descr *descr) > > switch (obj->state) { > case ODEBUG_STATE_ACTIVE: > - debug_print_object(obj, "free"); > state = obj->state; > raw_spin_unlock_irqrestore(&db->lock, flags); > + debug_print_object(obj, "free"); > debug_object_fixup(descr->fixup_free, addr, state); > return; > default: > @@ -717,6 +736,7 @@ void debug_object_assert_init(void *addr, struct debug_obj_descr *descr) > struct debug_bucket *db; > struct debug_obj *obj; > unsigned long flags; > + bool debug_printobj = false; > > if (!debug_objects_enabled) > return; > @@ -732,22 +752,25 @@ void debug_object_assert_init(void *addr, struct debug_obj_descr *descr) > if (obj->astate == expect) > obj->astate = next; > else > - debug_print_object(obj, "active_state"); > + debug_printobj = true; > break; > > default: > - debug_print_object(obj, "active_state"); > + debug_printobj = true; > break; > } > - } else { > + } > + > + raw_spin_unlock_irqrestore(&db->lock, flags); > + if (!obj) { > struct debug_obj o = { .object = addr, > .state = ODEBUG_STATE_NOTAVAILABLE, > .descr = descr }; > > debug_print_object(&o, "active_state"); > + } else if (debug_printobj) { > + debug_print_object(obj, "active_state"); > } > - > - raw_spin_unlock_irqrestore(&db->lock, flags); > } > EXPORT_SYMBOL_GPL(debug_object_active_state); > > @@ -783,10 +806,10 @@ static void __debug_check_no_obj_freed(const void *address, unsigned long size) > > switch (obj->state) { > case ODEBUG_STATE_ACTIVE: > - debug_print_object(obj, "free"); > descr = obj->descr; > state = obj->state; > raw_spin_unlock_irqrestore(&db->lock, flags); > + debug_print_object(obj, "free"); > debug_object_fixup(descr->fixup_free, > (void *) oaddr, state); > goto repeat; As a side note, one of the test systems that I used generated a debugobjects splat in the bootup process and the system hanged afterward. Applying this patch alone fix the hanging problem and the system booted up successfully. So it is not really a good idea to call printk() while holding a raw spinlock. Cheers, Longman
On (11/21/18 11:49), Waiman Long wrote: [..] > > case ODEBUG_STATE_ACTIVE: > > - debug_print_object(obj, "init"); > > state = obj->state; > > raw_spin_unlock_irqrestore(&db->lock, flags); > > + debug_print_object(obj, "init"); > > debug_object_fixup(descr->fixup_init, addr, state); > > return; > > > > case ODEBUG_STATE_DESTROYED: > > - debug_print_object(obj, "init"); > > + debug_printobj = true; > > break; > > default: > > break; > > } > > > > raw_spin_unlock_irqrestore(&db->lock, flags); > > + if (debug_chkstack) > > + debug_object_is_on_stack(addr, onstack); > > + if (debug_printobj) > > + debug_print_object(obj, "init"); > > [..] > > As a side note, one of the test systems that I used generated a > debugobjects splat in the bootup process and the system hanged > afterward. Applying this patch alone fix the hanging problem and the > system booted up successfully. So it is not really a good idea to call > printk() while holding a raw spinlock. Right, I like this patch. And I think that we, maybe, can go even further. Some serial consoles call mod_timer(). So what we could have with the debug objects enabled was mod_timer() lock_timer_base() debug_activate() printk() call_console_drivers() foo_console() mod_timer() lock_timer_base() << deadlock That's one possible scenario. The other one can involve console's IRQ handler, uart port spinlock, mod_timer, debug objects, printk, and an eventual deadlock on the uart port spinlock. This one can be mitigated with printk_safe. But mod_timer() deadlock will require a different fix. So maybe we need to switch debug objects print-outs to _always_ printk_deferred(). Debug objects can be used in code which cannot do direct printk() - timekeeping is just one example. -ss
On Thu, Nov 22, 2018 at 11:04:22AM +0900, Sergey Senozhatsky wrote: > Some serial consoles call mod_timer(). So what we could have with the > debug objects enabled was > > mod_timer() > lock_timer_base() > debug_activate() > printk() > call_console_drivers() > foo_console() > mod_timer() > lock_timer_base() << deadlock > > That's one possible scenario. The other one can involve console's > IRQ handler, uart port spinlock, mod_timer, debug objects, printk, > and an eventual deadlock on the uart port spinlock. This one can > be mitigated with printk_safe. But mod_timer() deadlock will require > a different fix. > > So maybe we need to switch debug objects print-outs to _always_ > printk_deferred(). Debug objects can be used in code which cannot > do direct printk() - timekeeping is just one example. No, printk_deferred() is a disease, it needs to be eradicated, not spread around.
On Thu 2018-11-22 11:04:22, Sergey Senozhatsky wrote: > On (11/21/18 11:49), Waiman Long wrote: > [..] > > > case ODEBUG_STATE_ACTIVE: > > > - debug_print_object(obj, "init"); > > > state = obj->state; > > > raw_spin_unlock_irqrestore(&db->lock, flags); > > > + debug_print_object(obj, "init"); > > > debug_object_fixup(descr->fixup_init, addr, state); > > > return; > > > > > > case ODEBUG_STATE_DESTROYED: > > > - debug_print_object(obj, "init"); > > > + debug_printobj = true; > > > break; > > > default: > > > break; > > > } > > > > > > raw_spin_unlock_irqrestore(&db->lock, flags); > > > + if (debug_chkstack) > > > + debug_object_is_on_stack(addr, onstack); > > > + if (debug_printobj) > > > + debug_print_object(obj, "init"); > > > > [..] > > > > As a side note, one of the test systems that I used generated a > > debugobjects splat in the bootup process and the system hanged > > afterward. Applying this patch alone fix the hanging problem and the > > system booted up successfully. So it is not really a good idea to call > > printk() while holding a raw spinlock. Please, was the system hang reproducible? I wonder if it was a deadlock described by Sergey below. The commit message is right. printk() might take too long and cause softlockup or livelock. But it does not explain why the system could competely hang. Also note that prinkt() should not longer block a single process indefinitely thanks to the commit dbdda842fe96f8932 ("printk: Add console owner and waiter logic to load balance console writes"). > Some serial consoles call mod_timer(). So what we could have with the > debug objects enabled was > > mod_timer() > lock_timer_base() > debug_activate() > printk() > call_console_drivers() > foo_console() > mod_timer() > lock_timer_base() << deadlock Anyway, I wonder what was the primary motivation for this patch. Was it the system hang? Or was it lockdep report about nesting two terminal locks: db->lock, pool_lock with logbuf_lock? Printk is problematic. It might delay any atomic context where it is used. I would like to better understand the problem here before we start spread printk-related hacks. Best Regards, Petr
On 11/21/2018 09:04 PM, Sergey Senozhatsky wrote: > On (11/21/18 11:49), Waiman Long wrote: > [..] >>> case ODEBUG_STATE_ACTIVE: >>> - debug_print_object(obj, "init"); >>> state = obj->state; >>> raw_spin_unlock_irqrestore(&db->lock, flags); >>> + debug_print_object(obj, "init"); >>> debug_object_fixup(descr->fixup_init, addr, state); >>> return; >>> >>> case ODEBUG_STATE_DESTROYED: >>> - debug_print_object(obj, "init"); >>> + debug_printobj = true; >>> break; >>> default: >>> break; >>> } >>> >>> raw_spin_unlock_irqrestore(&db->lock, flags); >>> + if (debug_chkstack) >>> + debug_object_is_on_stack(addr, onstack); >>> + if (debug_printobj) >>> + debug_print_object(obj, "init"); >>> > [..] >> As a side note, one of the test systems that I used generated a >> debugobjects splat in the bootup process and the system hanged >> afterward. Applying this patch alone fix the hanging problem and the >> system booted up successfully. So it is not really a good idea to call >> printk() while holding a raw spinlock. > Right, I like this patch. > And I think that we, maybe, can go even further. > > Some serial consoles call mod_timer(). So what we could have with the > debug objects enabled was > > mod_timer() > lock_timer_base() > debug_activate() > printk() > call_console_drivers() > foo_console() > mod_timer() > lock_timer_base() << deadlock > > That's one possible scenario. The other one can involve console's > IRQ handler, uart port spinlock, mod_timer, debug objects, printk, > and an eventual deadlock on the uart port spinlock. This one can > be mitigated with printk_safe. But mod_timer() deadlock will require > a different fix. > > So maybe we need to switch debug objects print-outs to _always_ > printk_deferred(). Debug objects can be used in code which cannot > do direct printk() - timekeeping is just one example. > > -ss Actually, I don't think that was the cause of the hang. The debugobjects splat was caused by debug_object_is_on_stack(), below was the output: [ 6.890048] ODEBUG: object (____ptrval____) is NOT on stack (____ptrval____), but annotated. [ 6.891000] WARNING: CPU: 28 PID: 1 at lib/debugobjects.c:369 __debug_object_init.cold.11+0x51/0x2d6 [ 6.891000] Modules linked in: [ 6.891000] CPU: 28 PID: 1 Comm: swapper/0 Not tainted 4.18.0-41.el8.bz1651764_cgroup_debug.x86_64+debug #1 [ 6.891000] Hardware name: HPE ProLiant DL120 Gen10/ProLiant DL120 Gen10, BIOS U36 11/14/2017 [ 6.891000] RIP: 0010:__debug_object_init.cold.11+0x51/0x2d6 [ 6.891000] Code: ea 03 80 3c 02 00 0f 85 85 02 00 00 49 8b 54 24 18 48 89 de 4c 89 44 24 10 48 c7 c7 00 ce 22 94 e8 73 18 62 ff 4c 8b 44 24 10 <0f> 0b e9 60 db ff ff 41 83 c4 01 b8 ff ff 37 00 44 89 25 ce 46 f9 [ 6.891000] RSP: 0000:ffff880104187960 EFLAGS: 00010086 [ 6.891000] RAX: 0000000000000050 RBX: ffffffff9764c570 RCX: 0000000000000000 [ 6.891000] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff880104178ca8 [ 6.891000] RBP: 1ffff10020830f34 R08: ffff8807ce68a1d0 R09: fffffbfff2923554 [ 6.891000] R10: fffffbfff2923554 R11: ffffffff9491aaa3 R12: ffff880104178000 [ 6.891000] R13: ffffffff96c809b8 R14: 000000000000a370 R15: ffff8807ce68a1c0 [ 6.891000] FS: 0000000000000000(0000) GS:ffff8807d4200000(0000) knlGS:0000000000000000 [ 6.891000] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 6.891000] CR2: 0000000000000000 CR3: 000000028de16001 CR4: 00000000007606e0 [ 6.891000] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 6.891000] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 6.891000] PKRU: 00000000 [ 6.891000] Call Trace: [ 6.891000] ? debug_object_fixup+0x30/0x30 [ 6.891000] ? _raw_spin_unlock_irqrestore+0x4b/0x60 [ 6.891000] ? __lockdep_init_map+0x12f/0x510 [ 6.891000] ? __lockdep_init_map+0x12f/0x510 [ 6.891000] virt_efi_get_next_variable+0xa2/0x160 [ 6.891000] efivar_init+0x1c4/0x6d7 [ 6.891000] ? efivar_ssdt_setup+0x3b/0x3b [ 6.891000] ? efivar_entry_iter+0x120/0x120 [ 6.891000] ? find_held_lock+0x3a/0x1c0 [ 6.891000] ? lock_downgrade+0x5e0/0x5e0 [ 6.891000] ? kmsg_dump_rewind_nolock+0xd9/0xd9 [ 6.891000] ? _raw_spin_unlock_irqrestore+0x4b/0x60 [ 6.891000] ? trace_hardirqs_on_caller+0x381/0x570 [ 6.891000] ? efivar_ssdt_iter+0x1f4/0x1f4 [ 6.891000] efisubsys_init+0x1be/0x4ae [ 6.891000] ? kernfs_get.part.8+0x4c/0x60 [ 6.891000] ? efivar_ssdt_iter+0x1f4/0x1f4 [ 6.891000] ? __kernfs_create_file+0x235/0x2e0 [ 6.891000] ? efivar_ssdt_iter+0x1f4/0x1f4 [ 6.891000] do_one_initcall+0xe9/0x5fd [ 6.891000] ? perf_trace_initcall_level+0x450/0x450 [ 6.891000] ? __wake_up_common+0x5a0/0x5a0 [ 6.891000] ? lock_downgrade+0x5e0/0x5e0 [ 6.891000] kernel_init_freeable+0x51a/0x5f2 [ 6.891000] ? start_kernel+0x7b8/0x7b8 [ 6.891000] ? finish_task_switch+0x19a/0x690 [ 6.891000] ? __switch_to_asm+0x40/0x70 [ 6.891000] ? __switch_to_asm+0x34/0x70 [ 6.891000] ? rest_init+0xe9/0xe9 [ 6.891000] kernel_init+0xc/0x110 [ 6.891000] ? rest_init+0xe9/0xe9 [ 6.891000] ret_from_fork+0x24/0x50 [ 6.891000] irq event stamp: 1081352 [ 6.891000] hardirqs last enabled at (1081351): [<ffffffff93af7dab>] _raw_spin_unlock_irqrestore+0x4b/0x60 [ 6.891000] hardirqs last disabled at (1081352): [<ffffffff93af85c2>] _raw_spin_lock_irqsave+0x22/0x81 [ 6.891000] softirqs last enabled at (1081334): [<ffffffff93e006f9>] __do_softirq+0x6f9/0xaa0 [ 6.891000] softirqs last disabled at (1081325): [<ffffffff921b993f>] irq_exit+0x27f/0x2d0 [ 6.891000] ---[ end trace 15e1083fc009a526 ]--- All the messages above were printed while holding a raw spinlock with IRQ disabled. Further down the bootup sequence, the system appeared to hang: 11.270654] systemd[1]: systemd 239 running in system mode. (+PAM +AUDIT +SELINUX +IMA -APPARMOR +SMACK +SYSVINIT +UTMP +LIBCRYPTSETUP +GCRYPT +GNUTLS +ACL +XZ +LZ4 +SECCOMP +BLKID +ELFUTILS +KMOD +IDN2 -IDN +PCRE2 default-hierarchy=legacy) [ 11.311307] systemd[1]: Detected architecture x86-64. [ 11.316420] systemd[1]: Running in initial RAM disk. Welcome to The system is not responsive at this point. I am not totally sure what caused this. Maybe it was caused by disabling IRQ for too long leading to some kind of corruption. Anyway, moving debug_object_is_on_stack() outside of the IRQ disabled lock critical section seemed to fix the hang problem. Cheers, Longman
On 11/22/2018 11:02 AM, Petr Mladek wrote: > On Thu 2018-11-22 11:04:22, Sergey Senozhatsky wrote: >> On (11/21/18 11:49), Waiman Long wrote: >> [..] >>>> case ODEBUG_STATE_ACTIVE: >>>> - debug_print_object(obj, "init"); >>>> state = obj->state; >>>> raw_spin_unlock_irqrestore(&db->lock, flags); >>>> + debug_print_object(obj, "init"); >>>> debug_object_fixup(descr->fixup_init, addr, state); >>>> return; >>>> >>>> case ODEBUG_STATE_DESTROYED: >>>> - debug_print_object(obj, "init"); >>>> + debug_printobj = true; >>>> break; >>>> default: >>>> break; >>>> } >>>> >>>> raw_spin_unlock_irqrestore(&db->lock, flags); >>>> + if (debug_chkstack) >>>> + debug_object_is_on_stack(addr, onstack); >>>> + if (debug_printobj) >>>> + debug_print_object(obj, "init"); >>>> >> [..] >>> As a side note, one of the test systems that I used generated a >>> debugobjects splat in the bootup process and the system hanged >>> afterward. Applying this patch alone fix the hanging problem and the >>> system booted up successfully. So it is not really a good idea to call >>> printk() while holding a raw spinlock. > Please, was the system hang reproducible? I wonder if it was a > deadlock described by Sergey below. Yes, it is 100% reproducible on the testing system that I used. > The commit message is right. printk() might take too long and > cause softlockup or livelock. But it does not explain why > the system could competely hang. > > Also note that prinkt() should not longer block a single process > indefinitely thanks to the commit dbdda842fe96f8932 ("printk: > Add console owner and waiter logic to load balance console writes"). The problem might have been caused by the fact that IRQ was also disabled in the lock critical section. >> Some serial consoles call mod_timer(). So what we could have with the >> debug objects enabled was >> >> mod_timer() >> lock_timer_base() >> debug_activate() >> printk() >> call_console_drivers() >> foo_console() >> mod_timer() >> lock_timer_base() << deadlock > Anyway, I wonder what was the primary motivation for this patch. > Was it the system hang? Or was it lockdep report about nesting > two terminal locks: db->lock, pool_lock with logbuf_lock? The primary motivation was to make sure that printk() won't be called while holding either db->lock or pool_lock in the debugobjects code. In the determination of which locks can be made terminal, I focused on local spinlocks that won't cause boundary to an unrelated subsystem as it will greatly complicate the analysis. I didn't realize that it fixed a hang problem that I was seeing until I did bisection to find out that it was caused by the patch that cause the debugobjects splat in the first place a few days ago. I was comparing the performance status of the pre and post patched kernel. The pre-patch kernel failed to boot in the one of my test systems, but the post-patched kernel could. I narrowed it down to this patch as the fix for the hang problem. Cheers, Longman
On (11/22/18 14:57), Waiman Long wrote: > > [..] > >> As a side note, one of the test systems that I used generated a > >> debugobjects splat in the bootup process and the system hanged > >> afterward. Applying this patch alone fix the hanging problem and the > >> system booted up successfully. So it is not really a good idea to call > >> printk() while holding a raw spinlock. > > Right, I like this patch. > > And I think that we, maybe, can go even further. > > > > Some serial consoles call mod_timer(). So what we could have with the > > debug objects enabled was > > > > mod_timer() > > lock_timer_base() > > debug_activate() > > printk() > > call_console_drivers() > > foo_console() > > mod_timer() > > lock_timer_base() << deadlock > > > > That's one possible scenario. The other one can involve console's > > IRQ handler, uart port spinlock, mod_timer, debug objects, printk, > > and an eventual deadlock on the uart port spinlock. This one can > > be mitigated with printk_safe. But mod_timer() deadlock will require > > a different fix. > > > > So maybe we need to switch debug objects print-outs to _always_ > > printk_deferred(). Debug objects can be used in code which cannot > > do direct printk() - timekeeping is just one example. > > Actually, I don't think that was the cause of the hang. Oh, I didn't suggest that this was the case. Just talked about more problems with printk in debug objects. Serial consoles call mod_time, mod_timer calls debug objects, debug objects call printk and end up in serial console again. Serial consoles are not re-entrant at this point. > The debugobjects splat was caused by debug_object_is_on_stack(), below > was the output: > > [ 6.890048] ODEBUG: object (____ptrval____) is NOT on stack > (____ptrval____), but annotated. > [ 6.891000] WARNING: CPU: 28 PID: 1 at lib/debugobjects.c:369 > __debug_object_init.cold.11+0x51/0x2d6 [..] > 11.270654] systemd[1]: systemd 239 running in system mode. (+PAM > +AUDIT +SELINUX +IMA -APPARMOR +SMACK +SYSVINIT +UTMP +LIBCRYPTSETUP > +GCRYPT +GNUTLS +ACL +XZ +LZ4 +SECCOMP +BLKID +ELFUTILS +KMOD +IDN2 -IDN > +PCRE2 default-hierarchy=legacy) > [ 11.311307] systemd[1]: Detected architecture x86-64. > [ 11.316420] systemd[1]: Running in initial RAM disk. > > Welcome to > > The system is not responsive at this point. > > I am not totally sure what caused this. Hmm, me neither. -ss
On (11/22/18 11:16), Peter Zijlstra wrote: > > So maybe we need to switch debug objects print-outs to _always_ > > printk_deferred(). Debug objects can be used in code which cannot > > do direct printk() - timekeeping is just one example. > > No, printk_deferred() is a disease, it needs to be eradicated, not > spread around. deadlock-free printk() is deferred, but OK. Another idea then: --- diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 70935ed91125..3928c2b2f77c 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -323,10 +323,13 @@ static void debug_print_object(struct debug_obj *obj, char *msg) void *hint = descr->debug_hint ? descr->debug_hint(obj->object) : NULL; limit++; + + bust_spinlocks(1); WARN(1, KERN_ERR "ODEBUG: %s %s (active state %u) " "object type: %s hint: %pS\n", msg, obj_states[obj->state], obj->astate, descr->name, hint); + bust_spinlocks(0); } debug_objects_warnings++; } --- This should make serial consoles re-entrant. So printk->console_driver_write() hopefully will not deadlock. IOW, this turns serial consoles into early_consoles, for a while ;) -ss
On Thu 2018-11-22 14:57:02, Waiman Long wrote: > On 11/21/2018 09:04 PM, Sergey Senozhatsky wrote: > > On (11/21/18 11:49), Waiman Long wrote: > > [..] > >>> case ODEBUG_STATE_ACTIVE: > >>> - debug_print_object(obj, "init"); > >>> state = obj->state; > >>> raw_spin_unlock_irqrestore(&db->lock, flags); > >>> + debug_print_object(obj, "init"); > >>> debug_object_fixup(descr->fixup_init, addr, state); > >>> return; > >>> > >>> case ODEBUG_STATE_DESTROYED: > >>> - debug_print_object(obj, "init"); > >>> + debug_printobj = true; > >>> break; > >>> default: > >>> break; > >>> } > >>> > >>> raw_spin_unlock_irqrestore(&db->lock, flags); > >>> + if (debug_chkstack) > >>> + debug_object_is_on_stack(addr, onstack); > >>> + if (debug_printobj) > >>> + debug_print_object(obj, "init"); > >>> > > [..] > >> As a side note, one of the test systems that I used generated a > >> debugobjects splat in the bootup process and the system hanged > >> afterward. Applying this patch alone fix the hanging problem and the > >> system booted up successfully. So it is not really a good idea to call > >> printk() while holding a raw spinlock. > > Right, I like this patch. > > And I think that we, maybe, can go even further. > > > > Some serial consoles call mod_timer(). So what we could have with the > > debug objects enabled was > > > > mod_timer() > > lock_timer_base() > > debug_activate() > > printk() > > call_console_drivers() > > foo_console() > > mod_timer() > > lock_timer_base() << deadlock > > > > That's one possible scenario. The other one can involve console's > > IRQ handler, uart port spinlock, mod_timer, debug objects, printk, > > and an eventual deadlock on the uart port spinlock. This one can > > be mitigated with printk_safe. But mod_timer() deadlock will require > > a different fix. > > > > So maybe we need to switch debug objects print-outs to _always_ > > printk_deferred(). Debug objects can be used in code which cannot > > do direct printk() - timekeeping is just one example. > > > > -ss > > Actually, I don't think that was the cause of the hang. The debugobjects > splat was caused by debug_object_is_on_stack(), below was the output: > > [ 6.890048] ODEBUG: object (____ptrval____) is NOT on stack > (____ptrval____), but annotated. > [ 6.891000] WARNING: CPU: 28 PID: 1 at lib/debugobjects.c:369 > __debug_object_init.cold.11+0x51/0x2d6 > [ 6.891000] Modules linked in: > [ 6.891000] CPU: 28 PID: 1 Comm: swapper/0 Not tainted > 4.18.0-41.el8.bz1651764_cgroup_debug.x86_64+debug #1 > [ 6.891000] Hardware name: HPE ProLiant DL120 Gen10/ProLiant DL120 > Gen10, BIOS U36 11/14/2017 > [ 6.891000] RIP: 0010:__debug_object_init.cold.11+0x51/0x2d6 > [ 6.891000] Code: ea 03 80 3c 02 00 0f 85 85 02 00 00 49 8b 54 24 18 > 48 89 de 4c 89 44 24 10 48 c7 c7 00 ce 22 94 e8 73 18 62 ff 4c 8b 44 24 > 10 <0f> 0b e9 60 db ff ff 41 83 c4 01 b8 ff ff 37 00 44 89 25 ce 46 f9 > [ 6.891000] RSP: 0000:ffff880104187960 EFLAGS: 00010086 > [ 6.891000] RAX: 0000000000000050 RBX: ffffffff9764c570 RCX: > 0000000000000000 > [ 6.891000] RDX: 0000000000000000 RSI: 0000000000000000 RDI: > ffff880104178ca8 > [ 6.891000] RBP: 1ffff10020830f34 R08: ffff8807ce68a1d0 R09: > fffffbfff2923554 > [ 6.891000] R10: fffffbfff2923554 R11: ffffffff9491aaa3 R12: > ffff880104178000 > [ 6.891000] R13: ffffffff96c809b8 R14: 000000000000a370 R15: > ffff8807ce68a1c0 > [ 6.891000] FS: 0000000000000000(0000) GS:ffff8807d4200000(0000) > knlGS:0000000000000000 > [ 6.891000] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > [ 6.891000] CR2: 0000000000000000 CR3: 000000028de16001 CR4: > 00000000007606e0 > [ 6.891000] DR0: 0000000000000000 DR1: 0000000000000000 DR2: > 0000000000000000 > [ 6.891000] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: > 0000000000000400 > [ 6.891000] PKRU: 00000000 > [ 6.891000] Call Trace: > [ 6.891000] ? debug_object_fixup+0x30/0x30 > [ 6.891000] ? _raw_spin_unlock_irqrestore+0x4b/0x60 > [ 6.891000] ? __lockdep_init_map+0x12f/0x510 > [ 6.891000] ? __lockdep_init_map+0x12f/0x510 > [ 6.891000] virt_efi_get_next_variable+0xa2/0x160 > [ 6.891000] efivar_init+0x1c4/0x6d7 > [ 6.891000] ? efivar_ssdt_setup+0x3b/0x3b > [ 6.891000] ? efivar_entry_iter+0x120/0x120 > [ 6.891000] ? find_held_lock+0x3a/0x1c0 > [ 6.891000] ? lock_downgrade+0x5e0/0x5e0 > [ 6.891000] ? kmsg_dump_rewind_nolock+0xd9/0xd9 > [ 6.891000] ? _raw_spin_unlock_irqrestore+0x4b/0x60 > [ 6.891000] ? trace_hardirqs_on_caller+0x381/0x570 > [ 6.891000] ? efivar_ssdt_iter+0x1f4/0x1f4 > [ 6.891000] efisubsys_init+0x1be/0x4ae > [ 6.891000] ? kernfs_get.part.8+0x4c/0x60 > [ 6.891000] ? efivar_ssdt_iter+0x1f4/0x1f4 > [ 6.891000] ? __kernfs_create_file+0x235/0x2e0 > [ 6.891000] ? efivar_ssdt_iter+0x1f4/0x1f4 > [ 6.891000] do_one_initcall+0xe9/0x5fd > [ 6.891000] ? perf_trace_initcall_level+0x450/0x450 > [ 6.891000] ? __wake_up_common+0x5a0/0x5a0 > [ 6.891000] ? lock_downgrade+0x5e0/0x5e0 > [ 6.891000] kernel_init_freeable+0x51a/0x5f2 > [ 6.891000] ? start_kernel+0x7b8/0x7b8 > [ 6.891000] ? finish_task_switch+0x19a/0x690 > [ 6.891000] ? __switch_to_asm+0x40/0x70 > [ 6.891000] ? __switch_to_asm+0x34/0x70 > [ 6.891000] ? rest_init+0xe9/0xe9 > [ 6.891000] kernel_init+0xc/0x110 > [ 6.891000] ? rest_init+0xe9/0xe9 > [ 6.891000] ret_from_fork+0x24/0x50 > [ 6.891000] irq event stamp: 1081352 > [ 6.891000] hardirqs last enabled at (1081351): [<ffffffff93af7dab>] > _raw_spin_unlock_irqrestore+0x4b/0x60 > [ 6.891000] hardirqs last disabled at (1081352): [<ffffffff93af85c2>] > _raw_spin_lock_irqsave+0x22/0x81 > [ 6.891000] softirqs last enabled at (1081334): [<ffffffff93e006f9>] > __do_softirq+0x6f9/0xaa0 > [ 6.891000] softirqs last disabled at (1081325): [<ffffffff921b993f>] > irq_exit+0x27f/0x2d0 > [ 6.891000] ---[ end trace 15e1083fc009a526 ]--- > > All the messages above were printed while holding a raw spinlock with > IRQ disabled. Further down the bootup sequence, the system appeared to hang: > > 11.270654] systemd[1]: systemd 239 running in system mode. (+PAM > +AUDIT +SELINUX +IMA -APPARMOR +SMACK +SYSVINIT +UTMP +LIBCRYPTSETUP > +GCRYPT +GNUTLS +ACL +XZ +LZ4 +SECCOMP +BLKID +ELFUTILS +KMOD +IDN2 -IDN > +PCRE2 default-hierarchy=legacy) > [ 11.311307] systemd[1]: Detected architecture x86-64. > [ 11.316420] systemd[1]: Running in initial RAM disk. > > Welcome to > > The system is not responsive at this point. > > I am not totally sure what caused this. Maybe it was caused by disabling > IRQ for too long leading to some kind of corruption. Anyway, moving > debug_object_is_on_stack() outside of the IRQ disabled lock critical > section seemed to fix the hang problem. It is hard to say why printing the above with disabled interrupts would break anything. efivar_init() itself should get delayed the same way with and without the patch. Some clue might be in the rest of the log. It would be interesting to compare full logs of non-patched and patched system. Best Regards, Petr
On Thu 2018-11-22 15:29:35, Waiman Long wrote: > On 11/22/2018 11:02 AM, Petr Mladek wrote: > > Anyway, I wonder what was the primary motivation for this patch. > > Was it the system hang? Or was it lockdep report about nesting > > two terminal locks: db->lock, pool_lock with logbuf_lock? > > The primary motivation was to make sure that printk() won't be called > while holding either db->lock or pool_lock in the debugobjects code. In > the determination of which locks can be made terminal, I focused on > local spinlocks that won't cause boundary to an unrelated subsystem as > it will greatly complicate the analysis. Then please mention this as the real reason in the commit message. The reason that printk might take too long in IRQ context does not explain why we need the patch. printk() is called in IRQ context in many other locations. I do not see why this place should be special. The delay is the price for the chance to see the problem. > I didn't realize that it fixed a hang problem that I was seeing until I > did bisection to find out that it was caused by the patch that cause the > debugobjects splat in the first place a few days ago. I was comparing > the performance status of the pre and post patched kernel. The pre-patch > kernel failed to boot in the one of my test systems, but the > post-patched kernel could. I narrowed it down to this patch as the fix > for the hang problem. The hang is a mystery. My guess is that there is a race somewhere. The patch might have reduced the race window but it probably did not fix the race itself. Best Regards, Petr
On Fri 2018-11-23 11:40:48, Sergey Senozhatsky wrote: > On (11/22/18 11:16), Peter Zijlstra wrote: > > > So maybe we need to switch debug objects print-outs to _always_ > > > printk_deferred(). Debug objects can be used in code which cannot > > > do direct printk() - timekeeping is just one example. > > > > No, printk_deferred() is a disease, it needs to be eradicated, not > > spread around. > > deadlock-free printk() is deferred, but OK. The best solution would be lockless console drivers. Sigh. > Another idea then: > > --- > > diff --git a/lib/debugobjects.c b/lib/debugobjects.c > index 70935ed91125..3928c2b2f77c 100644 > --- a/lib/debugobjects.c > +++ b/lib/debugobjects.c > @@ -323,10 +323,13 @@ static void debug_print_object(struct debug_obj *obj, char *msg) > void *hint = descr->debug_hint ? > descr->debug_hint(obj->object) : NULL; > limit++; > + > + bust_spinlocks(1); > WARN(1, KERN_ERR "ODEBUG: %s %s (active state %u) " > "object type: %s hint: %pS\n", > msg, obj_states[obj->state], obj->astate, > descr->name, hint); > + bust_spinlocks(0); > } > debug_objects_warnings++; > } > > --- > > This should make serial consoles re-entrant. > So printk->console_driver_write() hopefully will not deadlock. Is the re-entrance safe? Some risk might be acceptable in Oops/panic situations. It is much less acceptable for random warnings. Best Regards, Petr
On (11/23/18 12:48), Petr Mladek wrote: [..] > > This should make serial consoles re-entrant. > > So printk->console_driver_write() hopefully will not deadlock. > > Is the re-entrance safe? Some risk might be acceptable in Oops/panic > situations. It is much less acceptable for random warnings. Good question. But what's the alternative? A deadlock in a serial console driver; such that even panic() is not guaranteed to make through it (at least of now). debug objects are used from the code which cannot re-entrant console drivers. bust_spinlock is called from various paths, not only panic. git grep bust_spinlocks | wc -l 62 So we already switch to re-entrant consoles (and accept the risks) in mm/fault.c, kernel/traps.c and so on. Which, I guess, makes us a little more confident, faults/traps happen often enough. It seems, that, more or less, serial consoles are ready to handle it. UART consoles in ->write() callbacks just do a bunch of writel() [for every char + \r\n]. -ss
On Mon 2018-11-26 13:57:09, Sergey Senozhatsky wrote: > On (11/23/18 12:48), Petr Mladek wrote: > [..] > > > This should make serial consoles re-entrant. > > > So printk->console_driver_write() hopefully will not deadlock. > > > > Is the re-entrance safe? Some risk might be acceptable in Oops/panic > > situations. It is much less acceptable for random warnings. > > Good question. > > But what's the alternative? A deadlock in a serial console driver; such > that even panic() is not guaranteed to make through it (at least of now). > debug objects are used from the code which cannot re-entrant console > drivers. > > bust_spinlock is called from various paths, not only panic. > git grep bust_spinlocks | wc -l > 62 bust_spinlocks() is followed by die() in several situations. The rests seems to be Oops situations where we an invalid address is being accessed. There is a nontrivial chance that the system would die anyway. Now, if I look into Documentation/core-api/debug-objects.rst, the API is used to detect: - Activation of uninitialized objects - Initialization of active objects - Usage of freed/destroyed objects Of course, all the above situations might lead to the system crash. But even in the worst case, use-after-free, there is a non-trivial chance that the data still would be valid and the system would survive. There might be many other warnings of the same severity. > So we already switch to re-entrant consoles (and accept the risks) in > mm/fault.c, kernel/traps.c and so on. Which, I guess, makes us a little > more confident, faults/traps happen often enough. Where is the border line, please? Do we want to have the kernel sources full of bust_spinlocks() callers? > It seems, that, more or less, serial consoles are ready to handle it. > UART consoles in ->write() callbacks just do a bunch of writel() [for > every char + \r\n]. But oops_in_progress does not affect only serial (UART) consoles. We want safe lockless consoles. We do not want to run a most-of-the-time-safe code too often. BTW: I have heard that someone from the RT people is working on a big printk() rewrite. One part is a lockless buffer. Another part should be a different handling of safe (lockless) and more complicated consoles. It was presented on some recent conference (I forgot which one). I do not know any details. But the first version should be sent in a near future. Best Regards, Petr
On Mon, Nov 19, 2018 at 01:55:16PM -0500, Waiman Long wrote: > The db->lock is a raw spinlock and so the lock hold time is supposed > to be short. This will not be the case when printk() is being involved > in some of the critical sections. In order to avoid the long hold time, > in case some messages need to be printed, the debug_object_is_on_stack() > and debug_print_object() calls are now moved out of those critical > sections. That's not why you did this patch though; you want to make these locks terminal locks and that means no printk() inside, as that uses locks again. Please write relevant changelogs.
diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 403dd95..4216d3d 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -376,6 +376,8 @@ static void debug_object_is_on_stack(void *addr, int onstack) struct debug_bucket *db; struct debug_obj *obj; unsigned long flags; + bool debug_printobj = false; + bool debug_chkstack = false; fill_pool(); @@ -392,7 +394,7 @@ static void debug_object_is_on_stack(void *addr, int onstack) debug_objects_oom(); return; } - debug_object_is_on_stack(addr, onstack); + debug_chkstack = true; } switch (obj->state) { @@ -403,20 +405,25 @@ static void debug_object_is_on_stack(void *addr, int onstack) break; case ODEBUG_STATE_ACTIVE: - debug_print_object(obj, "init"); state = obj->state; raw_spin_unlock_irqrestore(&db->lock, flags); + debug_print_object(obj, "init"); debug_object_fixup(descr->fixup_init, addr, state); return; case ODEBUG_STATE_DESTROYED: - debug_print_object(obj, "init"); + debug_printobj = true; break; default: break; } raw_spin_unlock_irqrestore(&db->lock, flags); + if (debug_chkstack) + debug_object_is_on_stack(addr, onstack); + if (debug_printobj) + debug_print_object(obj, "init"); + } /** @@ -474,6 +481,8 @@ int debug_object_activate(void *addr, struct debug_obj_descr *descr) obj = lookup_object(addr, db); if (obj) { + bool debug_printobj = false; + switch (obj->state) { case ODEBUG_STATE_INIT: case ODEBUG_STATE_INACTIVE: @@ -482,14 +491,14 @@ int debug_object_activate(void *addr, struct debug_obj_descr *descr) break; case ODEBUG_STATE_ACTIVE: - debug_print_object(obj, "activate"); state = obj->state; raw_spin_unlock_irqrestore(&db->lock, flags); + debug_print_object(obj, "activate"); ret = debug_object_fixup(descr->fixup_activate, addr, state); return ret ? 0 : -EINVAL; case ODEBUG_STATE_DESTROYED: - debug_print_object(obj, "activate"); + debug_printobj = true; ret = -EINVAL; break; default: @@ -497,10 +506,13 @@ int debug_object_activate(void *addr, struct debug_obj_descr *descr) break; } raw_spin_unlock_irqrestore(&db->lock, flags); + if (debug_printobj) + debug_print_object(obj, "activate"); return ret; } raw_spin_unlock_irqrestore(&db->lock, flags); + /* * We are here when a static object is activated. We * let the type specific code confirm whether this is @@ -532,6 +544,7 @@ void debug_object_deactivate(void *addr, struct debug_obj_descr *descr) struct debug_bucket *db; struct debug_obj *obj; unsigned long flags; + bool debug_printobj = false; if (!debug_objects_enabled) return; @@ -549,24 +562,27 @@ void debug_object_deactivate(void *addr, struct debug_obj_descr *descr) if (!obj->astate) obj->state = ODEBUG_STATE_INACTIVE; else - debug_print_object(obj, "deactivate"); + debug_printobj = true; break; case ODEBUG_STATE_DESTROYED: - debug_print_object(obj, "deactivate"); + debug_printobj = true; break; default: break; } - } else { + } + + raw_spin_unlock_irqrestore(&db->lock, flags); + if (!obj) { struct debug_obj o = { .object = addr, .state = ODEBUG_STATE_NOTAVAILABLE, .descr = descr }; debug_print_object(&o, "deactivate"); + } else if (debug_printobj) { + debug_print_object(obj, "deactivate"); } - - raw_spin_unlock_irqrestore(&db->lock, flags); } EXPORT_SYMBOL_GPL(debug_object_deactivate); @@ -581,6 +597,7 @@ void debug_object_destroy(void *addr, struct debug_obj_descr *descr) struct debug_bucket *db; struct debug_obj *obj; unsigned long flags; + bool debug_printobj = false; if (!debug_objects_enabled) return; @@ -600,20 +617,22 @@ void debug_object_destroy(void *addr, struct debug_obj_descr *descr) obj->state = ODEBUG_STATE_DESTROYED; break; case ODEBUG_STATE_ACTIVE: - debug_print_object(obj, "destroy"); state = obj->state; raw_spin_unlock_irqrestore(&db->lock, flags); + debug_print_object(obj, "destroy"); debug_object_fixup(descr->fixup_destroy, addr, state); return; case ODEBUG_STATE_DESTROYED: - debug_print_object(obj, "destroy"); + debug_printobj = true; break; default: break; } out_unlock: raw_spin_unlock_irqrestore(&db->lock, flags); + if (debug_printobj) + debug_print_object(obj, "destroy"); } EXPORT_SYMBOL_GPL(debug_object_destroy); @@ -642,9 +661,9 @@ void debug_object_free(void *addr, struct debug_obj_descr *descr) switch (obj->state) { case ODEBUG_STATE_ACTIVE: - debug_print_object(obj, "free"); state = obj->state; raw_spin_unlock_irqrestore(&db->lock, flags); + debug_print_object(obj, "free"); debug_object_fixup(descr->fixup_free, addr, state); return; default: @@ -717,6 +736,7 @@ void debug_object_assert_init(void *addr, struct debug_obj_descr *descr) struct debug_bucket *db; struct debug_obj *obj; unsigned long flags; + bool debug_printobj = false; if (!debug_objects_enabled) return; @@ -732,22 +752,25 @@ void debug_object_assert_init(void *addr, struct debug_obj_descr *descr) if (obj->astate == expect) obj->astate = next; else - debug_print_object(obj, "active_state"); + debug_printobj = true; break; default: - debug_print_object(obj, "active_state"); + debug_printobj = true; break; } - } else { + } + + raw_spin_unlock_irqrestore(&db->lock, flags); + if (!obj) { struct debug_obj o = { .object = addr, .state = ODEBUG_STATE_NOTAVAILABLE, .descr = descr }; debug_print_object(&o, "active_state"); + } else if (debug_printobj) { + debug_print_object(obj, "active_state"); } - - raw_spin_unlock_irqrestore(&db->lock, flags); } EXPORT_SYMBOL_GPL(debug_object_active_state); @@ -783,10 +806,10 @@ static void __debug_check_no_obj_freed(const void *address, unsigned long size) switch (obj->state) { case ODEBUG_STATE_ACTIVE: - debug_print_object(obj, "free"); descr = obj->descr; state = obj->state; raw_spin_unlock_irqrestore(&db->lock, flags); + debug_print_object(obj, "free"); debug_object_fixup(descr->fixup_free, (void *) oaddr, state); goto repeat;
The db->lock is a raw spinlock and so the lock hold time is supposed to be short. This will not be the case when printk() is being involved in some of the critical sections. In order to avoid the long hold time, in case some messages need to be printed, the debug_object_is_on_stack() and debug_print_object() calls are now moved out of those critical sections. Signed-off-by: Waiman Long <longman@redhat.com> --- lib/debugobjects.c | 61 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 42 insertions(+), 19 deletions(-)