[v2] mm/page_isolation: fix a deadlock with printk()
diff mbox series

Message ID 1570228005-24979-1-git-send-email-cai@lca.pw
State New
Headers show
Series
  • [v2] mm/page_isolation: fix a deadlock with printk()
Related show

Commit Message

Qian Cai Oct. 4, 2019, 10:26 p.m. UTC
It is unsafe to call printk() while zone->lock is held, i.e.,

zone->lock --> console_lock

because the console could always allocate some memory in different code
paths and form locking chains in an opposite order,

console_lock --> * --> zone->lock

As a result, it triggers lockdep splats like the one below and in different
code paths in this thread [1]. has_unmovable_pages() is only used
in set_migratetype_isolate() and is_pageblock_removable_nolock(), and only
the former will set the REPORT_FAILURE flag which will call printk().
Hence, unlock the zone->lock just before the dump_page() there;
when has_unmovable_pages() returns true, there is no need to hold the
lock anyway in the rest of set_migratetype_isolate().

While at it, also remove a problematic debugging-only printk() in
__offline_isolated_pages(), which will always disable lockdep on debug
kernels.

The problem has probably existed forever, but few developers run
memory offline with lockdep enabled, and admins in the field have not
yet been lucky enough to hit the perfect timing required to trigger a
real deadlock. In addition, there aren't many places that call printk()
while zone->lock is held.

WARNING: possible circular locking dependency detected
------------------------------------------------------
test.sh/1724 is trying to acquire lock:
0000000052059ec0 (console_owner){-...}, at: console_unlock+0x
01: 328/0xa30

but task is already holding lock:
000000006ffd89c8 (&(&zone->lock)->rlock){-.-.}, at: start_iso
01: late_page_range+0x216/0x538

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #2 (&(&zone->lock)->rlock){-.-.}:
       lock_acquire+0x21a/0x468
       _raw_spin_lock+0x54/0x68
       get_page_from_freelist+0x8b6/0x2d28
       __alloc_pages_nodemask+0x246/0x658
       __get_free_pages+0x34/0x78
       sclp_init+0x106/0x690
       sclp_register+0x2e/0x248
       sclp_rw_init+0x4a/0x70
       sclp_console_init+0x4a/0x1b8
       console_init+0x2c8/0x410
       start_kernel+0x530/0x6a0
       startup_continue+0x70/0xd0

-> #1 (sclp_lock){-.-.}:
       lock_acquire+0x21a/0x468
       _raw_spin_lock_irqsave+0xcc/0xe8
       sclp_add_request+0x34/0x308
       sclp_conbuf_emit+0x100/0x138
       sclp_console_write+0x96/0x3b8
       console_unlock+0x6dc/0xa30
       vprintk_emit+0x184/0x3c8
       vprintk_default+0x44/0x50
       printk+0xa8/0xc0
       iommu_debugfs_setup+0xf2/0x108
       iommu_init+0x6c/0x78
       do_one_initcall+0x162/0x680
       kernel_init_freeable+0x4e8/0x5a8
       kernel_init+0x2a/0x188
       ret_from_fork+0x30/0x34
       kernel_thread_starter+0x0/0xc

-> #0 (console_owner){-...}:
       check_noncircular+0x338/0x3e0
       __lock_acquire+0x1e66/0x2d88
       lock_acquire+0x21a/0x468
       console_unlock+0x3a6/0xa30
       vprintk_emit+0x184/0x3c8
       vprintk_default+0x44/0x50
       printk+0xa8/0xc0
       __dump_page+0x1dc/0x710
       dump_page+0x2e/0x58
       has_unmovable_pages+0x2e8/0x470
       start_isolate_page_range+0x404/0x538
       __offline_pages+0x22c/0x1338
       memory_subsys_offline+0xa6/0xe8
       device_offline+0xe6/0x118
       state_store+0xf0/0x110
       kernfs_fop_write+0x1bc/0x270
       vfs_write+0xce/0x220
       ksys_write+0xea/0x190
       system_call+0xd8/0x2b4

other info that might help us debug this:

Chain exists of:
  console_owner --> sclp_lock --> &(&zone->lock)->rlock

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(&(&zone->lock)->rlock);
                               lock(sclp_lock);
                               lock(&(&zone->lock)->rlock);
  lock(console_owner);

 *** DEADLOCK ***

9 locks held by test.sh/1724:
 #0: 000000000e925408 (sb_writers#4){.+.+}, at: vfs_write+0x201:
 #1: 0000000050aa4280 (&of->mutex){+.+.}, at: kernfs_fop_write:
 #2: 0000000062e5c628 (kn->count#198){.+.+}, at: kernfs_fop_write
 #3: 00000000523236a0 (device_hotplug_lock){+.+.}, at:
lock_device_hotplug_sysfs+0x30/0x80
 #4: 0000000062e70990 (&dev->mutex){....}, at: device_offline
 #5: 0000000051fd36b0 (cpu_hotplug_lock.rw_sem){++++}, at:
__offline_pages+0xec/0x1338
 #6: 00000000521ca470 (mem_hotplug_lock.rw_sem){++++}, at:
percpu_down_write+0x38/0x210
 #7: 000000006ffd89c8 (&(&zone->lock)->rlock){-.-.}, at:
start_isolate_page_range+0x216/0x538
 #8: 000000005205a100 (console_lock){+.+.}, at: vprintk_emit

stack backtrace:
Hardware name: IBM 2964 N96 400 (z/VM 6.4.0)
Call Trace:
([<00000000512ae218>] show_stack+0x110/0x1b0)
 [<0000000051b6d506>] dump_stack+0x126/0x178
 [<00000000513a4b08>] check_noncircular+0x338/0x3e0
 [<00000000513aaaf6>] __lock_acquire+0x1e66/0x2d88
 [<00000000513a7e12>] lock_acquire+0x21a/0x468
 [<00000000513bb2fe>] console_unlock+0x3a6/0xa30
 [<00000000513bde2c>] vprintk_emit+0x184/0x3c8
 [<00000000513be0b4>] vprintk_default+0x44/0x50
 [<00000000513beb60>] printk+0xa8/0xc0
 [<000000005158c364>] __dump_page+0x1dc/0x710
 [<000000005158c8c6>] dump_page+0x2e/0x58
 [<00000000515d87c8>] has_unmovable_pages+0x2e8/0x470
 [<000000005167072c>] start_isolate_page_range+0x404/0x538
 [<0000000051b96de4>] __offline_pages+0x22c/0x1338
 [<0000000051908586>] memory_subsys_offline+0xa6/0xe8
 [<00000000518e561e>] device_offline+0xe6/0x118
 [<0000000051908170>] state_store+0xf0/0x110
 [<0000000051796384>] kernfs_fop_write+0x1bc/0x270
 [<000000005168972e>] vfs_write+0xce/0x220
 [<0000000051689b9a>] ksys_write+0xea/0x190
 [<0000000051ba9990>] system_call+0xd8/0x2b4
INFO: lockdep is turned off.

[1] https://lore.kernel.org/lkml/1568817579.5576.172.camel@lca.pw/

Signed-off-by: Qian Cai <cai@lca.pw>
---

v2: unlock zone->lock in has_unmovable_pages() where necessary.

 include/linux/page-isolation.h |  2 +-
 mm/memory_hotplug.c            |  3 ++-
 mm/page_alloc.c                | 12 +++++-------
 mm/page_isolation.c            |  7 +++++--
 4 files changed, 13 insertions(+), 11 deletions(-)

Comments

Michal Hocko Oct. 7, 2019, 8:07 a.m. UTC | #1
On Fri 04-10-19 18:26:45, Qian Cai wrote:
> It is unsafe to call printk() while zone->lock was held, i.e.,
> 
> zone->lock --> console_lock
> 
> because the console could always allocate some memory in different code
> paths and form locking chains in an opposite order,
> 
> console_lock --> * --> zone->lock
> 
> As the result, it triggers lockdep splats like below and in different
> code paths in this thread [1]. Since has_unmovable_pages() was only used
> in set_migratetype_isolate() and is_pageblock_removable_nolock(). Only
> the former will set the REPORT_FAILURE flag which will call printk().
> Hence, unlock the zone->lock just before the dump_page() there where
> when has_unmovable_pages() returns true, there is no need to hold the
> lock anyway in the rest of set_migratetype_isolate().
> 
> While at it, remove a problematic printk() in __offline_isolated_pages()
> only for debugging as well which will always disable lockdep on debug
> kernels.

I do not think that removing the printk is the right long term solution.
While I do agree that removing the debugging printk from __offline_isolated_pages
does make sense because it is essentially of very limited use, this
doesn't really solve the underlying problem.  There are likely other
printks under zone->lock. It would be much saner to actually
disallow consoles to allocate any memory while printk is called from an
atomic context.

> The problem is probably there forever, but neither many developers will
> run memory offline with the lockdep enabled nor admins in the field are
> lucky enough yet to hit a perfect timing which required to trigger a
> real deadlock. In addition, there aren't many places that call printk()
> while zone->lock was held.
> 
> WARNING: possible circular locking dependency detected
> ------------------------------------------------------
> test.sh/1724 is trying to acquire lock:
> 0000000052059ec0 (console_owner){-...}, at: console_unlock+0x
> 01: 328/0xa30
> 
> but task is already holding lock:
> 000000006ffd89c8 (&(&zone->lock)->rlock){-.-.}, at: start_iso
> 01: late_page_range+0x216/0x538

I am also wondering what does this lockdep report actually say. How come
we have a dependency between a start_kernel path and a syscall?

> which lock already depends on the new lock.
> 
> the existing dependency chain (in reverse order) is:
> 
> -> #2 (&(&zone->lock)->rlock){-.-.}:
>        lock_acquire+0x21a/0x468
>        _raw_spin_lock+0x54/0x68
>        get_page_from_freelist+0x8b6/0x2d28
>        __alloc_pages_nodemask+0x246/0x658
>        __get_free_pages+0x34/0x78
>        sclp_init+0x106/0x690
>        sclp_register+0x2e/0x248
>        sclp_rw_init+0x4a/0x70
>        sclp_console_init+0x4a/0x1b8
>        console_init+0x2c8/0x410
>        start_kernel+0x530/0x6a0
>        startup_continue+0x70/0xd0
> 
> -> #1 (sclp_lock){-.-.}:
>        lock_acquire+0x21a/0x468
>        _raw_spin_lock_irqsave+0xcc/0xe8
>        sclp_add_request+0x34/0x308
>        sclp_conbuf_emit+0x100/0x138
>        sclp_console_write+0x96/0x3b8
>        console_unlock+0x6dc/0xa30
>        vprintk_emit+0x184/0x3c8
>        vprintk_default+0x44/0x50
>        printk+0xa8/0xc0
>        iommu_debugfs_setup+0xf2/0x108
>        iommu_init+0x6c/0x78
>        do_one_initcall+0x162/0x680
>        kernel_init_freeable+0x4e8/0x5a8
>        kernel_init+0x2a/0x188
>        ret_from_fork+0x30/0x34
>        kernel_thread_starter+0x0/0xc
> 
> -> #0 (console_owner){-...}:
>        check_noncircular+0x338/0x3e0
>        __lock_acquire+0x1e66/0x2d88
>        lock_acquire+0x21a/0x468
>        console_unlock+0x3a6/0xa30
>        vprintk_emit+0x184/0x3c8
>        vprintk_default+0x44/0x50
>        printk+0xa8/0xc0
>        __dump_page+0x1dc/0x710
>        dump_page+0x2e/0x58
>        has_unmovable_pages+0x2e8/0x470
>        start_isolate_page_range+0x404/0x538
>        __offline_pages+0x22c/0x1338
>        memory_subsys_offline+0xa6/0xe8
>        device_offline+0xe6/0x118
>        state_store+0xf0/0x110
>        kernfs_fop_write+0x1bc/0x270
>        vfs_write+0xce/0x220
>        ksys_write+0xea/0x190
>        system_call+0xd8/0x2b4
> 
> other info that might help us debug this:
> 
> Chain exists of:
>   console_owner --> sclp_lock --> &(&zone->lock)->rlock
> 
>  Possible unsafe locking scenario:
> 
>        CPU0                    CPU1
>        ----                    ----
>   lock(&(&zone->lock)->rlock);
>                                lock(sclp_lock);
>                                lock(&(&zone->lock)->rlock);
>   lock(console_owner);
> 
>  *** DEADLOCK ***
> 
> 9 locks held by test.sh/1724:
>  #0: 000000000e925408 (sb_writers#4){.+.+}, at: vfs_write+0x201:
>  #1: 0000000050aa4280 (&of->mutex){+.+.}, at: kernfs_fop_write:
>  #2: 0000000062e5c628 (kn->count#198){.+.+}, at: kernfs_fop_write
>  #3: 00000000523236a0 (device_hotplug_lock){+.+.}, at:
> lock_device_hotplug_sysfs+0x30/0x80
>  #4: 0000000062e70990 (&dev->mutex){....}, at: device_offline
>  #5: 0000000051fd36b0 (cpu_hotplug_lock.rw_sem){++++}, at:
> __offline_pages+0xec/0x1338
>  #6: 00000000521ca470 (mem_hotplug_lock.rw_sem){++++}, at:
> percpu_down_write+0x38/0x210
>  #7: 000000006ffd89c8 (&(&zone->lock)->rlock){-.-.}, at:
> start_isolate_page_range+0x216/0x538
>  #8: 000000005205a100 (console_lock){+.+.}, at: vprintk_emit
> 
> stack backtrace:
> Hardware name: IBM 2964 N96 400 (z/VM 6.4.0)
> Call Trace:
> ([<00000000512ae218>] show_stack+0x110/0x1b0)
>  [<0000000051b6d506>] dump_stack+0x126/0x178
>  [<00000000513a4b08>] check_noncircular+0x338/0x3e0
>  [<00000000513aaaf6>] __lock_acquire+0x1e66/0x2d88
>  [<00000000513a7e12>] lock_acquire+0x21a/0x468
>  [<00000000513bb2fe>] console_unlock+0x3a6/0xa30
>  [<00000000513bde2c>] vprintk_emit+0x184/0x3c8
>  [<00000000513be0b4>] vprintk_default+0x44/0x50
>  [<00000000513beb60>] printk+0xa8/0xc0
>  [<000000005158c364>] __dump_page+0x1dc/0x710
>  [<000000005158c8c6>] dump_page+0x2e/0x58
>  [<00000000515d87c8>] has_unmovable_pages+0x2e8/0x470
>  [<000000005167072c>] start_isolate_page_range+0x404/0x538
>  [<0000000051b96de4>] __offline_pages+0x22c/0x1338
>  [<0000000051908586>] memory_subsys_offline+0xa6/0xe8
>  [<00000000518e561e>] device_offline+0xe6/0x118
>  [<0000000051908170>] state_store+0xf0/0x110
>  [<0000000051796384>] kernfs_fop_write+0x1bc/0x270
>  [<000000005168972e>] vfs_write+0xce/0x220
>  [<0000000051689b9a>] ksys_write+0xea/0x190
>  [<0000000051ba9990>] system_call+0xd8/0x2b4
> INFO: lockdep is turned off.
Petr Mladek Oct. 7, 2019, 9:05 a.m. UTC | #2
On Mon 2019-10-07 10:07:42, Michal Hocko wrote:
> On Fri 04-10-19 18:26:45, Qian Cai wrote:
> > It is unsafe to call printk() while zone->lock was held, i.e.,
> > 
> > zone->lock --> console_lock
> > 
> > because the console could always allocate some memory in different code
> > paths and form locking chains in an opposite order,
> > 
> > console_lock --> * --> zone->lock
> > 
> > As the result, it triggers lockdep splats like below and in different
> > code paths in this thread [1]. Since has_unmovable_pages() was only used
> > in set_migratetype_isolate() and is_pageblock_removable_nolock(). Only
> > the former will set the REPORT_FAILURE flag which will call printk().
> > Hence, unlock the zone->lock just before the dump_page() there where
> > when has_unmovable_pages() returns true, there is no need to hold the
> > lock anyway in the rest of set_migratetype_isolate().
> > 
> > While at it, remove a problematic printk() in __offline_isolated_pages()
> > only for debugging as well which will always disable lockdep on debug
> > kernels.
> 
> I do not think that removing the printk is the right long term solution.
> While I do agree that removing the debugging printk __offline_isolated_pages
> does make sense because it is essentially of a very limited use, this
> doesn't really solve the underlying problem.  There are likely other
> printks from zone->lock. It would be much more saner to actually
> disallow consoles to allocate any memory while printk is called from an
> atomic context.

The current "standard" solution for these situations is to replace
the problematic printk() with printk_deferred(). It would defer
the console handling.

Of course, this is a whack-a-mole approach. The long term solution
is to defer printk() by default. We finally agreed on this a
few weeks ago at the Plumbers conference. It is going to be added
together with a fully lockless log buffer, hopefully soon. It will
be part of upstreaming the Real-Time related code.

> > The problem is probably there forever, but neither many developers will
> > run memory offline with the lockdep enabled nor admins in the field are
> > lucky enough yet to hit a perfect timing which required to trigger a
> > real deadlock. In addition, there aren't many places that call printk()
> > while zone->lock was held.
> > 
> > WARNING: possible circular locking dependency detected
> > ------------------------------------------------------
> > test.sh/1724 is trying to acquire lock:
> > 0000000052059ec0 (console_owner){-...}, at: console_unlock+0x
> > 01: 328/0xa30
> > 
> > but task is already holding lock:
> > 000000006ffd89c8 (&(&zone->lock)->rlock){-.-.}, at: start_iso
> > 01: late_page_range+0x216/0x538
> 
> I am also wondering what does this lockdep report actually say. How come
> we have a dependency between a start_kernel path and a syscall?

My understanding is that these are different code paths, where each
code path shows one existing lock ordering.

IMHO, it is possible that these code paths could never run in
parallel. I guess that lockdep is not able to distinguish
code paths that are called only during boot from others
that are called only in a fully booted system. That said,
I am not sure if this is the case here.

Best Regards,
Petr
Qian Cai Oct. 7, 2019, 11:04 a.m. UTC | #3
> On Oct 7, 2019, at 4:07 AM, Michal Hocko <mhocko@kernel.org> wrote:
> 
> I do not think that removing the printk is the right long term solution.
> While I do agree that removing the debugging printk __offline_isolated_pages
> does make sense because it is essentially of a very limited use, this
> doesn't really solve the underlying problem.  There are likely other
> printks from zone->lock. It would be much more saner to actually
> disallow consoles to allocate any memory while printk is called from an
> atomic context.

No, there is only a handful of places called printk() from zone->lock. It is normal that the callers will quietly process “struct zone” modification in a short section with zone->lock held.

No, it is not about "allocate any memory while printk is called from an
atomic context". It is an opposite lock chain from different processors which has the same effect. For example,

CPU0:                 CPU1:         CPU2:
console_owner
                            sclp_lock
sclp_lock                                 zone_lock
                            zone_lock
                                                 console_owner

Here it is a deadlock.

> 
>> The problem is probably there forever, but neither many developers will
>> run memory offline with the lockdep enabled nor admins in the field are
>> lucky enough yet to hit a perfect timing which required to trigger a
>> real deadlock. In addition, there aren't many places that call printk()
>> while zone->lock was held.
>> 
>> WARNING: possible circular locking dependency detected
>> ------------------------------------------------------
>> test.sh/1724 is trying to acquire lock:
>> 0000000052059ec0 (console_owner){-...}, at: console_unlock+0x
>> 01: 328/0xa30
>> 
>> but task is already holding lock:
>> 000000006ffd89c8 (&(&zone->lock)->rlock){-.-.}, at: start_iso
>> 01: late_page_range+0x216/0x538
> 
> Show Quoted Content
>> The problem is probably there forever, but neither many developers will
>> run memory offline with the lockdep enabled nor admins in the field are
>> lucky enough yet to hit a perfect timing which required to trigger a
>> real deadlock. In addition, there aren't many places that call printk()
>> while zone->lock was held.
>> 
>> WARNING: possible circular locking dependency detected
>> ------------------------------------------------------
>> test.sh/1724 is trying to acquire lock:
>> 0000000052059ec0 (console_owner){-...}, at: console_unlock+0x
>> 01: 328/0xa30
>> 
>> but task is already holding lock:
>> 000000006ffd89c8 (&(&zone->lock)->rlock){-.-.}, at: start_iso
>> 01: late_page_range+0x216/0x538
> 
> 
> I am also wondering what does this lockdep report actually say. How come
> we have a dependency between a start_kernel path and a syscall?

Petr explained it correctly.
Michal Hocko Oct. 7, 2019, 11:33 a.m. UTC | #4
On Mon 07-10-19 11:05:53, Petr Mladek wrote:
> On Mon 2019-10-07 10:07:42, Michal Hocko wrote:
> > On Fri 04-10-19 18:26:45, Qian Cai wrote:
> > > It is unsafe to call printk() while zone->lock was held, i.e.,
> > > 
> > > zone->lock --> console_lock
> > > 
> > > because the console could always allocate some memory in different code
> > > paths and form locking chains in an opposite order,
> > > 
> > > console_lock --> * --> zone->lock
> > > 
> > > As the result, it triggers lockdep splats like below and in different
> > > code paths in this thread [1]. Since has_unmovable_pages() was only used
> > > in set_migratetype_isolate() and is_pageblock_removable_nolock(). Only
> > > the former will set the REPORT_FAILURE flag which will call printk().
> > > Hence, unlock the zone->lock just before the dump_page() there where
> > > when has_unmovable_pages() returns true, there is no need to hold the
> > > lock anyway in the rest of set_migratetype_isolate().
> > > 
> > > While at it, remove a problematic printk() in __offline_isolated_pages()
> > > only for debugging as well which will always disable lockdep on debug
> > > kernels.
> > 
> > I do not think that removing the printk is the right long term solution.
> > While I do agree that removing the debugging printk __offline_isolated_pages
> > does make sense because it is essentially of a very limited use, this
> > doesn't really solve the underlying problem.  There are likely other
> > printks from zone->lock. It would be much more saner to actually
> > disallow consoles to allocate any memory while printk is called from an
> > atomic context.
> 
> The current "standard" solution for these situations is to replace
> the problematic printk() with printk_deferred(). It would deffer
> the console handling.
> 
> Of course, this is a whack a mole approach. The long term solution
> is to deffer printk() by default. We have finally agreed on this
> few weeks ago on Plumbers conference. It is going to be added
> together with fully lockless log buffer hopefully soon. It will
> be part of upstreaming Real-Time related code.

OK, then we do not really have to do anything here. That is good to hear
because I really detest putting printk_deferred or anything like
that at random places.
Michal Hocko Oct. 7, 2019, 11:37 a.m. UTC | #5
On Mon 07-10-19 07:04:00, Qian Cai wrote:
> 
> 
> > On Oct 7, 2019, at 4:07 AM, Michal Hocko <mhocko@kernel.org> wrote:
> > 
> > I do not think that removing the printk is the right long term solution.
> > While I do agree that removing the debugging printk __offline_isolated_pages
> > does make sense because it is essentially of a very limited use, this
> > doesn't really solve the underlying problem.  There are likely other
> > printks from zone->lock. It would be much more saner to actually
> > disallow consoles to allocate any memory while printk is called from an
> > atomic context.
> 
> No, there is only a handful of places called printk() from
> zone->lock. It is normal that the callers will quietly process
> “struct zone” modification in a short section with zone->lock
> held.

It is extremely error prone to have any zone->lock vs. printk
dependency. I do not want to play an endless whack a mole.

> No, it is not about “allocate any memory while printk is called from an
> atomic context”. It is opposite lock chain  from different processors which has the same effect. For example,
> 
> CPU0:                 CPU1:         CPU2:
> console_owner
>                             sclp_lock
> sclp_lock                                 zone_lock
>                             zone_lock
>                                                  console_owner

Why would sclp_lock ever take a zone->lock (apart from an allocation).
So really if sclp_lock is a lock that might be taken from many contexts
and generate very subtle lock dependencies then it should better be
really careful what it is calling into.

In other words you are trying to fix a wrong end of the problem. Fix the
console to not allocate or depend on MM by other means.
Qian Cai Oct. 7, 2019, 12:11 p.m. UTC | #6
On Mon, 2019-10-07 at 13:37 +0200, Michal Hocko wrote:
> On Mon 07-10-19 07:04:00, Qian Cai wrote:
> > 
> > 
> > > On Oct 7, 2019, at 4:07 AM, Michal Hocko <mhocko@kernel.org> wrote:
> > > 
> > > I do not think that removing the printk is the right long term solution.
> > > While I do agree that removing the debugging printk __offline_isolated_pages
> > > does make sense because it is essentially of a very limited use, this
> > > doesn't really solve the underlying problem.  There are likely other
> > > printks from zone->lock. It would be much more saner to actually
> > > disallow consoles to allocate any memory while printk is called from an
> > > atomic context.
> > 
> > No, there is only a handful of places called printk() from
> > zone->lock. It is normal that the callers will quietly process
> > “struct zone” modification in a short section with zone->lock
> > held.
> 
> It is extremely error prone to have any zone->lock vs. printk
> dependency. I do not want to play an endless whack a mole.
> 
> > No, it is not about “allocate any memory while printk is called from an
> > atomic context”. It is opposite lock chain  from different processors which has the same effect. For example,
> > 
> > CPU0:                 CPU1:         CPU2:
> > console_owner
> >                             sclp_lock
> > sclp_lock                                 zone_lock
> >                             zone_lock
> >                                                  console_owner
> 
> Why would sclp_lock ever take a zone->lock (apart from an allocation).
> So really if sclp_lock is a lock that might be taken from many contexts
> and generate very subtle lock dependencies then it should better be
> really careful what it is calling into.
> 
> In other words you are trying to fix a wrong end of the problem. Fix the
> console to not allocate or depend on MM by other means.

It looks like there are way too many places that could generate those indirect lock
chains, and it is hard to eliminate them all. Here is another example, where it
has,

console_owner -> port_lock
port_lock -> zone_lock

[  297.425922] -> #3 (&(&zone->lock)->rlock){-.-.}:
[  297.425925]        __lock_acquire+0x5b3/0xb40
[  297.425925]        lock_acquire+0x126/0x280
[  297.425926]        _raw_spin_lock+0x2f/0x40
[  297.425927]        rmqueue_bulk.constprop.21+0xb6/0x1160
[  297.425928]        get_page_from_freelist+0x898/0x22c0
[  297.425928]        __alloc_pages_nodemask+0x2f3/0x1cd0
[  297.425929]        alloc_pages_current+0x9c/0x110
[  297.425930]        allocate_slab+0x4c6/0x19c0
[  297.425931]        new_slab+0x46/0x70
[  297.425931]        ___slab_alloc+0x58b/0x960
[  297.425932]        __slab_alloc+0x43/0x70
[  297.425933]        __kmalloc+0x3ad/0x4b0
[  297.425933]        __tty_buffer_request_room+0x100/0x250
[  297.425934]        tty_insert_flip_string_fixed_flag+0x67/0x110
[  297.425935]        pty_write+0xa2/0xf0
[  297.425936]        n_tty_write+0x36b/0x7b0
[  297.425936]        tty_write+0x284/0x4c0
[  297.425937]        __vfs_write+0x50/0xa0
[  297.425938]        vfs_write+0x105/0x290
[  297.425939]        redirected_tty_write+0x6a/0xc0
[  297.425939]        do_iter_write+0x248/0x2a0
[  297.425940]        vfs_writev+0x106/0x1e0
[  297.425941]        do_writev+0xd4/0x180
[  297.425941]        __x64_sys_writev+0x45/0x50
[  297.425942]        do_syscall_64+0xcc/0x76c
[  297.425943]        entry_SYSCALL_64_after_hwframe+0x49/0xbe
Qian Cai Oct. 7, 2019, 12:34 p.m. UTC | #7
On Mon, 2019-10-07 at 11:05 +0200, Petr Mladek wrote:
> On Mon 2019-10-07 10:07:42, Michal Hocko wrote:
> > On Fri 04-10-19 18:26:45, Qian Cai wrote:
> > > It is unsafe to call printk() while zone->lock was held, i.e.,
> > > 
> > > zone->lock --> console_lock
> > > 
> > > because the console could always allocate some memory in different code
> > > paths and form locking chains in an opposite order,
> > > 
> > > console_lock --> * --> zone->lock
> > > 
> > > As the result, it triggers lockdep splats like below and in different
> > > code paths in this thread [1]. Since has_unmovable_pages() was only used
> > > in set_migratetype_isolate() and is_pageblock_removable_nolock(). Only
> > > the former will set the REPORT_FAILURE flag which will call printk().
> > > Hence, unlock the zone->lock just before the dump_page() there where
> > > when has_unmovable_pages() returns true, there is no need to hold the
> > > lock anyway in the rest of set_migratetype_isolate().
> > > 
> > > While at it, remove a problematic printk() in __offline_isolated_pages()
> > > only for debugging as well which will always disable lockdep on debug
> > > kernels.
> > 
> > I do not think that removing the printk is the right long term solution.
> > While I do agree that removing the debugging printk __offline_isolated_pages
> > does make sense because it is essentially of a very limited use, this
> > doesn't really solve the underlying problem.  There are likely other
> > printks from zone->lock. It would be much more saner to actually
> > disallow consoles to allocate any memory while printk is called from an
> > atomic context.
> 
> The current "standard" solution for these situations is to replace
> the problematic printk() with printk_deferred(). It would deffer
> the console handling.
> 
> Of course, this is a whack a mole approach. The long term solution
> is to deffer printk() by default. We have finally agreed on this
> few weeks ago on Plumbers conference. It is going to be added
> together with fully lockless log buffer hopefully soon. It will
> be part of upstreaming Real-Time related code.

Does this guarantee that if,

lock(zone->lock)
printk_deferred()
unlock(zone->lock)

that the locks (console_owner and console_sem) in printk_deferred() will always
be processed by the unlock(zone->lock)?

If it is more of timing thing where klogd wakes up, it could still end up with,

zone_lock -> console_owner/console_sem

that causes a deadlock.
Michal Hocko Oct. 7, 2019, 12:43 p.m. UTC | #8
On Mon 07-10-19 08:11:44, Qian Cai wrote:
> On Mon, 2019-10-07 at 13:37 +0200, Michal Hocko wrote:
> > On Mon 07-10-19 07:04:00, Qian Cai wrote:
> > > 
> > > 
> > > > On Oct 7, 2019, at 4:07 AM, Michal Hocko <mhocko@kernel.org> wrote:
> > > > 
> > > > I do not think that removing the printk is the right long term solution.
> > > > While I do agree that removing the debugging printk __offline_isolated_pages
> > > > does make sense because it is essentially of a very limited use, this
> > > > doesn't really solve the underlying problem.  There are likely other
> > > > printks from zone->lock. It would be much more saner to actually
> > > > disallow consoles to allocate any memory while printk is called from an
> > > > atomic context.
> > > 
> > > No, there is only a handful of places called printk() from
> > > zone->lock. It is normal that the callers will quietly process
> > > “struct zone” modification in a short section with zone->lock
> > > held.
> > 
> > It is extremely error prone to have any zone->lock vs. printk
> > dependency. I do not want to play an endless whack a mole.
> > 
> > > No, it is not about “allocate any memory while printk is called from an
> > > atomic context”. It is opposite lock chain  from different processors which has the same effect. For example,
> > > 
> > > CPU0:                 CPU1:         CPU2:
> > > console_owner
> > >                             sclp_lock
> > > sclp_lock                                 zone_lock
> > >                             zone_lock
> > >                                                  console_owner
> > 
> > Why would sclp_lock ever take a zone->lock (apart from an allocation).
> > So really if sclp_lock is a lock that might be taken from many contexts
> > and generate very subtle lock dependencies then it should better be
> > really careful what it is calling into.
> > 
> > In other words you are trying to fix a wrong end of the problem. Fix the
> > console to not allocate or depend on MM by other means.
> 
> It looks there are way too many places that could generate those indirect lock
> chains that are hard to eliminate them all. Here is anther example, where it
> has,

Yeah, and I strongly suspect they are consoles which are broken and need
to be fixed rather than having the problem papered over.

I do realize how tempting it is to remove all printks from the
zone->lock but do realize that as soon as the allocator starts using any
other locks then we are back to square one and the problem is there
again. We would have to drop _all_ printks from any locked section in
the allocator and I do not think this is viable.

Really, the only way forward is to make these consoles be more careful
of external dependencies.

I am also wondering: this code has been there for a long time (or is there any
recent change?), so why are we seeing reports only now? Are those consoles
rarely used, or are you simply lucky to hit those? Or do those really
represent a deadlock? Maybe lockdep is just confused? I am not
familiar with the code, but console_owner_lock is doing some complex
stuff to hand over the context.

> console_owner -> port_lock
> port_lock -> zone_lock
> 
> [  297.425922] -> #3 (&(&zone->lock)->rlock){-.-.}:
> [  297.425925]        __lock_acquire+0x5b3/0xb40
> [  297.425925]        lock_acquire+0x126/0x280
> [  297.425926]        _raw_spin_lock+0x2f/0x40
> [  297.425927]        rmqueue_bulk.constprop.21+0xb6/0x1160
> [  297.425928]        get_page_from_freelist+0x898/0x22c0
> [  297.425928]        __alloc_pages_nodemask+0x2f3/0x1cd0
> [  297.425929]        alloc_pages_current+0x9c/0x110
> [  297.425930]        allocate_slab+0x4c6/0x19c0
> [  297.425931]        new_slab+0x46/0x70
> [  297.425931]        ___slab_alloc+0x58b/0x960
> [  297.425932]        __slab_alloc+0x43/0x70
> [  297.425933]        __kmalloc+0x3ad/0x4b0
> [  297.425933]        __tty_buffer_request_room+0x100/0x250
> [  297.425934]        tty_insert_flip_string_fixed_flag+0x67/0x110
> [  297.425935]        pty_write+0xa2/0xf0
> [  297.425936]        n_tty_write+0x36b/0x7b0
> [  297.425936]        tty_write+0x284/0x4c0
> [  297.425937]        __vfs_write+0x50/0xa0
> [  297.425938]        vfs_write+0x105/0x290
> [  297.425939]        redirected_tty_write+0x6a/0xc0
> [  297.425939]        do_iter_write+0x248/0x2a0
> [  297.425940]        vfs_writev+0x106/0x1e0
> [  297.425941]        do_writev+0xd4/0x180
> [  297.425941]        __x64_sys_writev+0x45/0x50
> [  297.425942]        do_syscall_64+0xcc/0x76c
> [  297.425943]        entry_SYSCALL_64_after_hwframe+0x49/0xbe
Qian Cai Oct. 7, 2019, 1:07 p.m. UTC | #9
On Mon, 2019-10-07 at 14:43 +0200, Michal Hocko wrote:
> On Mon 07-10-19 08:11:44, Qian Cai wrote:
> > On Mon, 2019-10-07 at 13:37 +0200, Michal Hocko wrote:
> > > On Mon 07-10-19 07:04:00, Qian Cai wrote:
> > > > 
> > > > 
> > > > > On Oct 7, 2019, at 4:07 AM, Michal Hocko <mhocko@kernel.org> wrote:
> > > > > 
> > > > > I do not think that removing the printk is the right long term solution.
> > > > > While I do agree that removing the debugging printk __offline_isolated_pages
> > > > > does make sense because it is essentially of a very limited use, this
> > > > > doesn't really solve the underlying problem.  There are likely other
> > > > > printks from zone->lock. It would be much more saner to actually
> > > > > disallow consoles to allocate any memory while printk is called from an
> > > > > atomic context.
> > > > 
> > > > No, there is only a handful of places called printk() from
> > > > zone->lock. It is normal that the callers will quietly process
> > > > “struct zone” modification in a short section with zone->lock
> > > > held.
> > > 
> > > It is extremely error prone to have any zone->lock vs. printk
> > > dependency. I do not want to play an endless whack a mole.
> > > 
> > > > No, it is not about “allocate any memory while printk is called from an
> > > > atomic context”. It is opposite lock chain  from different processors which has the same effect. For example,
> > > > 
> > > > CPU0:                 CPU1:         CPU2:
> > > > console_owner
> > > >                             sclp_lock
> > > > sclp_lock                                 zone_lock
> > > >                             zone_lock
> > > >                                                  console_owner
> > > 
> > > Why would sclp_lock ever take a zone->lock (apart from an allocation).
> > > So really if sclp_lock is a lock that might be taken from many contexts
> > > and generate very subtle lock dependencies then it should better be
> > > really careful what it is calling into.
> > > 
> > > In other words you are trying to fix a wrong end of the problem. Fix the
> > > console to not allocate or depend on MM by other means.
> > 
> > It looks like there are way too many places that could generate those indirect lock
> > chains, which are hard to eliminate entirely. Here is another example, where it
> > has,
> 
> Yeah and I strongly suspect they are consoles which are broken and need
> to be fixed rather than the problem papered over.
> 
> I do realize how tempting it is to remove all printks from the
> zone->lock but do realize that as soon as the allocator starts using any
> other locks then we are back to square one and the problem is there
> again. We would have to drop _all_ printks from any locked section in
> the allocator and I do not think this is viable.
> 
> Really, the only way forward is to make these consoles be more careful
> of external dependencies.

Even with the new printk() Petr proposed, there is no guarantee it will fix it
properly. It looks like it just reduces the chance of this kind of deadlock, as it
may or may not call wake_up_klogd() in vprintk_emit(), depending on timing.

zone->lock
printk_deferred()
  vprintk_emit()
    wake_up_klogd()
      wake_up_klogd_work_func()
        console_unlock()

which essentially went into the same path,

zone_lock -> console_owner_lock

What else options it has here?

> 
> I am also wondering, this code is there for a long time (or is there any
> recent change?), why are we seeing reports only now? Are those consoles
> rarely used or you are simply lucky to hit those? Or are those really
> representing a deadlock? Maybe the lockdep is just confused? I am not
> familiar with the code but console_owner_lock is doing some complex
> stuff to hand over the context.

As I mentioned in the changelog, almost nobody calls printk() with zone->lock
held except in memory offline.

"The problem is probably there forever, but neither many developers will
run memory offline with the lockdep enabled nor admins in the field are
lucky enough yet to hit a perfect timing which required to trigger a
real deadlock. In addition, there aren't many places that call printk()
while zone->lock was held."

> 
> > console_owner -> port_lock
> > port_lock -> zone_lock
> > 
> > [  297.425922] -> #3 (&(&zone->lock)->rlock){-.-.}:
> > [  297.425925]        __lock_acquire+0x5b3/0xb40
> > [  297.425925]        lock_acquire+0x126/0x280
> > [  297.425926]        _raw_spin_lock+0x2f/0x40
> > [  297.425927]        rmqueue_bulk.constprop.21+0xb6/0x1160
> > [  297.425928]        get_page_from_freelist+0x898/0x22c0
> > [  297.425928]        __alloc_pages_nodemask+0x2f3/0x1cd0
> > [  297.425929]        alloc_pages_current+0x9c/0x110
> > [  297.425930]        allocate_slab+0x4c6/0x19c0
> > [  297.425931]        new_slab+0x46/0x70
> > [  297.425931]        ___slab_alloc+0x58b/0x960
> > [  297.425932]        __slab_alloc+0x43/0x70
> > [  297.425933]        __kmalloc+0x3ad/0x4b0
> > [  297.425933]        __tty_buffer_request_room+0x100/0x250
> > [  297.425934]        tty_insert_flip_string_fixed_flag+0x67/0x110
> > [  297.425935]        pty_write+0xa2/0xf0
> > [  297.425936]        n_tty_write+0x36b/0x7b0
> > [  297.425936]        tty_write+0x284/0x4c0
> > [  297.425937]        __vfs_write+0x50/0xa0
> > [  297.425938]        vfs_write+0x105/0x290
> > [  297.425939]        redirected_tty_write+0x6a/0xc0
> > [  297.425939]        do_iter_write+0x248/0x2a0
> > [  297.425940]        vfs_writev+0x106/0x1e0
> > [  297.425941]        do_writev+0xd4/0x180
> > [  297.425941]        __x64_sys_writev+0x45/0x50
> > [  297.425942]        do_syscall_64+0xcc/0x76c
> > [  297.425943]        entry_SYSCALL_64_after_hwframe+0x49/0xbe
> 
>
Petr Mladek Oct. 7, 2019, 2:10 p.m. UTC | #10
On Mon 2019-10-07 09:07:02, Qian Cai wrote:
> On Mon, 2019-10-07 at 14:43 +0200, Michal Hocko wrote:
> > On Mon 07-10-19 08:11:44, Qian Cai wrote:
> > > On Mon, 2019-10-07 at 13:37 +0200, Michal Hocko wrote:
> > > > On Mon 07-10-19 07:04:00, Qian Cai wrote:
> > > > > 
> > > > > 
> > > > > > On Oct 7, 2019, at 4:07 AM, Michal Hocko <mhocko@kernel.org> wrote:
> > > > > > 
> > > > > > I do not think that removing the printk is the right long term solution.
> > > > > > While I do agree that removing the debugging printk __offline_isolated_pages
> > > > > > does make sense because it is essentially of a very limited use, this
> > > > > > doesn't really solve the underlying problem.  There are likely other
> > > > > > printks from zone->lock. It would be much more saner to actually
> > > > > > disallow consoles to allocate any memory while printk is called from an
> > > > > > atomic context.
> > > > > 
> > > > > No, there is only a handful of places called printk() from
> > > > > zone->lock. It is normal that the callers will quietly process
> > > > > “struct zone” modification in a short section with zone->lock
> > > > > held.
> > > > 
> > > > It is extremely error prone to have any zone->lock vs. printk
> > > > dependency. I do not want to play an endless whack a mole.
> > > > 
> > > > > No, it is not about “allocate any memory while printk is called from an
> > > > > atomic context”. It is opposite lock chain  from different processors which has the same effect. For example,
> > > > > 
> > > > > CPU0:                 CPU1:         CPU2:
> > > > > console_owner
> > > > >                             sclp_lock
> > > > > sclp_lock                                 zone_lock
> > > > >                             zone_lock
> > > > >                                                  console_owner
> > > > 
> > > > Why would sclp_lock ever take a zone->lock (apart from an allocation).
> > > > So really if sclp_lock is a lock that might be taken from many contexts
> > > > and generate very subtle lock dependencies then it should better be
> > > > really careful what it is calling into.
> > > > 
> > > > In other words you are trying to fix a wrong end of the problem. Fix the
> > > > console to not allocate or depend on MM by other means.
> > > 
> > > It looks like there are way too many places that could generate those indirect lock
> > > chains, which are hard to eliminate entirely. Here is another example, where it
> > > has,
> > 
> > Yeah and I strongly suspect they are consoles which are broken and need
> > to be fixed rather than the problem papered over.
> > 
> > I do realize how tempting it is to remove all printks from the
> > zone->lock but do realize that as soon as the allocator starts using any
> > other locks then we are back to square one and the problem is there
> > again. We would have to drop _all_ printks from any locked section in
> > the allocator and I do not think this is viable.
> > 
> > Really, the only way forward is to make these consoles be more careful
> > of external dependencies.
> 
> Even with the new printk() Petr proposed, there is no guarantee it will fix it
> properly. It looks like it just reduces the chance of this kind of deadlock, as it
> may or may not call wake_up_klogd() in vprintk_emit(), depending on timing.

The chain below is wrong:

> zone->lock
> printk_deferred()
>   vprintk_emit()
>     wake_up_klogd()

wake_up_klogd() calls irq_work_queue(). It queues the work for
an interrupt handler and triggers the interrupt.

>       wake_up_klogd_work_func()
>         console_unlock()

The work is done in the interrupt context. The interrupt could
never be handled under zone->lock.

So, printk_deferred() would help. But I do not think that it is
really needed. I am going to answer the original mail with
all the full lockdep report.

Best Regards,
Petr
Petr Mladek Oct. 7, 2019, 2:30 p.m. UTC | #11
On Fri 2019-10-04 18:26:45, Qian Cai wrote:
> It is unsafe to call printk() while zone->lock was held, i.e.,
> 
> zone->lock --> console_lock
> 
> because the console could always allocate some memory in different code
> paths and form locking chains in an opposite order,
> 
> console_lock --> * --> zone->lock
> 
> As the result, it triggers lockdep splats like below and in different
> code paths in this thread [1]. Since has_unmovable_pages() was only used
> in set_migratetype_isolate() and is_pageblock_removable_nolock(). Only
> the former will set the REPORT_FAILURE flag which will call printk().
> Hence, unlock the zone->lock just before the dump_page() there where
> when has_unmovable_pages() returns true, there is no need to hold the
> lock anyway in the rest of set_migratetype_isolate().
> 
> While at it, remove a problematic printk() in __offline_isolated_pages()
> only for debugging as well which will always disable lockdep on debug
> kernels.
> 
> The problem is probably there forever, but neither many developers will
> run memory offline with the lockdep enabled nor admins in the field are
> lucky enough yet to hit a perfect timing which required to trigger a
> real deadlock. In addition, there aren't many places that call printk()
> while zone->lock was held.
> 
> WARNING: possible circular locking dependency detected
> ------------------------------------------------------
> test.sh/1724 is trying to acquire lock:
> 0000000052059ec0 (console_owner){-...}, at: console_unlock+0x
> 01: 328/0xa30
> 
> but task is already holding lock:
> 000000006ffd89c8 (&(&zone->lock)->rlock){-.-.}, at: start_iso
> 01: late_page_range+0x216/0x538
> 
> which lock already depends on the new lock.
> 
> the existing dependency chain (in reverse order) is:
> 
> -> #2 (&(&zone->lock)->rlock){-.-.}:
>        lock_acquire+0x21a/0x468
>        _raw_spin_lock+0x54/0x68
>        get_page_from_freelist+0x8b6/0x2d28
>        __alloc_pages_nodemask+0x246/0x658
>        __get_free_pages+0x34/0x78
>        sclp_init+0x106/0x690
>        sclp_register+0x2e/0x248
>        sclp_rw_init+0x4a/0x70
>        sclp_console_init+0x4a/0x1b8
>        console_init+0x2c8/0x410
>        start_kernel+0x530/0x6a0
>        startup_continue+0x70/0xd0

This code takes locks: sclp_lock --> &(&zone->lock)->rlock

> -> #1 (sclp_lock){-.-.}:
>        lock_acquire+0x21a/0x468
>        _raw_spin_lock_irqsave+0xcc/0xe8
>        sclp_add_request+0x34/0x308
>        sclp_conbuf_emit+0x100/0x138
>        sclp_console_write+0x96/0x3b8
>        console_unlock+0x6dc/0xa30
>        vprintk_emit+0x184/0x3c8
>        vprintk_default+0x44/0x50
>        printk+0xa8/0xc0
>        iommu_debugfs_setup+0xf2/0x108
>        iommu_init+0x6c/0x78
>        do_one_initcall+0x162/0x680
>        kernel_init_freeable+0x4e8/0x5a8
>        kernel_init+0x2a/0x188
>        ret_from_fork+0x30/0x34
>        kernel_thread_starter+0x0/0xc

This code path takes: console_owner --> sclp_lock

> -> #0 (console_owner){-...}:
>        check_noncircular+0x338/0x3e0
>        __lock_acquire+0x1e66/0x2d88
>        lock_acquire+0x21a/0x468
>        console_unlock+0x3a6/0xa30
>        vprintk_emit+0x184/0x3c8
>        vprintk_default+0x44/0x50
>        printk+0xa8/0xc0
>        __dump_page+0x1dc/0x710
>        dump_page+0x2e/0x58
>        has_unmovable_pages+0x2e8/0x470
>        start_isolate_page_range+0x404/0x538
>        __offline_pages+0x22c/0x1338
>        memory_subsys_offline+0xa6/0xe8
>        device_offline+0xe6/0x118
>        state_store+0xf0/0x110
>        kernfs_fop_write+0x1bc/0x270
>        vfs_write+0xce/0x220
>        ksys_write+0xea/0x190
>        system_call+0xd8/0x2b4

And this code path takes: &(&zone->lock)->rlock --> console_owner

> other info that might help us debug this:
> 
> Chain exists of:
>   console_owner --> sclp_lock --> &(&zone->lock)->rlock

All three code paths together create a cyclic dependency. This
is why lockdep reports a possible deadlock.

I believe that it cannot really happen because:

	static int __init
	sclp_console_init(void)
	{
	[...]
		rc = sclp_rw_init();
	[...]
		register_console(&sclp_console);
		return 0;
	}

sclp_rw_init() is called before register_console(). And
console_unlock() will never call sclp_console_write() before
the console is registered.

AFAIK, lockdep only compares existing chain of locks. It does
not know about console registration that would make some
code paths mutually exclusive.

I believe that it is a false positive. I do not know how to
avoid this lockdep report. I hope that it will disappear
by deferring all printk() calls rather soon.

Best Regards,
Petr
Michal Hocko Oct. 7, 2019, 2:49 p.m. UTC | #12
[Cc s390 maintainers - the lockdep is http://lkml.kernel.org/r/1570228005-24979-1-git-send-email-cai@lca.pw
 Petr has explained it is a false positive
 http://lkml.kernel.org/r/20191007143002.l37bt2lzqtnqjqxu@pathway.suse.cz]
On Mon 07-10-19 16:30:02, Petr Mladek wrote:
[...]
> I believe that it cannot really happen because:
> 
> 	static int __init
> 	sclp_console_init(void)
> 	{
> 	[...]
> 		rc = sclp_rw_init();
> 	[...]
> 		register_console(&sclp_console);
> 		return 0;
> 	}
> 
> sclp_rw_init() is called before register_console(). And
> console_unlock() will never call sclp_console_write() before
> the console is registered.
> 
> AFAIK, lockdep only compares existing chain of locks. It does
> not know about console registration that would make some
> code paths mutually exclusive.
> 
> I believe that it is a false positive. I do not know how to
> avoid this lockdep report. I hope that it will disappear
> by deferring all printk() calls rather soon.

Thanks a lot for looking into this Petr. I have also checked the code
and I really fail to see why the allocation has to be done under the
lock in the first place. sclp_read_sccb and sclp_init_sccb are global
variables but I strongly suspect that they need a synchronization during
early init, callbacks are registered only later IIUC:

diff --git a/drivers/s390/char/sclp.c b/drivers/s390/char/sclp.c
index d2ab3f07c008..4b1c033e3255 100644
--- a/drivers/s390/char/sclp.c
+++ b/drivers/s390/char/sclp.c
@@ -1169,13 +1169,13 @@ sclp_init(void)
 	unsigned long flags;
 	int rc = 0;
 
+	sclp_read_sccb = (void *) __get_free_page(GFP_ATOMIC | GFP_DMA);
+	sclp_init_sccb = (void *) __get_free_page(GFP_ATOMIC | GFP_DMA);
 	spin_lock_irqsave(&sclp_lock, flags);
 	/* Check for previous or running initialization */
 	if (sclp_init_state != sclp_init_state_uninitialized)
 		goto fail_unlock;
 	sclp_init_state = sclp_init_state_initializing;
-	sclp_read_sccb = (void *) __get_free_page(GFP_ATOMIC | GFP_DMA);
-	sclp_init_sccb = (void *) __get_free_page(GFP_ATOMIC | GFP_DMA);
 	BUG_ON(!sclp_read_sccb || !sclp_init_sccb);
 	/* Set up variables */
 	INIT_LIST_HEAD(&sclp_req_queue);
Qian Cai Oct. 7, 2019, 2:59 p.m. UTC | #13
On Mon, 2019-10-07 at 16:30 +0200, Petr Mladek wrote:
> On Fri 2019-10-04 18:26:45, Qian Cai wrote:
> > It is unsafe to call printk() while zone->lock was held, i.e.,
> > 
> > zone->lock --> console_lock
> > 
> > because the console could always allocate some memory in different code
> > paths and form locking chains in an opposite order,
> > 
> > console_lock --> * --> zone->lock
> > 
> > As the result, it triggers lockdep splats like below and in different
> > code paths in this thread [1]. Since has_unmovable_pages() was only used
> > in set_migratetype_isolate() and is_pageblock_removable_nolock(). Only
> > the former will set the REPORT_FAILURE flag which will call printk().
> > Hence, unlock the zone->lock just before the dump_page() there where
> > when has_unmovable_pages() returns true, there is no need to hold the
> > lock anyway in the rest of set_migratetype_isolate().
> > 
> > While at it, remove a problematic printk() in __offline_isolated_pages()
> > only for debugging as well which will always disable lockdep on debug
> > kernels.
> > 
> > The problem is probably there forever, but neither many developers will
> > run memory offline with the lockdep enabled nor admins in the field are
> > lucky enough yet to hit a perfect timing which required to trigger a
> > real deadlock. In addition, there aren't many places that call printk()
> > while zone->lock was held.
> > 
> > WARNING: possible circular locking dependency detected
> > ------------------------------------------------------
> > test.sh/1724 is trying to acquire lock:
> > 0000000052059ec0 (console_owner){-...}, at: console_unlock+0x
> > 01: 328/0xa30
> > 
> > but task is already holding lock:
> > 000000006ffd89c8 (&(&zone->lock)->rlock){-.-.}, at: start_iso
> > 01: late_page_range+0x216/0x538
> > 
> > which lock already depends on the new lock.
> > 
> > the existing dependency chain (in reverse order) is:
> > 
> > -> #2 (&(&zone->lock)->rlock){-.-.}:
> >        lock_acquire+0x21a/0x468
> >        _raw_spin_lock+0x54/0x68
> >        get_page_from_freelist+0x8b6/0x2d28
> >        __alloc_pages_nodemask+0x246/0x658
> >        __get_free_pages+0x34/0x78
> >        sclp_init+0x106/0x690
> >        sclp_register+0x2e/0x248
> >        sclp_rw_init+0x4a/0x70
> >        sclp_console_init+0x4a/0x1b8
> >        console_init+0x2c8/0x410
> >        start_kernel+0x530/0x6a0
> >        startup_continue+0x70/0xd0
> 
> This code takes locks: sclp_lock --> &(&zone->lock)->rlock
> 
> > -> #1 (sclp_lock){-.-.}:
> >        lock_acquire+0x21a/0x468
> >        _raw_spin_lock_irqsave+0xcc/0xe8
> >        sclp_add_request+0x34/0x308
> >        sclp_conbuf_emit+0x100/0x138
> >        sclp_console_write+0x96/0x3b8
> >        console_unlock+0x6dc/0xa30
> >        vprintk_emit+0x184/0x3c8
> >        vprintk_default+0x44/0x50
> >        printk+0xa8/0xc0
> >        iommu_debugfs_setup+0xf2/0x108
> >        iommu_init+0x6c/0x78
> >        do_one_initcall+0x162/0x680
> >        kernel_init_freeable+0x4e8/0x5a8
> >        kernel_init+0x2a/0x188
> >        ret_from_fork+0x30/0x34
> >        kernel_thread_starter+0x0/0xc
> 
> This code path takes: console_owner --> sclp_lock
> 
> > -> #0 (console_owner){-...}:
> >        check_noncircular+0x338/0x3e0
> >        __lock_acquire+0x1e66/0x2d88
> >        lock_acquire+0x21a/0x468
> >        console_unlock+0x3a6/0xa30
> >        vprintk_emit+0x184/0x3c8
> >        vprintk_default+0x44/0x50
> >        printk+0xa8/0xc0
> >        __dump_page+0x1dc/0x710
> >        dump_page+0x2e/0x58
> >        has_unmovable_pages+0x2e8/0x470
> >        start_isolate_page_range+0x404/0x538
> >        __offline_pages+0x22c/0x1338
> >        memory_subsys_offline+0xa6/0xe8
> >        device_offline+0xe6/0x118
> >        state_store+0xf0/0x110
> >        kernfs_fop_write+0x1bc/0x270
> >        vfs_write+0xce/0x220
> >        ksys_write+0xea/0x190
> >        system_call+0xd8/0x2b4
> 
> And this code path takes: &(&zone->lock)->rlock --> console_owner
> 
> > other info that might help us debug this:
> > 
> > Chain exists of:
> >   console_owner --> sclp_lock --> &(&zone->lock)->rlock
> 
> All three code paths together create a cyclic dependency. This
> is why lockdep reports a possible deadlock.
> 
> I believe that it cannot really happen because:
> 
> 	static int __init
> 	sclp_console_init(void)
> 	{
> 	[...]
> 		rc = sclp_rw_init();
> 	[...]
> 		register_console(&sclp_console);
> 		return 0;
> 	}
> 
> sclp_rw_init() is called before register_console(). And
> console_unlock() will never call sclp_console_write() before
> the console is registered.

It could be really hard to tell for sure unless someone fully audits every place in
the code that could do,

console_owner_lock --> sclp_lock

Lockdep will save only the earliest trace after it first saw the lock order, so
those early boot ones will always be there in the report.

> 
> AFAIK, lockdep only compares existing chain of locks. It does
> not know about console registration that would make some
> code paths mutually exclusive.

Yes.

> 
> I believe that it is a false positive. I do not know how to
> avoid this lockdep report. I hope that it will disappear
> by deferring all printk() calls rather soon.

However, a similar splat is for console_owner_lock --> port_lock below. I have
even seen another one before with a 4-way lockdep splat (which was shot down
separately),

https://lore.kernel.org/lkml/1568817579.5576.172.camel@lca.pw/

console_sem --> pi_lock --> rq_lock --> zone_lock
zone_lock --> console_sem

It is almost impossible to eliminate all the indirect call chains from
console_sem/console_owner_lock to zone->lock because it is too normal that
something later needs to allocate some memory dynamically, so as long as it
directly calls printk() with zone->lock held, it will be in trouble.

I really hope the new printk() will solve this class of problem, but it is
essentially up in the air until a patchset is posted. There are just too many
questions out there to be answered.

[  297.425908] WARNING: possible circular locking dependency detected
[  297.425908] 5.3.0-next-20190917 #8 Not tainted
[  297.425909] ------------------------------------------------------
[  297.425910] test.sh/8653 is trying to acquire lock:
[  297.425911] ffffffff865a4460 (console_owner){-.-.}, at:
console_unlock+0x207/0x750

[  297.425914] but task is already holding lock:
[  297.425915] ffff88883fff3c58 (&(&zone->lock)->rlock){-.-.}, at:
__offline_isolated_pages+0x179/0x3e0

[  297.425919] which lock already depends on the new lock.


[  297.425920] the existing dependency chain (in reverse order) is:

[  297.425922] -> #3 (&(&zone->lock)->rlock){-.-.}:
[  297.425925]        __lock_acquire+0x5b3/0xb40
[  297.425925]        lock_acquire+0x126/0x280
[  297.425926]        _raw_spin_lock+0x2f/0x40
[  297.425927]        rmqueue_bulk.constprop.21+0xb6/0x1160
[  297.425928]        get_page_from_freelist+0x898/0x22c0
[  297.425928]        __alloc_pages_nodemask+0x2f3/0x1cd0
[  297.425929]        alloc_pages_current+0x9c/0x110
[  297.425930]        allocate_slab+0x4c6/0x19c0
[  297.425931]        new_slab+0x46/0x70
[  297.425931]        ___slab_alloc+0x58b/0x960
[  297.425932]        __slab_alloc+0x43/0x70
[  297.425933]        __kmalloc+0x3ad/0x4b0
[  297.425933]        __tty_buffer_request_room+0x100/0x250
[  297.425934]        tty_insert_flip_string_fixed_flag+0x67/0x110
[  297.425935]        pty_write+0xa2/0xf0
[  297.425936]        n_tty_write+0x36b/0x7b0
[  297.425936]        tty_write+0x284/0x4c0
[  297.425937]        __vfs_write+0x50/0xa0
[  297.425938]        vfs_write+0x105/0x290
[  297.425939]        redirected_tty_write+0x6a/0xc0
[  297.425939]        do_iter_write+0x248/0x2a0
[  297.425940]        vfs_writev+0x106/0x1e0
[  297.425941]        do_writev+0xd4/0x180
[  297.425941]        __x64_sys_writev+0x45/0x50
[  297.425942]        do_syscall_64+0xcc/0x76c
[  297.425943]        entry_SYSCALL_64_after_hwframe+0x49/0xbe

[  297.425944] -> #2 (&(&port->lock)->rlock){-.-.}:
[  297.425946]        __lock_acquire+0x5b3/0xb40
[  297.425947]        lock_acquire+0x126/0x280
[  297.425948]        _raw_spin_lock_irqsave+0x3a/0x50
[  297.425949]        tty_port_tty_get+0x20/0x60
[  297.425949]        tty_port_default_wakeup+0xf/0x30
[  297.425950]        tty_port_tty_wakeup+0x39/0x40
[  297.425951]        uart_write_wakeup+0x2a/0x40
[  297.425952]        serial8250_tx_chars+0x22e/0x440
[  297.425952]        serial8250_handle_irq.part.8+0x14a/0x170
[  297.425953]        serial8250_default_handle_irq+0x5c/0x90
[  297.425954]        serial8250_interrupt+0xa6/0x130
[  297.425955]        __handle_irq_event_percpu+0x78/0x4f0
[  297.425955]        handle_irq_event_percpu+0x70/0x100
[  297.425956]        handle_irq_event+0x5a/0x8b
[  297.425957]        handle_edge_irq+0x117/0x370
[  297.425958]        do_IRQ+0x9e/0x1e0
[  297.425958]        ret_from_intr+0x0/0x2a
[  297.425959]        cpuidle_enter_state+0x156/0x8e0
[  297.425960]        cpuidle_enter+0x41/0x70
[  297.425960]        call_cpuidle+0x5e/0x90
[  297.425961]        do_idle+0x333/0x370
[  297.425962]        cpu_startup_entry+0x1d/0x1f
[  297.425962]        start_secondary+0x290/0x330
[  297.425963]        secondary_startup_64+0xb6/0xc0

[  297.425964] -> #1 (&port_lock_key){-.-.}:
[  297.425967]        __lock_acquire+0x5b3/0xb40
[  297.425967]        lock_acquire+0x126/0x280
[  297.425968]        _raw_spin_lock_irqsave+0x3a/0x50
[  297.425969]        serial8250_console_write+0x3e4/0x450
[  297.425970]        univ8250_console_write+0x4b/0x60
[  297.425970]        console_unlock+0x501/0x750
[  297.425971]        vprintk_emit+0x10d/0x340
[  297.425972]        vprintk_default+0x1f/0x30
[  297.425972]        vprintk_func+0x44/0xd4
[  297.425973]        printk+0x9f/0xc5
[  297.425974]        register_console+0x39c/0x520
[  297.425975]        univ8250_console_init+0x23/0x2d
[  297.425975]        console_init+0x338/0x4cd
[  297.425976]        start_kernel+0x534/0x724
[  297.425977]        x86_64_start_reservations+0x24/0x26
[  297.425977]        x86_64_start_kernel+0xf4/0xfb
[  297.425978]        secondary_startup_64+0xb6/0xc0

[  297.425979] -> #0 (console_owner){-.-.}:
[  297.425982]        check_prev_add+0x107/0xea0
[  297.425982]        validate_chain+0x8fc/0x1200
[  297.425983]        __lock_acquire+0x5b3/0xb40
[  297.425984]        lock_acquire+0x126/0x280
[  297.425984]        console_unlock+0x269/0x750
[  297.425985]        vprintk_emit+0x10d/0x340
[  297.425986]        vprintk_default+0x1f/0x30
[  297.425987]        vprintk_func+0x44/0xd4
[  297.425987]        printk+0x9f/0xc5
[  297.425988]        __offline_isolated_pages.cold.52+0x2f/0x30a
[  297.425989]        offline_isolated_pages_cb+0x17/0x30
[  297.425990]        walk_system_ram_range+0xda/0x160
[  297.425990]        __offline_pages+0x79c/0xa10
[  297.425991]        offline_pages+0x11/0x20
[  297.425992]        memory_subsys_offline+0x7e/0xc0
[  297.425992]        device_offline+0xd5/0x110
[  297.425993]        state_store+0xc6/0xe0
[  297.425994]        dev_attr_store+0x3f/0x60
[  297.425995]        sysfs_kf_write+0x89/0xb0
[  297.425995]        kernfs_fop_write+0x188/0x240
[  297.425996]        __vfs_write+0x50/0xa0
[  297.425997]        vfs_write+0x105/0x290
[  297.425997]        ksys_write+0xc6/0x160
[  297.425998]        __x64_sys_write+0x43/0x50
[  297.425999]        do_syscall_64+0xcc/0x76c
[  297.426000]        entry_SYSCALL_64_after_hwframe+0x49/0xbe

[  297.426001] other info that might help us debug this:

[  297.426002] Chain exists of:
[  297.426002]   console_owner --> &(&port->lock)->rlock --> &(&zone->lock)-
>rlock

[  297.426007]  Possible unsafe locking scenario:

[  297.426008]        CPU0                    CPU1
[  297.426009]        ----                    ----
[  297.426009]   lock(&(&zone->lock)->rlock);
[  297.426011]                                lock(&(&port->lock)->rlock);
[  297.426013]                                lock(&(&zone->lock)->rlock);
[  297.426014]   lock(console_owner);

[  297.426016]  *** DEADLOCK ***

[  297.426017] 9 locks held by test.sh/8653:
[  297.426018]  #0: ffff88839ba7d408 (sb_writers#4){.+.+}, at:
vfs_write+0x25f/0x290
[  297.426021]  #1: ffff888277618880 (&of->mutex){+.+.}, at:
kernfs_fop_write+0x128/0x240
[  297.426024]  #2: ffff8898131fc218 (kn->count#115){.+.+}, at:
kernfs_fop_write+0x138/0x240
[  297.426028]  #3: ffffffff86962a80 (device_hotplug_lock){+.+.}, at:
lock_device_hotplug_sysfs+0x16/0x50
[  297.426031]  #4: ffff8884374f4990 (&dev->mutex){....}, at:
device_offline+0x70/0x110
[  297.426034]  #5: ffffffff86515250 (cpu_hotplug_lock.rw_sem){++++}, at:
__offline_pages+0xbf/0xa10
[  297.426037]  #6: ffffffff867405f0 (mem_hotplug_lock.rw_sem){++++}, at:
percpu_down_write+0x87/0x2f0
[  297.426040]  #7: ffff88883fff3c58 (&(&zone->lock)->rlock){-.-.}, at:
__offline_isolated_pages+0x179/0x3e0
[  297.426043]  #8: ffffffff865a4920 (console_lock){+.+.}, at:
vprintk_emit+0x100/0x340

[  297.426047] stack backtrace:
[  297.426048] CPU: 1 PID: 8653 Comm: test.sh Not tainted 5.3.0-next-20190917 #8
[  297.426049] Hardware name: HPE ProLiant DL560 Gen10/ProLiant DL560 Gen10,
BIOS U34 05/21/2019
[  297.426049] Call Trace:
[  297.426050]  dump_stack+0x86/0xca
[  297.426051]  print_circular_bug.cold.31+0x243/0x26e
[  297.426051]  check_noncircular+0x29e/0x2e0
[  297.426052]  ? stack_trace_save+0x87/0xb0
[  297.426053]  ? print_circular_bug+0x120/0x120
[  297.426053]  check_prev_add+0x107/0xea0
[  297.426054]  validate_chain+0x8fc/0x1200
[  297.426055]  ? check_prev_add+0xea0/0xea0
[  297.426055]  __lock_acquire+0x5b3/0xb40
[  297.426056]  lock_acquire+0x126/0x280
[  297.426057]  ? console_unlock+0x207/0x750
[  297.426057]  ? __kasan_check_read+0x11/0x20
[  297.426058]  console_unlock+0x269/0x750
[  297.426059]  ? console_unlock+0x207/0x750
[  297.426059]  vprintk_emit+0x10d/0x340
[  297.426060]  vprintk_default+0x1f/0x30
[  297.426061]  vprintk_func+0x44/0xd4
[  297.426061]  ? do_raw_spin_lock+0x118/0x1d0
[  297.426062]  printk+0x9f/0xc5
[  297.426063]  ? kmsg_dump_rewind_nolock+0x64/0x64
[  297.426064]  ? __offline_isolated_pages+0x179/0x3e0
[  297.426064]  __offline_isolated_pages.cold.52+0x2f/0x30a
[  297.426065]  ? online_memory_block+0x20/0x20
[  297.426066]  offline_isolated_pages_cb+0x17/0x30
[  297.426067]  walk_system_ram_range+0xda/0x160
[  297.426067]  ? walk_mem_res+0x30/0x30
[  297.426068]  ? dissolve_free_huge_page+0x1e/0x2b0
[  297.426069]  __offline_pages+0x79c/0xa10
[  297.426069]  ? __add_memory+0xc0/0xc0
[  297.426070]  ? __kasan_check_write+0x14/0x20
[  297.426071]  ? __mutex_lock+0x344/0xcd0
[  297.426071]  ? _raw_spin_unlock_irqrestore+0x49/0x50
[  297.426072]  ? device_offline+0x70/0x110
[  297.426073]  ? klist_next+0x1c1/0x1e0
[  297.426073]  ? __mutex_add_waiter+0xc0/0xc0
[  297.426074]  ? klist_next+0x10b/0x1e0
[  297.426075]  ? klist_iter_exit+0x16/0x40
[  297.426076]  ? device_for_each_child+0xd0/0x110
[  297.426076]  offline_pages+0x11/0x20
[  297.426077]  memory_subsys_offline+0x7e/0xc0
[  297.426078]  device_offline+0xd5/0x110
[  297.426078]  ? auto_online_blocks_show+0x70/0x70
[  297.426079]  state_store+0xc6/0xe0
[  297.426080]  dev_attr_store+0x3f/0x60
[  297.426080]  ? device_match_name+0x40/0x40
[  297.426081]  sysfs_kf_write+0x89/0xb0
[  297.426082]  ? sysfs_file_ops+0xa0/0xa0
[  297.426082]  kernfs_fop_write+0x188/0x240
[  297.426083]  __vfs_write+0x50/0xa0
[  297.426084]  vfs_write+0x105/0x290
[  297.426084]  ksys_write+0xc6/0x160
[  297.426085]  ? __x64_sys_read+0x50/0x50
[  297.426086]  ? do_syscall_64+0x79/0x76c
[  297.426086]  ? do_syscall_64+0x79/0x76c
[  297.426087]  __x64_sys_write+0x43/0x50
[  297.426088]  do_syscall_64+0xcc/0x76c
[  297.426088]  ? trace_hardirqs_on_thunk+0x1a/0x20
[  297.426089]  ? syscall_return_slowpath+0x210/0x210
[  297.426090]  ? entry_SYSCALL_64_after_hwframe+0x3e/0xbe
[  297.426091]  ? trace_hardirqs_off_caller+0x3a/0x150
[  297.426092]  ? trace_hardirqs_off_thunk+0x1a/0x20
[  297.426092]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
Michal Hocko Oct. 7, 2019, 3:12 p.m. UTC | #14
On Mon 07-10-19 10:59:10, Qian Cai wrote:
[...]
> It is almost impossible to eliminate all the indirect call chains from
> console_sem/console_owner_lock to zone->lock because it is too normal that
> something later needs to allocate some memory dynamically, so as long as it
> directly call printk() with zone->lock held, it will be in trouble.

Do you have any example where the console driver really _has_ to
allocate. Because I have hard time to believe this is going to work at
all as the atomic context doesn't allow to do any memory reclaim and
such an allocation would be too easy to fail so the allocation cannot
really rely on it.

So again, crippling the MM code just because of lockdep false positives
or a broken console driver sounds like a wrong way to approach the
problem.

> [  297.425964] -> #1 (&port_lock_key){-.-.}:
> [  297.425967]        __lock_acquire+0x5b3/0xb40
> [  297.425967]        lock_acquire+0x126/0x280
> [  297.425968]        _raw_spin_lock_irqsave+0x3a/0x50
> [  297.425969]        serial8250_console_write+0x3e4/0x450
> [  297.425970]        univ8250_console_write+0x4b/0x60
> [  297.425970]        console_unlock+0x501/0x750
> [  297.425971]        vprintk_emit+0x10d/0x340
> [  297.425972]        vprintk_default+0x1f/0x30
> [  297.425972]        vprintk_func+0x44/0xd4
> [  297.425973]        printk+0x9f/0xc5
> [  297.425974]        register_console+0x39c/0x520
> [  297.425975]        univ8250_console_init+0x23/0x2d
> [  297.425975]        console_init+0x338/0x4cd
> [  297.425976]        start_kernel+0x534/0x724
> [  297.425977]        x86_64_start_reservations+0x24/0x26
> [  297.425977]        x86_64_start_kernel+0xf4/0xfb
> [  297.425978]        secondary_startup_64+0xb6/0xc0

This is early init code again, so the lockdep report sounds like a false
positive to me.
Qian Cai Oct. 7, 2019, 3:33 p.m. UTC | #15
On Mon, 2019-10-07 at 17:12 +0200, Michal Hocko wrote:
> On Mon 07-10-19 10:59:10, Qian Cai wrote:
> [...]
> > It is almost impossible to eliminate all the indirect call chains from
> > console_sem/console_owner_lock to zone->lock because it is too normal that
> > something later needs to allocate some memory dynamically, so as long as it
> > directly call printk() with zone->lock held, it will be in trouble.
> 
> Do you have any example where the console driver really _has_ to
> allocate. Because I have hard time to believe this is going to work at
> all as the atomic context doesn't allow to do any memory reclaim and
> such an allocation would be too easy to fail so the allocation cannot
> really rely on it.

I don't know how to explain it to you more clearly, but let me repeat one last
time. There is no need for the console driver to allocate directly, considering
this example,

CPU0:              CPU1:    CPU2:       CPU3:
console_sem->lock                       zone->lock
                   pi->lock
pi->lock                    rq_lock
                   rq->lock
                            zone->lock
                                        console_sem->lock

Here it only needs someone holding the rq_lock to allocate some memory. The
same is true for port_lock. Since the deadlock could involve a lot of CPUs and a
longer lock chain, it is impossible to predict which allocation made while
holding a lock could end up with the same problematic lock chain.

> 
> So again, crippling the MM code just because of lockdep false possitives
> or a broken console driver sounds like a wrong way to approach the
> problem.
> 
> > [  297.425964] -> #1 (&port_lock_key){-.-.}:
> > [  297.425967]        __lock_acquire+0x5b3/0xb40
> > [  297.425967]        lock_acquire+0x126/0x280
> > [  297.425968]        _raw_spin_lock_irqsave+0x3a/0x50
> > [  297.425969]        serial8250_console_write+0x3e4/0x450
> > [  297.425970]        univ8250_console_write+0x4b/0x60
> > [  297.425970]        console_unlock+0x501/0x750
> > [  297.425971]        vprintk_emit+0x10d/0x340
> > [  297.425972]        vprintk_default+0x1f/0x30
> > [  297.425972]        vprintk_func+0x44/0xd4
> > [  297.425973]        printk+0x9f/0xc5
> > [  297.425974]        register_console+0x39c/0x520
> > [  297.425975]        univ8250_console_init+0x23/0x2d
> > [  297.425975]        console_init+0x338/0x4cd
> > [  297.425976]        start_kernel+0x534/0x724
> > [  297.425977]        x86_64_start_reservations+0x24/0x26
> > [  297.425977]        x86_64_start_kernel+0xf4/0xfb
> > [  297.425978]        secondary_startup_64+0xb6/0xc0
> 
> This is an early init code again so the lockdep sounds like a false
> possitive to me.

This is just a tip of iceberg to show the lock dependency,

console_owner --> port_lock_key

which could easily happen everywhere with a simple printk().
Petr Mladek Oct. 8, 2019, 7:43 a.m. UTC | #16
On Mon 2019-10-07 16:49:37, Michal Hocko wrote:
> [Cc s390 maintainers - the lockdep is http://lkml.kernel.org/r/1570228005-24979-1-git-send-email-cai@lca.pw
>  Petr has explained it is a false positive
>  http://lkml.kernel.org/r/20191007143002.l37bt2lzqtnqjqxu@pathway.suse.cz]
> On Mon 07-10-19 16:30:02, Petr Mladek wrote:
> [...]
> > I believe that it cannot really happen because:
> > 
> > 	static int __init
> > 	sclp_console_init(void)
> > 	{
> > 	[...]
> > 		rc = sclp_rw_init();
> > 	[...]
> > 		register_console(&sclp_console);
> > 		return 0;
> > 	}
> > 
> > sclp_rw_init() is called before register_console(). And
> > console_unlock() will never call sclp_console_write() before
> > the console is registered.
> > 
> > AFAIK, lockdep only compares existing chain of locks. It does
> > not know about console registration that would make some
> > code paths mutually exclusive.
> > 
> > I believe that it is a false positive. I do not know how to
> > avoid this lockdep report. I hope that it will disappear
> > by deferring all printk() calls rather soon.
> 
> Thanks a lot for looking into this Petr. I have also checked the code
> and I really fail to see why the allocation has to be done under the
> lock in the first place. sclp_read_sccb and sclp_init_sccb are global
> variables but I strongly suspect that they need a synchronization during
> early init, callbacks are registered only later IIUC:

Good idea. It would work when the init function is called only once.
But see below.

> diff --git a/drivers/s390/char/sclp.c b/drivers/s390/char/sclp.c
> index d2ab3f07c008..4b1c033e3255 100644
> --- a/drivers/s390/char/sclp.c
> +++ b/drivers/s390/char/sclp.c
> @@ -1169,13 +1169,13 @@ sclp_init(void)
>  	unsigned long flags;
>  	int rc = 0;
>  
> +	sclp_read_sccb = (void *) __get_free_page(GFP_ATOMIC | GFP_DMA);
> +	sclp_init_sccb = (void *) __get_free_page(GFP_ATOMIC | GFP_DMA);
>  	spin_lock_irqsave(&sclp_lock, flags);
>  	/* Check for previous or running initialization */
>  	if (sclp_init_state != sclp_init_state_uninitialized)
>  		goto fail_unlock;

It seems that sclp_init() could be called several times in parallel.
I see it called from sclp_register() and sclp_initcall().

I am not sure if it is really needed or if it is just a strange
design.

It might be still possible to always do the allocation without the lock
and free the memory when it is not really used. But I am not sure
if we want to do this exercise just to avoid lockdep false positive.

Best Regards,
Petr
Petr Mladek Oct. 8, 2019, 8:15 a.m. UTC | #17
On Mon 2019-10-07 11:33:27, Qian Cai wrote:
> On Mon, 2019-10-07 at 17:12 +0200, Michal Hocko wrote:
> > On Mon 07-10-19 10:59:10, Qian Cai wrote:
> > [...]
> > > It is almost impossible to eliminate all the indirect call chains from
> > > console_sem/console_owner_lock to zone->lock because it is too normal that
> > > something later needs to allocate some memory dynamically, so as long as it
> > > directly call printk() with zone->lock held, it will be in trouble.
> > 
> > Do you have any example where the console driver really _has_ to
> > allocate. Because I have hard time to believe this is going to work at
> > all as the atomic context doesn't allow to do any memory reclaim and
> > such an allocation would be too easy to fail so the allocation cannot
> > really rely on it.
> 
> I don't know how to explain to you clearly, but let me repeat again one last
> time. There is no necessary for console driver directly to allocate considering
> this example,
> 
> CPU0:              CPU1:    CPU2:       CPU3:
> console_sem->lock                       zone->lock
>                    pi->lock
> pi->lock                    rq_lock
>                    rq->lock
>                             zone->lock
>                                         console_sem->lock

I am curious about CPU2. Does scheduler need to allocate memory?

> Here it only need someone held the rq_lock and allocate some memory. There is
> also true for port_lock. Since the deadlock could involve a lot of CPUs and a
> longer lock chain, it is impossible to predict which one to allocate some memory
> while held a lock could end up with the same problematic lock chain.
> 
> This is just a tip of iceberg to show the lock dependency,
> 
> console_owner --> port_lock_key
> 
> which could easily happen everywhere with a simple printk().

We have got several lockdep reports about possible deadlocks between
console_lock and port_lock caused by printk() called from console
code.

First note that they have been there for years. They were well hidden
until 4.11 released in April 2017. Where the commit
f975237b76827956fe13e ("printk: use printk_safe buffers in printk")
allowed recursive printk() and lockdep.

We believe that these deadlocks are really hard to hit. Console
drivers call printk() only in very critical and rare situations.
This is why nobody invested too much time into fixing these
so far.

There are basically three possibilities:

1. Do crazy exercises with locks all around the kernel to
   avoid the deadlocks. It is usually not worth it. And
   it is a "whack a mole" approach.

2. Use printk_deferred() in problematic code paths. It is
   a "whack a mole" approach as well. And we would end up
   with printk_deferred() used almost everywhere.

3. Always defer the console handling in printk(). This would
   also help to avoid soft lockups. Several people pushed
   against this for the last few years because it might reduce
   the chance to see the message in case of a system crash.

As I said, there has finally been agreement to always do
the offload few weeks ago. John Ogness is working on it.
So we might have the systematic solution for these deadlocks
rather sooner than later.

Feel free to ask John to CC you on the patches if you want
to help with review.

Best Regards,
Petr
Michal Hocko Oct. 8, 2019, 8:27 a.m. UTC | #18
On Tue 08-10-19 09:43:57, Petr Mladek wrote:
> On Mon 2019-10-07 16:49:37, Michal Hocko wrote:
> > [Cc s390 maintainers - the lockdep is http://lkml.kernel.org/r/1570228005-24979-1-git-send-email-cai@lca.pw
> >  Petr has explained it is a false positive
> >  http://lkml.kernel.org/r/20191007143002.l37bt2lzqtnqjqxu@pathway.suse.cz]
> > On Mon 07-10-19 16:30:02, Petr Mladek wrote:
> > [...]
> > > I believe that it cannot really happen because:
> > > 
> > > 	static int __init
> > > 	sclp_console_init(void)
> > > 	{
> > > 	[...]
> > > 		rc = sclp_rw_init();
> > > 	[...]
> > > 		register_console(&sclp_console);
> > > 		return 0;
> > > 	}
> > > 
> > > sclp_rw_init() is called before register_console(). And
> > > console_unlock() will never call sclp_console_write() before
> > > the console is registered.
> > > 
> > > AFAIK, lockdep only compares existing chain of locks. It does
> > > not know about console registration that would make some
> > > code paths mutually exclusive.
> > > 
> > > I believe that it is a false positive. I do not know how to
> > > avoid this lockdep report. I hope that it will disappear
> > > by deferring all printk() calls rather soon.
> > 
> > Thanks a lot for looking into this Petr. I have also checked the code
> > and I really fail to see why the allocation has to be done under the
> > lock in the first place. sclp_read_sccb and sclp_init_sccb are global
> > variables but I strongly suspect that they need a synchronization during
> > early init, callbacks are registered only later IIUC:
> 
> Good idea. It would work when the init function is called only once.
> But see below.
> 
> > diff --git a/drivers/s390/char/sclp.c b/drivers/s390/char/sclp.c
> > index d2ab3f07c008..4b1c033e3255 100644
> > --- a/drivers/s390/char/sclp.c
> > +++ b/drivers/s390/char/sclp.c
> > @@ -1169,13 +1169,13 @@ sclp_init(void)
> >  	unsigned long flags;
> >  	int rc = 0;
> >  
> > +	sclp_read_sccb = (void *) __get_free_page(GFP_ATOMIC | GFP_DMA);
> > +	sclp_init_sccb = (void *) __get_free_page(GFP_ATOMIC | GFP_DMA);
> >  	spin_lock_irqsave(&sclp_lock, flags);
> >  	/* Check for previous or running initialization */
> >  	if (sclp_init_state != sclp_init_state_uninitialized)
> >  		goto fail_unlock;
> 
> It seems that sclp_init() could be called several times in parallel.
> I see it called from sclp_register() and sclp_initcall().

Interesting. Something for s390 people to answer I guess.
Anyway, this should be quite trivial to work around with a cmpxchg or alike.
Michal Hocko Oct. 8, 2019, 8:40 a.m. UTC | #19
On Mon 07-10-19 11:33:27, Qian Cai wrote:
> On Mon, 2019-10-07 at 17:12 +0200, Michal Hocko wrote:
> > On Mon 07-10-19 10:59:10, Qian Cai wrote:
> > [...]
> > > It is almost impossible to eliminate all the indirect call chains from
> > > console_sem/console_owner_lock to zone->lock because it is too normal that
> > > something later needs to allocate some memory dynamically, so as long as it
> > > directly call printk() with zone->lock held, it will be in trouble.
> > 
> > Do you have any example where the console driver really _has_ to
> > allocate. Because I have hard time to believe this is going to work at
> > all as the atomic context doesn't allow to do any memory reclaim and
> > such an allocation would be too easy to fail so the allocation cannot
> > really rely on it.
> 
> I don't know how to explain to you clearly, but let me repeat again one last
> time. There is no necessary for console driver directly to allocate considering
> this example,
> 
> CPU0:              CPU1:    CPU2:       CPU3:
> console_sem->lock                       zone->lock
>                    pi->lock
> pi->lock                    rq_lock
>                    rq->lock
>                             zone->lock
>                                         console_sem->lock
> 
> Here it only need someone held the rq_lock and allocate some memory.

Is the scheduler really allocating while holding the rq lock?

> There is
> also true for port_lock. Since the deadlock could involve a lot of CPUs and a
> longer lock chain, it is impossible to predict which one to allocate some memory
> while held a lock could end up with the same problematic lock chain.

And that is exactly what I've said earlier. Locks used by consoles
should really better be tail locks because then they are going to create
arbitrary dependency chains. The zone->lock is in no way special here.
 
> > So again, crippling the MM code just because of lockdep false possitives
> > or a broken console driver sounds like a wrong way to approach the
> > problem.
> > 
> > > [  297.425964] -> #1 (&port_lock_key){-.-.}:
> > > [  297.425967]        __lock_acquire+0x5b3/0xb40
> > > [  297.425967]        lock_acquire+0x126/0x280
> > > [  297.425968]        _raw_spin_lock_irqsave+0x3a/0x50
> > > [  297.425969]        serial8250_console_write+0x3e4/0x450
> > > [  297.425970]        univ8250_console_write+0x4b/0x60
> > > [  297.425970]        console_unlock+0x501/0x750
> > > [  297.425971]        vprintk_emit+0x10d/0x340
> > > [  297.425972]        vprintk_default+0x1f/0x30
> > > [  297.425972]        vprintk_func+0x44/0xd4
> > > [  297.425973]        printk+0x9f/0xc5
> > > [  297.425974]        register_console+0x39c/0x520
> > > [  297.425975]        univ8250_console_init+0x23/0x2d
> > > [  297.425975]        console_init+0x338/0x4cd
> > > [  297.425976]        start_kernel+0x534/0x724
> > > [  297.425977]        x86_64_start_reservations+0x24/0x26
> > > [  297.425977]        x86_64_start_kernel+0xf4/0xfb
> > > [  297.425978]        secondary_startup_64+0xb6/0xc0
> > 
> > This is an early init code again so the lockdep sounds like a false
> > possitive to me.
> 
> This is just a tip of iceberg to show the lock dependency,

Does this tip point to a real deadlock or merely a class of lockdep
false dependencies?

> console_owner --> port_lock_key
> 
> which could easily happen everywhere with a simple printk().
Qian Cai Oct. 8, 2019, 9:32 a.m. UTC | #20
> On Oct 8, 2019, at 4:15 AM, Petr Mladek <pmladek@suse.com> wrote:
> 
> I am curious about CPU2. Does scheduler need to allocate memory?

It happened before with debug objects. It might not be 100% necessary, but it could easily end up this way in the current code. Moreover, this is just an example of how things could go wrong, as it could happen for pi_lock or any other unknown downstream locks which are not really console related.
Qian Cai Oct. 8, 2019, 10:04 a.m. UTC | #21
> On Oct 8, 2019, at 4:40 AM, Michal Hocko <mhocko@kernel.org> wrote:
> 
> Does this tip point to a real deadlock or merely a class of lockdep
> false dependencies?

I lean towards it being a real deadlock given how trivial it is to generate those lock orders everywhere. On the other hand, it makes little difference to spend too much time arguing the authenticity of those reproducible lockdep splats, as even false positives could be just as bad for developers, since they would disable the whole lockdep and mask off other true deadlocks that could happen later.
Michal Hocko Oct. 8, 2019, 10:39 a.m. UTC | #22
On Tue 08-10-19 06:04:32, Qian Cai wrote:
> 
> 
> > On Oct 8, 2019, at 4:40 AM, Michal Hocko <mhocko@kernel.org> wrote:
> > 
> > Does this tip point to a real deadlock or merely a class of lockdep
> > false dependencies?
> 
> I lean towards it is a real deadlock given how trivial to generate those lock orders everywhere.

Have you actually triggered any real deadlock? With a zone->lock in
place it would be pretty clear with hard lockups detected.
Qian Cai Oct. 8, 2019, noon UTC | #23
> On Oct 8, 2019, at 6:39 AM, Michal Hocko <mhocko@kernel.org> wrote:
> 
> Have you actually triggered any real deadlock? With a zone->lock in
> place it would be pretty clear with hard lockups detected.

Yes, I did trigger here and there, and those lockdep splats are especially useful to figure out why.
Michal Hocko Oct. 8, 2019, 12:39 p.m. UTC | #24
On Tue 08-10-19 08:00:43, Qian Cai wrote:
> 
> 
> > On Oct 8, 2019, at 6:39 AM, Michal Hocko <mhocko@kernel.org> wrote:
> > 
> > Have you actually triggered any real deadlock? With a zone->lock in
> > place it would be pretty clear with hard lockups detected.
> 
> Yes, I did trigger here and there, and those lockdep splats are
> especially useful to figure out why.

Can you provide a lockdep splat from an actual deadlock please? I am
sorry but your responses tend to be really cryptic and I never know when
you are talking about actual deadlocks and lockdep splats. I have asked
about the former several times never receiving a specific answer.
Christian Borntraeger Oct. 8, 2019, 12:56 p.m. UTC | #25
Adding Peter Oberparleiter.
Peter, can you have a look?

On 08.10.19 10:27, Michal Hocko wrote:
> On Tue 08-10-19 09:43:57, Petr Mladek wrote:
>> On Mon 2019-10-07 16:49:37, Michal Hocko wrote:
>>> [Cc s390 maintainers - the lockdep is http://lkml.kernel.org/r/1570228005-24979-1-git-send-email-cai@lca.pw
>>>  Petr has explained it is a false positive
>>>  http://lkml.kernel.org/r/20191007143002.l37bt2lzqtnqjqxu@pathway.suse.cz]
>>> On Mon 07-10-19 16:30:02, Petr Mladek wrote:
>>> [...]
>>>> I believe that it cannot really happen because:
>>>>
>>>> 	static int __init
>>>> 	sclp_console_init(void)
>>>> 	{
>>>> 	[...]
>>>> 		rc = sclp_rw_init();
>>>> 	[...]
>>>> 		register_console(&sclp_console);
>>>> 		return 0;
>>>> 	}
>>>>
>>>> sclp_rw_init() is called before register_console(). And
>>>> console_unlock() will never call sclp_console_write() before
>>>> the console is registered.
>>>>
>>>> AFAIK, lockdep only compares existing chain of locks. It does
>>>> not know about console registration that would make some
>>>> code paths mutually exclusive.
>>>>
>>>> I believe that it is a false positive. I do not know how to
>>>> avoid this lockdep report. I hope that it will disappear
>>>> by deferring all printk() calls rather soon.
>>>
>>> Thanks a lot for looking into this Petr. I have also checked the code
>>> and I really fail to see why the allocation has to be done under the
>>> lock in the first place. sclp_read_sccb and sclp_init_sccb are global
>>> variables but I strongly suspect that they need a synchronization during
>>> early init, callbacks are registered only later IIUC:
>>
>> Good idea. It would work when the init function is called only once.
>> But see below.
>>
>>> diff --git a/drivers/s390/char/sclp.c b/drivers/s390/char/sclp.c
>>> index d2ab3f07c008..4b1c033e3255 100644
>>> --- a/drivers/s390/char/sclp.c
>>> +++ b/drivers/s390/char/sclp.c
>>> @@ -1169,13 +1169,13 @@ sclp_init(void)
>>>  	unsigned long flags;
>>>  	int rc = 0;
>>>  
>>> +	sclp_read_sccb = (void *) __get_free_page(GFP_ATOMIC | GFP_DMA);
>>> +	sclp_init_sccb = (void *) __get_free_page(GFP_ATOMIC | GFP_DMA);
>>>  	spin_lock_irqsave(&sclp_lock, flags);
>>>  	/* Check for previous or running initialization */
>>>  	if (sclp_init_state != sclp_init_state_uninitialized)
>>>  		goto fail_unlock;
>>
>> It seems that sclp_init() could be called several times in parallel.
>> I see it called from sclp_register() and sclp_initcall().
> 
> Interesting. Something for s390 people to answer I guess.
> Anyway, this should be quite trivial to workaround by a cmpxch or alike.
>
Qian Cai Oct. 8, 2019, 1:06 p.m. UTC | #26
On Tue, 2019-10-08 at 14:39 +0200, Michal Hocko wrote:
> On Tue 08-10-19 08:00:43, Qian Cai wrote:
> > 
> > 
> > > On Oct 8, 2019, at 6:39 AM, Michal Hocko <mhocko@kernel.org> wrote:
> > > 
> > > Have you actually triggered any real deadlock? With a zone->lock in
> > > place it would be pretty clear with hard lockups detected.
> > 
> > Yes, I did trigger here and there, and those lockdep splats are
> > especially useful to figure out why.
> 
> Can you provide a lockdep splat from an actual deadlock please? I am
> sorry but your responses tend to be really cryptic and I never know when
> you are talking about actual deadlocks and lockdep splats. I have asked
> about the former several times never receiving a specific answer.

It is very time-consuming to confirm that a lockdep splat 100% matches a deadlock
given that it is not yet reproducible at will. When I did encounter a
memory offline deadlock where "echo offline > memory/state" just hung, there
was no hard lockup, probably because the hard lockup detector did not work
properly for some reason, or because it kept trying to acquire a spin lock that only
kept the CPU at 100%.
Petr Mladek Oct. 8, 2019, 1:08 p.m. UTC | #27
On Mon 2019-10-07 10:59:10, Qian Cai wrote:
> It is almost impossible to eliminate all the indirect call chains from
> console_sem/console_owner_lock to zone->lock because it is too normal that
> something later needs to allocate some memory dynamically, so as long as it
> directly call printk() with zone->lock held, it will be in trouble.

It is not normal the something needs to allocate memory under
console_sem. Console drivers are supposed to get the message
out even when the system is in really bad state and it is not
possible to allocate memory. I consider this a bug in the console
driver.

> I really hope the new printk() will solve this class of the problem, but it is
> essentially up to the air until a patchset was posted. There are just too many
> questions out there to be answered.

The new printk should prevent all deadlocks simply because it will
be fully lockless. The console drivers will be called from dedicated
kthreads and therefore from a well defined context.

Best Regards,
Petr
Steven Rostedt Oct. 8, 2019, 1:13 p.m. UTC | #28
On Tue, 8 Oct 2019 10:15:10 +0200
Petr Mladek <pmladek@suse.com> wrote:

> There are basically three possibilities:
> 
> 1. Do crazy exercises with locks all around the kernel to
>    avoid the deadlocks. It is usually not worth it. And
>    it is a "whack a mole" approach.
> 
> 2. Use printk_deferred() in problematic code paths. It is
>    a "whack a mole" approach as well. And we would end up
>    with printk_deferred() used almost everywhere.
> 
> 3. Always deffer the console handling in printk(). This would
>    help also to avoid soft lockups. Several people pushed
>    against this last few years because it might reduce
>    the chance to see the message in case of system crash.
> 
> As I said, there has finally been agreement to always do
> the offload few weeks ago. John Ogness is working on it.
> So we might have the systematic solution for these deadlocks
> rather sooner than later.

Another solution is to add the printk_deferred() in these places that
cause lockdep splats, and when John's work is done, it would be easy to
grep for them and remove them as they would no longer be needed.

This way we don't play whack-a-mole forever (only until we have a
proper solution) and everyone is happy that we no longer have these
false positive or I-don't-care lockdep splats which hide real lockdep
splats because lockdep shuts off as soon as it discovers its first
splat.

-- Steve
Qian Cai Oct. 8, 2019, 1:23 p.m. UTC | #29
On Tue, 2019-10-08 at 09:13 -0400, Steven Rostedt wrote:
> On Tue, 8 Oct 2019 10:15:10 +0200
> Petr Mladek <pmladek@suse.com> wrote:
> 
> > There are basically three possibilities:
> > 
> > 1. Do crazy exercises with locks all around the kernel to
> >    avoid the deadlocks. It is usually not worth it. And
> >    it is a "whack a mole" approach.
> > 
> > 2. Use printk_deferred() in problematic code paths. It is
> >    a "whack a mole" approach as well. And we would end up
> >    with printk_deferred() used almost everywhere.
> > 
> > 3. Always deffer the console handling in printk(). This would
> >    help also to avoid soft lockups. Several people pushed
> >    against this last few years because it might reduce
> >    the chance to see the message in case of system crash.
> > 
> > As I said, there has finally been agreement to always do
> > the offload few weeks ago. John Ogness is working on it.
> > So we might have the systematic solution for these deadlocks
> > rather sooner than later.
> 
> Another solution is to add the printk_deferred() in these places that
> cause lockdep splats, and when John's work is done, it would be easy to
> grep for them and remove them as they would no longer be needed.
> 
> This way we don't play whack-a-mole forever (only until we have a
> proper solution) and everyone is happy that we no longer have these
> false positive or I-don't-care lockdep splats which hide real lockdep
> splats because lockdep shuts off as soon as it discovers its first
> splat.

I feel like that is what I was trying to do, but there seems to be a lot of resistance
to that approach, where pragmatism met with perfectionism.
Steven Rostedt Oct. 8, 2019, 1:33 p.m. UTC | #30
On Tue, 08 Oct 2019 09:23:52 -0400
Qian Cai <cai@lca.pw> wrote:

> I feel like that is what I trying to do, but there seems a lot of resistances
> with that approach where pragmatism met with perfectionism.

It's the way it came across. It sounded as if you were proposing
"the solution". I'm coming out and explicitly saying that this may be a
"temporary solution", as having "printk_deferred()" we can easily
remove them when that becomes the default operation.

-- Steve
Qian Cai Oct. 8, 2019, 1:33 p.m. UTC | #31
On Tue, 2019-10-08 at 15:08 +0200, Petr Mladek wrote:
> On Mon 2019-10-07 10:59:10, Qian Cai wrote:
> > It is almost impossible to eliminate all the indirect call chains from
> > console_sem/console_owner_lock to zone->lock because it is too normal that
> > something later needs to allocate some memory dynamically, so as long as it
> > directly call printk() with zone->lock held, it will be in trouble.
> 
> It is not normal the something needs to allocate memory under
> console_sem. Console drivers are supposed to get the message
> out even when the system is in really bad state and it is not
> possible to allocate memory. I consider this a bug in the console
> driver.

Again, it is not directly under console_sem. It is *indirect*.

console_sem --> lockA
lockA --> lockB
lockB --> lockC

Anything allocating memory with lockB or lockC held will form the problematic
lock chain to trigger the lockdep splat with memory offline.
Michal Hocko Oct. 8, 2019, 1:37 p.m. UTC | #32
On Tue 08-10-19 09:06:29, Qian Cai wrote:
> On Tue, 2019-10-08 at 14:39 +0200, Michal Hocko wrote:
> > On Tue 08-10-19 08:00:43, Qian Cai wrote:
> > > 
> > > 
> > > > On Oct 8, 2019, at 6:39 AM, Michal Hocko <mhocko@kernel.org> wrote:
> > > > 
> > > > Have you actually triggered any real deadlock? With a zone->lock in
> > > > place it would be pretty clear with hard lockups detected.
> > > 
> > > Yes, I did trigger here and there, and those lockdep splats are
> > > especially useful to figure out why.
> > 
> > Can you provide a lockdep splat from an actual deadlock please? I am
> > sorry but your responses tend to be really cryptic and I never know when
> > you are talking about actual deadlocks and lockdep splats. I have asked
> > about the former several times never receiving a specific answer.
> 
> It is very time-consuming to confirm a lockdep splat is 100% matching a deadlock
> giving that it is not able to reproduce on will yet, so when I did encounter a
> memory offline deadlock where "echo offline > memory/state" just hang, but there
> is no hard lockup probably because the hard lockup detector did not work
> properly for some reasons or it keep trying to acquire a spin lock that only
> keep the CPU 100%.

If there is a real deadlock due to zone->lock then you would certainly
get a hard lockup splat. So I strongly suspect that you are seeing a
completely different problem. Most likely some pages cannot be migrated
and the offlining code will retry for ever. You can terminate that from
the userspace by a fatal signal of course.
Petr Mladek Oct. 8, 2019, 1:42 p.m. UTC | #33
On Tue 2019-10-08 09:23:52, Qian Cai wrote:
> On Tue, 2019-10-08 at 09:13 -0400, Steven Rostedt wrote:
> > On Tue, 8 Oct 2019 10:15:10 +0200
> > Petr Mladek <pmladek@suse.com> wrote:
> > 
> > > There are basically three possibilities:
> > > 
> > > 1. Do crazy exercises with locks all around the kernel to
> > >    avoid the deadlocks. It is usually not worth it. And
> > >    it is a "whack a mole" approach.
> > > 
> > > 2. Use printk_deferred() in problematic code paths. It is
> > >    a "whack a mole" approach as well. And we would end up
> > >    with printk_deferred() used almost everywhere.
> > > 
> > > 3. Always deffer the console handling in printk(). This would
> > >    help also to avoid soft lockups. Several people pushed
> > >    against this last few years because it might reduce
> > >    the chance to see the message in case of system crash.
> > > 
> > > As I said, there has finally been agreement to always do
> > > the offload few weeks ago. John Ogness is working on it.
> > > So we might have the systematic solution for these deadlocks
> > > rather sooner than later.
> > 
> > Another solution is to add the printk_deferred() in these places that
> > cause lockdep splats, and when John's work is done, it would be easy to
> > grep for them and remove them as they would no longer be needed.
> > 
> > This way we don't play whack-a-mole forever (only until we have a
> > proper solution) and everyone is happy that we no longer have these
> > false positive or I-don't-care lockdep splats which hide real lockdep
> > splats because lockdep shuts off as soon as it discovers its first
> > splat.
> 
> I feel like that is what I trying to do, but there seems a lot of resistances
> with that approach where pragmatism met with perfectionism.

No, the resistance was against complicated code changes (games with
locks) and against removing useful messages. Such changes might cause
more harm than good.

I am not -mm maintainer so I could not guarantee that a patch
using printk_deferred() will get accepted. But it will have much
bigger chance than the original patch.

Anyway, printk_deferred() is a lost war. It is temporary solution
for one particular scenario. But as you said, there might be many
others. The long term solution is the printk rework.

Best Regards,
Petr
Michal Hocko Oct. 8, 2019, 1:48 p.m. UTC | #34
On Tue 08-10-19 15:42:56, Petr Mladek wrote:
[...]
> I am not -mm maintainer so I could not guarantee that a patch
> using printk_deferred() will get accepted. But it will have much
> bigger chance than the original patch.

I am not going to ack any such patch until it is clear what is the
actual problem. The discussion in this thread boils down to pointing to
lockdep splats which are most likely false positives, and there is no
clear evidence that there is any actual deadlock, as those would be clearly
identifiable if the zone->lock or any other spinlock were
participating.
Qian Cai Oct. 8, 2019, 2:03 p.m. UTC | #35
On Tue, 2019-10-08 at 15:42 +0200, Petr Mladek wrote:
> On Tue 2019-10-08 09:23:52, Qian Cai wrote:
> > On Tue, 2019-10-08 at 09:13 -0400, Steven Rostedt wrote:
> > > On Tue, 8 Oct 2019 10:15:10 +0200
> > > Petr Mladek <pmladek@suse.com> wrote:
> > > 
> > > > There are basically three possibilities:
> > > > 
> > > > 1. Do crazy exercises with locks all around the kernel to
> > > >    avoid the deadlocks. It is usually not worth it. And
> > > >    it is a "whack a mole" approach.
> > > > 
> > > > 2. Use printk_deferred() in problematic code paths. It is
> > > >    a "whack a mole" approach as well. And we would end up
> > > >    with printk_deferred() used almost everywhere.
> > > > 
> > > > 3. Always deffer the console handling in printk(). This would
> > > >    help also to avoid soft lockups. Several people pushed
> > > >    against this last few years because it might reduce
> > > >    the chance to see the message in case of system crash.
> > > > 
> > > > As I said, there has finally been agreement to always do
> > > > the offload few weeks ago. John Ogness is working on it.
> > > > So we might have the systematic solution for these deadlocks
> > > > rather sooner than later.
> > > 
> > > Another solution is to add the printk_deferred() in these places that
> > > cause lockdep splats, and when John's work is done, it would be easy to
> > > grep for them and remove them as they would no longer be needed.
> > > 
> > > This way we don't play whack-a-mole forever (only until we have a
> > > proper solution) and everyone is happy that we no longer have these
> > > false positive or I-don't-care lockdep splats which hide real lockdep
> > > splats because lockdep shuts off as soon as it discovers its first
> > > splat.
> > 
> > I feel like that is what I trying to do, but there seems a lot of resistances
> > with that approach where pragmatism met with perfectionism.
> 
> No, the resistance was against complicated code changes (games with
> locks) and against removing useful messages. Such changes might cause
> more harm than good.

I don't think there is any "removing useful messages" in this patch. That one
printk() in __offline_isolated_pages(), as Michal mentioned, is not that
useful, but could be converted to printk_deferred() if anyone objected.

It is more complicated to convert dump_page() to use printk_deferred().

> 
> I am not -mm maintainer so I could not guarantee that a patch
> using printk_deferred() will get accepted. But it will have much
> bigger chance than the original patch.
> 
> Anyway, printk_deferred() is a lost war. It is temporary solution
> for one particular scenario. But as you said, there might be many
> others. The long term solution is the printk rework.
> 
> Best Regards,
> Petr
Michal Hocko Oct. 8, 2019, 2:08 p.m. UTC | #36
On Tue 08-10-19 10:03:01, Qian Cai wrote:
[..]
> I don't think there is "removing useful messages" in this patch. That one
> printk() in __offline_isolated_pages() basically as Michal mentioned it is that
> useful, but could be converted to printk_deferred() if anyone objected.

"remove from free list" printk is quite dubious as I've said. And I
wouldn't be opposed to removing it altogether. It just swamps the log with
something that is of no real use. It might have been useful while
shaping and debugging the initial code but it has close to zero value
for end users. But it makes no sense to make it printk_deferred
on the grounds mentioned so far.
Qian Cai Oct. 8, 2019, 4:08 p.m. UTC | #37
On Tue, 2019-10-08 at 14:56 +0200, Christian Borntraeger wrote:
> Adding Peter Oberparleiter.
> Peter, can you have a look?
> 
> On 08.10.19 10:27, Michal Hocko wrote:
> > On Tue 08-10-19 09:43:57, Petr Mladek wrote:
> > > On Mon 2019-10-07 16:49:37, Michal Hocko wrote:
> > > > [Cc s390 maintainers - the lockdep is http://lkml.kernel.org/r/1570228005-24979-1-git-send-email-cai@lca.pw
> > > >  Petr has explained it is a false positive
> > > >  http://lkml.kernel.org/r/20191007143002.l37bt2lzqtnqjqxu@pathway.suse.cz]
> > > > On Mon 07-10-19 16:30:02, Petr Mladek wrote:
> > > > [...]
> > > > > I believe that it cannot really happen because:
> > > > > 
> > > > > 	static int __init
> > > > > 	sclp_console_init(void)
> > > > > 	{
> > > > > 	[...]
> > > > > 		rc = sclp_rw_init();
> > > > > 	[...]
> > > > > 		register_console(&sclp_console);
> > > > > 		return 0;
> > > > > 	}
> > > > > 
> > > > > sclp_rw_init() is called before register_console(). And
> > > > > console_unlock() will never call sclp_console_write() before
> > > > > the console is registered.
> > > > > 
> > > > > AFAIK, lockdep only compares existing chain of locks. It does
> > > > > not know about console registration that would make some
> > > > > code paths mutually exclusive.
> > > > > 
> > > > > I believe that it is a false positive. I do not know how to
> > > > > avoid this lockdep report. I hope that it will disappear
> > > > > by deferring all printk() calls rather soon.
> > > > 
> > > > Thanks a lot for looking into this Petr. I have also checked the code
> > > > and I really fail to see why the allocation has to be done under the
> > > > lock in the first place. sclp_read_sccb and sclp_init_sccb are global
> > > > variables but I strongly suspect that they need a synchronization during
> > > > early init, callbacks are registered only later IIUC:
> > > 
> > > Good idea. It would work when the init function is called only once.
> > > But see below.
> > > 
> > > > diff --git a/drivers/s390/char/sclp.c b/drivers/s390/char/sclp.c
> > > > index d2ab3f07c008..4b1c033e3255 100644
> > > > --- a/drivers/s390/char/sclp.c
> > > > +++ b/drivers/s390/char/sclp.c
> > > > @@ -1169,13 +1169,13 @@ sclp_init(void)
> > > >  	unsigned long flags;
> > > >  	int rc = 0;
> > > >  
> > > > +	sclp_read_sccb = (void *) __get_free_page(GFP_ATOMIC | GFP_DMA);
> > > > +	sclp_init_sccb = (void *) __get_free_page(GFP_ATOMIC | GFP_DMA);
> > > >  	spin_lock_irqsave(&sclp_lock, flags);
> > > >  	/* Check for previous or running initialization */
> > > >  	if (sclp_init_state != sclp_init_state_uninitialized)
> > > >  		goto fail_unlock;
> > > 
> > > It seems that sclp_init() could be called several times in parallel.
> > > I see it called from sclp_register() and sclp_initcall().
> > 
> > Interesting. Something for s390 people to answer I guess.
> > Anyway, this should be quite trivial to workaround by a cmpxch or alike.
> > 

The above fix is simply insufficient,

00: [    3.654307] WARNING: possible circular locking dependency detected       
00: [    3.654309] 5.4.0-rc1-next-20191004+ #4 Not tainted                      
00: [    3.654311] ------------------------------------------------------       
00: [    3.654313] swapper/0/1 is trying to acquire lock:                       
00: [    3.654314] 00000000553f3fb8 (sclp_lock){-.-.}, at: sclp_add_request+0x34
00: /0x308                                                                      
00: [    3.654320]                                                              
00: [    3.654322] but task is already holding lock:                            
00: [    3.654323] 00000000550c9fc0 (console_owner){....}, at: console_unlock+0x
00: 328/0xa30                                                                   
00: [    3.654329]                                                              
00: [    3.654331] which lock already depends on the new lock.                  
00: [    3.654332]                                                              
00: [    3.654333]                                                              
00: [    3.654335] the existing dependency chain (in reverse order) is:         
00: [    3.654336]                                                              
00: [    3.654337] -> #3 (console_owner){....}:                                 
00: [    3.654343]        lock_acquire+0x21a/0x468                              
00: [    3.654345]        console_unlock+0x3a6/0xa30                            
00: [    3.654346]        vprintk_emit+0x184/0x3c8                              
00: [    3.654348]        vprintk_default+0x44/0x50                             
00: [    3.654349]        printk+0xa8/0xc0                                      
00: [    3.654351]        get_random_u64+0x40/0x108                             
00: [    3.654360]        add_to_free_area_random+0x188/0x1c0                   
00: [    3.654364]        free_one_page+0x72/0x128                              
00: [    3.654366]        __free_pages_ok+0x51c/0xca0                           
00: [    3.654368]        memblock_free_all+0x30a/0x3b0                         
00: [    3.654370]        mem_init+0x84/0x200                                   
00: [    3.654371]        start_kernel+0x384/0x6a0                              
00: [    3.654373]        startup_continue+0x70/0xd0                            
00: [    3.654374]                                                              
00: [    3.654375] -> #2 (&(&zone->lock)->rlock){....}:                         
00: [    3.654382]        lock_acquire+0x21a/0x468                              
00: [    3.654383]        _raw_spin_lock+0x54/0x68                              
00: [    3.654385]        get_page_from_freelist+0x8b6/0x2d28                   
00: [    3.654387]        __alloc_pages_nodemask+0x246/0x658                    
00: [    3.654389]        alloc_slab_page+0x43c/0x858                           
00: [    3.654390]        allocate_slab+0x90/0x6f0                              
00: [    3.654392]        new_slab+0x88/0x90                                    
00: [    3.654393]        ___slab_alloc+0x600/0x988                             
00: [    3.654395]        __slab_alloc+0x5a/0x90                                
00: [    3.654396]        kmem_cache_alloc+0x340/0x4c0                          
00: [    3.654398]        fill_pool+0x27a/0x498                                 
00: [    3.654400]        __debug_object_init+0xa8/0x858                        
00: [    3.654401]        debug_object_activate+0x208/0x370                     
00: [    3.654403]        __call_rcu+0xb6/0x4a8                                 
00: [    3.654405]        unregister_external_irq+0x13a/0x140                   
00: [    3.654406]        sclp_init+0x5e0/0x688                                 
00: [    3.654408]        sclp_register+0x2e/0x248                              
00: [    3.654409]        sclp_rw_init+0x4a/0x70                                
00: [    3.654411]        sclp_console_init+0x4a/0x1b8                          
00: [    3.654413]        console_init+0x2c8/0x410                              
00: [    3.654414]        start_kernel+0x530/0x6a0                              
00: [    3.654416]        startup_continue+0x70/0xd0                            
00: [    3.654417]                                                              
00: [    3.654418] -> #1 (ext_int_hash_lock){....}:                             
00: [    3.654424]        lock_acquire+0x21a/0x468                              
00: [    3.654426]        _raw_spin_lock_irqsave+0xcc/0xe8                      
00: [    3.654427]        register_external_irq+0xb6/0x138                      
00: [    3.654429]        sclp_init+0x212/0x688                                 
00: [    3.654430]        sclp_register+0x2e/0x248                              
00: [    3.654432]        sclp_rw_init+0x4a/0x70                                
00: [    3.654434]        sclp_console_init+0x4a/0x1b8                          
00: [    3.654435]        console_init+0x2c8/0x410                              
00: [    3.654437]        start_kernel+0x530/0x6a0                              
00: [    3.654438]        startup_continue+0x70/0xd0                            
00: [    3.654439]                                                              
00: [    3.654440] -> #0 (sclp_lock){-.-.}:                                     
00: [    3.654446]        check_noncircular+0x338/0x3e0                         
00: [    3.654448]        __lock_acquire+0x1e66/0x2d88                          
00: [    3.654450]        lock_acquire+0x21a/0x468                              
00: [    3.654451]        _raw_spin_lock_irqsave+0xcc/0xe8                      
00: [    3.654453]        sclp_add_request+0x34/0x308                           
00: [    3.654455]        sclp_conbuf_emit+0x100/0x138                          
00: [    3.654456]        sclp_console_write+0x96/0x3b8                         
00: [    3.654458]        console_unlock+0x6dc/0xa30                            
00: [    3.654460]        vprintk_emit+0x184/0x3c8                              
00: [    3.654461]        vprintk_default+0x44/0x50                             
00: [    3.654463]        printk+0xa8/0xc0                                      
00: [    3.654464]        iommu_debugfs_setup+0xf2/0x108                        
00: [    3.654466]        iommu_init+0x6c/0x78                                  
00: [    3.654467]        do_one_initcall+0x162/0x680                           
00: [    3.654469]        kernel_init_freeable+0x4e8/0x5a8                      
00: [    3.654471]        kernel_init+0x2a/0x188                                
00: [    3.654472]        ret_from_fork+0x30/0x34                               
00: [    3.654474]        kernel_thread_starter+0x0/0xc                         
00: [    3.654475]                                                              
00: [    3.654476] other info that might help us debug this:                    
00: [    3.654477]                                                              
00: [    3.654479] Chain exists of:                                             
00: [    3.654480]   sclp_lock --> &(&zone->lock)->rlock --> console_owner      
00: [    3.654488]                                                              
00: [    3.654489]  Possible unsafe locking scenario:                           
00: [    3.654490]                                                              
00: [    3.654492]        CPU0                    CPU1                          
00: [    3.654493]        ----                    ----                          
00: [    3.654495]   lock(console_owner);                                       
00: [    3.654498]                                lock(&(&zone->lock)->rlock);  
00: [    3.654503]                                lock(console_owner);          
00: [    3.654506]   lock(sclp_lock);                                           
00: [    3.654509]                                                              
00: [    3.654511]  *** DEADLOCK ***                                            
00: [    3.654512]                                                              
00: [    3.654513] 2 locks held by swapper/0/1:                                 
00: [    3.654514]  #0: 00000000550ca200 (console_lock){+.+.}, at: vprintk_emit+
00: 0x178/0x3c8                                                                 
00: [    3.654521]  #1: 00000000550c9fc0 (console_owner){....}, at: console_unlo
00: ck+0x328/0xa30                                                              
00: [    3.654529]                                                              
00: [    3.654530] stack backtrace:                                             
00: [    3.654532] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.4.0-rc1-next-2019
00: 1004+ #4                                                                    
00: [    3.654534] Hardware name: IBM 2964 N96 400 (z/VM 6.4.0)                 
00: [    3.654535] Call Trace:                                                  
00: [    3.654537] ([<000000005431e218>] show_stack+0x110/0x1b0)                
00: [    3.654539]  [<0000000054bdd926>] dump_stack+0x126/0x178                 
00: [    3.654541]  [<0000000054414b08>] check_noncircular+0x338/0x3e0          
00: [    3.654543]  [<000000005441aaf6>] __lock_acquire+0x1e66/0x2d88           
00: [    3.654545]  [<0000000054417e12>] lock_acquire+0x21a/0x468               
00: [    3.654547]  [<0000000054c18cc4>] _raw_spin_lock_irqsave+0xcc/0xe8       
00: [    3.654549]  [<00000000549f2e14>] sclp_add_request+0x34/0x308            
00: [    3.654551]  [<00000000549fa7f0>] sclp_conbuf_emit+0x100/0x138           
00: [    3.654553]  [<00000000549fa90e>] sclp_console_write+0x96/0x3b8          
00: [    3.654554]  [<000000005442b634>] console_unlock+0x6dc/0xa30             
00: [    3.654556]  [<000000005442de2c>] vprintk_emit+0x184/0x3c8               
00: [    3.654558]  [<000000005442e0b4>] vprintk_default+0x44/0x50              
00: [    3.654560]  [<000000005442eb60>] printk+0xa8/0xc0                       
00: [    3.654562]  [<000000005494ca3a>] iommu_debugfs_setup+0xf2/0x108         
00: [    3.654564]  [<000000005557c0ec>] iommu_init+0x6c/0x78                   
00: [    3.654566]  [<0000000054300fda>] do_one_initcall+0x162/0x680            
00: [    3.654568]  [<000000005553b9f0>] kernel_init_freeable+0x4e8/0x5a8       
00: [    3.654569]  [<0000000054c04172>] kernel_init+0x2a/0x188                 
00: [    3.654571]  [<0000000054c19fbc>] ret_from_fork+0x30/0x34                
00: [    3.654573]  [<0000000054c19fc0>] kernel_thread_starter+0x0/0xc
Michal Hocko Oct. 8, 2019, 6:35 p.m. UTC | #38
On Tue 08-10-19 12:08:37, Qian Cai wrote:
> On Tue, 2019-10-08 at 14:56 +0200, Christian Borntraeger wrote:
> > Adding Peter Oberparleiter.
> > Peter, can you have a look?
> > 
> > On 08.10.19 10:27, Michal Hocko wrote:
> > > On Tue 08-10-19 09:43:57, Petr Mladek wrote:
> > > > On Mon 2019-10-07 16:49:37, Michal Hocko wrote:
> > > > > [Cc s390 maintainers - the lockdep is http://lkml.kernel.org/r/1570228005-24979-1-git-send-email-cai@lca.pw
> > > > >  Petr has explained it is a false positive
> > > > >  http://lkml.kernel.org/r/20191007143002.l37bt2lzqtnqjqxu@pathway.suse.cz]
> > > > > On Mon 07-10-19 16:30:02, Petr Mladek wrote:
> > > > > [...]
> > > > > > I believe that it cannot really happen because:
> > > > > > 
> > > > > > 	static int __init
> > > > > > 	sclp_console_init(void)
> > > > > > 	{
> > > > > > 	[...]
> > > > > > 		rc = sclp_rw_init();
> > > > > > 	[...]
> > > > > > 		register_console(&sclp_console);
> > > > > > 		return 0;
> > > > > > 	}
> > > > > > 
> > > > > > sclp_rw_init() is called before register_console(). And
> > > > > > console_unlock() will never call sclp_console_write() before
> > > > > > the console is registered.
> > > > > > 
> > > > > > AFAIK, lockdep only compares existing chain of locks. It does
> > > > > > not know about console registration that would make some
> > > > > > code paths mutually exclusive.
> > > > > > 
> > > > > > I believe that it is a false positive. I do not know how to
> > > > > > avoid this lockdep report. I hope that it will disappear
> > > > > > by deferring all printk() calls rather soon.
> > > > > 
> > > > > Thanks a lot for looking into this Petr. I have also checked the code
> > > > > and I really fail to see why the allocation has to be done under the
> > > > > lock in the first place. sclp_read_sccb and sclp_init_sccb are global
> > > > > variables but I strongly suspect that they need a synchronization during
> > > > > early init, callbacks are registered only later IIUC:
> > > > 
> > > > Good idea. It would work when the init function is called only once.
> > > > But see below.
> > > > 
> > > > > diff --git a/drivers/s390/char/sclp.c b/drivers/s390/char/sclp.c
> > > > > index d2ab3f07c008..4b1c033e3255 100644
> > > > > --- a/drivers/s390/char/sclp.c
> > > > > +++ b/drivers/s390/char/sclp.c
> > > > > @@ -1169,13 +1169,13 @@ sclp_init(void)
> > > > >  	unsigned long flags;
> > > > >  	int rc = 0;
> > > > >  
> > > > > +	sclp_read_sccb = (void *) __get_free_page(GFP_ATOMIC | GFP_DMA);
> > > > > +	sclp_init_sccb = (void *) __get_free_page(GFP_ATOMIC | GFP_DMA);
> > > > >  	spin_lock_irqsave(&sclp_lock, flags);
> > > > >  	/* Check for previous or running initialization */
> > > > >  	if (sclp_init_state != sclp_init_state_uninitialized)
> > > > >  		goto fail_unlock;
> > > > 
> > > > It seems that sclp_init() could be called several times in parallel.
> > > > I see it called from sclp_register() and sclp_initcall().
> > > 
> > > Interesting. Something for s390 people to answer I guess.
> > > Anyway, this should be quite trivial to workaround by a cmpxch or alike.
> > > 
> 
> The above fix is simply insufficient,

Isn't this yet another init time lockdep false positive?

> 00: [    3.654337] -> #3 (console_owner){....}:                                 
> 00: [    3.654343]        lock_acquire+0x21a/0x468                              
> 00: [    3.654345]        console_unlock+0x3a6/0xa30                            
> 00: [    3.654346]        vprintk_emit+0x184/0x3c8                              
> 00: [    3.654348]        vprintk_default+0x44/0x50                             
> 00: [    3.654349]        printk+0xa8/0xc0                                      
> 00: [    3.654351]        get_random_u64+0x40/0x108                             
> 00: [    3.654360]        add_to_free_area_random+0x188/0x1c0                   
> 00: [    3.654364]        free_one_page+0x72/0x128                              
> 00: [    3.654366]        __free_pages_ok+0x51c/0xca0                           
> 00: [    3.654368]        memblock_free_all+0x30a/0x3b0                         
> 00: [    3.654370]        mem_init+0x84/0x200                                   
> 00: [    3.654371]        start_kernel+0x384/0x6a0                              
> 00: [    3.654373]        startup_continue+0x70/0xd0                            

This one is actually a nice example of why trying to get printk out of the
zone->lock is simply not viable. This one is likely a printk to warn
that the random pool is not fully initialized, just because the
allocator tries to randomize the initial free memory pool. You are not
going to remove that printk, right?

I fully agree that this class of lockdep splats are annoying especially
when they make the lockdep unusable but please discuss this with lockdep
maintainers and try to find some solution rather than go and try to
work around the problem all over the place. If there are places that
would result in a cleaner code then go for it but please do not make the
code worse just because of a non existent problem flagged by a false
positive.
Qian Cai Oct. 8, 2019, 7:06 p.m. UTC | #39
On Tue, 2019-10-08 at 20:35 +0200, Michal Hocko wrote:
> On Tue 08-10-19 12:08:37, Qian Cai wrote:
> > On Tue, 2019-10-08 at 14:56 +0200, Christian Borntraeger wrote:
> > > Adding Peter Oberparleiter.
> > > Peter, can you have a look?
> > > 
> > > On 08.10.19 10:27, Michal Hocko wrote:
> > > > On Tue 08-10-19 09:43:57, Petr Mladek wrote:
> > > > > On Mon 2019-10-07 16:49:37, Michal Hocko wrote:
> > > > > > [Cc s390 maintainers - the lockdep is http://lkml.kernel.org/r/1570228005-24979-1-git-send-email-cai@lca.pw
> > > > > >  Petr has explained it is a false positive
> > > > > >  http://lkml.kernel.org/r/20191007143002.l37bt2lzqtnqjqxu@pathway.suse.cz]
> > > > > > On Mon 07-10-19 16:30:02, Petr Mladek wrote:
> > > > > > [...]
> > > > > > > I believe that it cannot really happen because:
> > > > > > > 
> > > > > > > 	static int __init
> > > > > > > 	sclp_console_init(void)
> > > > > > > 	{
> > > > > > > 	[...]
> > > > > > > 		rc = sclp_rw_init();
> > > > > > > 	[...]
> > > > > > > 		register_console(&sclp_console);
> > > > > > > 		return 0;
> > > > > > > 	}
> > > > > > > 
> > > > > > > sclp_rw_init() is called before register_console(). And
> > > > > > > console_unlock() will never call sclp_console_write() before
> > > > > > > the console is registered.
> > > > > > > 
> > > > > > > AFAIK, lockdep only compares existing chain of locks. It does
> > > > > > > not know about console registration that would make some
> > > > > > > code paths mutually exclusive.
> > > > > > > 
> > > > > > > I believe that it is a false positive. I do not know how to
> > > > > > > avoid this lockdep report. I hope that it will disappear
> > > > > > > by deferring all printk() calls rather soon.
> > > > > > 
> > > > > > Thanks a lot for looking into this Petr. I have also checked the code
> > > > > > and I really fail to see why the allocation has to be done under the
> > > > > > lock in the first place. sclp_read_sccb and sclp_init_sccb are global
> > > > > > variables but I strongly suspect that they need a synchronization during
> > > > > > early init, callbacks are registered only later IIUC:
> > > > > 
> > > > > Good idea. It would work when the init function is called only once.
> > > > > But see below.
> > > > > 
> > > > > > diff --git a/drivers/s390/char/sclp.c b/drivers/s390/char/sclp.c
> > > > > > index d2ab3f07c008..4b1c033e3255 100644
> > > > > > --- a/drivers/s390/char/sclp.c
> > > > > > +++ b/drivers/s390/char/sclp.c
> > > > > > @@ -1169,13 +1169,13 @@ sclp_init(void)
> > > > > >  	unsigned long flags;
> > > > > >  	int rc = 0;
> > > > > >  
> > > > > > +	sclp_read_sccb = (void *) __get_free_page(GFP_ATOMIC | GFP_DMA);
> > > > > > +	sclp_init_sccb = (void *) __get_free_page(GFP_ATOMIC | GFP_DMA);
> > > > > >  	spin_lock_irqsave(&sclp_lock, flags);
> > > > > >  	/* Check for previous or running initialization */
> > > > > >  	if (sclp_init_state != sclp_init_state_uninitialized)
> > > > > >  		goto fail_unlock;
> > > > > 
> > > > > It seems that sclp_init() could be called several times in parallel.
> > > > > I see it called from sclp_register() and sclp_initcall().
> > > > 
> > > > Interesting. Something for s390 people to answer I guess.
> > > > Anyway, this should be quite trivial to workaround by a cmpxch or alike.
> > > > 
> > 
> > The above fix is simply insufficient,
> 
> Isn't this yet another init time lockdep false possitive?

Again, it is not yet certain that this is a false positive.

> 
> > 00: [    3.654337] -> #3 (console_owner){....}:                                 
> > 00: [    3.654343]        lock_acquire+0x21a/0x468                              
> > 00: [    3.654345]        console_unlock+0x3a6/0xa30                            
> > 00: [    3.654346]        vprintk_emit+0x184/0x3c8                              
> > 00: [    3.654348]        vprintk_default+0x44/0x50                             
> > 00: [    3.654349]        printk+0xa8/0xc0                                      
> > 00: [    3.654351]        get_random_u64+0x40/0x108                             
> > 00: [    3.654360]        add_to_free_area_random+0x188/0x1c0                   
> > 00: [    3.654364]        free_one_page+0x72/0x128                              
> > 00: [    3.654366]        __free_pages_ok+0x51c/0xca0                           
> > 00: [    3.654368]        memblock_free_all+0x30a/0x3b0                         
> > 00: [    3.654370]        mem_init+0x84/0x200                                   
> > 00: [    3.654371]        start_kernel+0x384/0x6a0                              
> > 00: [    3.654373]        startup_continue+0x70/0xd0                            
> 
> This one is actually a nice example why trying to get printk out of the
> zone->lock is simply not viable. This one is likely a printk to warn
> that the random pool is not fully intiailized. Just because the
> allocator tries to randomize the initial free memory pool. You are not
> going to remove that printk, right?

Well, Sergey had a patch to convert that one to printk_deferred(), but even with
his patch, it will still trigger the lockdep splat here because the lock
dependency between zone->lock --> console_owner is still there from memory
offline.

> 
> I fully agree that this class of lockdep splats are annoying especially
> when they make the lockdep unusable but please discuss this with lockdep
> maintainers and try to find some solution rather than go and try to
> workaround the problem all over the place. If there are places that
> would result in a cleaner code then go for it but please do not make the
> code worse just because of a non existent problem flagged by a false
> positive.

It makes me wonder what makes you think it is a false positive for sure.
Michal Hocko Oct. 8, 2019, 7:17 p.m. UTC | #40
On Tue 08-10-19 15:06:13, Qian Cai wrote:
> On Tue, 2019-10-08 at 20:35 +0200, Michal Hocko wrote:
[...]
> > I fully agree that this class of lockdep splats are annoying especially
> > when they make the lockdep unusable but please discuss this with lockdep
> > maintainers and try to find some solution rather than go and try to
> > workaround the problem all over the place. If there are places that
> > would result in a cleaner code then go for it but please do not make the
> > code worse just because of a non existent problem flagged by a false
> > positive.
> 
> It makes me wonder what make you think it is a false positive for sure.

Because this is early init code? Because if it were a real deadlock
then your system wouldn't boot to run the real userspace
(remember there is the zone->lock spinlock involved, and that means you
would hit a hard lockup after a few seconds - but I feel I am repeating
myself).
Qian Cai Oct. 8, 2019, 7:35 p.m. UTC | #41
On Tue, 2019-10-08 at 21:17 +0200, Michal Hocko wrote:
> On Tue 08-10-19 15:06:13, Qian Cai wrote:
> > On Tue, 2019-10-08 at 20:35 +0200, Michal Hocko wrote:
> 
> [...]
> > > I fully agree that this class of lockdep splats are annoying especially
> > > when they make the lockdep unusable but please discuss this with lockdep
> > > maintainers and try to find some solution rather than go and try to
> > > workaround the problem all over the place. If there are places that
> > > would result in a cleaner code then go for it but please do not make the
> > > code worse just because of a non existent problem flagged by a false
> > > positive.
> > 
> > It makes me wonder what make you think it is a false positive for sure.
> 
> Because this is an early init code? Because if it were a real deadlock

No, that alone does not prove it is a false positive. There are many places
that could generate that lock dependency, but lockdep will always save the
earliest one.

> then your system wouldn't boot to get run with the real userspace
> (remember there is zone->lock spinlock involved and that means that you
> would hit hard lock after few seconds - but I feel I am repeating
> myself).
Petr Mladek Oct. 9, 2019, 11:39 a.m. UTC | #42
On Tue 2019-10-08 15:06:13, Qian Cai wrote:
> On Tue, 2019-10-08 at 20:35 +0200, Michal Hocko wrote:
> > On Tue 08-10-19 12:08:37, Qian Cai wrote:
> > > On Tue, 2019-10-08 at 14:56 +0200, Christian Borntraeger wrote:
> > > > Adding Peter Oberparleiter.
> > > > Peter, can you have a look?
> > > > 
> > > > On 08.10.19 10:27, Michal Hocko wrote:
> > > > > On Tue 08-10-19 09:43:57, Petr Mladek wrote:
> > > > > > On Mon 2019-10-07 16:49:37, Michal Hocko wrote:
> > > > > > > [Cc s390 maintainers - the lockdep is http://lkml.kernel.org/r/1570228005-24979-1-git-send-email-cai@lca.pw
> > > > > > >  Petr has explained it is a false positive
> > > > > > >  http://lkml.kernel.org/r/20191007143002.l37bt2lzqtnqjqxu@pathway.suse.cz]
> > > > > > > On Mon 07-10-19 16:30:02, Petr Mladek wrote:
> > > > > > > [...]
> > > > > > > > I believe that it cannot really happen because:
> > > > > > > > 
> > > > > > > > 	static int __init
> > > > > > > > 	sclp_console_init(void)
> > > > > > > > 	{
> > > > > > > > 	[...]
> > > > > > > > 		rc = sclp_rw_init();
> > > > > > > > 	[...]
> > > > > > > > 		register_console(&sclp_console);
> > > > > > > > 		return 0;
> > > > > > > > 	}
> > > > > > > > 
> > > > > > > > sclp_rw_init() is called before register_console(). And
> > > > > > > > console_unlock() will never call sclp_console_write() before
> > > > > > > > the console is registered.
> > > > > > > > 
> > > > > > > > AFAIK, lockdep only compares existing chain of locks. It does
> > > > > > > > not know about console registration that would make some
> > > > > > > > code paths mutually exclusive.
> > > > > > > > 
> > > > > > > > I believe that it is a false positive. I do not know how to
> > > > > > > > avoid this lockdep report. I hope that it will disappear
> > > > > > > > by deferring all printk() calls rather soon.
> > > > > > > 
> > > > > > > Thanks a lot for looking into this Petr. I have also checked the code
> > > > > > > and I really fail to see why the allocation has to be done under the
> > > > > > > lock in the first place. sclp_read_sccb and sclp_init_sccb are global
> > > > > > > variables but I strongly suspect that they need a synchronization during
> > > > > > > early init, callbacks are registered only later IIUC:
> > > > > > 
> > > > > > Good idea. It would work when the init function is called only once.
> > > > > > But see below.
> > > > > > 
> > > > > > > diff --git a/drivers/s390/char/sclp.c b/drivers/s390/char/sclp.c
> > > > > > > index d2ab3f07c008..4b1c033e3255 100644
> > > > > > > --- a/drivers/s390/char/sclp.c
> > > > > > > +++ b/drivers/s390/char/sclp.c
> > > > > > > @@ -1169,13 +1169,13 @@ sclp_init(void)
> > > > > > >  	unsigned long flags;
> > > > > > >  	int rc = 0;
> > > > > > >  
> > > > > > > +	sclp_read_sccb = (void *) __get_free_page(GFP_ATOMIC | GFP_DMA);
> > > > > > > +	sclp_init_sccb = (void *) __get_free_page(GFP_ATOMIC | GFP_DMA);
> > > > > > >  	spin_lock_irqsave(&sclp_lock, flags);
> > > > > > >  	/* Check for previous or running initialization */
> > > > > > >  	if (sclp_init_state != sclp_init_state_uninitialized)
> > > > > > >  		goto fail_unlock;
> > > > > > 
> > > > > > It seems that sclp_init() could be called several times in parallel.
> > > > > > I see it called from sclp_register() and sclp_initcall().
> > > > > 
> > > > > Interesting. Something for s390 people to answer I guess.
> > > > > Anyway, this should be quite trivial to workaround by a cmpxch or alike.
> > > > > 
> > > 
> > > The above fix is simply insufficient,
> > 
> > Isn't this yet another init time lockdep false possitive?
> 
> Again, this is not 100% false positive for sure yet.
> 
> > 
> > > 00: [    3.654337] -> #3 (console_owner){....}:                                 
> > > 00: [    3.654343]        lock_acquire+0x21a/0x468                              
> > > 00: [    3.654345]        console_unlock+0x3a6/0xa30                            
> > > 00: [    3.654346]        vprintk_emit+0x184/0x3c8                              
> > > 00: [    3.654348]        vprintk_default+0x44/0x50                             
> > > 00: [    3.654349]        printk+0xa8/0xc0                                      
> > > 00: [    3.654351]        get_random_u64+0x40/0x108                             
> > > 00: [    3.654360]        add_to_free_area_random+0x188/0x1c0                   
> > > 00: [    3.654364]        free_one_page+0x72/0x128                              
> > > 00: [    3.654366]        __free_pages_ok+0x51c/0xca0                           
> > > 00: [    3.654368]        memblock_free_all+0x30a/0x3b0                         
> > > 00: [    3.654370]        mem_init+0x84/0x200                                   
> > > 00: [    3.654371]        start_kernel+0x384/0x6a0                              
> > > 00: [    3.654373]        startup_continue+0x70/0xd0                            
> > 
> > This one is actually a nice example why trying to get printk out of the
> > zone->lock is simply not viable. This one is likely a printk to warn
> > that the random pool is not fully intiailized. Just because the
> > allocator tries to randomize the initial free memory pool. You are not
> > going to remove that printk, right?
> 
> Well, Sergey had a patch to convert that one to printk_deferred(), but even with
> his patch, it will still trigger the lockdep splat here because the lock
> dependency between zone->lock --> console_owner is still there from memory
> offline.

Isn't this another printk() that might need to become printk_deferred()?

Best Regards,
Petr
Petr Mladek Oct. 9, 2019, 11:49 a.m. UTC | #43
On Tue 2019-10-08 15:35:24, Qian Cai wrote:
> On Tue, 2019-10-08 at 21:17 +0200, Michal Hocko wrote:
> > On Tue 08-10-19 15:06:13, Qian Cai wrote:
> > > On Tue, 2019-10-08 at 20:35 +0200, Michal Hocko wrote:
> > 
> > [...]
> > > > I fully agree that this class of lockdep splats are annoying especially
> > > > when they make the lockdep unusable but please discuss this with lockdep
> > > > maintainers and try to find some solution rather than go and try to
> > > > workaround the problem all over the place. If there are places that
> > > > would result in a cleaner code then go for it but please do not make the
> > > > code worse just because of a non existent problem flagged by a false
> > > > positive.
> > > 
> > > It makes me wonder what make you think it is a false positive for sure.
> > 
> > Because this is an early init code? Because if it were a real deadlock
> 
> No, that alone does not prove it is a false positive. There are many places
> could generate that lock dependency but lockdep will always save the earliest
> one.

You are still mixing the pasted lockdep report and other situations
that will not get reported because of the first one.

We believe that the pasted report is a false positive. And we
are against complicating the code just to avoid this false positive.

Are you sure that the workaround will not create real deadlocks
or races?

I see two realistic possibilities:

   + Make printk() always deferred. This will hopefully happen soon.

   + Improve lockdep so that false positives could get ignored
     without complicating the code.

Best Regards,
Petr
Qian Cai Oct. 9, 2019, 1:06 p.m. UTC | #44
On Wed, 2019-10-09 at 13:49 +0200, Petr Mladek wrote:
> On Tue 2019-10-08 15:35:24, Qian Cai wrote:
> > On Tue, 2019-10-08 at 21:17 +0200, Michal Hocko wrote:
> > > On Tue 08-10-19 15:06:13, Qian Cai wrote:
> > > > On Tue, 2019-10-08 at 20:35 +0200, Michal Hocko wrote:
> > > 
> > > [...]
> > > > > I fully agree that this class of lockdep splats are annoying especially
> > > > > when they make the lockdep unusable but please discuss this with lockdep
> > > > > maintainers and try to find some solution rather than go and try to
> > > > > workaround the problem all over the place. If there are places that
> > > > > would result in a cleaner code then go for it but please do not make the
> > > > > code worse just because of a non existent problem flagged by a false
> > > > > positive.
> > > > 
> > > > It makes me wonder what make you think it is a false positive for sure.
> > > 
> > > Because this is an early init code? Because if it were a real deadlock
> > 
> > No, that alone does not prove it is a false positive. There are many places
> > could generate that lock dependency but lockdep will always save the earliest
> > one.
> 
> You are still mixing the pasted lockdep report and other situations
> that will not get reported because of the first one.

The lockdep report is designed to only just give a clue on the existing locking
dependency. Then, it is normal that developers need to audit all places of that
particular locking dependency to confirm whether it is truly a false positive.

> 
> We believe that the pasted report is pasted is false positive. And we
> are against complicating the code just to avoid this false positive.

This is totally incorrect as above and there is even another similar example of
the splat during memory offline I mentioned earlier,

https://lore.kernel.org/linux-mm/1570460350.5576.290.camel@lca.pw/

[  297.425964] -> #1 (&port_lock_key){-.-.}:
[  297.425967]        __lock_acquire+0x5b3/0xb40
[  297.425967]        lock_acquire+0x126/0x280
[  297.425968]        _raw_spin_lock_irqsave+0x3a/0x50
[  297.425969]        serial8250_console_write+0x3e4/0x450
[  297.425970]        univ8250_console_write+0x4b/0x60
[  297.425970]        console_unlock+0x501/0x750
[  297.425971]        vprintk_emit+0x10d/0x340
[  297.425972]        vprintk_default+0x1f/0x30
[  297.425972]        vprintk_func+0x44/0xd4
[  297.425973]        printk+0x9f/0xc5
[  297.425974]        register_console+0x39c/0x520
[  297.425975]        univ8250_console_init+0x23/0x2d
[  297.425975]        console_init+0x338/0x4cd
[  297.425976]        start_kernel+0x534/0x724
[  297.425977]        x86_64_start_reservations+0x24/0x26
[  297.425977]        x86_64_start_kernel+0xf4/0xfb
[  297.425978]        secondary_startup_64+0xb6/0xc0

where the report again show the early boot call trace for the locking
dependency,

console_owner --> port_lock_key

but that dependency clearly not only happen in the early boot.

I agree with you that it is hard to hit because it needs 4 CPUs to hit the exact
the same spot.

> 
> Are you sure that the workaround will not create real deadlocks
> or races?
> 
> I see two realistic possibilities:
> 
>    + Make printk() always deferred. This will hopefully happen soon.
> 
>    + Improve lockdep so that false positives could get ignored
>      without complicating the code.

There are certainly rooms to improve the lockdep but it does not help in this
case as mentioned above.
Michal Hocko Oct. 9, 2019, 1:27 p.m. UTC | #45
On Wed 09-10-19 09:06:42, Qian Cai wrote:
[...]
> https://lore.kernel.org/linux-mm/1570460350.5576.290.camel@lca.pw/
> 
> [  297.425964] -> #1 (&port_lock_key){-.-.}:
> [  297.425967]        __lock_acquire+0x5b3/0xb40
> [  297.425967]        lock_acquire+0x126/0x280
> [  297.425968]        _raw_spin_lock_irqsave+0x3a/0x50
> [  297.425969]        serial8250_console_write+0x3e4/0x450
> [  297.425970]        univ8250_console_write+0x4b/0x60
> [  297.425970]        console_unlock+0x501/0x750
> [  297.425971]        vprintk_emit+0x10d/0x340
> [  297.425972]        vprintk_default+0x1f/0x30
> [  297.425972]        vprintk_func+0x44/0xd4
> [  297.425973]        printk+0x9f/0xc5
> [  297.425974]        register_console+0x39c/0x520
> [  297.425975]        univ8250_console_init+0x23/0x2d
> [  297.425975]        console_init+0x338/0x4cd
> [  297.425976]        start_kernel+0x534/0x724
> [  297.425977]        x86_64_start_reservations+0x24/0x26
> [  297.425977]        x86_64_start_kernel+0xf4/0xfb
> [  297.425978]        secondary_startup_64+0xb6/0xc0
> 
> where the report again show the early boot call trace for the locking
> dependency,
> 
> console_owner --> port_lock_key
> 
> but that dependency clearly not only happen in the early boot.

Can you provide an example of the runtime dependency without any early
boot artifacts? Because this discussion really doens't make much sense
without a clear example of a _real_ lockdep report that is not a false
possitive. All of them so far have been concluded to be false possitive
AFAIU.
Qian Cai Oct. 9, 2019, 1:43 p.m. UTC | #46
On Wed, 2019-10-09 at 15:27 +0200, Michal Hocko wrote:
> On Wed 09-10-19 09:06:42, Qian Cai wrote:
> [...]
> > https://lore.kernel.org/linux-mm/1570460350.5576.290.camel@lca.pw/
> > 
> > [  297.425964] -> #1 (&port_lock_key){-.-.}:
> > [  297.425967]        __lock_acquire+0x5b3/0xb40
> > [  297.425967]        lock_acquire+0x126/0x280
> > [  297.425968]        _raw_spin_lock_irqsave+0x3a/0x50
> > [  297.425969]        serial8250_console_write+0x3e4/0x450
> > [  297.425970]        univ8250_console_write+0x4b/0x60
> > [  297.425970]        console_unlock+0x501/0x750
> > [  297.425971]        vprintk_emit+0x10d/0x340
> > [  297.425972]        vprintk_default+0x1f/0x30
> > [  297.425972]        vprintk_func+0x44/0xd4
> > [  297.425973]        printk+0x9f/0xc5
> > [  297.425974]        register_console+0x39c/0x520
> > [  297.425975]        univ8250_console_init+0x23/0x2d
> > [  297.425975]        console_init+0x338/0x4cd
> > [  297.425976]        start_kernel+0x534/0x724
> > [  297.425977]        x86_64_start_reservations+0x24/0x26
> > [  297.425977]        x86_64_start_kernel+0xf4/0xfb
> > [  297.425978]        secondary_startup_64+0xb6/0xc0
> > 
> > where the report again show the early boot call trace for the locking
> > dependency,
> > 
> > console_owner --> port_lock_key
> > 
> > but that dependency clearly not only happen in the early boot.
> 
> Can you provide an example of the runtime dependency without any early
> boot artifacts? Because this discussion really doens't make much sense
> without a clear example of a _real_ lockdep report that is not a false
> possitive. All of them so far have been concluded to be false possitive
> AFAIU.

An obvious one is in the above link. Just replace the trace in #1 above with
printk() from anywhere, i.e., just ignore the early boot calls there as they are
 not important.

printk()
  console_unlock()
    console_lock_spinning_enable() --> console_owner_lock
  call_console_drivers()
    serial8250_console_write() --> port->lock
Michal Hocko Oct. 9, 2019, 1:51 p.m. UTC | #47
On Wed 09-10-19 09:43:13, Qian Cai wrote:
> On Wed, 2019-10-09 at 15:27 +0200, Michal Hocko wrote:
> > On Wed 09-10-19 09:06:42, Qian Cai wrote:
> > [...]
> > > https://lore.kernel.org/linux-mm/1570460350.5576.290.camel@lca.pw/
> > > 
> > > [  297.425964] -> #1 (&port_lock_key){-.-.}:
> > > [  297.425967]        __lock_acquire+0x5b3/0xb40
> > > [  297.425967]        lock_acquire+0x126/0x280
> > > [  297.425968]        _raw_spin_lock_irqsave+0x3a/0x50
> > > [  297.425969]        serial8250_console_write+0x3e4/0x450
> > > [  297.425970]        univ8250_console_write+0x4b/0x60
> > > [  297.425970]        console_unlock+0x501/0x750
> > > [  297.425971]        vprintk_emit+0x10d/0x340
> > > [  297.425972]        vprintk_default+0x1f/0x30
> > > [  297.425972]        vprintk_func+0x44/0xd4
> > > [  297.425973]        printk+0x9f/0xc5
> > > [  297.425974]        register_console+0x39c/0x520
> > > [  297.425975]        univ8250_console_init+0x23/0x2d
> > > [  297.425975]        console_init+0x338/0x4cd
> > > [  297.425976]        start_kernel+0x534/0x724
> > > [  297.425977]        x86_64_start_reservations+0x24/0x26
> > > [  297.425977]        x86_64_start_kernel+0xf4/0xfb
> > > [  297.425978]        secondary_startup_64+0xb6/0xc0
> > > 
> > > where the report again show the early boot call trace for the locking
> > > dependency,
> > > 
> > > console_owner --> port_lock_key
> > > 
> > > but that dependency clearly not only happen in the early boot.
> > 
> > Can you provide an example of the runtime dependency without any early
> > boot artifacts? Because this discussion really doens't make much sense
> > without a clear example of a _real_ lockdep report that is not a false
> > possitive. All of them so far have been concluded to be false possitive
> > AFAIU.
> 
> An obvious one is in the above link. Just replace the trace in #1 above with
> printk() from anywhere, i.e., just ignore the early boot calls there as they are
>  not important.
> 
> printk()
>   console_unlock()
>     console_lock_spinning_enable() --> console_owner_lock
>   call_console_drivers()
>     serial8250_console_write() --> port->lock

Can you paste the full lock chain graph to be sure we are on the same
page?
Peter Oberparleiter Oct. 9, 2019, 1:56 p.m. UTC | #48
On 08.10.2019 18:08, Qian Cai wrote:
> On Tue, 2019-10-08 at 14:56 +0200, Christian Borntraeger wrote:
>> Adding Peter Oberparleiter.
>> Peter, can you have a look?
>>
>> On 08.10.19 10:27, Michal Hocko wrote:
>>> On Tue 08-10-19 09:43:57, Petr Mladek wrote:
>>>> On Mon 2019-10-07 16:49:37, Michal Hocko wrote:
>>>>> [Cc s390 maintainers - the lockdep is http://lkml.kernel.org/r/1570228005-24979-1-git-send-email-cai@lca.pw
>>>>>  Petr has explained it is a false positive
>>>>>  http://lkml.kernel.org/r/20191007143002.l37bt2lzqtnqjqxu@pathway.suse.cz]
>>>>> On Mon 07-10-19 16:30:02, Petr Mladek wrote:
>>>>> [...]
>>>>>> I believe that it cannot really happen because:
>>>>>>
>>>>>> 	static int __init
>>>>>> 	sclp_console_init(void)
>>>>>> 	{
>>>>>> 	[...]
>>>>>> 		rc = sclp_rw_init();
>>>>>> 	[...]
>>>>>> 		register_console(&sclp_console);
>>>>>> 		return 0;
>>>>>> 	}
>>>>>>
>>>>>> sclp_rw_init() is called before register_console(). And
>>>>>> console_unlock() will never call sclp_console_write() before
>>>>>> the console is registered.
>>>>>>
>>>>>> AFAIK, lockdep only compares existing chain of locks. It does
>>>>>> not know about console registration that would make some
>>>>>> code paths mutually exclusive.
>>>>>>
>>>>>> I believe that it is a false positive. I do not know how to
>>>>>> avoid this lockdep report. I hope that it will disappear
>>>>>> by deferring all printk() calls rather soon.
>>>>>
>>>>> Thanks a lot for looking into this Petr. I have also checked the code
>>>>> and I really fail to see why the allocation has to be done under the
>>>>> lock in the first place. sclp_read_sccb and sclp_init_sccb are global
>>>>> variables but I strongly suspect that they need a synchronization during
>>>>> early init, callbacks are registered only later IIUC:
>>>>
>>>> Good idea. It would work when the init function is called only once.
>>>> But see below.
>>>>
>>>>> diff --git a/drivers/s390/char/sclp.c b/drivers/s390/char/sclp.c
>>>>> index d2ab3f07c008..4b1c033e3255 100644
>>>>> --- a/drivers/s390/char/sclp.c
>>>>> +++ b/drivers/s390/char/sclp.c
>>>>> @@ -1169,13 +1169,13 @@ sclp_init(void)
>>>>>  	unsigned long flags;
>>>>>  	int rc = 0;
>>>>>  
>>>>> +	sclp_read_sccb = (void *) __get_free_page(GFP_ATOMIC | GFP_DMA);
>>>>> +	sclp_init_sccb = (void *) __get_free_page(GFP_ATOMIC | GFP_DMA);
>>>>>  	spin_lock_irqsave(&sclp_lock, flags);
>>>>>  	/* Check for previous or running initialization */
>>>>>  	if (sclp_init_state != sclp_init_state_uninitialized)
>>>>>  		goto fail_unlock;
>>>>
>>>> It seems that sclp_init() could be called several times in parallel.
>>>> I see it called from sclp_register() and sclp_initcall().
>>>
>>> Interesting. Something for s390 people to answer I guess.
>>> Anyway, this should be quite trivial to workaround by a cmpxch or alike.
>>>
> 
> The above fix is simply insufficient,
> 
> 00: [    3.654307] WARNING: possible circular locking dependency detected       
> 00: [    3.654309] 5.4.0-rc1-next-20191004+ #4 Not tainted                      
> 00: [    3.654311] ------------------------------------------------------       
> 00: [    3.654313] swapper/0/1 is trying to acquire lock:                       
> 00: [    3.654314] 00000000553f3fb8 (sclp_lock){-.-.}, at: sclp_add_request+0x34
> 00: /0x308                                                                      
> 00: [    3.654320]                                                              
> 00: [    3.654322] but task is already holding lock:                            
> 00: [    3.654323] 00000000550c9fc0 (console_owner){....}, at: console_unlock+0x
> 00: 328/0xa30                                                                   
> 00: [    3.654329]                                                              
> 00: [    3.654331] which lock already depends on the new lock.                  

I can confirm that both this lockdep warning as well as the original one
are both false positives: lockdep warns that code in sclp_init could
trigger a deadlock via the chain

   sclp_lock --> &(&zone->lock)->rlock --> console_owner

but

  a) before sclp_init successfully completes, register_console is not
     called, so there is no connection between console_owner -> sclp_lock
  b) after sclp_init completed, it won't be called again, so any
     dependency that requires a call-chain including sclp_init is
     impossible to achieve

Apparently lockdep cannot determine that sclp_init won't be called again.
I'm attaching a patch that moves sclp_init to __init so that lockdep has
a chance of knowing that the function will be gone after init.

This patch is intended for testing only though, to see if there are other
paths to similar possible deadlocks. I still need to decide if its worth
changing the code to remove false positives in lockdep.

A generic solution would be preferable from my point of view though,
because otherwise each console driver owner would need to ensure that any
lock taken in their console.write implementation is never held while
memory is allocated/released.

diff --git a/drivers/s390/char/sclp.c b/drivers/s390/char/sclp.c
index d2ab3f07c008..13219e43d488 100644
--- a/drivers/s390/char/sclp.c
+++ b/drivers/s390/char/sclp.c
@@ -140,7 +140,6 @@ static void sclp_request_timeout(bool force_restart);
 static void sclp_process_queue(void);
 static void __sclp_make_read_req(void);
 static int sclp_init_mask(int calculate);
-static int sclp_init(void);

 static void
 __sclp_queue_read_req(void)
@@ -670,7 +669,8 @@ __sclp_get_mask(sccb_mask_t *receive_mask, sccb_mask_t *send_mask)
 	}
 }

-/* Register event listener. Return 0 on success, non-zero otherwise. */
+/* Register event listener. Return 0 on success, non-zero otherwise. Early
+ * callers (<= arch_initcall) must call sclp_init() first. */
 int
 sclp_register(struct sclp_register *reg)
 {
@@ -679,9 +679,8 @@ sclp_register(struct sclp_register *reg)
 	sccb_mask_t send_mask;
 	int rc;

-	rc = sclp_init();
-	if (rc)
-		return rc;
+	if (sclp_init_state != sclp_init_state_initialized)
+		return -EINVAL;
 	spin_lock_irqsave(&sclp_lock, flags);
 	/* Check event mask for collisions */
 	__sclp_get_mask(&receive_mask, &send_mask);
@@ -1163,8 +1162,7 @@ static struct platform_device *sclp_pdev;

 /* Initialize SCLP driver. Return zero if driver is operational, non-zero
  * otherwise. */
-static int
-sclp_init(void)
+int __init sclp_init(void)
 {
 	unsigned long flags;
 	int rc = 0;
diff --git a/drivers/s390/char/sclp.h b/drivers/s390/char/sclp.h
index 196333013e54..463660261379 100644
--- a/drivers/s390/char/sclp.h
+++ b/drivers/s390/char/sclp.h
@@ -296,6 +296,7 @@ struct sclp_register {
 };

 /* externals from sclp.c */
+int __init sclp_init(void);
 int sclp_add_request(struct sclp_req *req);
 void sclp_sync_wait(void);
 int sclp_register(struct sclp_register *reg);
diff --git a/drivers/s390/char/sclp_con.c b/drivers/s390/char/sclp_con.c
index 8966a1c1b548..a08ef2c8379e 100644
--- a/drivers/s390/char/sclp_con.c
+++ b/drivers/s390/char/sclp_con.c
@@ -319,6 +319,9 @@ sclp_console_init(void)
 	/* SCLP consoles are handled together */
 	if (!(CONSOLE_IS_SCLP || CONSOLE_IS_VT220))
 		return 0;
+	rc = sclp_init();
+	if (rc)
+		return rc;
 	rc = sclp_rw_init();
 	if (rc)
 		return rc;
diff --git a/drivers/s390/char/sclp_vt220.c b/drivers/s390/char/sclp_vt220.c
index 3f9a6ef650fa..28b23e22248b 100644
--- a/drivers/s390/char/sclp_vt220.c
+++ b/drivers/s390/char/sclp_vt220.c
@@ -694,6 +694,11 @@ static int __init __sclp_vt220_init(int num_pages)
 	sclp_vt220_init_count++;
 	if (sclp_vt220_init_count != 1)
 		return 0;
+	rc = sclp_init();
+	if (rc) {
+		sclp_vt220_init_count--;
+		return rc;
+	}
 	spin_lock_init(&sclp_vt220_lock);
 	INIT_LIST_HEAD(&sclp_vt220_empty);
 	INIT_LIST_HEAD(&sclp_vt220_outqueue);
Qian Cai Oct. 9, 2019, 2:19 p.m. UTC | #49
On Wed, 2019-10-09 at 15:51 +0200, Michal Hocko wrote:
> On Wed 09-10-19 09:43:13, Qian Cai wrote:
> > On Wed, 2019-10-09 at 15:27 +0200, Michal Hocko wrote:
> > > On Wed 09-10-19 09:06:42, Qian Cai wrote:
> > > [...]
> > > > https://lore.kernel.org/linux-mm/1570460350.5576.290.camel@lca.pw/
> > > > 
> > > > [  297.425964] -> #1 (&port_lock_key){-.-.}:
> > > > [  297.425967]        __lock_acquire+0x5b3/0xb40
> > > > [  297.425967]        lock_acquire+0x126/0x280
> > > > [  297.425968]        _raw_spin_lock_irqsave+0x3a/0x50
> > > > [  297.425969]        serial8250_console_write+0x3e4/0x450
> > > > [  297.425970]        univ8250_console_write+0x4b/0x60
> > > > [  297.425970]        console_unlock+0x501/0x750
> > > > [  297.425971]        vprintk_emit+0x10d/0x340
> > > > [  297.425972]        vprintk_default+0x1f/0x30
> > > > [  297.425972]        vprintk_func+0x44/0xd4
> > > > [  297.425973]        printk+0x9f/0xc5
> > > > [  297.425974]        register_console+0x39c/0x520
> > > > [  297.425975]        univ8250_console_init+0x23/0x2d
> > > > [  297.425975]        console_init+0x338/0x4cd
> > > > [  297.425976]        start_kernel+0x534/0x724
> > > > [  297.425977]        x86_64_start_reservations+0x24/0x26
> > > > [  297.425977]        x86_64_start_kernel+0xf4/0xfb
> > > > [  297.425978]        secondary_startup_64+0xb6/0xc0
> > > > 
> > > > where the report again show the early boot call trace for the locking
> > > > dependency,
> > > > 
> > > > console_owner --> port_lock_key
> > > > 
> > > > but that dependency clearly not only happen in the early boot.
> > > 
> > > Can you provide an example of the runtime dependency without any early
> > > boot artifacts? Because this discussion really doens't make much sense
> > > without a clear example of a _real_ lockdep report that is not a false
> > > possitive. All of them so far have been concluded to be false possitive
> > > AFAIU.
> > 
> > An obvious one is in the above link. Just replace the trace in #1 above with
> > printk() from anywhere, i.e., just ignore the early boot calls there as they are
> >  not important.
> > 
> > printk()
> >   console_unlock()
> >     console_lock_spinning_enable() --> console_owner_lock
> >   call_console_drivers()
> >     serial8250_console_write() --> port->lock
> 
> Can you paste the full lock chain graph to be sure we are on the same
> page?

WARNING: possible circular locking dependency detected
5.3.0-next-20190917 #8 Not tainted
------------------------------------------------------
test.sh/8653 is trying to acquire lock:
ffffffff865a4460 (console_owner){-.-.}, at:
console_unlock+0x207/0x750

but task is already holding lock:
ffff88883fff3c58 (&(&zone->lock)->rlock){-.-.}, at:
__offline_isolated_pages+0x179/0x3e0

which lock already depends on the new lock.


the existing dependency chain (in reverse order) is:

-> #3 (&(&zone->lock)->rlock){-.-.}:
       __lock_acquire+0x5b3/0xb40
       lock_acquire+0x126/0x280
       _raw_spin_lock+0x2f/0x40
       rmqueue_bulk.constprop.21+0xb6/0x1160
       get_page_from_freelist+0x898/0x22c0
       __alloc_pages_nodemask+0x2f3/0x1cd0
       alloc_pages_current+0x9c/0x110
       allocate_slab+0x4c6/0x19c0
       new_slab+0x46/0x70
       ___slab_alloc+0x58b/0x960
       __slab_alloc+0x43/0x70
       __kmalloc+0x3ad/0x4b0
       __tty_buffer_request_room+0x100/0x250
       tty_insert_flip_string_fixed_flag+0x67/0x110
       pty_write+0xa2/0xf0
       n_tty_write+0x36b/0x7b0
       tty_write+0x284/0x4c0
       __vfs_write+0x50/0xa0
       vfs_write+0x105/0x290
       redirected_tty_write+0x6a/0xc0
       do_iter_write+0x248/0x2a0
       vfs_writev+0x106/0x1e0
       do_writev+0xd4/0x180
       __x64_sys_writev+0x45/0x50
       do_syscall_64+0xcc/0x76c
       entry_SYSCALL_64_after_hwframe+0x49/0xbe

-> #2 (&(&port->lock)->rlock){-.-.}:
       __lock_acquire+0x5b3/0xb40
       lock_acquire+0x126/0x280
       _raw_spin_lock_irqsave+0x3a/0x50
       tty_port_tty_get+0x20/0x60
       tty_port_default_wakeup+0xf/0x30
       tty_port_tty_wakeup+0x39/0x40
       uart_write_wakeup+0x2a/0x40
       serial8250_tx_chars+0x22e/0x440
       serial8250_handle_irq.part.8+0x14a/0x170
       serial8250_default_handle_irq+0x5c/0x90
       serial8250_interrupt+0xa6/0x130
       __handle_irq_event_percpu+0x78/0x4f0
       handle_irq_event_percpu+0x70/0x100
       handle_irq_event+0x5a/0x8b
       handle_edge_irq+0x117/0x370
       do_IRQ+0x9e/0x1e0
       ret_from_intr+0x0/0x2a
       cpuidle_enter_state+0x156/0x8e0
       cpuidle_enter+0x41/0x70
       call_cpuidle+0x5e/0x90
       do_idle+0x333/0x370
       cpu_startup_entry+0x1d/0x1f
       start_secondary+0x290/0x330
       secondary_startup_64+0xb6/0xc0

-> #1 (&port_lock_key){-.-.}:
       __lock_acquire+0x5b3/0xb40
       lock_acquire+0x126/0x280
       _raw_spin_lock_irqsave+0x3a/0x50
       serial8250_console_write+0x3e4/0x450
       univ8250_console_write+0x4b/0x60
       console_unlock+0x501/0x750
       vprintk_emit+0x10d/0x340
       vprintk_default+0x1f/0x30
       vprintk_func+0x44/0xd4
       printk+0x9f/0xc5

-> #0 (console_owner){-.-.}:
       check_prev_add+0x107/0xea0
       validate_chain+0x8fc/0x1200
       __lock_acquire+0x5b3/0xb40
       lock_acquire+0x126/0x280
       console_unlock+0x269/0x750
       vprintk_emit+0x10d/0x340
       vprintk_default+0x1f/0x30
       vprintk_func+0x44/0xd4
       printk+0x9f/0xc5
       __offline_isolated_pages.cold.52+0x2f/0x30a
       offline_isolated_pages_cb+0x17/0x30
       walk_system_ram_range+0xda/0x160
       __offline_pages+0x79c/0xa10
       offline_pages+0x11/0x20
       memory_subsys_offline+0x7e/0xc0
       device_offline+0xd5/0x110
       state_store+0xc6/0xe0
       dev_attr_store+0x3f/0x60
       sysfs_kf_write+0x89/0xb0
       kernfs_fop_write+0x188/0x240
       __vfs_write+0x50/0xa0
       vfs_write+0x105/0x290
       ksys_write+0xc6/0x160
       __x64_sys_write+0x43/0x50
       do_syscall_64+0xcc/0x76c
       entry_SYSCALL_64_after_hwframe+0x49/0xbe

other info that might help us debug this:

Chain exists of:
  console_owner --> &(&port->lock)->rlock --> &(&zone->lock)-
>rlock

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(&(&zone->lock)->rlock);
                               lock(&(&port->lock)->rlock);
                               lock(&(&zone->lock)->rlock);
  lock(console_owner);

 *** DEADLOCK ***

9 locks held by test.sh/8653:
 #0: ffff88839ba7d408 (sb_writers#4){.+.+}, at:
vfs_write+0x25f/0x290
 #1: ffff888277618880 (&of->mutex){+.+.}, at:
kernfs_fop_write+0x128/0x240
 #2: ffff8898131fc218 (kn->count#115){.+.+}, at:
kernfs_fop_write+0x138/0x240
 #3: ffffffff86962a80 (device_hotplug_lock){+.+.}, at:
lock_device_hotplug_sysfs+0x16/0x50
 #4: ffff8884374f4990 (&dev->mutex){....}, at:
device_offline+0x70/0x110
 #5: ffffffff86515250 (cpu_hotplug_lock.rw_sem){++++}, at:
__offline_pages+0xbf/0xa10
 #6: ffffffff867405f0 (mem_hotplug_lock.rw_sem){++++}, at:
percpu_down_write+0x87/0x2f0
 #7: ffff88883fff3c58 (&(&zone->lock)->rlock){-.-.}, at:
__offline_isolated_pages+0x179/0x3e0
 #8: ffffffff865a4920 (console_lock){+.+.}, at:
vprintk_emit+0x100/0x340

stack backtrace:
CPU: 1 PID: 8653 Comm: test.sh Not tainted 5.3.0-next-20190917 #8
Hardware name: HPE ProLiant DL560 Gen10/ProLiant DL560 Gen10,
BIOS U34 05/21/2019
Call Trace:
 dump_stack+0x86/0xca
 print_circular_bug.cold.31+0x243/0x26e
 check_noncircular+0x29e/0x2e0
 check_prev_add+0x107/0xea0
 validate_chain+0x8fc/0x1200
 __lock_acquire+0x5b3/0xb40
 lock_acquire+0x126/0x280
 console_unlock+0x269/0x750
 vprintk_emit+0x10d/0x340
 vprintk_default+0x1f/0x30
 vprintk_func+0x44/0xd4
 printk+0x9f/0xc5
 __offline_isolated_pages.cold.52+0x2f/0x30a
 offline_isolated_pages_cb+0x17/0x30
 walk_system_ram_range+0xda/0x160
 __offline_pages+0x79c/0xa10
 offline_pages+0x11/0x20
 memory_subsys_offline+0x7e/0xc0
 device_offline+0xd5/0x110
 state_store+0xc6/0xe0
 dev_attr_store+0x3f/0x60
 sysfs_kf_write+0x89/0xb0
 kernfs_fop_write+0x188/0x240
 __vfs_write+0x50/0xa0
 vfs_write+0x105/0x290
 ksys_write+0xc6/0x160
 __x64_sys_write+0x43/0x50
 do_syscall_64+0xcc/0x76c
 entry_SYSCALL_64_after_hwframe+0x49/0xbe
Petr Mladek Oct. 9, 2019, 2:24 p.m. UTC | #50
On Wed 2019-10-09 09:43:13, Qian Cai wrote:
> On Wed, 2019-10-09 at 15:27 +0200, Michal Hocko wrote:
> > On Wed 09-10-19 09:06:42, Qian Cai wrote:
> > [...]
> > > https://lore.kernel.org/linux-mm/1570460350.5576.290.camel@lca.pw/
> > > 
> > > [  297.425964] -> #1 (&port_lock_key){-.-.}:
> > > [  297.425967]        __lock_acquire+0x5b3/0xb40
> > > [  297.425967]        lock_acquire+0x126/0x280
> > > [  297.425968]        _raw_spin_lock_irqsave+0x3a/0x50
> > > [  297.425969]        serial8250_console_write+0x3e4/0x450
> > > [  297.425970]        univ8250_console_write+0x4b/0x60
> > > [  297.425970]        console_unlock+0x501/0x750
> > > [  297.425971]        vprintk_emit+0x10d/0x340
> > > [  297.425972]        vprintk_default+0x1f/0x30
> > > [  297.425972]        vprintk_func+0x44/0xd4
> > > [  297.425973]        printk+0x9f/0xc5
> > > [  297.425974]        register_console+0x39c/0x520
> > > [  297.425975]        univ8250_console_init+0x23/0x2d
> > > [  297.425975]        console_init+0x338/0x4cd
> > > [  297.425976]        start_kernel+0x534/0x724
> > > [  297.425977]        x86_64_start_reservations+0x24/0x26
> > > [  297.425977]        x86_64_start_kernel+0xf4/0xfb
> > > [  297.425978]        secondary_startup_64+0xb6/0xc0
> > > 
> > > where the report again show the early boot call trace for the locking
> > > dependency,
> > > 
> > > console_owner --> port_lock_key
> > > 
> > > but that dependency clearly not only happen in the early boot.
> > 
> > Can you provide an example of the runtime dependency without any early
> > boot artifacts? Because this discussion really doesn't make much sense
> > without a clear example of a _real_ lockdep report that is not a false
> > positive. All of them so far have been concluded to be false positives
> > AFAIU.
> 
> An obvious one is in the above link. Just replace the trace in #1 above with
> printk() from anywhere, i.e., just ignore the early boot calls there as they are
>  not important.
> 
> printk()
>   console_unlock()
>     console_lock_spinning_enable() --> console_owner_lock
>   call_console_drivers()
>     serial8250_console_write() --> port->lock

Please find the location where this really happens and then suggest
how the real deadlock could get fixed. So far, we have seen only
false positives and theoretical scenarios.

Best Regards,
Petr
Michal Hocko Oct. 9, 2019, 2:26 p.m. UTC | #51
On Wed 09-10-19 15:56:32, Peter Oberparleiter wrote:
[...]
> A generic solution would be preferable from my point of view though,
> because otherwise each console driver owner would need to ensure that any
> lock taken in their console.write implementation is never held while
> memory is allocated/released.

Considering that console.write is called from essentially arbitrary code
path IIUC then all the locks used in this path should be pretty much
tail locks or console internal ones without external dependencies.
Otherwise we are in a whack-a-mole situation chasing very complex lock
chains.
Michal Hocko Oct. 9, 2019, 2:34 p.m. UTC | #52
On Wed 09-10-19 10:19:44, Qian Cai wrote:
> On Wed, 2019-10-09 at 15:51 +0200, Michal Hocko wrote:
[...]
> > Can you paste the full lock chain graph to be sure we are on the same
> > page?
> 
> WARNING: possible circular locking dependency detected
> 5.3.0-next-20190917 #8 Not tainted
> ------------------------------------------------------
> test.sh/8653 is trying to acquire lock:
> ffffffff865a4460 (console_owner){-.-.}, at:
> console_unlock+0x207/0x750
> 
> but task is already holding lock:
> ffff88883fff3c58 (&(&zone->lock)->rlock){-.-.}, at:
> __offline_isolated_pages+0x179/0x3e0
> 
> which lock already depends on the new lock.
> 
> 
> the existing dependency chain (in reverse order) is:
> 
> -> #3 (&(&zone->lock)->rlock){-.-.}:
>        __lock_acquire+0x5b3/0xb40
>        lock_acquire+0x126/0x280
>        _raw_spin_lock+0x2f/0x40
>        rmqueue_bulk.constprop.21+0xb6/0x1160
>        get_page_from_freelist+0x898/0x22c0
>        __alloc_pages_nodemask+0x2f3/0x1cd0
>        alloc_pages_current+0x9c/0x110
>        allocate_slab+0x4c6/0x19c0
>        new_slab+0x46/0x70
>        ___slab_alloc+0x58b/0x960
>        __slab_alloc+0x43/0x70
>        __kmalloc+0x3ad/0x4b0
>        __tty_buffer_request_room+0x100/0x250
>        tty_insert_flip_string_fixed_flag+0x67/0x110
>        pty_write+0xa2/0xf0
>        n_tty_write+0x36b/0x7b0
>        tty_write+0x284/0x4c0
>        __vfs_write+0x50/0xa0
>        vfs_write+0x105/0x290
>        redirected_tty_write+0x6a/0xc0
>        do_iter_write+0x248/0x2a0
>        vfs_writev+0x106/0x1e0
>        do_writev+0xd4/0x180
>        __x64_sys_writev+0x45/0x50
>        do_syscall_64+0xcc/0x76c
>        entry_SYSCALL_64_after_hwframe+0x49/0xbe

This one looks indeed legit. pty_write is allocating memory from inside
the port->lock. But this seems to be quite broken, right? The forward
progress depends on GFP_ATOMIC allocation which might fail easily under
memory pressure. So the preferred way to fix this should be to change
the allocation scheme to use the preallocated buffer and size it from a
context when it doesn't hold internal locks. It might be a more complex
fix than using printk_deferred or other games but addressing that would
make the pty code more robust as well.
Qian Cai Oct. 9, 2019, 2:46 p.m. UTC | #53
On Wed, 2019-10-09 at 16:24 +0200, Petr Mladek wrote:
> On Wed 2019-10-09 09:43:13, Qian Cai wrote:
> > On Wed, 2019-10-09 at 15:27 +0200, Michal Hocko wrote:
> > > On Wed 09-10-19 09:06:42, Qian Cai wrote:
> > > [...]
> > > > https://lore.kernel.org/linux-mm/1570460350.5576.290.camel@lca.pw/
> > > > 
> > > > [  297.425964] -> #1 (&port_lock_key){-.-.}:
> > > > [  297.425967]        __lock_acquire+0x5b3/0xb40
> > > > [  297.425967]        lock_acquire+0x126/0x280
> > > > [  297.425968]        _raw_spin_lock_irqsave+0x3a/0x50
> > > > [  297.425969]        serial8250_console_write+0x3e4/0x450
> > > > [  297.425970]        univ8250_console_write+0x4b/0x60
> > > > [  297.425970]        console_unlock+0x501/0x750
> > > > [  297.425971]        vprintk_emit+0x10d/0x340
> > > > [  297.425972]        vprintk_default+0x1f/0x30
> > > > [  297.425972]        vprintk_func+0x44/0xd4
> > > > [  297.425973]        printk+0x9f/0xc5
> > > > [  297.425974]        register_console+0x39c/0x520
> > > > [  297.425975]        univ8250_console_init+0x23/0x2d
> > > > [  297.425975]        console_init+0x338/0x4cd
> > > > [  297.425976]        start_kernel+0x534/0x724
> > > > [  297.425977]        x86_64_start_reservations+0x24/0x26
> > > > [  297.425977]        x86_64_start_kernel+0xf4/0xfb
> > > > [  297.425978]        secondary_startup_64+0xb6/0xc0
> > > > 
> > > > where the report again show the early boot call trace for the locking
> > > > dependency,
> > > > 
> > > > console_owner --> port_lock_key
> > > > 
> > > > but that dependency clearly not only happen in the early boot.
> > > 
> > > Can you provide an example of the runtime dependency without any early
> > > boot artifacts? Because this discussion really doesn't make much sense
> > > without a clear example of a _real_ lockdep report that is not a false
> > > positive. All of them so far have been concluded to be false positives
> > > AFAIU.
> > 
> > An obvious one is in the above link. Just replace the trace in #1 above with
> > printk() from anywhere, i.e., just ignore the early boot calls there as they are
> >  not important.
> > 
> > printk()
> >   console_unlock()
> >     console_lock_spinning_enable() --> console_owner_lock
> >   call_console_drivers()
> >     serial8250_console_write() --> port->lock
> 
> Please find the location where this really happens and then suggest
> how the real deadlock could get fixed. So far, we have seen only
> false positives and theoretical scenarios.

Now the bar is higher again. You are now asking me to actually trigger this
potential deadlock live. I am probably better off buying some lottery tickets
than if I could be that lucky.
Qian Cai Oct. 9, 2019, 3:08 p.m. UTC | #54
On Wed, 2019-10-09 at 16:34 +0200, Michal Hocko wrote:
> On Wed 09-10-19 10:19:44, Qian Cai wrote:
> > On Wed, 2019-10-09 at 15:51 +0200, Michal Hocko wrote:
> 
> [...]
> > > Can you paste the full lock chain graph to be sure we are on the same
> > > page?
> > 
> > WARNING: possible circular locking dependency detected
> > 5.3.0-next-20190917 #8 Not tainted
> > ------------------------------------------------------
> > test.sh/8653 is trying to acquire lock:
> > ffffffff865a4460 (console_owner){-.-.}, at:
> > console_unlock+0x207/0x750
> > 
> > but task is already holding lock:
> > ffff88883fff3c58 (&(&zone->lock)->rlock){-.-.}, at:
> > __offline_isolated_pages+0x179/0x3e0
> > 
> > which lock already depends on the new lock.
> > 
> > 
> > the existing dependency chain (in reverse order) is:
> > 
> > -> #3 (&(&zone->lock)->rlock){-.-.}:
> >        __lock_acquire+0x5b3/0xb40
> >        lock_acquire+0x126/0x280
> >        _raw_spin_lock+0x2f/0x40
> >        rmqueue_bulk.constprop.21+0xb6/0x1160
> >        get_page_from_freelist+0x898/0x22c0
> >        __alloc_pages_nodemask+0x2f3/0x1cd0
> >        alloc_pages_current+0x9c/0x110
> >        allocate_slab+0x4c6/0x19c0
> >        new_slab+0x46/0x70
> >        ___slab_alloc+0x58b/0x960
> >        __slab_alloc+0x43/0x70
> >        __kmalloc+0x3ad/0x4b0
> >        __tty_buffer_request_room+0x100/0x250
> >        tty_insert_flip_string_fixed_flag+0x67/0x110
> >        pty_write+0xa2/0xf0
> >        n_tty_write+0x36b/0x7b0
> >        tty_write+0x284/0x4c0
> >        __vfs_write+0x50/0xa0
> >        vfs_write+0x105/0x290
> >        redirected_tty_write+0x6a/0xc0
> >        do_iter_write+0x248/0x2a0
> >        vfs_writev+0x106/0x1e0
> >        do_writev+0xd4/0x180
> >        __x64_sys_writev+0x45/0x50
> >        do_syscall_64+0xcc/0x76c
> >        entry_SYSCALL_64_after_hwframe+0x49/0xbe
> 
> This one looks indeed legit. pty_write is allocating memory from inside
> the port->lock. But this seems to be quite broken, right? The forward
> progress depends on GFP_ATOMIC allocation which might fail easily under
> memory pressure. So the preferred way to fix this should be to change
> the allocation scheme to use the preallocated buffer and size it from a
> context when it doesn't hold internal locks. It might be a more complex
> fix than using printk_deferred or other games but addressing that would
> make the pty code more robust as well.

I am not really sure if doing surgery on the pty code is better than fixing the
memory offline side as a short-term fix.
Qian Cai Oct. 9, 2019, 3:25 p.m. UTC | #55
On Wed, 2019-10-09 at 15:56 +0200, Peter Oberparleiter wrote:
> On 08.10.2019 18:08, Qian Cai wrote:
> > On Tue, 2019-10-08 at 14:56 +0200, Christian Borntraeger wrote:
> > > Adding Peter Oberparleiter.
> > > Peter, can you have a look?
> > > 
> > > On 08.10.19 10:27, Michal Hocko wrote:
> > > > On Tue 08-10-19 09:43:57, Petr Mladek wrote:
> > > > > On Mon 2019-10-07 16:49:37, Michal Hocko wrote:
> > > > > > [Cc s390 maintainers - the lockdep is http://lkml.kernel.org/r/1570228005-24979-1-git-send-email-cai@lca.pw
> > > > > >  Petr has explained it is a false positive
> > > > > >  http://lkml.kernel.org/r/20191007143002.l37bt2lzqtnqjqxu@pathway.suse.cz]
> > > > > > On Mon 07-10-19 16:30:02, Petr Mladek wrote:
> > > > > > [...]
> > > > > > > I believe that it cannot really happen because:
> > > > > > > 
> > > > > > > 	static int __init
> > > > > > > 	sclp_console_init(void)
> > > > > > > 	{
> > > > > > > 	[...]
> > > > > > > 		rc = sclp_rw_init();
> > > > > > > 	[...]
> > > > > > > 		register_console(&sclp_console);
> > > > > > > 		return 0;
> > > > > > > 	}
> > > > > > > 
> > > > > > > sclp_rw_init() is called before register_console(). And
> > > > > > > console_unlock() will never call sclp_console_write() before
> > > > > > > the console is registered.
> > > > > > > 
> > > > > > > AFAIK, lockdep only compares existing chain of locks. It does
> > > > > > > not know about console registration that would make some
> > > > > > > code paths mutually exclusive.
> > > > > > > 
> > > > > > > I believe that it is a false positive. I do not know how to
> > > > > > > avoid this lockdep report. I hope that it will disappear
> > > > > > > by deferring all printk() calls rather soon.
> > > > > > 
> > > > > > Thanks a lot for looking into this Petr. I have also checked the code
> > > > > > and I really fail to see why the allocation has to be done under the
> > > > > > lock in the first place. sclp_read_sccb and sclp_init_sccb are global
> > > > > > variables but I strongly suspect that they need a synchronization during
> > > > > > early init, callbacks are registered only later IIUC:
> > > > > 
> > > > > Good idea. It would work when the init function is called only once.
> > > > > But see below.
> > > > > 
> > > > > > diff --git a/drivers/s390/char/sclp.c b/drivers/s390/char/sclp.c
> > > > > > index d2ab3f07c008..4b1c033e3255 100644
> > > > > > --- a/drivers/s390/char/sclp.c
> > > > > > +++ b/drivers/s390/char/sclp.c
> > > > > > @@ -1169,13 +1169,13 @@ sclp_init(void)
> > > > > >  	unsigned long flags;
> > > > > >  	int rc = 0;
> > > > > >  
> > > > > > +	sclp_read_sccb = (void *) __get_free_page(GFP_ATOMIC | GFP_DMA);
> > > > > > +	sclp_init_sccb = (void *) __get_free_page(GFP_ATOMIC | GFP_DMA);
> > > > > >  	spin_lock_irqsave(&sclp_lock, flags);
> > > > > >  	/* Check for previous or running initialization */
> > > > > >  	if (sclp_init_state != sclp_init_state_uninitialized)
> > > > > >  		goto fail_unlock;
> > > > > 
> > > > > It seems that sclp_init() could be called several times in parallel.
> > > > > I see it called from sclp_register() and sclp_initcall().
> > > > 
> > > > Interesting. Something for s390 people to answer I guess.
> > > > Anyway, this should be quite trivial to workaround by a cmpxch or alike.
> > > > 
> > 
> > The above fix is simply insufficient,
> > 
> > 00: [    3.654307] WARNING: possible circular locking dependency detected       
> > 00: [    3.654309] 5.4.0-rc1-next-20191004+ #4 Not tainted                      
> > 00: [    3.654311] ------------------------------------------------------       
> > 00: [    3.654313] swapper/0/1 is trying to acquire lock:                       
> > 00: [    3.654314] 00000000553f3fb8 (sclp_lock){-.-.}, at: sclp_add_request+0x34
> > 00: /0x308                                                                      
> > 00: [    3.654320]                                                              
> > 00: [    3.654322] but task is already holding lock:                            
> > 00: [    3.654323] 00000000550c9fc0 (console_owner){....}, at: console_unlock+0x
> > 00: 328/0xa30                                                                   
> > 00: [    3.654329]                                                              
> > 00: [    3.654331] which lock already depends on the new lock.                  
> 
> I can confirm that both this lockdep warning as well as the original one
> are both false positives: lockdep warns that code in sclp_init could
> trigger a deadlock via the chain
> 
>    sclp_lock --> &(&zone->lock)->rlock --> console_owner
> 
> but
> 
>   a) before sclp_init successfully completes, register_console is not
>      called, so there is no connection between console_owner -> sclp_lock
>   b) after sclp_init completed, it won't be called again, so any
>      dependency that requires a call-chain including sclp_init is
>      impossible to achieve
> 
> Apparently lockdep cannot determine that sclp_init won't be called again.
> I'm attaching a patch that moves sclp_init to __init so that lockdep has
> a chance of knowing that the function will be gone after init.
> 
> This patch is intended for testing only though, to see if there are other
> paths to similar possible deadlocks. I still need to decide if its worth
> changing the code to remove false positives in lockdep.
> 
> A generic solution would be preferable from my point of view though,
> because otherwise each console driver owner would need to ensure that any
> lock taken in their console.write implementation is never held while
> memory is allocated/released.
> 
> diff --git a/drivers/s390/char/sclp.c b/drivers/s390/char/sclp.c
> index d2ab3f07c008..13219e43d488 100644
> --- a/drivers/s390/char/sclp.c
> +++ b/drivers/s390/char/sclp.c
> @@ -140,7 +140,6 @@ static void sclp_request_timeout(bool force_restart);
>  static void sclp_process_queue(void);
>  static void __sclp_make_read_req(void);
>  static int sclp_init_mask(int calculate);
> -static int sclp_init(void);
> 
>  static void
>  __sclp_queue_read_req(void)
> @@ -670,7 +669,8 @@ __sclp_get_mask(sccb_mask_t *receive_mask, sccb_mask_t *send_mask)
>  	}
>  }
> 
> -/* Register event listener. Return 0 on success, non-zero otherwise. */
> +/* Register event listener. Return 0 on success, non-zero otherwise. Early
> + * callers (<= arch_initcall) must call sclp_init() first. */
>  int
>  sclp_register(struct sclp_register *reg)
>  {
> @@ -679,9 +679,8 @@ sclp_register(struct sclp_register *reg)
>  	sccb_mask_t send_mask;
>  	int rc;
> 
> -	rc = sclp_init();
> -	if (rc)
> -		return rc;
> +	if (sclp_init_state != sclp_init_state_initialized)
> +		return -EINVAL;
>  	spin_lock_irqsave(&sclp_lock, flags);
>  	/* Check event mask for collisions */
>  	__sclp_get_mask(&receive_mask, &send_mask);
> @@ -1163,8 +1162,7 @@ static struct platform_device *sclp_pdev;
> 
>  /* Initialize SCLP driver. Return zero if driver is operational, non-zero
>   * otherwise. */
> -static int
> -sclp_init(void)
> +int __init sclp_init(void)
>  {
>  	unsigned long flags;
>  	int rc = 0;
> diff --git a/drivers/s390/char/sclp.h b/drivers/s390/char/sclp.h
> index 196333013e54..463660261379 100644
> --- a/drivers/s390/char/sclp.h
> +++ b/drivers/s390/char/sclp.h
> @@ -296,6 +296,7 @@ struct sclp_register {
>  };
> 
>  /* externals from sclp.c */
> +int __init sclp_init(void);
>  int sclp_add_request(struct sclp_req *req);
>  void sclp_sync_wait(void);
>  int sclp_register(struct sclp_register *reg);
> diff --git a/drivers/s390/char/sclp_con.c b/drivers/s390/char/sclp_con.c
> index 8966a1c1b548..a08ef2c8379e 100644
> --- a/drivers/s390/char/sclp_con.c
> +++ b/drivers/s390/char/sclp_con.c
> @@ -319,6 +319,9 @@ sclp_console_init(void)
>  	/* SCLP consoles are handled together */
>  	if (!(CONSOLE_IS_SCLP || CONSOLE_IS_VT220))
>  		return 0;
> +	rc = sclp_init();
> +	if (rc)
> +		return rc;
>  	rc = sclp_rw_init();
>  	if (rc)
>  		return rc;
> diff --git a/drivers/s390/char/sclp_vt220.c b/drivers/s390/char/sclp_vt220.c
> index 3f9a6ef650fa..28b23e22248b 100644
> --- a/drivers/s390/char/sclp_vt220.c
> +++ b/drivers/s390/char/sclp_vt220.c
> @@ -694,6 +694,11 @@ static int __init __sclp_vt220_init(int num_pages)
>  	sclp_vt220_init_count++;
>  	if (sclp_vt220_init_count != 1)
>  		return 0;
> +	rc = sclp_init();
> +	if (rc) {
> +		sclp_vt220_init_count--;
> +		return rc;
> +	}
>  	spin_lock_init(&sclp_vt220_lock);
>  	INIT_LIST_HEAD(&sclp_vt220_empty);
>  	INIT_LIST_HEAD(&sclp_vt220_outqueue);

Unfortunately, the patch does not help here. I guess lockdep does not really
differentiate __init code from the rest, because those places can still have a
chance to deadlock in general after interrupts and preemption are enabled, even
during boot, but that is just not the case here.

00: [    2.812088] WARNING: possible circular locking dependency detected       
00: [    2.812090] 5.4.0-rc2-next-20191009+ #4 Not tainted                      
00: [    2.812092] ------------------------------------------------------       
00: [    2.812094] swapper/0/1 is trying to acquire lock:                       
00: [    2.812095] 0000000036a07ed8 (sclp_lock){-.-.}, at: sclp_add_request+0x34
00: /0x308                                                                      
00: [    2.812102]                                                              
00: [    2.812103] but task is already holding lock:                            
00: [    2.812105] 00000000366d9ec0 (console_owner){....}, at: console_unlock+0x
00: 328/0xa30                                                                   
00: [    2.812111]                                                              
00: [    2.812113] which lock already depends on the new lock.                  
00: [    2.812114]                                                              
00: [    2.812115]                                                              
00: [    2.812117] the existing dependency chain (in reverse order) is:         
00: [    2.812118]                                                              
00: [    2.812120] -> #2 (console_owner){....}:                                 
00: [    2.812126]        lock_acquire+0x21a/0x468                              
00: [    2.812127]        console_unlock+0x3a6/0xa30                            
00: [    2.812129]        vprintk_emit+0x184/0x3c8                              
00: [    2.812131]        vprintk_default+0x44/0x50                             
00: [    2.812132]        printk+0xa8/0xc0                                      
00: [    2.812134]        get_random_u64+0x40/0x108                             
00: [    2.812135]        add_to_free_area_random+0x188/0x1c0                   
00: [    2.812137]        free_one_page+0x72/0x128                              
00: [    2.812139]        __free_pages_ok+0x51c/0xc90                           
00: [    2.812141]        memblock_free_all+0x30a/0x3b0                         
00: [    2.812142]        mem_init+0x84/0x200                                   
00: [    2.812144]        start_kernel+0x384/0x6a0                              
00: [    2.812145]        startup_continue+0x70/0xd0                            
00: [    2.812146]                                                              
00: [    2.812147] -> #1 (&(&zone->lock)->rlock){....}:                         
00: [    2.812154]        lock_acquire+0x21a/0x468                              
00: [    2.812155]        _raw_spin_lock+0x54/0x68                              
00: [    2.812157]        get_page_from_freelist+0x8b6/0x2d28                   
00: [    2.812159]        __alloc_pages_nodemask+0x246/0x658                    
00: [    2.812161]        __get_free_pages+0x34/0x78                            
00: [    2.812162]        sclp_init+0xce/0x640                                  
00: [    2.812164]        sclp_console_init+0x4e/0x1c0                          
00: [    2.812166]        console_init+0x2c8/0x410                              
00: [    2.812168]        start_kernel+0x530/0x6a0                              
00: [    2.812169]        startup_continue+0x70/0xd0                            
00: [    2.812170]                                                              
00: [    2.812171] -> #0 (sclp_lock){-.-.}:                                     
00: [    2.812177]        check_noncircular+0x338/0x3e0                         
00: [    2.812179]        __lock_acquire+0x1e66/0x2d88                          
00: [    2.812181]        lock_acquire+0x21a/0x468                              
00: [    2.812182]        _raw_spin_lock_irqsave+0xcc/0xe8                      
00: [    2.812184]        sclp_add_request+0x34/0x308                           
00: [    2.812186]        sclp_conbuf_emit+0x100/0x138                          
00: [    2.812187]        sclp_console_write+0x96/0x3b8                         
00: [    2.812189]        console_unlock+0x6dc/0xa30                            
00: [    2.812191]        vprintk_emit+0x184/0x3c8                              
00: [    2.812192]        vprintk_default+0x44/0x50                             
00: [    2.812194]        printk+0xa8/0xc0                                      
00: [    2.812196]        iommu_debugfs_setup+0xf2/0x108                        
00: [    2.812198]        iommu_init+0x6c/0x78                                  
00: [    2.812200]        do_one_initcall+0x162/0x680                           
00: [    2.812201]        kernel_init_freeable+0x4e8/0x5a8                      
00: [    2.812203]        kernel_init+0x2a/0x188                                
00: [    2.812205]        ret_from_fork+0x30/0x34                               
00: [    2.812206]        kernel_thread_starter+0x0/0xc                         
00: [    2.812207]                                                              
00: [    2.812209] other info that might help us debug this:                    
00: [    2.812210]                                                              
00: [    2.812211] Chain exists of:                                             
00: [    2.812212]   sclp_lock --> &(&zone->lock)->rlock --> console_owner      
00: [    2.812221]                                                              
00: [    2.812222]  Possible unsafe locking scenario:                           
00: [    2.812223]                                                              
00: [    2.812225]        CPU0                    CPU1                          
00: [    2.812227]        ----                    ----                          
00: [    2.812228]   lock(console_owner);                                       
00: [    2.812232]                                lock(&(&zone->lock)->rlock);  
00: [    2.812236]                                lock(console_owner);          
00: [    2.812239]   lock(sclp_lock);                                           
00: [    2.812243]                                                              
00: [    2.812244]  *** DEADLOCK ***                                            
00: [    2.812245]                                                              
00: [    2.812247] 2 locks held by swapper/0/1:                                 
00: [    2.812248]  #0: 00000000366da100 (console_lock){+.+.}, at: vprintk_emit+
00: 0x178/0x3c8                                                                 
00: [    2.812255]  #1: 00000000366d9ec0 (console_owner){....}, at: console_unlo
00: ck+0x328/0xa30                                                              
00: [    2.812262]                                                              
00: [    2.812264] stack backtrace:                                             
00: [    2.812266] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.4.0-rc2-next-2019
00: 1009+ #4                                                                    
00: [    2.812268] Hardware name: IBM 2964 N96 400 (z/VM 6.4.0)                 
00: [    2.812269] Call Trace:                                                  
00: [    2.812271] ([<000000003591e230>] show_stack+0x110/0x1b0)                
00: [    2.812273]  [<00000000361e025e>] dump_stack+0x126/0x178                 
00: [    2.812276]  [<0000000035a149b8>] check_noncircular+0x338/0x3e0          
00: [    2.812277]  [<0000000035a1a9a6>] __lock_acquire+0x1e66/0x2d88           
00: [    2.812279]  [<0000000035a17cc2>] lock_acquire+0x21a/0x468               
00: [    2.812281]  [<000000003621bce4>] _raw_spin_lock_irqsave+0xcc/0xe8       
00: [    2.812283]  [<0000000035ff534c>] sclp_add_request+0x34/0x308            
00: [    2.812285]  [<0000000035ffc6b8>] sclp_conbuf_emit+0x100/0x138           
00: [    2.812287]  [<0000000035ffc7d6>] sclp_console_write+0x96/0x3b8          
00: [    2.812289]  [<0000000035a2b4fc>] console_unlock+0x6dc/0xa30             
00: [    2.812291]  [<0000000035a2dd0c>] vprintk_emit+0x184/0x3c8               
00: [    2.812293]  [<0000000035a2df94>] vprintk_default+0x44/0x50              
00: [    2.812295]  [<0000000035a2ea40>] printk+0xa8/0xc0                       
00: [    2.812297]  [<0000000035f4e5ea>] iommu_debugfs_setup+0xf2/0x108         
00: [    2.812299]  [<0000000036b90554>] iommu_init+0x6c/0x78                   
00: [    2.812301]  [<0000000035900fda>] do_one_initcall+0x162/0x680            
00: [    2.812303]  [<0000000036b4f9f0>] kernel_init_freeable+0x4e8/0x5a8       
00: [    2.812305]  [<0000000036206bda>] kernel_init+0x2a/0x188                 
00: [    2.812306]  [<000000003621cfdc>] ret_from_fork+0x30/0x34                
00: [    2.812308]  [<000000003621cfe0>] kernel_thread_starter+0x0/0xc          
00: [    2.812310] INFO: lockdep is turned off.
Michal Hocko Oct. 9, 2019, 4:23 p.m. UTC | #56
On Wed 09-10-19 11:08:35, Qian Cai wrote:
> On Wed, 2019-10-09 at 16:34 +0200, Michal Hocko wrote:
> > On Wed 09-10-19 10:19:44, Qian Cai wrote:
> > > On Wed, 2019-10-09 at 15:51 +0200, Michal Hocko wrote:
> > 
> > [...]
> > > > Can you paste the full lock chain graph to be sure we are on the same
> > > > page?
> > > 
> > > WARNING: possible circular locking dependency detected
> > > 5.3.0-next-20190917 #8 Not tainted
> > > ------------------------------------------------------
> > > test.sh/8653 is trying to acquire lock:
> > > ffffffff865a4460 (console_owner){-.-.}, at:
> > > console_unlock+0x207/0x750
> > > 
> > > but task is already holding lock:
> > > ffff88883fff3c58 (&(&zone->lock)->rlock){-.-.}, at:
> > > __offline_isolated_pages+0x179/0x3e0
> > > 
> > > which lock already depends on the new lock.
> > > 
> > > 
> > > the existing dependency chain (in reverse order) is:
> > > 
> > > -> #3 (&(&zone->lock)->rlock){-.-.}:
> > >        __lock_acquire+0x5b3/0xb40
> > >        lock_acquire+0x126/0x280
> > >        _raw_spin_lock+0x2f/0x40
> > >        rmqueue_bulk.constprop.21+0xb6/0x1160
> > >        get_page_from_freelist+0x898/0x22c0
> > >        __alloc_pages_nodemask+0x2f3/0x1cd0
> > >        alloc_pages_current+0x9c/0x110
> > >        allocate_slab+0x4c6/0x19c0
> > >        new_slab+0x46/0x70
> > >        ___slab_alloc+0x58b/0x960
> > >        __slab_alloc+0x43/0x70
> > >        __kmalloc+0x3ad/0x4b0
> > >        __tty_buffer_request_room+0x100/0x250
> > >        tty_insert_flip_string_fixed_flag+0x67/0x110
> > >        pty_write+0xa2/0xf0
> > >        n_tty_write+0x36b/0x7b0
> > >        tty_write+0x284/0x4c0
> > >        __vfs_write+0x50/0xa0
> > >        vfs_write+0x105/0x290
> > >        redirected_tty_write+0x6a/0xc0
> > >        do_iter_write+0x248/0x2a0
> > >        vfs_writev+0x106/0x1e0
> > >        do_writev+0xd4/0x180
> > >        __x64_sys_writev+0x45/0x50
> > >        do_syscall_64+0xcc/0x76c
> > >        entry_SYSCALL_64_after_hwframe+0x49/0xbe
> > 
> > This one looks indeed legit. pty_write is allocating memory from inside
> > the port->lock. But this seems to be quite broken, right? The forward
> > progress depends on GFP_ATOMIC allocation which might fail easily under
> > memory pressure. So the preferred way to fix this should be to change
> > the allocation scheme to use the preallocated buffer and size it from a
> > context when it doesn't hold internal locks. It might be a more complex
> > fix than using printk_deferred or other games but addressing that would
> > make the pty code more robust as well.
> 
> I am not really sure if doing surgery on the pty code is better than fixing the
> memory offline side as a short-term fix.

If this was only about the memory offline code then I would agree. But
we are talking about any printk from the zone->lock context and that is
a bigger deal. Besides that it is quite natural that the printk code
should be more universal and allow to be also called from the MM
contexts as much as possible. If there is any really strong reason this
is not possible then it should be documented at least.
Sergey Senozhatsky Oct. 10, 2019, 5:12 a.m. UTC | #57
On (10/09/19 16:26), Michal Hocko wrote:
> On Wed 09-10-19 15:56:32, Peter Oberparleiter wrote:
> [...]
> > A generic solution would be preferable from my point of view though,
> > because otherwise each console driver owner would need to ensure that any
> > lock taken in their console.write implementation is never held while
> > memory is allocated/released.
>
> Considering that console.write is called from essentially arbitrary code
> path IIUC then all the locks used in this path should be pretty much
> tail locks or console internal ones without external dependencies.

That's a good expectation, but I guess it's not always the case.

One example might be NET console - net subsystem locks, net device
drivers locks, maybe even some MM locks (skb allocations?).

But even more "commonly used" consoles sometimes break that
expectation. E.g. 8250

serial8250_console_write()
 serial8250_modem_status()
  wake_up_interruptible()

And so on.

	-ss
Michal Hocko Oct. 10, 2019, 7:40 a.m. UTC | #58
On Thu 10-10-19 14:12:01, Sergey Senozhatsky wrote:
> On (10/09/19 16:26), Michal Hocko wrote:
> > On Wed 09-10-19 15:56:32, Peter Oberparleiter wrote:
> > [...]
> > > A generic solution would be preferable from my point of view though,
> > > because otherwise each console driver owner would need to ensure that any
> > > lock taken in their console.write implementation is never held while
> > > memory is allocated/released.
> >
> > Considering that console.write is called from essentially arbitrary code
> > path IIUC then all the locks used in this path should be pretty much
> > tail locks or console internal ones without external dependencies.
> 
> That's a good expectation, but I guess it's not always the case.
> 
> One example might be NET console - net subsystem locks, net device
> drivers locks, maybe even some MM locks (skb allocations?).

I am not familiar with the netconsole code TBH. If there is absolutely
no way around that then we might have to bite a bullet and consider some
of MM locks a land of no printk. I have already said that in this
thread. I am mostly pushing back on "let's just go the simplest way"
approach.
 
> But even more "commonly used" consoles sometimes break that
> expectation. E.g. 8250
> 
> serial8250_console_write()
>  serial8250_modem_status()
>   wake_up_interruptible()

By that expectation you mean they are using external locks or that they
really _need_ to allocate. Because if you are pointing to
wake_up_interruptible and therefore the rq then this is a well known
thing and I was under impression even documented but I can only see
LOGLEVEL_SCHED that is arguably a very obscure way to document the fact.
Petr Mladek Oct. 10, 2019, 7:57 a.m. UTC | #59
On Wed 2019-10-09 10:46:14, Qian Cai wrote:
> On Wed, 2019-10-09 at 16:24 +0200, Petr Mladek wrote:
> > On Wed 2019-10-09 09:43:13, Qian Cai wrote:
> > > On Wed, 2019-10-09 at 15:27 +0200, Michal Hocko wrote:
> > > > On Wed 09-10-19 09:06:42, Qian Cai wrote:
> > > > [...]
> > > > > https://lore.kernel.org/linux-mm/1570460350.5576.290.camel@lca.pw/
> > > > > 
> > > > > [  297.425964] -> #1 (&port_lock_key){-.-.}:
> > > > > [  297.425967]        __lock_acquire+0x5b3/0xb40
> > > > > [  297.425967]        lock_acquire+0x126/0x280
> > > > > [  297.425968]        _raw_spin_lock_irqsave+0x3a/0x50
> > > > > [  297.425969]        serial8250_console_write+0x3e4/0x450
> > > > > [  297.425970]        univ8250_console_write+0x4b/0x60
> > > > > [  297.425970]        console_unlock+0x501/0x750
> > > > > [  297.425971]        vprintk_emit+0x10d/0x340
> > > > > [  297.425972]        vprintk_default+0x1f/0x30
> > > > > [  297.425972]        vprintk_func+0x44/0xd4
> > > > > [  297.425973]        printk+0x9f/0xc5
> > > > > [  297.425974]        register_console+0x39c/0x520
> > > > > [  297.425975]        univ8250_console_init+0x23/0x2d
> > > > > [  297.425975]        console_init+0x338/0x4cd
> > > > > [  297.425976]        start_kernel+0x534/0x724
> > > > > [  297.425977]        x86_64_start_reservations+0x24/0x26
> > > > > [  297.425977]        x86_64_start_kernel+0xf4/0xfb
> > > > > [  297.425978]        secondary_startup_64+0xb6/0xc0
> > > > > 
> > > > > where the report again show the early boot call trace for the locking
> > > > > dependency,
> > > > > 
> > > > > console_owner --> port_lock_key
> > > > > 
> > > > > but that dependency clearly not only happen in the early boot.
> > > > 
> > > > Can you provide an example of the runtime dependency without any early
> > > > boot artifacts? Because this discussion really doesn't make much sense
> > > > without a clear example of a _real_ lockdep report that is not a false
> > > > positive. All of them so far have been concluded to be false positives
> > > > AFAIU.
> > > 
> > > An obvious one is in the above link. Just replace the trace in #1 above with
> > > printk() from anywhere, i.e., just ignore the early boot calls there as they are
> > >  not important.
> > > 
> > > printk()
> > >   console_unlock()
> > >     console_lock_spinning_enable() --> console_owner_lock
> > >   call_console_drivers()
> > >     serial8250_console_write() --> port->lock
> > 
> > Please, find the location where this really happens and then suggests
> > how the real deadlock could get fixed. So far, we have seen only
> > false positives and theoretical scenarios.
> 
> Now the bar is higher again. You are now asking me to actually trigger this
> potential deadlock live. I am probably better off buying some lottery tickets
> then if I could be that lucky.

No, we just do not want to complicate the code too much just to hide
false positives from lockdep.

I do not ask you to reproduce the deadlock. I ask you to find
a code path where the deadlock might really happen. It seems
that you actually found one in the tty code in the other mail.

Best Regards,
Petr
Sergey Senozhatsky Oct. 10, 2019, 8:16 a.m. UTC | #60
On (10/10/19 09:40), Michal Hocko wrote:
[..]
> > > Considering that console.write is called from essentially arbitrary code
> > > path IIUC then all the locks used in this path should be pretty much
> > > tail locks or console internal ones without external dependencies.
> > 
> > That's a good expectation, but I guess it's not always the case.
> > 
> > One example might be NET console - net subsystem locks, net device
> > drivers locks, maybe even some MM locks (skb allocations?).
> 
> I am not familiar with the netconsole code TBH. If there is absolutely
> no way around that then we might have to bite a bullet and consider some
> of MM locks a land of no printk.

So this is what netconsole does (before we pass on udp to net device
driver code, which *may be* can do more allocations, I don't know):

write_msg()
 netpoll_send_udp()
  find_skb()
   alloc_skb(len, GFP_ATOMIC)
    kmem_cache_alloc_node()

You are the right person to ask this question to - how many MM
locks are involved when we do GFP_ATOMIC kmem_cache allocation?
Is there anything to be concerned about?

> I have already said that in this thread. I am mostly pushing back
> on "let's just go the simplest way" approach.

I understand this. And don't have objections. Merely trying to get
better understanding.

> > But even more "commonly used" consoles sometimes break that
> > expectation. E.g. 8250
> >
> > serial8250_console_write()
> >  serial8250_modem_status()
> >   wake_up_interruptible()
>
> By that expectation you mean they are using external locks or that they
> really _need_ to allocate.

Sort of both. netconsole does allocations and has external lock
dependencies. Some of serial console drivers have external lock
dependencies (but don't do allocations, as far as I'm concerned).

> Because if you are pointing to wake_up_interruptible and therefore the rq
> then this is a well known thing and I was under impression even documented
> but I can only see LOGLEVEL_SCHED that is arguably a very obscure way to
> document the fact.

That's right. All I was commenting on is that console.write() callbacks
have external lock dependencies.

	-ss
Petr Mladek Oct. 10, 2019, 8:21 a.m. UTC | #61
On Thu 2019-10-10 14:12:01, Sergey Senozhatsky wrote:
> On (10/09/19 16:26), Michal Hocko wrote:
> > On Wed 09-10-19 15:56:32, Peter Oberparleiter wrote:
> > [...]
> > > A generic solution would be preferable from my point of view though,
> > > because otherwise each console driver owner would need to ensure that any
> > > lock taken in their console.write implementation is never held while
> > > memory is allocated/released.
> >
> > Considering that console.write is called from essentially arbitrary code
> > path IIUC then all the locks used in this path should be pretty much
> > tail locks or console internal ones without external dependencies.
> 
> That's a good expectation, but I guess it's not always the case.
> 
> One example might be NET console - net subsystem locks, net device
> drivers locks, maybe even some MM locks (skb allocations?).
> 
> But even more "commonly used" consoles sometimes break that
> expectation. E.g. 8250
> 
> serial8250_console_write()
>  serial8250_modem_status()
>   wake_up_interruptible()
> 
> And so on.

I think that the only maintainable solution is to call the console
drivers in a well defined context (kthread). We have finally opened
doors to do this change.

Using printk_deferred() or removing the problematic printk() is
a short term workaround. I am not going to block such patches.
But the final decision will be on maintainers of the affected subsystems.

Deferring console work should prevent the deadlocks. Another story
is allocating memory from console->write() callback. It makes the
console less reliable when there is a memory contention. Preventing
this would be very valuable.

Best Regards,
Petr
Michal Hocko Oct. 10, 2019, 8:37 a.m. UTC | #62
On Thu 10-10-19 17:16:29, Sergey Senozhatsky wrote:
> On (10/10/19 09:40), Michal Hocko wrote:
> [..]
> > > > Considering that console.write is called from essentially arbitrary code
> > > > path IIUC then all the locks used in this path should be pretty much
> > > > tail locks or console internal ones without external dependencies.
> > > 
> > > That's a good expectation, but I guess it's not always the case.
> > > 
> > > One example might be NET console - net subsystem locks, net device
> > > drivers locks, maybe even some MM locks (skb allocations?).
> > 
> > I am not familiar with the netconsole code TBH. If there is absolutely
> > no way around that then we might have to bite a bullet and consider some
> > of MM locks a land of no printk.
> 
> So this is what netconsole does (before we pass on udp to net device
> driver code, which *may be* can do more allocations, I don't know):
> 
> write_msg()
>  netpoll_send_udp()
>   find_skb()
>    alloc_skb(len, GFP_ATOMIC)
>     kmem_cache_alloc_node()
> 
> You are the right person to ask this question to - how many MM
> locks are involved when we do GFP_ATOMIC kmem_cache allocation?
> Is there anything to be concerned about?

At least zone->lock might involved. Maybe even more.
Sergey Senozhatsky Oct. 10, 2019, 8:39 a.m. UTC | #63
On (10/10/19 10:21), Petr Mladek wrote:
[..]
> > > Considering that console.write is called from essentially arbitrary code
> > > path IIUC then all the locks used in this path should be pretty much
> > > tail locks or console internal ones without external dependencies.
> > 
> > That's a good expectation, but I guess it's not always the case.
> > 
> > One example might be NET console - net subsystem locks, net device
> > drivers locks, maybe even some MM locks (skb allocations?).
> > 
> > But even more "commonly used" consoles sometimes break that
> > expectation. E.g. 8250
> > 
> > serial8250_console_write()
> >  serial8250_modem_status()
> >   wake_up_interruptible()
> > 
> > And so on.
> 
> I think that the only maintainable solution is to call the console
> drivers in a well defined context (kthread). We have finally opened
> doors to do this change.

Yeah, that's a pretty complex thing, I suspect. Panic flush to
netcon may deadlock if oops occurs under one of those "important
MM locks" (if any MM locks are actually involved in ATOMIC skb
allocation). If there are such MM locks, then I think flush_on_panic
issue can't be addressed by printing kthread or ->atomic_write callback.

> Using printk_deferred() or removing the problematic printk() is
> a short term workaround. I am not going to block such patches.
> But the final decision will be on maintainers of the affected subsystems.

Agreed.

	-ss
Qian Cai Oct. 10, 2019, 9:01 a.m. UTC | #64
> On Oct 9, 2019, at 12:23 PM, Michal Hocko <mhocko@kernel.org> wrote:
> 
> If this was only about the memory offline code then I would agree. But
> we are talking about any printk from the zone->lock context and that is
> a bigger deal. Besides that it is quite natural that the printk code
> should be more universal and allow to be also called from the MM
> contexts as much as possible. If there is any really strong reason this
> is not possible then it should be documented at least.

Where is the best place to document this? I am thinking about under the “struct zone” definition’s lock field in mmzone.h.

Does workaround this potential deadlock and fix a few lockdep false positives for free count as a really strong reason?
Michal Hocko Oct. 10, 2019, 10:59 a.m. UTC | #65
On Thu 10-10-19 05:01:44, Qian Cai wrote:
> 
> 
> > On Oct 9, 2019, at 12:23 PM, Michal Hocko <mhocko@kernel.org> wrote:
> > 
> > If this was only about the memory offline code then I would agree. But
> > we are talking about any printk from the zone->lock context and that is
> > a bigger deal. Besides that it is quite natural that the printk code
> > should be more universal and allow to be also called from the MM
> > contexts as much as possible. If there is any really strong reason this
> > is not possible then it should be documented at least.
> 
> Where is the best place to document this? I am thinking about under
> the “struct zone” definition’s lock field in mmzone.h.

I am not sure TBH and I do not think we have reached the state where
this would be the only way forward.
Petr Mladek Oct. 10, 2019, 11:11 a.m. UTC | #66
On Thu 2019-10-10 17:39:08, Sergey Senozhatsky wrote:
> On (10/10/19 10:21), Petr Mladek wrote:
> [..]
> > > > Considering that console.write is called from essentially arbitrary code
> > > > path IIUC then all the locks used in this path should be pretty much
> > > > tail locks or console internal ones without external dependencies.
> > > 
> > > That's a good expectation, but I guess it's not always the case.
> > > 
> > > One example might be NET console - net subsystem locks, net device
> > > drivers locks, maybe even some MM locks (skb allocations?).
> > > 
> > > But even more "commonly used" consoles sometimes break that
> > > expectation. E.g. 8250
> > > 
> > > serial8250_console_write()
> > >  serial8250_modem_status()
> > >   wake_up_interruptible()
> > > 
> > > And so on.
> > 
> > I think that the only maintainable solution is to call the console
> > drivers in a well defined context (kthread). We have finally opened
> > doors to do this change.
> 
> Yeah, that's a pretty complex thing, I suspect. Panic flush to
> netcon may deadlock if oops occurs under one of those "important
> MM locks" (if any MM locks are actually involved in ATOMIC skb
> allocation). If there are such MM locks, then I think flush_on_panic
> issue can't be addressed by printing kthread or ->atomic_write callback.

Sure, we could not rely on kthreads in panic(). In this situation
any lock taken from console->write() callback is a possible source
of a deadlock.

Note that I say that the locks are the problem and not printk()
called under these locks. It is because:

  + The experience shows that we could not prevent people from
    using printk() anywhere.

  + printk() might get called even when it is not used explicitly.
    For example, from NMI, IRQ, Oops.


So, the best solution is to avoid as many locks in console->write()
callbacks as possible. Especially this means as less dependencies
on external subsystems as possible. MM is an obvious candidate.
We should avoid calling MM not only because it uses locks but
also because there might not be any available memory.

Of course, there are better and worse console drivers. It is hard
to expect that all will or can be made safe. From the printk()
point of view, the defense against the problematic consoles
might be:

   + Classify each console driver. Flush all messages to the most
     reliable consoles first and to the least reliable ones at last.

   + Prevent calling consoles when there is other way to preserve
     the messages over the reboot, e.g. crashdump or permanent memory.


Of course, we will still need a way how to actively search for
problems in advance. For example, printk() could use a fake
lock even during the normal operation so that it could trigger
lockdep splat. We could enable it after all the init calls
are processed to reduce the number of false positives.


Hmm, this discussion probably belongs to another thread about
printk() design.

Anyway, it seems that removing MM from console->write() calls
is a win-win solution.

Best Regards,
Petr
Qian Cai Oct. 10, 2019, 1:11 p.m. UTC | #67
On Thu, 2019-10-10 at 12:59 +0200, Michal Hocko wrote:
> On Thu 10-10-19 05:01:44, Qian Cai wrote:
> > 
> > 
> > > On Oct 9, 2019, at 12:23 PM, Michal Hocko <mhocko@kernel.org> wrote:
> > > 
> > > If this was only about the memory offline code then I would agree. But
> > > we are talking about any printk from the zone->lock context and that is
> > > a bigger deal. Besides that it is quite natural that the printk code
> > > should be more universal and allow to be also called from the MM
> > > contexts as much as possible. If there is any really strong reason this
> > > is not possible then it should be documented at least.
> > 
> > Where is the best place to document this? I am thinking about under
> > the “struct zone” definition’s lock field in mmzone.h.
> 
> I am not sure TBH and I do not think we have reached the state where
> this would be the only way forward.

How about I revised the changelog to focus on memory offline rather than making
a rule that nobody should call printk() with zone->lock held?
Michal Hocko Oct. 10, 2019, 2:18 p.m. UTC | #68
On Thu 10-10-19 09:11:52, Qian Cai wrote:
> On Thu, 2019-10-10 at 12:59 +0200, Michal Hocko wrote:
> > On Thu 10-10-19 05:01:44, Qian Cai wrote:
> > > 
> > > 
> > > > On Oct 9, 2019, at 12:23 PM, Michal Hocko <mhocko@kernel.org> wrote:
> > > > 
> > > > If this was only about the memory offline code then I would agree. But
> > > > we are talking about any printk from the zone->lock context and that is
> > > > a bigger deal. Besides that it is quite natural that the printk code
> > > > should be more universal and allow to be also called from the MM
> > > > contexts as much as possible. If there is any really strong reason this
> > > > is not possible then it should be documented at least.
> > > 
> > > Where is the best place to document this? I am thinking about under
> > > the “struct zone” definition’s lock field in mmzone.h.
> > 
> > I am not sure TBH and I do not think we have reached the state where
> > this would be the only way forward.
> 
> How about I revised the changelog to focus on memory offline rather than making
> a rule that nobody should call printk() with zone->lock held?

If you are to remove the CONFIG_DEBUG_VM printk then I am all for it. I
am still not convinced that fiddling with dump_page in the isolation
code is justified though.
Qian Cai Oct. 10, 2019, 2:47 p.m. UTC | #69
On Thu, 2019-10-10 at 16:18 +0200, Michal Hocko wrote:
> On Thu 10-10-19 09:11:52, Qian Cai wrote:
> > On Thu, 2019-10-10 at 12:59 +0200, Michal Hocko wrote:
> > > On Thu 10-10-19 05:01:44, Qian Cai wrote:
> > > > 
> > > > 
> > > > > On Oct 9, 2019, at 12:23 PM, Michal Hocko <mhocko@kernel.org> wrote:
> > > > > 
> > > > > If this was only about the memory offline code then I would agree. But
> > > > > we are talking about any printk from the zone->lock context and that is
> > > > > a bigger deal. Besides that it is quite natural that the printk code
> > > > > should be more universal and allow to be also called from the MM
> > > > > contexts as much as possible. If there is any really strong reason this
> > > > > is not possible then it should be documented at least.
> > > > 
> > > > Where is the best place to document this? I am thinking about under
> > > > the “struct zone” definition’s lock field in mmzone.h.
> > > 
> > > I am not sure TBH and I do not think we have reached the state where
> > > this would be the only way forward.
> > 
> > How about I revised the changelog to focus on memory offline rather than making
> > a rule that nobody should call printk() with zone->lock held?
> 
> If you are to remove the CONFIG_DEBUG_VM printk then I am all for it. I
> am still not convinced that fiddling with dump_page in the isolation
> code is justified though.

No, dump_page() there has to be fixed together for memory offline to be useful.
What's the other options it has here? By not holding zone->lock in dump_page()
from set_migratetype_isolate(), it even has a good side-effect to increase the
system throughput as dump_page() could be time-consuming. It may make the code a
bit cleaner by introducing a has_unmovable_pages_locked() version.
Michal Hocko Oct. 10, 2019, 5:30 p.m. UTC | #70
On Thu 10-10-19 10:47:38, Qian Cai wrote:
> On Thu, 2019-10-10 at 16:18 +0200, Michal Hocko wrote:
> > On Thu 10-10-19 09:11:52, Qian Cai wrote:
> > > On Thu, 2019-10-10 at 12:59 +0200, Michal Hocko wrote:
> > > > On Thu 10-10-19 05:01:44, Qian Cai wrote:
> > > > > 
> > > > > 
> > > > > > On Oct 9, 2019, at 12:23 PM, Michal Hocko <mhocko@kernel.org> wrote:
> > > > > > 
> > > > > > If this was only about the memory offline code then I would agree. But
> > > > > > we are talking about any printk from the zone->lock context and that is
> > > > > > a bigger deal. Besides that it is quite natural that the printk code
> > > > > > should be more universal and allow to be also called from the MM
> > > > > > contexts as much as possible. If there is any really strong reason this
> > > > > > is not possible then it should be documented at least.
> > > > > 
> > > > > Where is the best place to document this? I am thinking about under
> > > > > the “struct zone” definition’s lock field in mmzone.h.
> > > > 
> > > > I am not sure TBH and I do not think we have reached the state where
> > > > this would be the only way forward.
> > > 
> > > How about I revised the changelog to focus on memory offline rather than making
> > > a rule that nobody should call printk() with zone->lock held?
> > 
> > If you are to remove the CONFIG_DEBUG_VM printk then I am all for it. I
> > am still not convinced that fiddling with dump_page in the isolation
> > code is justified though.
> 
> No, dump_page() there has to be fixed together for memory offline to be useful.
> What's the other options it has here?

I would really prefer to not repeat myself
http://lkml.kernel.org/r/20191010074049.GD18412@dhcp22.suse.cz

> By not holding zone->lock in dump_page()
> from set_migratetype_isolate(), it even has a good side-effect to increase the
> system throughput as dump_page() could be time-consuming. It may make the code a
> bit cleaner by introducing a has_unmovable_pages_locked() version.

I do not see why we should really optimize this cold path.
Qian Cai Oct. 10, 2019, 5:48 p.m. UTC | #71
On Thu, 2019-10-10 at 19:30 +0200, Michal Hocko wrote:
> On Thu 10-10-19 10:47:38, Qian Cai wrote:
> > On Thu, 2019-10-10 at 16:18 +0200, Michal Hocko wrote:
> > > On Thu 10-10-19 09:11:52, Qian Cai wrote:
> > > > On Thu, 2019-10-10 at 12:59 +0200, Michal Hocko wrote:
> > > > > On Thu 10-10-19 05:01:44, Qian Cai wrote:
> > > > > > 
> > > > > > 
> > > > > > > On Oct 9, 2019, at 12:23 PM, Michal Hocko <mhocko@kernel.org> wrote:
> > > > > > > 
> > > > > > > If this was only about the memory offline code then I would agree. But
> > > > > > > we are talking about any printk from the zone->lock context and that is
> > > > > > > a bigger deal. Besides that it is quite natural that the printk code
> > > > > > > should be more universal and allow to be also called from the MM
> > > > > > > contexts as much as possible. If there is any really strong reason this
> > > > > > > is not possible then it should be documented at least.
> > > > > > 
> > > > > > Where is the best place to document this? I am thinking about under
> > > > > > the “struct zone” definition’s lock field in mmzone.h.
> > > > > 
> > > > > I am not sure TBH and I do not think we have reached the state where
> > > > > this would be the only way forward.
> > > > 
> > > > How about I revised the changelog to focus on memory offline rather than making
> > > > a rule that nobody should call printk() with zone->lock held?
> > > 
> > > If you are to remove the CONFIG_DEBUG_VM printk then I am all for it. I
> > > am still not convinced that fiddling with dump_page in the isolation
> > > code is justified though.
> > 
> > No, dump_page() there has to be fixed together for memory offline to be useful.
> > What's the other options it has here?
> 
> I would really prefer to not repeat myself
> http://lkml.kernel.org/r/20191010074049.GD18412@dhcp22.suse.cz

Care to elaborate what does that mean? I am confused on if you finally agree on
no printk() while held zone->lock or not. You said "If there is absolutely
no way around that then we might have to bite a bullet and consider some
of MM locks a land of no printk." which makes me think you agreed, but your
stance from the last reply seems you were opposite to it.

> 
> > By not holding zone->lock in dump_page()
> > from set_migratetype_isolate(), it even has a good side-effect to increase the
> > system throughput as dump_page() could be time-consuming. It may make the code a
> > bit cleaner by introducing a has_unmovable_pages_locked() version.
> 
> I do not see why we should really optimize this cold path.
Michal Hocko Oct. 10, 2019, 6:06 p.m. UTC | #72
On Thu 10-10-19 13:48:06, Qian Cai wrote:
> On Thu, 2019-10-10 at 19:30 +0200, Michal Hocko wrote:
> > On Thu 10-10-19 10:47:38, Qian Cai wrote:
> > > On Thu, 2019-10-10 at 16:18 +0200, Michal Hocko wrote:
> > > > On Thu 10-10-19 09:11:52, Qian Cai wrote:
> > > > > On Thu, 2019-10-10 at 12:59 +0200, Michal Hocko wrote:
> > > > > > On Thu 10-10-19 05:01:44, Qian Cai wrote:
> > > > > > > 
> > > > > > > 
> > > > > > > > On Oct 9, 2019, at 12:23 PM, Michal Hocko <mhocko@kernel.org> wrote:
> > > > > > > > 
> > > > > > > > If this was only about the memory offline code then I would agree. But
> > > > > > > > we are talking about any printk from the zone->lock context and that is
> > > > > > > > a bigger deal. Besides that it is quite natural that the printk code
> > > > > > > > should be more universal and allow to be also called from the MM
> > > > > > > > contexts as much as possible. If there is any really strong reason this
> > > > > > > > is not possible then it should be documented at least.
> > > > > > > 
> > > > > > > Where is the best place to document this? I am thinking about under
> > > > > > > the “struct zone” definition’s lock field in mmzone.h.
> > > > > > 
> > > > > > I am not sure TBH and I do not think we have reached the state where
> > > > > > this would be the only way forward.
> > > > > 
> > > > > How about I revised the changelog to focus on memory offline rather than making
> > > > > a rule that nobody should call printk() with zone->lock held?
> > > > 
> > > > If you are to remove the CONFIG_DEBUG_VM printk then I am all for it. I
> > > > am still not convinced that fiddling with dump_page in the isolation
> > > > code is justified though.
> > > 
> > > No, dump_page() there has to be fixed together for memory offline to be useful.
> > > What's the other options it has here?
> > 
> > I would really prefer to not repeat myself
> > http://lkml.kernel.org/r/20191010074049.GD18412@dhcp22.suse.cz
> 
> Care to elaborate what does that mean? I am confused on if you finally agree on
> no printk() while held zone->lock or not. You said "If there is absolutely
> no way around that then we might have to bite a bullet and consider some
> of MM locks a land of no printk." which makes me think you agreed, but your
> stance from the last reply seems you were opposite to it.

I really do mean that the first step is to remove the dependency from
the printk and remove any allocation from the console callbacks. If that
turns out to be infeasible then we have to bite the bullet and think of
a way to drop all printks from all locks that participate in an atomic
allocation requests.
David Hildenbrand Oct. 10, 2019, 6:59 p.m. UTC | #73
On 10.10.19 20:06, Michal Hocko wrote:
> On Thu 10-10-19 13:48:06, Qian Cai wrote:
>> On Thu, 2019-10-10 at 19:30 +0200, Michal Hocko wrote:
>>> On Thu 10-10-19 10:47:38, Qian Cai wrote:
>>>> On Thu, 2019-10-10 at 16:18 +0200, Michal Hocko wrote:
>>>>> On Thu 10-10-19 09:11:52, Qian Cai wrote:
>>>>>> On Thu, 2019-10-10 at 12:59 +0200, Michal Hocko wrote:
>>>>>>> On Thu 10-10-19 05:01:44, Qian Cai wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>>> On Oct 9, 2019, at 12:23 PM, Michal Hocko <mhocko@kernel.org> wrote:
>>>>>>>>>
>>>>>>>>> If this was only about the memory offline code then I would agree. But
>>>>>>>>> we are talking about any printk from the zone->lock context and that is
>>>>>>>>> a bigger deal. Besides that it is quite natural that the printk code
>>>>>>>>> should be more universal and allow to be also called from the MM
>>>>>>>>> contexts as much as possible. If there is any really strong reason this
>>>>>>>>> is not possible then it should be documented at least.
>>>>>>>>
>>>>>>>> Where is the best place to document this? I am thinking about under
>>>>>>>> the “struct zone” definition’s lock field in mmzone.h.
>>>>>>>
>>>>>>> I am not sure TBH and I do not think we have reached the state where
>>>>>>> this would be the only way forward.
>>>>>>
>>>>>> How about I revised the changelog to focus on memory offline rather than making
>>>>>> a rule that nobody should call printk() with zone->lock held?
>>>>>
>>>>> If you are to remove the CONFIG_DEBUG_VM printk then I am all for it. I
>>>>> am still not convinced that fiddling with dump_page in the isolation
>>>>> code is justified though.
>>>>
>>>> No, dump_page() there has to be fixed together for memory offline to be useful.
>>>> What's the other options it has here?
>>>
>>> I would really prefer to not repeat myself
>>> http://lkml.kernel.org/r/20191010074049.GD18412@dhcp22.suse.cz
>>
>> Care to elaborate what does that mean? I am confused on if you finally agree on
>> no printk() while held zone->lock or not. You said "If there is absolutely
>> no way around that then we might have to bite a bullet and consider some
>> of MM locks a land of no printk." which makes me think you agreed, but your
>> stance from the last reply seems you were opposite to it.
> 
> I really do mean that the first step is to remove the dependency from
> the printk and remove any allocation from the console callbacks. If that
> turns out to be infeasible then we have to bite the bullet and think of
> a way to drop all printks from all locks that participate in an atomic
> allocation requests.
> 

I second that and dropping the useless printk() as Michal mentioned. I 
would beg to not uglify the offlining/isolation code with __nolock 
variants or dropping locks somewhere down in a function. If everything 
fails, I rather want to see the printk's gone or returning details in a 
struct back to the caller, that can print it instead.

e.g.,

struct unmovable_page_info {
	const char *reason;
	struct page *page;
...
};

You should get the idea.

Patch
diff mbox series

diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index 1099c2fee20f..256d491151f5 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -34,7 +34,7 @@  static inline bool is_migrate_isolate(int migratetype)
 #define REPORT_FAILURE	0x2
 
 bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
-			 int migratetype, int flags);
+			 int migratetype, int flags, unsigned long irqflags);
 void set_pageblock_migratetype(struct page *page, int migratetype);
 int move_freepages_block(struct zone *zone, struct page *page,
 				int migratetype, int *num_movable);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b1be791f772d..4635ae0da48b 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1204,7 +1204,8 @@  static bool is_pageblock_removable_nolock(unsigned long pfn)
 	if (!zone_spans_pfn(zone, pfn))
 		return false;
 
-	return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, SKIP_HWPOISON);
+	return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE,
+				    SKIP_HWPOISON, 0);
 }
 
 /* Checks if this range of memory is likely to be hot-removable. */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 15c2050c629b..5352aa25b9f7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -8170,7 +8170,7 @@  void *__init alloc_large_system_hash(const char *tablename,
  * race condition. So you can't expect this function should be exact.
  */
 bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
-			 int migratetype, int flags)
+			 int migratetype, int flags, unsigned long irqflags)
 {
 	unsigned long found;
 	unsigned long iter = 0;
@@ -8276,9 +8276,11 @@  bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
 	}
 	return false;
 unmovable:
-	WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE);
-	if (flags & REPORT_FAILURE)
+	if (flags & REPORT_FAILURE) {
+		spin_unlock_irqrestore(&zone->lock, irqflags);
 		dump_page(pfn_to_page(pfn + iter), reason);
+	}
+	WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE);
 	return true;
 }
 
@@ -8588,10 +8590,6 @@  void zone_pcp_reset(struct zone *zone)
 		BUG_ON(!PageBuddy(page));
 		order = page_order(page);
 		offlined_pages += 1 << order;
-#ifdef CONFIG_DEBUG_VM
-		pr_info("remove from free list %lx %d %lx\n",
-			pfn, 1 << order, end_pfn);
-#endif
 		del_page_from_free_area(page, &zone->free_area[order]);
 		for (i = 0; i < (1 << order); i++)
 			SetPageReserved((page+i));
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 89c19c0feadb..a656c6abb5ac 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -60,7 +60,7 @@  static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
 	 * We just check MOVABLE pages.
 	 */
 	if (!has_unmovable_pages(zone, page, arg.pages_found, migratetype,
-				 isol_flags))
+				 isol_flags, flags))
 		ret = 0;
 
 	/*
@@ -81,7 +81,10 @@  static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
 		__mod_zone_freepage_state(zone, -nr_pages, mt);
 	}
 
-	spin_unlock_irqrestore(&zone->lock, flags);
+	/* It may have already been unlocked in has_unmovable_pages() above. */
+	if (!(ret && isol_flags & REPORT_FAILURE))
+		spin_unlock_irqrestore(&zone->lock, flags);
+
 	if (!ret)
 		drain_all_pages(zone);
 	return ret;