Message ID | 20200114201114.14696-1-cai@lca.pw (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [-next] mm/hotplug: silence a lockdep splat with printk() | expand |
On 14.01.20 21:11, Qian Cai wrote: > Similar to the recent commit [1] merged into the random and -next trees, > it is not a good idea to call printk() with zone->lock held. The > standard fix is to use printk_deferred() in those places, but memory > offline will call dump_page() which need to defer after the lock. While > at it, remove a similar but unnecessary debug printk() as well. > > [1] https://lore.kernel.org/lkml/1573679785-21068-1-git-send-email-cai@lca.pw/ > > Signed-off-by: Qian Cai <cai@lca.pw> > --- > include/linux/page-isolation.h | 2 +- > mm/memory_hotplug.c | 2 +- > mm/page_alloc.c | 12 +++++------- > mm/page_isolation.c | 10 +++++++++- > 4 files changed, 16 insertions(+), 10 deletions(-) > > diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h > index 148e65a9c606..5d8ba078006f 100644 > --- a/include/linux/page-isolation.h > +++ b/include/linux/page-isolation.h > @@ -34,7 +34,7 @@ static inline bool is_migrate_isolate(int migratetype) > #define REPORT_FAILURE 0x2 > > bool has_unmovable_pages(struct zone *zone, struct page *page, int migratetype, > - int flags); > + int flags, char *dump); > void set_pageblock_migratetype(struct page *page, int migratetype); > int move_freepages_block(struct zone *zone, struct page *page, > int migratetype, int *num_movable); > diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c > index 7a6de9b0dcab..f10928538fa3 100644 > --- a/mm/memory_hotplug.c > +++ b/mm/memory_hotplug.c > @@ -1149,7 +1149,7 @@ static bool is_pageblock_removable_nolock(unsigned long pfn) > return false; > > return !has_unmovable_pages(zone, page, MIGRATE_MOVABLE, > - MEMORY_OFFLINE); > + MEMORY_OFFLINE, NULL); > } > > /* Checks if this range of memory is likely to be hot-removable. */ > diff --git a/mm/page_alloc.c b/mm/page_alloc.c > index e56cd1f33242..b6bec3925e80 100644 > --- a/mm/page_alloc.c > +++ b/mm/page_alloc.c > @@ -8204,7 +8204,7 @@ void *__init alloc_large_system_hash(const char *tablename, > * race condition. So you can't expect this function should be exact. > */ > bool has_unmovable_pages(struct zone *zone, struct page *page, int migratetype, > - int flags) > + int flags, char *dump) > { > unsigned long iter = 0; > unsigned long pfn = page_to_pfn(page); > @@ -8305,8 +8305,10 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int migratetype, > return false; > unmovable: > WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE); > - if (flags & REPORT_FAILURE) > - dump_page(pfn_to_page(pfn + iter), reason); > + if (flags & REPORT_FAILURE) { > + page = pfn_to_page(pfn + iter); > + strscpy(dump, reason, 64); > + } > return true; > } > > @@ -8711,10 +8713,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) > BUG_ON(!PageBuddy(page)); > order = page_order(page); > offlined_pages += 1 << order; > -#ifdef CONFIG_DEBUG_VM > - pr_info("remove from free list %lx %d %lx\n", > - pfn, 1 << order, end_pfn); > -#endif ack to getting rid of this. Regarding the other stuff, I remember Michal had an opinion about the previous approach, so it's best if he comments.
On Tue 14-01-20 21:20:08, David Hildenbrand wrote: > On 14.01.20 21:11, Qian Cai wrote: > > Similar to the recent commit [1] merged into the random and -next trees, > > it is not a good idea to call printk() with zone->lock held. The > > standard fix is to use printk_deferred() in those places, but memory > > offline will call dump_page() which need to defer after the lock. While > > at it, remove a similar but unnecessary debug printk() as well. > > > > [1] https://lore.kernel.org/lkml/1573679785-21068-1-git-send-email-cai@lca.pw/ > > > > Signed-off-by: Qian Cai <cai@lca.pw> > > --- > > include/linux/page-isolation.h | 2 +- > > mm/memory_hotplug.c | 2 +- > > mm/page_alloc.c | 12 +++++------- > > mm/page_isolation.c | 10 +++++++++- > > 4 files changed, 16 insertions(+), 10 deletions(-) > > > > diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h > > index 148e65a9c606..5d8ba078006f 100644 > > --- a/include/linux/page-isolation.h > > +++ b/include/linux/page-isolation.h > > @@ -34,7 +34,7 @@ static inline bool is_migrate_isolate(int migratetype) > > #define REPORT_FAILURE 0x2 > > > > bool has_unmovable_pages(struct zone *zone, struct page *page, int migratetype, > > - int flags); > > + int flags, char *dump); > > void set_pageblock_migratetype(struct page *page, int migratetype); > > int move_freepages_block(struct zone *zone, struct page *page, > > int migratetype, int *num_movable); > > diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c > > index 7a6de9b0dcab..f10928538fa3 100644 > > --- a/mm/memory_hotplug.c > > +++ b/mm/memory_hotplug.c > > @@ -1149,7 +1149,7 @@ static bool is_pageblock_removable_nolock(unsigned long pfn) > > return false; > > > > return !has_unmovable_pages(zone, page, MIGRATE_MOVABLE, > > - MEMORY_OFFLINE); > > + MEMORY_OFFLINE, NULL); > > } > > > > /* Checks if this range of memory is likely to be hot-removable. */ > > diff --git a/mm/page_alloc.c b/mm/page_alloc.c > > index e56cd1f33242..b6bec3925e80 100644 > > --- a/mm/page_alloc.c > > +++ b/mm/page_alloc.c > > @@ -8204,7 +8204,7 @@ void *__init alloc_large_system_hash(const char *tablename, > > * race condition. So you can't expect this function should be exact. > > */ > > bool has_unmovable_pages(struct zone *zone, struct page *page, int migratetype, > > - int flags) > > + int flags, char *dump) > > { > > unsigned long iter = 0; > > unsigned long pfn = page_to_pfn(page); > > @@ -8305,8 +8305,10 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int migratetype, > > return false; > > unmovable: > > WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE); > > - if (flags & REPORT_FAILURE) > > - dump_page(pfn_to_page(pfn + iter), reason); > > + if (flags & REPORT_FAILURE) { > > + page = pfn_to_page(pfn + iter); > > + strscpy(dump, reason, 64); > > + } > > return true; > > } > > > > @@ -8711,10 +8713,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) > > BUG_ON(!PageBuddy(page)); > > order = page_order(page); > > offlined_pages += 1 << order; > > -#ifdef CONFIG_DEBUG_VM > > - pr_info("remove from free list %lx %d %lx\n", > > - pfn, 1 << order, end_pfn); > > -#endif > > ack to getting rid of this. Yeah and that should go into it's own patch. > Regarding the other stuff, I remember Michal had an opinion about the > previous approach, so it's best if he comments. Yeah, that was a long discussion with a lot of lockdep false positives. I believe I have made it clear that the console code shouldn't depend on memory allocation because that is just too fragile. If that is not possible for some reason then it has to be mentioned in the changelog. I really do not want us to add kludges to the MM code just because of printk deficiencies unless that is absolutely inevitable.
> On Jan 14, 2020, at 4:02 PM, Michal Hocko <mhocko@kernel.org> wrote: > > Yeah, that was a long discussion with a lot of lockdep false positives. > I believe I have made it clear that the console code shouldn't depend on > memory allocation because that is just too fragile. If that is not > possible for some reason then it has to be mentioned in the changelog. > I really do not want us to add kludges to the MM code just because of > printk deficiencies unless that is absolutely inevitable. I don’t know how to convince you, but both random number generator and printk() maintainers agreed to get ride of printk() with zone->lock held as you can see in the approved commit mentioned in this patch description because it is a whac-a-mole to fix other places. In other word, the patch alone fixes quite a few false positives and potential real deadlocks. Maybe Andrew please has a look at this directly?
> On Jan 14, 2020, at 4:02 PM, Michal Hocko <mhocko@kernel.org> wrote: >> >> Yeah, that was a long discussion with a lot of lockdep false positives. >> I believe I have made it clear that the console code shouldn't depend on >> memory allocation because that is just too fragile. If that is not >> possible for some reason then it has to be mentioned in the changelog. >> I really do not want us to add kludges to the MM code just because of >> printk deficiencies unless that is absolutely inevitable. > > I don't know how to convince you, but both random number generator and > printk() maintainers agreed to get ride of printk() with zone->lock > held as you can see in the approved commit mentioned in this patch > description because it is a whac-a-mole to fix other places. In other > word, the patch alone fixes quite a few false positives and potential > real deadlocks. Maybe Andrew please has a look at this directly? > Well, a few things. The changelog is quite poor. It doesn't describe the problem (console drivers allocating memory) not does it describe the solution (deferring the dump_page() until after release of zone->lock). So I changed it to this: : Some console drivers can perform memory allocation at inappropriate times, : which can result in lockdep warnings (and presumably deadlocks) if printk : is called with zone->lock held. : : By far the best fix is to reeducate those console drivers to not perform : these allocations, but this is proving difficult. : : Another but poorer approach is to call printk_deferred() when holding : zone->lock, but memory offline will call dump_page() which needs to defer : after the lock. : : So change has_unmovable_pages() so that it no longer calls dump_page() : itself - instead it passes the page's descripton (as a string) back to the : caller so that in the case of a has_unmovable_pages() failure, the caller : can call dump_page() after releasing zone->lock. : : While at it, remove a similar but unnecessary debug printk() as well. But I see a couple of other issues. > @@ -8290,8 +8290,10 @@ bool has_unmovable_pages(struct zone *zo > return false; > unmovable: > WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE); > - if (flags & REPORT_FAILURE) > - dump_page(pfn_to_page(pfn + iter), reason); > + if (flags & REPORT_FAILURE) { > + page = pfn_to_page(pfn + iter); This statement appears to be unnecessary. > + strscpy(dump, reason, 64); > + } Also, that whole `reason' thing in has_unmovable_pages() is just there to tell us whether it was an "unmovable page" or a "CMA page". This doesn't seem terribly useful to me. Also, I expect that the dump_page() output will permit the user to determine that it was a CMA page anyway. If not, we can change dump_page() to add that info. So how about we remove that whole `reason' thing and possibly enhance dump_page()? The patch then becomes much simpler.
> On Jan 14, 2020, at 6:53 PM, Andrew Morton <akpm@linux-foundation.org> wrote: > >> On Jan 14, 2020, at 4:02 PM, Michal Hocko <mhocko@kernel.org> wrote: >>> >>> Yeah, that was a long discussion with a lot of lockdep false positives. >>> I believe I have made it clear that the console code shouldn't depend on >>> memory allocation because that is just too fragile. If that is not >>> possible for some reason then it has to be mentioned in the changelog. >>> I really do not want us to add kludges to the MM code just because of >>> printk deficiencies unless that is absolutely inevitable. >> >> I don't know how to convince you, but both random number generator and >> printk() maintainers agreed to get ride of printk() with zone->lock >> held as you can see in the approved commit mentioned in this patch >> description because it is a whac-a-mole to fix other places. In other >> word, the patch alone fixes quite a few false positives and potential >> real deadlocks. Maybe Andrew please has a look at this directly? >> > > Well, a few things. > > The changelog is quite poor. It doesn't describe the problem (console > drivers allocating memory) not does it describe the solution > (deferring the dump_page() until after release of zone->lock). > > So I changed it to this: > > : Some console drivers can perform memory allocation at inappropriate times, > : which can result in lockdep warnings (and presumably deadlocks) if printk > : is called with zone->lock held. > : > : By far the best fix is to reeducate those console drivers to not perform > : these allocations, but this is proving difficult. … but this is proving difficult because even if we fixed that directly, lockdep Is still able to find an indirect dependency chain, for example [1] CPU1: console_owner —> port_lock_key CPU2: port_lock_key —> (&port->lock)->rlock CPU3: (&port->lock)->rlock —> zone->lock which will trigger a splat with zone->lock —> console_owner [1] https://lore.kernel.org/linux-mm/1570460350.5576.290.camel@lca.pw/ > : > : Another but poorer approach is to call printk_deferred() when holding > : zone->lock, but memory offline will call dump_page() which needs to defer > : after the lock. > : > : So change has_unmovable_pages() so that it no longer calls dump_page() > : itself - instead it passes the page's descripton (as a string) back to the > : caller so that in the case of a has_unmovable_pages() failure, the caller > : can call dump_page() after releasing zone->lock. > : > : While at it, remove a similar but unnecessary debug printk() as well. > > But I see a couple of other issues. > >> @@ -8290,8 +8290,10 @@ bool has_unmovable_pages(struct zone *zo >> return false; >> unmovable: >> WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE); >> - if (flags & REPORT_FAILURE) >> - dump_page(pfn_to_page(pfn + iter), reason); >> + if (flags & REPORT_FAILURE) { >> + page = pfn_to_page(pfn + iter); > > This statement appears to be unnecessary. dump_page() in set_migratetype_isolate() needs that “page”. > >> + strscpy(dump, reason, 64); >> + } > > > Also, that whole `reason' thing in has_unmovable_pages() is just there > to tell us whether it was an "unmovable page" or a "CMA page". This > doesn't seem terribly useful to me. Also, I expect that the > dump_page() output will permit the user to determine that it was a CMA > page anyway. If not, we can change dump_page() to add that info. > > So how about we remove that whole `reason' thing and possibly enhance > dump_page()? The patch then becomes much simpler. Sounds like a good idea. I’ll send a v2.
On Tue, 14 Jan 2020 20:02:31 -0500 Qian Cai <cai@lca.pw> wrote: > > > >> @@ -8290,8 +8290,10 @@ bool has_unmovable_pages(struct zone *zo > >> return false; > >> unmovable: > >> WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE); > >> - if (flags & REPORT_FAILURE) > >> - dump_page(pfn_to_page(pfn + iter), reason); > >> + if (flags & REPORT_FAILURE) { > >> + page = pfn_to_page(pfn + iter); > > > > This statement appears to be unnecessary. > > dump_page() in set_migratetype_isolate() needs that “page”. local var `page' is never used after this statement.
> On Jan 14, 2020, at 8:19 PM, Andrew Morton <akpm@linux-foundation.org> wrote: > > On Tue, 14 Jan 2020 20:02:31 -0500 Qian Cai <cai@lca.pw> wrote: > >> >> >>>> @@ -8290,8 +8290,10 @@ bool has_unmovable_pages(struct zone *zo >>>> return false; >>>> unmovable: >>>> WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE); >>>> - if (flags & REPORT_FAILURE) >>>> - dump_page(pfn_to_page(pfn + iter), reason); >>>> + if (flags & REPORT_FAILURE) { >>>> + page = pfn_to_page(pfn + iter); >>> >>> This statement appears to be unnecessary. >> >> dump_page() in set_migratetype_isolate() needs that “page”. > > local var `page' is never used after this statement. > The goal is to reuse the parameter of has_unmovable_pages((…, page, …) as a return value, so after it returns, dump_page(page, ...) could use it. I don’t see where it was defined as a local variable. This is probably a bit too hacky, so I’ll change has_unmovable_pages() to either return "struct page *” or NULL which is easier to understand.
On Tue 14-01-20 16:40:49, Qian Cai wrote: > > > > On Jan 14, 2020, at 4:02 PM, Michal Hocko <mhocko@kernel.org> wrote: > > > > Yeah, that was a long discussion with a lot of lockdep false positives. > > I believe I have made it clear that the console code shouldn't depend on > > memory allocation because that is just too fragile. If that is not > > possible for some reason then it has to be mentioned in the changelog. > > I really do not want us to add kludges to the MM code just because of > > printk deficiencies unless that is absolutely inevitable. > > I don’t know how to convince you, but both random number generator > and printk() maintainers agreed to get ride of printk() with > zone->lock held as you can see in the approved commit mentioned in > this patch description because it is a whac-a-mole to fix other > places. I really do not understand this argument. It is quite a specific path in the console code which cannot allocate any memory or use locks which depend on the allocation via a lock chain, right? So how come this is a whack a mole? Also any console that really needs GFP_ATOMIC to write something out is just inherently broken so it should better be fixed rather than worked around.
On Tue 2020-01-14 16:40:49, Qian Cai wrote: > > > > On Jan 14, 2020, at 4:02 PM, Michal Hocko <mhocko@kernel.org> wrote: > > > > Yeah, that was a long discussion with a lot of lockdep false positives. > > I believe I have made it clear that the console code shouldn't depend on > > memory allocation because that is just too fragile. If that is not > > possible for some reason then it has to be mentioned in the changelog. > > I really do not want us to add kludges to the MM code just because of > > printk deficiencies unless that is absolutely inevitable. > > I don’t know how to convince you, but both random number generator > and printk() maintainers agreed to get ride of printk() with > zone->lock held as you can see in the approved commit mentioned I neither acked nor blocked the fix in the random generator. I believe that it was false positive. But the fix was trivial and I did not have any better solution in the pocket. > in this patch description because it is a whac-a-mole to fix other > places. This is misleading. Using printk_deferred() in _warn_unseeded_randomness() is whack-a-mole approach as well. The most realistic real solution is to deffer consoles into kthreads. It is being discussed for years. There is finally an agreement to get this upstream. But the priority is to add lockless ringbuffer first. I could understand that Michal is against hack in -mm code that would just hide a false positive warning. If you really need a solution before the console offload gets upstream then I suggest to do something really simple. For example, disable lockdep around the allocation in console registration code that is proven to produce the false positive chain. Best Regards, Petr
> On Jan 15, 2020, at 4:52 AM, Petr Mladek <pmladek@suse.com> wrote: > > I could understand that Michal is against hack in -mm code that > would just hide a false positive warning. Well, I don’t have any confidence to say everything this patch is trying to fix is false positives. I have been spent the last a few months to research this, so I don’t feel like to do this again. https://lore.kernel.org/linux-mm/1570633715.5937.10.camel@lca.pw/ > > If you really need a solution before the console offload gets > upstream then I suggest to do something really simple. For example, > disable lockdep around the allocation in console registration code > that is proven to produce the false positive chain.
On Wed 2020-01-15 06:49:03, Qian Cai wrote: > > > > On Jan 15, 2020, at 4:52 AM, Petr Mladek <pmladek@suse.com> wrote: > > > > I could understand that Michal is against hack in -mm code that > > would just hide a false positive warning. > > Well, I don’t have any confidence to say everything this patch is > trying to fix is false positives. You look at this from a wrong angle. AFAIK, all lockdep reports pasted in the below mentioned thread were false positives. Now, this patch complicates an already complicated -mm code to hide the warning and fix theoretical problems. I suggest to disable lockdep around the safe allocation in the console initialization code. Then we will see if there are other locations that trigger this lockdep warning. It is trivial and will not complicate the code because of false positives. > I have been spent the last a few months to research this, so > I don’t feel like to do this again. > > https://lore.kernel.org/linux-mm/1570633715.5937.10.camel@lca.pw/ Have you tried to disable lockdep around the problematic allocation? Have you seen other lockdep reports caused by exactly this printk() in the allocator code? My big problem with this patch is that the commit message does not contain any lockdep report. It will complicate removing the hack when it is not longer needed. Nobody will know what was the exact problem and if it is safe to get removed. I believe that printk() will offload console handling rather sooner than later and this extra logic will not be necessary. Best Regards, Petr
> On Jan 15, 2020, at 12:02 PM, Petr Mladek <pmladek@suse.com> wrote: > > On Wed 2020-01-15 06:49:03, Qian Cai wrote: >> >> >>> On Jan 15, 2020, at 4:52 AM, Petr Mladek <pmladek@suse.com> wrote: >>> >>> I could understand that Michal is against hack in -mm code that >>> would just hide a false positive warning. >> >> Well, I don’t have any confidence to say everything this patch is >> trying to fix is false positives. > > You look at this from a wrong angle. AFAIK, all lockdep reports pasted > in the below mentioned thread were false positives. Now, this patch > complicates an already complicated -mm code to hide the warning > and fix theoretical problems. What makes you say all of those are false positives? [12471.671123] WARNING: possible circular locking dependency detected [12471.677995] 5.4.0-rc6-next-20191111+ #2 Tainted: G W L [12471.684950] ------------------------------------------------------ [12471.691819] read_all/70259 is trying to acquire lock: [12471.697559] ffff00977b407290 (&(&zone->lock)->rlock){..-.}, at: rmqueue+0xf1c/0x2050 [12471.706005] but task is already holding lock: [12471.713219] 69ff00082000fd18 (&(&n->list_lock)->rlock){-.-.}, at: list_locations+0x104/0x4b4 [12471.722364] which lock already depends on the new lock. [12471.732617] the existing dependency chain (in reverse order) is: [12471.741480] -> #4 (&(&n->list_lock)->rlock){-.-.}: [12471.749150] lock_acquire+0x320/0x360 [12471.754028] _raw_spin_lock+0x64/0x80 [12471.758903] get_partial_node+0x48/0x208 [12471.764037] ___slab_alloc+0x1b8/0x640 [12471.768997] __kmalloc+0x3c4/0x490 [12471.773623] __tty_buffer_request_room+0x118/0x1f8 [12471.779627] tty_insert_flip_string_fixed_flag+0x6c/0x144 [12471.786240] pty_write+0x80/0xd0 [12471.790680] n_tty_write+0x450/0x60c [12471.795466] tty_write+0x338/0x474 [12471.800082] __vfs_write+0x88/0x214 [12471.804780] vfs_write+0x12c/0x1a4 [12471.809393] redirected_tty_write+0x90/0xdc [12471.814790] do_loop_readv_writev+0x140/0x180 [12471.820357] do_iter_write+0xe0/0x10c [12471.825230] vfs_writev+0x134/0x1cc [12471.829929] do_writev+0xbc/0x130 [12471.834455] __arm64_sys_writev+0x58/0x8c [12471.839688] el0_svc_handler+0x170/0x240 [12471.844829] el0_sync_handler+0x150/0x250 [12471.850049] el0_sync+0x164/0x180 [12471.854572] -> #3 (&(&port->lock)->rlock){-.-.}: [12471.862057] lock_acquire+0x320/0x360 [12471.866930] _raw_spin_lock_irqsave+0x7c/0x9c [12471.872498] tty_port_tty_get+0x24/0x60 [12471.877545] tty_port_default_wakeup+0x1c/0x3c [12471.883199] tty_port_tty_wakeup+0x34/0x40 [12471.888510] uart_write_wakeup+0x28/0x44 [12471.893644] pl011_tx_chars+0x1b8/0x270 [12471.898696] pl011_start_tx+0x24/0x70 [12471.903570] __uart_start+0x5c/0x68 [12471.908269] uart_write+0x164/0x1c8 [12471.912969] do_output_char+0x33c/0x348 [12471.918016] n_tty_write+0x4bc/0x60c [12471.922802] tty_write+0x338/0x474 [12471.927414] redirected_tty_write+0xc0/0xdc [12471.932808] do_loop_readv_writev+0x140/0x180 [12471.938375] do_iter_write+0xe0/0x10c [12471.943248] vfs_writev+0x134/0x1cc [12471.947950] do_writev+0xbc/0x130 [12471.952478] __arm64_sys_writev+0x58/0x8c [12471.957700] el0_svc_handler+0x170/0x240 [12471.962833] el0_sync_handler+0x150/0x250 [12471.968053] el0_sync+0x164/0x180 [12471.972576] -> #2 (&port_lock_key){-.-.}: [12471.979453] lock_acquire+0x320/0x360 [12471.984326] _raw_spin_lock+0x64/0x80 [12471.989200] pl011_console_write+0xec/0x2cc [12471.994595] console_unlock+0x794/0x96c [12471.999641] vprintk_emit+0x260/0x31c [12472.004513] vprintk_default+0x54/0x7c [12472.009475] vprintk_func+0x218/0x254 [12472.014358] printk+0x7c/0xa4 [12472.018536] register_console+0x734/0x7b0 [12472.023757] uart_add_one_port+0x734/0x834 [12472.029065] pl011_register_port+0x6c/0xac [12472.034372] sbsa_uart_probe+0x234/0x2ec [12472.039508] platform_drv_probe+0xd4/0x124 [12472.044821] really_probe+0x250/0x71c [12472.049694] driver_probe_device+0xb4/0x200 [12472.055090] __device_attach_driver+0xd8/0x188 [12472.060744] bus_for_each_drv+0xbc/0x110 [12472.065878] __device_attach+0x120/0x220 [12472.071012] device_initial_probe+0x20/0x2c [12472.076405] bus_probe_device+0x54/0x100 [12472.081539] device_add+0xae8/0xc2c [12472.086242] platform_device_add+0x278/0x3b8 [12472.091725] platform_device_register_full+0x238/0x2ac [12472.098079] acpi_create_platform_device+0x2dc/0x3a8 [12472.104263] acpi_bus_attach+0x390/0x3cc [12472.109397] acpi_bus_attach+0x108/0x3cc [12472.114531] acpi_bus_attach+0x108/0x3cc [12472.119664] acpi_bus_attach+0x108/0x3cc [12472.124798] acpi_bus_scan+0x7c/0xb0 [12472.129588] acpi_scan_init+0xe4/0x304 [12472.134548] acpi_init+0x100/0x114 [12472.139160] do_one_initcall+0x348/0x6a0 [12472.144299] do_initcall_level+0x190/0x1fc [12472.149606] do_basic_setup+0x34/0x4c [12472.154479] kernel_init_freeable+0x19c/0x260 [12472.160051] kernel_init+0x18/0x338 [12472.164751] ret_from_fork+0x10/0x18 [12472.169534] -> #1 (console_owner){-...}: [12472.176323] lock_acquire+0x320/0x360 [12472.181196] console_lock_spinning_enable+0x6c/0x7c [12472.187284] console_unlock+0x4f8/0x96c [12472.192330] vprintk_emit+0x260/0x31c [12472.197202] vprintk_default+0x54/0x7c [12472.202162] vprintk_func+0x218/0x254 [12472.207035] printk+0x7c/0xa4 [12472.211218] get_random_u64+0x1c4/0x1dc [12472.216266] shuffle_pick_tail+0x40/0xac [12472.221408] __free_one_page+0x424/0x710 [12472.226541] free_one_page+0x70/0x120 [12472.231415] __free_pages_ok+0x61c/0xa94 [12472.236550] __free_pages_core+0x1bc/0x294 [12472.241861] memblock_free_pages+0x38/0x48 [12472.247171] __free_pages_memory+0xcc/0xfc [12472.252478] __free_memory_core+0x70/0x78 [12472.257699] free_low_memory_core_early+0x148/0x18c [12472.263787] memblock_free_all+0x18/0x54 [12472.268921] mem_init+0xb4/0x17c [12472.273360] mm_init+0x14/0x38 [12472.277625] start_kernel+0x19c/0x530 [12472.282495] -> #0 (&(&zone->lock)->rlock){..-.}: [12472.289977] validate_chain+0xf6c/0x2e2c [12472.295111] __lock_acquire+0x868/0xc2c [12472.300159] lock_acquire+0x320/0x360 [12472.305032] _raw_spin_lock_irqsave+0x7c/0x9c [12472.310599] rmqueue+0xf1c/0x2050 [12472.315128] get_page_from_freelist+0x474/0x688 [12472.320869] __alloc_pages_nodemask+0x3b4/0x18dc [12472.326707] alloc_pages_current+0xd0/0xe0 [12472.332014] __get_free_pages+0x24/0x6c [12472.337061] alloc_loc_track+0x38/0x80 [12472.342022] process_slab+0x228/0x544 [12472.346895] list_locations+0x158/0x4b4 [12472.351942] alloc_calls_show+0x38/0x48 [12472.356991] slab_attr_show+0x38/0x54 [12472.361876] sysfs_kf_seq_show+0x198/0x2d4 [12472.367184] kernfs_seq_show+0xa4/0xcc [12472.372150] seq_read+0x394/0x918 [12472.376676] kernfs_fop_read+0xa8/0x334 [12472.381722] __vfs_read+0x88/0x20c [12472.386334] vfs_read+0xdc/0x110 [12472.390773] ksys_read+0xb0/0x120 [12472.395298] __arm64_sys_read+0x54/0x88 [12472.400345] el0_svc_handler+0x170/0x240 [12472.405479] el0_sync_handler+0x150/0x250 [12472.410699] el0_sync+0x164/0x180 [12472.415223] other info that might help us debug this: [12472.425304] Chain exists of: &(&zone->lock)->rlock --> &(&port->lock)->rlock --> &(&n->list_lock)->rlock [12472.439914] Possible unsafe locking scenario: [12472.447216] CPU0 CPU1 [12472.452434] ---- ---- [12472.457650] lock(&(&n->list_lock)->rlock); [12472.462610] lock(&(&port->lock)->rlock); [12472.469914] lock(&(&n->list_lock)->rlock); [12472.477390] lock(&(&zone->lock)->rlock); [12472.482175] *** DEADLOCK *** [12472.490172] 4 locks held by read_all/70259: [12472.495041] #0: 33ff00947d9881e0 (&p->lock){+.+.}, at: seq_read+0x50/0x918 [12472.502701] #1: f9ff0095cb6e2680 (&of->mutex){+.+.}, at: kernfs_seq_start+0x34/0xf0 [12472.511141] #2: b8ff00083dc2dd08 (kn->count#48){++++}, at: kernfs_seq_start+0x44/0xf0 [12472.519756] #3: 69ff00082000fd18 (&(&n->list_lock)->rlock){-.-.}, at: list_locations+0x104/0x4b4 [12472.529325] stack backtrace: [12472.535069] CPU: 236 PID: 70259 Comm: read_all Tainted: G W L 5.4.0-rc6-next-20191111+ #2 [12472.545062] Hardware name: HPE Apollo 70 /C01_APACHE_MB , BIOS L50_5.13_1.11 06/18/2019 [12472.555489] Call trace: [12472.558626] dump_backtrace+0x0/0x248 [12472.562977] show_stack+0x20/0x2c [12472.566992] dump_stack+0xe8/0x150 [12472.571084] print_circular_bug+0x368/0x380 [12472.575957] check_noncircular+0x28c/0x294 [12472.580742] validate_chain+0xf6c/0x2e2c [12472.585355] __lock_acquire+0x868/0xc2c [12472.589882] lock_acquire+0x320/0x360 [12472.594234] _raw_spin_lock_irqsave+0x7c/0x9c [12472.599280] rmqueue+0xf1c/0x2050 [12472.603286] get_page_from_freelist+0x474/0x688 [12472.608506] __alloc_pages_nodemask+0x3b4/0x18dc [12472.613813] alloc_pages_current+0xd0/0xe0 [12472.618600] __get_free_pages+0x24/0x6c [12472.623126] alloc_loc_track+0x38/0x80 [12472.627565] process_slab+0x228/0x544 [12472.631917] list_locations+0x158/0x4b4 [12472.636444] alloc_calls_show+0x38/0x48 [12472.640969] slab_attr_show+0x38/0x54 [12472.645322] sysfs_kf_seq_show+0x198/0x2d4 [12472.650108] kernfs_seq_show+0xa4/0xcc [12472.654547] seq_read+0x394/0x918 [12472.658552] kernfs_fop_read+0xa8/0x334 [12472.663078] __vfs_read+0x88/0x20c [12472.667169] vfs_read+0xdc/0x110 [12472.671087] ksys_read+0xb0/0x120 [12472.675091] __arm64_sys_read+0x54/0x88 [12472.679618] el0_svc_handler+0x170/0x240 [12472.684231] el0_sync_handler+0x150/0x250 [12472.688929] el0_sync+0x164/0x180 the existing dependency chain (in reverse order) is: -> #4 (&pool->lock/1){-.-.}: lock_acquire+0x320/0x360 _raw_spin_lock+0x64/0x80 __queue_work+0x4b4/0xa10 queue_work_on+0xac/0x11c tty_schedule_flip+0x84/0xbc tty_flip_buffer_push+0x1c/0x28 pty_write+0x98/0xd0 n_tty_write+0x450/0x60c tty_write+0x338/0x474 __vfs_write+0x88/0x214 vfs_write+0x12c/0x1a4 redirected_tty_write+0x90/0xdc do_loop_readv_writev+0x140/0x180 do_iter_write+0xe0/0x10c vfs_writev+0x134/0x1cc do_writev+0xbc/0x130 __arm64_sys_writev+0x58/0x8c el0_svc_handler+0x170/0x240 el0_sync_handler+0x150/0x250 el0_sync+0x164/0x180 -> #3 (&(&port->lock)->rlock){-.-.}: lock_acquire+0x320/0x360 _raw_spin_lock_irqsave+0x7c/0x9c tty_port_tty_get+0x24/0x60 tty_port_default_wakeup+0x1c/0x3c tty_port_tty_wakeup+0x34/0x40 uart_write_wakeup+0x28/0x44 pl011_tx_chars+0x1b8/0x270 pl011_start_tx+0x24/0x70 __uart_start+0x5c/0x68 uart_write+0x164/0x1c8 do_output_char+0x33c/0x348 n_tty_write+0x4bc/0x60c tty_write+0x338/0x474 redirected_tty_write+0xc0/0xdc do_loop_readv_writev+0x140/0x180 do_iter_write+0xe0/0x10c vfs_writev+0x134/0x1cc do_writev+0xbc/0x130 __arm64_sys_writev+0x58/0x8c el0_svc_handler+0x170/0x240 el0_sync_handler+0x150/0x250 el0_sync+0x164/0x180 -> #2 (&port_lock_key){-.-.}: lock_acquire+0x320/0x360 _raw_spin_lock+0x64/0x80 pl011_console_write+0xec/0x2cc console_unlock+0x794/0x96c vprintk_emit+0x260/0x31c vprintk_default+0x54/0x7c vprintk_func+0x218/0x254 printk+0x7c/0xa4 register_console+0x734/0x7b0 uart_add_one_port+0x734/0x834 pl011_register_port+0x6c/0xac sbsa_uart_probe+0x234/0x2ec platform_drv_probe+0xd4/0x124 really_probe+0x250/0x71c driver_probe_device+0xb4/0x200 __device_attach_driver+0xd8/0x188 bus_for_each_drv+0xbc/0x110 __device_attach+0x120/0x220 device_initial_probe+0x20/0x2c bus_probe_device+0x54/0x100 device_add+0xae8/0xc2c platform_device_add+0x278/0x3b8 platform_device_register_full+0x238/0x2ac acpi_create_platform_device+0x2dc/0x3a8 acpi_bus_attach+0x390/0x3cc acpi_bus_attach+0x108/0x3cc acpi_bus_attach+0x108/0x3cc acpi_bus_attach+0x108/0x3cc acpi_bus_scan+0x7c/0xb0 acpi_scan_init+0xe4/0x304 acpi_init+0x100/0x114 do_one_initcall+0x348/0x6a0 do_initcall_level+0x190/0x1fc do_basic_setup+0x34/0x4c kernel_init_freeable+0x19c/0x260 kernel_init+0x18/0x338 ret_from_fork+0x10/0x18 -> #1 (console_owner){-...}: lock_acquire+0x320/0x360 console_lock_spinning_enable+0x6c/0x7c console_unlock+0x4f8/0x96c vprintk_emit+0x260/0x31c vprintk_default+0x54/0x7c vprintk_func+0x218/0x254 printk+0x7c/0xa4 get_random_u64+0x1c4/0x1dc shuffle_pick_tail+0x40/0xac __free_one_page+0x424/0x710 free_one_page+0x70/0x120 __free_pages_ok+0x61c/0xa94 __free_pages_core+0x1bc/0x294 memblock_free_pages+0x38/0x48 __free_pages_memory+0xcc/0xfc __free_memory_core+0x70/0x78 free_low_memory_core_early+0x148/0x18c memblock_free_all+0x18/0x54 mem_init+0xb4/0x17c mm_init+0x14/0x38 start_kernel+0x19c/0x530 -> #0 (&(&zone->lock)->rlock){..-.}: validate_chain+0xf6c/0x2e2c __lock_acquire+0x868/0xc2c lock_acquire+0x320/0x360 _raw_spin_lock+0x64/0x80 rmqueue+0x138/0x2050 get_page_from_freelist+0x474/0x688 __alloc_pages_nodemask+0x3b4/0x18dc alloc_pages_current+0xd0/0xe0 alloc_slab_page+0x2b4/0x5e0 new_slab+0xc8/0x6bc ___slab_alloc+0x3b8/0x640 kmem_cache_alloc+0x4b4/0x588 __debug_object_init+0x778/0x8b4 debug_object_init_on_stack+0x40/0x50 start_flush_work+0x16c/0x3f0 __flush_work+0xb8/0x124 flush_work+0x20/0x30 xlog_cil_force_lsn+0x88/0x204 [xfs] xfs_log_force_lsn+0x128/0x1b8 [xfs] xfs_file_fsync+0x3c4/0x488 [xfs] vfs_fsync_range+0xb0/0xd0 generic_write_sync+0x80/0xa0 [xfs] xfs_file_buffered_aio_write+0x66c/0x6e4 [xfs] xfs_file_write_iter+0x1a0/0x218 [xfs] __vfs_write+0x1cc/0x214 vfs_write+0x12c/0x1a4 ksys_write+0xb0/0x120 __arm64_sys_write+0x54/0x88 el0_svc_handler+0x170/0x240 el0_sync_handler+0x150/0x250 el0_sync+0x164/0x180 other info that might help us debug this: Chain exists of: &(&zone->lock)->rlock --> &(&port->lock)->rlock --> &pool->lock/1 Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&pool->lock/1); lock(&(&port->lock)->rlock); lock(&pool->lock/1); lock(&(&zone->lock)->rlock); *** DEADLOCK *** 4 locks held by doio/49441: #0: a0ff00886fc27408 (sb_writers#8){.+.+}, at: vfs_write+0x118/0x1a4 #1: 8fff00080810dfe0 (&xfs_nondir_ilock_class){++++}, at: xfs_ilock+0x2a8/0x300 [xfs] #2: ffff9000129f2390 (rcu_read_lock){....}, at: rcu_lock_acquire+0x8/0x38 #3: 60ff000822352818 (&pool->lock/1){-.-.}, at: start_flush_work+0xd8/0x3f0 stack backtrace: CPU: 48 PID: 49441 Comm: doio Tainted: G W Hardware name: HPE Apollo 70 /C01_APACHE_MB , BIOS L50_5.13_1.11 06/18/2019 Call trace: dump_backtrace+0x0/0x248 show_stack+0x20/0x2c dump_stack+0xe8/0x150 print_circular_bug+0x368/0x380 check_noncircular+0x28c/0x294 validate_chain+0xf6c/0x2e2c __lock_acquire+0x868/0xc2c lock_acquire+0x320/0x360 _raw_spin_lock+0x64/0x80 rmqueue+0x138/0x2050 get_page_from_freelist+0x474/0x688 __alloc_pages_nodemask+0x3b4/0x18dc alloc_pages_current+0xd0/0xe0 alloc_slab_page+0x2b4/0x5e0 new_slab+0xc8/0x6bc ___slab_alloc+0x3b8/0x640 kmem_cache_alloc+0x4b4/0x588 __debug_object_init+0x778/0x8b4 debug_object_init_on_stack+0x40/0x50 start_flush_work+0x16c/0x3f0 __flush_work+0xb8/0x124 flush_work+0x20/0x30 xlog_cil_force_lsn+0x88/0x204 [xfs] xfs_log_force_lsn+0x128/0x1b8 [xfs] xfs_file_fsync+0x3c4/0x488 [xfs] vfs_fsync_range+0xb0/0xd0 generic_write_sync+0x80/0xa0 [xfs] xfs_file_buffered_aio_write+0x66c/0x6e4 [xfs] xfs_file_write_iter+0x1a0/0x218 [xfs] __vfs_write+0x1cc/0x214 vfs_write+0x12c/0x1a4 ksys_write+0xb0/0x120 __arm64_sys_write+0x54/0x88 el0_svc_handler+0x170/0x240 el0_sync_handler+0x150/0x250 el0_sync+0x164/0x180 WARNING: possible circular locking dependency detected 5.3.0-next-20190917 #8 Not tainted ------------------------------------------------------ test.sh/8653 is trying to acquire lock: ffffffff865a4460 (console_owner){-.-.}, at: console_unlock+0x207/0x750 but task is already holding lock: ffff88883fff3c58 (&(&zone->lock)->rlock){-.-.}, at: __offline_isolated_pages+0x179/0x3e0 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #3 (&(&zone->lock)->rlock){-.-.}: __lock_acquire+0x5b3/0xb40 lock_acquire+0x126/0x280 _raw_spin_lock+0x2f/0x40 rmqueue_bulk.constprop.21+0xb6/0x1160 get_page_from_freelist+0x898/0x22c0 __alloc_pages_nodemask+0x2f3/0x1cd0 alloc_pages_current+0x9c/0x110 allocate_slab+0x4c6/0x19c0 new_slab+0x46/0x70 ___slab_alloc+0x58b/0x960 __slab_alloc+0x43/0x70 __kmalloc+0x3ad/0x4b0 __tty_buffer_request_room+0x100/0x250 tty_insert_flip_string_fixed_flag+0x67/0x110 pty_write+0xa2/0xf0 n_tty_write+0x36b/0x7b0 tty_write+0x284/0x4c0 __vfs_write+0x50/0xa0 vfs_write+0x105/0x290 redirected_tty_write+0x6a/0xc0 do_iter_write+0x248/0x2a0 vfs_writev+0x106/0x1e0 do_writev+0xd4/0x180 __x64_sys_writev+0x45/0x50 do_syscall_64+0xcc/0x76c entry_SYSCALL_64_after_hwframe+0x49/0xbe -> #2 (&(&port->lock)->rlock){-.-.}: __lock_acquire+0x5b3/0xb40 lock_acquire+0x126/0x280 _raw_spin_lock_irqsave+0x3a/0x50 tty_port_tty_get+0x20/0x60 tty_port_default_wakeup+0xf/0x30 tty_port_tty_wakeup+0x39/0x40 uart_write_wakeup+0x2a/0x40 serial8250_tx_chars+0x22e/0x440 serial8250_handle_irq.part.8+0x14a/0x170 serial8250_default_handle_irq+0x5c/0x90 serial8250_interrupt+0xa6/0x130 __handle_irq_event_percpu+0x78/0x4f0 handle_irq_event_percpu+0x70/0x100 handle_irq_event+0x5a/0x8b handle_edge_irq+0x117/0x370 do_IRQ+0x9e/0x1e0 ret_from_intr+0x0/0x2a cpuidle_enter_state+0x156/0x8e0 cpuidle_enter+0x41/0x70 call_cpuidle+0x5e/0x90 do_idle+0x333/0x370 cpu_startup_entry+0x1d/0x1f start_secondary+0x290/0x330 secondary_startup_64+0xb6/0xc0 -> #1 (&port_lock_key){-.-.}: __lock_acquire+0x5b3/0xb40 lock_acquire+0x126/0x280 _raw_spin_lock_irqsave+0x3a/0x50 serial8250_console_write+0x3e4/0x450 univ8250_console_write+0x4b/0x60 console_unlock+0x501/0x750 vprintk_emit+0x10d/0x340 vprintk_default+0x1f/0x30 vprintk_func+0x44/0xd4 printk+0x9f/0xc5 -> #0 (console_owner){-.-.}: check_prev_add+0x107/0xea0 validate_chain+0x8fc/0x1200 __lock_acquire+0x5b3/0xb40 lock_acquire+0x126/0x280 console_unlock+0x269/0x750 vprintk_emit+0x10d/0x340 vprintk_default+0x1f/0x30 vprintk_func+0x44/0xd4 printk+0x9f/0xc5 __offline_isolated_pages.cold.52+0x2f/0x30a offline_isolated_pages_cb+0x17/0x30 walk_system_ram_range+0xda/0x160 __offline_pages+0x79c/0xa10 offline_pages+0x11/0x20 memory_subsys_offline+0x7e/0xc0 device_offline+0xd5/0x110 state_store+0xc6/0xe0 dev_attr_store+0x3f/0x60 sysfs_kf_write+0x89/0xb0 kernfs_fop_write+0x188/0x240 __vfs_write+0x50/0xa0 vfs_write+0x105/0x290 ksys_write+0xc6/0x160 __x64_sys_write+0x43/0x50 do_syscall_64+0xcc/0x76c entry_SYSCALL_64_after_hwframe+0x49/0xbe other info that might help us debug this: Chain exists of: console_owner --> &(&port->lock)->rlock --> &(&zone->lock)- >rlock Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&(&zone->lock)->rlock); lock(&(&port->lock)->rlock); lock(&(&zone->lock)->rlock); lock(console_owner); *** DEADLOCK *** 9 locks held by test.sh/8653: #0: ffff88839ba7d408 (sb_writers#4){.+.+}, at: vfs_write+0x25f/0x290 #1: ffff888277618880 (&of->mutex){+.+.}, at: kernfs_fop_write+0x128/0x240 #2: ffff8898131fc218 (kn->count#115){.+.+}, at: kernfs_fop_write+0x138/0x240 #3: ffffffff86962a80 (device_hotplug_lock){+.+.}, at: lock_device_hotplug_sysfs+0x16/0x50 #4: ffff8884374f4990 (&dev->mutex){....}, at: device_offline+0x70/0x110 #5: ffffffff86515250 (cpu_hotplug_lock.rw_sem){++++}, at: __offline_pages+0xbf/0xa10 #6: ffffffff867405f0 (mem_hotplug_lock.rw_sem){++++}, at: percpu_down_write+0x87/0x2f0 #7: ffff88883fff3c58 (&(&zone->lock)->rlock){-.-.}, at: __offline_isolated_pages+0x179/0x3e0 #8: ffffffff865a4920 (console_lock){+.+.}, at: vprintk_emit+0x100/0x340 stack backtrace: CPU: 1 PID: 8653 Comm: test.sh Not tainted 5.3.0-next-20190917 #8 Hardware name: HPE ProLiant DL560 Gen10/ProLiant DL560 Gen10, BIOS U34 05/21/2019 Call Trace: dump_stack+0x86/0xca print_circular_bug.cold.31+0x243/0x26e check_noncircular+0x29e/0x2e0 check_prev_add+0x107/0xea0 validate_chain+0x8fc/0x1200 __lock_acquire+0x5b3/0xb40 lock_acquire+0x126/0x280 console_unlock+0x269/0x750 vprintk_emit+0x10d/0x340 vprintk_default+0x1f/0x30 vprintk_func+0x44/0xd4 printk+0x9f/0xc5 __offline_isolated_pages.cold.52+0x2f/0x30a offline_isolated_pages_cb+0x17/0x30 walk_system_ram_range+0xda/0x160 __offline_pages+0x79c/0xa10 offline_pages+0x11/0x20 memory_subsys_offline+0x7e/0xc0 device_offline+0xd5/0x110 state_store+0xc6/0xe0 dev_attr_store+0x3f/0x60 sysfs_kf_write+0x89/0xb0 kernfs_fop_write+0x188/0x240 __vfs_write+0x50/0xa0 vfs_write+0x105/0x290 ksys_write+0xc6/0x160 __x64_sys_write+0x43/0x50 do_syscall_64+0xcc/0x76c entry_SYSCALL_64_after_hwframe+0x49/0xbe > > I suggest to disable lockdep around the safe allocation in the console > initialization code. Then we will see if there are other locations > that trigger this lockdep warning. It is trivial and will not > complicate the code because of false positives. > > >> I have been spent the last a few months to research this, so >> I don’t feel like to do this again. >> >> https://lore.kernel.org/linux-mm/1570633715.5937.10.camel@lca.pw/ > > Have you tried to disable lockdep around the problematic allocation? > > Have you seen other lockdep reports caused by exactly this printk() > in the allocator code? > > My big problem with this patch is that the commit message does not > contain any lockdep report. It will complicate removing the hack > when it is not longer needed. Nobody will know what was the exact > problem and if it is safe to get removed. I believe that printk() > will offload console handling rather sooner than later and this > extra logic will not be necessary. > > Best Regards, > Petr
On Wed 2020-01-15 12:16:17, Qian Cai wrote: > > > > On Jan 15, 2020, at 12:02 PM, Petr Mladek <pmladek@suse.com> wrote: > > > > On Wed 2020-01-15 06:49:03, Qian Cai wrote: > >> > >> > >>> On Jan 15, 2020, at 4:52 AM, Petr Mladek <pmladek@suse.com> wrote: > >>> > >>> I could understand that Michal is against hack in -mm code that > >>> would just hide a false positive warning. > >> > >> Well, I don’t have any confidence to say everything this patch is > >> trying to fix is false positives. > > > > You look at this from a wrong angle. AFAIK, all lockdep reports pasted > > in the below mentioned thread were false positives. Now, this patch > > complicates an already complicated -mm code to hide the warning > > and fix theoretical problems. > > What makes you say all of those are false positives? I have to admit that the 3 provided lockdep reports really looked suspicious. I must have somehow missed/forgot about them. If the last proposed change removes them and is acceptable for -mm people then it looks like a reasonable solution. Best Regards, Petr
diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 148e65a9c606..5d8ba078006f 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -34,7 +34,7 @@ static inline bool is_migrate_isolate(int migratetype) #define REPORT_FAILURE 0x2 bool has_unmovable_pages(struct zone *zone, struct page *page, int migratetype, - int flags); + int flags, char *dump); void set_pageblock_migratetype(struct page *page, int migratetype); int move_freepages_block(struct zone *zone, struct page *page, int migratetype, int *num_movable); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 7a6de9b0dcab..f10928538fa3 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1149,7 +1149,7 @@ static bool is_pageblock_removable_nolock(unsigned long pfn) return false; return !has_unmovable_pages(zone, page, MIGRATE_MOVABLE, - MEMORY_OFFLINE); + MEMORY_OFFLINE, NULL); } /* Checks if this range of memory is likely to be hot-removable. */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e56cd1f33242..b6bec3925e80 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8204,7 +8204,7 @@ void *__init alloc_large_system_hash(const char *tablename, * race condition. So you can't expect this function should be exact. */ bool has_unmovable_pages(struct zone *zone, struct page *page, int migratetype, - int flags) + int flags, char *dump) { unsigned long iter = 0; unsigned long pfn = page_to_pfn(page); @@ -8305,8 +8305,10 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int migratetype, return false; unmovable: WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE); - if (flags & REPORT_FAILURE) - dump_page(pfn_to_page(pfn + iter), reason); + if (flags & REPORT_FAILURE) { + page = pfn_to_page(pfn + iter); + strscpy(dump, reason, 64); + } return true; } @@ -8711,10 +8713,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) BUG_ON(!PageBuddy(page)); order = page_order(page); offlined_pages += 1 << order; -#ifdef CONFIG_DEBUG_VM - pr_info("remove from free list %lx %d %lx\n", - pfn, 1 << order, end_pfn); -#endif del_page_from_free_area(page, &zone->free_area[order]); pfn += (1 << order); } diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 1f8b9dfecbe8..ce0fe3c1ceff 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -20,6 +20,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ struct zone *zone; unsigned long flags; int ret = -EBUSY; + char dump[64]; zone = page_zone(page); @@ -37,7 +38,8 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. * We just check MOVABLE pages. */ - if (!has_unmovable_pages(zone, page, migratetype, isol_flags)) { + if (!has_unmovable_pages(zone, page, migratetype, isol_flags, + dump)) { unsigned long nr_pages; int mt = get_pageblock_migratetype(page); @@ -54,6 +56,12 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ spin_unlock_irqrestore(&zone->lock, flags); if (!ret) drain_all_pages(zone); + else if (isol_flags & REPORT_FAILURE) + /* + * printk() with zone->lock held will guarantee to trigger a + * lockdep splat, so defer it here. + */ + dump_page(page, dump); return ret; }
Similar to the recent commit [1] merged into the random and -next trees, it is not a good idea to call printk() with zone->lock held. The standard fix is to use printk_deferred() in those places, but memory offline will call dump_page() which need to defer after the lock. While at it, remove a similar but unnecessary debug printk() as well. [1] https://lore.kernel.org/lkml/1573679785-21068-1-git-send-email-cai@lca.pw/ Signed-off-by: Qian Cai <cai@lca.pw> --- include/linux/page-isolation.h | 2 +- mm/memory_hotplug.c | 2 +- mm/page_alloc.c | 12 +++++------- mm/page_isolation.c | 10 +++++++++- 4 files changed, 16 insertions(+), 10 deletions(-)