diff mbox series

[RFC,4/5] mm, memory_hotplug: print reason for the offlining failure

Message ID 20181107101830.17405-5-mhocko@kernel.org (mailing list archive)
State New, archived
Headers show
Series mm, memory_hotplug: improve memory offlining failures debugging | expand

Commit Message

Michal Hocko Nov. 7, 2018, 10:18 a.m. UTC
From: Michal Hocko <mhocko@suse.com>

The memory offlining failure reporting is inconsistent and insufficient.
Some error paths simply do not report the failure to the log at all.
When we do report a failure, there are no details about the reason for it,
and since there are several possible reasons, memory offlining failures
are hard to debug.

Make sure that the
	memory offlining [mem %#010llx-%#010llx] failed
message is printed for all failures and also provide a short textual
reason for the failure e.g.

[ 1984.506184] rac1 kernel: memory offlining [mem 0x82600000000-0x8267fffffff] failed due to signal backoff

This tells us that the offlining has failed because of a pending signal,
i.e. user intervention.

Signed-off-by: Michal Hocko <mhocko@suse.com>
---
 mm/memory_hotplug.c | 34 +++++++++++++++++++++++-----------
 1 file changed, 23 insertions(+), 11 deletions(-)

Comments

Andrew Morton Nov. 7, 2018, 10:04 p.m. UTC | #1
On Wed,  7 Nov 2018 11:18:29 +0100 Michal Hocko <mhocko@kernel.org> wrote:

> From: Michal Hocko <mhocko@suse.com>
> 
> The memory offlining failure reporting is inconsistent and insufficient.
> Some error paths simply do not report the failure to the log at all.
> When we do report there are no details about the reason of the failure
> and there are several of them which makes memory offlining failures
> hard to debug.
> 
> Make sure that the
> 	memory offlining [mem %#010llx-%#010llx] failed
> message is printed for all failures and also provide a short textual
> reason for the failure e.g.
> 
> [ 1984.506184] rac1 kernel: memory offlining [mem 0x82600000000-0x8267fffffff] failed due to signal backoff
> 
> this tells us that the offlining has failed because of a signal pending
> aka user intervention.
> 
> ...

Some of these messages will come out looking a bit odd.

> @@ -1573,7 +1576,8 @@ static int __ref __offline_pages(unsigned long start_pfn,
>  				       MIGRATE_MOVABLE, true);
>  	if (ret) {
>  		mem_hotplug_done();
> -		return ret;
> +		reason = "failed to isolate range";

"memory offlining [mem ...] failed due to failed to isolate range"

> +		goto failed_removal
>  	}
>  
>  	arg.start_pfn = start_pfn;
> @@ -1582,15 +1586,19 @@ static int __ref __offline_pages(unsigned long start_pfn,
>  
>  	ret = memory_notify(MEM_GOING_OFFLINE, &arg);
>  	ret = notifier_to_errno(ret);
> -	if (ret)
> -		goto failed_removal;
> +	if (ret) {
> +		reason = "notifiers failure";

"memory offlining [mem ...] failed due to notifiers failure"

> @@ -1607,8 +1615,10 @@ static int __ref __offline_pages(unsigned long start_pfn,
>  	 * actually in order to make hugetlbfs's object counting consistent.
>  	 */
>  	ret = dissolve_free_huge_pages(start_pfn, end_pfn);
> -	if (ret)
> -		goto failed_removal;
> +	if (ret) {
> +		reason = "fails to disolve hugetlb pages";

"memory offlining [mem ...] failed due to fails to disolve hugetlb pages"


Fix:

--- a/mm/memory_hotplug.c~mm-memory_hotplug-print-reason-for-the-offlining-failure-fix
+++ a/mm/memory_hotplug.c
@@ -1576,7 +1576,7 @@ static int __ref __offline_pages(unsigne
 				       MIGRATE_MOVABLE, true);
 	if (ret) {
 		mem_hotplug_done();
-		reason = "failed to isolate range";
+		reason = "failure to isolate range";
 		goto failed_removal
 	}
 
@@ -1587,7 +1587,7 @@ static int __ref __offline_pages(unsigne
 	ret = memory_notify(MEM_GOING_OFFLINE, &arg);
 	ret = notifier_to_errno(ret);
 	if (ret) {
-		reason = "notifiers failure";
+		reason = "notifier failure";
 		goto failed_removal_isolated;
 	}
 
@@ -1616,7 +1616,7 @@ repeat:
 	 */
 	ret = dissolve_free_huge_pages(start_pfn, end_pfn);
 	if (ret) {
-		reason = "fails to disolve hugetlb pages";
+		reason = "failure to dissolve huge pages";
 		goto failed_removal_isolated;
 	}
 	/* check again */
Anshuman Khandual Nov. 8, 2018, 6:23 a.m. UTC | #2
On 11/07/2018 03:48 PM, Michal Hocko wrote:
> From: Michal Hocko <mhocko@suse.com>
> 
> The memory offlining failure reporting is inconsistent and insufficient.
> Some error paths simply do not report the failure to the log at all.
> When we do report there are no details about the reason of the failure
> and there are several of them which makes memory offlining failures
> hard to debug.
> 
> Make sure that the
> 	memory offlining [mem %#010llx-%#010llx] failed
> message is printed for all failures and also provide a short textual
> reason for the failure e.g.
> 
> [ 1984.506184] rac1 kernel: memory offlining [mem 0x82600000000-0x8267fffffff] failed due to signal backoff
> 
> this tells us that the offlining has failed because of a signal pending
> aka user intervention.
> 
> Signed-off-by: Michal Hocko <mhocko@suse.com>

It might help to enumerate these failure reason strings and use macros.
Michal Hocko Nov. 8, 2018, 7:59 a.m. UTC | #3
On Thu 08-11-18 11:53:21, Anshuman Khandual wrote:
> 
> 
> On 11/07/2018 03:48 PM, Michal Hocko wrote:
> > From: Michal Hocko <mhocko@suse.com>
> > 
> > The memory offlining failure reporting is inconsistent and insufficient.
> > Some error paths simply do not report the failure to the log at all.
> > When we do report there are no details about the reason of the failure
> > and there are several of them which makes memory offlining failures
> > hard to debug.
> > 
> > Make sure that the
> > 	memory offlining [mem %#010llx-%#010llx] failed
> > message is printed for all failures and also provide a short textual
> > reason for the failure e.g.
> > 
> > [ 1984.506184] rac1 kernel: memory offlining [mem 0x82600000000-0x8267fffffff] failed due to signal backoff
> > 
> > this tells us that the offlining has failed because of a signal pending
> > aka user intervention.
> > 
> > Signed-off-by: Michal Hocko <mhocko@suse.com>
> 
> It might help to enumerate these failure reason strings and use macros.

Does it really make sense when all of them are one-off things? I would
agree if they were reused somewhere.
Michal Hocko Nov. 8, 2018, 8:01 a.m. UTC | #4
On Wed 07-11-18 14:04:13, Andrew Morton wrote:
[...]
> Fix:
> 
> --- a/mm/memory_hotplug.c~mm-memory_hotplug-print-reason-for-the-offlining-failure-fix
> +++ a/mm/memory_hotplug.c
> @@ -1576,7 +1576,7 @@ static int __ref __offline_pages(unsigne
>  				       MIGRATE_MOVABLE, true);
>  	if (ret) {
>  		mem_hotplug_done();
> -		reason = "failed to isolate range";
> +		reason = "failure to isolate range";
>  		goto failed_removal
>  	}
>  
> @@ -1587,7 +1587,7 @@ static int __ref __offline_pages(unsigne
>  	ret = memory_notify(MEM_GOING_OFFLINE, &arg);
>  	ret = notifier_to_errno(ret);
>  	if (ret) {
> -		reason = "notifiers failure";
> +		reason = "notifier failure";
>  		goto failed_removal_isolated;
>  	}
>  
> @@ -1616,7 +1616,7 @@ repeat:
>  	 */
>  	ret = dissolve_free_huge_pages(start_pfn, end_pfn);
>  	if (ret) {
> -		reason = "fails to disolve hugetlb pages";
> +		reason = "failure to dissolve huge pages";
>  		goto failed_removal_isolated;
>  	}
>  	/* check again */
> _
> 

LGTM, thanks!
Michal Hocko Nov. 13, 2018, 8:02 a.m. UTC | #5
On Wed 07-11-18 14:04:13, Andrew Morton wrote:
> On Wed,  7 Nov 2018 11:18:29 +0100 Michal Hocko <mhocko@kernel.org> wrote:
> 
> > From: Michal Hocko <mhocko@suse.com>
> > 
> > The memory offlining failure reporting is inconsistent and insufficient.
> > Some error paths simply do not report the failure to the log at all.
> > When we do report there are no details about the reason of the failure
> > and there are several of them which makes memory offlining failures
> > hard to debug.
> > 
> > Make sure that the
> > 	memory offlining [mem %#010llx-%#010llx] failed
> > message is printed for all failures and also provide a short textual
> > reason for the failure e.g.
> > 
> > [ 1984.506184] rac1 kernel: memory offlining [mem 0x82600000000-0x8267fffffff] failed due to signal backoff
> > 
> > this tells us that the offlining has failed because of a signal pending
> > aka user intervention.
> > 
> > ...
> 
> Some of these messages will come out looking a bit odd.
> 
> > @@ -1573,7 +1576,8 @@ static int __ref __offline_pages(unsigned long start_pfn,
> >  				       MIGRATE_MOVABLE, true);
> >  	if (ret) {
> >  		mem_hotplug_done();
> > -		return ret;
> > +		reason = "failed to isolate range";
> 
> "memory offlining [mem ...] failed due to failed to isolate range"
> 
> > +		goto failed_removal
> >  	}
> >  
> >  	arg.start_pfn = start_pfn;
> > @@ -1582,15 +1586,19 @@ static int __ref __offline_pages(unsigned long start_pfn,
> >  
> >  	ret = memory_notify(MEM_GOING_OFFLINE, &arg);
> >  	ret = notifier_to_errno(ret);
> > -	if (ret)
> > -		goto failed_removal;
> > +	if (ret) {
> > +		reason = "notifiers failure";
> 
> "memory offlining [mem ...] failed due to notifiers failure"
> 
> > @@ -1607,8 +1615,10 @@ static int __ref __offline_pages(unsigned long start_pfn,
> >  	 * actually in order to make hugetlbfs's object counting consistent.
> >  	 */
> >  	ret = dissolve_free_huge_pages(start_pfn, end_pfn);
> > -	if (ret)
> > -		goto failed_removal;
> > +	if (ret) {
> > +		reason = "fails to disolve hugetlb pages";
> 
> "memory offlining [mem ...] failed due to fails to disolve hugetlb pages"
> 
> 
> Fix:
> 
> --- a/mm/memory_hotplug.c~mm-memory_hotplug-print-reason-for-the-offlining-failure-fix
> +++ a/mm/memory_hotplug.c
> @@ -1576,7 +1576,7 @@ static int __ref __offline_pages(unsigne
>  				       MIGRATE_MOVABLE, true);
>  	if (ret) {
>  		mem_hotplug_done();
> -		reason = "failed to isolate range";
> +		reason = "failure to isolate range";
>  		goto failed_removal
>  	}

0day has noticed the missing ; here.

Andrew, could you pick up the follow-up fix please?


commit 614212af5c20126aea1edaceb78aa586e19802cf
Author: Michal Hocko <mhocko@suse.com>
Date:   Tue Nov 13 09:01:50 2018 +0100

    fold me "mm, memory_hotplug: print reason for the offlining failure"

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index f5f1b2a27cb3..c82193db4be6 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1581,7 +1581,7 @@ static int __ref __offline_pages(unsigned long start_pfn,
 	if (ret) {
 		mem_hotplug_done();
 		reason = "failure to isolate range";
-		goto failed_removal
+		goto failed_removal;
 	}
 
 	arg.start_pfn = start_pfn;
diff mbox series

Patch

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a92b1b8f6218..1badac89c58e 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1553,6 +1553,7 @@  static int __ref __offline_pages(unsigned long start_pfn,
 	unsigned long valid_start, valid_end;
 	struct zone *zone;
 	struct memory_notify arg;
+	char *reason;
 
 	mem_hotplug_begin();
 
@@ -1561,7 +1562,9 @@  static int __ref __offline_pages(unsigned long start_pfn,
 	if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start,
 				  &valid_end)) {
 		mem_hotplug_done();
-		return -EINVAL;
+		ret = -EINVAL;
+		reason = "multizone range";
+		goto failed_removal;
 	}
 
 	zone = page_zone(pfn_to_page(valid_start));
@@ -1573,7 +1576,8 @@  static int __ref __offline_pages(unsigned long start_pfn,
 				       MIGRATE_MOVABLE, true);
 	if (ret) {
 		mem_hotplug_done();
-		return ret;
+		reason = "failed to isolate range";
+		goto failed_removal
 	}
 
 	arg.start_pfn = start_pfn;
@@ -1582,15 +1586,19 @@  static int __ref __offline_pages(unsigned long start_pfn,
 
 	ret = memory_notify(MEM_GOING_OFFLINE, &arg);
 	ret = notifier_to_errno(ret);
-	if (ret)
-		goto failed_removal;
+	if (ret) {
+		reason = "notifiers failure";
+		goto failed_removal_isolated;
+	}
 
 	pfn = start_pfn;
 repeat:
 	/* start memory hot removal */
 	ret = -EINTR;
-	if (signal_pending(current))
-		goto failed_removal;
+	if (signal_pending(current)) {
+		reason = "signal backoff";
+		goto failed_removal_isolated;
+	}
 
 	cond_resched();
 	lru_add_drain_all();
@@ -1607,8 +1615,10 @@  static int __ref __offline_pages(unsigned long start_pfn,
 	 * actually in order to make hugetlbfs's object counting consistent.
 	 */
 	ret = dissolve_free_huge_pages(start_pfn, end_pfn);
-	if (ret)
-		goto failed_removal;
+	if (ret) {
+		reason = "fails to disolve hugetlb pages";
+		goto failed_removal_isolated;
+	}
 	/* check again */
 	offlined_pages = check_pages_isolated(start_pfn, end_pfn);
 	if (offlined_pages < 0)
@@ -1648,13 +1658,15 @@  static int __ref __offline_pages(unsigned long start_pfn,
 	mem_hotplug_done();
 	return 0;
 
+failed_removal_isolated:
+	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
 failed_removal:
-	pr_debug("memory offlining [mem %#010llx-%#010llx] failed\n",
+	pr_debug("memory offlining [mem %#010llx-%#010llx] failed due to %s\n",
 		 (unsigned long long) start_pfn << PAGE_SHIFT,
-		 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
+		 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1,
+		 reason);
 	memory_notify(MEM_CANCEL_OFFLINE, &arg);
 	/* pushback to free area */
-	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
 	mem_hotplug_done();
 	return ret;
 }