diff mbox series

[V3,2/3] arm64/mm/hotplug: Enable MEM_OFFLINE event handling

Message ID 1600689908-28213-3-git-send-email-anshuman.khandual@arm.com (mailing list archive)
State New, archived
Headers show
Series arm64/mm/hotplug: Improve memory offline event notifier | expand

Commit Message

Anshuman Khandual Sept. 21, 2020, 12:05 p.m. UTC
This enables MEM_OFFLINE memory event handling. It will help intercept any
possible error condition such as if boot memory some how still got offlined
even after an explicit notifier failure, potentially by a future change in
generic hot plug framework. This would help detect such scenarios and help
debug further.

Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Steve Capper <steve.capper@arm.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
---
 arch/arm64/mm/mmu.c | 37 ++++++++++++++++++++++++++++++++-----
 1 file changed, 32 insertions(+), 5 deletions(-)

Comments

Anshuman Khandual Sept. 23, 2020, 4:44 a.m. UTC | #1
On 09/21/2020 05:35 PM, Anshuman Khandual wrote:
> This enables MEM_OFFLINE memory event handling. It will help intercept any
> possible error condition such as if boot memory some how still got offlined
> even after an explicit notifier failure, potentially by a future change in
> generic hot plug framework. This would help detect such scenarios and help
> debug further.
> 
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.com>
> Cc: Mark Rutland <mark.rutland@arm.com>
> Cc: Marc Zyngier <maz@kernel.org>
> Cc: Steve Capper <steve.capper@arm.com>
> Cc: Mark Brown <broonie@kernel.org>
> Cc: linux-arm-kernel@lists.infradead.org
> Cc: linux-kernel@vger.kernel.org
> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
> ---
>  arch/arm64/mm/mmu.c | 37 ++++++++++++++++++++++++++++++++-----
>  1 file changed, 32 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index df3b7415b128..6b171bd88bcf 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -1482,13 +1482,40 @@ static int prevent_bootmem_remove_notifier(struct notifier_block *nb,
>  	unsigned long end_pfn = arg->start_pfn + arg->nr_pages;
>  	unsigned long pfn = arg->start_pfn;
>  
> -	if (action != MEM_GOING_OFFLINE)
> +	if ((action != MEM_GOING_OFFLINE) && (action != MEM_OFFLINE))
>  		return NOTIFY_OK;
>  
> -	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
> -		ms = __pfn_to_section(pfn);
> -		if (early_section(ms))
> -			return NOTIFY_BAD;
> +	if (action == MEM_GOING_OFFLINE) {
> +		for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
> +			ms = __pfn_to_section(pfn);
> +			if (early_section(ms)) {
> +				pr_warn("Boot memory offlining attempted\n");
> +				return NOTIFY_BAD;
> +			}
> +		}
> +	} else if (action == MEM_OFFLINE) {
> +		for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
> +			ms = __pfn_to_section(pfn);
> +			if (early_section(ms)) {
> +
> +				/*
> +				 * This should have never happened. Boot memory
> +				 * offlining should have been prevented by this
> +				 * very notifier. Probably some memory removal
> +				 * procedure might have changed which would then
> +				 * require further debug.
> +				 */
> +				pr_err("Boot memory offlined\n");

It is returning in the first instance, when a section inside the
offline range happen to be part of the boot memory. So wondering
if it would be better to call out here, entire attempted offline
range or just the first section inside that which overlaps with
boot memory ? But some range information here will be helpful.
Gavin Shan Sept. 23, 2020, 6:31 a.m. UTC | #2
Hi Anshuman,

On 9/21/20 10:05 PM, Anshuman Khandual wrote:
> This enables MEM_OFFLINE memory event handling. It will help intercept any
> possible error condition such as if boot memory some how still got offlined
> even after an explicit notifier failure, potentially by a future change in
> generic hot plug framework. This would help detect such scenarios and help
> debug further.
> 
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.com>
> Cc: Mark Rutland <mark.rutland@arm.com>
> Cc: Marc Zyngier <maz@kernel.org>
> Cc: Steve Capper <steve.capper@arm.com>
> Cc: Mark Brown <broonie@kernel.org>
> Cc: linux-arm-kernel@lists.infradead.org
> Cc: linux-kernel@vger.kernel.org
> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
> ---

I'm not sure if it makes sense since MEM_OFFLINE won't be triggered
after NOTIFY_BAD is returned from MEM_GOING_OFFLINE. NOTIFY_BAD means
the whole offline process is stopped. It would be guranteed by generic
framework from syntax standpoint.

However, this looks good if MEM_OFFLINE is triggered without calling
into MEM_GOING_OFFLINE previously, but it would be a bug from generic
framework.

>   arch/arm64/mm/mmu.c | 37 ++++++++++++++++++++++++++++++++-----
>   1 file changed, 32 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index df3b7415b128..6b171bd88bcf 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -1482,13 +1482,40 @@ static int prevent_bootmem_remove_notifier(struct notifier_block *nb,
>   	unsigned long end_pfn = arg->start_pfn + arg->nr_pages;
>   	unsigned long pfn = arg->start_pfn;
>   
> -	if (action != MEM_GOING_OFFLINE)
> +	if ((action != MEM_GOING_OFFLINE) && (action != MEM_OFFLINE))
>   		return NOTIFY_OK;
>   
> -	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
> -		ms = __pfn_to_section(pfn);
> -		if (early_section(ms))
> -			return NOTIFY_BAD;
> +	if (action == MEM_GOING_OFFLINE) {
> +		for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
> +			ms = __pfn_to_section(pfn);
> +			if (early_section(ms)) {
> +				pr_warn("Boot memory offlining attempted\n");
> +				return NOTIFY_BAD;
> +			}
> +		}
> +	} else if (action == MEM_OFFLINE) {
> +		for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
> +			ms = __pfn_to_section(pfn);
> +			if (early_section(ms)) {
> +
> +				/*
> +				 * This should have never happened. Boot memory
> +				 * offlining should have been prevented by this
> +				 * very notifier. Probably some memory removal
> +				 * procedure might have changed which would then
> +				 * require further debug.
> +				 */
> +				pr_err("Boot memory offlined\n");
> +
> +				/*
> +				 * Core memory hotplug does not process a return
> +				 * code from the notifier for MEM_OFFLINE event.
> +				 * Error condition has been reported. Report as
> +				 * ignored.
> +				 */
> +				return NOTIFY_DONE;
> +			}
> +		}
>   	}
>   	return NOTIFY_OK;
>   }
> 

It's pretty much irrelevant comment if the patch doesn't make sense:
the logical block for MEM_GOING_OFFLINE would be reused by MEM_OFFLINE
as they looks similar except the return value and error message :)

Cheers,
Gavin
Anshuman Khandual Sept. 24, 2020, 3:51 a.m. UTC | #3
On 09/23/2020 12:01 PM, Gavin Shan wrote:
> Hi Anshuman,
> 
> On 9/21/20 10:05 PM, Anshuman Khandual wrote:
>> This enables MEM_OFFLINE memory event handling. It will help intercept any
>> possible error condition such as if boot memory some how still got offlined
>> even after an explicit notifier failure, potentially by a future change in
>> generic hot plug framework. This would help detect such scenarios and help
>> debug further.
>>
>> Cc: Catalin Marinas <catalin.marinas@arm.com>
>> Cc: Will Deacon <will@kernel.com>
>> Cc: Mark Rutland <mark.rutland@arm.com>
>> Cc: Marc Zyngier <maz@kernel.org>
>> Cc: Steve Capper <steve.capper@arm.com>
>> Cc: Mark Brown <broonie@kernel.org>
>> Cc: linux-arm-kernel@lists.infradead.org
>> Cc: linux-kernel@vger.kernel.org
>> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
>> ---
> 
> I'm not sure if it makes sense since MEM_OFFLINE won't be triggered
> after NOTIFY_BAD is returned from MEM_GOING_OFFLINE. NOTIFY_BAD means
> the whole offline process is stopped. It would be guranteed by generic
> framework from syntax standpoint.

Right but the intent here is to catch any deviation in generic hotplug
semantics going forward.
 > 
> However, this looks good if MEM_OFFLINE is triggered without calling
> into MEM_GOING_OFFLINE previously, but it would be a bug from generic
> framework.

Exactly, this will just ensure that we know about any change or a bug
in the generic framework. But if required, this additional check can
be enabled only with DEBUG_VM.

> 
>>   arch/arm64/mm/mmu.c | 37 ++++++++++++++++++++++++++++++++-----
>>   1 file changed, 32 insertions(+), 5 deletions(-)
>>
>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>> index df3b7415b128..6b171bd88bcf 100644
>> --- a/arch/arm64/mm/mmu.c
>> +++ b/arch/arm64/mm/mmu.c
>> @@ -1482,13 +1482,40 @@ static int prevent_bootmem_remove_notifier(struct notifier_block *nb,
>>       unsigned long end_pfn = arg->start_pfn + arg->nr_pages;
>>       unsigned long pfn = arg->start_pfn;
>>   -    if (action != MEM_GOING_OFFLINE)
>> +    if ((action != MEM_GOING_OFFLINE) && (action != MEM_OFFLINE))
>>           return NOTIFY_OK;
>>   -    for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
>> -        ms = __pfn_to_section(pfn);
>> -        if (early_section(ms))
>> -            return NOTIFY_BAD;
>> +    if (action == MEM_GOING_OFFLINE) {
>> +        for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
>> +            ms = __pfn_to_section(pfn);
>> +            if (early_section(ms)) {
>> +                pr_warn("Boot memory offlining attempted\n");
>> +                return NOTIFY_BAD;
>> +            }
>> +        }
>> +    } else if (action == MEM_OFFLINE) {
>> +        for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
>> +            ms = __pfn_to_section(pfn);
>> +            if (early_section(ms)) {
>> +
>> +                /*
>> +                 * This should have never happened. Boot memory
>> +                 * offlining should have been prevented by this
>> +                 * very notifier. Probably some memory removal
>> +                 * procedure might have changed which would then
>> +                 * require further debug.
>> +                 */
>> +                pr_err("Boot memory offlined\n");
>> +
>> +                /*
>> +                 * Core memory hotplug does not process a return
>> +                 * code from the notifier for MEM_OFFLINE event.
>> +                 * Error condition has been reported. Report as
>> +                 * ignored.
>> +                 */
>> +                return NOTIFY_DONE;
>> +            }
>> +        }
>>       }
>>       return NOTIFY_OK;
>>   }
>>
> 
> It's pretty much irrelevant comment if the patch doesn't make sense:
> the logical block for MEM_GOING_OFFLINE would be reused by MEM_OFFLINE
> as they looks similar except the return value and error message :)

This can be reorganized in the above mentioned format as well. Without
much additional code or iteration, it might not need DEBUG_VM as well.

for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
	ms = __pfn_to_section(pfn);
	if (!early_section(ms))
		continue;

	if (action == MEM_GOING_OFFLINE) {
		pr_warn("Boot memory offlining attempted\n");
		return NOTIFY_BAD;
	}
	else if (action == MEM_OFFLINE) {
		pr_err("Boot memory offlined\n");
		return NOTIFY_DONE;
	}
}
return NOTIFY_OK;
diff mbox series

Patch

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index df3b7415b128..6b171bd88bcf 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1482,13 +1482,40 @@  static int prevent_bootmem_remove_notifier(struct notifier_block *nb,
 	unsigned long end_pfn = arg->start_pfn + arg->nr_pages;
 	unsigned long pfn = arg->start_pfn;
 
-	if (action != MEM_GOING_OFFLINE)
+	if ((action != MEM_GOING_OFFLINE) && (action != MEM_OFFLINE))
 		return NOTIFY_OK;
 
-	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
-		ms = __pfn_to_section(pfn);
-		if (early_section(ms))
-			return NOTIFY_BAD;
+	if (action == MEM_GOING_OFFLINE) {
+		for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+			ms = __pfn_to_section(pfn);
+			if (early_section(ms)) {
+				pr_warn("Boot memory offlining attempted\n");
+				return NOTIFY_BAD;
+			}
+		}
+	} else if (action == MEM_OFFLINE) {
+		for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+			ms = __pfn_to_section(pfn);
+			if (early_section(ms)) {
+
+				/*
+				 * This should have never happened. Boot memory
+				 * offlining should have been prevented by this
+				 * very notifier. Probably some memory removal
+				 * procedure might have changed which would then
+				 * require further debug.
+				 */
+				pr_err("Boot memory offlined\n");
+
+				/*
+				 * Core memory hotplug does not process a return
+				 * code from the notifier for MEM_OFFLINE event.
+				 * Error condition has been reported. Report as
+				 * ignored.
+				 */
+				return NOTIFY_DONE;
+			}
+		}
 	}
 	return NOTIFY_OK;
 }