
arm64: Do not defer reserve_crashkernel() for platforms with no DMA memory zones

Message ID: 1645056294-6509-1-git-send-email-vijayb@linux.microsoft.com
State New, archived

Commit Message

Vijay Balakrishna Feb. 17, 2022, 12:04 a.m. UTC
The following patches resulted in deferring crash kernel reservation to
mem_init(), mainly aimed at platforms with DMA memory zones (no IOMMU),
in particular Raspberry Pi 4.

commit 1a8e1cef7603 ("arm64: use both ZONE_DMA and ZONE_DMA32")
commit 8424ecdde7df ("arm64: mm: Set ZONE_DMA size based on devicetree's dma-ranges")
commit 0a30c53573b0 ("arm64: mm: Move reserve_crashkernel() into mem_init()")
commit 2687275a5843 ("arm64: Force NO_BLOCK_MAPPINGS if crashkernel reservation is required")

The above changes introduced a boot slowdown, since the linear map is now
created with NO_BLOCK_MAPPINGS for all memory banks; see the discussion [1].
The proposed changes restore the crash kernel reservation to its earlier
behavior and thus avoid the slow boot, particularly on platforms with an
IOMMU (no DMA memory zones).

[1] https://lore.kernel.org/all/9436d033-579b-55fa-9b00-6f4b661c2dd7@linux.microsoft.com/
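
To summarize the resulting flow (a condensed sketch only; the hunks
below are authoritative):

  void __init arm64_memblock_init(void)
  {
  	/* ... */
  #if !defined(CONFIG_ZONE_DMA) && !defined(CONFIG_ZONE_DMA32)
  	/*
  	 * No DMA zones: arm64_dma_phys_limit is statically known
  	 * (PHYS_MASK + 1), so reserve the crash kernel early, before
  	 * the linear map is created.
  	 */
  	reserve_crashkernel();
  #endif
  }

  void __init bootmem_init(void)
  {
  	/* ... */
  #if defined(CONFIG_ZONE_DMA) || defined(CONFIG_ZONE_DMA32)
  	/* DMA zone limits are only known here; keep deferring. */
  	reserve_crashkernel();
  #endif
  }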

Signed-off-by: Vijay Balakrishna <vijayb@linux.microsoft.com>
Cc: stable@vger.kernel.org
---
Tested the changes to confirm the ~150ms boot slowdown no longer occurs on
our SoC with an IOMMU and 8GB of memory.  Also tested with ZONE_DMA and/or
ZONE_DMA32 configs to confirm no regression in the deferred crash kernel
memory reservation scheme.  In both cases a kernel crash dump was
successfully collected.
---
 arch/arm64/mm/init.c | 14 +++++++++++---
 arch/arm64/mm/mmu.c  | 24 +++++++++++++++++++++++-
 2 files changed, 34 insertions(+), 4 deletions(-)

Comments

Nicolas Saenz Julienne Feb. 17, 2022, 10:49 a.m. UTC | #1
On Wed, 2022-02-16 at 16:04 -0800, Vijay Balakrishna wrote:
> The following patches resulted in deferring crash kernel reservation to
> mem_init(), mainly aimed at platforms with DMA memory zones (no IOMMU),
> in particular Raspberry Pi 4.
> 
> commit 1a8e1cef7603 ("arm64: use both ZONE_DMA and ZONE_DMA32")
> commit 8424ecdde7df ("arm64: mm: Set ZONE_DMA size based on devicetree's dma-ranges")
> commit 0a30c53573b0 ("arm64: mm: Move reserve_crashkernel() into mem_init()")
> commit 2687275a5843 ("arm64: Force NO_BLOCK_MAPPINGS if crashkernel reservation is required")
> 
> The above changes introduced a boot slowdown, since the linear map is now
> created with NO_BLOCK_MAPPINGS for all memory banks; see the discussion [1].
> The proposed changes restore the crash kernel reservation to its earlier
> behavior and thus avoid the slow boot, particularly on platforms with an
> IOMMU (no DMA memory zones).
> 
> [1] https://lore.kernel.org/all/9436d033-579b-55fa-9b00-6f4b661c2dd7@linux.microsoft.com/
> 
> Signed-off-by: Vijay Balakrishna <vijayb@linux.microsoft.com>
> Cc: stable@vger.kernel.org
> ---
> Tested the changes to confirm the ~150ms boot slowdown no longer occurs on
> our SoC with an IOMMU and 8GB of memory.  Also tested with ZONE_DMA and/or
> ZONE_DMA32 configs to confirm no regression in the deferred crash kernel
> memory reservation scheme.  In both cases a kernel crash dump was
> successfully collected.
> ---
>  arch/arm64/mm/init.c | 14 +++++++++++---
>  arch/arm64/mm/mmu.c  | 24 +++++++++++++++++++++++-
>  2 files changed, 34 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
> index db63cc885771..f2a982c19b75 100644
> --- a/arch/arm64/mm/init.c
> +++ b/arch/arm64/mm/init.c
> @@ -62,7 +62,11 @@ EXPORT_SYMBOL(memstart_addr);
>   * In such case, ZONE_DMA32 covers the rest of the 32-bit addressable memory,
>   * otherwise it is empty.
>   */
> -phys_addr_t arm64_dma_phys_limit __ro_after_init;
> +#if !defined(CONFIG_ZONE_DMA) && !defined(CONFIG_ZONE_DMA32)
> +phys_addr_t __ro_after_init arm64_dma_phys_limit = PHYS_MASK + 1;
> +#else
> +phys_addr_t __ro_after_init arm64_dma_phys_limit;
> +#endif
>  
>  #ifdef CONFIG_KEXEC_CORE
>  /*
> @@ -153,8 +157,6 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max)
>  	if (!arm64_dma_phys_limit)
>  		arm64_dma_phys_limit = dma32_phys_limit;
>  #endif
> -	if (!arm64_dma_phys_limit)
> -		arm64_dma_phys_limit = PHYS_MASK + 1;
>  	max_zone_pfns[ZONE_NORMAL] = max;
>  
>  	free_area_init(max_zone_pfns);
> @@ -315,6 +317,10 @@ void __init arm64_memblock_init(void)
>  
>  	early_init_fdt_scan_reserved_mem();
>  
> +#if !defined(CONFIG_ZONE_DMA) && !defined(CONFIG_ZONE_DMA32)
> +	reserve_crashkernel();
> +#endif
> +
>  	high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
>  }
>  
> @@ -357,11 +363,13 @@ void __init bootmem_init(void)
>  	 */
>  	dma_contiguous_reserve(arm64_dma_phys_limit);
>  
> +#if defined(CONFIG_ZONE_DMA) || defined(CONFIG_ZONE_DMA32)
>  	/*
>  	 * request_standard_resources() depends on crashkernel's memory being
>  	 * reserved, so do it here.
>  	 */
>  	reserve_crashkernel();
> +#endif
>  
>  	memblock_dump_all();
>  }
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index acfae9b41cc8..e7faf5edccfc 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -517,7 +517,7 @@ static void __init map_mem(pgd_t *pgdp)
>  	 */
>  	BUILD_BUG_ON(pgd_index(direct_map_end - 1) == pgd_index(direct_map_end));
>  
> -	if (can_set_direct_map() || crash_mem_map || IS_ENABLED(CONFIG_KFENCE))
> +	if (can_set_direct_map() || IS_ENABLED(CONFIG_KFENCE))
>  		flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
>  
>  	/*
> @@ -528,6 +528,14 @@ static void __init map_mem(pgd_t *pgdp)
>  	 */
>  	memblock_mark_nomap(kernel_start, kernel_end - kernel_start);
>  
> +#ifdef CONFIG_KEXEC_CORE
> +	if (crash_mem_map && !crashk_res.end)
> +		flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;

Using IS_ENABLED(ZONE_DMA/DMA32) instead of '!crashk_res.end' would be more
efficient and a bit more explicit IMO.
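
Something like this (untested sketch):

	if (crash_mem_map &&
	    (IS_ENABLED(CONFIG_ZONE_DMA) || IS_ENABLED(CONFIG_ZONE_DMA32)))
		flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;

since with a DMA zone configured the reservation is still pending at this
point anyway.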

>  	/* map all the memory banks */
>  	for_each_mem_range(i, &start, &end) {
>  		if (start >= end)
> @@ -554,6 +562,20 @@ static void __init map_mem(pgd_t *pgdp)
>  	__map_memblock(pgdp, kernel_start, kernel_end,
>  		       PAGE_KERNEL, NO_CONT_MAPPINGS);
>  	memblock_clear_nomap(kernel_start, kernel_end - kernel_start);
> +#ifdef CONFIG_KEXEC_CORE
> +	/*
> +	 * Use page-level mappings here so that we can shrink the region
> +	 * in page granularity and put back unused memory to buddy system
> +	 * through /sys/kernel/kexec_crash_size interface.
> +	 */
> +	if (crashk_res.end) {

Same here.
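
For example (untested; the crashk_res.end check may still be needed to
cover a failed early reservation):

	if (!IS_ENABLED(CONFIG_ZONE_DMA) && !IS_ENABLED(CONFIG_ZONE_DMA32) &&
	    crashk_res.end) {
		__map_memblock(pgdp, crashk_res.start, crashk_res.end + 1,
			       PAGE_KERNEL,
			       NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
		memblock_clear_nomap(crashk_res.start,
				     resource_size(&crashk_res));
	}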

> +		__map_memblock(pgdp, crashk_res.start, crashk_res.end + 1,
> +			       PAGE_KERNEL,
> +			       NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
> +		memblock_clear_nomap(crashk_res.start,
> +				     resource_size(&crashk_res));
> +	}
> +#endif

Now, I carefully reviewed the patch and it seems to be doing the right thing.
But even being knowledgeable on the topic, it took a good amount of effort to
untangle the possible code paths. I suspect it's going to be painful to
maintain. I'd suggest at least introducing a comment explaining the situation.
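
For example, something along these lines above the new KEXEC_CORE hunks
(just a sketch):

	/*
	 * Without ZONE_DMA/ZONE_DMA32 the crash kernel was already
	 * reserved in arm64_memblock_init(), so crashk_res is set here
	 * and only its region needs page-level mappings.  With a DMA
	 * zone the reservation is deferred to bootmem_init(), crashk_res
	 * is still empty, and the whole linear map has to avoid block
	 * mappings so the region can be unmapped later.
	 */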

If the approach is deemed acceptable, I'll test it on the RPi4.

Regards,
Nicolas
Vijay Balakrishna Feb. 17, 2022, 6:26 p.m. UTC | #2
On 2/17/2022 2:49 AM, nicolas saenz julienne wrote:
> On Wed, 2022-02-16 at 16:04 -0800, Vijay Balakrishna wrote:
>> The following patches resulted in deferring crash kernel reservation to
>> mem_init(), mainly aimed at platforms with DMA memory zones (no IOMMU),
>> in particular Raspberry Pi 4.
>>
>> commit 1a8e1cef7603 ("arm64: use both ZONE_DMA and ZONE_DMA32")
>> commit 8424ecdde7df ("arm64: mm: Set ZONE_DMA size based on devicetree's dma-ranges")
>> commit 0a30c53573b0 ("arm64: mm: Move reserve_crashkernel() into mem_init()")
..
>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>> index acfae9b41cc8..e7faf5edccfc 100644
>> --- a/arch/arm64/mm/mmu.c
>> +++ b/arch/arm64/mm/mmu.c
>> @@ -517,7 +517,7 @@ static void __init map_mem(pgd_t *pgdp)
>>   	 */
>>   	BUILD_BUG_ON(pgd_index(direct_map_end - 1) == pgd_index(direct_map_end));
>>   
>> -	if (can_set_direct_map() || crash_mem_map || IS_ENABLED(CONFIG_KFENCE))
>> +	if (can_set_direct_map() || IS_ENABLED(CONFIG_KFENCE))
>>   		flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
>>   
>>   	/*
>> @@ -528,6 +528,14 @@ static void __init map_mem(pgd_t *pgdp)
>>   	 */
>>   	memblock_mark_nomap(kernel_start, kernel_end - kernel_start);
>>   
>> +#ifdef CONFIG_KEXEC_CORE
>> +	if (crash_mem_map && !crashk_res.end)
>> +		flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
> 
> Using IS_ENABLED(ZONE_DMA/DMA32) instead of '!crashk_res.end' would be more
> efficient and a bit more explicit IMO.

Sure, I will make the change in a follow-up submission.

> 
>>   	/* map all the memory banks */
>>   	for_each_mem_range(i, &start, &end) {
>>   		if (start >= end)
>> @@ -554,6 +562,20 @@ static void __init map_mem(pgd_t *pgdp)
>>   	__map_memblock(pgdp, kernel_start, kernel_end,
>>   		       PAGE_KERNEL, NO_CONT_MAPPINGS);
>>   	memblock_clear_nomap(kernel_start, kernel_end - kernel_start);
>> +#ifdef CONFIG_KEXEC_CORE
>> +	/*
>> +	 * Use page-level mappings here so that we can shrink the region
>> +	 * in page granularity and put back unused memory to buddy system
>> +	 * through /sys/kernel/kexec_crash_size interface.
>> +	 */
>> +	if (crashk_res.end) {
> 
> Same here.

Yes.

> 
>> +		__map_memblock(pgdp, crashk_res.start, crashk_res.end + 1,
>> +			       PAGE_KERNEL,
>> +			       NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
>> +		memblock_clear_nomap(crashk_res.start,
>> +				     resource_size(&crashk_res));
>> +	}
>> +#endif
> 
> Now, I carefully reviewed the patch and it seems to be doing the right thing.
> But even being knowledgeable on the topic, it took a good amount of effort to
> untangle the possible code paths. I suspect it's going to be painful to
> maintain. I'd suggest at least introducing a comment explaining the situation.

I appreciate your review.  Yes, it took a good amount of time for me 
(new here) too, and I'm glad you raised it.  Let me take a shot at 
explaining it in my next revision.
> 
> If the approach is deemed acceptable, I'll test it on the RPi4.

Please, your testing on RPi4 would be valuable.

Thanks,
Vijay

> 
> Regards,
> Nicolas

Patch

diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index db63cc885771..f2a982c19b75 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -62,7 +62,11 @@  EXPORT_SYMBOL(memstart_addr);
  * In such case, ZONE_DMA32 covers the rest of the 32-bit addressable memory,
  * otherwise it is empty.
  */
-phys_addr_t arm64_dma_phys_limit __ro_after_init;
+#if !defined(CONFIG_ZONE_DMA) && !defined(CONFIG_ZONE_DMA32)
+phys_addr_t __ro_after_init arm64_dma_phys_limit = PHYS_MASK + 1;
+#else
+phys_addr_t __ro_after_init arm64_dma_phys_limit;
+#endif
 
 #ifdef CONFIG_KEXEC_CORE
 /*
@@ -153,8 +157,6 @@  static void __init zone_sizes_init(unsigned long min, unsigned long max)
 	if (!arm64_dma_phys_limit)
 		arm64_dma_phys_limit = dma32_phys_limit;
 #endif
-	if (!arm64_dma_phys_limit)
-		arm64_dma_phys_limit = PHYS_MASK + 1;
 	max_zone_pfns[ZONE_NORMAL] = max;
 
 	free_area_init(max_zone_pfns);
@@ -315,6 +317,10 @@  void __init arm64_memblock_init(void)
 
 	early_init_fdt_scan_reserved_mem();
 
+#if !defined(CONFIG_ZONE_DMA) && !defined(CONFIG_ZONE_DMA32)
+	reserve_crashkernel();
+#endif
+
 	high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
 }
 
@@ -357,11 +363,13 @@  void __init bootmem_init(void)
 	 */
 	dma_contiguous_reserve(arm64_dma_phys_limit);
 
+#if defined(CONFIG_ZONE_DMA) || defined(CONFIG_ZONE_DMA32)
 	/*
 	 * request_standard_resources() depends on crashkernel's memory being
 	 * reserved, so do it here.
 	 */
 	reserve_crashkernel();
+#endif
 
 	memblock_dump_all();
 }
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index acfae9b41cc8..e7faf5edccfc 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -517,7 +517,7 @@  static void __init map_mem(pgd_t *pgdp)
 	 */
 	BUILD_BUG_ON(pgd_index(direct_map_end - 1) == pgd_index(direct_map_end));
 
-	if (can_set_direct_map() || crash_mem_map || IS_ENABLED(CONFIG_KFENCE))
+	if (can_set_direct_map() || IS_ENABLED(CONFIG_KFENCE))
 		flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
 
 	/*
@@ -528,6 +528,14 @@  static void __init map_mem(pgd_t *pgdp)
 	 */
 	memblock_mark_nomap(kernel_start, kernel_end - kernel_start);
 
+#ifdef CONFIG_KEXEC_CORE
+	if (crash_mem_map && !crashk_res.end)
+		flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
+
+	if (crashk_res.end)
+		memblock_mark_nomap(crashk_res.start,
+				    resource_size(&crashk_res));
+#endif
 	/* map all the memory banks */
 	for_each_mem_range(i, &start, &end) {
 		if (start >= end)
@@ -554,6 +562,20 @@  static void __init map_mem(pgd_t *pgdp)
 	__map_memblock(pgdp, kernel_start, kernel_end,
 		       PAGE_KERNEL, NO_CONT_MAPPINGS);
 	memblock_clear_nomap(kernel_start, kernel_end - kernel_start);
+#ifdef CONFIG_KEXEC_CORE
+	/*
+	 * Use page-level mappings here so that we can shrink the region
+	 * in page granularity and put back unused memory to buddy system
+	 * through /sys/kernel/kexec_crash_size interface.
+	 */
+	if (crashk_res.end) {
+		__map_memblock(pgdp, crashk_res.start, crashk_res.end + 1,
+			       PAGE_KERNEL,
+			       NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
+		memblock_clear_nomap(crashk_res.start,
+				     resource_size(&crashk_res));
+	}
+#endif
 }
 
 void mark_rodata_ro(void)