diff mbox series

[v6,3/4] arm64: use both ZONE_DMA and ZONE_DMA32

Message ID 20190911182546.17094-4-nsaenzjulienne@suse.de (mailing list archive)
State New, archived
Headers show
Series Raspberry Pi 4 DMA addressing support | expand

Commit Message

Nicolas Saenz Julienne Sept. 11, 2019, 6:25 p.m. UTC
So far all arm64 devices have supported 32 bit DMA masks for their
peripherals. This is not true anymore for the Raspberry Pi 4 as most of
it's peripherals can only address the first GB of memory on a total of
up to 4 GB.

This goes against ZONE_DMA32's intent, as it's expected for ZONE_DMA32
to be addressable with a 32 bit mask. So it was decided to re-introduce
ZONE_DMA in arm64.

ZONE_DMA will contain the lower 1G of memory, which is currently the
memory area addressable by any peripheral on an arm64 device.
ZONE_DMA32 will contain the rest of the 32 bit addressable memory.

Signed-off-by: Nicolas Saenz Julienne <nsaenzjulienne@suse.de>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>

---

I kept the Reviewed-by as the last bug solution was proposed by Catalin

Changes in v6:
- Fixed bug in max_zone_phys

Changes in v5:
- Fixed swiotlb initialization

Changes in v4:
- Fixed issue when NUMA=n and ZONE_DMA=n
- Merged two max_zone_dma*_phys() functions

Changes in v3:
- Used fixed size ZONE_DMA
- Fix check befor swiotlb_init()

Changes in v2:
- Update comment to reflect new zones split
- ZONE_DMA will never be left empty

 arch/arm64/Kconfig            |  4 +++
 arch/arm64/include/asm/page.h |  2 ++
 arch/arm64/mm/init.c          | 54 +++++++++++++++++++++++++----------
 3 files changed, 45 insertions(+), 15 deletions(-)

Comments

Qian Cai Oct. 21, 2019, 2:15 p.m. UTC | #1
> On Sep 11, 2019, at 2:25 PM, Nicolas Saenz Julienne <nsaenzjulienne@suse.de> wrote:
> 
> So far all arm64 devices have supported 32 bit DMA masks for their
> peripherals. This is not true anymore for the Raspberry Pi 4 as most of
> it's peripherals can only address the first GB of memory on a total of
> up to 4 GB.
> 
> This goes against ZONE_DMA32's intent, as it's expected for ZONE_DMA32
> to be addressable with a 32 bit mask. So it was decided to re-introduce
> ZONE_DMA in arm64.
> 
> ZONE_DMA will contain the lower 1G of memory, which is currently the
> memory area addressable by any peripheral on an arm64 device.
> ZONE_DMA32 will contain the rest of the 32 bit addressable memory.
> 
> Signed-off-by: Nicolas Saenz Julienne <nsaenzjulienne@suse.de>
> Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
> 
> ---
> 
> I kept the Reviewed-by as the last bug solution was proposed by Catalin
> 
> Changes in v6:
> - Fixed bug in max_zone_phys
> 
> Changes in v5:
> - Fixed swiotlb initialization
> 
> Changes in v4:
> - Fixed issue when NUMA=n and ZONE_DMA=n
> - Merged two max_zone_dma*_phys() functions
> 
> Changes in v3:
> - Used fixed size ZONE_DMA
> - Fix check befor swiotlb_init()
> 
> Changes in v2:
> - Update comment to reflect new zones split
> - ZONE_DMA will never be left empty
> 
> arch/arm64/Kconfig            |  4 +++
> arch/arm64/include/asm/page.h |  2 ++
> arch/arm64/mm/init.c          | 54 +++++++++++++++++++++++++----------
> 3 files changed, 45 insertions(+), 15 deletions(-)
> 
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 6b6362b83004..2dbe0165bd15 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -267,6 +267,10 @@ config GENERIC_CSUM
> config GENERIC_CALIBRATE_DELAY
> 	def_bool y
> 
> +config ZONE_DMA
> +	bool "Support DMA zone" if EXPERT
> +	default y
> +
> config ZONE_DMA32
> 	bool "Support DMA32 zone" if EXPERT
> 	default y
> diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h
> index d39ddb258a04..7b8c98830101 100644
> --- a/arch/arm64/include/asm/page.h
> +++ b/arch/arm64/include/asm/page.h
> @@ -38,4 +38,6 @@ extern int pfn_valid(unsigned long);
> 
> #include <asm-generic/getorder.h>
> 
> +#define ARCH_ZONE_DMA_BITS 30
> +
> #endif
> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
> index 8e9bc64c5878..44f07fdf7a59 100644
> --- a/arch/arm64/mm/init.c
> +++ b/arch/arm64/mm/init.c
> @@ -56,6 +56,13 @@ EXPORT_SYMBOL(physvirt_offset);
> struct page *vmemmap __ro_after_init;
> EXPORT_SYMBOL(vmemmap);
> 
> +/*
> + * We create both ZONE_DMA and ZONE_DMA32. ZONE_DMA covers the first 1G of
> + * memory as some devices, namely the Raspberry Pi 4, have peripherals with
> + * this limited view of the memory. ZONE_DMA32 will cover the rest of the 32
> + * bit addressable memory area.
> + */
> +phys_addr_t arm64_dma_phys_limit __ro_after_init;
> phys_addr_t arm64_dma32_phys_limit __ro_after_init;
> 
> #ifdef CONFIG_KEXEC_CORE
> @@ -169,15 +176,16 @@ static void __init reserve_elfcorehdr(void)
> {
> }
> #endif /* CONFIG_CRASH_DUMP */
> +
> /*
> - * Return the maximum physical address for ZONE_DMA32 (DMA_BIT_MASK(32)). It
> - * currently assumes that for memory starting above 4G, 32-bit devices will
> - * use a DMA offset.
> + * Return the maximum physical address for a zone with a given address size
> + * limit. It currently assumes that for memory starting above 4G, 32-bit
> + * devices will use a DMA offset.
>  */
> -static phys_addr_t __init max_zone_dma32_phys(void)
> +static phys_addr_t __init max_zone_phys(unsigned int zone_bits)
> {
> -	phys_addr_t offset = memblock_start_of_DRAM() & GENMASK_ULL(63, 32);
> -	return min(offset + (1ULL << 32), memblock_end_of_DRAM());
> +	phys_addr_t offset = memblock_start_of_DRAM() & GENMASK_ULL(63, zone_bits);
> +	return min(offset + (1ULL << zone_bits), memblock_end_of_DRAM());
> }
> 
> #ifdef CONFIG_NUMA
> @@ -186,6 +194,9 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max)
> {
> 	unsigned long max_zone_pfns[MAX_NR_ZONES]  = {0};
> 
> +#ifdef CONFIG_ZONE_DMA
> +	max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit);
> +#endif
> #ifdef CONFIG_ZONE_DMA32
> 	max_zone_pfns[ZONE_DMA32] = PFN_DOWN(arm64_dma32_phys_limit);
> #endif
> @@ -201,13 +212,18 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max)
> 	struct memblock_region *reg;
> 	unsigned long zone_size[MAX_NR_ZONES], zhole_size[MAX_NR_ZONES];
> 	unsigned long max_dma32 = min;
> +	unsigned long max_dma = min;
> 
> 	memset(zone_size, 0, sizeof(zone_size));
> 
> -	/* 4GB maximum for 32-bit only capable devices */
> +#ifdef CONFIG_ZONE_DMA
> +	max_dma = PFN_DOWN(arm64_dma_phys_limit);
> +	zone_size[ZONE_DMA] = max_dma - min;
> +	max_dma32 = max_dma;
> +#endif
> #ifdef CONFIG_ZONE_DMA32
> 	max_dma32 = PFN_DOWN(arm64_dma32_phys_limit);
> -	zone_size[ZONE_DMA32] = max_dma32 - min;
> +	zone_size[ZONE_DMA32] = max_dma32 - max_dma;
> #endif
> 	zone_size[ZONE_NORMAL] = max - max_dma32;
> 
> @@ -219,11 +235,17 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max)
> 
> 		if (start >= max)
> 			continue;
> -
> +#ifdef CONFIG_ZONE_DMA
> +		if (start < max_dma) {
> +			unsigned long dma_end = min_not_zero(end, max_dma);
> +			zhole_size[ZONE_DMA] -= dma_end - start;
> +		}
> +#endif
> #ifdef CONFIG_ZONE_DMA32
> 		if (start < max_dma32) {
> -			unsigned long dma_end = min(end, max_dma32);
> -			zhole_size[ZONE_DMA32] -= dma_end - start;
> +			unsigned long dma32_end = min(end, max_dma32);
> +			unsigned long dma32_start = max(start, max_dma);
> +			zhole_size[ZONE_DMA32] -= dma32_end - dma32_start;
> 		}
> #endif
> 		if (end > max_dma32) {
> @@ -418,9 +440,11 @@ void __init arm64_memblock_init(void)
> 
> 	early_init_fdt_scan_reserved_mem();
> 
> -	/* 4GB maximum for 32-bit only capable devices */
> +	if (IS_ENABLED(CONFIG_ZONE_DMA))
> +		arm64_dma_phys_limit = max_zone_phys(ARCH_ZONE_DMA_BITS);
> +
> 	if (IS_ENABLED(CONFIG_ZONE_DMA32))
> -		arm64_dma32_phys_limit = max_zone_dma32_phys();
> +		arm64_dma32_phys_limit = max_zone_phys(32);
> 	else
> 		arm64_dma32_phys_limit = PHYS_MASK + 1;
> 
> @@ -430,7 +454,7 @@ void __init arm64_memblock_init(void)
> 
> 	high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
> 
> -	dma_contiguous_reserve(arm64_dma32_phys_limit);
> +	dma_contiguous_reserve(arm64_dma_phys_limit ? : arm64_dma32_phys_limit);
> }
> 
> void __init bootmem_init(void)
> @@ -534,7 +558,7 @@ static void __init free_unused_memmap(void)
> void __init mem_init(void)
> {
> 	if (swiotlb_force == SWIOTLB_FORCE ||
> -	    max_pfn > (arm64_dma32_phys_limit >> PAGE_SHIFT))
> +	    max_pfn > PFN_DOWN(arm64_dma_phys_limit ? : arm64_dma32_phys_limit))
> 		swiotlb_init(1);
> 	else
> 		swiotlb_force = SWIOTLB_NO_FORCE;
> -- 
> 2.23.0
> 
> 

With ZONE_DMA=y, this config will fail to reserve 512M CMA on a server,

https://raw.githubusercontent.com/cailca/linux-mm/master/arm64.config

CONFIG_DMA_CMA=y
CONFIG_CMA_SIZE_MBYTES=64
CONFIG_CMA_SIZE_SEL_MBYTES=y
CONFIG_CMA_ALIGNMENT=8
CONFIG_CMA=y
CONFIG_CMA_DEBUGFS=y
CONFIG_CMA_AREAS=7

Is this expected?
Nicolas Saenz Julienne Oct. 21, 2019, 2:34 p.m. UTC | #2
On Mon, 2019-10-21 at 10:15 -0400, Qian Cai wrote:
> > On Sep 11, 2019, at 2:25 PM, Nicolas Saenz Julienne <nsaenzjulienne@suse.de>
> > wrote:
> > 
> > So far all arm64 devices have supported 32 bit DMA masks for their
> > peripherals. This is not true anymore for the Raspberry Pi 4 as most of
> > it's peripherals can only address the first GB of memory on a total of
> > up to 4 GB.
> > 
> > This goes against ZONE_DMA32's intent, as it's expected for ZONE_DMA32
> > to be addressable with a 32 bit mask. So it was decided to re-introduce
> > ZONE_DMA in arm64.
> > 
> > ZONE_DMA will contain the lower 1G of memory, which is currently the
> > memory area addressable by any peripheral on an arm64 device.
> > ZONE_DMA32 will contain the rest of the 32 bit addressable memory.
> > 
> > Signed-off-by: Nicolas Saenz Julienne <nsaenzjulienne@suse.de>
> > Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
> > 
> > ---
> 
> With ZONE_DMA=y, this config will fail to reserve 512M CMA on a server,
> 
> https://raw.githubusercontent.com/cailca/linux-mm/master/arm64.config
> 
> CONFIG_DMA_CMA=y
> CONFIG_CMA_SIZE_MBYTES=64
> CONFIG_CMA_SIZE_SEL_MBYTES=y
> CONFIG_CMA_ALIGNMENT=8
> CONFIG_CMA=y
> CONFIG_CMA_DEBUGFS=y
> CONFIG_CMA_AREAS=7
> 
> Is this expected?

Not really, just tested cma=512M on a Raspberry Pi4, and it went well. The only
thing on my build that differs from your config is CONFIG_CMA_DEBUGFS.

Could you post more information on the device you're experiencing this on? Also
some logs.

Regards,
Nicolas
Qian Cai Oct. 21, 2019, 2:46 p.m. UTC | #3
> On Oct 21, 2019, at 10:34 AM, Nicolas Saenz Julienne <nsaenzjulienne@suse.de> wrote:
> 
> On Mon, 2019-10-21 at 10:15 -0400, Qian Cai wrote:
>>> On Sep 11, 2019, at 2:25 PM, Nicolas Saenz Julienne <nsaenzjulienne@suse.de>
>>> wrote:
>>> 
>>> So far all arm64 devices have supported 32 bit DMA masks for their
>>> peripherals. This is not true anymore for the Raspberry Pi 4 as most of
>>> it's peripherals can only address the first GB of memory on a total of
>>> up to 4 GB.
>>> 
>>> This goes against ZONE_DMA32's intent, as it's expected for ZONE_DMA32
>>> to be addressable with a 32 bit mask. So it was decided to re-introduce
>>> ZONE_DMA in arm64.
>>> 
>>> ZONE_DMA will contain the lower 1G of memory, which is currently the
>>> memory area addressable by any peripheral on an arm64 device.
>>> ZONE_DMA32 will contain the rest of the 32 bit addressable memory.
>>> 
>>> Signed-off-by: Nicolas Saenz Julienne <nsaenzjulienne@suse.de>
>>> Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
>>> 
>>> ---
>> 
>> With ZONE_DMA=y, this config will fail to reserve 512M CMA on a server,
>> 
>> https://raw.githubusercontent.com/cailca/linux-mm/master/arm64.config
>> 
>> CONFIG_DMA_CMA=y
>> CONFIG_CMA_SIZE_MBYTES=64
>> CONFIG_CMA_SIZE_SEL_MBYTES=y
>> CONFIG_CMA_ALIGNMENT=8
>> CONFIG_CMA=y
>> CONFIG_CMA_DEBUGFS=y
>> CONFIG_CMA_AREAS=7
>> 
>> Is this expected?
> 
> Not really, just tested cma=512M on a Raspberry Pi4, and it went well. The only
> thing on my build that differs from your config is CONFIG_CMA_DEBUGFS.
> 
> Could you post more information on the device you're experiencing this on? Also
> some logs.

With the above config, it does not even need "cma=512M" kernel cmdline.

[    0.000000] Booting Linux on physical CPU 0x0000000000 [0x431f0af1]
[    0.000000] Linux version 5.4.0-rc4-next-20191021+ (clang version 8.0.1 (Red Hat 8.0.1-1.module+el8.1.0+3866+6be7f4d8)) #1 SMP Mon Oct 21 10:03:03 EDT 2019
[    0.000000] Setting debug_guardpage_minorder to 1
[    0.000000] efi: Getting EFI parameters from FDT:
[    0.000000] efi: EFI v2.70 by American Megatrends
[    0.000000] efi:  ESRT=0xf935ed98  SMBIOS=0xfcc90000  SMBIOS 3.0=0xfcc80000  ACPI 2.0=0xfac80000  MEMRESERVE=0xfacd1018 
[    0.000000] esrt: Reserving ESRT space from 0x00000000f935ed98 to 0x00000000f935edd0.
[    0.000000] crashkernel reserved: 0x00000097db400000 - 0x00000097fb400000 (512 MB)
[    0.000000] cma: Reserved 512 MiB at 0x00000000a0000000

With ZONE_DMA=y, it will say,

cma: Failed to reserve 512 MiB

The machine is a ThunderX2 server.

https://buy.hpe.com/us/en/servers/apollo-systems/apollo-70-system/apollo-70-system/hpe-apollo-70-system/p/1010742472

# lscpu
Architecture:        aarch64
Byte Order:          Little Endian
CPU(s):              256
On-line CPU(s) list: 0-255
Thread(s) per core:  4
Core(s) per socket:  32
Socket(s):           2
NUMA node(s):        2
Vendor ID:           Cavium
Model:               1
Model name:          ThunderX2 99xx
Stepping:            0x1
BogoMIPS:            400.00
L1d cache:           32K
L1i cache:           32K
L2 cache:            256K
L3 cache:            32768K
NUMA node0 CPU(s):   0-127
NUMA node1 CPU(s):   128-255
Flags:               fp asimd aes pmull sha1 sha2 crc32 atomics cpuid asimdrdm
Nicolas Saenz Julienne Oct. 21, 2019, 5:01 p.m. UTC | #4
On Mon, 2019-10-21 at 10:46 -0400, Qian Cai wrote:
> > On Oct 21, 2019, at 10:34 AM, Nicolas Saenz Julienne <nsaenzjulienne@suse.de
> > > wrote:
> > 
> > On Mon, 2019-10-21 at 10:15 -0400, Qian Cai wrote:
> > > > On Sep 11, 2019, at 2:25 PM, Nicolas Saenz Julienne <
> > > > nsaenzjulienne@suse.de>
> > > > wrote:
> > > > 
> > > > So far all arm64 devices have supported 32 bit DMA masks for their
> > > > peripherals. This is not true anymore for the Raspberry Pi 4 as most of
> > > > it's peripherals can only address the first GB of memory on a total of
> > > > up to 4 GB.
> > > > 
> > > > This goes against ZONE_DMA32's intent, as it's expected for ZONE_DMA32
> > > > to be addressable with a 32 bit mask. So it was decided to re-introduce
> > > > ZONE_DMA in arm64.
> > > > 
> > > > ZONE_DMA will contain the lower 1G of memory, which is currently the
> > > > memory area addressable by any peripheral on an arm64 device.
> > > > ZONE_DMA32 will contain the rest of the 32 bit addressable memory.
> > > > 
> > > > Signed-off-by: Nicolas Saenz Julienne <nsaenzjulienne@suse.de>
> > > > Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
> > > > 
> > > > ---
> > > 
> > > With ZONE_DMA=y, this config will fail to reserve 512M CMA on a server,
> > > 
> > > https://raw.githubusercontent.com/cailca/linux-mm/master/arm64.config
> > > 
> > > CONFIG_DMA_CMA=y
> > > CONFIG_CMA_SIZE_MBYTES=64
> > > CONFIG_CMA_SIZE_SEL_MBYTES=y
> > > CONFIG_CMA_ALIGNMENT=8
> > > CONFIG_CMA=y
> > > CONFIG_CMA_DEBUGFS=y
> > > CONFIG_CMA_AREAS=7
> > > 
> > > Is this expected?
> > 
> > Not really, just tested cma=512M on a Raspberry Pi4, and it went well. The
> > only
> > thing on my build that differs from your config is CONFIG_CMA_DEBUGFS.
> > 
> > Could you post more information on the device you're experiencing this on?
> > Also
> > some logs.
> 
> With the above config, it does not even need "cma=512M" kernel cmdline.
> 
> [    0.000000] Booting Linux on physical CPU 0x0000000000 [0x431f0af1]
> [    0.000000] Linux version 5.4.0-rc4-next-20191021+ (clang version 8.0.1
> (Red Hat 8.0.1-1.module+el8.1.0+3866+6be7f4d8)) #1 SMP Mon Oct 21 10:03:03 EDT
> 2019
> [    0.000000] Setting debug_guardpage_minorder to 1
> [    0.000000] efi: Getting EFI parameters from FDT:
> [    0.000000] efi: EFI v2.70 by American Megatrends
> [    0.000000] efi:  ESRT=0xf935ed98  SMBIOS=0xfcc90000  SMBIOS
> 3.0=0xfcc80000  ACPI 2.0=0xfac80000  MEMRESERVE=0xfacd1018 
> [    0.000000] esrt: Reserving ESRT space from 0x00000000f935ed98 to
> 0x00000000f935edd0.
> [    0.000000] crashkernel reserved: 0x00000097db400000 - 0x00000097fb400000
> (512 MB)
> [    0.000000] cma: Reserved 512 MiB at 0x00000000a0000000
> 
> With ZONE_DMA=y, it will say,
> 
> cma: Failed to reserve 512 MiB
> 
> The machine is a ThunderX2 server.
> 
> 
https://buy.hpe.com/us/en/servers/apollo-systems/apollo-70-system/apollo-70-system/hpe-apollo-70-system/p/1010742472
> 
> # lscpu
> Architecture:        aarch64
> Byte Order:          Little Endian
> CPU(s):              256
> On-line CPU(s) list: 0-255
> Thread(s) per core:  4
> Core(s) per socket:  32
> Socket(s):           2
> NUMA node(s):        2
> Vendor ID:           Cavium
> Model:               1
> Model name:          ThunderX2 99xx
> Stepping:            0x1
> BogoMIPS:            400.00
> L1d cache:           32K
> L1i cache:           32K
> L2 cache:            256K
> L3 cache:            32768K
> NUMA node0 CPU(s):   0-127
> NUMA node1 CPU(s):   128-255
> Flags:               fp asimd aes pmull sha1 sha2 crc32 atomics cpuid asimdrdm

Hi Qian,
I happen to have access to a very similar machine:

thunderx25:~ # lscpu 
Architecture:        aarch64
Byte Order:          Little Endian
CPU(s):              224
On-line CPU(s) list: 0-223
Thread(s) per core:  4
Core(s) per socket:  28
Socket(s):           2
NUMA node(s):        2
Vendor ID:           Cavium
Model:               1
Model name:          ThunderX2 99xx
Stepping:            0x1
CPU max MHz:         2500.0000
CPU min MHz:         1000.0000
BogoMIPS:            400.00
L1d cache:           32K
L1i cache:           32K
L2 cache:            256K
L3 cache:            32768K
NUMA node0 CPU(s):   0-111
NUMA node1 CPU(s):   112-223
Flags:               fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics cpuid
asimdrdm

I tested a kernel with your configuration plus CONFIG_ZONE_DMA=y yet I'm unable
to reproduce the error. The CMA allocation is successful.

[    0.000000][    T0] Booting Linux on physical CPU 0x0000000000 [0x431f0af1]
[    0.000000][    T0] Linux version 5.4.0-rc4-next-20191021 (nico@linux-9qgx) (gcc version 9.2.1 20190903 [gcc-9-branch revision 275330] (SUSE Linux)) #60 SMP Mon Oct 21 18:48:51 CEST 2019
[    0.000000][    T0] printk: debug: ignoring loglevel setting.
[    0.000000][    T0] efi: Getting EFI parameters from FDT:
[    0.000000][    T0] efi: EFI v2.70 by American Megatrends
[    0.000000][    T0] efi:  ESRT=0xf10b4198  SMBIOS=0xfcc90000  SMBIOS 3.0=0xfcc80000  ACPI 2.0=0xf9670000  MEMRESERVE=0xf1117018
[    0.000000][    T0] esrt: Reserving ESRT space from 0x00000000f10b4198 to 0x00000000f10b41d0.
[    0.000000][    T0] cma: Reserved 512 MiB at 0x00000000a0000000
[    0.000000][    T0] ACPI: Early table checksum verification disabled
[    0.000000][    T0] ACPI: RSDP 0x00000000F9670000 000024 (v02 HPE   )
[    0.000000][    T0] ACPI: XSDT 0x00000000F9670028 0000DC (v01 HPE    ServerCL 01072009 AMI  00010013)
[    0.000000][    T0] ACPI: FACP 0x00000000F9670108 000114 (v06 HPE    ServerCL 01072009 AMI  00010013)
[    0.000000][    T0] ACPI: DSDT 0x00000000F9670220 000714 (v02 HPE    ServerCL 20150406 INTL 20170831)
[    0.000000][    T0] ACPI: FIDT 0x00000000F9670938 00009C (v01 HPE    ServerCL 01072009 AMI  00010013)
[    0.000000][    T0] ACPI: DBG2 0x00000000F96709D8 000062 (v00 HPE    ServerCL 00000000 INTL 20170831)
[    0.000000][    T0] ACPI: SPMI 0x00000000F9670A40 000041 (v05 HPE    ServerCL 00000000 AMI. 00000000)
[    0.000000][    T0] ACPI: PCCT 0x00000000F9670A88 000FB0 (v01 HPE    ServerCL 00000001 INTL 20170831)
[    0.000000][    T0] ACPI: SLIT 0x00000000F9671A38 000030 (v01 HPE    ServerCL 00000001 INTL 20170831)
[    0.000000][    T0] ACPI: SPMI 0x00000000F9671A68 000041 (v04 HPE    ServerCL 00000001 INTL 20170831)
[    0.000000][    T0] ACPI: SSDT 0x00000000F9671AB0 004217 (v02 HPE    N0BXPCI  20150406 INTL 20170831)
[    0.000000][    T0] ACPI: SSDT 0x00000000F9675CC8 019654 (v02 HPE    ServerCL 20150406 INTL 20170831)
[    0.000000][    T0] ACPI: SSDT 0x00000000F968F320 0041CB (v02 HPE    N1BXPCI  20150406 INTL 20170831)
[    0.000000][    T0] ACPI: SSDT 0x00000000F96934F0 01980C (v02 HPE    ServerCL 20150406 INTL 20170831)
[    0.000000][    T0] ACPI: BERT 0x00000000F96ACD00 000030 (v01 HPE    ServerCL 20150406 CAVM 00000099)
[    0.000000][    T0] ACPI: GTDT 0x00000000F96ACD30 00007C (v02 HPE    ServerCL 20150406 CAVM 00000099)
[    0.000000][    T0] ACPI: HEST 0x00000000F96ACDB0 000308 (v01 HPE    ServerCL 20150406 CAVM 00000099)
[    0.000000][    T0] ACPI: APIC 0x00000000F96AD0B8 00468C (v04 HPE    ServerCL 20150406 CAVM 00000099)
[    0.000000][    T0] ACPI: MCFG 0x00000000F96B1748 00003C (v01 HPE    ServerCL 20150406 CAVM 00000099)
[    0.000000][    T0] ACPI: NFIT 0x00000000F96B1788 000028 (v01 HPE    ServerCL 20150406 CAVM 00000099)
[    0.000000][    T0] ACPI: PPTT 0x00000000F96B17B0 0018B4 (v01 HPE    ServerCL 20150406 CAVM 00000099)
[    0.000000][    T0] ACPI: SRAT 0x00000000F96B3068 0010A8 (v03 HPE    ServerCL 20150406 CAVM 00000099)
[    0.000000][    T0] ACPI: IORT 0x00000000F96B4110 000688 (v00 HPE    ServerCL 20150406 CAVM 00000099)
[    0.000000][    T0] ACPI: BGRT 0x00000000F96B4798 000038 (v01 HPE    ServerCL 01072009 AMI  00010013)
[    0.000000][    T0] ACPI: SPCR 0x00000000F96B47D0 000050 (v02 HPE    ServerCL 01072009 AMI. 0005000D)
[    0.000000][    T0] ACPI: WSMT 0x00000000F96B4820 000028 (v01 HPE    ServerCL 01072009 AMI  00010013)
[    0.000000][    T0] ACPI: SPCR: Unexpected SPCR Access Width.  Defaulting to byte size
[    0.000000][    T0] ACPI: SPCR: console: pl011,mmio,0x402020000,115200
[    0.000000][    T0] ACPI: SRAT: Node 0 PXM 0 [mem 0x80000000-0xfeffffff]
[    0.000000][    T0] ACPI: SRAT: Node 0 PXM 0 [mem 0x880000000-0xfffffffff]
[    0.000000][    T0] ACPI: SRAT: Node 0 PXM 0 [mem 0x8800000000-0x89fcffffff]
[    0.000000][    T0] ACPI: SRAT: Node 1 PXM 1 [mem 0x89fd000000-0x93fcffffff]
[    0.000000][    T0] NUMA: NODE_DATA [mem 0x89fcff5bc0-0x89fcffffff]
[    0.000000][    T0] NUMA: NODE_DATA [mem 0x93fc5b5bc0-0x93fc5bffff]
[    0.000000][    T0] Zone ranges:
[    0.000000][    T0]   DMA      [mem 0x00000000802f0000-0x00000000bfffffff]
[    0.000000][    T0]   DMA32    [mem 0x00000000c0000000-0x00000000ffffffff]
[    0.000000][    T0]   Normal   [mem 0x0000000100000000-0x00000093fcffffff]
[    0.000000][    T0] Movable zone start for each node
[    0.000000][    T0] Early memory node ranges
[    0.000000][    T0]   node   0: [mem 0x00000000802f0000-0x000000008030ffff]
[    0.000000][    T0]   node   0: [mem 0x0000000080310000-0x00000000bfffffff]
[    0.000000][    T0]   node   0: [mem 0x00000000c0000000-0x00000000c0cbffff]
[    0.000000][    T0]   node   0: [mem 0x00000000c0cc0000-0x00000000f104ffff]
[    0.000000][    T0]   node   0: [mem 0x00000000f1050000-0x00000000f10affff]
[    0.000000][    T0]   node   0: [mem 0x00000000f10b0000-0x00000000f96fffff]
[    0.000000][    T0]   node   0: [mem 0x00000000f9700000-0x00000000f98affff]
[    0.000000][    T0]   node   0: [mem 0x00000000f98b0000-0x00000000fa92ffff]
[    0.000000][    T0]   node   0: [mem 0x00000000fa930000-0x00000000faa6ffff]
[    0.000000][    T0]   node   0: [mem 0x00000000faa70000-0x00000000fabbffff]
[    0.000000][    T0]   node   0: [mem 0x00000000fabc0000-0x00000000fabdffff]
[    0.000000][    T0]   node   0: [mem 0x00000000fabe0000-0x00000000fadeffff]
[    0.000000][    T0]   node   0: [mem 0x00000000fadf0000-0x00000000fae4ffff]
[    0.000000][    T0]   node   0: [mem 0x00000000fae50000-0x00000000fc8cffff]
[    0.000000][    T0]   node   0: [mem 0x00000000fc8d0000-0x00000000fc8dffff]
[    0.000000][    T0]   node   0: [mem 0x00000000fc8e0000-0x00000000fca9ffff]
[    0.000000][    T0]   node   0: [mem 0x00000000fcaa0000-0x00000000fcaaffff]
[    0.000000][    T0]   node   0: [mem 0x00000000fcab0000-0x00000000fcb3ffff]
[    0.000000][    T0]   node   0: [mem 0x00000000fcb40000-0x00000000fd1effff]
[    0.000000][    T0]   node   0: [mem 0x00000000fd1f0000-0x00000000feceffff]
[    0.000000][    T0]   node   0: [mem 0x00000000fecf0000-0x00000000fed1ffff]
[    0.000000][    T0]   node   0: [mem 0x00000000fed20000-0x00000000fed2ffff]
[    0.000000][    T0]   node   0: [mem 0x00000000fed30000-0x00000000feddffff]
[    0.000000][    T0]   node   0: [mem 0x00000000fede0000-0x00000000feffffff]
[    0.000000][    T0]   node   0: [mem 0x0000000880000000-0x0000000fffffffff]
[    0.000000][    T0]   node   0: [mem 0x0000008800000000-0x00000089fcffffff]
[    0.000000][    T0]   node   1: [mem 0x00000089fd000000-0x00000093fcffffff]
[    0.000000][    T0] Zeroed struct page in unavailable ranges: 440 pages
[    0.000000][    T0] Initmem setup node 0 [mem 0x00000000802f0000-0x00000089fcffffff]
[    0.000000][    T0] On node 0 totalpages: 654289
[    0.000000][    T0]   DMA zone: 16 pages used for memmap
[    0.000000][    T0]   DMA zone: 0 pages reserved
[    0.000000][    T0]   DMA zone: 16337 pages, LIFO batch:3
[    0.000000][    T0]   DMA32 zone: 16 pages used for memmap
[    0.000000][    T0]   DMA32 zone: 16128 pages, LIFO batch:3
[    0.000000][    T0]   Normal zone: 608 pages used for memmap
[    0.000000][    T0]   Normal zone: 621824 pages, LIFO batch:3
[    0.000000][    T0] Initmem setup node 1 [mem 0x00000089fd000000-0x00000093fcffffff]
[    0.000000][    T0] On node 1 totalpages: 655360
[    0.000000][    T0]   Normal zone: 640 pages used for memmap
[    0.000000][    T0]   Normal zone: 655360 pages, LIFO batch:3

Could you enable CMA debugging to see if anything interesting comes out of it.

Regards,
Nicolas
Qian Cai Oct. 21, 2019, 5:25 p.m. UTC | #5
> On Oct 21, 2019, at 1:01 PM, Nicolas Saenz Julienne <nsaenzjulienne@suse.de> wrote:
> 
> Could you enable CMA debugging to see if anything interesting comes out of it.

I did but nothing interesting came out. Did you use the same config I gave? Also, it has those cmdline.

page_poison=on page_owner=on numa_balancing=enable \
systemd.unified_cgroup_hierarchy=1 debug_guardpage_minorder=1 \
page_alloc.shuffle=1
Nicolas Saenz Julienne Oct. 21, 2019, 5:55 p.m. UTC | #6
On Mon, 2019-10-21 at 13:25 -0400, Qian Cai wrote:
> > On Oct 21, 2019, at 1:01 PM, Nicolas Saenz Julienne <nsaenzjulienne@suse.de>
> > wrote:
> > 
> > Could you enable CMA debugging to see if anything interesting comes out of
> > it.
> 
> I did but nothing interesting came out. Did you use the same config I gave?

Yes, aside from enabling ZONE_DMA.

> Also, it has those cmdline.
>
> page_poison=on page_owner=on numa_balancing=enable \
> systemd.unified_cgroup_hierarchy=1 debug_guardpage_minorder=1 \
> page_alloc.shuffle=1

No luck, still works for me even after adding those extra flags. IIRC most of
them (if not all) are not even parsed by the time CMA is configured.

So, can you confirm the zones setup you're seeing is similar to this one:

[    0.000000][    T0] Zone ranges:
[    0.000000][    T0]   DMA      [mem 0x00000000802f0000-0x00000000bfffffff]
[    0.000000][    T0]   DMA32    [mem 0x00000000c0000000-0x00000000ffffffff]
[    0.000000][    T0]   Normal   [mem 0x0000000100000000-0x00000093fcffffff]

Maybe your memory starts between 0xe0000000-0xffffffff. That would be
problematic (although somewhat unwarranted).

Regards,
Nicolas
Qian Cai Oct. 21, 2019, 8:36 p.m. UTC | #7
> On Oct 21, 2019, at 1:55 PM, Nicolas Saenz Julienne <nsaenzjulienne@suse.de> wrote:
> 
> On Mon, 2019-10-21 at 13:25 -0400, Qian Cai wrote:
>>> On Oct 21, 2019, at 1:01 PM, Nicolas Saenz Julienne <nsaenzjulienne@suse.de>
>>> wrote:
>>> 
>>> Could you enable CMA debugging to see if anything interesting comes out of
>>> it.
>> 
>> I did but nothing interesting came out. Did you use the same config I gave?
> 
> Yes, aside from enabling ZONE_DMA.
> 
>> Also, it has those cmdline.
>> 
>> page_poison=on page_owner=on numa_balancing=enable \
>> systemd.unified_cgroup_hierarchy=1 debug_guardpage_minorder=1 \
>> page_alloc.shuffle=1
> 
> No luck, still works for me even after adding those extra flags. IIRC most of
> them (if not all) are not even parsed by the time CMA is configured.
> 
> So, can you confirm the zones setup you're seeing is similar to this one:
> 
> [    0.000000][    T0] Zone ranges:
> [    0.000000][    T0]   DMA      [mem 0x00000000802f0000-0x00000000bfffffff]
> [    0.000000][    T0]   DMA32    [mem 0x00000000c0000000-0x00000000ffffffff]
> [    0.000000][    T0]   Normal   [mem 0x0000000100000000-0x00000093fcffffff]
> 
> Maybe your memory starts between 0xe0000000-0xffffffff. That would be
> problematic (although somewhat unwarranted).

I managed to get more information here,

[    0.000000] cma: dma_contiguous_reserve(limit c0000000)
[    0.000000] cma: dma_contiguous_reserve: reserving 64 MiB for global area
[    0.000000] cma: cma_declare_contiguous(size 0x0000000004000000, base 0x0000000000000000, limit 0x00000000c0000000 alignment 0x0000000000000000)
[    0.000000] cma: Failed to reserve 512 MiB

Full dmesg:

https://cailca.github.io/files/dmesg.txt
Nicolas Saenz Julienne Oct. 22, 2019, 11:23 a.m. UTC | #8
On Mon, 2019-10-21 at 16:36 -0400, Qian Cai wrote:
> I managed to get more information here,
> 
> [    0.000000] cma: dma_contiguous_reserve(limit c0000000)
> [    0.000000] cma: dma_contiguous_reserve: reserving 64 MiB for global area
> [    0.000000] cma: cma_declare_contiguous(size 0x0000000004000000, base
> 0x0000000000000000, limit 0x00000000c0000000 alignment 0x0000000000000000)
> [    0.000000] cma: Failed to reserve 512 MiB
> 
> Full dmesg:
> 
> https://cailca.github.io/files/dmesg.txt

OK I got it, reproduced it too.

Here are the relevant logs:

	[    0.000000]   DMA      [mem 0x00000000802f0000-0x00000000bfffffff]
	[    0.000000]   DMA32    [mem 0x00000000c0000000-0x00000000ffffffff]
	[    0.000000]   Normal   [mem 0x0000000100000000-0x00000097fcffffff]

As you can see ZONE_DMA spans from 0x00000000802f0000-0x00000000bfffffff which
is slightly smaller than 1GB.

	[    0.000000] crashkernel reserved: 0x000000009fe00000 - 0x00000000bfe00000 (512 MB)

Here crashkernel reserved 512M in ZONE_DMA.

	[    0.000000] cma: Failed to reserve 512 MiB

CMA tried to allocate 512M in ZONE_DMA which fails as there is no enough space.
Makes sense.

A fix could be moving crashkernel reservations after CMA and then if unable to
fit in ZONE_DMA try ZONE_DMA32 before bailing out. Maybe it's a little over the
top, yet although most devices will be fine with ZONE_DMA32, the RPi4 needs
crashkernel to be reserved in ZONE_DMA.

My knowledge of Kdump is limited, so I'd love to see what Catalin has to say.
Here's a tested patch of what I'm proposing:

diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 120c26af916b..49f3c3a34ae2 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -76,6 +76,7 @@ phys_addr_t arm64_dma32_phys_limit __ro_after_init;
 static void __init reserve_crashkernel(void)
 {
        unsigned long long crash_base, crash_size;
+       phys_addr_t limit = arm64_dma_phys_limit;
        int ret;

        ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
@@ -86,11 +87,14 @@ static void __init reserve_crashkernel(void)

        crash_size = PAGE_ALIGN(crash_size);

+again:
        if (crash_base == 0) {
                /* Current arm64 boot protocol requires 2MB alignment */
-               crash_base = memblock_find_in_range(0, ARCH_LOW_ADDRESS_LIMIT,
-                               crash_size, SZ_2M);
-               if (crash_base == 0) {
+               crash_base = memblock_find_in_range(0, limit, crash_size,
SZ_2M);
+               if (!crash_base && limit == arm64_dma_phys_limit) {
+                       limit = arm64_dma32_phys_limit;
+                       goto again;
+               } else if (!crash_base && limit == arm64_dma32_phys_limit) {
                        pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
                                crash_size);
                        return;
@@ -448,13 +452,13 @@ void __init arm64_memblock_init(void)
        else
                arm64_dma32_phys_limit = PHYS_MASK + 1;

-       reserve_crashkernel();
-
        reserve_elfcorehdr();

        high_memory = __va(memblock_end_of_DRAM() - 1) + 1;

        dma_contiguous_reserve(arm64_dma_phys_limit ? : arm64_dma32_phys_limit);
+
+       reserve_crashkernel();
 }

 void __init bootmem_init(void)


Regards,
Nicolas
Matthias Brugger Oct. 23, 2019, 7:11 a.m. UTC | #9
On 22/10/2019 13:23, Nicolas Saenz Julienne wrote:
> On Mon, 2019-10-21 at 16:36 -0400, Qian Cai wrote:
>> I managed to get more information here,
>>
>> [    0.000000] cma: dma_contiguous_reserve(limit c0000000)
>> [    0.000000] cma: dma_contiguous_reserve: reserving 64 MiB for global area
>> [    0.000000] cma: cma_declare_contiguous(size 0x0000000004000000, base
>> 0x0000000000000000, limit 0x00000000c0000000 alignment 0x0000000000000000)
>> [    0.000000] cma: Failed to reserve 512 MiB
>>
>> Full dmesg:
>>
>> https://cailca.github.io/files/dmesg.txt
> 
> OK I got it, reproduced it too.
> 
> Here are the relevant logs:
> 
> 	[    0.000000]   DMA      [mem 0x00000000802f0000-0x00000000bfffffff]
> 	[    0.000000]   DMA32    [mem 0x00000000c0000000-0x00000000ffffffff]
> 	[    0.000000]   Normal   [mem 0x0000000100000000-0x00000097fcffffff]
> 
> As you can see ZONE_DMA spans from 0x00000000802f0000-0x00000000bfffffff which
> is slightly smaller than 1GB.
> 
> 	[    0.000000] crashkernel reserved: 0x000000009fe00000 - 0x00000000bfe00000 (512 MB)
> 
> Here crashkernel reserved 512M in ZONE_DMA.
> 
> 	[    0.000000] cma: Failed to reserve 512 MiB
> 
> CMA tried to allocate 512M in ZONE_DMA which fails as there is no enough space.
> Makes sense.
> 
> A fix could be moving crashkernel reservations after CMA and then if unable to
> fit in ZONE_DMA try ZONE_DMA32 before bailing out. Maybe it's a little over the
> top, yet although most devices will be fine with ZONE_DMA32, the RPi4 needs
> crashkernel to be reserved in ZONE_DMA.
> 
> My knowledge of Kdump is limited, so I'd love to see what Catalin has to say.
> Here's a tested patch of what I'm proposing:
> 
> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
> index 120c26af916b..49f3c3a34ae2 100644
> --- a/arch/arm64/mm/init.c
> +++ b/arch/arm64/mm/init.c
> @@ -76,6 +76,7 @@ phys_addr_t arm64_dma32_phys_limit __ro_after_init;
>  static void __init reserve_crashkernel(void)
>  {
>         unsigned long long crash_base, crash_size;
> +       phys_addr_t limit = arm64_dma_phys_limit;
>         int ret;
> 
>         ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
> @@ -86,11 +87,14 @@ static void __init reserve_crashkernel(void)
> 
>         crash_size = PAGE_ALIGN(crash_size);
> 
> +again:
>         if (crash_base == 0) {
>                 /* Current arm64 boot protocol requires 2MB alignment */
> -               crash_base = memblock_find_in_range(0, ARCH_LOW_ADDRESS_LIMIT,
> -                               crash_size, SZ_2M);
> -               if (crash_base == 0) {
> +               crash_base = memblock_find_in_range(0, limit, crash_size,
> SZ_2M);
> +               if (!crash_base && limit == arm64_dma_phys_limit) {
> +                       limit = arm64_dma32_phys_limit;
> +                       goto again;

I'd try to avoid the goto.
Apart from that we should write some information message that the crashkernel
got reserved in arm64_dma_phys_limit. Otherwise RPi4 might break silently and
this will give the user at least a hint what happened.

Regards,
Matthias

> +               } else if (!crash_base && limit == arm64_dma32_phys_limit) {
>                         pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
>                                 crash_size);
>                         return;
> @@ -448,13 +452,13 @@ void __init arm64_memblock_init(void)
>         else
>                 arm64_dma32_phys_limit = PHYS_MASK + 1;
> 
> -       reserve_crashkernel();
> -
>         reserve_elfcorehdr();
> 
>         high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
> 
>         dma_contiguous_reserve(arm64_dma_phys_limit ? : arm64_dma32_phys_limit);
> +
> +       reserve_crashkernel();
>  }
> 
>  void __init bootmem_init(void)
> 
> 
> Regards,
> Nicolas
>
Catalin Marinas Oct. 31, 2019, 3:51 p.m. UTC | #10
(sorry, I've been away last week and only now caught up with emails)

On Tue, Oct 22, 2019 at 01:23:32PM +0200, Nicolas Saenz Julienne wrote:
> On Mon, 2019-10-21 at 16:36 -0400, Qian Cai wrote:
> > I managed to get more information here,
> > 
> > [    0.000000] cma: dma_contiguous_reserve(limit c0000000)
> > [    0.000000] cma: dma_contiguous_reserve: reserving 64 MiB for global area
> > [    0.000000] cma: cma_declare_contiguous(size 0x0000000004000000, base
> > 0x0000000000000000, limit 0x00000000c0000000 alignment 0x0000000000000000)
> > [    0.000000] cma: Failed to reserve 512 MiB
> > 
> > Full dmesg:
> > 
> > https://cailca.github.io/files/dmesg.txt
> 
> OK I got it, reproduced it too.
> 
> Here are the relevant logs:
> 
> 	[    0.000000]   DMA      [mem 0x00000000802f0000-0x00000000bfffffff]
> 	[    0.000000]   DMA32    [mem 0x00000000c0000000-0x00000000ffffffff]
> 	[    0.000000]   Normal   [mem 0x0000000100000000-0x00000097fcffffff]
> 
> As you can see ZONE_DMA spans from 0x00000000802f0000-0x00000000bfffffff which
> is slightly smaller than 1GB.
> 
> 	[    0.000000] crashkernel reserved: 0x000000009fe00000 - 0x00000000bfe00000 (512 MB)
> 
> Here crashkernel reserved 512M in ZONE_DMA.
> 
> 	[    0.000000] cma: Failed to reserve 512 MiB
> 
> CMA tried to allocate 512M in ZONE_DMA which fails as there is no enough space.
> Makes sense.
> 
> A fix could be moving crashkernel reservations after CMA and then if unable to
> fit in ZONE_DMA try ZONE_DMA32 before bailing out. Maybe it's a little over the
> top, yet although most devices will be fine with ZONE_DMA32, the RPi4 needs
> crashkernel to be reserved in ZONE_DMA.

Does RPi4 need CMA in ZONE_DMA? If not, I'd rather reserve the CMA from
ZONE_DMA32.

Even if you moved the crash kernel, someone else might complain that
they had 2GB of CMA and it no longer works.
Nicolas Saenz Julienne Oct. 31, 2019, 4:04 p.m. UTC | #11
On Thu, 2019-10-31 at 15:51 +0000, Catalin Marinas wrote:
> (sorry, I've been away last week and only now caught up with emails)
> 
> On Tue, Oct 22, 2019 at 01:23:32PM +0200, Nicolas Saenz Julienne wrote:
> > On Mon, 2019-10-21 at 16:36 -0400, Qian Cai wrote:
> > > I managed to get more information here,
> > > 
> > > [    0.000000] cma: dma_contiguous_reserve(limit c0000000)
> > > [    0.000000] cma: dma_contiguous_reserve: reserving 64 MiB for global
> > > area
> > > [    0.000000] cma: cma_declare_contiguous(size 0x0000000004000000, base
> > > 0x0000000000000000, limit 0x00000000c0000000 alignment 0x0000000000000000)
> > > [    0.000000] cma: Failed to reserve 512 MiB
> > > 
> > > Full dmesg:
> > > 
> > > https://cailca.github.io/files/dmesg.txt
> > 
> > OK I got it, reproduced it too.
> > 
> > Here are the relevant logs:
> > 
> > 	[    0.000000]   DMA      [mem 0x00000000802f0000-0x00000000bfffffff]
> > 	[    0.000000]   DMA32    [mem 0x00000000c0000000-0x00000000ffffffff]
> > 	[    0.000000]   Normal   [mem 0x0000000100000000-0x00000097fcffffff]
> > 
> > As you can see ZONE_DMA spans from 0x00000000802f0000-0x00000000bfffffff
> > which
> > is slightly smaller than 1GB.
> > 
> > 	[    0.000000] crashkernel reserved: 0x000000009fe00000 -
> > 0x00000000bfe00000 (512 MB)
> > 
> > Here crashkernel reserved 512M in ZONE_DMA.
> > 
> > 	[    0.000000] cma: Failed to reserve 512 MiB
> > 
> > CMA tried to allocate 512M in ZONE_DMA which fails as there is no enough
> > space.
> > Makes sense.
> > 
> > A fix could be moving crashkernel reservations after CMA and then if unable
> > to
> > fit in ZONE_DMA try ZONE_DMA32 before bailing out. Maybe it's a little over
> > the
> > top, yet although most devices will be fine with ZONE_DMA32, the RPi4 needs
> > crashkernel to be reserved in ZONE_DMA.
> 
> Does RPi4 need CMA in ZONE_DMA? If not, I'd rather reserve the CMA from
> ZONE_DMA32.

Yes, CMA is imperatively to be reserved in ZONE_DMA.

> Even if you moved the crash kernel, someone else might complain that
> they had 2GB of CMA and it no longer works.

I have yet to look into it, but I've been told that on x86/x64 they have a
'high' flag to be set alongside with crashkernel that forces the allocation
into ZONE_DMA32. We could mimic this behavior for big servers that don't depend
on ZONE_DMA but need to reserve big chunks of memory.

Regards,
Nicolas
Catalin Marinas Oct. 31, 2019, 6:02 p.m. UTC | #12
On Thu, Oct 31, 2019 at 05:04:34PM +0100, Nicolas Saenz Julienne wrote:
> On Thu, 2019-10-31 at 15:51 +0000, Catalin Marinas wrote:
> > (sorry, I've been away last week and only now caught up with emails)
> > 
> > On Tue, Oct 22, 2019 at 01:23:32PM +0200, Nicolas Saenz Julienne wrote:
> > > On Mon, 2019-10-21 at 16:36 -0400, Qian Cai wrote:
> > > > I managed to get more information here,
> > > > 
> > > > [    0.000000] cma: dma_contiguous_reserve(limit c0000000)
> > > > [    0.000000] cma: dma_contiguous_reserve: reserving 64 MiB for global
> > > > area
> > > > [    0.000000] cma: cma_declare_contiguous(size 0x0000000004000000, base
> > > > 0x0000000000000000, limit 0x00000000c0000000 alignment 0x0000000000000000)
> > > > [    0.000000] cma: Failed to reserve 512 MiB
> > > > 
> > > > Full dmesg:
> > > > 
> > > > https://cailca.github.io/files/dmesg.txt
> > > 
> > > OK I got it, reproduced it too.
> > > 
> > > Here are the relevant logs:
> > > 
> > > 	[    0.000000]   DMA      [mem 0x00000000802f0000-0x00000000bfffffff]
> > > 	[    0.000000]   DMA32    [mem 0x00000000c0000000-0x00000000ffffffff]
> > > 	[    0.000000]   Normal   [mem 0x0000000100000000-0x00000097fcffffff]
> > > 
> > > As you can see ZONE_DMA spans from 0x00000000802f0000-0x00000000bfffffff
> > > which
> > > is slightly smaller than 1GB.
> > > 
> > > 	[    0.000000] crashkernel reserved: 0x000000009fe00000 -
> > > 0x00000000bfe00000 (512 MB)
> > > 
> > > Here crashkernel reserved 512M in ZONE_DMA.
> > > 
> > > 	[    0.000000] cma: Failed to reserve 512 MiB
> > > 
> > > CMA tried to allocate 512M in ZONE_DMA which fails as there is no enough
> > > space.
> > > Makes sense.
> > > 
> > > A fix could be moving crashkernel reservations after CMA and then if unable
> > > to
> > > fit in ZONE_DMA try ZONE_DMA32 before bailing out. Maybe it's a little over
> > > the
> > > top, yet although most devices will be fine with ZONE_DMA32, the RPi4 needs
> > > crashkernel to be reserved in ZONE_DMA.
> > 
> > Does RPi4 need CMA in ZONE_DMA? If not, I'd rather reserve the CMA from
> > ZONE_DMA32.
> 
> Yes, CMA is imperatively to be reserved in ZONE_DMA.
> 
> > Even if you moved the crash kernel, someone else might complain that
> > they had 2GB of CMA and it no longer works.
> 
> I have yet to look into it, but I've been told that on x86/x64 they have a
> 'high' flag to be set alongside with crashkernel that forces the allocation
> into ZONE_DMA32. We could mimic this behavior for big servers that don't depend
> on ZONE_DMA but need to reserve big chunks of memory.

The 'high' flag actually talks about crashkernel reserved above 4G which
is not really the case here. Since RPi4 is the odd one out, I'd rather
have the default crashkernel and CMA in the ZONE_DMA32 (current mainline
behaviour) and have the RPi4 use explicit size@offset parameters for
crashkernel and cma.
Nicolas Saenz Julienne Oct. 31, 2019, 6:11 p.m. UTC | #13
On Thu, 2019-10-31 at 18:02 +0000, Catalin Marinas wrote:
> On Thu, Oct 31, 2019 at 05:04:34PM +0100, Nicolas Saenz Julienne wrote:
> > On Thu, 2019-10-31 at 15:51 +0000, Catalin Marinas wrote:
> > > (sorry, I've been away last week and only now caught up with emails)
> > > 
> > > On Tue, Oct 22, 2019 at 01:23:32PM +0200, Nicolas Saenz Julienne wrote:
> > > > On Mon, 2019-10-21 at 16:36 -0400, Qian Cai wrote:
> > > > > I managed to get more information here,
> > > > > 
> > > > > [    0.000000] cma: dma_contiguous_reserve(limit c0000000)
> > > > > [    0.000000] cma: dma_contiguous_reserve: reserving 64 MiB for
> > > > > global
> > > > > area
> > > > > [    0.000000] cma: cma_declare_contiguous(size 0x0000000004000000,
> > > > > base
> > > > > 0x0000000000000000, limit 0x00000000c0000000 alignment
> > > > > 0x0000000000000000)
> > > > > [    0.000000] cma: Failed to reserve 512 MiB
> > > > > 
> > > > > Full dmesg:
> > > > > 
> > > > > https://cailca.github.io/files/dmesg.txt
> > > > 
> > > > OK I got it, reproduced it too.
> > > > 
> > > > Here are the relevant logs:
> > > > 
> > > > 	[    0.000000]   DMA      [mem 0x00000000802f0000-
> > > > 0x00000000bfffffff]
> > > > 	[    0.000000]   DMA32    [mem 0x00000000c0000000-
> > > > 0x00000000ffffffff]
> > > > 	[    0.000000]   Normal   [mem 0x0000000100000000-
> > > > 0x00000097fcffffff]
> > > > 
> > > > As you can see ZONE_DMA spans from 0x00000000802f0000-0x00000000bfffffff
> > > > which
> > > > is slightly smaller than 1GB.
> > > > 
> > > > 	[    0.000000] crashkernel reserved: 0x000000009fe00000 -
> > > > 0x00000000bfe00000 (512 MB)
> > > > 
> > > > Here crashkernel reserved 512M in ZONE_DMA.
> > > > 
> > > > 	[    0.000000] cma: Failed to reserve 512 MiB
> > > > 
> > > > CMA tried to allocate 512M in ZONE_DMA which fails as there is no enough
> > > > space.
> > > > Makes sense.
> > > > 
> > > > A fix could be moving crashkernel reservations after CMA and then if
> > > > unable
> > > > to
> > > > fit in ZONE_DMA try ZONE_DMA32 before bailing out. Maybe it's a little
> > > > over
> > > > the
> > > > top, yet although most devices will be fine with ZONE_DMA32, the RPi4
> > > > needs
> > > > crashkernel to be reserved in ZONE_DMA.
> > > 
> > > Does RPi4 need CMA in ZONE_DMA? If not, I'd rather reserve the CMA from
> > > ZONE_DMA32.
> > 
> > Yes, CMA is imperatively to be reserved in ZONE_DMA.
> > 
> > > Even if you moved the crash kernel, someone else might complain that
> > > they had 2GB of CMA and it no longer works.
> > 
> > I have yet to look into it, but I've been told that on x86/x64 they have a
> > 'high' flag to be set alongside with crashkernel that forces the allocation
> > into ZONE_DMA32. We could mimic this behavior for big servers that don't
> > depend
> > on ZONE_DMA but need to reserve big chunks of memory.
> 
> The 'high' flag actually talks about crashkernel reserved above 4G which
> is not really the case here. Since RPi4 is the odd one out, I'd rather
> have the default crashkernel and CMA in the ZONE_DMA32 (current mainline
> behaviour) and have the RPi4 use explicit size@offset parameters for
> crashkernel and cma.

Fair enough, I'll send a fix for this on Monday if it's OK with you.
Catalin Marinas Oct. 31, 2019, 6:13 p.m. UTC | #14
On Thu, Oct 31, 2019 at 07:11:27PM +0100, Nicolas Saenz Julienne wrote:
> On Thu, 2019-10-31 at 18:02 +0000, Catalin Marinas wrote:
> > On Thu, Oct 31, 2019 at 05:04:34PM +0100, Nicolas Saenz Julienne wrote:
> > > On Thu, 2019-10-31 at 15:51 +0000, Catalin Marinas wrote:
> > > > (sorry, I've been away last week and only now caught up with emails)
> > > > 
> > > > On Tue, Oct 22, 2019 at 01:23:32PM +0200, Nicolas Saenz Julienne wrote:
> > > > > On Mon, 2019-10-21 at 16:36 -0400, Qian Cai wrote:
> > > > > > I managed to get more information here,
> > > > > > 
> > > > > > [    0.000000] cma: dma_contiguous_reserve(limit c0000000)
> > > > > > [    0.000000] cma: dma_contiguous_reserve: reserving 64 MiB for
> > > > > > global
> > > > > > area
> > > > > > [    0.000000] cma: cma_declare_contiguous(size 0x0000000004000000,
> > > > > > base
> > > > > > 0x0000000000000000, limit 0x00000000c0000000 alignment
> > > > > > 0x0000000000000000)
> > > > > > [    0.000000] cma: Failed to reserve 512 MiB
> > > > > > 
> > > > > > Full dmesg:
> > > > > > 
> > > > > > https://cailca.github.io/files/dmesg.txt
> > > > > 
> > > > > OK I got it, reproduced it too.
> > > > > 
> > > > > Here are the relevant logs:
> > > > > 
> > > > > 	[    0.000000]   DMA      [mem 0x00000000802f0000-
> > > > > 0x00000000bfffffff]
> > > > > 	[    0.000000]   DMA32    [mem 0x00000000c0000000-
> > > > > 0x00000000ffffffff]
> > > > > 	[    0.000000]   Normal   [mem 0x0000000100000000-
> > > > > 0x00000097fcffffff]
> > > > > 
> > > > > As you can see ZONE_DMA spans from 0x00000000802f0000-0x00000000bfffffff
> > > > > which
> > > > > is slightly smaller than 1GB.
> > > > > 
> > > > > 	[    0.000000] crashkernel reserved: 0x000000009fe00000 -
> > > > > 0x00000000bfe00000 (512 MB)
> > > > > 
> > > > > Here crashkernel reserved 512M in ZONE_DMA.
> > > > > 
> > > > > 	[    0.000000] cma: Failed to reserve 512 MiB
> > > > > 
> > > > > CMA tried to allocate 512M in ZONE_DMA which fails as there is no enough
> > > > > space.
> > > > > Makes sense.
> > > > > 
> > > > > A fix could be moving crashkernel reservations after CMA and then if
> > > > > unable
> > > > > to
> > > > > fit in ZONE_DMA try ZONE_DMA32 before bailing out. Maybe it's a little
> > > > > over
> > > > > the
> > > > > top, yet although most devices will be fine with ZONE_DMA32, the RPi4
> > > > > needs
> > > > > crashkernel to be reserved in ZONE_DMA.
> > > > 
> > > > Does RPi4 need CMA in ZONE_DMA? If not, I'd rather reserve the CMA from
> > > > ZONE_DMA32.
> > > 
> > > Yes, CMA is imperatively to be reserved in ZONE_DMA.
> > > 
> > > > Even if you moved the crash kernel, someone else might complain that
> > > > they had 2GB of CMA and it no longer works.
> > > 
> > > I have yet to look into it, but I've been told that on x86/x64 they have a
> > > 'high' flag to be set alongside with crashkernel that forces the allocation
> > > into ZONE_DMA32. We could mimic this behavior for big servers that don't
> > > depend
> > > on ZONE_DMA but need to reserve big chunks of memory.
> > 
> > The 'high' flag actually talks about crashkernel reserved above 4G which
> > is not really the case here. Since RPi4 is the odd one out, I'd rather
> > have the default crashkernel and CMA in the ZONE_DMA32 (current mainline
> > behaviour) and have the RPi4 use explicit size@offset parameters for
> > crashkernel and cma.
> 
> Fair enough, I'll send a fix for this on Monday if it's OK with you.

That's fine. Thanks.
John Stultz Dec. 3, 2019, 5:08 a.m. UTC | #15
On Wed, Sep 11, 2019 at 11:26 AM Nicolas Saenz Julienne
<nsaenzjulienne@suse.de> wrote:
> So far all arm64 devices have supported 32 bit DMA masks for their
> peripherals. This is not true anymore for the Raspberry Pi 4 as most of
> it's peripherals can only address the first GB of memory on a total of
> up to 4 GB.
>
> This goes against ZONE_DMA32's intent, as it's expected for ZONE_DMA32
> to be addressable with a 32 bit mask. So it was decided to re-introduce
> ZONE_DMA in arm64.
>
> ZONE_DMA will contain the lower 1G of memory, which is currently the
> memory area addressable by any peripheral on an arm64 device.
> ZONE_DMA32 will contain the rest of the 32 bit addressable memory.
>
> Signed-off-by: Nicolas Saenz Julienne <nsaenzjulienne@suse.de>
> Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>

Hey Nicolas,
  Testing the db845c with linus/master, I found a regression causing
system hangs in early boot:

[    0.000000] Booting Linux on physical CPU 0x0000000000 [0x517f803c]
[    0.000000] Linux version 5.4.0-mainline-10675-g957a03b9e38f
(docker@a4ec90a1e72c) (gcc version 7.4.0 (Ubuntu/Linaro
7.4.0-1ubuntu1~18.04.1)) #1209 SMP PREEMPT Tue Dec 3 00:23:15 UTC 2019
[    0.000000] Machine model: Thundercomm Dragonboard 845c
[    0.000000] earlycon: qcom_geni0 at MMIO 0x0000000000a84000
(options '115200n8')
[    0.000000] printk: bootconsole [qcom_geni0] enabled
[    0.000000] efi: Getting EFI parameters from FDT:
[    0.000000] efi: UEFI not found.
[    0.000000] cma: Reserved 16 MiB at 0x00000000ff000000
[    0.000000] psci: probing for conduit method from DT.
[    0.000000] psci: PSCIv1.1 detected in firmware.
[    0.000000] psci: Using standard PSCI v0.2 function IDs
[    0.000000] psci: MIGRATE_INFO_TYPE not supported.
[    0.000000] psci: SMC Calling Convention v1.0
[    0.000000] psci: OSI mode supported.
[    0.000000] percpu: Embedded 31 pages/cpu s87512 r8192 d31272 u126976
[    0.000000] Detected VIPT I-cache on CPU0
[    0.000000] CPU features: detected: GIC system register CPU interface
[    0.000000] CPU features: kernel page table isolation forced ON by KASLR
[    0.000000] CPU features: detected: Kernel page table isolation (KPTI)
[    0.000000] ARM_SMCCC_ARCH_WORKAROUND_1 missing from firmware
[    0.000000] CPU features: detected: Hardware dirty bit management
[    0.000000] Built 1 zonelists, mobility grouping on.  Total pages: -188245
[    0.000000] Kernel command line: earlycon
firmware_class.path=/vendor/firmware/ androidboot.hardware=db845c
init=/init androidboot.boot_devices=soc/1d84000.ufshc
printk.devkmsg=on buildvariant=userdebug root=/dev/sda2
androidboot.bootdevice=1d84000.ufshc androidboot.serialno=c4e1189c
androidboot.baseband=sda
msm_drm.dsi_display0=dsi_lt9611_1080_video_display:
androidboot.slot_suffix=_a skip_initramfs rootwait ro init=/init

<hangs indefinitely here>

I bisected the issue down to this patch (1a8e1cef7603 upstream - the
previous patch a573cdd7973d works though I need to apply the
arm64_dma_phys_limit bit from this one as the previous patch doesn't
build on its own).

In the above log:
[    0.000000] Built 1 zonelists, mobility grouping on.  Total pages: -188245
looks the most suspect, and going back to the working a573cdd7973d +
build fix I see:
[    0.000000] Built 1 zonelists, mobility grouping on.  Total pages: 957419

Do you have any suggestions for what might be going wrong?

thanks
-john
John Stultz Dec. 3, 2019, 5:38 a.m. UTC | #16
On Mon, Dec 2, 2019 at 9:08 PM John Stultz <john.stultz@linaro.org> wrote:
>
> On Wed, Sep 11, 2019 at 11:26 AM Nicolas Saenz Julienne
> <nsaenzjulienne@suse.de> wrote:
> > So far all arm64 devices have supported 32 bit DMA masks for their
> > peripherals. This is not true anymore for the Raspberry Pi 4 as most of
> > it's peripherals can only address the first GB of memory on a total of
> > up to 4 GB.
> >
> > This goes against ZONE_DMA32's intent, as it's expected for ZONE_DMA32
> > to be addressable with a 32 bit mask. So it was decided to re-introduce
> > ZONE_DMA in arm64.
> >
> > ZONE_DMA will contain the lower 1G of memory, which is currently the
> > memory area addressable by any peripheral on an arm64 device.
> > ZONE_DMA32 will contain the rest of the 32 bit addressable memory.
> >
> > Signed-off-by: Nicolas Saenz Julienne <nsaenzjulienne@suse.de>
> > Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
>
> Hey Nicolas,
>   Testing the db845c with linus/master, I found a regression causing
> system hangs in early boot:
>
> [    0.000000] Booting Linux on physical CPU 0x0000000000 [0x517f803c]
> [    0.000000] Linux version 5.4.0-mainline-10675-g957a03b9e38f
> (docker@a4ec90a1e72c) (gcc version 7.4.0 (Ubuntu/Linaro
> 7.4.0-1ubuntu1~18.04.1)) #1209 SMP PREEMPT Tue Dec 3 00:23:15 UTC 2019
> [    0.000000] Machine model: Thundercomm Dragonboard 845c
> [    0.000000] earlycon: qcom_geni0 at MMIO 0x0000000000a84000
> (options '115200n8')
> [    0.000000] printk: bootconsole [qcom_geni0] enabled
> [    0.000000] efi: Getting EFI parameters from FDT:
> [    0.000000] efi: UEFI not found.
> [    0.000000] cma: Reserved 16 MiB at 0x00000000ff000000
> [    0.000000] psci: probing for conduit method from DT.
> [    0.000000] psci: PSCIv1.1 detected in firmware.
> [    0.000000] psci: Using standard PSCI v0.2 function IDs
> [    0.000000] psci: MIGRATE_INFO_TYPE not supported.
> [    0.000000] psci: SMC Calling Convention v1.0
> [    0.000000] psci: OSI mode supported.
> [    0.000000] percpu: Embedded 31 pages/cpu s87512 r8192 d31272 u126976
> [    0.000000] Detected VIPT I-cache on CPU0
> [    0.000000] CPU features: detected: GIC system register CPU interface
> [    0.000000] CPU features: kernel page table isolation forced ON by KASLR
> [    0.000000] CPU features: detected: Kernel page table isolation (KPTI)
> [    0.000000] ARM_SMCCC_ARCH_WORKAROUND_1 missing from firmware
> [    0.000000] CPU features: detected: Hardware dirty bit management
> [    0.000000] Built 1 zonelists, mobility grouping on.  Total pages: -188245
> [    0.000000] Kernel command line: earlycon
> firmware_class.path=/vendor/firmware/ androidboot.hardware=db845c
> init=/init androidboot.boot_devices=soc/1d84000.ufshc
> printk.devkmsg=on buildvariant=userdebug root=/dev/sda2
> androidboot.bootdevice=1d84000.ufshc androidboot.serialno=c4e1189c
> androidboot.baseband=sda
> msm_drm.dsi_display0=dsi_lt9611_1080_video_display:
> androidboot.slot_suffix=_a skip_initramfs rootwait ro init=/init
>
> <hangs indefinitely here>
>
> I bisected the issue down to this patch (1a8e1cef7603 upstream - the
> previous patch a573cdd7973d works though I need to apply the
> arm64_dma_phys_limit bit from this one as the previous patch doesn't
> build on its own).
>
> In the above log:
> [    0.000000] Built 1 zonelists, mobility grouping on.  Total pages: -188245
> looks the most suspect, and going back to the working a573cdd7973d +
> build fix I see:
> [    0.000000] Built 1 zonelists, mobility grouping on.  Total pages: 957419
>
> Do you have any suggestions for what might be going wrong?

Digging further, it seems the error is found in calculate_node_totalpages()
 real_size = size - zone_absent_pages_in_node(pgdat->node_id, i,
                                                  node_start_pfn, node_end_pfn,
                                                  zholes_size);

Where for zone DMA32 size is 262144, but real_size is calculated as -883520.

I've not traced through to figure out why zone_absent_pages_in_node is
coming up with such a large number yet, but I'm about to crash so I
wanted to share.

thanks
-john
John Stultz Dec. 3, 2019, 6:03 a.m. UTC | #17
On Mon, Dec 2, 2019 at 9:38 PM John Stultz <john.stultz@linaro.org> wrote:
> On Mon, Dec 2, 2019 at 9:08 PM John Stultz <john.stultz@linaro.org> wrote:
> > Hey Nicolas,
> >   Testing the db845c with linus/master, I found a regression causing
> > system hangs in early boot:
...
> > In the above log:
> > [    0.000000] Built 1 zonelists, mobility grouping on.  Total pages: -188245
> > looks the most suspect, and going back to the working a573cdd7973d +
> > build fix I see:
> > [    0.000000] Built 1 zonelists, mobility grouping on.  Total pages: 957419
> >
> > Do you have any suggestions for what might be going wrong?
>
> Digging further, it seems the error is found in calculate_node_totalpages()
>  real_size = size - zone_absent_pages_in_node(pgdat->node_id, i,
>                                                   node_start_pfn, node_end_pfn,
>                                                   zholes_size);
>
> Where for zone DMA32 size is 262144, but real_size is calculated as -883520.
>
> I've not traced through to figure out why zone_absent_pages_in_node is
> coming up with such a large number yet, but I'm about to crash so I
> wanted to share.

Ok, narrowing it down further, it seems its the following bit from the patch:

> @@ -201,13 +212,18 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max)
>         struct memblock_region *reg;
>         unsigned long zone_size[MAX_NR_ZONES], zhole_size[MAX_NR_ZONES];
>         unsigned long max_dma32 = min;
> +       unsigned long max_dma = min;
>
>         memset(zone_size, 0, sizeof(zone_size));
>
> -       /* 4GB maximum for 32-bit only capable devices */
> +#ifdef CONFIG_ZONE_DMA
> +       max_dma = PFN_DOWN(arm64_dma_phys_limit);
> +       zone_size[ZONE_DMA] = max_dma - min;
> +       max_dma32 = max_dma;
> +#endif
>  #ifdef CONFIG_ZONE_DMA32
>         max_dma32 = PFN_DOWN(arm64_dma32_phys_limit);
> -       zone_size[ZONE_DMA32] = max_dma32 - min;
> +       zone_size[ZONE_DMA32] = max_dma32 - max_dma;
>  #endif
>         zone_size[ZONE_NORMAL] = max - max_dma32;
>
> @@ -219,11 +235,17 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max)
>
>                 if (start >= max)
>                         continue;
> -
> +#ifdef CONFIG_ZONE_DMA
> +               if (start < max_dma) {
> +                       unsigned long dma_end = min_not_zero(end, max_dma);
> +                       zhole_size[ZONE_DMA] -= dma_end - start;
> +               }
> +#endif
>  #ifdef CONFIG_ZONE_DMA32
>                 if (start < max_dma32) {
> -                       unsigned long dma_end = min(end, max_dma32);
> -                       zhole_size[ZONE_DMA32] -= dma_end - start;
> +                       unsigned long dma32_end = min(end, max_dma32);
> +                       unsigned long dma32_start = max(start, max_dma);
> +                       zhole_size[ZONE_DMA32] -= dma32_end - dma32_start;
>                 }
>  #endif
>                 if (end > max_dma32) {

The zhole_sizes end up being:
zhole_size: DMA: 67671, DMA32: 1145664 NORMAL: 0

This seems to be due to dma32_start being calculated as 786432 each
time - I'm guessing that's the max_dma value.
Where dma32_end is around 548800, but changes each iteration (so we
end up subtracting a negative value each pass, growing the size).

thanks
-john
Will Deacon Dec. 3, 2019, 10:12 a.m. UTC | #18
Hi John,

On Mon, Dec 02, 2019 at 10:03:17PM -0800, John Stultz wrote:
> Ok, narrowing it down further, it seems its the following bit from the
> patch:
> 
> > @@ -201,13 +212,18 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max)
> >         struct memblock_region *reg;
> >         unsigned long zone_size[MAX_NR_ZONES], zhole_size[MAX_NR_ZONES];
> >         unsigned long max_dma32 = min;
> > +       unsigned long max_dma = min;
> >
> >         memset(zone_size, 0, sizeof(zone_size));
> >
> > -       /* 4GB maximum for 32-bit only capable devices */
> > +#ifdef CONFIG_ZONE_DMA
> > +       max_dma = PFN_DOWN(arm64_dma_phys_limit);
> > +       zone_size[ZONE_DMA] = max_dma - min;
> > +       max_dma32 = max_dma;
> > +#endif
> >  #ifdef CONFIG_ZONE_DMA32
> >         max_dma32 = PFN_DOWN(arm64_dma32_phys_limit);
> > -       zone_size[ZONE_DMA32] = max_dma32 - min;
> > +       zone_size[ZONE_DMA32] = max_dma32 - max_dma;
> >  #endif
> >         zone_size[ZONE_NORMAL] = max - max_dma32;
> >
> > @@ -219,11 +235,17 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max)
> >
> >                 if (start >= max)
> >                         continue;
> > -
> > +#ifdef CONFIG_ZONE_DMA
> > +               if (start < max_dma) {
> > +                       unsigned long dma_end = min_not_zero(end, max_dma);
> > +                       zhole_size[ZONE_DMA] -= dma_end - start;
> > +               }
> > +#endif
> >  #ifdef CONFIG_ZONE_DMA32
> >                 if (start < max_dma32) {
> > -                       unsigned long dma_end = min(end, max_dma32);
> > -                       zhole_size[ZONE_DMA32] -= dma_end - start;
> > +                       unsigned long dma32_end = min(end, max_dma32);
> > +                       unsigned long dma32_start = max(start, max_dma);
> > +                       zhole_size[ZONE_DMA32] -= dma32_end - dma32_start;
> >                 }
> >  #endif
> >                 if (end > max_dma32) {
> 
> The zhole_sizes end up being:
> zhole_size: DMA: 67671, DMA32: 1145664 NORMAL: 0
> 
> This seems to be due to dma32_start being calculated as 786432 each
> time - I'm guessing that's the max_dma value.
> Where dma32_end is around 548800, but changes each iteration (so we
> end up subtracting a negative value each pass, growing the size).

Yeah, this logic looks utterly buggered to me so I'm surprised that nobody
else has seen the problem. In particlar, kernelci is reporting success
on the same SoC :/

https://kernelci.org/boot/sdm845-db845c/

The logs also don't seem to match up with the trees either. For example,
looking at the boot logs for:

https://kernelci.org/boot/id/5de5fc60b1ed6e2d46960f08/

It claims that the kernel is "next-2019-12-30" but the dmesg says:

[    0.000000] Linux version 5.4.0 (KernelCI@b19b74fe311d) (gcc version
8.3.0 (Debian 8.3.0-2)) #1 SMP PREEMPT Tue Dec 3 03:14:07 UTC 2019

Which isn't great.

Anyway, I've had a go at fixing it below but it's 100% untested. I think
the issue is that your SoC has memblocks contained entirely within the
ZONE_DMA region and we don't cater for that at all when processing the
ZONE_DMA32 region.

Will

--->8

diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index be9481cdf3b9..af365ce59ed6 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -242,19 +242,19 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max)
 		if (start < max_dma) {
 			unsigned long dma_end = min_not_zero(end, max_dma);
 			zhole_size[ZONE_DMA] -= dma_end - start;
+			start = dma_end;
 		}
 #endif
 #ifdef CONFIG_ZONE_DMA32
-		if (start < max_dma32) {
+		if (start >= max_dma && start < max_dma32) {
 			unsigned long dma32_end = min(end, max_dma32);
-			unsigned long dma32_start = max(start, max_dma);
-			zhole_size[ZONE_DMA32] -= dma32_end - dma32_start;
+			zhole_size[ZONE_DMA32] -= dma32_end - start;
+			start = dma32_end;
 		}
 #endif
-		if (end > max_dma32) {
+		if (start >= max_dma32) {
 			unsigned long normal_end = min(end, max);
-			unsigned long normal_start = max(start, max_dma32);
-			zhole_size[ZONE_NORMAL] -= normal_end - normal_start;
+			zhole_size[ZONE_NORMAL] -= normal_end - start;
 		}
 	}
Catalin Marinas Dec. 3, 2019, 11:19 a.m. UTC | #19
On Tue, Dec 03, 2019 at 10:12:50AM +0000, Will Deacon wrote:
> On Mon, Dec 02, 2019 at 10:03:17PM -0800, John Stultz wrote:
> > Ok, narrowing it down further, it seems its the following bit from the
> > patch:
> > 
> > > @@ -201,13 +212,18 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max)
> > >         struct memblock_region *reg;
> > >         unsigned long zone_size[MAX_NR_ZONES], zhole_size[MAX_NR_ZONES];
> > >         unsigned long max_dma32 = min;
> > > +       unsigned long max_dma = min;
> > >
> > >         memset(zone_size, 0, sizeof(zone_size));
> > >
> > > -       /* 4GB maximum for 32-bit only capable devices */
> > > +#ifdef CONFIG_ZONE_DMA
> > > +       max_dma = PFN_DOWN(arm64_dma_phys_limit);
> > > +       zone_size[ZONE_DMA] = max_dma - min;
> > > +       max_dma32 = max_dma;
> > > +#endif
> > >  #ifdef CONFIG_ZONE_DMA32
> > >         max_dma32 = PFN_DOWN(arm64_dma32_phys_limit);
> > > -       zone_size[ZONE_DMA32] = max_dma32 - min;
> > > +       zone_size[ZONE_DMA32] = max_dma32 - max_dma;
> > >  #endif
> > >         zone_size[ZONE_NORMAL] = max - max_dma32;
> > >
> > > @@ -219,11 +235,17 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max)
> > >
> > >                 if (start >= max)
> > >                         continue;
> > > -
> > > +#ifdef CONFIG_ZONE_DMA
> > > +               if (start < max_dma) {
> > > +                       unsigned long dma_end = min_not_zero(end, max_dma);
> > > +                       zhole_size[ZONE_DMA] -= dma_end - start;
> > > +               }
> > > +#endif
> > >  #ifdef CONFIG_ZONE_DMA32
> > >                 if (start < max_dma32) {
> > > -                       unsigned long dma_end = min(end, max_dma32);
> > > -                       zhole_size[ZONE_DMA32] -= dma_end - start;
> > > +                       unsigned long dma32_end = min(end, max_dma32);
> > > +                       unsigned long dma32_start = max(start, max_dma);
> > > +                       zhole_size[ZONE_DMA32] -= dma32_end - dma32_start;
> > >                 }
> > >  #endif
> > >                 if (end > max_dma32) {
> > 
> > The zhole_sizes end up being:
> > zhole_size: DMA: 67671, DMA32: 1145664 NORMAL: 0
> > 
> > This seems to be due to dma32_start being calculated as 786432 each
> > time - I'm guessing that's the max_dma value.
> > Where dma32_end is around 548800, but changes each iteration (so we
> > end up subtracting a negative value each pass, growing the size).
[...]
> Anyway, I've had a go at fixing it below but it's 100% untested. I think
> the issue is that your SoC has memblocks contained entirely within the
> ZONE_DMA region and we don't cater for that at all when processing the
> ZONE_DMA32 region.

This seems to be issue, the SoC memory contained withing ZONE_DMA. I
managed to reproduce it under KVM/Qemu by reducing the amount of memory
given to the guest. You'd also need NUMA disabled to hit this code path.

Your proposed change fixes it but I'll send a tested-by on the full
patch when it hits the list.
diff mbox series

Patch

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 6b6362b83004..2dbe0165bd15 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -267,6 +267,10 @@  config GENERIC_CSUM
 config GENERIC_CALIBRATE_DELAY
 	def_bool y
 
+config ZONE_DMA
+	bool "Support DMA zone" if EXPERT
+	default y
+
 config ZONE_DMA32
 	bool "Support DMA32 zone" if EXPERT
 	default y
diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h
index d39ddb258a04..7b8c98830101 100644
--- a/arch/arm64/include/asm/page.h
+++ b/arch/arm64/include/asm/page.h
@@ -38,4 +38,6 @@  extern int pfn_valid(unsigned long);
 
 #include <asm-generic/getorder.h>
 
+#define ARCH_ZONE_DMA_BITS 30
+
 #endif
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 8e9bc64c5878..44f07fdf7a59 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -56,6 +56,13 @@  EXPORT_SYMBOL(physvirt_offset);
 struct page *vmemmap __ro_after_init;
 EXPORT_SYMBOL(vmemmap);
 
+/*
+ * We create both ZONE_DMA and ZONE_DMA32. ZONE_DMA covers the first 1G of
+ * memory as some devices, namely the Raspberry Pi 4, have peripherals with
+ * this limited view of the memory. ZONE_DMA32 will cover the rest of the 32
+ * bit addressable memory area.
+ */
+phys_addr_t arm64_dma_phys_limit __ro_after_init;
 phys_addr_t arm64_dma32_phys_limit __ro_after_init;
 
 #ifdef CONFIG_KEXEC_CORE
@@ -169,15 +176,16 @@  static void __init reserve_elfcorehdr(void)
 {
 }
 #endif /* CONFIG_CRASH_DUMP */
+
 /*
- * Return the maximum physical address for ZONE_DMA32 (DMA_BIT_MASK(32)). It
- * currently assumes that for memory starting above 4G, 32-bit devices will
- * use a DMA offset.
+ * Return the maximum physical address for a zone with a given address size
+ * limit. It currently assumes that for memory starting above 4G, 32-bit
+ * devices will use a DMA offset.
  */
-static phys_addr_t __init max_zone_dma32_phys(void)
+static phys_addr_t __init max_zone_phys(unsigned int zone_bits)
 {
-	phys_addr_t offset = memblock_start_of_DRAM() & GENMASK_ULL(63, 32);
-	return min(offset + (1ULL << 32), memblock_end_of_DRAM());
+	phys_addr_t offset = memblock_start_of_DRAM() & GENMASK_ULL(63, zone_bits);
+	return min(offset + (1ULL << zone_bits), memblock_end_of_DRAM());
 }
 
 #ifdef CONFIG_NUMA
@@ -186,6 +194,9 @@  static void __init zone_sizes_init(unsigned long min, unsigned long max)
 {
 	unsigned long max_zone_pfns[MAX_NR_ZONES]  = {0};
 
+#ifdef CONFIG_ZONE_DMA
+	max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit);
+#endif
 #ifdef CONFIG_ZONE_DMA32
 	max_zone_pfns[ZONE_DMA32] = PFN_DOWN(arm64_dma32_phys_limit);
 #endif
@@ -201,13 +212,18 @@  static void __init zone_sizes_init(unsigned long min, unsigned long max)
 	struct memblock_region *reg;
 	unsigned long zone_size[MAX_NR_ZONES], zhole_size[MAX_NR_ZONES];
 	unsigned long max_dma32 = min;
+	unsigned long max_dma = min;
 
 	memset(zone_size, 0, sizeof(zone_size));
 
-	/* 4GB maximum for 32-bit only capable devices */
+#ifdef CONFIG_ZONE_DMA
+	max_dma = PFN_DOWN(arm64_dma_phys_limit);
+	zone_size[ZONE_DMA] = max_dma - min;
+	max_dma32 = max_dma;
+#endif
 #ifdef CONFIG_ZONE_DMA32
 	max_dma32 = PFN_DOWN(arm64_dma32_phys_limit);
-	zone_size[ZONE_DMA32] = max_dma32 - min;
+	zone_size[ZONE_DMA32] = max_dma32 - max_dma;
 #endif
 	zone_size[ZONE_NORMAL] = max - max_dma32;
 
@@ -219,11 +235,17 @@  static void __init zone_sizes_init(unsigned long min, unsigned long max)
 
 		if (start >= max)
 			continue;
-
+#ifdef CONFIG_ZONE_DMA
+		if (start < max_dma) {
+			unsigned long dma_end = min_not_zero(end, max_dma);
+			zhole_size[ZONE_DMA] -= dma_end - start;
+		}
+#endif
 #ifdef CONFIG_ZONE_DMA32
 		if (start < max_dma32) {
-			unsigned long dma_end = min(end, max_dma32);
-			zhole_size[ZONE_DMA32] -= dma_end - start;
+			unsigned long dma32_end = min(end, max_dma32);
+			unsigned long dma32_start = max(start, max_dma);
+			zhole_size[ZONE_DMA32] -= dma32_end - dma32_start;
 		}
 #endif
 		if (end > max_dma32) {
@@ -418,9 +440,11 @@  void __init arm64_memblock_init(void)
 
 	early_init_fdt_scan_reserved_mem();
 
-	/* 4GB maximum for 32-bit only capable devices */
+	if (IS_ENABLED(CONFIG_ZONE_DMA))
+		arm64_dma_phys_limit = max_zone_phys(ARCH_ZONE_DMA_BITS);
+
 	if (IS_ENABLED(CONFIG_ZONE_DMA32))
-		arm64_dma32_phys_limit = max_zone_dma32_phys();
+		arm64_dma32_phys_limit = max_zone_phys(32);
 	else
 		arm64_dma32_phys_limit = PHYS_MASK + 1;
 
@@ -430,7 +454,7 @@  void __init arm64_memblock_init(void)
 
 	high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
 
-	dma_contiguous_reserve(arm64_dma32_phys_limit);
+	dma_contiguous_reserve(arm64_dma_phys_limit ? : arm64_dma32_phys_limit);
 }
 
 void __init bootmem_init(void)
@@ -534,7 +558,7 @@  static void __init free_unused_memmap(void)
 void __init mem_init(void)
 {
 	if (swiotlb_force == SWIOTLB_FORCE ||
-	    max_pfn > (arm64_dma32_phys_limit >> PAGE_SHIFT))
+	    max_pfn > PFN_DOWN(arm64_dma_phys_limit ? : arm64_dma32_phys_limit))
 		swiotlb_init(1);
 	else
 		swiotlb_force = SWIOTLB_NO_FORCE;