diff mbox series

[RFC] mm: Allow ZONE_DMA32 to be disabled via kernel command line

Message ID 20230126164352.17562-1-quic_c_gdjako@quicinc.com (mailing list archive)
State New, archived
Headers show
Series [RFC] mm: Allow ZONE_DMA32 to be disabled via kernel command line | expand

Commit Message

Georgi Djakov Jan. 26, 2023, 4:43 p.m. UTC
From: Chris Goldsworthy <quic_cgoldswo@quicinc.com>

It's useful to have an option to disable the ZONE_DMA32 during boot as
CONFIG_ZONE_DMA32 is by default enabled (on multiplatform kernels for
example). There are platforms that do not use this zone and in some high
memory pressure scenarios this would help on easing kswapd (to leave file
backed memory intact / unreclaimed). When the ZONE_DMA32 is enabled on
these platforms - kswapd is woken up more easily and drains the file cache
which leads to some performance issues.

Signed-off-by: Chris Goldsworthy <quic_cgoldswo@quicinc.com>
[georgi: updated commit text]
Signed-off-by: Georgi Djakov <quic_c_gdjako@quicinc.com>
---
The main question here is whether we can have a kernel command line
option to disable CONFIG_ZONE_DMA32 during boot (at least on arm64).
I can imagine this being useful also for Linux distros. 

 .../admin-guide/kernel-parameters.txt         |  5 +++++
 arch/arm64/mm/init.c                          | 20 ++++++++++++++++-
 arch/x86/mm/init.c                            | 20 ++++++++++++++++-
 include/linux/dma-direct.h                    | 22 +++++++++++++++++++
 kernel/dma/direct.c                           |  5 +++--
 kernel/dma/pool.c                             |  8 +++----
 6 files changed, 72 insertions(+), 8 deletions(-)

Comments

Dave Hansen Jan. 26, 2023, 6:51 p.m. UTC | #1
On 1/26/23 08:43, Georgi Djakov wrote:
> From: Chris Goldsworthy <quic_cgoldswo@quicinc.com>
> 
> It's useful to have an option to disable the ZONE_DMA32 during boot as
> CONFIG_ZONE_DMA32 is by default enabled (on multiplatform kernels for
> example). There are platforms that do not use this zone and in some high
> memory pressure scenarios this would help on easing kswapd (to leave file
> backed memory intact / unreclaimed). When the ZONE_DMA32 is enabled on
> these platforms - kswapd is woken up more easily and drains the file cache
> which leads to some performance issues.

Any chance you could expound on "some performance issues", maybe with
actual workloads and numbers?

Also, what are the practical implications here?  There are obviously an
ever decreasing number of 32-bit DMA devices out there.  Somebody that
has one and uses this option might be sad because now they're stuck
using ZONE_DMA which is quite tiny.

What other ZONE_DMA32 users are left?  Will anyone else care?  There is
some DMA32 slab and vmalloc() functionality remaining.  Is it impacted?

What should the default be?  If disable_dma32=0 by default, this seems
like code that will never get tested.  There are some rather subtle
relationships between ZONE_DMA32 and other zones that could get missed
without sufficient testing, like...

alloc_flags_nofragment() has some rather relevant comments that are left
unaddressed:

>         /*
>          * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and
>          * the pointer is within zone->zone_pgdat->node_zones[]. Also assume
>          * on UMA that if Normal is populated then so is DMA32.
>          */

This patch appears to break the "on UMA" assumption.


> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
> index 58a0bb2c17f1..1a56098c0e19 100644
> --- a/arch/arm64/mm/init.c
> +++ b/arch/arm64/mm/init.c
> @@ -118,6 +118,12 @@ static int __init reserve_crashkernel_low(unsigned long long low_size)
>  	return 0;
>  }
>  
> +/*
> + * Provide a run-time mean of disabling ZONE_DMA32 if it is enabled via
> + * CONFIG_ZONE_DMA32.
> + */
> +static bool disable_dma32 __ro_after_init;

Is this referenced in any hot paths?  It might deserve a static branch.

> diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
> index cb258f58fdc8..b8af7e2f21f5 100644
> --- a/arch/x86/mm/init.c
> +++ b/arch/x86/mm/init.c
> @@ -112,6 +112,12 @@ static unsigned long min_pfn_mapped;
>  
>  static bool __initdata can_use_brk_pgt = true;
>  
> +/*
> + * Provide a run-time mean of disabling ZONE_DMA32 if it is enabled via
> + * CONFIG_ZONE_DMA32.
> + */
> +static bool disable_dma32 __ro_after_init;
> +
>  /*
>   * Pages returned are already directly mapped.
>   *
> @@ -1032,7 +1038,7 @@ void __init zone_sizes_init(void)
>  	max_zone_pfns[ZONE_DMA]		= min(MAX_DMA_PFN, max_low_pfn);
>  #endif
>  #ifdef CONFIG_ZONE_DMA32
> -	max_zone_pfns[ZONE_DMA32]	= min(MAX_DMA32_PFN, max_low_pfn);
> +	max_zone_pfns[ZONE_DMA32]	= disable_dma32 ? 0 : min(MAX_DMA32_PFN, max_low_pfn);
>  #endif
>  	max_zone_pfns[ZONE_NORMAL]	= max_low_pfn;
>  #ifdef CONFIG_HIGHMEM
> @@ -1042,6 +1048,18 @@ void __init zone_sizes_init(void)
>  	free_area_init(max_zone_pfns);
>  }
>  
> +static int __init early_disable_dma32(char *buf)
> +{
> +	if (!buf)
> +		return -EINVAL;
> +
> +	if (!strcmp(buf, "on"))
> +		disable_dma32 = true;
> +
> +	return 0;
> +}
> +early_param("disable_dma32", early_disable_dma32);

Ick.  Is there no way to do this other than a cross-arch copy/paste?

> diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
> index 18aade195884..ed69618cf1fc 100644
> --- a/include/linux/dma-direct.h
> +++ b/include/linux/dma-direct.h
> @@ -24,6 +24,28 @@ struct bus_dma_region {
>  	u64		offset;
>  };
>  
> +static inline bool zone_dma32_is_empty(int node)
> +{
> +#ifdef CONFIG_ZONE_DMA32
> +	pg_data_t *pgdat = NODE_DATA(node);
> +
> +	return zone_is_empty(&pgdat->node_zones[ZONE_DMA32]);
> +#else
> +	return true;
> +#endif
> +}
> +
> +static inline bool zone_dma32_are_empty(void)
> +{
> +	int node;
> +
> +	for_each_node(node)
> +		if (!zone_dma32_is_empty(node))
> +			return false;
> +
> +	return true;
> +}

That for_each_node() loop can be 1024 nodes long.  I hope this isn't in
any hot paths.  It'll be fine on DMA32 systems because it'll fall out of
the loop on the first pass.  But, if someone uses your shiny new option,
they're in for a long loop.

Oh, and this is:

#define for_each_node(node)        for_each_node_state(node, N_POSSIBLE)

Did you really want all *possible* nodes?  Or will
for_each_online_node() do?

Also, I'd be rather shocked if there wasn't one-stop-shopping for this
somewhere.  Consulting your new 'disable_dma32' variable would make this
function rather cheap.  I suspect a totally empty set of DMA32 will also
leave its mark in arch_zone_*_possible_pfn[].

>  static inline dma_addr_t translate_phys_to_dma(struct device *dev,
>  		phys_addr_t paddr)
>  {
> diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
> index 63859a101ed8..754210c65658 100644
> --- a/kernel/dma/direct.c
> +++ b/kernel/dma/direct.c
> @@ -60,7 +60,7 @@ static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
>  	*phys_limit = dma_to_phys(dev, dma_limit);
>  	if (*phys_limit <= DMA_BIT_MASK(zone_dma_bits))
>  		return GFP_DMA;
> -	if (*phys_limit <= DMA_BIT_MASK(32))
> +	if (*phys_limit <= DMA_BIT_MASK(32) && !zone_dma32_is_empty(dev_to_node(dev)))
>  		return GFP_DMA32;
>  	return 0;
>  }
> @@ -145,7 +145,8 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
>  
>  		if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
>  		    phys_limit < DMA_BIT_MASK(64) &&
> -		    !(gfp & (GFP_DMA32 | GFP_DMA))) {
> +		    !(gfp & (GFP_DMA32 | GFP_DMA)) &&
> +		    !zone_dma32_is_empty(node)) {
>  			gfp |= GFP_DMA32;
>  			goto again;
>  		}

The zone_dma32_is_empty() value is known at compile-time if
!CONFIG_ZONE_DMA32.  That would make it redundant to check both.

> diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
> index 4d40dcce7604..8e79903fbda8 100644
> --- a/kernel/dma/pool.c
> +++ b/kernel/dma/pool.c
> @@ -71,7 +71,7 @@ static bool cma_in_zone(gfp_t gfp)
>  	end = cma_get_base(cma) + size - 1;
>  	if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA))
>  		return end <= DMA_BIT_MASK(zone_dma_bits);
> -	if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
> +	if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32) && !zone_dma32_are_empty())
>  		return end <= DMA_BIT_MASK(32);
>  	return true;
>  }
> @@ -153,7 +153,7 @@ static void atomic_pool_work_fn(struct work_struct *work)
>  	if (IS_ENABLED(CONFIG_ZONE_DMA))
>  		atomic_pool_resize(atomic_pool_dma,
>  				   GFP_KERNEL | GFP_DMA);
> -	if (IS_ENABLED(CONFIG_ZONE_DMA32))
> +	if (IS_ENABLED(CONFIG_ZONE_DMA32) && !zone_dma32_are_empty())
>  		atomic_pool_resize(atomic_pool_dma32,
>  				   GFP_KERNEL | GFP_DMA32);
>  	atomic_pool_resize(atomic_pool_kernel, GFP_KERNEL);
> @@ -209,7 +209,7 @@ static int __init dma_atomic_pool_init(void)
>  		if (!atomic_pool_dma)
>  			ret = -ENOMEM;
>  	}
> -	if (IS_ENABLED(CONFIG_ZONE_DMA32)) {
> +	if (IS_ENABLED(CONFIG_ZONE_DMA32) && !zone_dma32_are_empty()) {
>  		atomic_pool_dma32 = __dma_atomic_pool_init(atomic_pool_size,
>  						GFP_KERNEL | GFP_DMA32);
>  		if (!atomic_pool_dma32)
> @@ -224,7 +224,7 @@ postcore_initcall(dma_atomic_pool_init);
>  static inline struct gen_pool *dma_guess_pool(struct gen_pool *prev, gfp_t gfp)
>  {
>  	if (prev == NULL) {
> -		if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
> +		if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32) && !zone_dma32_are_empty())
>  			return atomic_pool_dma32;
>  		if (atomic_pool_dma && (gfp & GFP_DMA))
>  			return atomic_pool_dma;

I think every single call to zone_dma32_are_empty() is !'d.  It seems
like it chose the wrong polarity.
Robin Murphy Jan. 26, 2023, 7:15 p.m. UTC | #2
On 2023-01-26 16:43, Georgi Djakov wrote:
> From: Chris Goldsworthy <quic_cgoldswo@quicinc.com>
> 
> It's useful to have an option to disable the ZONE_DMA32 during boot as
> CONFIG_ZONE_DMA32 is by default enabled (on multiplatform kernels for
> example). There are platforms that do not use this zone and in some high
> memory pressure scenarios this would help on easing kswapd (to leave file
> backed memory intact / unreclaimed). When the ZONE_DMA32 is enabled on
> these platforms - kswapd is woken up more easily and drains the file cache
> which leads to some performance issues.
> 
> Signed-off-by: Chris Goldsworthy <quic_cgoldswo@quicinc.com>
> [georgi: updated commit text]
> Signed-off-by: Georgi Djakov <quic_c_gdjako@quicinc.com>
> ---
> The main question here is whether we can have a kernel command line
> option to disable CONFIG_ZONE_DMA32 during boot (at least on arm64).
> I can imagine this being useful also for Linux distros.

FWIW I'd say that "disabled" and "left empty then awkwardly tiptoed 
around in a few places" are very different notions...

However, I'm just going to take a step back and read the commit message 
a few more times... Given what it claims, I can't help but ask why 
wouldn't we want a parameter to control kswapd's behaviour and address 
that issue directly, rather than a massive hammer that breaks everyone 
allocating explicitly or implicitly with __GFP_DMA32 (especially on 
systems where it doesn't normally matter because all memory is below 4GB 
anyway), just to achieve one rather niche side-effect?

Thanks,
Robin.

>   .../admin-guide/kernel-parameters.txt         |  5 +++++
>   arch/arm64/mm/init.c                          | 20 ++++++++++++++++-
>   arch/x86/mm/init.c                            | 20 ++++++++++++++++-
>   include/linux/dma-direct.h                    | 22 +++++++++++++++++++
>   kernel/dma/direct.c                           |  5 +++--
>   kernel/dma/pool.c                             |  8 +++----
>   6 files changed, 72 insertions(+), 8 deletions(-)
> 
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index cb12df402161..854ff65ac6b0 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -1070,6 +1070,11 @@
>   			Disable Dynamic DMA Window support. Use this
>   			to workaround buggy firmware.
>   
> +	disable_dma32=	[KNL]
> +			Dynamically disable ZONE_DMA32 on kernels compiled with
> +			CONFIG_ZONE_DMA32=y.
> +
> +
>   	disable_ipv6=	[IPV6]
>   			See Documentation/networking/ipv6.rst.
>   
> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
> index 58a0bb2c17f1..1a56098c0e19 100644
> --- a/arch/arm64/mm/init.c
> +++ b/arch/arm64/mm/init.c
> @@ -118,6 +118,12 @@ static int __init reserve_crashkernel_low(unsigned long long low_size)
>   	return 0;
>   }
>   
> +/*
> + * Provide a run-time mean of disabling ZONE_DMA32 if it is enabled via
> + * CONFIG_ZONE_DMA32.
> + */
> +static bool disable_dma32 __ro_after_init;
> +
>   /*
>    * reserve_crashkernel() - reserves memory for crash kernel
>    *
> @@ -244,7 +250,7 @@ static void __init zone_sizes_init(void)
>   	max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit);
>   #endif
>   #ifdef CONFIG_ZONE_DMA32
> -	max_zone_pfns[ZONE_DMA32] = PFN_DOWN(dma32_phys_limit);
> +	max_zone_pfns[ZONE_DMA32] = disable_dma32 ? 0 : PFN_DOWN(dma32_phys_limit);
>   	if (!arm64_dma_phys_limit)
>   		arm64_dma_phys_limit = dma32_phys_limit;
>   #endif
> @@ -253,6 +259,18 @@ static void __init zone_sizes_init(void)
>   	free_area_init(max_zone_pfns);
>   }
>   
> +static int __init early_disable_dma32(char *buf)
> +{
> +	if (!buf)
> +		return -EINVAL;
> +
> +	if (!strcmp(buf, "on"))
> +		disable_dma32 = true;
> +
> +	return 0;
> +}
> +early_param("disable_dma32", early_disable_dma32);
> +
>   int pfn_is_map_memory(unsigned long pfn)
>   {
>   	phys_addr_t addr = PFN_PHYS(pfn);
> diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
> index cb258f58fdc8..b8af7e2f21f5 100644
> --- a/arch/x86/mm/init.c
> +++ b/arch/x86/mm/init.c
> @@ -112,6 +112,12 @@ static unsigned long min_pfn_mapped;
>   
>   static bool __initdata can_use_brk_pgt = true;
>   
> +/*
> + * Provide a run-time mean of disabling ZONE_DMA32 if it is enabled via
> + * CONFIG_ZONE_DMA32.
> + */
> +static bool disable_dma32 __ro_after_init;
> +
>   /*
>    * Pages returned are already directly mapped.
>    *
> @@ -1032,7 +1038,7 @@ void __init zone_sizes_init(void)
>   	max_zone_pfns[ZONE_DMA]		= min(MAX_DMA_PFN, max_low_pfn);
>   #endif
>   #ifdef CONFIG_ZONE_DMA32
> -	max_zone_pfns[ZONE_DMA32]	= min(MAX_DMA32_PFN, max_low_pfn);
> +	max_zone_pfns[ZONE_DMA32]	= disable_dma32 ? 0 : min(MAX_DMA32_PFN, max_low_pfn);
>   #endif
>   	max_zone_pfns[ZONE_NORMAL]	= max_low_pfn;
>   #ifdef CONFIG_HIGHMEM
> @@ -1042,6 +1048,18 @@ void __init zone_sizes_init(void)
>   	free_area_init(max_zone_pfns);
>   }
>   
> +static int __init early_disable_dma32(char *buf)
> +{
> +	if (!buf)
> +		return -EINVAL;
> +
> +	if (!strcmp(buf, "on"))
> +		disable_dma32 = true;
> +
> +	return 0;
> +}
> +early_param("disable_dma32", early_disable_dma32);
> +
>   __visible DEFINE_PER_CPU_ALIGNED(struct tlb_state, cpu_tlbstate) = {
>   	.loaded_mm = &init_mm,
>   	.next_asid = 1,
> diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
> index 18aade195884..ed69618cf1fc 100644
> --- a/include/linux/dma-direct.h
> +++ b/include/linux/dma-direct.h
> @@ -24,6 +24,28 @@ struct bus_dma_region {
>   	u64		offset;
>   };
>   
> +static inline bool zone_dma32_is_empty(int node)
> +{
> +#ifdef CONFIG_ZONE_DMA32
> +	pg_data_t *pgdat = NODE_DATA(node);
> +
> +	return zone_is_empty(&pgdat->node_zones[ZONE_DMA32]);
> +#else
> +	return true;
> +#endif
> +}
> +
> +static inline bool zone_dma32_are_empty(void)
> +{
> +	int node;
> +
> +	for_each_node(node)
> +		if (!zone_dma32_is_empty(node))
> +			return false;
> +
> +	return true;
> +}
> +
>   static inline dma_addr_t translate_phys_to_dma(struct device *dev,
>   		phys_addr_t paddr)
>   {
> diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
> index 63859a101ed8..754210c65658 100644
> --- a/kernel/dma/direct.c
> +++ b/kernel/dma/direct.c
> @@ -60,7 +60,7 @@ static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
>   	*phys_limit = dma_to_phys(dev, dma_limit);
>   	if (*phys_limit <= DMA_BIT_MASK(zone_dma_bits))
>   		return GFP_DMA;
> -	if (*phys_limit <= DMA_BIT_MASK(32))
> +	if (*phys_limit <= DMA_BIT_MASK(32) && !zone_dma32_is_empty(dev_to_node(dev)))
>   		return GFP_DMA32;
>   	return 0;
>   }
> @@ -145,7 +145,8 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
>   
>   		if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
>   		    phys_limit < DMA_BIT_MASK(64) &&
> -		    !(gfp & (GFP_DMA32 | GFP_DMA))) {
> +		    !(gfp & (GFP_DMA32 | GFP_DMA)) &&
> +		    !zone_dma32_is_empty(node)) {
>   			gfp |= GFP_DMA32;
>   			goto again;
>   		}
> diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
> index 4d40dcce7604..8e79903fbda8 100644
> --- a/kernel/dma/pool.c
> +++ b/kernel/dma/pool.c
> @@ -71,7 +71,7 @@ static bool cma_in_zone(gfp_t gfp)
>   	end = cma_get_base(cma) + size - 1;
>   	if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA))
>   		return end <= DMA_BIT_MASK(zone_dma_bits);
> -	if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
> +	if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32) && !zone_dma32_are_empty())
>   		return end <= DMA_BIT_MASK(32);
>   	return true;
>   }
> @@ -153,7 +153,7 @@ static void atomic_pool_work_fn(struct work_struct *work)
>   	if (IS_ENABLED(CONFIG_ZONE_DMA))
>   		atomic_pool_resize(atomic_pool_dma,
>   				   GFP_KERNEL | GFP_DMA);
> -	if (IS_ENABLED(CONFIG_ZONE_DMA32))
> +	if (IS_ENABLED(CONFIG_ZONE_DMA32) && !zone_dma32_are_empty())
>   		atomic_pool_resize(atomic_pool_dma32,
>   				   GFP_KERNEL | GFP_DMA32);
>   	atomic_pool_resize(atomic_pool_kernel, GFP_KERNEL);
> @@ -209,7 +209,7 @@ static int __init dma_atomic_pool_init(void)
>   		if (!atomic_pool_dma)
>   			ret = -ENOMEM;
>   	}
> -	if (IS_ENABLED(CONFIG_ZONE_DMA32)) {
> +	if (IS_ENABLED(CONFIG_ZONE_DMA32) && !zone_dma32_are_empty()) {
>   		atomic_pool_dma32 = __dma_atomic_pool_init(atomic_pool_size,
>   						GFP_KERNEL | GFP_DMA32);
>   		if (!atomic_pool_dma32)
> @@ -224,7 +224,7 @@ postcore_initcall(dma_atomic_pool_init);
>   static inline struct gen_pool *dma_guess_pool(struct gen_pool *prev, gfp_t gfp)
>   {
>   	if (prev == NULL) {
> -		if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
> +		if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32) && !zone_dma32_are_empty())
>   			return atomic_pool_dma32;
>   		if (atomic_pool_dma && (gfp & GFP_DMA))
>   			return atomic_pool_dma;
Georgi Djakov Jan. 26, 2023, 10:56 p.m. UTC | #3
Hi Dave,

Thanks for the detailed review and comments!

On 26.01.23 20:51, Dave Hansen wrote:
> On 1/26/23 08:43, Georgi Djakov wrote:
>> From: Chris Goldsworthy <quic_cgoldswo@quicinc.com>
>>
>> It's useful to have an option to disable the ZONE_DMA32 during boot as
>> CONFIG_ZONE_DMA32 is by default enabled (on multiplatform kernels for
>> example). There are platforms that do not use this zone and in some high
>> memory pressure scenarios this would help on easing kswapd (to leave file
>> backed memory intact / unreclaimed). When the ZONE_DMA32 is enabled on
>> these platforms - kswapd is woken up more easily and drains the file cache
>> which leads to some performance issues.
> 
> Any chance you could expound on "some performance issues", maybe with
> actual workloads and numbers?

The goal is to prevent any reclaim at all or defer it as much as possible.
I can gather some numbers, but let me explain the scenario first. Let's say
you have 100 processes and you put some to sleep. After some time kswapd
reclaims file cached memory. You wake one of these processes back and some
lagging is observed because files have to be re-opened, as they were reclaimed
from the file cache.

> 
> Also, what are the practical implications here?  There are obviously an
> ever decreasing number of 32-bit DMA devices out there.  Somebody that
> has one and uses this option might be sad because now they're stuck
> using ZONE_DMA which is quite tiny.

True. There are side effects indeed.

I think that Robin's suggestion to try to control kswapd's behavior is
really something worth exploring!

Thanks,
Georgi

> What other ZONE_DMA32 users are left?  Will anyone else care?  There is
> some DMA32 slab and vmalloc() functionality remaining.  Is it impacted?
> 
> What should the default be?  If disable_dma32=0 by default, this seems
> like code that will never get tested.  There are some rather subtle
> relationships between ZONE_DMA32 and other zones that could get missed
> without sufficient testing, like...
> 
> alloc_flags_nofragment() has some rather relevant comments that are left
> unaddressed:
> 
>>          /*
>>           * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and
>>           * the pointer is within zone->zone_pgdat->node_zones[]. Also assume
>>           * on UMA that if Normal is populated then so is DMA32.
>>           */
> 
> This patch appears to break the "on UMA" assumption.
> 
> 
>> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
>> index 58a0bb2c17f1..1a56098c0e19 100644
>> --- a/arch/arm64/mm/init.c
>> +++ b/arch/arm64/mm/init.c
>> @@ -118,6 +118,12 @@ static int __init reserve_crashkernel_low(unsigned long long low_size)
>>   	return 0;
>>   }
>>   
>> +/*
>> + * Provide a run-time mean of disabling ZONE_DMA32 if it is enabled via
>> + * CONFIG_ZONE_DMA32.
>> + */
>> +static bool disable_dma32 __ro_after_init;
> 
> Is this referenced in any hot paths?  It might deserve a static branch.
> 
>> diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
>> index cb258f58fdc8..b8af7e2f21f5 100644
>> --- a/arch/x86/mm/init.c
>> +++ b/arch/x86/mm/init.c
>> @@ -112,6 +112,12 @@ static unsigned long min_pfn_mapped;
>>   
>>   static bool __initdata can_use_brk_pgt = true;
>>   
>> +/*
>> + * Provide a run-time mean of disabling ZONE_DMA32 if it is enabled via
>> + * CONFIG_ZONE_DMA32.
>> + */
>> +static bool disable_dma32 __ro_after_init;
>> +
>>   /*
>>    * Pages returned are already directly mapped.
>>    *
>> @@ -1032,7 +1038,7 @@ void __init zone_sizes_init(void)
>>   	max_zone_pfns[ZONE_DMA]		= min(MAX_DMA_PFN, max_low_pfn);
>>   #endif
>>   #ifdef CONFIG_ZONE_DMA32
>> -	max_zone_pfns[ZONE_DMA32]	= min(MAX_DMA32_PFN, max_low_pfn);
>> +	max_zone_pfns[ZONE_DMA32]	= disable_dma32 ? 0 : min(MAX_DMA32_PFN, max_low_pfn);
>>   #endif
>>   	max_zone_pfns[ZONE_NORMAL]	= max_low_pfn;
>>   #ifdef CONFIG_HIGHMEM
>> @@ -1042,6 +1048,18 @@ void __init zone_sizes_init(void)
>>   	free_area_init(max_zone_pfns);
>>   }
>>   
>> +static int __init early_disable_dma32(char *buf)
>> +{
>> +	if (!buf)
>> +		return -EINVAL;
>> +
>> +	if (!strcmp(buf, "on"))
>> +		disable_dma32 = true;
>> +
>> +	return 0;
>> +}
>> +early_param("disable_dma32", early_disable_dma32);
> 
> Ick.  Is there no way to do this other than a cross-arch copy/paste?
> 
>> diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
>> index 18aade195884..ed69618cf1fc 100644
>> --- a/include/linux/dma-direct.h
>> +++ b/include/linux/dma-direct.h
>> @@ -24,6 +24,28 @@ struct bus_dma_region {
>>   	u64		offset;
>>   };
>>   
>> +static inline bool zone_dma32_is_empty(int node)
>> +{
>> +#ifdef CONFIG_ZONE_DMA32
>> +	pg_data_t *pgdat = NODE_DATA(node);
>> +
>> +	return zone_is_empty(&pgdat->node_zones[ZONE_DMA32]);
>> +#else
>> +	return true;
>> +#endif
>> +}
>> +
>> +static inline bool zone_dma32_are_empty(void)
>> +{
>> +	int node;
>> +
>> +	for_each_node(node)
>> +		if (!zone_dma32_is_empty(node))
>> +			return false;
>> +
>> +	return true;
>> +}
> 
> That for_each_node() loop can be 1024 nodes long.  I hope this isn't in
> any hot paths.  It'll be fine on DMA32 systems because it'll fall out of
> the loop on the first pass.  But, if someone uses your shiny new option,
> they're in for a long loop.
> 
> Oh, and this is:
> 
> #define for_each_node(node)        for_each_node_state(node, N_POSSIBLE)
> 
> Did you really want all *possible* nodes?  Or will
> for_each_online_node() do?
> 
> Also, I'd be rather shocked if there wasn't one-stop-shopping for this
> somewhere.  Consulting your new 'disable_dma32' variable would make this
> function rather cheap.  I suspect a totally empty set of DMA32 will also
> leave its mark in arch_zone_*_possible_pfn[].
> 
>>   static inline dma_addr_t translate_phys_to_dma(struct device *dev,
>>   		phys_addr_t paddr)
>>   {
>> diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
>> index 63859a101ed8..754210c65658 100644
>> --- a/kernel/dma/direct.c
>> +++ b/kernel/dma/direct.c
>> @@ -60,7 +60,7 @@ static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
>>   	*phys_limit = dma_to_phys(dev, dma_limit);
>>   	if (*phys_limit <= DMA_BIT_MASK(zone_dma_bits))
>>   		return GFP_DMA;
>> -	if (*phys_limit <= DMA_BIT_MASK(32))
>> +	if (*phys_limit <= DMA_BIT_MASK(32) && !zone_dma32_is_empty(dev_to_node(dev)))
>>   		return GFP_DMA32;
>>   	return 0;
>>   }
>> @@ -145,7 +145,8 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
>>   
>>   		if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
>>   		    phys_limit < DMA_BIT_MASK(64) &&
>> -		    !(gfp & (GFP_DMA32 | GFP_DMA))) {
>> +		    !(gfp & (GFP_DMA32 | GFP_DMA)) &&
>> +		    !zone_dma32_is_empty(node)) {
>>   			gfp |= GFP_DMA32;
>>   			goto again;
>>   		}
> 
> The zone_dma32_is_empty() value is known at compile-time if
> !CONFIG_ZONE_DMA32.  That would make it redundant to check both.
> 
>> diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
>> index 4d40dcce7604..8e79903fbda8 100644
>> --- a/kernel/dma/pool.c
>> +++ b/kernel/dma/pool.c
>> @@ -71,7 +71,7 @@ static bool cma_in_zone(gfp_t gfp)
>>   	end = cma_get_base(cma) + size - 1;
>>   	if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA))
>>   		return end <= DMA_BIT_MASK(zone_dma_bits);
>> -	if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
>> +	if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32) && !zone_dma32_are_empty())
>>   		return end <= DMA_BIT_MASK(32);
>>   	return true;
>>   }
>> @@ -153,7 +153,7 @@ static void atomic_pool_work_fn(struct work_struct *work)
>>   	if (IS_ENABLED(CONFIG_ZONE_DMA))
>>   		atomic_pool_resize(atomic_pool_dma,
>>   				   GFP_KERNEL | GFP_DMA);
>> -	if (IS_ENABLED(CONFIG_ZONE_DMA32))
>> +	if (IS_ENABLED(CONFIG_ZONE_DMA32) && !zone_dma32_are_empty())
>>   		atomic_pool_resize(atomic_pool_dma32,
>>   				   GFP_KERNEL | GFP_DMA32);
>>   	atomic_pool_resize(atomic_pool_kernel, GFP_KERNEL);
>> @@ -209,7 +209,7 @@ static int __init dma_atomic_pool_init(void)
>>   		if (!atomic_pool_dma)
>>   			ret = -ENOMEM;
>>   	}
>> -	if (IS_ENABLED(CONFIG_ZONE_DMA32)) {
>> +	if (IS_ENABLED(CONFIG_ZONE_DMA32) && !zone_dma32_are_empty()) {
>>   		atomic_pool_dma32 = __dma_atomic_pool_init(atomic_pool_size,
>>   						GFP_KERNEL | GFP_DMA32);
>>   		if (!atomic_pool_dma32)
>> @@ -224,7 +224,7 @@ postcore_initcall(dma_atomic_pool_init);
>>   static inline struct gen_pool *dma_guess_pool(struct gen_pool *prev, gfp_t gfp)
>>   {
>>   	if (prev == NULL) {
>> -		if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
>> +		if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32) && !zone_dma32_are_empty())
>>   			return atomic_pool_dma32;
>>   		if (atomic_pool_dma && (gfp & GFP_DMA))
>>   			return atomic_pool_dma;
> 
> I think every single call to zone_dma32_are_empty() is !'d.  It seems
> like it chose the wrong polarity.
Randy Dunlap Jan. 27, 2023, 12:57 a.m. UTC | #4
On 1/26/23 10:51, Dave Hansen wrote:
> On 1/26/23 08:43, Georgi Djakov wrote:
>> From: Chris Goldsworthy <quic_cgoldswo@quicinc.com>

  
>> +static int __init early_disable_dma32(char *buf)
>> +{
>> +	if (!buf)
>> +		return -EINVAL;
>> +
>> +	if (!strcmp(buf, "on"))
>> +		disable_dma32 = true;
>> +
>> +	return 0;
>> +}
>> +early_param("disable_dma32", early_disable_dma32);
> 
> Ick.  Is there no way to do this other than a cross-arch copy/paste?

I think that using __setup() instead of early_param() would allow that.
Chris Goldsworthy Jan. 27, 2023, 2:20 a.m. UTC | #5
On Thu, Jan 26, 2023 at 07:15:26PM +0000, Robin Murphy wrote:
> On 2023-01-26 16:43, Georgi Djakov wrote:
> >From: Chris Goldsworthy <quic_cgoldswo@quicinc.com>
> >
> >It's useful to have an option to disable the ZONE_DMA32 during boot as
> >CONFIG_ZONE_DMA32 is by default enabled (on multiplatform kernels for
> >example). There are platforms that do not use this zone and in some high
> >memory pressure scenarios this would help on easing kswapd (to leave file
> >backed memory intact / unreclaimed). When the ZONE_DMA32 is enabled on
> >these platforms - kswapd is woken up more easily and drains the file cache
> >which leads to some performance issues.
> >
> >Signed-off-by: Chris Goldsworthy <quic_cgoldswo@quicinc.com>
> >[georgi: updated commit text]
> >Signed-off-by: Georgi Djakov <quic_c_gdjako@quicinc.com>
> >---
> >The main question here is whether we can have a kernel command line
> >option to disable CONFIG_ZONE_DMA32 during boot (at least on arm64).
> >I can imagine this being useful also for Linux distros.
> 
> FWIW I'd say that "disabled" and "left empty then awkwardly tiptoed around
> in a few places" are very different notions...
> 
> However, I'm just going to take a step back and read the commit message a
> few more times... Given what it claims, I can't help but ask why wouldn't we
> want a parameter to control kswapd's behaviour and address that issue
> directly, rather than a massive hammer that breaks everyone allocating
> explicitly or implicitly with __GFP_DMA32 (especially on systems where it
> doesn't normally matter because all memory is below 4GB anyway), just to
> achieve one rather niche side-effect?
> 
> Thanks,
> Robin.

Hi Robin,

The commit text doesn't spell out the scenario we want to avoid, so I
will do that for clarity. We use a kernel binary compiled for us, and
by default has CONFIG_ZONE_DMA32 (and it can't be disabled for now as
another party needs it). Our higher-end SoCs are usually used with
8-12 GB of DDR, so using a 12 GB device as an example, we would have 8
GB of ZONE_NORMAL memory and 4 GB of ZONE_MOVABLE memory with the
feature, and 4 GB of ZONE_DMA32, 4 GB of ZONE_NORMAL and 4 GB of
ZONE_MOVABLE otherwise.

Without the feature enabled, consider a GFP_KERNEL allocation that
causes a low watermark breach in ZONE_NORMAL, such that
ZONE_DMA32 is almost full. This will cause kswapd to start reclaiming
memory, despite the fact that we might have gigabytes of free
memory in ZONE_DMA32 that can be used by anyone (since GFP_MOVABLE and
GFP_NORMAL can fall back to using ZONE_DMA32).

So, fleshing out your suggestion to make it concrete for our case, we
would want to stop kswapd from doing reclaim on ZONE_NORMAL watermark
breaches when ZONE_DMA32 is present (since anything targeting
ZONE_NORMAL can fall back to ZONE_DMA32).

Thanks,

Chris.
Christoph Hellwig Jan. 27, 2023, 6:35 a.m. UTC | #6
On Thu, Jan 26, 2023 at 10:51:17AM -0800, Dave Hansen wrote:
> 
> Also, what are the practical implications here?  There are obviously an
> ever decreasing number of 32-bit DMA devices out there.  Somebody that
> has one and uses this option might be sad because now they're stuck
> using ZONE_DMA which is quite tiny.
> 
> What other ZONE_DMA32 users are left?  Will anyone else care?  There is
> some DMA32 slab and vmalloc() functionality remaining.  Is it impacted?

DMA32 never supported slab.  But < 64-bit DMA devices are unfortunately
still not uncommon, and configuring out ZONE_DMA32 breaks them pretty
badly as we guarantee that a DMA mask of 32-bit always works.

So I'm not only very much against this patch, but also the currently
existing way to configure out ZONE_DMA32 on arm64, which needs to
go away.

If people want ZONE_DMA32 to go away we need something to replace
it first, like a large enough CMA region in the 32-bit addressable
range.
H. Peter Anvin Jan. 27, 2023, 6:52 a.m. UTC | #7
On January 26, 2023 10:35:55 PM PST, Christoph Hellwig <hch@lst.de> wrote:
>On Thu, Jan 26, 2023 at 10:51:17AM -0800, Dave Hansen wrote:
>> 
>> Also, what are the practical implications here?  There are obviously an
>> ever decreasing number of 32-bit DMA devices out there.  Somebody that
>> has one and uses this option might be sad because now they're stuck
>> using ZONE_DMA which is quite tiny.
>> 
>> What other ZONE_DMA32 users are left?  Will anyone else care?  There is
>> some DMA32 slab and vmalloc() functionality remaining.  Is it impacted?
>
>DMA32 never supported slab.  But < 64-bit DMA devices are unfortunately
>still not uncommon, and configuring out ZONE_DMA32 breaks them pretty
>badly as we guarantee that a DMA mask of 32-bit always works.
>
>So I'm not only very much against this patch, but also the currently
>existing way to configure out ZONE_DMA32 on arm64, which needs to
>go away.
>
>If people want ZONE_DMA32 to go away we need something to replace
>it first, like a large enough CMA region in the 32-bit addressable
>range.

Not to mention all kinds of odd masks like 30, 31, 39, 40, 46, ... bits.
Christoph Hellwig Jan. 27, 2023, 7:07 a.m. UTC | #8
On Thu, Jan 26, 2023 at 10:52:43PM -0800, H. Peter Anvin wrote:
> >If people want ZONE_DMA32 to go away we need something to replace
> >it first, like a large enough CMA region in the 32-bit addressable
> >range.
> 
> Not to mention all kinds of odd masks like 30, 31, 39, 40, 46, ... bits.

Yes.  Out of those all >= 32 are falling straight into ZONE_DMA32,
the lower ones we do a first try in ZONE_DMA32 and then fall back to
ZONE_DMA.  <= 29 masks OTOH are really rare in modern systems for
actual devices.  So with a CMA region for what is currently ZONE_DMA
and one for the first 1G we'd probably cover most of what's actually
needed for x86_64.  Of course on 32-bit architectures things become
a lot more complicated due to highmem.
Hillf Danton Jan. 27, 2023, 8:55 a.m. UTC | #9
On Thu, 26 Jan 2023 18:20:26 -0800 Chris Goldsworthy <quic_cgoldswo@quicinc.com>
> On Thu, Jan 26, 2023 at 07:15:26PM +0000, Robin Murphy wrote:
> > On 2023-01-26 16:43, Georgi Djakov wrote:
> > >From: Chris Goldsworthy <quic_cgoldswo@quicinc.com>
> > >
> > >It's useful to have an option to disable the ZONE_DMA32 during boot as
> > >CONFIG_ZONE_DMA32 is by default enabled (on multiplatform kernels for
> > >example). There are platforms that do not use this zone and in some high
> > >memory pressure scenarios this would help on easing kswapd (to leave file
> > >backed memory intact / unreclaimed). When the ZONE_DMA32 is enabled on
> > >these platforms - kswapd is woken up more easily and drains the file cache
> > >which leads to some performance issues.
> > >
> > >Signed-off-by: Chris Goldsworthy <quic_cgoldswo@quicinc.com>
> > >[georgi: updated commit text]
> > >Signed-off-by: Georgi Djakov <quic_c_gdjako@quicinc.com>
> > >---
> > >The main question here is whether we can have a kernel command line
> > >option to disable CONFIG_ZONE_DMA32 during boot (at least on arm64).
> > >I can imagine this being useful also for Linux distros.
> > 
> > FWIW I'd say that "disabled" and "left empty then awkwardly tiptoed around
> > in a few places" are very different notions...
> > 
> > However, I'm just going to take a step back and read the commit message a
> > few more times... Given what it claims, I can't help but ask why wouldn't we
> > want a parameter to control kswapd's behaviour and address that issue
> > directly, rather than a massive hammer that breaks everyone allocating
> > explicitly or implicitly with __GFP_DMA32 (especially on systems where it
> > doesn't normally matter because all memory is below 4GB anyway), just to
> > achieve one rather niche side-effect?
> > 
> > Thanks,
> > Robin.
> 
> Hi Robin,
> 
> The commit text doesn't spell out the scenario we want to avoid, so I
> will do that for clarity. We use a kernel binary compiled for us, and
> by default has CONFIG_ZONE_DMA32 (and it can't be disabled for now as
> another party needs it). Our higher-end SoCs are usually used with
> 8-12 GB of DDR, so using a 12 GB device as an example, we would have 8
> GB of ZONE_NORMAL memory and 4 GB of ZONE_MOVABLE memory with the
> feature, and 4 GB of ZONE_DMA32, 4 GB of ZONE_NORMAL and 4 GB of
> ZONE_MOVABLE otherwise.
> 
> Without the feature enabled, consider a GFP_KERNEL allocation that
> causes a low watermark breach in ZONE_NORMAL, such that
> ZONE_DMA32 is almost full. This will cause kswapd to start reclaiming
> memory, despite the fact that we might have gigabytes of free
> memory in ZONE_DMA32 that can be used by anyone (since GFP_MOVABLE and
> GFP_NORMAL can fall back to using ZONE_DMA32).

If kswapd is busy reclaiming pages even given gigabytes of free memory
in the DMA32 zone then it is a CPU hog.

Feel free to check pgdat_balanced() and prepare_kswapd_sleep().
> 
> So, fleshing out your suggestion to make it concrete for our case, we
> would want to stop kswapd from doing reclaim on ZONE_NORMAL watermark
> breaches when ZONE_DMA32 is present (since anything targeting
> ZONE_NORMAL can fall back to ZONE_DMA32).
> 
> Thanks,
> 
> Chris.
>
Chris Goldsworthy Feb. 1, 2023, 4:09 a.m. UTC | #10
On Fri, Jan 27, 2023 at 04:55:53PM +0800, Hillf Danton wrote:
> On Thu, 26 Jan 2023 18:20:26 -0800 Chris Goldsworthy <quic_cgoldswo@quicinc.com>
> > On Thu, Jan 26, 2023 at 07:15:26PM +0000, Robin Murphy wrote:
> > > However, I'm just going to take a step back and read the commit message a
> > > few more times... Given what it claims, I can't help but ask why wouldn't we
> > > want a parameter to control kswapd's behaviour and address that issue
> > > directly, rather than a massive hammer that breaks everyone allocating
> > > explicitly or implicitly with __GFP_DMA32 (especially on systems where it
> > > doesn't normally matter because all memory is below 4GB anyway), just to
> > > achieve one rather niche side-effect?
> > > 
> > > Thanks,
> > > Robin.
> > 
> > Hi Robin,
> > 
> > The commit text doesn't spell out the scenario we want to avoid, so I
> > will do that for clarity. We use a kernel binary compiled for us, and
> > by default has CONFIG_ZONE_DMA32 (and it can't be disabled for now as
> > another party needs it). Our higher-end SoCs are usually used with
> > 8-12 GB of DDR, so using a 12 GB device as an example, we would have 8
> > GB of ZONE_NORMAL memory and 4 GB of ZONE_MOVABLE memory with the
> > feature, and 4 GB of ZONE_DMA32, 4 GB of ZONE_NORMAL and 4 GB of
> > ZONE_MOVABLE otherwise.
> > 
> > Without the feature enabled, consider a GFP_KERNEL allocation that
> > causes a low watermark breach in ZONE_NORMAL, such that
> > ZONE_DMA32 is almost full. This will cause kswapd to start reclaiming
> > memory, despite the fact that we might have gigabytes of free
> > memory in ZONE_DMA32 that can be used by anyone (since GFP_MOVABLE and
> > GFP_NORMAL can fall back to using ZONE_DMA32).
> 
> If kswapd is busy reclaiming pages even given gigabytes of free memory
> in the DMA32 zone then it is a CPU hog.
> 
> Feel free to check pgdat_balanced() and prepare_kswapd_sleep().

Thanks for pointing out this gap in my understanding - I'm taking a closer look
at these paths to see whether there is room for what Robin suggested.
diff mbox series

Patch

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index cb12df402161..854ff65ac6b0 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1070,6 +1070,11 @@ 
 			Disable Dynamic DMA Window support. Use this
 			to workaround buggy firmware.
 
+	disable_dma32=	[KNL]
+			Dynamically disable ZONE_DMA32 on kernels compiled with
+			CONFIG_ZONE_DMA32=y.
+
+
 	disable_ipv6=	[IPV6]
 			See Documentation/networking/ipv6.rst.
 
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 58a0bb2c17f1..1a56098c0e19 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -118,6 +118,12 @@  static int __init reserve_crashkernel_low(unsigned long long low_size)
 	return 0;
 }
 
+/*
+ * Provide a run-time mean of disabling ZONE_DMA32 if it is enabled via
+ * CONFIG_ZONE_DMA32.
+ */
+static bool disable_dma32 __ro_after_init;
+
 /*
  * reserve_crashkernel() - reserves memory for crash kernel
  *
@@ -244,7 +250,7 @@  static void __init zone_sizes_init(void)
 	max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit);
 #endif
 #ifdef CONFIG_ZONE_DMA32
-	max_zone_pfns[ZONE_DMA32] = PFN_DOWN(dma32_phys_limit);
+	max_zone_pfns[ZONE_DMA32] = disable_dma32 ? 0 : PFN_DOWN(dma32_phys_limit);
 	if (!arm64_dma_phys_limit)
 		arm64_dma_phys_limit = dma32_phys_limit;
 #endif
@@ -253,6 +259,18 @@  static void __init zone_sizes_init(void)
 	free_area_init(max_zone_pfns);
 }
 
+static int __init early_disable_dma32(char *buf)
+{
+	if (!buf)
+		return -EINVAL;
+
+	if (!strcmp(buf, "on"))
+		disable_dma32 = true;
+
+	return 0;
+}
+early_param("disable_dma32", early_disable_dma32);
+
 int pfn_is_map_memory(unsigned long pfn)
 {
 	phys_addr_t addr = PFN_PHYS(pfn);
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index cb258f58fdc8..b8af7e2f21f5 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -112,6 +112,12 @@  static unsigned long min_pfn_mapped;
 
 static bool __initdata can_use_brk_pgt = true;
 
+/*
+ * Provide a run-time mean of disabling ZONE_DMA32 if it is enabled via
+ * CONFIG_ZONE_DMA32.
+ */
+static bool disable_dma32 __ro_after_init;
+
 /*
  * Pages returned are already directly mapped.
  *
@@ -1032,7 +1038,7 @@  void __init zone_sizes_init(void)
 	max_zone_pfns[ZONE_DMA]		= min(MAX_DMA_PFN, max_low_pfn);
 #endif
 #ifdef CONFIG_ZONE_DMA32
-	max_zone_pfns[ZONE_DMA32]	= min(MAX_DMA32_PFN, max_low_pfn);
+	max_zone_pfns[ZONE_DMA32]	= disable_dma32 ? 0 : min(MAX_DMA32_PFN, max_low_pfn);
 #endif
 	max_zone_pfns[ZONE_NORMAL]	= max_low_pfn;
 #ifdef CONFIG_HIGHMEM
@@ -1042,6 +1048,18 @@  void __init zone_sizes_init(void)
 	free_area_init(max_zone_pfns);
 }
 
+static int __init early_disable_dma32(char *buf)
+{
+	if (!buf)
+		return -EINVAL;
+
+	if (!strcmp(buf, "on"))
+		disable_dma32 = true;
+
+	return 0;
+}
+early_param("disable_dma32", early_disable_dma32);
+
 __visible DEFINE_PER_CPU_ALIGNED(struct tlb_state, cpu_tlbstate) = {
 	.loaded_mm = &init_mm,
 	.next_asid = 1,
diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
index 18aade195884..ed69618cf1fc 100644
--- a/include/linux/dma-direct.h
+++ b/include/linux/dma-direct.h
@@ -24,6 +24,28 @@  struct bus_dma_region {
 	u64		offset;
 };
 
+static inline bool zone_dma32_is_empty(int node)
+{
+#ifdef CONFIG_ZONE_DMA32
+	pg_data_t *pgdat = NODE_DATA(node);
+
+	return zone_is_empty(&pgdat->node_zones[ZONE_DMA32]);
+#else
+	return true;
+#endif
+}
+
+static inline bool zone_dma32_are_empty(void)
+{
+	int node;
+
+	for_each_node(node)
+		if (!zone_dma32_is_empty(node))
+			return false;
+
+	return true;
+}
+
 static inline dma_addr_t translate_phys_to_dma(struct device *dev,
 		phys_addr_t paddr)
 {
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 63859a101ed8..754210c65658 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -60,7 +60,7 @@  static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
 	*phys_limit = dma_to_phys(dev, dma_limit);
 	if (*phys_limit <= DMA_BIT_MASK(zone_dma_bits))
 		return GFP_DMA;
-	if (*phys_limit <= DMA_BIT_MASK(32))
+	if (*phys_limit <= DMA_BIT_MASK(32) && !zone_dma32_is_empty(dev_to_node(dev)))
 		return GFP_DMA32;
 	return 0;
 }
@@ -145,7 +145,8 @@  static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
 
 		if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
 		    phys_limit < DMA_BIT_MASK(64) &&
-		    !(gfp & (GFP_DMA32 | GFP_DMA))) {
+		    !(gfp & (GFP_DMA32 | GFP_DMA)) &&
+		    !zone_dma32_is_empty(node)) {
 			gfp |= GFP_DMA32;
 			goto again;
 		}
diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
index 4d40dcce7604..8e79903fbda8 100644
--- a/kernel/dma/pool.c
+++ b/kernel/dma/pool.c
@@ -71,7 +71,7 @@  static bool cma_in_zone(gfp_t gfp)
 	end = cma_get_base(cma) + size - 1;
 	if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA))
 		return end <= DMA_BIT_MASK(zone_dma_bits);
-	if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
+	if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32) && !zone_dma32_are_empty())
 		return end <= DMA_BIT_MASK(32);
 	return true;
 }
@@ -153,7 +153,7 @@  static void atomic_pool_work_fn(struct work_struct *work)
 	if (IS_ENABLED(CONFIG_ZONE_DMA))
 		atomic_pool_resize(atomic_pool_dma,
 				   GFP_KERNEL | GFP_DMA);
-	if (IS_ENABLED(CONFIG_ZONE_DMA32))
+	if (IS_ENABLED(CONFIG_ZONE_DMA32) && !zone_dma32_are_empty())
 		atomic_pool_resize(atomic_pool_dma32,
 				   GFP_KERNEL | GFP_DMA32);
 	atomic_pool_resize(atomic_pool_kernel, GFP_KERNEL);
@@ -209,7 +209,7 @@  static int __init dma_atomic_pool_init(void)
 		if (!atomic_pool_dma)
 			ret = -ENOMEM;
 	}
-	if (IS_ENABLED(CONFIG_ZONE_DMA32)) {
+	if (IS_ENABLED(CONFIG_ZONE_DMA32) && !zone_dma32_are_empty()) {
 		atomic_pool_dma32 = __dma_atomic_pool_init(atomic_pool_size,
 						GFP_KERNEL | GFP_DMA32);
 		if (!atomic_pool_dma32)
@@ -224,7 +224,7 @@  postcore_initcall(dma_atomic_pool_init);
 static inline struct gen_pool *dma_guess_pool(struct gen_pool *prev, gfp_t gfp)
 {
 	if (prev == NULL) {
-		if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
+		if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32) && !zone_dma32_are_empty())
 			return atomic_pool_dma32;
 		if (atomic_pool_dma && (gfp & GFP_DMA))
 			return atomic_pool_dma;