
[net-next,v3,1/7] dma: compile-out DMA sync op calls when not used

Message ID 20240214162201.4168778-2-aleksander.lobakin@intel.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Series dma: skip calling no-op sync ops when possible

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 10484 this patch: 10484
netdev/build_tools success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers success CCed 5 of 5 maintainers
netdev/build_clang success Errors and warnings before: 2507 this patch: 2507
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 11384 this patch: 11384
netdev/checkpatch warning CHECK: Alignment should match open parenthesis CHECK: Please use a blank line after function/struct/union/enum declarations
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 2 this patch: 2
netdev/source_inline success Was 0 now: 0
netdev/contest success net-next-2024-02-15--00-00 (tests: 1443)

Commit Message

Alexander Lobakin Feb. 14, 2024, 4:21 p.m. UTC
Some platforms do have DMA, but DMA on them is always direct and
coherent. Currently, even on such platforms DMA sync operations are
compiled and called.
Add a new hidden Kconfig symbol, DMA_NEED_SYNC, and set it only when
sync operations are needed, DMA ops are in use, or swiotlb is enabled.
Define dma_need_sync() and dma_skip_sync() based on this symbol and
don't call the sync ops when dma_skip_sync() is true.
The change allows future optimizations of DMA sync calls depending on
compile-time or runtime conditions.
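
For illustration, here is a minimal sketch of the resulting call path (an editorial example, not part of the patch; the driver-side call is hypothetical):

	/* Typical driver code, unchanged by this series: */
	dma_sync_single_for_cpu(dev, rx_dma, len, DMA_FROM_DEVICE);

	/* With this patch the inline wrapper expands to roughly: */
	if (!dma_skip_sync(dev))
		__dma_sync_single_for_cpu(dev, rx_dma, len, DMA_FROM_DEVICE);

	/*
	 * dma_skip_sync() is constant true when CONFIG_DMA_NEED_SYNC is not
	 * set, so the branch above is compiled out on platforms where DMA is
	 * always direct and coherent.
	 */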

Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
---
 kernel/dma/Kconfig          |  4 ++
 include/linux/dma-mapping.h | 80 +++++++++++++++++++++++++++++++------
 kernel/dma/mapping.c        | 20 +++++-----
 3 files changed, 81 insertions(+), 23 deletions(-)

Comments

Robin Murphy Feb. 14, 2024, 5:20 p.m. UTC | #1
On 2024-02-14 4:21 pm, Alexander Lobakin wrote:
> Some platforms do have DMA, but DMA on them is always direct and
> coherent. Currently, even on such platforms DMA sync operations are
> compiled and called.
> Add a new hidden Kconfig symbol, DMA_NEED_SYNC, and set it only when
> sync operations are needed, DMA ops are in use, or swiotlb is enabled.
> Define dma_need_sync() and dma_skip_sync() based on this symbol and
> don't call the sync ops when dma_skip_sync() is true.
> The change allows future optimizations of DMA sync calls depending on
> compile-time or runtime conditions.
> 
> Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
> ---
>   kernel/dma/Kconfig          |  4 ++
>   include/linux/dma-mapping.h | 80 +++++++++++++++++++++++++++++++------
>   kernel/dma/mapping.c        | 20 +++++-----
>   3 files changed, 81 insertions(+), 23 deletions(-)
> 
> diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
> index d62f5957f36b..1c9ff05b1ecb 100644
> --- a/kernel/dma/Kconfig
> +++ b/kernel/dma/Kconfig
> @@ -107,6 +107,10 @@ config DMA_BOUNCE_UNALIGNED_KMALLOC
>   	bool
>   	depends on SWIOTLB
>   
> +config DMA_NEED_SYNC
> +	def_bool ARCH_HAS_SYNC_DMA_FOR_DEVICE || ARCH_HAS_SYNC_DMA_FOR_CPU || \
> +		 ARCH_HAS_SYNC_DMA_FOR_CPU_ALL || DMA_OPS || SWIOTLB

I'm not sure DMA_OPS belongs here - several architectures have 
non-trivial ops without syncs, e.g. Alpha.

> +
>   config DMA_RESTRICTED_POOL
>   	bool "DMA Restricted Pool"
>   	depends on OF && OF_RESERVED_MEM && SWIOTLB
> diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
> index 4a658de44ee9..6c7640441214 100644
> --- a/include/linux/dma-mapping.h
> +++ b/include/linux/dma-mapping.h
> @@ -117,13 +117,13 @@ dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr,
>   		size_t size, enum dma_data_direction dir, unsigned long attrs);
>   void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size,
>   		enum dma_data_direction dir, unsigned long attrs);
> -void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
> +void __dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
>   		enum dma_data_direction dir);
> -void dma_sync_single_for_device(struct device *dev, dma_addr_t addr,
> +void __dma_sync_single_for_device(struct device *dev, dma_addr_t addr,
>   		size_t size, enum dma_data_direction dir);
> -void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
> +void __dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
>   		    int nelems, enum dma_data_direction dir);
> -void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
> +void __dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
>   		       int nelems, enum dma_data_direction dir);
>   void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
>   		gfp_t flag, unsigned long attrs);
> @@ -147,7 +147,7 @@ u64 dma_get_required_mask(struct device *dev);
>   bool dma_addressing_limited(struct device *dev);
>   size_t dma_max_mapping_size(struct device *dev);
>   size_t dma_opt_mapping_size(struct device *dev);
> -bool dma_need_sync(struct device *dev, dma_addr_t dma_addr);
> +bool __dma_need_sync(struct device *dev, dma_addr_t dma_addr);
>   unsigned long dma_get_merge_boundary(struct device *dev);
>   struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size,
>   		enum dma_data_direction dir, gfp_t gfp, unsigned long attrs);
> @@ -195,19 +195,19 @@ static inline void dma_unmap_resource(struct device *dev, dma_addr_t addr,
>   		size_t size, enum dma_data_direction dir, unsigned long attrs)
>   {
>   }
> -static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
> -		size_t size, enum dma_data_direction dir)
> +static inline void __dma_sync_single_for_cpu(struct device *dev,
> +		dma_addr_t addr, size_t size, enum dma_data_direction dir)

To me it would feel more logical to put all the wrappers inside the 
#ifdef CONFIG_HAS_DMA and not touch these stubs at all (what does it 
mean to skip an inline no-op?). Or in fact, if dma_skip_sync() is 
constant false for !HAS_DMA, then we could also just make the external 
function declarations unconditional and remove the stubs. Not a critical 
matter though, and I defer to whatever Christoph thinks is most 
maintainable.
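
For reference, a rough sketch of the alternative Robin describes (an editorial assumption about the intent, not code from the series), assuming dma_skip_sync() is compile-time true for !HAS_DMA so the calls to the out-of-line functions are compiled out:

	/* Declaration made unconditional; the !HAS_DMA stub goes away. */
	void __dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
			size_t size, enum dma_data_direction dir);

	/* Single wrapper definition; with !HAS_DMA the branch is never
	 * taken, so the out-of-line symbol is never referenced. */
	static inline void dma_sync_single_for_cpu(struct device *dev,
			dma_addr_t addr, size_t size, enum dma_data_direction dir)
	{
		if (!dma_skip_sync(dev))
			__dma_sync_single_for_cpu(dev, addr, size, dir);
	}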

>   {
>   }
> -static inline void dma_sync_single_for_device(struct device *dev,
> +static inline void __dma_sync_single_for_device(struct device *dev,
>   		dma_addr_t addr, size_t size, enum dma_data_direction dir)
>   {
>   }
> -static inline void dma_sync_sg_for_cpu(struct device *dev,
> +static inline void __dma_sync_sg_for_cpu(struct device *dev,
>   		struct scatterlist *sg, int nelems, enum dma_data_direction dir)
>   {
>   }
> -static inline void dma_sync_sg_for_device(struct device *dev,
> +static inline void __dma_sync_sg_for_device(struct device *dev,
>   		struct scatterlist *sg, int nelems, enum dma_data_direction dir)
>   {
>   }
> @@ -277,7 +277,7 @@ static inline size_t dma_opt_mapping_size(struct device *dev)
>   {
>   	return 0;
>   }
> -static inline bool dma_need_sync(struct device *dev, dma_addr_t dma_addr)
> +static inline bool __dma_need_sync(struct device *dev, dma_addr_t dma_addr)
>   {
>   	return false;
>   }
> @@ -348,18 +348,72 @@ static inline void dma_unmap_single_attrs(struct device *dev, dma_addr_t addr,
>   	return dma_unmap_page_attrs(dev, addr, size, dir, attrs);
>   }
>   
> +static inline void __dma_sync_single_range_for_cpu(struct device *dev,
> +		dma_addr_t addr, unsigned long offset, size_t size,
> +		enum dma_data_direction dir)
> +{
> +	__dma_sync_single_for_cpu(dev, addr + offset, size, dir);
> +}
> +
> +static inline void __dma_sync_single_range_for_device(struct device *dev,
> +		dma_addr_t addr, unsigned long offset, size_t size,
> +		enum dma_data_direction dir)
> +{
> +	__dma_sync_single_for_device(dev, addr + offset, size, dir);
> +}

There is no need to introduce these two.

> +
> +static inline bool dma_skip_sync(const struct device *dev)
> +{
> +	return !IS_ENABLED(CONFIG_DMA_NEED_SYNC);
> +}
> +
> +static inline bool dma_need_sync(struct device *dev, dma_addr_t dma_addr)
> +{
> +	return !dma_skip_sync(dev) ? __dma_need_sync(dev, dma_addr) : false;
> +}

That's a bit of a mind-bender... is it actually just

	return !dma_skip_sync(dev) && __dma_need_sync(dev, dma_addr);

?

(I do still think the negative flag makes it all a little harder to 
follow in general than a positive "device needs to consider syncs" flag 
would.)

> +static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
> +		size_t size, enum dma_data_direction dir)
> +{
> +	if (!dma_skip_sync(dev))
> +		__dma_sync_single_for_cpu(dev, addr, size, dir);
> +}
> +
> +static inline void dma_sync_single_for_device(struct device *dev,
> +		dma_addr_t addr, size_t size, enum dma_data_direction dir)
> +{
> +	if (!dma_skip_sync(dev))
> +		__dma_sync_single_for_device(dev, addr, size, dir);
> +}
> +
> +static inline void dma_sync_sg_for_cpu(struct device *dev,
> +		struct scatterlist *sg, int nelems, enum dma_data_direction dir)
> +{
> +	if (!dma_skip_sync(dev))
> +		__dma_sync_sg_for_cpu(dev, sg, nelems, dir);
> +}
> +
> +static inline void dma_sync_sg_for_device(struct device *dev,
> +		struct scatterlist *sg, int nelems, enum dma_data_direction dir)
> +{
> +	if (!dma_skip_sync(dev))
> +		__dma_sync_sg_for_device(dev, sg, nelems, dir);
> +}
> +
>   static inline void dma_sync_single_range_for_cpu(struct device *dev,
>   		dma_addr_t addr, unsigned long offset, size_t size,
>   		enum dma_data_direction dir)
>   {
> -	return dma_sync_single_for_cpu(dev, addr + offset, size, dir);
> +	if (!dma_skip_sync(dev))
> +		__dma_sync_single_for_cpu(dev, addr + offset, size, dir);
>   }
>   
>   static inline void dma_sync_single_range_for_device(struct device *dev,
>   		dma_addr_t addr, unsigned long offset, size_t size,
>   		enum dma_data_direction dir)
>   {
> -	return dma_sync_single_for_device(dev, addr + offset, size, dir);
> +	if (!dma_skip_sync(dev))
> +		__dma_sync_single_for_device(dev, addr + offset, size, dir);
>   }

These two don't need changing either, since the dma_sync_single_* 
wrappers have already taken care of it.

Thanks,
Robin.
Robin Murphy Feb. 14, 2024, 6:09 p.m. UTC | #2
On 2024-02-14 4:21 pm, Alexander Lobakin wrote:
[...]
> +static inline bool dma_skip_sync(const struct device *dev)
> +{
> +	return !IS_ENABLED(CONFIG_DMA_NEED_SYNC);
> +}

One more thing: could we please also make this conditional on 
!CONFIG_DMA_API_DEBUG, so that dma-debug doesn't lose coverage for 
validating syncs?
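
A minimal sketch of what that might look like (an editorial assumption, not code from the series):

	static inline bool dma_skip_sync(const struct device *dev)
	{
		/* Never skip syncs while dma-debug wants to check them. */
		return !IS_ENABLED(CONFIG_DMA_NEED_SYNC) &&
		       !IS_ENABLED(CONFIG_DMA_API_DEBUG);
	}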

Thanks,
Robin.
Christoph Hellwig Feb. 15, 2024, 5:06 a.m. UTC | #3
On Wed, Feb 14, 2024 at 05:20:50PM +0000, Robin Murphy wrote:
>>   +config DMA_NEED_SYNC
>> +	def_bool ARCH_HAS_SYNC_DMA_FOR_DEVICE || ARCH_HAS_SYNC_DMA_FOR_CPU || \
>> +		 ARCH_HAS_SYNC_DMA_FOR_CPU_ALL || DMA_OPS || SWIOTLB
>
> I'm not sure DMA_OPS belongs here - several architectures have non-trivial 
> ops without syncs, e.g. Alpha.

True, but peeking through the ops is a bit hard.  And I don't think it's
worth optimizing the dma sync performance on Alpha :)

>> +static inline void __dma_sync_single_for_cpu(struct device *dev,
>> +		dma_addr_t addr, size_t size, enum dma_data_direction dir)
>
> To me it would feel more logical to put all the wrappers inside the #ifdef 
> CONFIG_HAS_DMA and not touch these stubs at all (what does it mean to skip 
> an inline no-op?). Or in fact, if dma_skip_sync() is constant false for 
> !HAS_DMA, then we could also just make the external function declarations 
> unconditional and remove the stubs. Not a critical matter though, and I 
> defer to whatever Christoph thinks is most maintainable.

Your idea sounds reasonable to me, but I don't have a strong preference.

>> +static inline bool dma_need_sync(struct device *dev, dma_addr_t dma_addr)
>> +{
>> +	return !dma_skip_sync(dev) ? __dma_need_sync(dev, dma_addr) : false;
>> +}
>
> That's a bit of a mind-bender... is it actually just
>
> 	return !dma_skip_sync(dev) && __dma_need_sync(dev, dma_addr);
>
> ?

That looks a lot more readable for sure.

> (I do still think the negative flag makes it all a little harder to follow 
> in general than a positive "device needs to consider syncs" flag would.)

Probably.
Christoph Hellwig Feb. 15, 2024, 5:06 a.m. UTC | #4
On Wed, Feb 14, 2024 at 06:09:08PM +0000, Robin Murphy wrote:
> On 2024-02-14 4:21 pm, Alexander Lobakin wrote:
> [...]
>> +static inline bool dma_skip_sync(const struct device *dev)
>> +{
>> +	return !IS_ENABLED(CONFIG_DMA_NEED_SYNC);
>> +}
>
> One more thing: could we please also make this conditional on 
> !CONFIG_DMA_API_DEBUG, so that dma-debug doesn't lose coverage for 
> validating syncs?

Agreed.
Alexander Lobakin Feb. 19, 2024, 12:53 p.m. UTC | #5
From: Robin Murphy <robin.murphy@arm.com>
Date: Wed, 14 Feb 2024 17:20:50 +0000

> On 2024-02-14 4:21 pm, Alexander Lobakin wrote:

[...]

>> -static inline void dma_sync_single_for_cpu(struct device *dev,
>> dma_addr_t addr,
>> -        size_t size, enum dma_data_direction dir)
>> +static inline void __dma_sync_single_for_cpu(struct device *dev,
>> +        dma_addr_t addr, size_t size, enum dma_data_direction dir)
> 
> To me it would feel more logical to put all the wrappers inside the
> #ifdef CONFIG_HAS_DMA and not touch these stubs at all (what does it
> mean to skip an inline no-op?). Or in fact, if dma_skip_sync() is
> constant false for !HAS_DMA, then we could also just make the external
> function declarations unconditional and remove the stubs. Not a critical
> matter though, and I defer to whatever Christoph thinks is most
> maintainable.

It's done like that because I'm adding a runtime check in the second
patch. I don't think it makes sense to touch this twice.

[...]

>> @@ -348,18 +348,72 @@ static inline void dma_unmap_single_attrs(struct
>> device *dev, dma_addr_t addr,
>>       return dma_unmap_page_attrs(dev, addr, size, dir, attrs);
>>   }
>>   +static inline void __dma_sync_single_range_for_cpu(struct device *dev,
>> +        dma_addr_t addr, unsigned long offset, size_t size,
>> +        enum dma_data_direction dir)
>> +{
>> +    __dma_sync_single_for_cpu(dev, addr + offset, size, dir);
>> +}
>> +
>> +static inline void __dma_sync_single_range_for_device(struct device
>> *dev,
>> +        dma_addr_t addr, unsigned long offset, size_t size,
>> +        enum dma_data_direction dir)
>> +{
>> +    __dma_sync_single_for_device(dev, addr + offset, size, dir);
>> +}
> 
> There is no need to introduce these two.

I already replied to this in the previous thread. Some subsystems may
want to check for the shortcut earlier to avoid call ladders in their
own functions. See patch 6 for an example where I use this one.
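
For illustration, the "check the shortcut early" pattern could look roughly like this (a hedged sketch; the names and loop are hypothetical and not taken from patch 6):

	/* Hoist the check out of the per-fragment loop: */
	if (dma_skip_sync(dev))
		return;

	for (i = 0; i < nr_frags; i++)
		__dma_sync_single_range_for_device(dev, base_addr, offsets[i],
						   frag_len, DMA_TO_DEVICE);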

> 
>> +
>> +static inline bool dma_skip_sync(const struct device *dev)
>> +{
>> +    return !IS_ENABLED(CONFIG_DMA_NEED_SYNC);
>> +}
>> +
>> +static inline bool dma_need_sync(struct device *dev, dma_addr_t
>> dma_addr)
>> +{
>> +    return !dma_skip_sync(dev) ? __dma_need_sync(dev, dma_addr) : false;
>> +}
> 
> That's a bit of a mind-bender... is it actually just
> 
>     return !dma_skip_sync(dev) && __dma_need_sync(dev, dma_addr);

Oh, indeed ._.

> 
> ?
> 
> (I do still think the negative flag makes it all a little harder to
> follow in general than a positive "device needs to consider syncs" flag
> would.)

I think it was part of Eric's original idea and I kept it.
I'm fine with inverting it.

[...]

> Thanks,
> Robin.

Thanks,
Olek
Robin Murphy Feb. 26, 2024, 4:27 p.m. UTC | #6
On 19/02/2024 12:53 pm, Alexander Lobakin wrote:
> From: Robin Murphy <robin.murphy@arm.com>
> Date: Wed, 14 Feb 2024 17:20:50 +0000
> 
>> On 2024-02-14 4:21 pm, Alexander Lobakin wrote:
> 
> [...]
> 
>>> -static inline void dma_sync_single_for_cpu(struct device *dev,
>>> dma_addr_t addr,
>>> -        size_t size, enum dma_data_direction dir)
>>> +static inline void __dma_sync_single_for_cpu(struct device *dev,
>>> +        dma_addr_t addr, size_t size, enum dma_data_direction dir)
>>
>> To me it would feel more logical to put all the wrappers inside the
>> #ifdef CONFIG_HAS_DMA and not touch these stubs at all (what does it
>> mean to skip an inline no-op?). Or in fact, if dma_skip_sync() is
>> constant false for !HAS_DMA, then we could also just make the external
>> function declarations unconditional and remove the stubs. Not a critical
>> matter though, and I defer to whatever Christoph thinks is most
>> maintainable.
> 
> It's done like that due to that I'm adding a runtime check in the second
> patch. I don't feel like touching this twice makes sense.

Huh? Why would anything need touching twice? All I'm saying is that it's 
pretty pointless to add any invocations of dma_skip_sync() in !HAS_DMA 
paths where we already know the whole API is stubbed out anyway. The 
only cases which are worth differentiating here are HAS_DMA + 
DMA_NEED_SYNC vs. HAS_DMA + !DMA_NEED_SYNC (with the subsequent runtime 
check then just subdividing the former).

> 
> [...]
> 
>>> @@ -348,18 +348,72 @@ static inline void dma_unmap_single_attrs(struct
>>> device *dev, dma_addr_t addr,
>>>        return dma_unmap_page_attrs(dev, addr, size, dir, attrs);
>>>    }
>>>    +static inline void __dma_sync_single_range_for_cpu(struct device *dev,
>>> +        dma_addr_t addr, unsigned long offset, size_t size,
>>> +        enum dma_data_direction dir)
>>> +{
>>> +    __dma_sync_single_for_cpu(dev, addr + offset, size, dir);
>>> +}
>>> +
>>> +static inline void __dma_sync_single_range_for_device(struct device
>>> *dev,
>>> +        dma_addr_t addr, unsigned long offset, size_t size,
>>> +        enum dma_data_direction dir)
>>> +{
>>> +    __dma_sync_single_for_device(dev, addr + offset, size, dir);
>>> +}
>>
>> There is no need to introduce these two.
> 
> I already replied to this in the previous thread. Some subsys may want
> to check for the shortcut earlier to avoid call ladders of their own
> functions. See patch 6 for example where I use this one.

Ugh, no. If the page pool code wants to be clever poking around and 
sidestepping parts of the documented API, it can flippin' well open-code 
a single addition to call __dma_sync_single_for_device() directly 
itself. I'm not at all keen on having to maintain "common" APIs for such 
niche trickery.
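
For context, the open-coded variant Robin suggests would presumably look something like the call below (the page_pool field names are only illustrative assumptions):

	/* Add the offset at the call site instead of going through a new
	 * __dma_sync_single_range_for_device() wrapper: */
	__dma_sync_single_for_device(pool->p.dev, dma_addr + offset,
				     len, pool->p.dma_dir);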

Thanks,
Robin.

Patch

diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index d62f5957f36b..1c9ff05b1ecb 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -107,6 +107,10 @@  config DMA_BOUNCE_UNALIGNED_KMALLOC
 	bool
 	depends on SWIOTLB
 
+config DMA_NEED_SYNC
+	def_bool ARCH_HAS_SYNC_DMA_FOR_DEVICE || ARCH_HAS_SYNC_DMA_FOR_CPU || \
+		 ARCH_HAS_SYNC_DMA_FOR_CPU_ALL || DMA_OPS || SWIOTLB
+
 config DMA_RESTRICTED_POOL
 	bool "DMA Restricted Pool"
 	depends on OF && OF_RESERVED_MEM && SWIOTLB
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 4a658de44ee9..6c7640441214 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -117,13 +117,13 @@  dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr,
 		size_t size, enum dma_data_direction dir, unsigned long attrs);
 void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size,
 		enum dma_data_direction dir, unsigned long attrs);
-void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
+void __dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
 		enum dma_data_direction dir);
-void dma_sync_single_for_device(struct device *dev, dma_addr_t addr,
+void __dma_sync_single_for_device(struct device *dev, dma_addr_t addr,
 		size_t size, enum dma_data_direction dir);
-void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
+void __dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
 		    int nelems, enum dma_data_direction dir);
-void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
+void __dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
 		       int nelems, enum dma_data_direction dir);
 void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
 		gfp_t flag, unsigned long attrs);
@@ -147,7 +147,7 @@  u64 dma_get_required_mask(struct device *dev);
 bool dma_addressing_limited(struct device *dev);
 size_t dma_max_mapping_size(struct device *dev);
 size_t dma_opt_mapping_size(struct device *dev);
-bool dma_need_sync(struct device *dev, dma_addr_t dma_addr);
+bool __dma_need_sync(struct device *dev, dma_addr_t dma_addr);
 unsigned long dma_get_merge_boundary(struct device *dev);
 struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size,
 		enum dma_data_direction dir, gfp_t gfp, unsigned long attrs);
@@ -195,19 +195,19 @@  static inline void dma_unmap_resource(struct device *dev, dma_addr_t addr,
 		size_t size, enum dma_data_direction dir, unsigned long attrs)
 {
 }
-static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
-		size_t size, enum dma_data_direction dir)
+static inline void __dma_sync_single_for_cpu(struct device *dev,
+		dma_addr_t addr, size_t size, enum dma_data_direction dir)
 {
 }
-static inline void dma_sync_single_for_device(struct device *dev,
+static inline void __dma_sync_single_for_device(struct device *dev,
 		dma_addr_t addr, size_t size, enum dma_data_direction dir)
 {
 }
-static inline void dma_sync_sg_for_cpu(struct device *dev,
+static inline void __dma_sync_sg_for_cpu(struct device *dev,
 		struct scatterlist *sg, int nelems, enum dma_data_direction dir)
 {
 }
-static inline void dma_sync_sg_for_device(struct device *dev,
+static inline void __dma_sync_sg_for_device(struct device *dev,
 		struct scatterlist *sg, int nelems, enum dma_data_direction dir)
 {
 }
@@ -277,7 +277,7 @@  static inline size_t dma_opt_mapping_size(struct device *dev)
 {
 	return 0;
 }
-static inline bool dma_need_sync(struct device *dev, dma_addr_t dma_addr)
+static inline bool __dma_need_sync(struct device *dev, dma_addr_t dma_addr)
 {
 	return false;
 }
@@ -348,18 +348,72 @@  static inline void dma_unmap_single_attrs(struct device *dev, dma_addr_t addr,
 	return dma_unmap_page_attrs(dev, addr, size, dir, attrs);
 }
 
+static inline void __dma_sync_single_range_for_cpu(struct device *dev,
+		dma_addr_t addr, unsigned long offset, size_t size,
+		enum dma_data_direction dir)
+{
+	__dma_sync_single_for_cpu(dev, addr + offset, size, dir);
+}
+
+static inline void __dma_sync_single_range_for_device(struct device *dev,
+		dma_addr_t addr, unsigned long offset, size_t size,
+		enum dma_data_direction dir)
+{
+	__dma_sync_single_for_device(dev, addr + offset, size, dir);
+}
+
+static inline bool dma_skip_sync(const struct device *dev)
+{
+	return !IS_ENABLED(CONFIG_DMA_NEED_SYNC);
+}
+
+static inline bool dma_need_sync(struct device *dev, dma_addr_t dma_addr)
+{
+	return !dma_skip_sync(dev) ? __dma_need_sync(dev, dma_addr) : false;
+}
+
+static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
+		size_t size, enum dma_data_direction dir)
+{
+	if (!dma_skip_sync(dev))
+		__dma_sync_single_for_cpu(dev, addr, size, dir);
+}
+
+static inline void dma_sync_single_for_device(struct device *dev,
+		dma_addr_t addr, size_t size, enum dma_data_direction dir)
+{
+	if (!dma_skip_sync(dev))
+		__dma_sync_single_for_device(dev, addr, size, dir);
+}
+
+static inline void dma_sync_sg_for_cpu(struct device *dev,
+		struct scatterlist *sg, int nelems, enum dma_data_direction dir)
+{
+	if (!dma_skip_sync(dev))
+		__dma_sync_sg_for_cpu(dev, sg, nelems, dir);
+}
+
+static inline void dma_sync_sg_for_device(struct device *dev,
+		struct scatterlist *sg, int nelems, enum dma_data_direction dir)
+{
+	if (!dma_skip_sync(dev))
+		__dma_sync_sg_for_device(dev, sg, nelems, dir);
+}
+
 static inline void dma_sync_single_range_for_cpu(struct device *dev,
 		dma_addr_t addr, unsigned long offset, size_t size,
 		enum dma_data_direction dir)
 {
-	return dma_sync_single_for_cpu(dev, addr + offset, size, dir);
+	if (!dma_skip_sync(dev))
+		__dma_sync_single_for_cpu(dev, addr + offset, size, dir);
 }
 
 static inline void dma_sync_single_range_for_device(struct device *dev,
 		dma_addr_t addr, unsigned long offset, size_t size,
 		enum dma_data_direction dir)
 {
-	return dma_sync_single_for_device(dev, addr + offset, size, dir);
+	if (!dma_skip_sync(dev))
+		__dma_sync_single_for_device(dev, addr + offset, size, dir);
 }
 
 /**
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 58db8fd70471..85feaa0e008c 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -329,7 +329,7 @@  void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size,
 }
 EXPORT_SYMBOL(dma_unmap_resource);
 
-void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
+void __dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
 		enum dma_data_direction dir)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
@@ -341,9 +341,9 @@  void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
 		ops->sync_single_for_cpu(dev, addr, size, dir);
 	debug_dma_sync_single_for_cpu(dev, addr, size, dir);
 }
-EXPORT_SYMBOL(dma_sync_single_for_cpu);
+EXPORT_SYMBOL(__dma_sync_single_for_cpu);
 
-void dma_sync_single_for_device(struct device *dev, dma_addr_t addr,
+void __dma_sync_single_for_device(struct device *dev, dma_addr_t addr,
 		size_t size, enum dma_data_direction dir)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
@@ -355,9 +355,9 @@  void dma_sync_single_for_device(struct device *dev, dma_addr_t addr,
 		ops->sync_single_for_device(dev, addr, size, dir);
 	debug_dma_sync_single_for_device(dev, addr, size, dir);
 }
-EXPORT_SYMBOL(dma_sync_single_for_device);
+EXPORT_SYMBOL(__dma_sync_single_for_device);
 
-void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
+void __dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
 		    int nelems, enum dma_data_direction dir)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
@@ -369,9 +369,9 @@  void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
 		ops->sync_sg_for_cpu(dev, sg, nelems, dir);
 	debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir);
 }
-EXPORT_SYMBOL(dma_sync_sg_for_cpu);
+EXPORT_SYMBOL(__dma_sync_sg_for_cpu);
 
-void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
+void __dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
 		       int nelems, enum dma_data_direction dir)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
@@ -383,7 +383,7 @@  void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
 		ops->sync_sg_for_device(dev, sg, nelems, dir);
 	debug_dma_sync_sg_for_device(dev, sg, nelems, dir);
 }
-EXPORT_SYMBOL(dma_sync_sg_for_device);
+EXPORT_SYMBOL(__dma_sync_sg_for_device);
 
 /*
  * The whole dma_get_sgtable() idea is fundamentally unsafe - it seems
@@ -841,7 +841,7 @@  size_t dma_opt_mapping_size(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(dma_opt_mapping_size);
 
-bool dma_need_sync(struct device *dev, dma_addr_t dma_addr)
+bool __dma_need_sync(struct device *dev, dma_addr_t dma_addr)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
@@ -849,7 +849,7 @@  bool dma_need_sync(struct device *dev, dma_addr_t dma_addr)
 		return dma_direct_need_sync(dev, dma_addr);
 	return ops->sync_single_for_cpu || ops->sync_single_for_device;
 }
-EXPORT_SYMBOL_GPL(dma_need_sync);
+EXPORT_SYMBOL_GPL(__dma_need_sync);
 
 unsigned long dma_get_merge_boundary(struct device *dev)
 {