[2/2] arm64: dma-mapping: add relaxed DMA sync

Message ID 1597736591-20457-2-git-send-email-pullip.cho@samsung.com (mailing list archive)
State New, archived
Series [1/2] dma-mapping: introduce relaxed version of dma sync

Commit Message

Cho KyongHo Aug. 18, 2020, 7:43 a.m. UTC
__dma_[un]map_area() implements the cache maintenance operations for
DMA on arm64. The 'dsb sy' at the end of the subroutine is required
because it guarantees that the view of the given memory area is
consistent for all memory observers.
However, dma_sync_sg_for_{device|cpu}() and dma_[un]map_sg() call
__dma_[un]map_area() nents times, so 'dsb sy' is also executed nents
times. We have observed that a single 'dsb sy' consumes more time than
cleaning or invalidating a 4KiB area.
arch_sync_dma_for_{device|cpu}_relaxed() and
arch_sync_barrier_for_{device|cpu}() were introduced in commit
6a9356234 ("dma-mapping: introduce relaxed version of dma sync") to
remove the redundant memory barriers from the sg versions of the DMA
sync API. Implementing the relaxed DMA sync API on arm64 therefore
dramatically improves the performance of dma_sync_sg_for_{device|cpu}().
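
For illustration, the sg path can then hoist the barrier out of the
per-entry loop along these lines (a minimal sketch of the common code
enabled by patch 1/2, not the exact implementation; swiotlb bouncing
and error handling are omitted):

	void dma_direct_sync_sg_for_device(struct device *dev,
			struct scatterlist *sgl, int nents,
			enum dma_data_direction dir)
	{
		struct scatterlist *sg;
		int i;

		/* per-entry cache maintenance, no 'dsb sy' in the loop */
		for_each_sg(sgl, sg, nents, i)
			arch_sync_dma_for_device_relaxed(sg_phys(sg),
							 sg->length, dir);

		/* one barrier for the whole scatterlist */
		arch_sync_barrier_for_device(dir);
	}

A 32-entry scatterlist thus executes one 'dsb sy' instead of 32.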

Signed-off-by: Cho KyongHo <pullip.cho@samsung.com>
---
 arch/arm64/Kconfig                 |  4 ++--
 arch/arm64/include/asm/assembler.h | 33 ++++++++++++++++++++++++++++++++-
 arch/arm64/include/asm/barrier.h   | 13 +++++++++++++
 arch/arm64/mm/cache.S              | 34 +++++++++++-----------------------
 arch/arm64/mm/dma-mapping.c        |  4 ++--
 include/linux/dma-noncoherent.h    |  1 +
 6 files changed, 61 insertions(+), 28 deletions(-)
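
With ARCH_HAS_SYNC_DMA_FOR_{DEVICE|CPU}_RELAXED selected, the strongly
ordered single-buffer sync is presumably recovered by generic wrappers
from patch 1/2 that pair the relaxed call with the matching barrier,
roughly as follows (illustrative sketch only, not code from this
series):

	static inline void arch_sync_dma_for_cpu(phys_addr_t paddr,
			size_t size, enum dma_data_direction dir)
	{
		arch_sync_dma_for_cpu_relaxed(paddr, size, dir);
		/* 'dsb sy', issued only for DMA_FROM_DEVICE */
		arch_sync_barrier_for_cpu(dir);
	}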

Patch

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 6d23283..4fc7ef4 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -31,8 +31,8 @@  config ARM64
 	select ARCH_HAS_SET_MEMORY
 	select ARCH_HAS_STRICT_KERNEL_RWX
 	select ARCH_HAS_STRICT_MODULE_RWX
-	select ARCH_HAS_SYNC_DMA_FOR_DEVICE
-	select ARCH_HAS_SYNC_DMA_FOR_CPU
+	select ARCH_HAS_SYNC_DMA_FOR_DEVICE_RELAXED
+	select ARCH_HAS_SYNC_DMA_FOR_CPU_RELAXED
 	select ARCH_HAS_SYSCALL_WRAPPER
 	select ARCH_HAS_TEARDOWN_DMA_OPS if IOMMU_SUPPORT
 	select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index 54d1811..1f87d98 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -345,6 +345,33 @@  alternative_endif
 	.endm
 
 /*
+ * Macro to perform a data cache invalidation for the interval
+ * [kaddr, kaddr + size)
+ *
+ *	kaddr:		starting virtual address of the region
+ *	size:		size of the region
+ *	Corrupts:	kaddr, size, tmp1, tmp2
+ */
+	.macro __dcache_inv_by_line kaddr, size, tmp1, tmp2
+	add	\size, \size, \kaddr
+	dcache_line_size \tmp1, \tmp2
+	sub	\tmp2, \tmp1, #1
+	tst	\size, \tmp2		// end cache line aligned?
+	bic	\size, \size, \tmp2
+	b.eq	9997f
+	dc	civac, \size		// clean & invalidate D / U line
+9997:	tst	\kaddr, \tmp2		// start cache line aligned?
+	bic	\kaddr, \kaddr, \tmp2
+	b.eq	9998f
+	dc	civac, \kaddr		// clean & invalidate D / U line
+	b	9999f
+9998:	dc	ivac, \kaddr		// invalidate D / U line
+9999:	add	\kaddr, \kaddr, \tmp1
+	cmp	\kaddr, \size
+	b.lo	9998b
+	.endm
+
+/*
  * Macro to perform a data cache maintenance for the interval
  * [kaddr, kaddr + size)
  *
@@ -362,7 +389,7 @@  alternative_else
 alternative_endif
 	.endm
 
-	.macro dcache_by_line_op op, domain, kaddr, size, tmp1, tmp2
+	.macro __dcache_by_line_op op, kaddr, size, tmp1, tmp2
 	dcache_line_size \tmp1, \tmp2
 	add	\size, \kaddr, \size
 	sub	\tmp2, \tmp1, #1
@@ -388,6 +415,10 @@  alternative_endif
 	add	\kaddr, \kaddr, \tmp1
 	cmp	\kaddr, \size
 	b.lo	9998b
+	.endm
+
+	.macro dcache_by_line_op op, domain, kaddr, size, tmp1, tmp2
+	__dcache_by_line_op \op, \kaddr, \size, \tmp1, \tmp2
 	dsb	\domain
 	.endm
 
diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h
index fb4c275..96bbbf6 100644
--- a/arch/arm64/include/asm/barrier.h
+++ b/arch/arm64/include/asm/barrier.h
@@ -167,6 +167,19 @@  do {									\
 
 #include <asm-generic/barrier.h>
 
+#include <linux/dma-direction.h>
+
+static inline void arch_sync_barrier_for_device(enum dma_data_direction dir)
+{
+	dsb(sy);
+}
+
+static inline void arch_sync_barrier_for_cpu(enum dma_data_direction dir)
+{
+	if (dir == DMA_FROM_DEVICE)
+		dsb(sy);
+}
+
 #endif	/* __ASSEMBLY__ */
 
 #endif	/* __ASM_BARRIER_H */
diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S
index 2d881f3..7180256 100644
--- a/arch/arm64/mm/cache.S
+++ b/arch/arm64/mm/cache.S
@@ -138,34 +138,20 @@  SYM_FUNC_END(__clean_dcache_area_pou)
  *	- kaddr   - kernel address
  *	- size    - size in question
  */
-SYM_FUNC_START_LOCAL(__dma_inv_area)
 SYM_FUNC_START_PI(__inval_dcache_area)
-	/* FALLTHROUGH */
+	__dcache_inv_by_line x0, x1, x2, x3
+	dsb	sy
+	ret
+SYM_FUNC_END_PI(__inval_dcache_area)
 
 /*
  *	__dma_inv_area(start, size)
  *	- start   - virtual start address of region
  *	- size    - size in question
  */
-	add	x1, x1, x0
-	dcache_line_size x2, x3
-	sub	x3, x2, #1
-	tst	x1, x3				// end cache line aligned?
-	bic	x1, x1, x3
-	b.eq	1f
-	dc	civac, x1			// clean & invalidate D / U line
-1:	tst	x0, x3				// start cache line aligned?
-	bic	x0, x0, x3
-	b.eq	2f
-	dc	civac, x0			// clean & invalidate D / U line
-	b	3f
-2:	dc	ivac, x0			// invalidate D / U line
-3:	add	x0, x0, x2
-	cmp	x0, x1
-	b.lo	2b
-	dsb	sy
+SYM_FUNC_START_LOCAL(__dma_inv_area)
+	__dcache_inv_by_line x0, x1, x2, x3
 	ret
-SYM_FUNC_END_PI(__inval_dcache_area)
 SYM_FUNC_END(__dma_inv_area)
 
 /*
@@ -177,16 +163,17 @@  SYM_FUNC_END(__dma_inv_area)
  *	- kaddr   - kernel address
  *	- size    - size in question
  */
-SYM_FUNC_START_LOCAL(__dma_clean_area)
 SYM_FUNC_START_PI(__clean_dcache_area_poc)
-	/* FALLTHROUGH */
+	dcache_by_line_op cvac, sy, x0, x1, x2, x3
+	ret
+SYM_FUNC_END_PI(__clean_dcache_area_poc)
 
 /*
  *	__dma_clean_area(start, size)
  *	- start   - virtual start address of region
  *	- size    - size in question
  */
-	dcache_by_line_op cvac, sy, x0, x1, x2, x3
+SYM_FUNC_START_LOCAL(__dma_clean_area)
+	__dcache_by_line_op cvac, x0, x1, x2, x3
 	ret
-SYM_FUNC_END_PI(__clean_dcache_area_poc)
 SYM_FUNC_END(__dma_clean_area)
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index 6c45350..12943b3 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -13,13 +13,13 @@ 
 
 #include <asm/cacheflush.h>
 
-void arch_sync_dma_for_device(phys_addr_t paddr, size_t size,
+void arch_sync_dma_for_device_relaxed(phys_addr_t paddr, size_t size,
 		enum dma_data_direction dir)
 {
 	__dma_map_area(phys_to_virt(paddr), size, dir);
 }
 
-void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
+void arch_sync_dma_for_cpu_relaxed(phys_addr_t paddr, size_t size,
 		enum dma_data_direction dir)
 {
 	__dma_unmap_area(phys_to_virt(paddr), size, dir);
diff --git a/include/linux/dma-noncoherent.h b/include/linux/dma-noncoherent.h
index 0a31e6c..f88fc0f 100644
--- a/include/linux/dma-noncoherent.h
+++ b/include/linux/dma-noncoherent.h
@@ -5,6 +5,7 @@ 
 #include <linux/dma-mapping.h>
 #include <linux/pgtable.h>
 
+#include <asm/barrier.h>
 #ifdef CONFIG_ARCH_HAS_DMA_COHERENCE_H
 #include <asm/dma-coherence.h>
 #elif defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \