diff mbox series

MIPS: Provide unroll() macro, use it for cache ops

Message ID 20191008182149.2082503-1-paul.burton@mips.com (mailing list archive)
State Mainlined
Commit 6baaeadae911ba9cedfead881f3bf305a18fd011
Delegated to: Paul Burton
Headers show
Series MIPS: Provide unroll() macro, use it for cache ops | expand

Commit Message

Paul Burton Oct. 8, 2019, 6:22 p.m. UTC
Currently we have a lot of duplication in asm/r4kcache.h to handle
manually unrolling loops of cache ops for various line sizes, and we
have to explicitly handle the difference in cache op immediate width
between MIPSr6 & earlier ISA revisions with further duplication.

Introduce an unroll() macro in asm/unroll.h. It expands to a switch
statement with fall-through cases, which calls a function or expands a
preprocessor macro a compile-time constant number of times in a row -
effectively unrolling a loop explicitly. We make use of this here to
remove the cache op duplication & will use it further in later patches.

A nice side effect of this is that calculating the cache op offset
immediate is now the compiler's responsibility, so we're no longer
sensitive to the width change of that immediate in MIPSr6 & will be
similarly agnostic to immediate width in any future supported ISA.

Signed-off-by: Paul Burton <paul.burton@mips.com>
---

 arch/mips/include/asm/r4kcache.h | 358 ++-----------------------------
 arch/mips/include/asm/unroll.h   |  76 +++++++
 arch/mips/mm/c-r4k.c             |  12 +-
 3 files changed, 103 insertions(+), 343 deletions(-)
 create mode 100644 arch/mips/include/asm/unroll.h

Comments

Paul Burton Oct. 9, 2019, 9:58 p.m. UTC | #1
Hello,

Paul Burton wrote:
> Currently we have a lot of duplication in asm/r4kcache.h to handle
> manually unrolling loops of cache ops for various line sizes, and we
> have to explicitly handle the difference in cache op immediate width
> between MIPSr6 & earlier ISA revisions with further duplication.
> 
> Introduce an unroll() macro in asm/unroll.h. It expands to a switch
> statement with fall-through cases, which calls a function or expands a
> preprocessor macro a compile-time constant number of times in a row -
> effectively unrolling a loop explicitly. We make use of this here to
> remove the cache op duplication & will use it further in later patches.
> 
> A nice side effect of this is that calculating the cache op offset
> immediate is now the compiler's responsibility, so we're no longer
> sensitive to the width change of that immediate in MIPSr6 & will be
> similarly agnostic to immediate width in any future supported ISA.

Applied to mips-next.

> commit 6baaeadae911
> https://git.kernel.org/mips/c/6baaeadae911
> 
> Signed-off-by: Paul Burton <paul.burton@mips.com>

Thanks,
    Paul

[ This message was auto-generated; if you believe anything is incorrect
  then please email paul.burton@mips.com to report it. ]
diff mbox series

Patch

diff --git a/arch/mips/include/asm/r4kcache.h b/arch/mips/include/asm/r4kcache.h
index 7f4a32d3345a..e73fc9e899d2 100644
--- a/arch/mips/include/asm/r4kcache.h
+++ b/arch/mips/include/asm/r4kcache.h
@@ -15,12 +15,14 @@ 
 #include <linux/stringify.h>
 
 #include <asm/asm.h>
+#include <asm/asm-eva.h>
 #include <asm/cacheops.h>
 #include <asm/compiler.h>
 #include <asm/cpu-features.h>
 #include <asm/cpu-type.h>
 #include <asm/mipsmtregs.h>
 #include <asm/mmzone.h>
+#include <asm/unroll.h>
 #include <linux/uaccess.h> /* for uaccess_kernel() */
 
 extern void (*r4k_blast_dcache)(void);
@@ -39,16 +41,19 @@  extern void (*r4k_blast_icache)(void);
  */
 #define INDEX_BASE	CKSEG0
 
-#define cache_op(op,addr)						\
+#define _cache_op(insn, op, addr)					\
 	__asm__ __volatile__(						\
 	"	.set	push					\n"	\
 	"	.set	noreorder				\n"	\
 	"	.set "MIPS_ISA_ARCH_LEVEL"			\n"	\
-	"	cache	%0, %1					\n"	\
+	"	" insn("%0", "%1") "				\n"	\
 	"	.set	pop					\n"	\
 	:								\
 	: "i" (op), "R" (*(unsigned char *)(addr)))
 
+#define cache_op(op, addr)						\
+	_cache_op(kernel_cache, op, addr)
+
 static inline void flush_icache_line_indexed(unsigned long addr)
 {
 	cache_op(Index_Invalidate_I, addr);
@@ -193,338 +198,10 @@  static inline void invalidate_tcache_page(unsigned long addr)
 	cache_op(Page_Invalidate_T, addr);
 }
 
-#ifndef CONFIG_CPU_MIPSR6
-#define cache16_unroll32(base,op)					\
-	__asm__ __volatile__(						\
-	"	.set push					\n"	\
-	"	.set noreorder					\n"	\
-	"	.set mips3					\n"	\
-	"	cache %1, 0x000(%0); cache %1, 0x010(%0)	\n"	\
-	"	cache %1, 0x020(%0); cache %1, 0x030(%0)	\n"	\
-	"	cache %1, 0x040(%0); cache %1, 0x050(%0)	\n"	\
-	"	cache %1, 0x060(%0); cache %1, 0x070(%0)	\n"	\
-	"	cache %1, 0x080(%0); cache %1, 0x090(%0)	\n"	\
-	"	cache %1, 0x0a0(%0); cache %1, 0x0b0(%0)	\n"	\
-	"	cache %1, 0x0c0(%0); cache %1, 0x0d0(%0)	\n"	\
-	"	cache %1, 0x0e0(%0); cache %1, 0x0f0(%0)	\n"	\
-	"	cache %1, 0x100(%0); cache %1, 0x110(%0)	\n"	\
-	"	cache %1, 0x120(%0); cache %1, 0x130(%0)	\n"	\
-	"	cache %1, 0x140(%0); cache %1, 0x150(%0)	\n"	\
-	"	cache %1, 0x160(%0); cache %1, 0x170(%0)	\n"	\
-	"	cache %1, 0x180(%0); cache %1, 0x190(%0)	\n"	\
-	"	cache %1, 0x1a0(%0); cache %1, 0x1b0(%0)	\n"	\
-	"	cache %1, 0x1c0(%0); cache %1, 0x1d0(%0)	\n"	\
-	"	cache %1, 0x1e0(%0); cache %1, 0x1f0(%0)	\n"	\
-	"	.set pop					\n"	\
-		:							\
-		: "r" (base),						\
-		  "i" (op));
-
-#define cache32_unroll32(base,op)					\
-	__asm__ __volatile__(						\
-	"	.set push					\n"	\
-	"	.set noreorder					\n"	\
-	"	.set mips3					\n"	\
-	"	cache %1, 0x000(%0); cache %1, 0x020(%0)	\n"	\
-	"	cache %1, 0x040(%0); cache %1, 0x060(%0)	\n"	\
-	"	cache %1, 0x080(%0); cache %1, 0x0a0(%0)	\n"	\
-	"	cache %1, 0x0c0(%0); cache %1, 0x0e0(%0)	\n"	\
-	"	cache %1, 0x100(%0); cache %1, 0x120(%0)	\n"	\
-	"	cache %1, 0x140(%0); cache %1, 0x160(%0)	\n"	\
-	"	cache %1, 0x180(%0); cache %1, 0x1a0(%0)	\n"	\
-	"	cache %1, 0x1c0(%0); cache %1, 0x1e0(%0)	\n"	\
-	"	cache %1, 0x200(%0); cache %1, 0x220(%0)	\n"	\
-	"	cache %1, 0x240(%0); cache %1, 0x260(%0)	\n"	\
-	"	cache %1, 0x280(%0); cache %1, 0x2a0(%0)	\n"	\
-	"	cache %1, 0x2c0(%0); cache %1, 0x2e0(%0)	\n"	\
-	"	cache %1, 0x300(%0); cache %1, 0x320(%0)	\n"	\
-	"	cache %1, 0x340(%0); cache %1, 0x360(%0)	\n"	\
-	"	cache %1, 0x380(%0); cache %1, 0x3a0(%0)	\n"	\
-	"	cache %1, 0x3c0(%0); cache %1, 0x3e0(%0)	\n"	\
-	"	.set pop					\n"	\
-		:							\
-		: "r" (base),						\
-		  "i" (op));
-
-#define cache64_unroll32(base,op)					\
-	__asm__ __volatile__(						\
-	"	.set push					\n"	\
-	"	.set noreorder					\n"	\
-	"	.set mips3					\n"	\
-	"	cache %1, 0x000(%0); cache %1, 0x040(%0)	\n"	\
-	"	cache %1, 0x080(%0); cache %1, 0x0c0(%0)	\n"	\
-	"	cache %1, 0x100(%0); cache %1, 0x140(%0)	\n"	\
-	"	cache %1, 0x180(%0); cache %1, 0x1c0(%0)	\n"	\
-	"	cache %1, 0x200(%0); cache %1, 0x240(%0)	\n"	\
-	"	cache %1, 0x280(%0); cache %1, 0x2c0(%0)	\n"	\
-	"	cache %1, 0x300(%0); cache %1, 0x340(%0)	\n"	\
-	"	cache %1, 0x380(%0); cache %1, 0x3c0(%0)	\n"	\
-	"	cache %1, 0x400(%0); cache %1, 0x440(%0)	\n"	\
-	"	cache %1, 0x480(%0); cache %1, 0x4c0(%0)	\n"	\
-	"	cache %1, 0x500(%0); cache %1, 0x540(%0)	\n"	\
-	"	cache %1, 0x580(%0); cache %1, 0x5c0(%0)	\n"	\
-	"	cache %1, 0x600(%0); cache %1, 0x640(%0)	\n"	\
-	"	cache %1, 0x680(%0); cache %1, 0x6c0(%0)	\n"	\
-	"	cache %1, 0x700(%0); cache %1, 0x740(%0)	\n"	\
-	"	cache %1, 0x780(%0); cache %1, 0x7c0(%0)	\n"	\
-	"	.set pop					\n"	\
-		:							\
-		: "r" (base),						\
-		  "i" (op));
-
-#define cache128_unroll32(base,op)					\
-	__asm__ __volatile__(						\
-	"	.set push					\n"	\
-	"	.set noreorder					\n"	\
-	"	.set mips3					\n"	\
-	"	cache %1, 0x000(%0); cache %1, 0x080(%0)	\n"	\
-	"	cache %1, 0x100(%0); cache %1, 0x180(%0)	\n"	\
-	"	cache %1, 0x200(%0); cache %1, 0x280(%0)	\n"	\
-	"	cache %1, 0x300(%0); cache %1, 0x380(%0)	\n"	\
-	"	cache %1, 0x400(%0); cache %1, 0x480(%0)	\n"	\
-	"	cache %1, 0x500(%0); cache %1, 0x580(%0)	\n"	\
-	"	cache %1, 0x600(%0); cache %1, 0x680(%0)	\n"	\
-	"	cache %1, 0x700(%0); cache %1, 0x780(%0)	\n"	\
-	"	cache %1, 0x800(%0); cache %1, 0x880(%0)	\n"	\
-	"	cache %1, 0x900(%0); cache %1, 0x980(%0)	\n"	\
-	"	cache %1, 0xa00(%0); cache %1, 0xa80(%0)	\n"	\
-	"	cache %1, 0xb00(%0); cache %1, 0xb80(%0)	\n"	\
-	"	cache %1, 0xc00(%0); cache %1, 0xc80(%0)	\n"	\
-	"	cache %1, 0xd00(%0); cache %1, 0xd80(%0)	\n"	\
-	"	cache %1, 0xe00(%0); cache %1, 0xe80(%0)	\n"	\
-	"	cache %1, 0xf00(%0); cache %1, 0xf80(%0)	\n"	\
-	"	.set pop					\n"	\
-		:							\
-		: "r" (base),						\
-		  "i" (op));
-
-#else
-/*
- * MIPS R6 changed the cache opcode and moved to a 8-bit offset field.
- * This means we now need to increment the base register before we flush
- * more cache lines
- */
-#define cache16_unroll32(base,op)				\
-	__asm__ __volatile__(					\
-	"	.set push\n"					\
-	"	.set noreorder\n"				\
-	"	.set mips64r6\n"				\
-	"	.set noat\n"					\
-	"	cache %1, 0x000(%0); cache %1, 0x010(%0)\n"	\
-	"	cache %1, 0x020(%0); cache %1, 0x030(%0)\n"	\
-	"	cache %1, 0x040(%0); cache %1, 0x050(%0)\n"	\
-	"	cache %1, 0x060(%0); cache %1, 0x070(%0)\n"	\
-	"	cache %1, 0x080(%0); cache %1, 0x090(%0)\n"	\
-	"	cache %1, 0x0a0(%0); cache %1, 0x0b0(%0)\n"	\
-	"	cache %1, 0x0c0(%0); cache %1, 0x0d0(%0)\n"	\
-	"	cache %1, 0x0e0(%0); cache %1, 0x0f0(%0)\n"	\
-	"	"__stringify(LONG_ADDIU)" $1, %0, 0x100	\n"	\
-	"	cache %1, 0x000($1); cache %1, 0x010($1)\n"	\
-	"	cache %1, 0x020($1); cache %1, 0x030($1)\n"	\
-	"	cache %1, 0x040($1); cache %1, 0x050($1)\n"	\
-	"	cache %1, 0x060($1); cache %1, 0x070($1)\n"	\
-	"	cache %1, 0x080($1); cache %1, 0x090($1)\n"	\
-	"	cache %1, 0x0a0($1); cache %1, 0x0b0($1)\n"	\
-	"	cache %1, 0x0c0($1); cache %1, 0x0d0($1)\n"	\
-	"	cache %1, 0x0e0($1); cache %1, 0x0f0($1)\n"	\
-	"	.set pop\n"					\
-		:						\
-		: "r" (base),					\
-		  "i" (op));
-
-#define cache32_unroll32(base,op)				\
-	__asm__ __volatile__(					\
-	"	.set push\n"					\
-	"	.set noreorder\n"				\
-	"	.set mips64r6\n"				\
-	"	.set noat\n"					\
-	"	cache %1, 0x000(%0); cache %1, 0x020(%0)\n"	\
-	"	cache %1, 0x040(%0); cache %1, 0x060(%0)\n"	\
-	"	cache %1, 0x080(%0); cache %1, 0x0a0(%0)\n"	\
-	"	cache %1, 0x0c0(%0); cache %1, 0x0e0(%0)\n"	\
-	"	"__stringify(LONG_ADDIU)" $1, %0, 0x100 \n"	\
-	"	cache %1, 0x000($1); cache %1, 0x020($1)\n"	\
-	"	cache %1, 0x040($1); cache %1, 0x060($1)\n"	\
-	"	cache %1, 0x080($1); cache %1, 0x0a0($1)\n"	\
-	"	cache %1, 0x0c0($1); cache %1, 0x0e0($1)\n"	\
-	"	"__stringify(LONG_ADDIU)" $1, $1, 0x100 \n"	\
-	"	cache %1, 0x000($1); cache %1, 0x020($1)\n"	\
-	"	cache %1, 0x040($1); cache %1, 0x060($1)\n"	\
-	"	cache %1, 0x080($1); cache %1, 0x0a0($1)\n"	\
-	"	cache %1, 0x0c0($1); cache %1, 0x0e0($1)\n"	\
-	"	"__stringify(LONG_ADDIU)" $1, $1, 0x100\n"	\
-	"	cache %1, 0x000($1); cache %1, 0x020($1)\n"	\
-	"	cache %1, 0x040($1); cache %1, 0x060($1)\n"	\
-	"	cache %1, 0x080($1); cache %1, 0x0a0($1)\n"	\
-	"	cache %1, 0x0c0($1); cache %1, 0x0e0($1)\n"	\
-	"	.set pop\n"					\
-		:						\
-		: "r" (base),					\
-		  "i" (op));
-
-#define cache64_unroll32(base,op)				\
-	__asm__ __volatile__(					\
-	"	.set push\n"					\
-	"	.set noreorder\n"				\
-	"	.set mips64r6\n"				\
-	"	.set noat\n"					\
-	"	cache %1, 0x000(%0); cache %1, 0x040(%0)\n"	\
-	"	cache %1, 0x080(%0); cache %1, 0x0c0(%0)\n"	\
-	"	"__stringify(LONG_ADDIU)" $1, %0, 0x100 \n"	\
-	"	cache %1, 0x000($1); cache %1, 0x040($1)\n"	\
-	"	cache %1, 0x080($1); cache %1, 0x0c0($1)\n"	\
-	"	"__stringify(LONG_ADDIU)" $1, $1, 0x100 \n"	\
-	"	cache %1, 0x000($1); cache %1, 0x040($1)\n"	\
-	"	cache %1, 0x080($1); cache %1, 0x0c0($1)\n"	\
-	"	"__stringify(LONG_ADDIU)" $1, $1, 0x100 \n"	\
-	"	cache %1, 0x000($1); cache %1, 0x040($1)\n"	\
-	"	cache %1, 0x080($1); cache %1, 0x0c0($1)\n"	\
-	"	"__stringify(LONG_ADDIU)" $1, $1, 0x100 \n"	\
-	"	cache %1, 0x000($1); cache %1, 0x040($1)\n"	\
-	"	cache %1, 0x080($1); cache %1, 0x0c0($1)\n"	\
-	"	"__stringify(LONG_ADDIU)" $1, $1, 0x100 \n"	\
-	"	cache %1, 0x000($1); cache %1, 0x040($1)\n"	\
-	"	cache %1, 0x080($1); cache %1, 0x0c0($1)\n"	\
-	"	"__stringify(LONG_ADDIU)" $1, $1, 0x100 \n"	\
-	"	cache %1, 0x000($1); cache %1, 0x040($1)\n"	\
-	"	cache %1, 0x080($1); cache %1, 0x0c0($1)\n"	\
-	"	"__stringify(LONG_ADDIU)" $1, $1, 0x100 \n"	\
-	"	cache %1, 0x000($1); cache %1, 0x040($1)\n"	\
-	"	cache %1, 0x080($1); cache %1, 0x0c0($1)\n"	\
-	"	.set pop\n"					\
-		:						\
-		: "r" (base),					\
-		  "i" (op));
-
-#define cache128_unroll32(base,op)				\
-	__asm__ __volatile__(					\
-	"	.set push\n"					\
-	"	.set noreorder\n"				\
-	"	.set mips64r6\n"				\
-	"	.set noat\n"					\
-	"	cache %1, 0x000(%0); cache %1, 0x080(%0)\n"	\
-	"	"__stringify(LONG_ADDIU)" $1, %0, 0x100 \n"	\
-	"	cache %1, 0x000($1); cache %1, 0x080($1)\n"	\
-	"	"__stringify(LONG_ADDIU)" $1, $1, 0x100 \n"	\
-	"	cache %1, 0x000($1); cache %1, 0x080($1)\n"	\
-	"	"__stringify(LONG_ADDIU)" $1, $1, 0x100 \n"	\
-	"	cache %1, 0x000($1); cache %1, 0x080($1)\n"	\
-	"	"__stringify(LONG_ADDIU)" $1, $1, 0x100 \n"	\
-	"	cache %1, 0x000($1); cache %1, 0x080($1)\n"	\
-	"	"__stringify(LONG_ADDIU)" $1, $1, 0x100 \n"	\
-	"	cache %1, 0x000($1); cache %1, 0x080($1)\n"	\
-	"	"__stringify(LONG_ADDIU)" $1, $1, 0x100 \n"	\
-	"	cache %1, 0x000($1); cache %1, 0x080($1)\n"	\
-	"	"__stringify(LONG_ADDIU)" $1, $1, 0x100 \n"	\
-	"	cache %1, 0x000($1); cache %1, 0x080($1)\n"	\
-	"	"__stringify(LONG_ADDIU)" $1, $1, 0x100 \n"	\
-	"	cache %1, 0x000($1); cache %1, 0x080($1)\n"	\
-	"	"__stringify(LONG_ADDIU)" $1, $1, 0x100 \n"	\
-	"	cache %1, 0x000($1); cache %1, 0x080($1)\n"	\
-	"	"__stringify(LONG_ADDIU)" $1, $1, 0x100 \n"	\
-	"	cache %1, 0x000($1); cache %1, 0x080($1)\n"	\
-	"	"__stringify(LONG_ADDIU)" $1, $1, 0x100 \n"	\
-	"	cache %1, 0x000($1); cache %1, 0x080($1)\n"	\
-	"	"__stringify(LONG_ADDIU)" $1, $1, 0x100 \n"	\
-	"	cache %1, 0x000($1); cache %1, 0x080($1)\n"	\
-	"	"__stringify(LONG_ADDIU)" $1, $1, 0x100 \n"	\
-	"	cache %1, 0x000($1); cache %1, 0x080($1)\n"	\
-	"	"__stringify(LONG_ADDIU)" $1, $1, 0x100 \n"	\
-	"	cache %1, 0x000($1); cache %1, 0x080($1)\n"	\
-	"	"__stringify(LONG_ADDIU)" $1, $1, 0x100 \n"	\
-	"	cache %1, 0x000($1); cache %1, 0x080($1)\n"	\
-	"	"__stringify(LONG_ADDIU)" $1, $1, 0x100 \n"	\
-	"	cache %1, 0x000($1); cache %1, 0x080($1)\n"	\
-	"	.set pop\n"					\
-		:						\
-		: "r" (base),					\
-		  "i" (op));
-#endif /* CONFIG_CPU_MIPSR6 */
-
-/*
- * Perform the cache operation specified by op using a user mode virtual
- * address while in kernel mode.
- */
-#define cache16_unroll32_user(base,op)					\
-	__asm__ __volatile__(						\
-	"	.set push					\n"	\
-	"	.set noreorder					\n"	\
-	"	.set mips0					\n"	\
-	"	.set eva					\n"	\
-	"	cachee %1, 0x000(%0); cachee %1, 0x010(%0)	\n"	\
-	"	cachee %1, 0x020(%0); cachee %1, 0x030(%0)	\n"	\
-	"	cachee %1, 0x040(%0); cachee %1, 0x050(%0)	\n"	\
-	"	cachee %1, 0x060(%0); cachee %1, 0x070(%0)	\n"	\
-	"	cachee %1, 0x080(%0); cachee %1, 0x090(%0)	\n"	\
-	"	cachee %1, 0x0a0(%0); cachee %1, 0x0b0(%0)	\n"	\
-	"	cachee %1, 0x0c0(%0); cachee %1, 0x0d0(%0)	\n"	\
-	"	cachee %1, 0x0e0(%0); cachee %1, 0x0f0(%0)	\n"	\
-	"	cachee %1, 0x100(%0); cachee %1, 0x110(%0)	\n"	\
-	"	cachee %1, 0x120(%0); cachee %1, 0x130(%0)	\n"	\
-	"	cachee %1, 0x140(%0); cachee %1, 0x150(%0)	\n"	\
-	"	cachee %1, 0x160(%0); cachee %1, 0x170(%0)	\n"	\
-	"	cachee %1, 0x180(%0); cachee %1, 0x190(%0)	\n"	\
-	"	cachee %1, 0x1a0(%0); cachee %1, 0x1b0(%0)	\n"	\
-	"	cachee %1, 0x1c0(%0); cachee %1, 0x1d0(%0)	\n"	\
-	"	cachee %1, 0x1e0(%0); cachee %1, 0x1f0(%0)	\n"	\
-	"	.set pop					\n"	\
-		:							\
-		: "r" (base),						\
-		  "i" (op));
-
-#define cache32_unroll32_user(base, op)					\
-	__asm__ __volatile__(						\
-	"	.set push					\n"	\
-	"	.set noreorder					\n"	\
-	"	.set mips0					\n"	\
-	"	.set eva					\n"	\
-	"	cachee %1, 0x000(%0); cachee %1, 0x020(%0)	\n"	\
-	"	cachee %1, 0x040(%0); cachee %1, 0x060(%0)	\n"	\
-	"	cachee %1, 0x080(%0); cachee %1, 0x0a0(%0)	\n"	\
-	"	cachee %1, 0x0c0(%0); cachee %1, 0x0e0(%0)	\n"	\
-	"	cachee %1, 0x100(%0); cachee %1, 0x120(%0)	\n"	\
-	"	cachee %1, 0x140(%0); cachee %1, 0x160(%0)	\n"	\
-	"	cachee %1, 0x180(%0); cachee %1, 0x1a0(%0)	\n"	\
-	"	cachee %1, 0x1c0(%0); cachee %1, 0x1e0(%0)	\n"	\
-	"	cachee %1, 0x200(%0); cachee %1, 0x220(%0)	\n"	\
-	"	cachee %1, 0x240(%0); cachee %1, 0x260(%0)	\n"	\
-	"	cachee %1, 0x280(%0); cachee %1, 0x2a0(%0)	\n"	\
-	"	cachee %1, 0x2c0(%0); cachee %1, 0x2e0(%0)	\n"	\
-	"	cachee %1, 0x300(%0); cachee %1, 0x320(%0)	\n"	\
-	"	cachee %1, 0x340(%0); cachee %1, 0x360(%0)	\n"	\
-	"	cachee %1, 0x380(%0); cachee %1, 0x3a0(%0)	\n"	\
-	"	cachee %1, 0x3c0(%0); cachee %1, 0x3e0(%0)	\n"	\
-	"	.set pop					\n"	\
-		:							\
-		: "r" (base),						\
-		  "i" (op));
-
-#define cache64_unroll32_user(base, op)					\
-	__asm__ __volatile__(						\
-	"	.set push					\n"	\
-	"	.set noreorder					\n"	\
-	"	.set mips0					\n"	\
-	"	.set eva					\n"	\
-	"	cachee %1, 0x000(%0); cachee %1, 0x040(%0)	\n"	\
-	"	cachee %1, 0x080(%0); cachee %1, 0x0c0(%0)	\n"	\
-	"	cachee %1, 0x100(%0); cachee %1, 0x140(%0)	\n"	\
-	"	cachee %1, 0x180(%0); cachee %1, 0x1c0(%0)	\n"	\
-	"	cachee %1, 0x200(%0); cachee %1, 0x240(%0)	\n"	\
-	"	cachee %1, 0x280(%0); cachee %1, 0x2c0(%0)	\n"	\
-	"	cachee %1, 0x300(%0); cachee %1, 0x340(%0)	\n"	\
-	"	cachee %1, 0x380(%0); cachee %1, 0x3c0(%0)	\n"	\
-	"	cachee %1, 0x400(%0); cachee %1, 0x440(%0)	\n"	\
-	"	cachee %1, 0x480(%0); cachee %1, 0x4c0(%0)	\n"	\
-	"	cachee %1, 0x500(%0); cachee %1, 0x540(%0)	\n"	\
-	"	cachee %1, 0x580(%0); cachee %1, 0x5c0(%0)	\n"	\
-	"	cachee %1, 0x600(%0); cachee %1, 0x640(%0)	\n"	\
-	"	cachee %1, 0x680(%0); cachee %1, 0x6c0(%0)	\n"	\
-	"	cachee %1, 0x700(%0); cachee %1, 0x740(%0)	\n"	\
-	"	cachee %1, 0x780(%0); cachee %1, 0x7c0(%0)	\n"	\
-	"	.set pop					\n"	\
-		:							\
-		: "r" (base),						\
-		  "i" (op));
+#define cache_unroll(times, insn, op, addr, lsize) do {			\
+	int i = 0;							\
+	unroll(times, _cache_op, insn, op, (addr) + (i++ * (lsize)));	\
+} while (0)
 
 /* build blast_xxx, blast_xxx_page, blast_xxx_page_indexed */
 #define __BUILD_BLAST_CACHE(pfx, desc, indexop, hitop, lsize, extra)	\
@@ -539,7 +216,8 @@  static inline void extra##blast_##pfx##cache##lsize(void)		\
 									\
 	for (ws = 0; ws < ws_end; ws += ws_inc)				\
 		for (addr = start; addr < end; addr += lsize * 32)	\
-			cache##lsize##_unroll32(addr|ws, indexop);	\
+			cache_unroll(32, kernel_cache, indexop,		\
+				     addr | ws, lsize);			\
 }									\
 									\
 static inline void extra##blast_##pfx##cache##lsize##_page(unsigned long page) \
@@ -548,7 +226,7 @@  static inline void extra##blast_##pfx##cache##lsize##_page(unsigned long page) \
 	unsigned long end = page + PAGE_SIZE;				\
 									\
 	do {								\
-		cache##lsize##_unroll32(start, hitop);			\
+		cache_unroll(32, kernel_cache, hitop, start, lsize);	\
 		start += lsize * 32;					\
 	} while (start < end);						\
 }									\
@@ -565,7 +243,8 @@  static inline void extra##blast_##pfx##cache##lsize##_page_indexed(unsigned long
 									\
 	for (ws = 0; ws < ws_end; ws += ws_inc)				\
 		for (addr = start; addr < end; addr += lsize * 32)	\
-			cache##lsize##_unroll32(addr|ws, indexop);	\
+			cache_unroll(32, kernel_cache, indexop,		\
+				     addr | ws, lsize);			\
 }
 
 __BUILD_BLAST_CACHE(d, dcache, Index_Writeback_Inv_D, Hit_Writeback_Inv_D, 16, )
@@ -596,7 +275,7 @@  static inline void blast_##pfx##cache##lsize##_user_page(unsigned long page) \
 	unsigned long end = page + PAGE_SIZE;				\
 									\
 	do {								\
-		cache##lsize##_unroll32_user(start, hitop);             \
+		cache_unroll(32, user_cache, hitop, start, lsize);	\
 		start += lsize * 32;					\
 	} while (start < end);						\
 }
@@ -688,7 +367,8 @@  static inline void blast_##pfx##cache##lsize##_node(long node)		\
 									\
 	for (ws = 0; ws < ws_end; ws += ws_inc)				\
 		for (addr = start; addr < end; addr += lsize * 32)	\
-			cache##lsize##_unroll32(addr|ws, indexop);	\
+			cache_unroll(32, kernel_cache, indexop,		\
+				     addr | ws, lsize);			\
 }
 
 __BUILD_BLAST_CACHE_NODE(s, scache, Index_Writeback_Inv_SD, Hit_Writeback_Inv_SD, 16)
diff --git a/arch/mips/include/asm/unroll.h b/arch/mips/include/asm/unroll.h
new file mode 100644
index 000000000000..df1cdcfc5a47
--- /dev/null
+++ b/arch/mips/include/asm/unroll.h
@@ -0,0 +1,76 @@ 
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __ASM_UNROLL_H__
+#define __ASM_UNROLL_H__
+
+/*
+ * Explicitly unroll a loop, for use in cases where doing so is performance
+ * critical.
+ *
+ * Ideally we'd rely upon the compiler to provide this but there's no commonly
+ * available means to do so. For example GCC's "#pragma GCC unroll"
+ * functionality would be ideal but is only available from GCC 8 onwards. Using
+ * -funroll-loops is an option but GCC tends to make poor choices when
+ * compiling our string functions. -funroll-all-loops leads to massive code
+ * bloat, even if only applied to the string functions.
+ */
+#define unroll(times, fn, ...) do {				\
+	extern void bad_unroll(void)				\
+		__compiletime_error("Unsupported unroll");	\
+								\
+	/*							\
+	 * We can't unroll if the number of iterations isn't	\
+	 * compile-time constant. Unfortunately GCC versions	\
+	 * up until 4.6 tend to miss obvious constants & cause	\
+	 * this check to fail, even though they go on to	\
+	 * generate reasonable code for the switch statement,	\
+	 * so we skip the sanity check for those compilers.	\
+	 */							\
+	BUILD_BUG_ON(GCC_VERSION >= 40700 &&			\
+		     !__builtin_constant_p(times));		\
+								\
+	switch (times) {					\
+	case 32: fn(__VA_ARGS__); /* fall through */		\
+	case 31: fn(__VA_ARGS__); /* fall through */		\
+	case 30: fn(__VA_ARGS__); /* fall through */		\
+	case 29: fn(__VA_ARGS__); /* fall through */		\
+	case 28: fn(__VA_ARGS__); /* fall through */		\
+	case 27: fn(__VA_ARGS__); /* fall through */		\
+	case 26: fn(__VA_ARGS__); /* fall through */		\
+	case 25: fn(__VA_ARGS__); /* fall through */		\
+	case 24: fn(__VA_ARGS__); /* fall through */		\
+	case 23: fn(__VA_ARGS__); /* fall through */		\
+	case 22: fn(__VA_ARGS__); /* fall through */		\
+	case 21: fn(__VA_ARGS__); /* fall through */		\
+	case 20: fn(__VA_ARGS__); /* fall through */		\
+	case 19: fn(__VA_ARGS__); /* fall through */		\
+	case 18: fn(__VA_ARGS__); /* fall through */		\
+	case 17: fn(__VA_ARGS__); /* fall through */		\
+	case 16: fn(__VA_ARGS__); /* fall through */		\
+	case 15: fn(__VA_ARGS__); /* fall through */		\
+	case 14: fn(__VA_ARGS__); /* fall through */		\
+	case 13: fn(__VA_ARGS__); /* fall through */		\
+	case 12: fn(__VA_ARGS__); /* fall through */		\
+	case 11: fn(__VA_ARGS__); /* fall through */		\
+	case 10: fn(__VA_ARGS__); /* fall through */		\
+	case 9: fn(__VA_ARGS__); /* fall through */		\
+	case 8: fn(__VA_ARGS__); /* fall through */		\
+	case 7: fn(__VA_ARGS__); /* fall through */		\
+	case 6: fn(__VA_ARGS__); /* fall through */		\
+	case 5: fn(__VA_ARGS__); /* fall through */		\
+	case 4: fn(__VA_ARGS__); /* fall through */		\
+	case 3: fn(__VA_ARGS__); /* fall through */		\
+	case 2: fn(__VA_ARGS__); /* fall through */		\
+	case 1: fn(__VA_ARGS__); /* fall through */		\
+	case 0: break;						\
+								\
+	default:						\
+		/*						\
+		 * Either the iteration count is unreasonable	\
+		 * or we need to add more cases above.		\
+		 */						\
+		bad_unroll();					\
+		break;						\
+	}							\
+} while (0)
+
+#endif /* __ASM_UNROLL_H__ */
diff --git a/arch/mips/mm/c-r4k.c b/arch/mips/mm/c-r4k.c
index 4bf990633135..378cbb02dcdd 100644
--- a/arch/mips/mm/c-r4k.c
+++ b/arch/mips/mm/c-r4k.c
@@ -271,12 +271,14 @@  static inline void tx49_blast_icache32(void)
 	/* I'm in even chunk.  blast odd chunks */
 	for (ws = 0; ws < ws_end; ws += ws_inc)
 		for (addr = start + 0x400; addr < end; addr += 0x400 * 2)
-			cache32_unroll32(addr|ws, Index_Invalidate_I);
+			cache_unroll(32, kernel_cache, Index_Invalidate_I,
+				     addr | ws, 32);
 	CACHE32_UNROLL32_ALIGN;
 	/* I'm in odd chunk.  blast even chunks */
 	for (ws = 0; ws < ws_end; ws += ws_inc)
 		for (addr = start; addr < end; addr += 0x400 * 2)
-			cache32_unroll32(addr|ws, Index_Invalidate_I);
+			cache_unroll(32, kernel_cache, Index_Invalidate_I,
+				     addr | ws, 32);
 }
 
 static inline void blast_icache32_r4600_v1_page_indexed(unsigned long page)
@@ -302,12 +304,14 @@  static inline void tx49_blast_icache32_page_indexed(unsigned long page)
 	/* I'm in even chunk.  blast odd chunks */
 	for (ws = 0; ws < ws_end; ws += ws_inc)
 		for (addr = start + 0x400; addr < end; addr += 0x400 * 2)
-			cache32_unroll32(addr|ws, Index_Invalidate_I);
+			cache_unroll(32, kernel_cache, Index_Invalidate_I,
+				     addr | ws, 32);
 	CACHE32_UNROLL32_ALIGN;
 	/* I'm in odd chunk.  blast even chunks */
 	for (ws = 0; ws < ws_end; ws += ws_inc)
 		for (addr = start; addr < end; addr += 0x400 * 2)
-			cache32_unroll32(addr|ws, Index_Invalidate_I);
+			cache_unroll(32, kernel_cache, Index_Invalidate_I,
+				     addr | ws, 32);
 }
 
 static void (* r4k_blast_icache_page)(unsigned long addr);