diff mbox series

[v4,02/12] riscv: Add vector extension XOR implementation

Message ID 20230711153743.1970625-3-heiko@sntech.de (mailing list archive)
State Superseded
Headers show
Series RISC-V: support some cryptography accelerations | expand

Checks

Context Check Description
conchuod/tree_selection fail Failed to apply to next/pending-fixes, riscv/for-next or riscv/master

Commit Message

Heiko Stübner July 11, 2023, 3:37 p.m. UTC
From: Greentime Hu <greentime.hu@sifive.com>

This patch adds support for vector optimized XOR and it is tested in
qemu.

Co-developed-by: Han-Kuan Chen <hankuan.chen@sifive.com>
Signed-off-by: Han-Kuan Chen <hankuan.chen@sifive.com>
Signed-off-by: Greentime Hu <greentime.hu@sifive.com>
Signed-off-by: Heiko Stuebner <heiko.stuebner@vrull.eu>
---
 arch/riscv/include/asm/xor.h | 82 ++++++++++++++++++++++++++++++++++++
 arch/riscv/lib/Makefile      |  1 +
 arch/riscv/lib/xor.S         | 81 +++++++++++++++++++++++++++++++++++
 3 files changed, 164 insertions(+)
 create mode 100644 arch/riscv/include/asm/xor.h
 create mode 100644 arch/riscv/lib/xor.S

Comments

Rémi Denis-Courmont July 11, 2023, 5:33 p.m. UTC | #1
Le tiistaina 11. heinäkuuta 2023, 18.37.33 EEST Heiko Stuebner a écrit :
> diff --git a/arch/riscv/lib/xor.S b/arch/riscv/lib/xor.S
> new file mode 100644
> index 000000000000..3bc059e18171
> --- /dev/null
> +++ b/arch/riscv/lib/xor.S
> @@ -0,0 +1,81 @@
> +/* SPDX-License-Identifier: GPL-2.0-or-later */
> +/*
> + * Copyright (C) 2021 SiFive
> + */
> +#include <linux/linkage.h>
> +#include <asm-generic/export.h>
> +#include <asm/asm.h>
> +
> +ENTRY(xor_regs_2_)
> +	vsetvli a3, a0, e8, m8, ta, ma

AFAICT, so far, Linux only uses `vsetvli` to save/restore/flush vectors, and 
that's, of course, with LMUL=8, so that's not really telling much anything. 
This function could be the first actual vector optimisation in kernel if/when 
it gets merged.

Should the same group multiplier be used for "actual" vector loops throughout 
the kernel? I've seen conflicting advises or opinions here. Should kernel code 
always use the maximum possible LMUL, depending on register pressure of the 
loop? Or will that just increase latency with no bandwidth gains compared to, 
say, LMUL=1 or LMUL=2?

> +	vle8.v v0, (a1)
> +	vle8.v v8, (a2)
> +	sub a0, a0, a3
> +	vxor.vv v16, v0, v8
> +	add a2, a2, a3
> +	vse8.v v16, (a1)
> +	add a1, a1, a3
> +	bnez a0, xor_regs_2_
> +	ret
> +END(xor_regs_2_)
> +EXPORT_SYMBOL(xor_regs_2_)
> +
> +ENTRY(xor_regs_3_)
> +	vsetvli a4, a0, e8, m8, ta, ma
> +	vle8.v v0, (a1)
> +	vle8.v v8, (a2)
> +	sub a0, a0, a4
> +	vxor.vv v0, v0, v8
> +	vle8.v v16, (a3)
> +	add a2, a2, a4
> +	vxor.vv v16, v0, v16
> +	add a3, a3, a4
> +	vse8.v v16, (a1)
> +	add a1, a1, a4
> +	bnez a0, xor_regs_3_
> +	ret
> +END(xor_regs_3_)
> +EXPORT_SYMBOL(xor_regs_3_)
> +
> +ENTRY(xor_regs_4_)
> +	vsetvli a5, a0, e8, m8, ta, ma
> +	vle8.v v0, (a1)
> +	vle8.v v8, (a2)
> +	sub a0, a0, a5
> +	vxor.vv v0, v0, v8
> +	vle8.v v16, (a3)
> +	add a2, a2, a5
> +	vxor.vv v0, v0, v16
> +	vle8.v v24, (a4)
> +	add a3, a3, a5
> +	vxor.vv v16, v0, v24
> +	add a4, a4, a5
> +	vse8.v v16, (a1)
> +	add a1, a1, a5
> +	bnez a0, xor_regs_4_
> +	ret
> +END(xor_regs_4_)
> +EXPORT_SYMBOL(xor_regs_4_)
> +
> +ENTRY(xor_regs_5_)
> +	vsetvli a6, a0, e8, m8, ta, ma
> +	vle8.v v0, (a1)
> +	vle8.v v8, (a2)
> +	sub a0, a0, a6
> +	vxor.vv v0, v0, v8
> +	vle8.v v16, (a3)
> +	add a2, a2, a6
> +	vxor.vv v0, v0, v16
> +	vle8.v v24, (a4)
> +	add a3, a3, a6
> +	vxor.vv v0, v0, v24
> +	vle8.v v8, (a5)
> +	add a4, a4, a6
> +	vxor.vv v16, v0, v8
> +	add a5, a5, a6
> +	vse8.v v16, (a1)
> +	add a1, a1, a6
> +	bnez a0, xor_regs_5_
> +	ret
> +END(xor_regs_5_)
> +EXPORT_SYMBOL(xor_regs_5_)
diff mbox series

Patch

diff --git a/arch/riscv/include/asm/xor.h b/arch/riscv/include/asm/xor.h
new file mode 100644
index 000000000000..74867c7fd955
--- /dev/null
+++ b/arch/riscv/include/asm/xor.h
@@ -0,0 +1,82 @@ 
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2021 SiFive
+ */
+
+#include <linux/hardirq.h>
+#include <asm-generic/xor.h>
+#ifdef CONFIG_VECTOR
+#include <asm/vector.h>
+#include <asm/switch_to.h>
+
+void xor_regs_2_(unsigned long bytes, unsigned long *__restrict p1,
+		 const unsigned long *__restrict p2);
+void xor_regs_3_(unsigned long bytes, unsigned long *__restrict p1,
+		 const unsigned long *__restrict p2,
+		 const unsigned long *__restrict p3);
+void xor_regs_4_(unsigned long bytes, unsigned long *__restrict p1,
+		 const unsigned long *__restrict p2,
+		 const unsigned long *__restrict p3,
+		 const unsigned long *__restrict p4);
+void xor_regs_5_(unsigned long bytes, unsigned long *__restrict p1,
+		 const unsigned long *__restrict p2,
+		 const unsigned long *__restrict p3,
+		 const unsigned long *__restrict p4,
+		 const unsigned long *__restrict p5);
+
+static void xor_rvv_2(unsigned long bytes, unsigned long *__restrict p1,
+		      const unsigned long *__restrict p2)
+{
+	kernel_rvv_begin();
+	xor_regs_2_(bytes, p1, p2);
+	kernel_rvv_end();
+}
+
+static void xor_rvv_3(unsigned long bytes, unsigned long *__restrict p1,
+		      const unsigned long *__restrict p2,
+		      const unsigned long *__restrict p3)
+{
+	kernel_rvv_begin();
+	xor_regs_3_(bytes, p1, p2, p3);
+	kernel_rvv_end();
+}
+
+static void xor_rvv_4(unsigned long bytes, unsigned long *__restrict p1,
+		      const unsigned long *__restrict p2,
+		      const unsigned long *__restrict p3,
+		      const unsigned long *__restrict p4)
+{
+	kernel_rvv_begin();
+	xor_regs_4_(bytes, p1, p2, p3, p4);
+	kernel_rvv_end();
+}
+
+static void xor_rvv_5(unsigned long bytes, unsigned long *__restrict p1,
+		      const unsigned long *__restrict p2,
+		      const unsigned long *__restrict p3,
+		      const unsigned long *__restrict p4,
+		      const unsigned long *__restrict p5)
+{
+	kernel_rvv_begin();
+	xor_regs_5_(bytes, p1, p2, p3, p4, p5);
+	kernel_rvv_end();
+}
+
+static struct xor_block_template xor_block_rvv = {
+	.name = "rvv",
+	.do_2 = xor_rvv_2,
+	.do_3 = xor_rvv_3,
+	.do_4 = xor_rvv_4,
+	.do_5 = xor_rvv_5
+};
+
+#undef XOR_TRY_TEMPLATES
+#define XOR_TRY_TEMPLATES           \
+	do {        \
+		xor_speed(&xor_block_8regs);    \
+		xor_speed(&xor_block_32regs);    \
+		if (has_vector()) { \
+			xor_speed(&xor_block_rvv);\
+		} \
+	} while (0)
+#endif
diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile
index 26cb2502ecf8..3164112680f1 100644
--- a/arch/riscv/lib/Makefile
+++ b/arch/riscv/lib/Makefile
@@ -11,3 +11,4 @@  lib-$(CONFIG_64BIT)	+= tishift.o
 lib-$(CONFIG_RISCV_ISA_ZICBOZ)	+= clear_page.o
 
 obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
+lib-$(CONFIG_VECTOR)	+= xor.o
diff --git a/arch/riscv/lib/xor.S b/arch/riscv/lib/xor.S
new file mode 100644
index 000000000000..3bc059e18171
--- /dev/null
+++ b/arch/riscv/lib/xor.S
@@ -0,0 +1,81 @@ 
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2021 SiFive
+ */
+#include <linux/linkage.h>
+#include <asm-generic/export.h>
+#include <asm/asm.h>
+
+ENTRY(xor_regs_2_)
+	vsetvli a3, a0, e8, m8, ta, ma
+	vle8.v v0, (a1)
+	vle8.v v8, (a2)
+	sub a0, a0, a3
+	vxor.vv v16, v0, v8
+	add a2, a2, a3
+	vse8.v v16, (a1)
+	add a1, a1, a3
+	bnez a0, xor_regs_2_
+	ret
+END(xor_regs_2_)
+EXPORT_SYMBOL(xor_regs_2_)
+
+ENTRY(xor_regs_3_)
+	vsetvli a4, a0, e8, m8, ta, ma
+	vle8.v v0, (a1)
+	vle8.v v8, (a2)
+	sub a0, a0, a4
+	vxor.vv v0, v0, v8
+	vle8.v v16, (a3)
+	add a2, a2, a4
+	vxor.vv v16, v0, v16
+	add a3, a3, a4
+	vse8.v v16, (a1)
+	add a1, a1, a4
+	bnez a0, xor_regs_3_
+	ret
+END(xor_regs_3_)
+EXPORT_SYMBOL(xor_regs_3_)
+
+ENTRY(xor_regs_4_)
+	vsetvli a5, a0, e8, m8, ta, ma
+	vle8.v v0, (a1)
+	vle8.v v8, (a2)
+	sub a0, a0, a5
+	vxor.vv v0, v0, v8
+	vle8.v v16, (a3)
+	add a2, a2, a5
+	vxor.vv v0, v0, v16
+	vle8.v v24, (a4)
+	add a3, a3, a5
+	vxor.vv v16, v0, v24
+	add a4, a4, a5
+	vse8.v v16, (a1)
+	add a1, a1, a5
+	bnez a0, xor_regs_4_
+	ret
+END(xor_regs_4_)
+EXPORT_SYMBOL(xor_regs_4_)
+
+ENTRY(xor_regs_5_)
+	vsetvli a6, a0, e8, m8, ta, ma
+	vle8.v v0, (a1)
+	vle8.v v8, (a2)
+	sub a0, a0, a6
+	vxor.vv v0, v0, v8
+	vle8.v v16, (a3)
+	add a2, a2, a6
+	vxor.vv v0, v0, v16
+	vle8.v v24, (a4)
+	add a3, a3, a6
+	vxor.vv v0, v0, v24
+	vle8.v v8, (a5)
+	add a4, a4, a6
+	vxor.vv v16, v0, v8
+	add a5, a5, a6
+	vse8.v v16, (a1)
+	add a1, a1, a6
+	bnez a0, xor_regs_5_
+	ret
+END(xor_regs_5_)
+EXPORT_SYMBOL(xor_regs_5_)