
[V2] raid6: Add RISC-V SIMD syndrome and recovery calculations

Message ID 20250127061529.2437012-1-zhangchunyan@iscas.ac.cn (mailing list archive)
State New
Series [V2] raid6: Add RISC-V SIMD syndrome and recovery calculations

Commit Message

Chunyan Zhang Jan. 27, 2025, 6:15 a.m. UTC
The assembly is originally based on the ARM NEON and int.uc, but uses
RISC-V vector instructions to implement the RAID6 syndrome and
recovery calculations.

Results on QEMU running with the option "-icount shift=0":

  raid6: rvvx1    gen()  1008 MB/s
  raid6: rvvx2    gen()  1395 MB/s
  raid6: rvvx4    gen()  1584 MB/s
  raid6: rvvx8    gen()  1694 MB/s
  raid6: int64x8  gen()   113 MB/s
  raid6: int64x4  gen()   116 MB/s
  raid6: int64x2  gen()   272 MB/s
  raid6: int64x1  gen()   229 MB/s
  raid6: using algorithm rvvx8 gen() 1694 MB/s
  raid6: .... xor() 1000 MB/s, rmw enabled
  raid6: using rvv recovery algorithm

[Charlie: - Fixup vector options]
Signed-off-by: Charlie Jenkins <charlie@rivosinc.com>
Signed-off-by: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
---
V2:
- Add raid6_rvvx8;
- Address the vector options issue;
- Add .valid callback to raid6_rvv and raid6_recov_rvv;
- Removed unneeded check of crypto_simd_usable();

RFC: https://lore.kernel.org/lkml/20241220114023.667347-1-zhangchunyan@iscas.ac.cn/
---
 include/linux/raid/pq.h |    5 +
 lib/raid6/Makefile      |    1 +
 lib/raid6/algos.c       |    9 +
 lib/raid6/recov_rvv.c   |  234 ++++++++
 lib/raid6/rvv.c         | 1269 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 1518 insertions(+)
 create mode 100644 lib/raid6/recov_rvv.c
 create mode 100644 lib/raid6/rvv.c

Comments

Paul Menzel Jan. 27, 2025, 8:39 a.m. UTC | #1
Dear Chunyan,


Thank you for the patch.


On 27.01.25 07:15, Chunyan Zhang wrote:
> The assembly is originally based on the ARM NEON and int.uc, but uses
> RISC-V vector instructions to implement the RAID6 syndrome and
> recovery calculations.
> 
> Results on QEMU running with the option "-icount shift=0":
> 
>    raid6: rvvx1    gen()  1008 MB/s
>    raid6: rvvx2    gen()  1395 MB/s
>    raid6: rvvx4    gen()  1584 MB/s
>    raid6: rvvx8    gen()  1694 MB/s
>    raid6: int64x8  gen()   113 MB/s
>    raid6: int64x4  gen()   116 MB/s
>    raid6: int64x2  gen()   272 MB/s
>    raid6: int64x1  gen()   229 MB/s
>    raid6: using algorithm rvvx8 gen() 1694 MB/s
>    raid6: .... xor() 1000 MB/s, rmw enabled
>    raid6: using rvv recovery algorithm

How did you start QEMU and on what host did you run it? Does it change 
between runs? (For me these benchmark values were very unreliable in the 
past on x86 hardware.)

> [Charlie: - Fixup vector options]
> Signed-off-by: Charlie Jenkins <charlie@rivosinc.com>
> Signed-off-by: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
> ---
> V2:
> - Add raid6_rvvx8;
> - Address the vector options issue;
> - Add .valid callback to raid6_rvv and raid6_recov_rvv;
> - Removed unneeded check of crypto_simd_usable();
> 
> RFC: https://lore.kernel.org/lkml/20241220114023.667347-1-zhangchunyan@iscas.ac.cn/
> ---
>   include/linux/raid/pq.h |    5 +
>   lib/raid6/Makefile      |    1 +
>   lib/raid6/algos.c       |    9 +
>   lib/raid6/recov_rvv.c   |  234 ++++++++
>   lib/raid6/rvv.c         | 1269 +++++++++++++++++++++++++++++++++++++++
>   5 files changed, 1518 insertions(+)
>   create mode 100644 lib/raid6/recov_rvv.c
>   create mode 100644 lib/raid6/rvv.c
> 
> diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h
> index 98030accf641..72ff44cca864 100644
> --- a/include/linux/raid/pq.h
> +++ b/include/linux/raid/pq.h
> @@ -108,6 +108,10 @@ extern const struct raid6_calls raid6_vpermxor4;
>   extern const struct raid6_calls raid6_vpermxor8;
>   extern const struct raid6_calls raid6_lsx;
>   extern const struct raid6_calls raid6_lasx;
> +extern const struct raid6_calls raid6_rvvx1;
> +extern const struct raid6_calls raid6_rvvx2;
> +extern const struct raid6_calls raid6_rvvx4;
> +extern const struct raid6_calls raid6_rvvx8;
>   
>   struct raid6_recov_calls {
>   	void (*data2)(int, size_t, int, int, void **);
> @@ -125,6 +129,7 @@ extern const struct raid6_recov_calls raid6_recov_s390xc;
>   extern const struct raid6_recov_calls raid6_recov_neon;
>   extern const struct raid6_recov_calls raid6_recov_lsx;
>   extern const struct raid6_recov_calls raid6_recov_lasx;
> +extern const struct raid6_recov_calls raid6_recov_rvv;
>   
>   extern const struct raid6_calls raid6_neonx1;
>   extern const struct raid6_calls raid6_neonx2;
> diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
> index 29127dd05d63..5be0a4e60ab1 100644
> --- a/lib/raid6/Makefile
> +++ b/lib/raid6/Makefile
> @@ -10,6 +10,7 @@ raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o \
>   raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o
>   raid6_pq-$(CONFIG_S390) += s390vx8.o recov_s390xc.o
>   raid6_pq-$(CONFIG_LOONGARCH) += loongarch_simd.o recov_loongarch_simd.o
> +raid6_pq-$(CONFIG_RISCV_ISA_V) += rvv.o recov_rvv.o
>   
>   hostprogs	+= mktables
>   
> diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
> index cd2e88ee1f14..99980ff5b985 100644
> --- a/lib/raid6/algos.c
> +++ b/lib/raid6/algos.c
> @@ -80,6 +80,12 @@ const struct raid6_calls * const raid6_algos[] = {
>   #ifdef CONFIG_CPU_HAS_LSX
>   	&raid6_lsx,
>   #endif
> +#endif
> +#ifdef CONFIG_RISCV_ISA_V
> +	&raid6_rvvx1,
> +	&raid6_rvvx2,
> +	&raid6_rvvx4,
> +	&raid6_rvvx8,
>   #endif
>   	&raid6_intx8,
>   	&raid6_intx4,
> @@ -115,6 +121,9 @@ const struct raid6_recov_calls *const raid6_recov_algos[] = {
>   #ifdef CONFIG_CPU_HAS_LSX
>   	&raid6_recov_lsx,
>   #endif
> +#endif
> +#ifdef CONFIG_RISCV_ISA_V
> +	&raid6_recov_rvv,
>   #endif
>   	&raid6_recov_intx1,
>   	NULL
> diff --git a/lib/raid6/recov_rvv.c b/lib/raid6/recov_rvv.c
> new file mode 100644
> index 000000000000..db271d2987c6
> --- /dev/null
> +++ b/lib/raid6/recov_rvv.c
> @@ -0,0 +1,234 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright 2024 Institute of Software, CAS.

Remove the dot/period at the end?

> + * Author: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
> + */
> +
> +#include <asm/simd.h>
> +#include <asm/vector.h>
> +#include <crypto/internal/simd.h>
> +#include <linux/raid/pq.h>
> +
> +static int rvv_has_vector(void)
> +{
> +	return has_vector();
> +}
> +
> +static void __raid6_2data_recov_rvv(int bytes, u8 *p, u8 *q, u8 *dp,
> +			      u8 *dq, const u8 *pbmul,
> +			      const u8 *qmul)
> +{
> +	asm volatile (
> +		".option	push\n"
> +		".option	arch,+v\n"
> +		"vsetvli	x0, %[avl], e8, m1, ta, ma\n"
> +		".option	pop\n"
> +		: :
> +		[avl]"r"(16)
> +	);
> +
> +	/*
> +	 * while ( bytes-- ) {
> +	 *	uint8_t px, qx, db;
> +	 *
> +	 *	px	  = *p ^ *dp;
> +	 *	qx	  = qmul[*q ^ *dq];
> +	 *	*dq++ = db = pbmul[px] ^ qx;
> +	 *	*dp++ = db ^ px;
> +	 *	p++; q++;
> +	 * }
> +	 */
> +	while (bytes) {
> +		/*
> +		 * v0:px, v1:dp,
> +		 * v2:qx, v3:dq,
> +		 * v4:vx, v5:vy,
> +		 * v6:qm0, v7:qm1,
> +		 * v8:pm0, v9:pm1,
> +		 * v14:p/qm[vx], v15:p/qm[vy]
> +		 */
> +		asm volatile (
> +			".option	push\n"
> +			".option	arch,+v\n"
> +			"vle8.v		v0, (%[px])\n"
> +			"vle8.v		v1, (%[dp])\n"
> +			"vxor.vv	v0, v0, v1\n"
> +			"vle8.v		v2, (%[qx])\n"
> +			"vle8.v		v3, (%[dq])\n"
> +			"vxor.vv	v4, v2, v3\n"
> +			"vsrl.vi	v5, v4, 4\n"
> +			"vand.vi	v4, v4, 0xf\n"
> +			"vle8.v		v6, (%[qm0])\n"
> +			"vle8.v		v7, (%[qm1])\n"
> +			"vrgather.vv	v14, v6, v4\n" /* v14 = qm[vx] */
> +			"vrgather.vv	v15, v7, v5\n" /* v15 = qm[vy] */
> +			"vxor.vv	v2, v14, v15\n" /* v2 = qmul[*q ^ *dq] */
> +
> +			"vsrl.vi	v5, v0, 4\n"
> +			"vand.vi	v4, v0, 0xf\n"
> +			"vle8.v		v8, (%[pm0])\n"
> +			"vle8.v		v9, (%[pm1])\n"
> +			"vrgather.vv	v14, v8, v4\n" /* v14 = pm[vx] */
> +			"vrgather.vv	v15, v9, v5\n" /* v15 = pm[vy] */
> +			"vxor.vv	v4, v14, v15\n" /* v4 = pbmul[px] */
> +			"vxor.vv	v3, v4, v2\n" /* v3 = db = pbmul[px] ^ qx */
> +			"vxor.vv	v1, v3, v0\n" /* v1 = db ^ px; */
> +			"vse8.v		v3, (%[dq])\n"
> +			"vse8.v		v1, (%[dp])\n"
> +			".option	pop\n"
> +			: :
> +			[px]"r"(p),
> +			[dp]"r"(dp),
> +			[qx]"r"(q),
> +			[dq]"r"(dq),
> +			[qm0]"r"(qmul),
> +			[qm1]"r"(qmul + 16),
> +			[pm0]"r"(pbmul),
> +			[pm1]"r"(pbmul + 16)
> +			:);
> +
> +		bytes -= 16;
> +		p += 16;
> +		q += 16;
> +		dp += 16;
> +		dq += 16;
> +	}
> +}
> +
> +static void __raid6_datap_recov_rvv(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq,
> +			      const uint8_t *qmul)
> +{
> +	asm volatile (
> +		".option	push\n"
> +		".option	arch,+v\n"
> +		"vsetvli	x0, %[avl], e8, m1, ta, ma\n"
> +		".option	pop\n"
> +		: :
> +		[avl]"r"(16)
> +	);
> +
> +	/*
> +	 * while (bytes--) {
> +	 *  *p++ ^= *dq = qmul[*q ^ *dq];
> +	 *  q++; dq++;
> +	 * }
> +	 */
> +	while (bytes) {
> +		/*
> +		 * v0:vx, v1:vy,
> +		 * v2:dq, v3:p,
> +		 * v4:qm0, v5:qm1,
> +		 * v10:m[vx], v11:m[vy]
> +		 */
> +		asm volatile (
> +			".option	push\n"
> +			".option	arch,+v\n"
> +			"vle8.v		v0, (%[vx])\n"
> +			"vle8.v		v2, (%[dq])\n"
> +			"vxor.vv	v0, v0, v2\n"
> +			"vsrl.vi	v1, v0, 4\n"
> +			"vand.vi	v0, v0, 0xf\n"
> +			"vle8.v		v4, (%[qm0])\n"
> +			"vle8.v		v5, (%[qm1])\n"
> +			"vrgather.vv	v10, v4, v0\n"
> +			"vrgather.vv	v11, v5, v1\n"
> +			"vxor.vv	v0, v10, v11\n"
> +			"vle8.v		v1, (%[vy])\n"
> +			"vxor.vv	v1, v0, v1\n"
> +			"vse8.v		v0, (%[dq])\n"
> +			"vse8.v		v1, (%[vy])\n"
> +			".option	pop\n"
> +			: :
> +			[vx]"r"(q),
> +			[vy]"r"(p),
> +			[dq]"r"(dq),
> +			[qm0]"r"(qmul),
> +			[qm1]"r"(qmul + 16)
> +			:);
> +
> +		bytes -= 16;
> +		p += 16;
> +		q += 16;
> +		dq += 16;
> +	}
> +}
> +
> +
> +static void raid6_2data_recov_rvv(int disks, size_t bytes, int faila,
> +		int failb, void **ptrs)
> +{
> +	u8 *p, *q, *dp, *dq;
> +	const u8 *pbmul;	/* P multiplier table for B data */
> +	const u8 *qmul;		/* Q multiplier table (for both) */
> +
> +	p = (u8 *)ptrs[disks - 2];
> +	q = (u8 *)ptrs[disks - 1];
> +
> +	/*
> +	 * Compute syndrome with zero for the missing data pages
> +	 * Use the dead data pages as temporary storage for
> +	 * delta p and delta q
> +	 */
> +	dp = (u8 *)ptrs[faila];
> +	ptrs[faila] = (void *)raid6_empty_zero_page;
> +	ptrs[disks - 2] = dp;
> +	dq = (u8 *)ptrs[failb];
> +	ptrs[failb] = (void *)raid6_empty_zero_page;
> +	ptrs[disks - 1] = dq;
> +
> +	raid6_call.gen_syndrome(disks, bytes, ptrs);
> +
> +	/* Restore pointer table */
> +	ptrs[faila]     = dp;
> +	ptrs[failb]     = dq;
> +	ptrs[disks - 2] = p;
> +	ptrs[disks - 1] = q;
> +
> +	/* Now, pick the proper data tables */
> +	pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]];
> +	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^
> +					 raid6_gfexp[failb]]];
> +
> +	kernel_vector_begin();
> +	__raid6_2data_recov_rvv(bytes, p, q, dp, dq, pbmul, qmul);
> +	kernel_vector_end();
> +}
> +
> +static void raid6_datap_recov_rvv(int disks, size_t bytes, int faila,
> +		void **ptrs)
> +{
> +	u8 *p, *q, *dq;
> +	const u8 *qmul;		/* Q multiplier table */
> +
> +	p = (u8 *)ptrs[disks - 2];
> +	q = (u8 *)ptrs[disks - 1];
> +
> +	/*
> +	 * Compute syndrome with zero for the missing data page
> +	 * Use the dead data page as temporary storage for delta q
> +	 */
> +	dq = (u8 *)ptrs[faila];
> +	ptrs[faila] = (void *)raid6_empty_zero_page;
> +	ptrs[disks - 1] = dq;
> +
> +	raid6_call.gen_syndrome(disks, bytes, ptrs);
> +
> +	/* Restore pointer table */
> +	ptrs[faila]     = dq;
> +	ptrs[disks - 1] = q;
> +
> +	/* Now, pick the proper data tables */
> +	qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
> +
> +	kernel_vector_begin();
> +	__raid6_datap_recov_rvv(bytes, p, q, dq, qmul);
> +	kernel_vector_end();
> +}
> +
> +const struct raid6_recov_calls raid6_recov_rvv = {
> +	.data2		= raid6_2data_recov_rvv,
> +	.datap		= raid6_datap_recov_rvv,
> +	.valid		= rvv_has_vector,
> +	.name		= "rvv",
> +	.priority	= 1,
> +};
> diff --git a/lib/raid6/rvv.c b/lib/raid6/rvv.c
> new file mode 100644
> index 000000000000..fd0ec33edb1e
> --- /dev/null
> +++ b/lib/raid6/rvv.c
> @@ -0,0 +1,1269 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * RAID-6 syndrome calculation using RISCV vector instructions
> + *
> + * Copyright 2024 Institute of Software, CAS.

Remove the dot/period at the end?

> + * Author: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
> + *
> + * Based on neon.uc:
> + *	Copyright 2002-2004 H. Peter Anvin
> + */
> +
> +#include <asm/simd.h>
> +#include <asm/vector.h>
> +#include <crypto/internal/simd.h>
> +#include <linux/raid/pq.h>
> +#include <linux/types.h>
> +
> +#define NSIZE	(riscv_v_vsize / 32) /* NSIZE = vlenb */
> +
> +static int rvv_has_vector(void)
> +{
> +	return has_vector();
> +}
> +
> +static void raid6_rvv1_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
> +{
> +	u8 **dptr = (u8 **)ptrs;
> +	int d, z, z0;
> +	u8 *p, *q;
> +
> +	z0 = disks - 3;		/* Highest data disk */
> +	p = dptr[z0+1];		/* XOR parity */
> +	q = dptr[z0+2];		/* RS syndrome */
> +
> +	asm volatile (
> +		".option	push\n"
> +		".option	arch,+v\n"
> +		"vsetvli	t0, x0, e8, m1, ta, ma\n"
> +		".option	pop\n"
> +	);
> +
> +	 /* v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 */
> +	for (d = 0 ; d < bytes ; d += NSIZE*1) {
> +		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
> +		asm volatile (
> +			".option	push\n"
> +			".option	arch,+v\n"
> +			"vle8.v		v0, (%[wp0])\n"
> +			"vle8.v		v1, (%[wp0])\n"
> +			".option	pop\n"
> +			: :
> +			[wp0]"r"(&dptr[z0][d+0*NSIZE])
> +		);
> +
> +		for (z = z0-1 ; z >= 0 ; z--) {
> +			/*
> +			 * w2$$ = MASK(wq$$);
> +			 * w1$$ = SHLBYTE(wq$$);
> +			 * w2$$ &= NBYTES(0x1d);
> +			 * w1$$ ^= w2$$;
> +			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
> +			 * wq$$ = w1$$ ^ wd$$;
> +			 * wp$$ ^= wd$$;
> +			 */
> +			asm volatile (
> +				".option	push\n"
> +				".option	arch,+v\n"
> +				"vsra.vi	v2, v1, 7\n"
> +				"vsll.vi	v3, v1, 1\n"
> +				"vand.vx	v2, v2, %[x1d]\n"
> +				"vxor.vv	v3, v3, v2\n"
> +				"vle8.v		v2, (%[wd0])\n"
> +				"vxor.vv	v1, v3, v2\n"
> +				"vxor.vv	v0, v0, v2\n"
> +				".option	pop\n"
> +				: :
> +				[wd0]"r"(&dptr[z][d+0*NSIZE]),
> +				[x1d]"r"(0x1d)
> +			);
> +		}
> +
> +		/*
> +		 * *(unative_t *)&p[d+NSIZE*$$] = wp$$;
> +		 * *(unative_t *)&q[d+NSIZE*$$] = wq$$;
> +		 */
> +		asm volatile (
> +			".option	push\n"
> +			".option	arch,+v\n"
> +			"vse8.v		v0, (%[wp0])\n"
> +			"vse8.v		v1, (%[wq0])\n"
> +			".option	pop\n"
> +			: :
> +			[wp0]"r"(&p[d+NSIZE*0]),
> +			[wq0]"r"(&q[d+NSIZE*0])
> +		);
> +	}
> +}
> +
> +static void raid6_rvv1_xor_syndrome_real(int disks, int start, int stop,
> +				    unsigned long bytes, void **ptrs)
> +{
> +	u8 **dptr = (u8 **)ptrs;
> +	u8 *p, *q;
> +	int d, z, z0;
> +
> +	z0 = stop;		/* P/Q right side optimization */
> +	p = dptr[disks-2];	/* XOR parity */
> +	q = dptr[disks-1];	/* RS syndrome */
> +
> +	asm volatile (
> +		".option	push\n"
> +		".option	arch,+v\n"
> +		"vsetvli	t0, x0, e8, m1, ta, ma\n"
> +		".option	pop\n"
> +	);
> +
> +	/* v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 */
> +	for (d = 0 ; d < bytes ; d += NSIZE*1) {
> +		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
> +		asm volatile (
> +			".option	push\n"
> +			".option	arch,+v\n"
> +			"vle8.v		v0, (%[wp0])\n"
> +			"vle8.v		v1, (%[wp0])\n"
> +			".option	pop\n"
> +			: :
> +			[wp0]"r"(&dptr[z0][d+0*NSIZE])
> +		);
> +
> +		/* P/Q data pages */
> +		for (z = z0-1 ; z >= start ; z--) {
> +			/*
> +			 * w2$$ = MASK(wq$$);
> +			 * w1$$ = SHLBYTE(wq$$);
> +			 * w2$$ &= NBYTES(0x1d);
> +			 * w1$$ ^= w2$$;
> +			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
> +			 * wq$$ = w1$$ ^ wd$$;
> +			 * wp$$ ^= wd$$;
> +			 */
> +			asm volatile (
> +				".option	push\n"
> +				".option	arch,+v\n"
> +				"vsra.vi	v2, v1, 7\n"
> +				"vsll.vi	v3, v1, 1\n"
> +				"vand.vx	v2, v2, %[x1d]\n"
> +				"vxor.vv	v3, v3, v2\n"
> +				"vle8.v		v2, (%[wd0])\n"
> +				"vxor.vv	v1, v3, v2\n"
> +				"vxor.vv	v0, v0, v2\n"
> +				".option	pop\n"
> +				: :
> +				[wd0]"r"(&dptr[z][d+0*NSIZE]),
> +				[x1d]"r"(0x1d)
> +			);
> +		}
> +
> +		/* P/Q left side optimization */
> +		for (z = start-1 ; z >= 0 ; z--) {
> +			/*
> +			 * w2$$ = MASK(wq$$);
> +			 * w1$$ = SHLBYTE(wq$$);
> +			 * w2$$ &= NBYTES(0x1d);
> +			 * wq$$ = w1$$ ^ w2$$;
> +			 */
> +			asm volatile (
> +				".option	push\n"
> +				".option	arch,+v\n"
> +				"vsra.vi	v2, v1, 7\n"
> +				"vsll.vi	v3, v1, 1\n"
> +				"vand.vx	v2, v2, %[x1d]\n"
> +				"vxor.vv	v1, v3, v2\n"
> +				".option	pop\n"
> +				: :
> +				[x1d]"r"(0x1d)
> +			);
> +		}
> +
> +		/*
> +		 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
> +		 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
> +		 * v0:wp0, v1:wq0, v2:p0, v3:q0
> +		 */
> +		asm volatile (
> +			".option	push\n"
> +			".option	arch,+v\n"
> +			"vle8.v		v2, (%[wp0])\n"
> +			"vle8.v		v3, (%[wq0])\n"
> +			"vxor.vv	v2, v2, v0\n"
> +			"vxor.vv	v3, v3, v1\n"
> +			"vse8.v		v2, (%[wp0])\n"
> +			"vse8.v		v3, (%[wq0])\n"
> +			".option	pop\n"
> +			: :
> +			[wp0]"r"(&p[d+NSIZE*0]),
> +			[wq0]"r"(&q[d+NSIZE*0])
> +		);
> +	}
> +}
> +
> +static void raid6_rvv2_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
> +{
> +	u8 **dptr = (u8 **)ptrs;
> +	int d, z, z0;
> +	u8 *p, *q;
> +
> +	z0 = disks - 3;		/* Highest data disk */
> +	p = dptr[z0+1];		/* XOR parity */
> +	q = dptr[z0+2];		/* RS syndrome */
> +
> +	asm volatile (
> +		".option	push\n"
> +		".option	arch,+v\n"
> +		"vsetvli	t0, x0, e8, m1, ta, ma\n"
> +		".option	pop\n"
> +	);
> +
> +	/*
> +	 * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
> +	 * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
> +	 */
> +	for (d = 0 ; d < bytes ; d += NSIZE*2) {

*bytes* is unsigned long, but d is int. Should they match? (Also above 
and below.)
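
To make the suggestion concrete, here is a minimal sketch of the type
change (hypothetical, not taken from the posted patch; the function and
variable names only mirror it):

	/*
	 * Declare the loop counter with the same unsigned long type as
	 * 'bytes', so the 'd < bytes' comparison no longer mixes a
	 * signed int with an unsigned long.
	 */
	static void rvv_gen_syndrome_sketch(int disks, unsigned long bytes,
					    unsigned long nsize)
	{
		unsigned long d;	/* was: int d */
		int z, z0 = disks - 3;

		for (d = 0; d < bytes; d += nsize * 2) {
			for (z = z0 - 1; z >= 0; z--) {
				/* per-disk vector work elided */
			}
		}
	}

The same adjustment would apply to every gen/xor helper in the patch
that compares 'd' against 'bytes'.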


Kind regards,

Paul


> +		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
> +		asm volatile (
> +			".option	push\n"
> +			".option	arch,+v\n"
> +			"vle8.v		v0, (%[wp0])\n"
> +			"vle8.v		v1, (%[wp0])\n"
> +			"vle8.v		v4, (%[wp1])\n"
> +			"vle8.v		v5, (%[wp1])\n"
> +			".option	pop\n"
> +			: :
> +			[wp0]"r"(&dptr[z0][d+0*NSIZE]),
> +			[wp1]"r"(&dptr[z0][d+1*NSIZE])
> +		);
> +
> +		for (z = z0-1 ; z >= 0 ; z--) {
> +			/*
> +			 * w2$$ = MASK(wq$$);
> +			 * w1$$ = SHLBYTE(wq$$);
> +			 * w2$$ &= NBYTES(0x1d);
> +			 * w1$$ ^= w2$$;
> +			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
> +			 * wq$$ = w1$$ ^ wd$$;
> +			 * wp$$ ^= wd$$;
> +			 */
> +			asm volatile (
> +				".option	push\n"
> +				".option	arch,+v\n"
> +				"vsra.vi	v2, v1, 7\n"
> +				"vsll.vi	v3, v1, 1\n"
> +				"vand.vx	v2, v2, %[x1d]\n"
> +				"vxor.vv	v3, v3, v2\n"
> +				"vle8.v		v2, (%[wd0])\n"
> +				"vxor.vv	v1, v3, v2\n"
> +				"vxor.vv	v0, v0, v2\n"
> +
> +				"vsra.vi	v6, v5, 7\n"
> +				"vsll.vi	v7, v5, 1\n"
> +				"vand.vx	v6, v6, %[x1d]\n"
> +				"vxor.vv	v7, v7, v6\n"
> +				"vle8.v		v6, (%[wd1])\n"
> +				"vxor.vv	v5, v7, v6\n"
> +				"vxor.vv	v4, v4, v6\n"
> +				".option	pop\n"
> +				: :
> +				[wd0]"r"(&dptr[z][d+0*NSIZE]),
> +				[wd1]"r"(&dptr[z][d+1*NSIZE]),
> +				[x1d]"r"(0x1d)
> +			);
> +		}
> +
> +		/*
> +		 * *(unative_t *)&p[d+NSIZE*$$] = wp$$;
> +		 * *(unative_t *)&q[d+NSIZE*$$] = wq$$;
> +		 */
> +		asm volatile (
> +			".option	push\n"
> +			".option	arch,+v\n"
> +			"vse8.v		v0, (%[wp0])\n"
> +			"vse8.v		v1, (%[wq0])\n"
> +			"vse8.v		v4, (%[wp1])\n"
> +			"vse8.v		v5, (%[wq1])\n"
> +			".option	pop\n"
> +			: :
> +			[wp0]"r"(&p[d+NSIZE*0]),
> +			[wq0]"r"(&q[d+NSIZE*0]),
> +			[wp1]"r"(&p[d+NSIZE*1]),
> +			[wq1]"r"(&q[d+NSIZE*1])
> +		);
> +	}
> +}
> +
> +static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop,
> +					 unsigned long bytes, void **ptrs)
> +{
> +	u8 **dptr = (u8 **)ptrs;
> +	u8 *p, *q;
> +	int d, z, z0;
> +
> +	z0 = stop;		/* P/Q right side optimization */
> +	p = dptr[disks-2];	/* XOR parity */
> +	q = dptr[disks-1];	/* RS syndrome */
> +
> +	asm volatile (
> +		".option	push\n"
> +		".option	arch,+v\n"
> +		"vsetvli	t0, x0, e8, m1, ta, ma\n"
> +		".option	pop\n"
> +	);
> +
> +	/*
> +	 * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
> +	 * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
> +	 */
> +	for (d = 0 ; d < bytes ; d += NSIZE*2) {
> +		 /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
> +		asm volatile (
> +			".option	push\n"
> +			".option	arch,+v\n"
> +			"vle8.v		v0, (%[wp0])\n"
> +			"vle8.v		v1, (%[wp0])\n"
> +			"vle8.v		v4, (%[wp1])\n"
> +			"vle8.v		v5, (%[wp1])\n"
> +			".option	pop\n"
> +			: :
> +			[wp0]"r"(&dptr[z0][d+0*NSIZE]),
> +			[wp1]"r"(&dptr[z0][d+1*NSIZE])
> +		);
> +
> +		/* P/Q data pages */
> +		for (z = z0-1 ; z >= start ; z--) {
> +			/*
> +			 * w2$$ = MASK(wq$$);
> +			 * w1$$ = SHLBYTE(wq$$);
> +			 * w2$$ &= NBYTES(0x1d);
> +			 * w1$$ ^= w2$$;
> +			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
> +			 * wq$$ = w1$$ ^ wd$$;
> +			 * wp$$ ^= wd$$;
> +			 */
> +			asm volatile (
> +				".option	push\n"
> +				".option	arch,+v\n"
> +				"vsra.vi	v2, v1, 7\n"
> +				"vsll.vi	v3, v1, 1\n"
> +				"vand.vx	v2, v2, %[x1d]\n"
> +				"vxor.vv	v3, v3, v2\n"
> +				"vle8.v		v2, (%[wd0])\n"
> +				"vxor.vv	v1, v3, v2\n"
> +				"vxor.vv	v0, v0, v2\n"
> +
> +				"vsra.vi	v6, v5, 7\n"
> +				"vsll.vi	v7, v5, 1\n"
> +				"vand.vx	v6, v6, %[x1d]\n"
> +				"vxor.vv	v7, v7, v6\n"
> +				"vle8.v		v6, (%[wd1])\n"
> +				"vxor.vv	v5, v7, v6\n"
> +				"vxor.vv	v4, v4, v6\n"
> +				".option	pop\n"
> +				: :
> +				[wd0]"r"(&dptr[z][d+0*NSIZE]),
> +				[wd1]"r"(&dptr[z][d+1*NSIZE]),
> +				[x1d]"r"(0x1d)
> +			);
> +		}
> +
> +		/* P/Q left side optimization */
> +		for (z = start-1 ; z >= 0 ; z--) {
> +			/*
> +			 * w2$$ = MASK(wq$$);
> +			 * w1$$ = SHLBYTE(wq$$);
> +			 * w2$$ &= NBYTES(0x1d);
> +			 * wq$$ = w1$$ ^ w2$$;
> +			 */
> +			asm volatile (
> +				".option	push\n"
> +				".option	arch,+v\n"
> +				"vsra.vi	v2, v1, 7\n"
> +				"vsll.vi	v3, v1, 1\n"
> +				"vand.vx	v2, v2, %[x1d]\n"
> +				"vxor.vv	v1, v3, v2\n"
> +
> +				"vsra.vi	v6, v5, 7\n"
> +				"vsll.vi	v7, v5, 1\n"
> +				"vand.vx	v6, v6, %[x1d]\n"
> +				"vxor.vv	v5, v7, v6\n"
> +				".option	pop\n"
> +				: :
> +				[x1d]"r"(0x1d)
> +			);
> +		}
> +
> +		/*
> +		 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
> +		 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
> +		 * v0:wp0, v1:wq0, v2:p0, v3:q0
> +		 * v4:wp1, v5:wq1, v6:p1, v7:q1
> +		 */
> +		asm volatile (
> +			".option	push\n"
> +			".option	arch,+v\n"
> +			"vle8.v		v2, (%[wp0])\n"
> +			"vle8.v		v3, (%[wq0])\n"
> +			"vxor.vv	v2, v2, v0\n"
> +			"vxor.vv	v3, v3, v1\n"
> +			"vse8.v		v2, (%[wp0])\n"
> +			"vse8.v		v3, (%[wq0])\n"
> +
> +			"vle8.v		v6, (%[wp1])\n"
> +			"vle8.v		v7, (%[wq1])\n"
> +			"vxor.vv	v6, v6, v4\n"
> +			"vxor.vv	v7, v7, v5\n"
> +			"vse8.v		v6, (%[wp1])\n"
> +			"vse8.v		v7, (%[wq1])\n"
> +			".option	pop\n"
> +			: :
> +			[wp0]"r"(&p[d+NSIZE*0]),
> +			[wq0]"r"(&q[d+NSIZE*0]),
> +			[wp1]"r"(&p[d+NSIZE*1]),
> +			[wq1]"r"(&q[d+NSIZE*1])
> +		);
> +	}
> +}
> +
> +static void raid6_rvv4_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
> +{
> +	u8 **dptr = (u8 **)ptrs;
> +	int d, z, z0;
> +	u8 *p, *q;
> +
> +	z0 = disks - 3;	/* Highest data disk */
> +	p = dptr[z0+1];	/* XOR parity */
> +	q = dptr[z0+2];	/* RS syndrome */
> +
> +	asm volatile (
> +		".option	push\n"
> +		".option	arch,+v\n"
> +		"vsetvli	t0, x0, e8, m1, ta, ma\n"
> +		".option	pop\n"
> +	);
> +
> +	/*
> +	 * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
> +	 * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
> +	 * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12
> +	 * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13
> +	 */
> +	for (d = 0 ; d < bytes ; d += NSIZE*4) {
> +		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
> +		asm volatile (
> +			".option	push\n"
> +			".option	arch,+v\n"
> +			"vle8.v		v0, (%[wp0])\n"
> +			"vle8.v		v1, (%[wp0])\n"
> +			"vle8.v		v4, (%[wp1])\n"
> +			"vle8.v		v5, (%[wp1])\n"
> +			"vle8.v		v8, (%[wp2])\n"
> +			"vle8.v		v9, (%[wp2])\n"
> +			"vle8.v		v12, (%[wp3])\n"
> +			"vle8.v		v13, (%[wp3])\n"
> +			".option	pop\n"
> +			: :
> +			[wp0]"r"(&dptr[z0][d+0*NSIZE]),
> +			[wp1]"r"(&dptr[z0][d+1*NSIZE]),
> +			[wp2]"r"(&dptr[z0][d+2*NSIZE]),
> +			[wp3]"r"(&dptr[z0][d+3*NSIZE])
> +		);
> +
> +		for (z = z0-1 ; z >= 0 ; z--) {
> +			/*
> +			 * w2$$ = MASK(wq$$);
> +			 * w1$$ = SHLBYTE(wq$$);
> +			 * w2$$ &= NBYTES(0x1d);
> +			 * w1$$ ^= w2$$;
> +			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
> +			 * wq$$ = w1$$ ^ wd$$;
> +			 * wp$$ ^= wd$$;
> +			 */
> +			asm volatile (
> +				".option	push\n"
> +				".option	arch,+v\n"
> +				"vsra.vi	v2, v1, 7\n"
> +				"vsll.vi	v3, v1, 1\n"
> +				"vand.vx	v2, v2, %[x1d]\n"
> +				"vxor.vv	v3, v3, v2\n"
> +				"vle8.v		v2, (%[wd0])\n"
> +				"vxor.vv	v1, v3, v2\n"
> +				"vxor.vv	v0, v0, v2\n"
> +
> +				"vsra.vi	v6, v5, 7\n"
> +				"vsll.vi	v7, v5, 1\n"
> +				"vand.vx	v6, v6, %[x1d]\n"
> +				"vxor.vv	v7, v7, v6\n"
> +				"vle8.v		v6, (%[wd1])\n"
> +				"vxor.vv	v5, v7, v6\n"
> +				"vxor.vv	v4, v4, v6\n"
> +
> +				"vsra.vi	v10, v9, 7\n"
> +				"vsll.vi	v11, v9, 1\n"
> +				"vand.vx	v10, v10, %[x1d]\n"
> +				"vxor.vv	v11, v11, v10\n"
> +				"vle8.v		v10, (%[wd2])\n"
> +				"vxor.vv	v9, v11, v10\n"
> +				"vxor.vv	v8, v8, v10\n"
> +
> +				"vsra.vi	v14, v13, 7\n"
> +				"vsll.vi	v15, v13, 1\n"
> +				"vand.vx	v14, v14, %[x1d]\n"
> +				"vxor.vv	v15, v15, v14\n"
> +				"vle8.v		v14, (%[wd3])\n"
> +				"vxor.vv	v13, v15, v14\n"
> +				"vxor.vv	v12, v12, v14\n"
> +				".option	pop\n"
> +				: :
> +				[wd0]"r"(&dptr[z][d+0*NSIZE]),
> +				[wd1]"r"(&dptr[z][d+1*NSIZE]),
> +				[wd2]"r"(&dptr[z][d+2*NSIZE]),
> +				[wd3]"r"(&dptr[z][d+3*NSIZE]),
> +				[x1d]"r"(0x1d)
> +			);
> +		}
> +
> +		/*
> +		 * *(unative_t *)&p[d+NSIZE*$$] = wp$$;
> +		 * *(unative_t *)&q[d+NSIZE*$$] = wq$$;
> +		 */
> +		asm volatile (
> +			".option	push\n"
> +			".option	arch,+v\n"
> +			"vse8.v		v0, (%[wp0])\n"
> +			"vse8.v		v1, (%[wq0])\n"
> +			"vse8.v		v4, (%[wp1])\n"
> +			"vse8.v		v5, (%[wq1])\n"
> +			"vse8.v		v8, (%[wp2])\n"
> +			"vse8.v		v9, (%[wq2])\n"
> +			"vse8.v		v12, (%[wp3])\n"
> +			"vse8.v		v13, (%[wq3])\n"
> +			".option	pop\n"
> +			: :
> +			[wp0]"r"(&p[d+NSIZE*0]),
> +			[wq0]"r"(&q[d+NSIZE*0]),
> +			[wp1]"r"(&p[d+NSIZE*1]),
> +			[wq1]"r"(&q[d+NSIZE*1]),
> +			[wp2]"r"(&p[d+NSIZE*2]),
> +			[wq2]"r"(&q[d+NSIZE*2]),
> +			[wp3]"r"(&p[d+NSIZE*3]),
> +			[wq3]"r"(&q[d+NSIZE*3])
> +		);
> +	}
> +}
> +
> +static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop,
> +					unsigned long bytes, void **ptrs)
> +{
> +	u8 **dptr = (u8 **)ptrs;
> +	u8 *p, *q;
> +	int d, z, z0;
> +
> +	z0 = stop;		/* P/Q right side optimization */
> +	p = dptr[disks-2];	/* XOR parity */
> +	q = dptr[disks-1];	/* RS syndrome */
> +
> +	asm volatile (
> +		".option	push\n"
> +		".option	arch,+v\n"
> +		"vsetvli	t0, x0, e8, m1, ta, ma\n"
> +		".option	pop\n"
> +	);
> +
> +	/*
> +	 * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
> +	 * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
> +	 * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12
> +	 * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13
> +	 */
> +	for (d = 0 ; d < bytes ; d += NSIZE*4) {
> +		 /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
> +		asm volatile (
> +			".option	push\n"
> +			".option	arch,+v\n"
> +			"vle8.v		v0, (%[wp0])\n"
> +			"vle8.v		v1, (%[wp0])\n"
> +			"vle8.v		v4, (%[wp1])\n"
> +			"vle8.v		v5, (%[wp1])\n"
> +			"vle8.v		v8, (%[wp2])\n"
> +			"vle8.v		v9, (%[wp2])\n"
> +			"vle8.v		v12, (%[wp3])\n"
> +			"vle8.v		v13, (%[wp3])\n"
> +			".option	pop\n"
> +			: :
> +			[wp0]"r"(&dptr[z0][d+0*NSIZE]),
> +			[wp1]"r"(&dptr[z0][d+1*NSIZE]),
> +			[wp2]"r"(&dptr[z0][d+2*NSIZE]),
> +			[wp3]"r"(&dptr[z0][d+3*NSIZE])
> +		);
> +
> +		/* P/Q data pages */
> +		for (z = z0-1 ; z >= start ; z--) {
> +			/*
> +			 * w2$$ = MASK(wq$$);
> +			 * w1$$ = SHLBYTE(wq$$);
> +			 * w2$$ &= NBYTES(0x1d);
> +			 * w1$$ ^= w2$$;
> +			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
> +			 * wq$$ = w1$$ ^ wd$$;
> +			 * wp$$ ^= wd$$;
> +			 */
> +			asm volatile (
> +				".option	push\n"
> +				".option	arch,+v\n"
> +				"vsra.vi	v2, v1, 7\n"
> +				"vsll.vi	v3, v1, 1\n"
> +				"vand.vx	v2, v2, %[x1d]\n"
> +				"vxor.vv	v3, v3, v2\n"
> +				"vle8.v		v2, (%[wd0])\n"
> +				"vxor.vv	v1, v3, v2\n"
> +				"vxor.vv	v0, v0, v2\n"
> +
> +				"vsra.vi	v6, v5, 7\n"
> +				"vsll.vi	v7, v5, 1\n"
> +				"vand.vx	v6, v6, %[x1d]\n"
> +				"vxor.vv	v7, v7, v6\n"
> +				"vle8.v		v6, (%[wd1])\n"
> +				"vxor.vv	v5, v7, v6\n"
> +				"vxor.vv	v4, v4, v6\n"
> +
> +				"vsra.vi	v10, v9, 7\n"
> +				"vsll.vi	v11, v9, 1\n"
> +				"vand.vx	v10, v10, %[x1d]\n"
> +				"vxor.vv	v11, v11, v10\n"
> +				"vle8.v		v10, (%[wd2])\n"
> +				"vxor.vv	v9, v11, v10\n"
> +				"vxor.vv	v8, v8, v10\n"
> +
> +				"vsra.vi	v14, v13, 7\n"
> +				"vsll.vi	v15, v13, 1\n"
> +				"vand.vx	v14, v14, %[x1d]\n"
> +				"vxor.vv	v15, v15, v14\n"
> +				"vle8.v		v14, (%[wd3])\n"
> +				"vxor.vv	v13, v15, v14\n"
> +				"vxor.vv	v12, v12, v14\n"
> +				".option	pop\n"
> +				: :
> +				[wd0]"r"(&dptr[z][d+0*NSIZE]),
> +				[wd1]"r"(&dptr[z][d+1*NSIZE]),
> +				[wd2]"r"(&dptr[z][d+2*NSIZE]),
> +				[wd3]"r"(&dptr[z][d+3*NSIZE]),
> +				[x1d]"r"(0x1d)
> +			);
> +		}
> +
> +		/* P/Q left side optimization */
> +		for (z = start-1 ; z >= 0 ; z--) {
> +			/*
> +			 * w2$$ = MASK(wq$$);
> +			 * w1$$ = SHLBYTE(wq$$);
> +			 * w2$$ &= NBYTES(0x1d);
> +			 * wq$$ = w1$$ ^ w2$$;
> +			 */
> +			asm volatile (
> +				".option	push\n"
> +				".option	arch,+v\n"
> +				"vsra.vi	v2, v1, 7\n"
> +				"vsll.vi	v3, v1, 1\n"
> +				"vand.vx	v2, v2, %[x1d]\n"
> +				"vxor.vv	v1, v3, v2\n"
> +
> +				"vsra.vi	v6, v5, 7\n"
> +				"vsll.vi	v7, v5, 1\n"
> +				"vand.vx	v6, v6, %[x1d]\n"
> +				"vxor.vv	v5, v7, v6\n"
> +
> +				"vsra.vi	v10, v9, 7\n"
> +				"vsll.vi	v11, v9, 1\n"
> +				"vand.vx	v10, v10, %[x1d]\n"
> +				"vxor.vv	v9, v11, v10\n"
> +
> +				"vsra.vi	v14, v13, 7\n"
> +				"vsll.vi	v15, v13, 1\n"
> +				"vand.vx	v14, v14, %[x1d]\n"
> +				"vxor.vv	v13, v15, v14\n"
> +				".option	pop\n"
> +				: :
> +				[x1d]"r"(0x1d)
> +			);
> +		}
> +
> +		/*
> +		 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
> +		 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
> +		 * v0:wp0, v1:wq0, v2:p0, v3:q0
> +		 * v4:wp1, v5:wq1, v6:p1, v7:q1
> +		 * v8:wp2, v9:wq2, v10:p2, v11:q2
> +		 * v12:wp3, v13:wq3, v14:p3, v15:q3
> +		 */
> +		asm volatile (
> +			".option	push\n"
> +			".option	arch,+v\n"
> +			"vle8.v		v2, (%[wp0])\n"
> +			"vle8.v		v3, (%[wq0])\n"
> +			"vxor.vv	v2, v2, v0\n"
> +			"vxor.vv	v3, v3, v1\n"
> +			"vse8.v		v2, (%[wp0])\n"
> +			"vse8.v		v3, (%[wq0])\n"
> +
> +			"vle8.v		v6, (%[wp1])\n"
> +			"vle8.v		v7, (%[wq1])\n"
> +			"vxor.vv	v6, v6, v4\n"
> +			"vxor.vv	v7, v7, v5\n"
> +			"vse8.v		v6, (%[wp1])\n"
> +			"vse8.v		v7, (%[wq1])\n"
> +
> +			"vle8.v		v10, (%[wp2])\n"
> +			"vle8.v		v11, (%[wq2])\n"
> +			"vxor.vv	v10, v10, v8\n"
> +			"vxor.vv	v11, v11, v9\n"
> +			"vse8.v		v10, (%[wp2])\n"
> +			"vse8.v		v11, (%[wq2])\n"
> +
> +			"vle8.v		v14, (%[wp3])\n"
> +			"vle8.v		v15, (%[wq3])\n"
> +			"vxor.vv	v14, v14, v12\n"
> +			"vxor.vv	v15, v15, v13\n"
> +			"vse8.v		v14, (%[wp3])\n"
> +			"vse8.v		v15, (%[wq3])\n"
> +			".option	pop\n"
> +			: :
> +			[wp0]"r"(&p[d+NSIZE*0]),
> +			[wq0]"r"(&q[d+NSIZE*0]),
> +			[wp1]"r"(&p[d+NSIZE*1]),
> +			[wq1]"r"(&q[d+NSIZE*1]),
> +			[wp2]"r"(&p[d+NSIZE*2]),
> +			[wq2]"r"(&q[d+NSIZE*2]),
> +			[wp3]"r"(&p[d+NSIZE*3]),
> +			[wq3]"r"(&q[d+NSIZE*3])
> +		);
> +	}
> +}
> +
> +static void raid6_rvv8_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
> +{
> +	u8 **dptr = (u8 **)ptrs;
> +	int d, z, z0;
> +	u8 *p, *q;
> +
> +	z0 = disks - 3;	/* Highest data disk */
> +	p = dptr[z0+1];	/* XOR parity */
> +	q = dptr[z0+2];	/* RS syndrome */
> +
> +	asm volatile (
> +		".option	push\n"
> +		".option	arch,+v\n"
> +		"vsetvli	t0, x0, e8, m1, ta, ma\n"
> +		".option	pop\n"
> +	);
> +
> +	/*
> +	 * v0:wp0,   v1:wq0,  v2:wd0/w20,  v3:w10
> +	 * v4:wp1,   v5:wq1,  v6:wd1/w21,  v7:w11
> +	 * v8:wp2,   v9:wq2, v10:wd2/w22, v11:w12
> +	 * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13
> +	 * v16:wp4, v17:wq4, v18:wd4/w24, v19:w14
> +	 * v20:wp5, v21:wq5, v22:wd5/w25, v23:w15
> +	 * v24:wp6, v25:wq6, v26:wd6/w26, v27:w16
> +	 * v28:wp7, v29:wq7, v30:wd7/w27, v31:w17
> +	 */
> +	for (d = 0; d < bytes; d += NSIZE*8) {
> +		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
> +		asm volatile (
> +			".option	push\n"
> +			".option	arch,+v\n"
> +			"vle8.v		v0, (%[wp0])\n"
> +			"vle8.v		v1, (%[wp0])\n"
> +			"vle8.v		v4, (%[wp1])\n"
> +			"vle8.v		v5, (%[wp1])\n"
> +			"vle8.v		v8, (%[wp2])\n"
> +			"vle8.v		v9, (%[wp2])\n"
> +			"vle8.v		v12, (%[wp3])\n"
> +			"vle8.v		v13, (%[wp3])\n"
> +			"vle8.v		v16, (%[wp4])\n"
> +			"vle8.v		v17, (%[wp4])\n"
> +			"vle8.v		v20, (%[wp5])\n"
> +			"vle8.v		v21, (%[wp5])\n"
> +			"vle8.v		v24, (%[wp6])\n"
> +			"vle8.v		v25, (%[wp6])\n"
> +			"vle8.v		v28, (%[wp7])\n"
> +			"vle8.v		v29, (%[wp7])\n"
> +			".option	pop\n"
> +			: :
> +			[wp0]"r"(&dptr[z0][d+0*NSIZE]),
> +			[wp1]"r"(&dptr[z0][d+1*NSIZE]),
> +			[wp2]"r"(&dptr[z0][d+2*NSIZE]),
> +			[wp3]"r"(&dptr[z0][d+3*NSIZE]),
> +			[wp4]"r"(&dptr[z0][d+4*NSIZE]),
> +			[wp5]"r"(&dptr[z0][d+5*NSIZE]),
> +			[wp6]"r"(&dptr[z0][d+6*NSIZE]),
> +			[wp7]"r"(&dptr[z0][d+7*NSIZE])
> +		);
> +
> +		for (z = z0-1; z >= 0; z--) {
> +			/*
> +			 * w2$$ = MASK(wq$$);
> +			 * w1$$ = SHLBYTE(wq$$);
> +			 * w2$$ &= NBYTES(0x1d);
> +			 * w1$$ ^= w2$$;
> +			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
> +			 * wq$$ = w1$$ ^ wd$$;
> +			 * wp$$ ^= wd$$;
> +			 */
> +			asm volatile (
> +				".option	push\n"
> +				".option	arch,+v\n"
> +				"vsra.vi	v2, v1, 7\n"
> +				"vsll.vi	v3, v1, 1\n"
> +				"vand.vx	v2, v2, %[x1d]\n"
> +				"vxor.vv	v3, v3, v2\n"
> +				"vle8.v		v2, (%[wd0])\n"
> +				"vxor.vv	v1, v3, v2\n"
> +				"vxor.vv	v0, v0, v2\n"
> +
> +				"vsra.vi	v6, v5, 7\n"
> +				"vsll.vi	v7, v5, 1\n"
> +				"vand.vx	v6, v6, %[x1d]\n"
> +				"vxor.vv	v7, v7, v6\n"
> +				"vle8.v		v6, (%[wd1])\n"
> +				"vxor.vv	v5, v7, v6\n"
> +				"vxor.vv	v4, v4, v6\n"
> +
> +				"vsra.vi	v10, v9, 7\n"
> +				"vsll.vi	v11, v9, 1\n"
> +				"vand.vx	v10, v10, %[x1d]\n"
> +				"vxor.vv	v11, v11, v10\n"
> +				"vle8.v		v10, (%[wd2])\n"
> +				"vxor.vv	v9, v11, v10\n"
> +				"vxor.vv	v8, v8, v10\n"
> +
> +				"vsra.vi	v14, v13, 7\n"
> +				"vsll.vi	v15, v13, 1\n"
> +				"vand.vx	v14, v14, %[x1d]\n"
> +				"vxor.vv	v15, v15, v14\n"
> +				"vle8.v		v14, (%[wd3])\n"
> +				"vxor.vv	v13, v15, v14\n"
> +				"vxor.vv	v12, v12, v14\n"
> +
> +				"vsra.vi	v18, v17, 7\n"
> +				"vsll.vi	v19, v17, 1\n"
> +				"vand.vx	v18, v18, %[x1d]\n"
> +				"vxor.vv	v19, v19, v18\n"
> +				"vle8.v		v18, (%[wd4])\n"
> +				"vxor.vv	v17, v19, v18\n"
> +				"vxor.vv	v16, v16, v18\n"
> +
> +				"vsra.vi	v22, v21, 7\n"
> +				"vsll.vi	v23, v21, 1\n"
> +				"vand.vx	v22, v22, %[x1d]\n"
> +				"vxor.vv	v23, v23, v22\n"
> +				"vle8.v		v22, (%[wd5])\n"
> +				"vxor.vv	v21, v23, v22\n"
> +				"vxor.vv	v20, v20, v22\n"
> +
> +				"vsra.vi	v26, v25, 7\n"
> +				"vsll.vi	v27, v25, 1\n"
> +				"vand.vx	v26, v26, %[x1d]\n"
> +				"vxor.vv	v27, v27, v26\n"
> +				"vle8.v		v26, (%[wd6])\n"
> +				"vxor.vv	v25, v27, v26\n"
> +				"vxor.vv	v24, v24, v26\n"
> +
> +				"vsra.vi	v30, v29, 7\n"
> +				"vsll.vi	v31, v29, 1\n"
> +				"vand.vx	v30, v30, %[x1d]\n"
> +				"vxor.vv	v31, v31, v30\n"
> +				"vle8.v		v30, (%[wd7])\n"
> +				"vxor.vv	v29, v31, v30\n"
> +				"vxor.vv	v28, v28, v30\n"
> +				".option	pop\n"
> +				: :
> +				[wd0]"r"(&dptr[z][d+0*NSIZE]),
> +				[wd1]"r"(&dptr[z][d+1*NSIZE]),
> +				[wd2]"r"(&dptr[z][d+2*NSIZE]),
> +				[wd3]"r"(&dptr[z][d+3*NSIZE]),
> +				[wd4]"r"(&dptr[z][d+4*NSIZE]),
> +				[wd5]"r"(&dptr[z][d+5*NSIZE]),
> +				[wd6]"r"(&dptr[z][d+6*NSIZE]),
> +				[wd7]"r"(&dptr[z][d+7*NSIZE]),
> +				[x1d]"r"(0x1d)
> +			);
> +		}
> +
> +		/*
> +		 * *(unative_t *)&p[d+NSIZE*$$] = wp$$;
> +		 * *(unative_t *)&q[d+NSIZE*$$] = wq$$;
> +		 */
> +		asm volatile (
> +			".option	push\n"
> +			".option	arch,+v\n"
> +			"vse8.v		v0, (%[wp0])\n"
> +			"vse8.v		v1, (%[wq0])\n"
> +			"vse8.v		v4, (%[wp1])\n"
> +			"vse8.v		v5, (%[wq1])\n"
> +			"vse8.v		v8, (%[wp2])\n"
> +			"vse8.v		v9, (%[wq2])\n"
> +			"vse8.v		v12, (%[wp3])\n"
> +			"vse8.v		v13, (%[wq3])\n"
> +			"vse8.v		v16, (%[wp4])\n"
> +			"vse8.v		v17, (%[wq4])\n"
> +			"vse8.v		v20, (%[wp5])\n"
> +			"vse8.v		v21, (%[wq5])\n"
> +			"vse8.v		v24, (%[wp6])\n"
> +			"vse8.v		v25, (%[wq6])\n"
> +			"vse8.v		v28, (%[wp7])\n"
> +			"vse8.v		v29, (%[wq7])\n"
> +			".option	pop\n"
> +			: :
> +			[wp0]"r"(&p[d+NSIZE*0]),
> +			[wq0]"r"(&q[d+NSIZE*0]),
> +			[wp1]"r"(&p[d+NSIZE*1]),
> +			[wq1]"r"(&q[d+NSIZE*1]),
> +			[wp2]"r"(&p[d+NSIZE*2]),
> +			[wq2]"r"(&q[d+NSIZE*2]),
> +			[wp3]"r"(&p[d+NSIZE*3]),
> +			[wq3]"r"(&q[d+NSIZE*3]),
> +			[wp4]"r"(&p[d+NSIZE*4]),
> +			[wq4]"r"(&q[d+NSIZE*4]),
> +			[wp5]"r"(&p[d+NSIZE*5]),
> +			[wq5]"r"(&q[d+NSIZE*5]),
> +			[wp6]"r"(&p[d+NSIZE*6]),
> +			[wq6]"r"(&q[d+NSIZE*6]),
> +			[wp7]"r"(&p[d+NSIZE*7]),
> +			[wq7]"r"(&q[d+NSIZE*7])
> +		);
> +	}
> +}
> +
> +static void raid6_rvv8_xor_syndrome_real(int disks, int start, int stop,
> +					unsigned long bytes, void **ptrs)
> +{
> +	u8 **dptr = (u8 **)ptrs;
> +	u8 *p, *q;
> +	int d, z, z0;
> +
> +	z0 = stop;		/* P/Q right side optimization */
> +	p = dptr[disks-2];	/* XOR parity */
> +	q = dptr[disks-1];	/* RS syndrome */
> +
> +	asm volatile (
> +		".option	push\n"
> +		".option	arch,+v\n"
> +		"vsetvli	t0, x0, e8, m1, ta, ma\n"
> +		".option	pop\n"
> +	);
> +
> +	/*
> +	 * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
> +	 * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
> +	 * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12
> +	 * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13
> +	 * v16:wp4, v17:wq4, v18:wd4/w24, v19:w14
> +	 * v20:wp5, v21:wq5, v22:wd5/w25, v23:w15
> +	 * v24:wp6, v25:wq6, v26:wd6/w26, v27:w16
> +	 * v28:wp7, v29:wq7, v30:wd7/w27, v31:w17
> +	 */
> +	for (d = 0; d < bytes; d += NSIZE*8) {
> +		 /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
> +		asm volatile (
> +			".option	push\n"
> +			".option	arch,+v\n"
> +			"vle8.v		v0, (%[wp0])\n"
> +			"vle8.v		v1, (%[wp0])\n"
> +			"vle8.v		v4, (%[wp1])\n"
> +			"vle8.v		v5, (%[wp1])\n"
> +			"vle8.v		v8, (%[wp2])\n"
> +			"vle8.v		v9, (%[wp2])\n"
> +			"vle8.v		v12, (%[wp3])\n"
> +			"vle8.v		v13, (%[wp3])\n"
> +			"vle8.v		v16, (%[wp4])\n"
> +			"vle8.v		v17, (%[wp4])\n"
> +			"vle8.v		v20, (%[wp5])\n"
> +			"vle8.v		v21, (%[wp5])\n"
> +			"vle8.v		v24, (%[wp6])\n"
> +			"vle8.v		v25, (%[wp6])\n"
> +			"vle8.v		v28, (%[wp7])\n"
> +			"vle8.v		v29, (%[wp7])\n"
> +			".option	pop\n"
> +			: :
> +			[wp0]"r"(&dptr[z0][d+0*NSIZE]),
> +			[wp1]"r"(&dptr[z0][d+1*NSIZE]),
> +			[wp2]"r"(&dptr[z0][d+2*NSIZE]),
> +			[wp3]"r"(&dptr[z0][d+3*NSIZE]),
> +			[wp4]"r"(&dptr[z0][d+4*NSIZE]),
> +			[wp5]"r"(&dptr[z0][d+5*NSIZE]),
> +			[wp6]"r"(&dptr[z0][d+6*NSIZE]),
> +			[wp7]"r"(&dptr[z0][d+7*NSIZE])
> +		);
> +
> +		/* P/Q data pages */
> +		for (z = z0-1; z >= start; z--) {
> +			/*
> +			 * w2$$ = MASK(wq$$);
> +			 * w1$$ = SHLBYTE(wq$$);
> +			 * w2$$ &= NBYTES(0x1d);
> +			 * w1$$ ^= w2$$;
> +			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
> +			 * wq$$ = w1$$ ^ wd$$;
> +			 * wp$$ ^= wd$$;
> +			 */
> +			asm volatile (
> +				".option	push\n"
> +				".option	arch,+v\n"
> +				"vsra.vi	v2, v1, 7\n"
> +				"vsll.vi	v3, v1, 1\n"
> +				"vand.vx	v2, v2, %[x1d]\n"
> +				"vxor.vv	v3, v3, v2\n"
> +				"vle8.v		v2, (%[wd0])\n"
> +				"vxor.vv	v1, v3, v2\n"
> +				"vxor.vv	v0, v0, v2\n"
> +
> +				"vsra.vi	v6, v5, 7\n"
> +				"vsll.vi	v7, v5, 1\n"
> +				"vand.vx	v6, v6, %[x1d]\n"
> +				"vxor.vv	v7, v7, v6\n"
> +				"vle8.v		v6, (%[wd1])\n"
> +				"vxor.vv	v5, v7, v6\n"
> +				"vxor.vv	v4, v4, v6\n"
> +
> +				"vsra.vi	v10, v9, 7\n"
> +				"vsll.vi	v11, v9, 1\n"
> +				"vand.vx	v10, v10, %[x1d]\n"
> +				"vxor.vv	v11, v11, v10\n"
> +				"vle8.v		v10, (%[wd2])\n"
> +				"vxor.vv	v9, v11, v10\n"
> +				"vxor.vv	v8, v8, v10\n"
> +
> +				"vsra.vi	v14, v13, 7\n"
> +				"vsll.vi	v15, v13, 1\n"
> +				"vand.vx	v14, v14, %[x1d]\n"
> +				"vxor.vv	v15, v15, v14\n"
> +				"vle8.v		v14, (%[wd3])\n"
> +				"vxor.vv	v13, v15, v14\n"
> +				"vxor.vv	v12, v12, v14\n"
> +
> +				"vsra.vi	v18, v17, 7\n"
> +				"vsll.vi	v19, v17, 1\n"
> +				"vand.vx	v18, v18, %[x1d]\n"
> +				"vxor.vv	v19, v19, v18\n"
> +				"vle8.v		v18, (%[wd4])\n"
> +				"vxor.vv	v17, v19, v18\n"
> +				"vxor.vv	v16, v16, v18\n"
> +
> +				"vsra.vi	v22, v21, 7\n"
> +				"vsll.vi	v23, v21, 1\n"
> +				"vand.vx	v22, v22, %[x1d]\n"
> +				"vxor.vv	v23, v23, v22\n"
> +				"vle8.v		v22, (%[wd5])\n"
> +				"vxor.vv	v21, v23, v22\n"
> +				"vxor.vv	v20, v20, v22\n"
> +
> +				"vsra.vi	v26, v25, 7\n"
> +				"vsll.vi	v27, v25, 1\n"
> +				"vand.vx	v26, v26, %[x1d]\n"
> +				"vxor.vv	v27, v27, v26\n"
> +				"vle8.v		v26, (%[wd6])\n"
> +				"vxor.vv	v25, v27, v26\n"
> +				"vxor.vv	v24, v24, v26\n"
> +
> +				"vsra.vi	v30, v29, 7\n"
> +				"vsll.vi	v31, v29, 1\n"
> +				"vand.vx	v30, v30, %[x1d]\n"
> +				"vxor.vv	v31, v31, v30\n"
> +				"vle8.v		v30, (%[wd7])\n"
> +				"vxor.vv	v29, v31, v30\n"
> +				"vxor.vv	v28, v28, v30\n"
> +				".option	pop\n"
> +				: :
> +				[wd0]"r"(&dptr[z][d+0*NSIZE]),
> +				[wd1]"r"(&dptr[z][d+1*NSIZE]),
> +				[wd2]"r"(&dptr[z][d+2*NSIZE]),
> +				[wd3]"r"(&dptr[z][d+3*NSIZE]),
> +				[wd4]"r"(&dptr[z][d+4*NSIZE]),
> +				[wd5]"r"(&dptr[z][d+5*NSIZE]),
> +				[wd6]"r"(&dptr[z][d+6*NSIZE]),
> +				[wd7]"r"(&dptr[z][d+7*NSIZE]),
> +				[x1d]"r"(0x1d)
> +			);
> +		}
> +
> +		/* P/Q left side optimization */
> +		for (z = start-1; z >= 0; z--) {
> +			/*
> +			 * w2$$ = MASK(wq$$);
> +			 * w1$$ = SHLBYTE(wq$$);
> +			 * w2$$ &= NBYTES(0x1d);
> +			 * wq$$ = w1$$ ^ w2$$;
> +			 */
> +			asm volatile (
> +				".option	push\n"
> +				".option	arch,+v\n"
> +				"vsra.vi	v2, v1, 7\n"
> +				"vsll.vi	v3, v1, 1\n"
> +				"vand.vx	v2, v2, %[x1d]\n"
> +				"vxor.vv	v1, v3, v2\n"
> +
> +				"vsra.vi	v6, v5, 7\n"
> +				"vsll.vi	v7, v5, 1\n"
> +				"vand.vx	v6, v6, %[x1d]\n"
> +				"vxor.vv	v5, v7, v6\n"
> +
> +				"vsra.vi	v10, v9, 7\n"
> +				"vsll.vi	v11, v9, 1\n"
> +				"vand.vx	v10, v10, %[x1d]\n"
> +				"vxor.vv	v9, v11, v10\n"
> +
> +				"vsra.vi	v14, v13, 7\n"
> +				"vsll.vi	v15, v13, 1\n"
> +				"vand.vx	v14, v14, %[x1d]\n"
> +				"vxor.vv	v13, v15, v14\n"
> +
> +				"vsra.vi	v18, v17, 7\n"
> +				"vsll.vi	v19, v17, 1\n"
> +				"vand.vx	v18, v18, %[x1d]\n"
> +				"vxor.vv	v17, v19, v18\n"
> +
> +				"vsra.vi	v22, v21, 7\n"
> +				"vsll.vi	v23, v21, 1\n"
> +				"vand.vx	v22, v22, %[x1d]\n"
> +				"vxor.vv	v21, v23, v22\n"
> +
> +				"vsra.vi	v26, v25, 7\n"
> +				"vsll.vi	v27, v25, 1\n"
> +				"vand.vx	v26, v26, %[x1d]\n"
> +				"vxor.vv	v25, v27, v26\n"
> +
> +				"vsra.vi	v30, v29, 7\n"
> +				"vsll.vi	v31, v29, 1\n"
> +				"vand.vx	v30, v30, %[x1d]\n"
> +				"vxor.vv	v29, v31, v30\n"
> +				".option	pop\n"
> +				: :
> +				[x1d]"r"(0x1d)
> +			);
> +		}
> +
> +		/*
> +		 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
> +		 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
> +		 * v0:wp0, v1:wq0, v2:p0, v3:q0
> +		 * v4:wp1, v5:wq1, v6:p1, v7:q1
> +		 * v8:wp2, v9:wq2, v10:p2, v11:q2
> +		 * v12:wp3, v13:wq3, v14:p3, v15:q3
> +		 * v16:wp4, v17:wq4, v18:p4, v19:q4
> +		 * v20:wp5, v21:wq5, v22:p5, v23:q5
> +		 * v24:wp6, v25:wq6, v26:p6, v27:q6
> +		 * v28:wp7, v29:wq7, v30:p7, v31:q7
> +		 */
> +		asm volatile (
> +			".option	push\n"
> +			".option	arch,+v\n"
> +			"vle8.v		v2, (%[wp0])\n"
> +			"vle8.v		v3, (%[wq0])\n"
> +			"vxor.vv	v2, v2, v0\n"
> +			"vxor.vv	v3, v3, v1\n"
> +			"vse8.v		v2, (%[wp0])\n"
> +			"vse8.v		v3, (%[wq0])\n"
> +
> +			"vle8.v		v6, (%[wp1])\n"
> +			"vle8.v		v7, (%[wq1])\n"
> +			"vxor.vv	v6, v6, v4\n"
> +			"vxor.vv	v7, v7, v5\n"
> +			"vse8.v		v6, (%[wp1])\n"
> +			"vse8.v		v7, (%[wq1])\n"
> +
> +			"vle8.v		v10, (%[wp2])\n"
> +			"vle8.v		v11, (%[wq2])\n"
> +			"vxor.vv	v10, v10, v8\n"
> +			"vxor.vv	v11, v11, v9\n"
> +			"vse8.v		v10, (%[wp2])\n"
> +			"vse8.v		v11, (%[wq2])\n"
> +
> +			"vle8.v		v14, (%[wp3])\n"
> +			"vle8.v		v15, (%[wq3])\n"
> +			"vxor.vv	v14, v14, v12\n"
> +			"vxor.vv	v15, v15, v13\n"
> +			"vse8.v		v14, (%[wp3])\n"
> +			"vse8.v		v15, (%[wq3])\n"
> +
> +			"vle8.v		v18, (%[wp4])\n"
> +			"vle8.v		v19, (%[wq4])\n"
> +			"vxor.vv	v18, v18, v16\n"
> +			"vxor.vv	v19, v19, v17\n"
> +			"vse8.v		v18, (%[wp4])\n"
> +			"vse8.v		v19, (%[wq4])\n"
> +
> +			"vle8.v		v22, (%[wp5])\n"
> +			"vle8.v		v23, (%[wq5])\n"
> +			"vxor.vv	v22, v22, v20\n"
> +			"vxor.vv	v23, v23, v21\n"
> +			"vse8.v		v22, (%[wp5])\n"
> +			"vse8.v		v23, (%[wq5])\n"
> +
> +			"vle8.v		v26, (%[wp6])\n"
> +			"vle8.v		v27, (%[wq6])\n"
> +			"vxor.vv	v26, v26, v24\n"
> +			"vxor.vv	v27, v27, v25\n"
> +			"vse8.v		v26, (%[wp6])\n"
> +			"vse8.v		v27, (%[wq6])\n"
> +
> +			"vle8.v		v30, (%[wp7])\n"
> +			"vle8.v		v31, (%[wq7])\n"
> +			"vxor.vv	v30, v30, v28\n"
> +			"vxor.vv	v31, v31, v29\n"
> +			"vse8.v		v30, (%[wp7])\n"
> +			"vse8.v		v31, (%[wq7])\n"
> +			".option	pop\n"
> +			: :
> +			[wp0]"r"(&p[d+NSIZE*0]),
> +			[wq0]"r"(&q[d+NSIZE*0]),
> +			[wp1]"r"(&p[d+NSIZE*1]),
> +			[wq1]"r"(&q[d+NSIZE*1]),
> +			[wp2]"r"(&p[d+NSIZE*2]),
> +			[wq2]"r"(&q[d+NSIZE*2]),
> +			[wp3]"r"(&p[d+NSIZE*3]),
> +			[wq3]"r"(&q[d+NSIZE*3]),
> +			[wp4]"r"(&p[d+NSIZE*4]),
> +			[wq4]"r"(&q[d+NSIZE*4]),
> +			[wp5]"r"(&p[d+NSIZE*5]),
> +			[wq5]"r"(&q[d+NSIZE*5]),
> +			[wp6]"r"(&p[d+NSIZE*6]),
> +			[wq6]"r"(&q[d+NSIZE*6]),
> +			[wp7]"r"(&p[d+NSIZE*7]),
> +			[wq7]"r"(&q[d+NSIZE*7])
> +		);
> +	}
> +}
> +
> +#define RAID6_RVV_WRAPPER(_n)						\
> +	static void raid6_rvv ## _n ## _gen_syndrome(int disks,		\
> +					size_t bytes, void **ptrs)	\
> +	{								\
> +		void raid6_rvv ## _n  ## _gen_syndrome_real(int,	\
> +						unsigned long, void**);	\
> +		kernel_vector_begin();					\
> +		raid6_rvv ## _n ## _gen_syndrome_real(disks,		\
> +				(unsigned long)bytes, ptrs);		\
> +		kernel_vector_end();					\
> +	}								\
> +	static void raid6_rvv ## _n ## _xor_syndrome(int disks,		\
> +					int start, int stop,		\
> +					size_t bytes, void **ptrs)	\
> +	{								\
> +		void raid6_rvv ## _n  ## _xor_syndrome_real(int,	\
> +				int, int, unsigned long, void**);	\
> +		kernel_vector_begin();					\
> +		raid6_rvv ## _n ## _xor_syndrome_real(disks,		\
> +			start, stop, (unsigned long)bytes, ptrs);	\
> +		kernel_vector_end();					\
> +	}								\
> +	struct raid6_calls const raid6_rvvx ## _n = {			\
> +		raid6_rvv ## _n ## _gen_syndrome,			\
> +		raid6_rvv ## _n ## _xor_syndrome,			\
> +		rvv_has_vector,						\
> +		"rvvx" #_n,						\
> +		0							\
> +	}
> +
> +RAID6_RVV_WRAPPER(1);
> +RAID6_RVV_WRAPPER(2);
> +RAID6_RVV_WRAPPER(4);
> +RAID6_RVV_WRAPPER(8);
Charlie Jenkins Jan. 27, 2025, 10:34 p.m. UTC | #2
On Mon, Jan 27, 2025 at 09:39:11AM +0100, Paul Menzel wrote:
> Dear Chunyan,
> 
> 
> Thank you for the patch.
> 
> 
> On 27.01.25 07:15, Chunyan Zhang wrote:
> > The assembly is originally based on the ARM NEON and int.uc, but uses
> > RISC-V vector instructions to implement the RAID6 syndrome and
> > recovery calculations.
> > 
> > Results on QEMU running with the option "-icount shift=0":
> > 
> >    raid6: rvvx1    gen()  1008 MB/s
> >    raid6: rvvx2    gen()  1395 MB/s
> >    raid6: rvvx4    gen()  1584 MB/s
> >    raid6: rvvx8    gen()  1694 MB/s
> >    raid6: int64x8  gen()   113 MB/s
> >    raid6: int64x4  gen()   116 MB/s
> >    raid6: int64x2  gen()   272 MB/s
> >    raid6: int64x1  gen()   229 MB/s
> >    raid6: using algorithm rvvx8 gen() 1694 MB/s
> >    raid6: .... xor() 1000 MB/s, rmw enabled
> >    raid6: using rvv recovery algorithm
> 
> How did you start QEMU and on what host did you run it? Does it change
> between runs? (For me these benchmark values were very unreliable in the
> past on x86 hardware.)

I reported dramatic gains on vector as well in this response [1]. Note
that these gains are only present when using the QEMU option "-icount
shift=0", which makes vector dramatically more performant. Without this
option we do not see a performance gain on QEMU. However, riscv vector
is known to not be well optimized on QEMU, so vector being less
performant on some QEMU configurations is not necessarily representative
of hardware implementations.


My full qemu command is (running on x86 host):

qemu-system-riscv64 -nographic -m 1G -machine virt -smp 1\
    -kernel arch/riscv/boot/Image \
    -append "root=/dev/vda rw earlycon console=ttyS0" \
    -drive file=rootfs.ext2,format=raw,id=hd0,if=none \
    -bios default -cpu rv64,v=true,vlen=256,vext_spec=v1.0 \
    -device virtio-blk-device,drive=hd0

This is with version 9.2.0.
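
For completeness, the numbers in the commit message were gathered with
"-icount shift=0"; assuming the same setup as above, that run would
presumably look like the following (only the -icount flag added):

qemu-system-riscv64 -nographic -m 1G -machine virt -smp 1 \
    -icount shift=0 \
    -kernel arch/riscv/boot/Image \
    -append "root=/dev/vda rw earlycon console=ttyS0" \
    -drive file=rootfs.ext2,format=raw,id=hd0,if=none \
    -bios default -cpu rv64,v=true,vlen=256,vext_spec=v1.0 \
    -device virtio-blk-device,drive=hd0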


I am also facing an issue when executing this:

raid6: rvvx1    gen()   717 MB/s
raid6: rvvx2    gen()   734 MB/s
Unable to handle kernel NULL pointer dereference at virtual address 0000000000000020

Only rvvx4 is failing. I applied this patch to 6.13.

- Charlie
Chunyan Zhang Feb. 11, 2025, 8:39 a.m. UTC | #3
On Mon, 27 Jan 2025 at 16:39, Paul Menzel <pmenzel@molgen.mpg.de> wrote:
>
> Dear Chunyan,
>
>
> Thank you for the patch.
>
>
> On 27.01.25 07:15, Chunyan Zhang wrote:
> > The assembly is originally based on the ARM NEON and int.uc, but uses
> > RISC-V vector instructions to implement the RAID6 syndrome and
> > recovery calculations.
> >
> > Results on QEMU running with the option "-icount shift=0":
> >
> >    raid6: rvvx1    gen()  1008 MB/s
> >    raid6: rvvx2    gen()  1395 MB/s
> >    raid6: rvvx4    gen()  1584 MB/s
> >    raid6: rvvx8    gen()  1694 MB/s
> >    raid6: int64x8  gen()   113 MB/s
> >    raid6: int64x4  gen()   116 MB/s
> >    raid6: int64x2  gen()   272 MB/s
> >    raid6: int64x1  gen()   229 MB/s
> >    raid6: using algorithm rvvx8 gen() 1694 MB/s
> >    raid6: .... xor() 1000 MB/s, rmw enabled
> >    raid6: using rvv recovery algorithm
>
> How did you start QEMU and on what host did you run it? Does it change

I started QEMU with the option "-icount shift=0" on an x86 host.

If I start QEMU without this option, I get results like this:

raid6: rvvx1    gen()    41 MB/s
raid6: rvvx2    gen()    46 MB/s
raid6: rvvx4    gen()    46 MB/s
raid6: rvvx8    gen()    46 MB/s
raid6: int64x8  gen()   279 MB/s
raid6: int64x4  gen()   285 MB/s
raid6: int64x2  gen()   631 MB/s
raid6: int64x1  gen()  1981 MB/s
raid6: using algorithm int64x1 gen() 1981 MB/s
raid6: .... xor() 1168 MB/s, rmw enabled
raid6: using rvv recovery algorithm

> between runs? (For me these benchmark values were very unreliable in the
> past on x86 hardware.)
>
> > [Charlie: - Fixup vector options]
> > Signed-off-by: Charlie Jenkins <charlie@rivosinc.com>
> > Signed-off-by: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
> > ---
> > V2:
> > - Add raid6_rvvx8;
> > - Address the vector options issue;
> > - Add .valid callback to raid6_rvv and raid6_recov_rvv;
> > - Removed unneeded check of crypto_simd_usable();
> >
> > RFC: https://lore.kernel.org/lkml/20241220114023.667347-1-zhangchunyan@iscas.ac.cn/
> > ---
> >   include/linux/raid/pq.h |    5 +
> >   lib/raid6/Makefile      |    1 +
> >   lib/raid6/algos.c       |    9 +
> >   lib/raid6/recov_rvv.c   |  234 ++++++++
> >   lib/raid6/rvv.c         | 1269 +++++++++++++++++++++++++++++++++++++++
> >   5 files changed, 1518 insertions(+)
> >   create mode 100644 lib/raid6/recov_rvv.c
> >   create mode 100644 lib/raid6/rvv.c
> >
> > diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h
> > index 98030accf641..72ff44cca864 100644
> > --- a/include/linux/raid/pq.h
> > +++ b/include/linux/raid/pq.h
> > @@ -108,6 +108,10 @@ extern const struct raid6_calls raid6_vpermxor4;
> >   extern const struct raid6_calls raid6_vpermxor8;
> >   extern const struct raid6_calls raid6_lsx;
> >   extern const struct raid6_calls raid6_lasx;
> > +extern const struct raid6_calls raid6_rvvx1;
> > +extern const struct raid6_calls raid6_rvvx2;
> > +extern const struct raid6_calls raid6_rvvx4;
> > +extern const struct raid6_calls raid6_rvvx8;
> >
> >   struct raid6_recov_calls {
> >       void (*data2)(int, size_t, int, int, void **);
> > @@ -125,6 +129,7 @@ extern const struct raid6_recov_calls raid6_recov_s390xc;
> >   extern const struct raid6_recov_calls raid6_recov_neon;
> >   extern const struct raid6_recov_calls raid6_recov_lsx;
> >   extern const struct raid6_recov_calls raid6_recov_lasx;
> > +extern const struct raid6_recov_calls raid6_recov_rvv;
> >
> >   extern const struct raid6_calls raid6_neonx1;
> >   extern const struct raid6_calls raid6_neonx2;
> > diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
> > index 29127dd05d63..5be0a4e60ab1 100644
> > --- a/lib/raid6/Makefile
> > +++ b/lib/raid6/Makefile
> > @@ -10,6 +10,7 @@ raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o \
> >   raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o
> >   raid6_pq-$(CONFIG_S390) += s390vx8.o recov_s390xc.o
> >   raid6_pq-$(CONFIG_LOONGARCH) += loongarch_simd.o recov_loongarch_simd.o
> > +raid6_pq-$(CONFIG_RISCV_ISA_V) += rvv.o recov_rvv.o
> >
> >   hostprogs   += mktables
> >
> > diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
> > index cd2e88ee1f14..99980ff5b985 100644
> > --- a/lib/raid6/algos.c
> > +++ b/lib/raid6/algos.c
> > @@ -80,6 +80,12 @@ const struct raid6_calls * const raid6_algos[] = {
> >   #ifdef CONFIG_CPU_HAS_LSX
> >       &raid6_lsx,
> >   #endif
> > +#endif
> > +#ifdef CONFIG_RISCV_ISA_V
> > +     &raid6_rvvx1,
> > +     &raid6_rvvx2,
> > +     &raid6_rvvx4,
> > +     &raid6_rvvx8,
> >   #endif
> >       &raid6_intx8,
> >       &raid6_intx4,
> > @@ -115,6 +121,9 @@ const struct raid6_recov_calls *const raid6_recov_algos[] = {
> >   #ifdef CONFIG_CPU_HAS_LSX
> >       &raid6_recov_lsx,
> >   #endif
> > +#endif
> > +#ifdef CONFIG_RISCV_ISA_V
> > +     &raid6_recov_rvv,
> >   #endif
> >       &raid6_recov_intx1,
> >       NULL
> > diff --git a/lib/raid6/recov_rvv.c b/lib/raid6/recov_rvv.c
> > new file mode 100644
> > index 000000000000..db271d2987c6
> > --- /dev/null
> > +++ b/lib/raid6/recov_rvv.c
> > @@ -0,0 +1,234 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/*
> > + * Copyright 2024 Institute of Software, CAS.
>
> Remove the dot/period at the end?

Ok.

>
> > + * Author: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
> > + */
> > +
> > +#include <asm/simd.h>
> > +#include <asm/vector.h>
> > +#include <crypto/internal/simd.h>
> > +#include <linux/raid/pq.h>
> > +
> > +static int rvv_has_vector(void)
> > +{
> > +     return has_vector();
> > +}
> > +
> > +static void __raid6_2data_recov_rvv(int bytes, u8 *p, u8 *q, u8 *dp,
> > +                           u8 *dq, const u8 *pbmul,
> > +                           const u8 *qmul)
> > +{
> > +     asm volatile (
> > +             ".option        push\n"
> > +             ".option        arch,+v\n"
> > +             "vsetvli        x0, %[avl], e8, m1, ta, ma\n"
> > +             ".option        pop\n"
> > +             : :
> > +             [avl]"r"(16)
> > +     );
> > +
> > +     /*
> > +      * while ( bytes-- ) {
> > +      *      uint8_t px, qx, db;
> > +      *
> > +      *      px        = *p ^ *dp;
> > +      *      qx        = qmul[*q ^ *dq];
> > +      *      *dq++ = db = pbmul[px] ^ qx;
> > +      *      *dp++ = db ^ px;
> > +      *      p++; q++;
> > +      * }
> > +      */
> > +     while (bytes) {
> > +             /*
> > +              * v0:px, v1:dp,
> > +              * v2:qx, v3:dq,
> > +              * v4:vx, v5:vy,
> > +              * v6:qm0, v7:qm1,
> > +              * v8:pm0, v9:pm1,
> > +              * v14:p/qm[vx], v15:p/qm[vy]
> > +              */
> > +             asm volatile (
> > +                     ".option        push\n"
> > +                     ".option        arch,+v\n"
> > +                     "vle8.v         v0, (%[px])\n"
> > +                     "vle8.v         v1, (%[dp])\n"
> > +                     "vxor.vv        v0, v0, v1\n"
> > +                     "vle8.v         v2, (%[qx])\n"
> > +                     "vle8.v         v3, (%[dq])\n"
> > +                     "vxor.vv        v4, v2, v3\n"
> > +                     "vsrl.vi        v5, v4, 4\n"
> > +                     "vand.vi        v4, v4, 0xf\n"
> > +                     "vle8.v         v6, (%[qm0])\n"
> > +                     "vle8.v         v7, (%[qm1])\n"
> > +                     "vrgather.vv    v14, v6, v4\n" /* v14 = qm[vx] */
> > +                     "vrgather.vv    v15, v7, v5\n" /* v15 = qm[vy] */
> > +                     "vxor.vv        v2, v14, v15\n" /* v2 = qmul[*q ^ *dq] */
> > +
> > +                     "vsrl.vi        v5, v0, 4\n"
> > +                     "vand.vi        v4, v0, 0xf\n"
> > +                     "vle8.v         v8, (%[pm0])\n"
> > +                     "vle8.v         v9, (%[pm1])\n"
> > +                     "vrgather.vv    v14, v8, v4\n" /* v14 = pm[vx] */
> > +                     "vrgather.vv    v15, v9, v5\n" /* v15 = pm[vy] */
> > +                     "vxor.vv        v4, v14, v15\n" /* v4 = pbmul[px] */
> > +                     "vxor.vv        v3, v4, v2\n" /* v3 = db = pbmul[px] ^ qx */
> > +                     "vxor.vv        v1, v3, v0\n" /* v1 = db ^ px; */
> > +                     "vse8.v         v3, (%[dq])\n"
> > +                     "vse8.v         v1, (%[dp])\n"
> > +                     ".option        pop\n"
> > +                     : :
> > +                     [px]"r"(p),
> > +                     [dp]"r"(dp),
> > +                     [qx]"r"(q),
> > +                     [dq]"r"(dq),
> > +                     [qm0]"r"(qmul),
> > +                     [qm1]"r"(qmul + 16),
> > +                     [pm0]"r"(pbmul),
> > +                     [pm1]"r"(pbmul + 16)
> > +                     :);
> > +
> > +             bytes -= 16;
> > +             p += 16;
> > +             q += 16;
> > +             dp += 16;
> > +             dq += 16;
> > +     }
> > +}
> > +
> > +static void __raid6_datap_recov_rvv(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq,
> > +                           const uint8_t *qmul)
> > +{
> > +     asm volatile (
> > +             ".option        push\n"
> > +             ".option        arch,+v\n"
> > +             "vsetvli        x0, %[avl], e8, m1, ta, ma\n"
> > +             ".option        pop\n"
> > +             : :
> > +             [avl]"r"(16)
> > +     );
> > +
> > +     /*
> > +      * while (bytes--) {
> > +      *  *p++ ^= *dq = qmul[*q ^ *dq];
> > +      *  q++; dq++;
> > +      * }
> > +      */
> > +     while (bytes) {
> > +             /*
> > +              * v0:vx, v1:vy,
> > +              * v2:dq, v3:p,
> > +              * v4:qm0, v5:qm1,
> > +              * v10:m[vx], v11:m[vy]
> > +              */
> > +             asm volatile (
> > +                     ".option        push\n"
> > +                     ".option        arch,+v\n"
> > +                     "vle8.v         v0, (%[vx])\n"
> > +                     "vle8.v         v2, (%[dq])\n"
> > +                     "vxor.vv        v0, v0, v2\n"
> > +                     "vsrl.vi        v1, v0, 4\n"
> > +                     "vand.vi        v0, v0, 0xf\n"
> > +                     "vle8.v         v4, (%[qm0])\n"
> > +                     "vle8.v         v5, (%[qm1])\n"
> > +                     "vrgather.vv    v10, v4, v0\n"
> > +                     "vrgather.vv    v11, v5, v1\n"
> > +                     "vxor.vv        v0, v10, v11\n"
> > +                     "vle8.v         v1, (%[vy])\n"
> > +                     "vxor.vv        v1, v0, v1\n"
> > +                     "vse8.v         v0, (%[dq])\n"
> > +                     "vse8.v         v1, (%[vy])\n"
> > +                     ".option        pop\n"
> > +                     : :
> > +                     [vx]"r"(q),
> > +                     [vy]"r"(p),
> > +                     [dq]"r"(dq),
> > +                     [qm0]"r"(qmul),
> > +                     [qm1]"r"(qmul + 16)
> > +                     :);
> > +
> > +             bytes -= 16;
> > +             p += 16;
> > +             q += 16;
> > +             dq += 16;
> > +     }
> > +}
> > +
> > +static void raid6_2data_recov_rvv(int disks, size_t bytes, int faila,
> > +             int failb, void **ptrs)
> > +{
> > +     u8 *p, *q, *dp, *dq;
> > +     const u8 *pbmul;        /* P multiplier table for B data */
> > +     const u8 *qmul;         /* Q multiplier table (for both) */
> > +
> > +     p = (u8 *)ptrs[disks - 2];
> > +     q = (u8 *)ptrs[disks - 1];
> > +
> > +     /*
> > +      * Compute syndrome with zero for the missing data pages
> > +      * Use the dead data pages as temporary storage for
> > +      * delta p and delta q
> > +      */
> > +     dp = (u8 *)ptrs[faila];
> > +     ptrs[faila] = (void *)raid6_empty_zero_page;
> > +     ptrs[disks - 2] = dp;
> > +     dq = (u8 *)ptrs[failb];
> > +     ptrs[failb] = (void *)raid6_empty_zero_page;
> > +     ptrs[disks - 1] = dq;
> > +
> > +     raid6_call.gen_syndrome(disks, bytes, ptrs);
> > +
> > +     /* Restore pointer table */
> > +     ptrs[faila]     = dp;
> > +     ptrs[failb]     = dq;
> > +     ptrs[disks - 2] = p;
> > +     ptrs[disks - 1] = q;
> > +
> > +     /* Now, pick the proper data tables */
> > +     pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]];
> > +     qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^
> > +                                      raid6_gfexp[failb]]];
> > +
> > +     kernel_vector_begin();
> > +     __raid6_2data_recov_rvv(bytes, p, q, dp, dq, pbmul, qmul);
> > +     kernel_vector_end();
> > +}
> > +
> > +static void raid6_datap_recov_rvv(int disks, size_t bytes, int faila,
> > +             void **ptrs)
> > +{
> > +     u8 *p, *q, *dq;
> > +     const u8 *qmul;         /* Q multiplier table */
> > +
> > +     p = (u8 *)ptrs[disks - 2];
> > +     q = (u8 *)ptrs[disks - 1];
> > +
> > +     /*
> > +      * Compute syndrome with zero for the missing data page
> > +      * Use the dead data page as temporary storage for delta q
> > +      */
> > +     dq = (u8 *)ptrs[faila];
> > +     ptrs[faila] = (void *)raid6_empty_zero_page;
> > +     ptrs[disks - 1] = dq;
> > +
> > +     raid6_call.gen_syndrome(disks, bytes, ptrs);
> > +
> > +     /* Restore pointer table */
> > +     ptrs[faila]     = dq;
> > +     ptrs[disks - 1] = q;
> > +
> > +     /* Now, pick the proper data tables */
> > +     qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
> > +
> > +     kernel_vector_begin();
> > +     __raid6_datap_recov_rvv(bytes, p, q, dq, qmul);
> > +     kernel_vector_end();
> > +}
> > +
> > +const struct raid6_recov_calls raid6_recov_rvv = {
> > +     .data2          = raid6_2data_recov_rvv,
> > +     .datap          = raid6_datap_recov_rvv,
> > +     .valid          = rvv_has_vector,
> > +     .name           = "rvv",
> > +     .priority       = 1,
> > +};
> > diff --git a/lib/raid6/rvv.c b/lib/raid6/rvv.c
> > new file mode 100644
> > index 000000000000..fd0ec33edb1e
> > --- /dev/null
> > +++ b/lib/raid6/rvv.c
> > @@ -0,0 +1,1269 @@
> > +// SPDX-License-Identifier: GPL-2.0-or-later
> > +/*
> > + * RAID-6 syndrome calculation using RISC-V vector instructions
> > + *
> > + * Copyright 2024 Institute of Software, CAS.
>
> Remove the dot/period at the end?

Ok.

>
> > + * Author: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
> > + *
> > + * Based on neon.uc:
> > + *   Copyright 2002-2004 H. Peter Anvin
> > + */
> > +
> > +#include <asm/simd.h>
> > +#include <asm/vector.h>
> > +#include <crypto/internal/simd.h>
> > +#include <linux/raid/pq.h>
> > +#include <linux/types.h>
> > +
> > +#define NSIZE        (riscv_v_vsize / 32) /* NSIZE = vlenb */
> > +
> > +static int rvv_has_vector(void)
> > +{
> > +     return has_vector();
> > +}
> > +
> > +static void raid6_rvv1_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
> > +{
> > +     u8 **dptr = (u8 **)ptrs;
> > +     int d, z, z0;
> > +     u8 *p, *q;
> > +
> > +     z0 = disks - 3;         /* Highest data disk */
> > +     p = dptr[z0+1];         /* XOR parity */
> > +     q = dptr[z0+2];         /* RS syndrome */
> > +
> > +     asm volatile (
> > +             ".option        push\n"
> > +             ".option        arch,+v\n"
> > +             "vsetvli        t0, x0, e8, m1, ta, ma\n"
> > +             ".option        pop\n"
> > +     );
> > +
> > +      /* v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 */
> > +     for (d = 0 ; d < bytes ; d += NSIZE*1) {
> > +             /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
> > +             asm volatile (
> > +                     ".option        push\n"
> > +                     ".option        arch,+v\n"
> > +                     "vle8.v         v0, (%[wp0])\n"
> > +                     "vle8.v         v1, (%[wp0])\n"
> > +                     ".option        pop\n"
> > +                     : :
> > +                     [wp0]"r"(&dptr[z0][d+0*NSIZE])
> > +             );
> > +
> > +             for (z = z0-1 ; z >= 0 ; z--) {
> > +                     /*
> > +                      * w2$$ = MASK(wq$$);
> > +                      * w1$$ = SHLBYTE(wq$$);
> > +                      * w2$$ &= NBYTES(0x1d);
> > +                      * w1$$ ^= w2$$;
> > +                      * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
> > +                      * wq$$ = w1$$ ^ wd$$;
> > +                      * wp$$ ^= wd$$;
> > +                      */
> > +                     asm volatile (
> > +                             ".option        push\n"
> > +                             ".option        arch,+v\n"
> > +                             "vsra.vi        v2, v1, 7\n"
> > +                             "vsll.vi        v3, v1, 1\n"
> > +                             "vand.vx        v2, v2, %[x1d]\n"
> > +                             "vxor.vv        v3, v3, v2\n"
> > +                             "vle8.v         v2, (%[wd0])\n"
> > +                             "vxor.vv        v1, v3, v2\n"
> > +                             "vxor.vv        v0, v0, v2\n"
> > +                             ".option        pop\n"
> > +                             : :
> > +                             [wd0]"r"(&dptr[z][d+0*NSIZE]),
> > +                             [x1d]"r"(0x1d)
> > +                     );
> > +             }
> > +
> > +             /*
> > +              * *(unative_t *)&p[d+NSIZE*$$] = wp$$;
> > +              * *(unative_t *)&q[d+NSIZE*$$] = wq$$;
> > +              */
> > +             asm volatile (
> > +                     ".option        push\n"
> > +                     ".option        arch,+v\n"
> > +                     "vse8.v         v0, (%[wp0])\n"
> > +                     "vse8.v         v1, (%[wq0])\n"
> > +                     ".option        pop\n"
> > +                     : :
> > +                     [wp0]"r"(&p[d+NSIZE*0]),
> > +                     [wq0]"r"(&q[d+NSIZE*0])
> > +             );
> > +     }
> > +}
> > +
> > +static void raid6_rvv1_xor_syndrome_real(int disks, int start, int stop,
> > +                                 unsigned long bytes, void **ptrs)
> > +{
> > +     u8 **dptr = (u8 **)ptrs;
> > +     u8 *p, *q;
> > +     int d, z, z0;
> > +
> > +     z0 = stop;              /* P/Q right side optimization */
> > +     p = dptr[disks-2];      /* XOR parity */
> > +     q = dptr[disks-1];      /* RS syndrome */
> > +
> > +     asm volatile (
> > +             ".option        push\n"
> > +             ".option        arch,+v\n"
> > +             "vsetvli        t0, x0, e8, m1, ta, ma\n"
> > +             ".option        pop\n"
> > +     );
> > +
> > +     /* v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 */
> > +     for (d = 0 ; d < bytes ; d += NSIZE*1) {
> > +             /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
> > +             asm volatile (
> > +                     ".option        push\n"
> > +                     ".option        arch,+v\n"
> > +                     "vle8.v         v0, (%[wp0])\n"
> > +                     "vle8.v         v1, (%[wp0])\n"
> > +                     ".option        pop\n"
> > +                     : :
> > +                     [wp0]"r"(&dptr[z0][d+0*NSIZE])
> > +             );
> > +
> > +             /* P/Q data pages */
> > +             for (z = z0-1 ; z >= start ; z--) {
> > +                     /*
> > +                      * w2$$ = MASK(wq$$);
> > +                      * w1$$ = SHLBYTE(wq$$);
> > +                      * w2$$ &= NBYTES(0x1d);
> > +                      * w1$$ ^= w2$$;
> > +                      * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
> > +                      * wq$$ = w1$$ ^ wd$$;
> > +                      * wp$$ ^= wd$$;
> > +                      */
> > +                     asm volatile (
> > +                             ".option        push\n"
> > +                             ".option        arch,+v\n"
> > +                             "vsra.vi        v2, v1, 7\n"
> > +                             "vsll.vi        v3, v1, 1\n"
> > +                             "vand.vx        v2, v2, %[x1d]\n"
> > +                             "vxor.vv        v3, v3, v2\n"
> > +                             "vle8.v         v2, (%[wd0])\n"
> > +                             "vxor.vv        v1, v3, v2\n"
> > +                             "vxor.vv        v0, v0, v2\n"
> > +                             ".option        pop\n"
> > +                             : :
> > +                             [wd0]"r"(&dptr[z][d+0*NSIZE]),
> > +                             [x1d]"r"(0x1d)
> > +                     );
> > +             }
> > +
> > +             /* P/Q left side optimization */
> > +             for (z = start-1 ; z >= 0 ; z--) {
> > +                     /*
> > +                      * w2$$ = MASK(wq$$);
> > +                      * w1$$ = SHLBYTE(wq$$);
> > +                      * w2$$ &= NBYTES(0x1d);
> > +                      * wq$$ = w1$$ ^ w2$$;
> > +                      */
> > +                     asm volatile (
> > +                             ".option        push\n"
> > +                             ".option        arch,+v\n"
> > +                             "vsra.vi        v2, v1, 7\n"
> > +                             "vsll.vi        v3, v1, 1\n"
> > +                             "vand.vx        v2, v2, %[x1d]\n"
> > +                             "vxor.vv        v1, v3, v2\n"
> > +                             ".option        pop\n"
> > +                             : :
> > +                             [x1d]"r"(0x1d)
> > +                     );
> > +             }
> > +
> > +             /*
> > +              * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
> > +              * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
> > +              * v0:wp0, v1:wq0, v2:p0, v3:q0
> > +              */
> > +             asm volatile (
> > +                     ".option        push\n"
> > +                     ".option        arch,+v\n"
> > +                     "vle8.v         v2, (%[wp0])\n"
> > +                     "vle8.v         v3, (%[wq0])\n"
> > +                     "vxor.vv        v2, v2, v0\n"
> > +                     "vxor.vv        v3, v3, v1\n"
> > +                     "vse8.v         v2, (%[wp0])\n"
> > +                     "vse8.v         v3, (%[wq0])\n"
> > +                     ".option        pop\n"
> > +                     : :
> > +                     [wp0]"r"(&p[d+NSIZE*0]),
> > +                     [wq0]"r"(&q[d+NSIZE*0])
> > +             );
> > +     }
> > +}
> > +
> > +static void raid6_rvv2_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
> > +{
> > +     u8 **dptr = (u8 **)ptrs;
> > +     int d, z, z0;
> > +     u8 *p, *q;
> > +
> > +     z0 = disks - 3;         /* Highest data disk */
> > +     p = dptr[z0+1];         /* XOR parity */
> > +     q = dptr[z0+2];         /* RS syndrome */
> > +
> > +     asm volatile (
> > +             ".option        push\n"
> > +             ".option        arch,+v\n"
> > +             "vsetvli        t0, x0, e8, m1, ta, ma\n"
> > +             ".option        pop\n"
> > +     );
> > +
> > +     /*
> > +      * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
> > +      * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
> > +      */
> > +     for (d = 0 ; d < bytes ; d += NSIZE*2) {
>
> *bytes* is unsigned long, but d is int. Should they match? (Also above
> and below.)

Ok, will fix them.
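
Something like the following untested sketch is what the fix amounts to:
the counter type simply matches 'bytes', so the "d < bytes" comparison no
longer mixes signed and unsigned operands (raid6_walk_example() is a
made-up name used only to illustrate the pattern, it is not code from
this patch):

	/* Loop counter type matches the unsigned 'bytes' bound. */
	static void raid6_walk_example(unsigned long bytes, unsigned long nsize)
	{
		unsigned long d;

		for (d = 0; d < bytes; d += nsize)
			; /* per-chunk vector work would go here */
	}

In the real functions only the local declaration of 'd' changes from int
to unsigned long; z and z0 can stay int.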

Thanks,
Chunyan

>
>
> Kind regards,
>
> Paul
>
>
> > +             /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
> > +             asm volatile (
> > +                     ".option        push\n"
> > +                     ".option        arch,+v\n"
> > +                     "vle8.v         v0, (%[wp0])\n"
> > +                     "vle8.v         v1, (%[wp0])\n"
> > +                     "vle8.v         v4, (%[wp1])\n"
> > +                     "vle8.v         v5, (%[wp1])\n"
> > +                     ".option        pop\n"
> > +                     : :
> > +                     [wp0]"r"(&dptr[z0][d+0*NSIZE]),
> > +                     [wp1]"r"(&dptr[z0][d+1*NSIZE])
> > +             );
> > +
> > +             for (z = z0-1 ; z >= 0 ; z--) {
> > +                     /*
> > +                      * w2$$ = MASK(wq$$);
> > +                      * w1$$ = SHLBYTE(wq$$);
> > +                      * w2$$ &= NBYTES(0x1d);
> > +                      * w1$$ ^= w2$$;
> > +                      * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
> > +                      * wq$$ = w1$$ ^ wd$$;
> > +                      * wp$$ ^= wd$$;
> > +                      */
> > +                     asm volatile (
> > +                             ".option        push\n"
> > +                             ".option        arch,+v\n"
> > +                             "vsra.vi        v2, v1, 7\n"
> > +                             "vsll.vi        v3, v1, 1\n"
> > +                             "vand.vx        v2, v2, %[x1d]\n"
> > +                             "vxor.vv        v3, v3, v2\n"
> > +                             "vle8.v         v2, (%[wd0])\n"
> > +                             "vxor.vv        v1, v3, v2\n"
> > +                             "vxor.vv        v0, v0, v2\n"
> > +
> > +                             "vsra.vi        v6, v5, 7\n"
> > +                             "vsll.vi        v7, v5, 1\n"
> > +                             "vand.vx        v6, v6, %[x1d]\n"
> > +                             "vxor.vv        v7, v7, v6\n"
> > +                             "vle8.v         v6, (%[wd1])\n"
> > +                             "vxor.vv        v5, v7, v6\n"
> > +                             "vxor.vv        v4, v4, v6\n"
> > +                             ".option        pop\n"
> > +                             : :
> > +                             [wd0]"r"(&dptr[z][d+0*NSIZE]),
> > +                             [wd1]"r"(&dptr[z][d+1*NSIZE]),
> > +                             [x1d]"r"(0x1d)
> > +                     );
> > +             }
> > +
> > +             /*
> > +              * *(unative_t *)&p[d+NSIZE*$$] = wp$$;
> > +              * *(unative_t *)&q[d+NSIZE*$$] = wq$$;
> > +              */
> > +             asm volatile (
> > +                     ".option        push\n"
> > +                     ".option        arch,+v\n"
> > +                     "vse8.v         v0, (%[wp0])\n"
> > +                     "vse8.v         v1, (%[wq0])\n"
> > +                     "vse8.v         v4, (%[wp1])\n"
> > +                     "vse8.v         v5, (%[wq1])\n"
> > +                     ".option        pop\n"
> > +                     : :
> > +                     [wp0]"r"(&p[d+NSIZE*0]),
> > +                     [wq0]"r"(&q[d+NSIZE*0]),
> > +                     [wp1]"r"(&p[d+NSIZE*1]),
> > +                     [wq1]"r"(&q[d+NSIZE*1])
> > +             );
> > +     }
> > +}
> > +
> > +static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop,
> > +                                      unsigned long bytes, void **ptrs)
> > +{
> > +     u8 **dptr = (u8 **)ptrs;
> > +     u8 *p, *q;
> > +     int d, z, z0;
> > +
> > +     z0 = stop;              /* P/Q right side optimization */
> > +     p = dptr[disks-2];      /* XOR parity */
> > +     q = dptr[disks-1];      /* RS syndrome */
> > +
> > +     asm volatile (
> > +             ".option        push\n"
> > +             ".option        arch,+v\n"
> > +             "vsetvli        t0, x0, e8, m1, ta, ma\n"
> > +             ".option        pop\n"
> > +     );
> > +
> > +     /*
> > +      * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
> > +      * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
> > +      */
> > +     for (d = 0 ; d < bytes ; d += NSIZE*2) {
> > +              /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
> > +             asm volatile (
> > +                     ".option        push\n"
> > +                     ".option        arch,+v\n"
> > +                     "vle8.v         v0, (%[wp0])\n"
> > +                     "vle8.v         v1, (%[wp0])\n"
> > +                     "vle8.v         v4, (%[wp1])\n"
> > +                     "vle8.v         v5, (%[wp1])\n"
> > +                     ".option        pop\n"
> > +                     : :
> > +                     [wp0]"r"(&dptr[z0][d+0*NSIZE]),
> > +                     [wp1]"r"(&dptr[z0][d+1*NSIZE])
> > +             );
> > +
> > +             /* P/Q data pages */
> > +             for (z = z0-1 ; z >= start ; z--) {
> > +                     /*
> > +                      * w2$$ = MASK(wq$$);
> > +                      * w1$$ = SHLBYTE(wq$$);
> > +                      * w2$$ &= NBYTES(0x1d);
> > +                      * w1$$ ^= w2$$;
> > +                      * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
> > +                      * wq$$ = w1$$ ^ wd$$;
> > +                      * wp$$ ^= wd$$;
> > +                      */
> > +                     asm volatile (
> > +                             ".option        push\n"
> > +                             ".option        arch,+v\n"
> > +                             "vsra.vi        v2, v1, 7\n"
> > +                             "vsll.vi        v3, v1, 1\n"
> > +                             "vand.vx        v2, v2, %[x1d]\n"
> > +                             "vxor.vv        v3, v3, v2\n"
> > +                             "vle8.v         v2, (%[wd0])\n"
> > +                             "vxor.vv        v1, v3, v2\n"
> > +                             "vxor.vv        v0, v0, v2\n"
> > +
> > +                             "vsra.vi        v6, v5, 7\n"
> > +                             "vsll.vi        v7, v5, 1\n"
> > +                             "vand.vx        v6, v6, %[x1d]\n"
> > +                             "vxor.vv        v7, v7, v6\n"
> > +                             "vle8.v         v6, (%[wd1])\n"
> > +                             "vxor.vv        v5, v7, v6\n"
> > +                             "vxor.vv        v4, v4, v6\n"
> > +                             ".option        pop\n"
> > +                             : :
> > +                             [wd0]"r"(&dptr[z][d+0*NSIZE]),
> > +                             [wd1]"r"(&dptr[z][d+1*NSIZE]),
> > +                             [x1d]"r"(0x1d)
> > +                     );
> > +             }
> > +
> > +             /* P/Q left side optimization */
> > +             for (z = start-1 ; z >= 0 ; z--) {
> > +                     /*
> > +                      * w2$$ = MASK(wq$$);
> > +                      * w1$$ = SHLBYTE(wq$$);
> > +                      * w2$$ &= NBYTES(0x1d);
> > +                      * wq$$ = w1$$ ^ w2$$;
> > +                      */
> > +                     asm volatile (
> > +                             ".option        push\n"
> > +                             ".option        arch,+v\n"
> > +                             "vsra.vi        v2, v1, 7\n"
> > +                             "vsll.vi        v3, v1, 1\n"
> > +                             "vand.vx        v2, v2, %[x1d]\n"
> > +                             "vxor.vv        v1, v3, v2\n"
> > +
> > +                             "vsra.vi        v6, v5, 7\n"
> > +                             "vsll.vi        v7, v5, 1\n"
> > +                             "vand.vx        v6, v6, %[x1d]\n"
> > +                             "vxor.vv        v5, v7, v6\n"
> > +                             ".option        pop\n"
> > +                             : :
> > +                             [x1d]"r"(0x1d)
> > +                     );
> > +             }
> > +
> > +             /*
> > +              * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
> > +              * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
> > +              * v0:wp0, v1:wq0, v2:p0, v3:q0
> > +              * v4:wp1, v5:wq1, v6:p1, v7:q1
> > +              */
> > +             asm volatile (
> > +                     ".option        push\n"
> > +                     ".option        arch,+v\n"
> > +                     "vle8.v         v2, (%[wp0])\n"
> > +                     "vle8.v         v3, (%[wq0])\n"
> > +                     "vxor.vv        v2, v2, v0\n"
> > +                     "vxor.vv        v3, v3, v1\n"
> > +                     "vse8.v         v2, (%[wp0])\n"
> > +                     "vse8.v         v3, (%[wq0])\n"
> > +
> > +                     "vle8.v         v6, (%[wp1])\n"
> > +                     "vle8.v         v7, (%[wq1])\n"
> > +                     "vxor.vv        v6, v6, v4\n"
> > +                     "vxor.vv        v7, v7, v5\n"
> > +                     "vse8.v         v6, (%[wp1])\n"
> > +                     "vse8.v         v7, (%[wq1])\n"
> > +                     ".option        pop\n"
> > +                     : :
> > +                     [wp0]"r"(&p[d+NSIZE*0]),
> > +                     [wq0]"r"(&q[d+NSIZE*0]),
> > +                     [wp1]"r"(&p[d+NSIZE*1]),
> > +                     [wq1]"r"(&q[d+NSIZE*1])
> > +             );
> > +     }
> > +}
> > +
> > +static void raid6_rvv4_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
> > +{
> > +     u8 **dptr = (u8 **)ptrs;
> > +     int d, z, z0;
> > +     u8 *p, *q;
> > +
> > +     z0 = disks - 3; /* Highest data disk */
> > +     p = dptr[z0+1]; /* XOR parity */
> > +     q = dptr[z0+2]; /* RS syndrome */
> > +
> > +     asm volatile (
> > +             ".option        push\n"
> > +             ".option        arch,+v\n"
> > +             "vsetvli        t0, x0, e8, m1, ta, ma\n"
> > +             ".option        pop\n"
> > +     );
> > +
> > +     /*
> > +      * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
> > +      * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
> > +      * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12
> > +      * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13
> > +      */
> > +     for (d = 0 ; d < bytes ; d += NSIZE*4) {
> > +             /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
> > +             asm volatile (
> > +                     ".option        push\n"
> > +                     ".option        arch,+v\n"
> > +                     "vle8.v         v0, (%[wp0])\n"
> > +                     "vle8.v         v1, (%[wp0])\n"
> > +                     "vle8.v         v4, (%[wp1])\n"
> > +                     "vle8.v         v5, (%[wp1])\n"
> > +                     "vle8.v         v8, (%[wp2])\n"
> > +                     "vle8.v         v9, (%[wp2])\n"
> > +                     "vle8.v         v12, (%[wp3])\n"
> > +                     "vle8.v         v13, (%[wp3])\n"
> > +                     ".option        pop\n"
> > +                     : :
> > +                     [wp0]"r"(&dptr[z0][d+0*NSIZE]),
> > +                     [wp1]"r"(&dptr[z0][d+1*NSIZE]),
> > +                     [wp2]"r"(&dptr[z0][d+2*NSIZE]),
> > +                     [wp3]"r"(&dptr[z0][d+3*NSIZE])
> > +             );
> > +
> > +             for (z = z0-1 ; z >= 0 ; z--) {
> > +                     /*
> > +                      * w2$$ = MASK(wq$$);
> > +                      * w1$$ = SHLBYTE(wq$$);
> > +                      * w2$$ &= NBYTES(0x1d);
> > +                      * w1$$ ^= w2$$;
> > +                      * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
> > +                      * wq$$ = w1$$ ^ wd$$;
> > +                      * wp$$ ^= wd$$;
> > +                      */
> > +                     asm volatile (
> > +                             ".option        push\n"
> > +                             ".option        arch,+v\n"
> > +                             "vsra.vi        v2, v1, 7\n"
> > +                             "vsll.vi        v3, v1, 1\n"
> > +                             "vand.vx        v2, v2, %[x1d]\n"
> > +                             "vxor.vv        v3, v3, v2\n"
> > +                             "vle8.v         v2, (%[wd0])\n"
> > +                             "vxor.vv        v1, v3, v2\n"
> > +                             "vxor.vv        v0, v0, v2\n"
> > +
> > +                             "vsra.vi        v6, v5, 7\n"
> > +                             "vsll.vi        v7, v5, 1\n"
> > +                             "vand.vx        v6, v6, %[x1d]\n"
> > +                             "vxor.vv        v7, v7, v6\n"
> > +                             "vle8.v         v6, (%[wd1])\n"
> > +                             "vxor.vv        v5, v7, v6\n"
> > +                             "vxor.vv        v4, v4, v6\n"
> > +
> > +                             "vsra.vi        v10, v9, 7\n"
> > +                             "vsll.vi        v11, v9, 1\n"
> > +                             "vand.vx        v10, v10, %[x1d]\n"
> > +                             "vxor.vv        v11, v11, v10\n"
> > +                             "vle8.v         v10, (%[wd2])\n"
> > +                             "vxor.vv        v9, v11, v10\n"
> > +                             "vxor.vv        v8, v8, v10\n"
> > +
> > +                             "vsra.vi        v14, v13, 7\n"
> > +                             "vsll.vi        v15, v13, 1\n"
> > +                             "vand.vx        v14, v14, %[x1d]\n"
> > +                             "vxor.vv        v15, v15, v14\n"
> > +                             "vle8.v         v14, (%[wd3])\n"
> > +                             "vxor.vv        v13, v15, v14\n"
> > +                             "vxor.vv        v12, v12, v14\n"
> > +                             ".option        pop\n"
> > +                             : :
> > +                             [wd0]"r"(&dptr[z][d+0*NSIZE]),
> > +                             [wd1]"r"(&dptr[z][d+1*NSIZE]),
> > +                             [wd2]"r"(&dptr[z][d+2*NSIZE]),
> > +                             [wd3]"r"(&dptr[z][d+3*NSIZE]),
> > +                             [x1d]"r"(0x1d)
> > +                     );
> > +             }
> > +
> > +             /*
> > +              * *(unative_t *)&p[d+NSIZE*$$] = wp$$;
> > +              * *(unative_t *)&q[d+NSIZE*$$] = wq$$;
> > +              */
> > +             asm volatile (
> > +                     ".option        push\n"
> > +                     ".option        arch,+v\n"
> > +                     "vse8.v         v0, (%[wp0])\n"
> > +                     "vse8.v         v1, (%[wq0])\n"
> > +                     "vse8.v         v4, (%[wp1])\n"
> > +                     "vse8.v         v5, (%[wq1])\n"
> > +                     "vse8.v         v8, (%[wp2])\n"
> > +                     "vse8.v         v9, (%[wq2])\n"
> > +                     "vse8.v         v12, (%[wp3])\n"
> > +                     "vse8.v         v13, (%[wq3])\n"
> > +                     ".option        pop\n"
> > +                     : :
> > +                     [wp0]"r"(&p[d+NSIZE*0]),
> > +                     [wq0]"r"(&q[d+NSIZE*0]),
> > +                     [wp1]"r"(&p[d+NSIZE*1]),
> > +                     [wq1]"r"(&q[d+NSIZE*1]),
> > +                     [wp2]"r"(&p[d+NSIZE*2]),
> > +                     [wq2]"r"(&q[d+NSIZE*2]),
> > +                     [wp3]"r"(&p[d+NSIZE*3]),
> > +                     [wq3]"r"(&q[d+NSIZE*3])
> > +             );
> > +     }
> > +}
> > +
> > +static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop,
> > +                                     unsigned long bytes, void **ptrs)
> > +{
> > +     u8 **dptr = (u8 **)ptrs;
> > +     u8 *p, *q;
> > +     int d, z, z0;
> > +
> > +     z0 = stop;              /* P/Q right side optimization */
> > +     p = dptr[disks-2];      /* XOR parity */
> > +     q = dptr[disks-1];      /* RS syndrome */
> > +
> > +     asm volatile (
> > +             ".option        push\n"
> > +             ".option        arch,+v\n"
> > +             "vsetvli        t0, x0, e8, m1, ta, ma\n"
> > +             ".option        pop\n"
> > +     );
> > +
> > +     /*
> > +      * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
> > +      * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
> > +      * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12
> > +      * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13
> > +      */
> > +     for (d = 0 ; d < bytes ; d += NSIZE*4) {
> > +              /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
> > +             asm volatile (
> > +                     ".option        push\n"
> > +                     ".option        arch,+v\n"
> > +                     "vle8.v         v0, (%[wp0])\n"
> > +                     "vle8.v         v1, (%[wp0])\n"
> > +                     "vle8.v         v4, (%[wp1])\n"
> > +                     "vle8.v         v5, (%[wp1])\n"
> > +                     "vle8.v         v8, (%[wp2])\n"
> > +                     "vle8.v         v9, (%[wp2])\n"
> > +                     "vle8.v         v12, (%[wp3])\n"
> > +                     "vle8.v         v13, (%[wp3])\n"
> > +                     ".option        pop\n"
> > +                     : :
> > +                     [wp0]"r"(&dptr[z0][d+0*NSIZE]),
> > +                     [wp1]"r"(&dptr[z0][d+1*NSIZE]),
> > +                     [wp2]"r"(&dptr[z0][d+2*NSIZE]),
> > +                     [wp3]"r"(&dptr[z0][d+3*NSIZE])
> > +             );
> > +
> > +             /* P/Q data pages */
> > +             for (z = z0-1 ; z >= start ; z--) {
> > +                     /*
> > +                      * w2$$ = MASK(wq$$);
> > +                      * w1$$ = SHLBYTE(wq$$);
> > +                      * w2$$ &= NBYTES(0x1d);
> > +                      * w1$$ ^= w2$$;
> > +                      * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
> > +                      * wq$$ = w1$$ ^ wd$$;
> > +                      * wp$$ ^= wd$$;
> > +                      */
> > +                     asm volatile (
> > +                             ".option        push\n"
> > +                             ".option        arch,+v\n"
> > +                             "vsra.vi        v2, v1, 7\n"
> > +                             "vsll.vi        v3, v1, 1\n"
> > +                             "vand.vx        v2, v2, %[x1d]\n"
> > +                             "vxor.vv        v3, v3, v2\n"
> > +                             "vle8.v         v2, (%[wd0])\n"
> > +                             "vxor.vv        v1, v3, v2\n"
> > +                             "vxor.vv        v0, v0, v2\n"
> > +
> > +                             "vsra.vi        v6, v5, 7\n"
> > +                             "vsll.vi        v7, v5, 1\n"
> > +                             "vand.vx        v6, v6, %[x1d]\n"
> > +                             "vxor.vv        v7, v7, v6\n"
> > +                             "vle8.v         v6, (%[wd1])\n"
> > +                             "vxor.vv        v5, v7, v6\n"
> > +                             "vxor.vv        v4, v4, v6\n"
> > +
> > +                             "vsra.vi        v10, v9, 7\n"
> > +                             "vsll.vi        v11, v9, 1\n"
> > +                             "vand.vx        v10, v10, %[x1d]\n"
> > +                             "vxor.vv        v11, v11, v10\n"
> > +                             "vle8.v         v10, (%[wd2])\n"
> > +                             "vxor.vv        v9, v11, v10\n"
> > +                             "vxor.vv        v8, v8, v10\n"
> > +
> > +                             "vsra.vi        v14, v13, 7\n"
> > +                             "vsll.vi        v15, v13, 1\n"
> > +                             "vand.vx        v14, v14, %[x1d]\n"
> > +                             "vxor.vv        v15, v15, v14\n"
> > +                             "vle8.v         v14, (%[wd3])\n"
> > +                             "vxor.vv        v13, v15, v14\n"
> > +                             "vxor.vv        v12, v12, v14\n"
> > +                             ".option        pop\n"
> > +                             : :
> > +                             [wd0]"r"(&dptr[z][d+0*NSIZE]),
> > +                             [wd1]"r"(&dptr[z][d+1*NSIZE]),
> > +                             [wd2]"r"(&dptr[z][d+2*NSIZE]),
> > +                             [wd3]"r"(&dptr[z][d+3*NSIZE]),
> > +                             [x1d]"r"(0x1d)
> > +                     );
> > +             }
> > +
> > +             /* P/Q left side optimization */
> > +             for (z = start-1 ; z >= 0 ; z--) {
> > +                     /*
> > +                      * w2$$ = MASK(wq$$);
> > +                      * w1$$ = SHLBYTE(wq$$);
> > +                      * w2$$ &= NBYTES(0x1d);
> > +                      * wq$$ = w1$$ ^ w2$$;
> > +                      */
> > +                     asm volatile (
> > +                             ".option        push\n"
> > +                             ".option        arch,+v\n"
> > +                             "vsra.vi        v2, v1, 7\n"
> > +                             "vsll.vi        v3, v1, 1\n"
> > +                             "vand.vx        v2, v2, %[x1d]\n"
> > +                             "vxor.vv        v1, v3, v2\n"
> > +
> > +                             "vsra.vi        v6, v5, 7\n"
> > +                             "vsll.vi        v7, v5, 1\n"
> > +                             "vand.vx        v6, v6, %[x1d]\n"
> > +                             "vxor.vv        v5, v7, v6\n"
> > +
> > +                             "vsra.vi        v10, v9, 7\n"
> > +                             "vsll.vi        v11, v9, 1\n"
> > +                             "vand.vx        v10, v10, %[x1d]\n"
> > +                             "vxor.vv        v9, v11, v10\n"
> > +
> > +                             "vsra.vi        v14, v13, 7\n"
> > +                             "vsll.vi        v15, v13, 1\n"
> > +                             "vand.vx        v14, v14, %[x1d]\n"
> > +                             "vxor.vv        v13, v15, v14\n"
> > +                             ".option        pop\n"
> > +                             : :
> > +                             [x1d]"r"(0x1d)
> > +                     );
> > +             }
> > +
> > +             /*
> > +              * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
> > +              * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
> > +              * v0:wp0, v1:wq0, v2:p0, v3:q0
> > +              * v4:wp1, v5:wq1, v6:p1, v7:q1
> > +              * v8:wp2, v9:wq2, v10:p2, v11:q2
> > +              * v12:wp3, v13:wq3, v14:p3, v15:q3
> > +              */
> > +             asm volatile (
> > +                     ".option        push\n"
> > +                     ".option        arch,+v\n"
> > +                     "vle8.v         v2, (%[wp0])\n"
> > +                     "vle8.v         v3, (%[wq0])\n"
> > +                     "vxor.vv        v2, v2, v0\n"
> > +                     "vxor.vv        v3, v3, v1\n"
> > +                     "vse8.v         v2, (%[wp0])\n"
> > +                     "vse8.v         v3, (%[wq0])\n"
> > +
> > +                     "vle8.v         v6, (%[wp1])\n"
> > +                     "vle8.v         v7, (%[wq1])\n"
> > +                     "vxor.vv        v6, v6, v4\n"
> > +                     "vxor.vv        v7, v7, v5\n"
> > +                     "vse8.v         v6, (%[wp1])\n"
> > +                     "vse8.v         v7, (%[wq1])\n"
> > +
> > +                     "vle8.v         v10, (%[wp2])\n"
> > +                     "vle8.v         v11, (%[wq2])\n"
> > +                     "vxor.vv        v10, v10, v8\n"
> > +                     "vxor.vv        v11, v11, v9\n"
> > +                     "vse8.v         v10, (%[wp2])\n"
> > +                     "vse8.v         v11, (%[wq2])\n"
> > +
> > +                     "vle8.v         v14, (%[wp3])\n"
> > +                     "vle8.v         v15, (%[wq3])\n"
> > +                     "vxor.vv        v14, v14, v12\n"
> > +                     "vxor.vv        v15, v15, v13\n"
> > +                     "vse8.v         v14, (%[wp3])\n"
> > +                     "vse8.v         v15, (%[wq3])\n"
> > +                     ".option        pop\n"
> > +                     : :
> > +                     [wp0]"r"(&p[d+NSIZE*0]),
> > +                     [wq0]"r"(&q[d+NSIZE*0]),
> > +                     [wp1]"r"(&p[d+NSIZE*1]),
> > +                     [wq1]"r"(&q[d+NSIZE*1]),
> > +                     [wp2]"r"(&p[d+NSIZE*2]),
> > +                     [wq2]"r"(&q[d+NSIZE*2]),
> > +                     [wp3]"r"(&p[d+NSIZE*3]),
> > +                     [wq3]"r"(&q[d+NSIZE*3])
> > +             );
> > +     }
> > +}
> > +
> > +static void raid6_rvv8_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
> > +{
> > +     u8 **dptr = (u8 **)ptrs;
> > +     int d, z, z0;
> > +     u8 *p, *q;
> > +
> > +     z0 = disks - 3; /* Highest data disk */
> > +     p = dptr[z0+1]; /* XOR parity */
> > +     q = dptr[z0+2]; /* RS syndrome */
> > +
> > +     asm volatile (
> > +             ".option        push\n"
> > +             ".option        arch,+v\n"
> > +             "vsetvli        t0, x0, e8, m1, ta, ma\n"
> > +             ".option        pop\n"
> > +     );
> > +
> > +     /*
> > +      * v0:wp0,   v1:wq0,  v2:wd0/w20,  v3:w10
> > +      * v4:wp1,   v5:wq1,  v6:wd1/w21,  v7:w11
> > +      * v8:wp2,   v9:wq2, v10:wd2/w22, v11:w12
> > +      * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13
> > +      * v16:wp4, v17:wq4, v18:wd4/w24, v19:w14
> > +      * v20:wp5, v21:wq5, v22:wd5/w25, v23:w15
> > +      * v24:wp6, v25:wq6, v26:wd6/w26, v27:w16
> > +      * v28:wp7, v29:wq7, v30:wd7/w27, v31:w17
> > +      */
> > +     for (d = 0; d < bytes; d += NSIZE*8) {
> > +             /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
> > +             asm volatile (
> > +                     ".option        push\n"
> > +                     ".option        arch,+v\n"
> > +                     "vle8.v         v0, (%[wp0])\n"
> > +                     "vle8.v         v1, (%[wp0])\n"
> > +                     "vle8.v         v4, (%[wp1])\n"
> > +                     "vle8.v         v5, (%[wp1])\n"
> > +                     "vle8.v         v8, (%[wp2])\n"
> > +                     "vle8.v         v9, (%[wp2])\n"
> > +                     "vle8.v         v12, (%[wp3])\n"
> > +                     "vle8.v         v13, (%[wp3])\n"
> > +                     "vle8.v         v16, (%[wp4])\n"
> > +                     "vle8.v         v17, (%[wp4])\n"
> > +                     "vle8.v         v20, (%[wp5])\n"
> > +                     "vle8.v         v21, (%[wp5])\n"
> > +                     "vle8.v         v24, (%[wp6])\n"
> > +                     "vle8.v         v25, (%[wp6])\n"
> > +                     "vle8.v         v28, (%[wp7])\n"
> > +                     "vle8.v         v29, (%[wp7])\n"
> > +                     ".option        pop\n"
> > +                     : :
> > +                     [wp0]"r"(&dptr[z0][d+0*NSIZE]),
> > +                     [wp1]"r"(&dptr[z0][d+1*NSIZE]),
> > +                     [wp2]"r"(&dptr[z0][d+2*NSIZE]),
> > +                     [wp3]"r"(&dptr[z0][d+3*NSIZE]),
> > +                     [wp4]"r"(&dptr[z0][d+4*NSIZE]),
> > +                     [wp5]"r"(&dptr[z0][d+5*NSIZE]),
> > +                     [wp6]"r"(&dptr[z0][d+6*NSIZE]),
> > +                     [wp7]"r"(&dptr[z0][d+7*NSIZE])
> > +             );
> > +
> > +             for (z = z0-1; z >= 0; z--) {
> > +                     /*
> > +                      * w2$$ = MASK(wq$$);
> > +                      * w1$$ = SHLBYTE(wq$$);
> > +                      * w2$$ &= NBYTES(0x1d);
> > +                      * w1$$ ^= w2$$;
> > +                      * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
> > +                      * wq$$ = w1$$ ^ wd$$;
> > +                      * wp$$ ^= wd$$;
> > +                      */
> > +                     asm volatile (
> > +                             ".option        push\n"
> > +                             ".option        arch,+v\n"
> > +                             "vsra.vi        v2, v1, 7\n"
> > +                             "vsll.vi        v3, v1, 1\n"
> > +                             "vand.vx        v2, v2, %[x1d]\n"
> > +                             "vxor.vv        v3, v3, v2\n"
> > +                             "vle8.v         v2, (%[wd0])\n"
> > +                             "vxor.vv        v1, v3, v2\n"
> > +                             "vxor.vv        v0, v0, v2\n"
> > +
> > +                             "vsra.vi        v6, v5, 7\n"
> > +                             "vsll.vi        v7, v5, 1\n"
> > +                             "vand.vx        v6, v6, %[x1d]\n"
> > +                             "vxor.vv        v7, v7, v6\n"
> > +                             "vle8.v         v6, (%[wd1])\n"
> > +                             "vxor.vv        v5, v7, v6\n"
> > +                             "vxor.vv        v4, v4, v6\n"
> > +
> > +                             "vsra.vi        v10, v9, 7\n"
> > +                             "vsll.vi        v11, v9, 1\n"
> > +                             "vand.vx        v10, v10, %[x1d]\n"
> > +                             "vxor.vv        v11, v11, v10\n"
> > +                             "vle8.v         v10, (%[wd2])\n"
> > +                             "vxor.vv        v9, v11, v10\n"
> > +                             "vxor.vv        v8, v8, v10\n"
> > +
> > +                             "vsra.vi        v14, v13, 7\n"
> > +                             "vsll.vi        v15, v13, 1\n"
> > +                             "vand.vx        v14, v14, %[x1d]\n"
> > +                             "vxor.vv        v15, v15, v14\n"
> > +                             "vle8.v         v14, (%[wd3])\n"
> > +                             "vxor.vv        v13, v15, v14\n"
> > +                             "vxor.vv        v12, v12, v14\n"
> > +
> > +                             "vsra.vi        v18, v17, 7\n"
> > +                             "vsll.vi        v19, v17, 1\n"
> > +                             "vand.vx        v18, v18, %[x1d]\n"
> > +                             "vxor.vv        v19, v19, v18\n"
> > +                             "vle8.v         v18, (%[wd4])\n"
> > +                             "vxor.vv        v17, v19, v18\n"
> > +                             "vxor.vv        v16, v16, v18\n"
> > +
> > +                             "vsra.vi        v22, v21, 7\n"
> > +                             "vsll.vi        v23, v21, 1\n"
> > +                             "vand.vx        v22, v22, %[x1d]\n"
> > +                             "vxor.vv        v23, v23, v22\n"
> > +                             "vle8.v         v22, (%[wd5])\n"
> > +                             "vxor.vv        v21, v23, v22\n"
> > +                             "vxor.vv        v20, v20, v22\n"
> > +
> > +                             "vsra.vi        v26, v25, 7\n"
> > +                             "vsll.vi        v27, v25, 1\n"
> > +                             "vand.vx        v26, v26, %[x1d]\n"
> > +                             "vxor.vv        v27, v27, v26\n"
> > +                             "vle8.v         v26, (%[wd6])\n"
> > +                             "vxor.vv        v25, v27, v26\n"
> > +                             "vxor.vv        v24, v24, v26\n"
> > +
> > +                             "vsra.vi        v30, v29, 7\n"
> > +                             "vsll.vi        v31, v29, 1\n"
> > +                             "vand.vx        v30, v30, %[x1d]\n"
> > +                             "vxor.vv        v31, v31, v30\n"
> > +                             "vle8.v         v30, (%[wd7])\n"
> > +                             "vxor.vv        v29, v31, v30\n"
> > +                             "vxor.vv        v28, v28, v30\n"
> > +                             ".option        pop\n"
> > +                             : :
> > +                             [wd0]"r"(&dptr[z][d+0*NSIZE]),
> > +                             [wd1]"r"(&dptr[z][d+1*NSIZE]),
> > +                             [wd2]"r"(&dptr[z][d+2*NSIZE]),
> > +                             [wd3]"r"(&dptr[z][d+3*NSIZE]),
> > +                             [wd4]"r"(&dptr[z][d+4*NSIZE]),
> > +                             [wd5]"r"(&dptr[z][d+5*NSIZE]),
> > +                             [wd6]"r"(&dptr[z][d+6*NSIZE]),
> > +                             [wd7]"r"(&dptr[z][d+7*NSIZE]),
> > +                             [x1d]"r"(0x1d)
> > +                     );
> > +             }
> > +
> > +             /*
> > +              * *(unative_t *)&p[d+NSIZE*$$] = wp$$;
> > +              * *(unative_t *)&q[d+NSIZE*$$] = wq$$;
> > +              */
> > +             asm volatile (
> > +                     ".option        push\n"
> > +                     ".option        arch,+v\n"
> > +                     "vse8.v         v0, (%[wp0])\n"
> > +                     "vse8.v         v1, (%[wq0])\n"
> > +                     "vse8.v         v4, (%[wp1])\n"
> > +                     "vse8.v         v5, (%[wq1])\n"
> > +                     "vse8.v         v8, (%[wp2])\n"
> > +                     "vse8.v         v9, (%[wq2])\n"
> > +                     "vse8.v         v12, (%[wp3])\n"
> > +                     "vse8.v         v13, (%[wq3])\n"
> > +                     "vse8.v         v16, (%[wp4])\n"
> > +                     "vse8.v         v17, (%[wq4])\n"
> > +                     "vse8.v         v20, (%[wp5])\n"
> > +                     "vse8.v         v21, (%[wq5])\n"
> > +                     "vse8.v         v24, (%[wp6])\n"
> > +                     "vse8.v         v25, (%[wq6])\n"
> > +                     "vse8.v         v28, (%[wp7])\n"
> > +                     "vse8.v         v29, (%[wq7])\n"
> > +                     ".option        pop\n"
> > +                     : :
> > +                     [wp0]"r"(&p[d+NSIZE*0]),
> > +                     [wq0]"r"(&q[d+NSIZE*0]),
> > +                     [wp1]"r"(&p[d+NSIZE*1]),
> > +                     [wq1]"r"(&q[d+NSIZE*1]),
> > +                     [wp2]"r"(&p[d+NSIZE*2]),
> > +                     [wq2]"r"(&q[d+NSIZE*2]),
> > +                     [wp3]"r"(&p[d+NSIZE*3]),
> > +                     [wq3]"r"(&q[d+NSIZE*3]),
> > +                     [wp4]"r"(&p[d+NSIZE*4]),
> > +                     [wq4]"r"(&q[d+NSIZE*4]),
> > +                     [wp5]"r"(&p[d+NSIZE*5]),
> > +                     [wq5]"r"(&q[d+NSIZE*5]),
> > +                     [wp6]"r"(&p[d+NSIZE*6]),
> > +                     [wq6]"r"(&q[d+NSIZE*6]),
> > +                     [wp7]"r"(&p[d+NSIZE*7]),
> > +                     [wq7]"r"(&q[d+NSIZE*7])
> > +             );
> > +     }
> > +}
> > +
> > +static void raid6_rvv8_xor_syndrome_real(int disks, int start, int stop,
> > +                                     unsigned long bytes, void **ptrs)
> > +{
> > +     u8 **dptr = (u8 **)ptrs;
> > +     u8 *p, *q;
> > +     int d, z, z0;
> > +
> > +     z0 = stop;              /* P/Q right side optimization */
> > +     p = dptr[disks-2];      /* XOR parity */
> > +     q = dptr[disks-1];      /* RS syndrome */
> > +
> > +     asm volatile (
> > +             ".option        push\n"
> > +             ".option        arch,+v\n"
> > +             "vsetvli        t0, x0, e8, m1, ta, ma\n"
> > +             ".option        pop\n"
> > +     );
> > +
> > +     /*
> > +      * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
> > +      * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
> > +      * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12
> > +      * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13
> > +      * v16:wp4, v17:wq4, v18:wd4/w24, v19:w14
> > +      * v20:wp5, v21:wq5, v22:wd5/w25, v23:w15
> > +      * v24:wp6, v25:wq6, v26:wd6/w26, v27:w16
> > +      * v28:wp7, v29:wq7, v30:wd7/w27, v31:w17
> > +      */
> > +     for (d = 0; d < bytes; d += NSIZE*8) {
> > +              /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
> > +             asm volatile (
> > +                     ".option        push\n"
> > +                     ".option        arch,+v\n"
> > +                     "vle8.v         v0, (%[wp0])\n"
> > +                     "vle8.v         v1, (%[wp0])\n"
> > +                     "vle8.v         v4, (%[wp1])\n"
> > +                     "vle8.v         v5, (%[wp1])\n"
> > +                     "vle8.v         v8, (%[wp2])\n"
> > +                     "vle8.v         v9, (%[wp2])\n"
> > +                     "vle8.v         v12, (%[wp3])\n"
> > +                     "vle8.v         v13, (%[wp3])\n"
> > +                     "vle8.v         v16, (%[wp4])\n"
> > +                     "vle8.v         v17, (%[wp4])\n"
> > +                     "vle8.v         v20, (%[wp5])\n"
> > +                     "vle8.v         v21, (%[wp5])\n"
> > +                     "vle8.v         v24, (%[wp6])\n"
> > +                     "vle8.v         v25, (%[wp6])\n"
> > +                     "vle8.v         v28, (%[wp7])\n"
> > +                     "vle8.v         v29, (%[wp7])\n"
> > +                     ".option        pop\n"
> > +                     : :
> > +                     [wp0]"r"(&dptr[z0][d+0*NSIZE]),
> > +                     [wp1]"r"(&dptr[z0][d+1*NSIZE]),
> > +                     [wp2]"r"(&dptr[z0][d+2*NSIZE]),
> > +                     [wp3]"r"(&dptr[z0][d+3*NSIZE]),
> > +                     [wp4]"r"(&dptr[z0][d+4*NSIZE]),
> > +                     [wp5]"r"(&dptr[z0][d+5*NSIZE]),
> > +                     [wp6]"r"(&dptr[z0][d+6*NSIZE]),
> > +                     [wp7]"r"(&dptr[z0][d+7*NSIZE])
> > +             );
> > +
> > +             /* P/Q data pages */
> > +             for (z = z0-1; z >= start; z--) {
> > +                     /*
> > +                      * w2$$ = MASK(wq$$);
> > +                      * w1$$ = SHLBYTE(wq$$);
> > +                      * w2$$ &= NBYTES(0x1d);
> > +                      * w1$$ ^= w2$$;
> > +                      * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
> > +                      * wq$$ = w1$$ ^ wd$$;
> > +                      * wp$$ ^= wd$$;
> > +                      */
> > +                     asm volatile (
> > +                             ".option        push\n"
> > +                             ".option        arch,+v\n"
> > +                             "vsra.vi        v2, v1, 7\n"
> > +                             "vsll.vi        v3, v1, 1\n"
> > +                             "vand.vx        v2, v2, %[x1d]\n"
> > +                             "vxor.vv        v3, v3, v2\n"
> > +                             "vle8.v         v2, (%[wd0])\n"
> > +                             "vxor.vv        v1, v3, v2\n"
> > +                             "vxor.vv        v0, v0, v2\n"
> > +
> > +                             "vsra.vi        v6, v5, 7\n"
> > +                             "vsll.vi        v7, v5, 1\n"
> > +                             "vand.vx        v6, v6, %[x1d]\n"
> > +                             "vxor.vv        v7, v7, v6\n"
> > +                             "vle8.v         v6, (%[wd1])\n"
> > +                             "vxor.vv        v5, v7, v6\n"
> > +                             "vxor.vv        v4, v4, v6\n"
> > +
> > +                             "vsra.vi        v10, v9, 7\n"
> > +                             "vsll.vi        v11, v9, 1\n"
> > +                             "vand.vx        v10, v10, %[x1d]\n"
> > +                             "vxor.vv        v11, v11, v10\n"
> > +                             "vle8.v         v10, (%[wd2])\n"
> > +                             "vxor.vv        v9, v11, v10\n"
> > +                             "vxor.vv        v8, v8, v10\n"
> > +
> > +                             "vsra.vi        v14, v13, 7\n"
> > +                             "vsll.vi        v15, v13, 1\n"
> > +                             "vand.vx        v14, v14, %[x1d]\n"
> > +                             "vxor.vv        v15, v15, v14\n"
> > +                             "vle8.v         v14, (%[wd3])\n"
> > +                             "vxor.vv        v13, v15, v14\n"
> > +                             "vxor.vv        v12, v12, v14\n"
> > +
> > +                             "vsra.vi        v18, v17, 7\n"
> > +                             "vsll.vi        v19, v17, 1\n"
> > +                             "vand.vx        v18, v18, %[x1d]\n"
> > +                             "vxor.vv        v19, v19, v18\n"
> > +                             "vle8.v         v18, (%[wd4])\n"
> > +                             "vxor.vv        v17, v19, v18\n"
> > +                             "vxor.vv        v16, v16, v18\n"
> > +
> > +                             "vsra.vi        v22, v21, 7\n"
> > +                             "vsll.vi        v23, v21, 1\n"
> > +                             "vand.vx        v22, v22, %[x1d]\n"
> > +                             "vxor.vv        v23, v23, v22\n"
> > +                             "vle8.v         v22, (%[wd5])\n"
> > +                             "vxor.vv        v21, v23, v22\n"
> > +                             "vxor.vv        v20, v20, v22\n"
> > +
> > +                             "vsra.vi        v26, v25, 7\n"
> > +                             "vsll.vi        v27, v25, 1\n"
> > +                             "vand.vx        v26, v26, %[x1d]\n"
> > +                             "vxor.vv        v27, v27, v26\n"
> > +                             "vle8.v         v26, (%[wd6])\n"
> > +                             "vxor.vv        v25, v27, v26\n"
> > +                             "vxor.vv        v24, v24, v26\n"
> > +
> > +                             "vsra.vi        v30, v29, 7\n"
> > +                             "vsll.vi        v31, v29, 1\n"
> > +                             "vand.vx        v30, v30, %[x1d]\n"
> > +                             "vxor.vv        v31, v31, v30\n"
> > +                             "vle8.v         v30, (%[wd7])\n"
> > +                             "vxor.vv        v29, v31, v30\n"
> > +                             "vxor.vv        v28, v28, v30\n"
> > +                             ".option        pop\n"
> > +                             : :
> > +                             [wd0]"r"(&dptr[z][d+0*NSIZE]),
> > +                             [wd1]"r"(&dptr[z][d+1*NSIZE]),
> > +                             [wd2]"r"(&dptr[z][d+2*NSIZE]),
> > +                             [wd3]"r"(&dptr[z][d+3*NSIZE]),
> > +                             [wd4]"r"(&dptr[z][d+4*NSIZE]),
> > +                             [wd5]"r"(&dptr[z][d+5*NSIZE]),
> > +                             [wd6]"r"(&dptr[z][d+6*NSIZE]),
> > +                             [wd7]"r"(&dptr[z][d+7*NSIZE]),
> > +                             [x1d]"r"(0x1d)
> > +                     );
> > +             }
> > +
> > +             /* P/Q left side optimization */
> > +             for (z = start-1; z >= 0; z--) {
> > +                     /*
> > +                      * w2$$ = MASK(wq$$);
> > +                      * w1$$ = SHLBYTE(wq$$);
> > +                      * w2$$ &= NBYTES(0x1d);
> > +                      * wq$$ = w1$$ ^ w2$$;
> > +                      */
> > +                     asm volatile (
> > +                             ".option        push\n"
> > +                             ".option        arch,+v\n"
> > +                             "vsra.vi        v2, v1, 7\n"
> > +                             "vsll.vi        v3, v1, 1\n"
> > +                             "vand.vx        v2, v2, %[x1d]\n"
> > +                             "vxor.vv        v1, v3, v2\n"
> > +
> > +                             "vsra.vi        v6, v5, 7\n"
> > +                             "vsll.vi        v7, v5, 1\n"
> > +                             "vand.vx        v6, v6, %[x1d]\n"
> > +                             "vxor.vv        v5, v7, v6\n"
> > +
> > +                             "vsra.vi        v10, v9, 7\n"
> > +                             "vsll.vi        v11, v9, 1\n"
> > +                             "vand.vx        v10, v10, %[x1d]\n"
> > +                             "vxor.vv        v9, v11, v10\n"
> > +
> > +                             "vsra.vi        v14, v13, 7\n"
> > +                             "vsll.vi        v15, v13, 1\n"
> > +                             "vand.vx        v14, v14, %[x1d]\n"
> > +                             "vxor.vv        v13, v15, v14\n"
> > +
> > +                             "vsra.vi        v18, v17, 7\n"
> > +                             "vsll.vi        v19, v17, 1\n"
> > +                             "vand.vx        v18, v18, %[x1d]\n"
> > +                             "vxor.vv        v17, v19, v18\n"
> > +
> > +                             "vsra.vi        v22, v21, 7\n"
> > +                             "vsll.vi        v23, v21, 1\n"
> > +                             "vand.vx        v22, v22, %[x1d]\n"
> > +                             "vxor.vv        v21, v23, v22\n"
> > +
> > +                             "vsra.vi        v26, v25, 7\n"
> > +                             "vsll.vi        v27, v25, 1\n"
> > +                             "vand.vx        v26, v26, %[x1d]\n"
> > +                             "vxor.vv        v25, v27, v26\n"
> > +
> > +                             "vsra.vi        v30, v29, 7\n"
> > +                             "vsll.vi        v31, v29, 1\n"
> > +                             "vand.vx        v30, v30, %[x1d]\n"
> > +                             "vxor.vv        v29, v31, v30\n"
> > +                             ".option        pop\n"
> > +                             : :
> > +                             [x1d]"r"(0x1d)
> > +                     );
> > +             }
> > +
> > +             /*
> > +              * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
> > +              * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
> > +              * v0:wp0, v1:wq0, v2:p0, v3:q0
> > +              * v4:wp1, v5:wq1, v6:p1, v7:q1
> > +              * v8:wp2, v9:wq2, v10:p2, v11:q2
> > +              * v12:wp3, v13:wq3, v14:p3, v15:q3
> > +              * v16:wp4, v17:wq4, v18:p4, v19:q4
> > +              * v20:wp5, v21:wq5, v22:p5, v23:q5
> > +              * v24:wp6, v25:wq6, v26:p6, v27:q6
> > +              * v28:wp7, v29:wq7, v30:p7, v31:q7
> > +              */
> > +             asm volatile (
> > +                     ".option        push\n"
> > +                     ".option        arch,+v\n"
> > +                     "vle8.v         v2, (%[wp0])\n"
> > +                     "vle8.v         v3, (%[wq0])\n"
> > +                     "vxor.vv        v2, v2, v0\n"
> > +                     "vxor.vv        v3, v3, v1\n"
> > +                     "vse8.v         v2, (%[wp0])\n"
> > +                     "vse8.v         v3, (%[wq0])\n"
> > +
> > +                     "vle8.v         v6, (%[wp1])\n"
> > +                     "vle8.v         v7, (%[wq1])\n"
> > +                     "vxor.vv        v6, v6, v4\n"
> > +                     "vxor.vv        v7, v7, v5\n"
> > +                     "vse8.v         v6, (%[wp1])\n"
> > +                     "vse8.v         v7, (%[wq1])\n"
> > +
> > +                     "vle8.v         v10, (%[wp2])\n"
> > +                     "vle8.v         v11, (%[wq2])\n"
> > +                     "vxor.vv        v10, v10, v8\n"
> > +                     "vxor.vv        v11, v11, v9\n"
> > +                     "vse8.v         v10, (%[wp2])\n"
> > +                     "vse8.v         v11, (%[wq2])\n"
> > +
> > +                     "vle8.v         v14, (%[wp3])\n"
> > +                     "vle8.v         v15, (%[wq3])\n"
> > +                     "vxor.vv        v14, v14, v12\n"
> > +                     "vxor.vv        v15, v15, v13\n"
> > +                     "vse8.v         v14, (%[wp3])\n"
> > +                     "vse8.v         v15, (%[wq3])\n"
> > +
> > +                     "vle8.v         v18, (%[wp4])\n"
> > +                     "vle8.v         v19, (%[wq4])\n"
> > +                     "vxor.vv        v18, v18, v16\n"
> > +                     "vxor.vv        v19, v19, v17\n"
> > +                     "vse8.v         v18, (%[wp4])\n"
> > +                     "vse8.v         v19, (%[wq4])\n"
> > +
> > +                     "vle8.v         v22, (%[wp5])\n"
> > +                     "vle8.v         v23, (%[wq5])\n"
> > +                     "vxor.vv        v22, v22, v20\n"
> > +                     "vxor.vv        v23, v23, v21\n"
> > +                     "vse8.v         v22, (%[wp5])\n"
> > +                     "vse8.v         v23, (%[wq5])\n"
> > +
> > +                     "vle8.v         v26, (%[wp6])\n"
> > +                     "vle8.v         v27, (%[wq6])\n"
> > +                     "vxor.vv        v26, v26, v24\n"
> > +                     "vxor.vv        v27, v27, v25\n"
> > +                     "vse8.v         v26, (%[wp6])\n"
> > +                     "vse8.v         v27, (%[wq6])\n"
> > +
> > +                     "vle8.v         v30, (%[wp7])\n"
> > +                     "vle8.v         v31, (%[wq7])\n"
> > +                     "vxor.vv        v30, v30, v28\n"
> > +                     "vxor.vv        v31, v31, v29\n"
> > +                     "vse8.v         v30, (%[wp7])\n"
> > +                     "vse8.v         v31, (%[wq7])\n"
> > +                     ".option        pop\n"
> > +                     : :
> > +                     [wp0]"r"(&p[d+NSIZE*0]),
> > +                     [wq0]"r"(&q[d+NSIZE*0]),
> > +                     [wp1]"r"(&p[d+NSIZE*1]),
> > +                     [wq1]"r"(&q[d+NSIZE*1]),
> > +                     [wp2]"r"(&p[d+NSIZE*2]),
> > +                     [wq2]"r"(&q[d+NSIZE*2]),
> > +                     [wp3]"r"(&p[d+NSIZE*3]),
> > +                     [wq3]"r"(&q[d+NSIZE*3]),
> > +                     [wp4]"r"(&p[d+NSIZE*4]),
> > +                     [wq4]"r"(&q[d+NSIZE*4]),
> > +                     [wp5]"r"(&p[d+NSIZE*5]),
> > +                     [wq5]"r"(&q[d+NSIZE*5]),
> > +                     [wp6]"r"(&p[d+NSIZE*6]),
> > +                     [wq6]"r"(&q[d+NSIZE*6]),
> > +                     [wp7]"r"(&p[d+NSIZE*7]),
> > +                     [wq7]"r"(&q[d+NSIZE*7])
> > +             );
> > +     }
> > +}
> > +
> > +#define RAID6_RVV_WRAPPER(_n)                                                \
> > +     static void raid6_rvv ## _n ## _gen_syndrome(int disks,         \
> > +                                     size_t bytes, void **ptrs)      \
> > +     {                                                               \
> > +             void raid6_rvv ## _n  ## _gen_syndrome_real(int,        \
> > +                                             unsigned long, void**); \
> > +             kernel_vector_begin();                                  \
> > +             raid6_rvv ## _n ## _gen_syndrome_real(disks,            \
> > +                             (unsigned long)bytes, ptrs);            \
> > +             kernel_vector_end();                                    \
> > +     }                                                               \
> > +     static void raid6_rvv ## _n ## _xor_syndrome(int disks,         \
> > +                                     int start, int stop,            \
> > +                                     size_t bytes, void **ptrs)      \
> > +     {                                                               \
> > +             void raid6_rvv ## _n  ## _xor_syndrome_real(int,        \
> > +                             int, int, unsigned long, void**);       \
> > +             kernel_vector_begin();                                  \
> > +             raid6_rvv ## _n ## _xor_syndrome_real(disks,            \
> > +                     start, stop, (unsigned long)bytes, ptrs);       \
> > +             kernel_vector_end();                                    \
> > +     }                                                               \
> > +     struct raid6_calls const raid6_rvvx ## _n = {                   \
> > +             raid6_rvv ## _n ## _gen_syndrome,                       \
> > +             raid6_rvv ## _n ## _xor_syndrome,                       \
> > +             rvv_has_vector,                                         \
> > +             "rvvx" #_n,                                             \
> > +             0                                                       \
> > +     }
> > +
> > +RAID6_RVV_WRAPPER(1);
> > +RAID6_RVV_WRAPPER(2);
> > +RAID6_RVV_WRAPPER(4);
> > +RAID6_RVV_WRAPPER(8);
>
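
The RAID6_RVV_WRAPPER() macro at the end of the file quoted above is mostly token pasting; a minimal sketch of what RAID6_RVV_WRAPPER(1) expands to (spelled out here only for illustration, it is not part of the patch) shows the kernel_vector_begin()/kernel_vector_end() bracketing that every exported entry point gets:

/* Illustrative expansion of RAID6_RVV_WRAPPER(1); the point of the wrapper
 * is to bracket the vector-using *_real() helpers with
 * kernel_vector_begin()/kernel_vector_end().
 */
static void raid6_rvv1_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	void raid6_rvv1_gen_syndrome_real(int, unsigned long, void **);

	kernel_vector_begin();
	raid6_rvv1_gen_syndrome_real(disks, (unsigned long)bytes, ptrs);
	kernel_vector_end();
}

static void raid6_rvv1_xor_syndrome(int disks, int start, int stop,
				    size_t bytes, void **ptrs)
{
	void raid6_rvv1_xor_syndrome_real(int, int, int, unsigned long, void **);

	kernel_vector_begin();
	raid6_rvv1_xor_syndrome_real(disks, start, stop,
				     (unsigned long)bytes, ptrs);
	kernel_vector_end();
}

struct raid6_calls const raid6_rvvx1 = {
	raid6_rvv1_gen_syndrome,
	raid6_rvv1_xor_syndrome,
	rvv_has_vector,		/* the .valid callback added in V2 */
	"rvvx1",
	0
};
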
Chunyan Zhang Feb. 11, 2025, 9:59 a.m. UTC | #4
On Tue, 28 Jan 2025 at 06:34, Charlie Jenkins <charlie@rivosinc.com> wrote:
>
> On Mon, Jan 27, 2025 at 09:39:11AM +0100, Paul Menzel wrote:
> > Dear Chunyan,
> >
> >
> > Thank you for the patch.
> >
> >
> > Am 27.01.25 um 07:15 schrieb Chunyan Zhang:
> > > The assembly is originally based on the ARM NEON and int.uc, but uses
> > > RISC-V vector instructions to implement the RAID6 syndrome and
> > > recovery calculations.
> > >
> > > Results on QEMU running with the option "-icount shift=0":
> > >
> > >    raid6: rvvx1    gen()  1008 MB/s
> > >    raid6: rvvx2    gen()  1395 MB/s
> > >    raid6: rvvx4    gen()  1584 MB/s
> > >    raid6: rvvx8    gen()  1694 MB/s
> > >    raid6: int64x8  gen()   113 MB/s
> > >    raid6: int64x4  gen()   116 MB/s
> > >    raid6: int64x2  gen()   272 MB/s
> > >    raid6: int64x1  gen()   229 MB/s
> > >    raid6: using algorithm rvvx8 gen() 1694 MB/s
> > >    raid6: .... xor() 1000 MB/s, rmw enabled
> > >    raid6: using rvv recovery algorithm
> >
> > How did you start QEMU and on what host did you run it? Does it change
> > between runs? (For me these benchmark values were very unreliable in the
> > past on x86 hardware.)
>
> I reported dramatic gains on vector as well in this response [1]. Note
> that these gains are only present when using the QEMU option "-icount
> shift=0"; with it, vector becomes dramatically more performant. Without
> this option we do not see a performance gain on QEMU. However, RISC-V
> vector is known to not be well optimized on QEMU, so vector being less
> performant on some QEMU configurations is not necessarily representative
> of hardware implementations.
>
>
> My full qemu command is (running on x86 host):
>
> qemu-system-riscv64 -nographic -m 1G -machine virt -smp 1\
>     -kernel arch/riscv/boot/Image \
>     -append "root=/dev/vda rw earlycon console=ttyS0" \
>     -drive file=rootfs.ext2,format=raw,id=hd0,if=none \
>     -bios default -cpu rv64,v=true,vlen=256,vext_spec=v1.0 \
>     -device virtio-blk-device,drive=hd0
>
> This is with version 9.2.0.
>
>
> I am also facing this issue when executing this:
>
> raid6: rvvx1    gen()   717 MB/s
> raid6: rvvx2    gen()   734 MB/s
> Unable to handle kernel NULL pointer dereference at virtual address 0000000000000020
>
> Only rvvx4 is failing. I applied this patch to 6.13.

I ran with your command but saw no issue on my side (x86 host, QEMU
version 9.2.0, kernel 6.13 too):

qemu-system-riscv64 -nographic -m 1G -machine virt -smp 1 -icount shift=0 \
        -kernel arch/riscv/boot/Image   \
        -append "rootwait root=/dev/vda ro"     \
        -drive file=rootfs.ext4,format=raw,id=hd0 \
        -bios default -cpu rv64,v=true,vlen=256,vext_spec=v1.0 \
        -device virtio-blk-device,drive=hd0

Thanks,
Chunyan

>
> - Charlie
>
Charlie Jenkins Feb. 13, 2025, 9:36 p.m. UTC | #5
On Tue, Feb 11, 2025 at 05:59:26PM +0800, Chunyan Zhang wrote:
> On Tue, 28 Jan 2025 at 06:34, Charlie Jenkins <charlie@rivosinc.com> wrote:
> >
> > On Mon, Jan 27, 2025 at 09:39:11AM +0100, Paul Menzel wrote:
> > > Dear Chunyan,
> > >
> > >
> > > Thank you for the patch.
> > >
> > >
> > > Am 27.01.25 um 07:15 schrieb Chunyan Zhang:
> > > > The assembly is originally based on the ARM NEON and int.uc, but uses
> > > > RISC-V vector instructions to implement the RAID6 syndrome and
> > > > recovery calculations.
> > > >
> > > > Results on QEMU running with the option "-icount shift=0":
> > > >
> > > >    raid6: rvvx1    gen()  1008 MB/s
> > > >    raid6: rvvx2    gen()  1395 MB/s
> > > >    raid6: rvvx4    gen()  1584 MB/s
> > > >    raid6: rvvx8    gen()  1694 MB/s
> > > >    raid6: int64x8  gen()   113 MB/s
> > > >    raid6: int64x4  gen()   116 MB/s
> > > >    raid6: int64x2  gen()   272 MB/s
> > > >    raid6: int64x1  gen()   229 MB/s
> > > >    raid6: using algorithm rvvx8 gen() 1694 MB/s
> > > >    raid6: .... xor() 1000 MB/s, rmw enabled
> > > >    raid6: using rvv recovery algorithm
> > >
> > > How did you start QEMU and on what host did you run it? Does it change
> > > between runs? (For me these benchmark values were very unreliable in the
> > > past on x86 hardware.)
> >
> > I reported dramatic gains on vector as well in this response [1]. Note
> > that these gains are only present when using the QEMU option "-icount
> > shift=0"; with it, vector becomes dramatically more performant. Without
> > this option we do not see a performance gain on QEMU. However, RISC-V
> > vector is known to not be well optimized on QEMU, so vector being less
> > performant on some QEMU configurations is not necessarily representative
> > of hardware implementations.
> >
> >
> > My full qemu command is (running on x86 host):
> >
> > qemu-system-riscv64 -nographic -m 1G -machine virt -smp 1\
> >     -kernel arch/riscv/boot/Image \
> >     -append "root=/dev/vda rw earlycon console=ttyS0" \
> >     -drive file=rootfs.ext2,format=raw,id=hd0,if=none \
> >     -bios default -cpu rv64,v=true,vlen=256,vext_spec=v1.0 \
> >     -device virtio-blk-device,drive=hd0
> >
> > This is with version 9.2.0.
> >
> >
> > I am also facing this issue when executing this:
> >
> > raid6: rvvx1    gen()   717 MB/s
> > raid6: rvvx2    gen()   734 MB/s
> > Unable to handle kernel NULL pointer dereference at virtual address 0000000000000020
> >
> > Only rvvx4 is failing. I applied this patch to 6.13.
> 
> I ran with your command but saw no issue on my side (x86 host, QEMU
> version 9.2.0, kernel 6.13 too):
> 
> qemu-system-riscv64 -nographic -m 1G -machine virt -smp 1 -icount shift=0 \
>         -kernel arch/riscv/boot/Image   \
>         -append "rootwait root=/dev/vda ro"     \
>         -drive file=rootfs.ext4,format=raw,id=hd0 \
>         -bios default -cpu rv64,v=true,vlen=256,vext_spec=v1.0 \
>         -device virtio-blk-device,drive=hd0

I am able to reproduce it with this defconfig:

CONFIG_SYSVIPC=y
CONFIG_NO_HZ_IDLE=y
CONFIG_HIGH_RES_TIMERS=y
CONFIG_BPF_SYSCALL=y
CONFIG_IKCONFIG=y
CONFIG_IKCONFIG_PROC=y
CONFIG_NAMESPACES=y
CONFIG_USER_NS=y
CONFIG_CHECKPOINT_RESTORE=y
CONFIG_BLK_DEV_INITRD=y
CONFIG_EXPERT=y
# CONFIG_SYSFS_SYSCALL is not set
CONFIG_PROFILING=y
CONFIG_SMP=y
CONFIG_CPU_FREQ=y
CONFIG_CPU_FREQ_STAT=y
CONFIG_CPU_FREQ_GOV_USERSPACE=y
CONFIG_CPU_FREQ_GOV_ONDEMAND=y
CONFIG_CPUFREQ_DT=y
CONFIG_JUMP_LABEL=y
CONFIG_DEVTMPFS=y
CONFIG_DEVTMPFS_MOUNT=y
CONFIG_MTD=y
CONFIG_MTD_BLOCK=y
CONFIG_MTD_CFI=y
CONFIG_MTD_CFI_ADV_OPTIONS=y
CONFIG_BLK_DEV_LOOP=y
CONFIG_VIRTIO_BLK=y
CONFIG_MD=y
CONFIG_BLK_DEV_MD=y
CONFIG_MD_RAID456=y
CONFIG_INPUT_MOUSEDEV=y
CONFIG_SERIAL_8250=y
CONFIG_SERIAL_8250_CONSOLE=y
CONFIG_SERIAL_8250_DW=y
CONFIG_SERIAL_OF_PLATFORM=y
CONFIG_SERIAL_SIFIVE=y
CONFIG_SERIAL_SIFIVE_CONSOLE=y
CONFIG_VIRTIO_CONSOLE=y
CONFIG_HW_RANDOM_VIRTIO=y
CONFIG_PINCTRL=y
CONFIG_GPIOLIB=y
CONFIG_GPIO_DWAPB=y
CONFIG_GPIO_SIFIVE=y
CONFIG_SOUND=y
CONFIG_RTC_CLASS=y
CONFIG_RTC_DRV_GOLDFISH=y
CONFIG_DMADEVICES=y
CONFIG_DW_AXI_DMAC=y
CONFIG_VIRTIO_BALLOON=y
CONFIG_VIRTIO_INPUT=y
CONFIG_VIRTIO_MMIO=y
CONFIG_GOLDFISH=y
CONFIG_MAILBOX=y
CONFIG_RPMSG_CTRL=y
CONFIG_RPMSG_VIRTIO=y
CONFIG_PM_DEVFREQ=y
CONFIG_IIO=y
CONFIG_LIBNVDIMM=y
CONFIG_EXT4_FS=y
CONFIG_EXT4_FS_POSIX_ACL=y
CONFIG_EXT4_FS_SECURITY=y
CONFIG_AUTOFS_FS=y
CONFIG_ISO9660_FS=y
CONFIG_JOLIET=y
CONFIG_ZISOFS=y
CONFIG_MSDOS_FS=y
CONFIG_VFAT_FS=y
CONFIG_PRINTK_TIME=y
CONFIG_SCHED_STACK_END_CHECK=y
# CONFIG_RCU_TRACE is not set
# CONFIG_FTRACE is not set
# CONFIG_RUNTIME_TESTING_MENU is not set

I took the riscv/defconfig and added MD_RAID456 and its dependencies.
So that the message wasn't too long, I started removing some unnecessary
configs. Try this out and let me know if you encounter the issue.

- Charlie

> 
> Thanks,
> Chunyan
> 
> >
> > - Charlie
> >
Chunyan Zhang Feb. 14, 2025, 8:57 a.m. UTC | #6
On Fri, 14 Feb 2025 at 05:36, Charlie Jenkins <charlie@rivosinc.com> wrote:
>
> On Tue, Feb 11, 2025 at 05:59:26PM +0800, Chunyan Zhang wrote:
> > On Tue, 28 Jan 2025 at 06:34, Charlie Jenkins <charlie@rivosinc.com> wrote:
> > >
> > > On Mon, Jan 27, 2025 at 09:39:11AM +0100, Paul Menzel wrote:
> > > > Dear Chunyan,
> > > >
> > > >
> > > > Thank you for the patch.
> > > >
> > > >
> > > > Am 27.01.25 um 07:15 schrieb Chunyan Zhang:
> > > > > The assembly is originally based on the ARM NEON and int.uc, but uses
> > > > > RISC-V vector instructions to implement the RAID6 syndrome and
> > > > > recovery calculations.
> > > > >
> > > > > Results on QEMU running with the option "-icount shift=0":
> > > > >
> > > > >    raid6: rvvx1    gen()  1008 MB/s
> > > > >    raid6: rvvx2    gen()  1395 MB/s
> > > > >    raid6: rvvx4    gen()  1584 MB/s
> > > > >    raid6: rvvx8    gen()  1694 MB/s
> > > > >    raid6: int64x8  gen()   113 MB/s
> > > > >    raid6: int64x4  gen()   116 MB/s
> > > > >    raid6: int64x2  gen()   272 MB/s
> > > > >    raid6: int64x1  gen()   229 MB/s
> > > > >    raid6: using algorithm rvvx8 gen() 1694 MB/s
> > > > >    raid6: .... xor() 1000 MB/s, rmw enabled
> > > > >    raid6: using rvv recovery algorithm
> > > >
> > > > How did you start QEMU and on what host did you run it? Does it change
> > > > between runs? (For me these benchmark values were very unreliable in the
> > > > past on x86 hardware.)
> > >
> > > I reported dramatic gains on vector as well in this response [1]. Note
> > > that these gains are only present when using the QEMU option "-icount
> > > shift=0"; with it, vector becomes dramatically more performant. Without
> > > this option we do not see a performance gain on QEMU. However, RISC-V
> > > vector is known to not be well optimized on QEMU, so vector being less
> > > performant on some QEMU configurations is not necessarily representative
> > > of hardware implementations.
> > >
> > >
> > > My full qemu command is (running on x86 host):
> > >
> > > qemu-system-riscv64 -nographic -m 1G -machine virt -smp 1\
> > >     -kernel arch/riscv/boot/Image \
> > >     -append "root=/dev/vda rw earlycon console=ttyS0" \
> > >     -drive file=rootfs.ext2,format=raw,id=hd0,if=none \
> > >     -bios default -cpu rv64,v=true,vlen=256,vext_spec=v1.0 \
> > >     -device virtio-blk-device,drive=hd0
> > >
> > > This is with version 9.2.0.
> > >
> > >
> > > I am also facing this issue when executing this:
> > >
> > > raid6: rvvx1    gen()   717 MB/s
> > > raid6: rvvx2    gen()   734 MB/s
> > > Unable to handle kernel NULL pointer dereference at virtual address 0000000000000020
> > >
> > > Only rvvx4 is failing. I applied this patch to 6.13.
> >
> > I ran with your command but saw no issue on my side (x86 host, QEMU
> > version 9.2.0, kernel 6.13 too):
> >
> > qemu-system-riscv64 -nographic -m 1G -machine virt -smp 1 -icount shift=0 \
> >         -kernel arch/riscv/boot/Image   \
> >         -append "rootwait root=/dev/vda ro"     \
> >         -drive file=rootfs.ext4,format=raw,id=hd0 \
> >         -bios default -cpu rv64,v=true,vlen=256,vext_spec=v1.0 \
> >         -device virtio-blk-device,drive=hd0
>
> I am able to reproduce it with this defconfig:
>
> CONFIG_SYSVIPC=y
> CONFIG_NO_HZ_IDLE=y
> CONFIG_HIGH_RES_TIMERS=y
> CONFIG_BPF_SYSCALL=y
> CONFIG_IKCONFIG=y
> CONFIG_IKCONFIG_PROC=y
> CONFIG_NAMESPACES=y
> CONFIG_USER_NS=y
> CONFIG_CHECKPOINT_RESTORE=y
> CONFIG_BLK_DEV_INITRD=y
> CONFIG_EXPERT=y
> # CONFIG_SYSFS_SYSCALL is not set
> CONFIG_PROFILING=y
> CONFIG_SMP=y
> CONFIG_CPU_FREQ=y
> CONFIG_CPU_FREQ_STAT=y
> CONFIG_CPU_FREQ_GOV_USERSPACE=y
> CONFIG_CPU_FREQ_GOV_ONDEMAND=y
> CONFIG_CPUFREQ_DT=y
> CONFIG_JUMP_LABEL=y
> CONFIG_DEVTMPFS=y
> CONFIG_DEVTMPFS_MOUNT=y
> CONFIG_MTD=y
> CONFIG_MTD_BLOCK=y
> CONFIG_MTD_CFI=y
> CONFIG_MTD_CFI_ADV_OPTIONS=y
> CONFIG_BLK_DEV_LOOP=y
> CONFIG_VIRTIO_BLK=y
> CONFIG_MD=y
> CONFIG_BLK_DEV_MD=y
> CONFIG_MD_RAID456=y
> CONFIG_INPUT_MOUSEDEV=y
> CONFIG_SERIAL_8250=y
> CONFIG_SERIAL_8250_CONSOLE=y
> CONFIG_SERIAL_8250_DW=y
> CONFIG_SERIAL_OF_PLATFORM=y
> CONFIG_SERIAL_SIFIVE=y
> CONFIG_SERIAL_SIFIVE_CONSOLE=y
> CONFIG_VIRTIO_CONSOLE=y
> CONFIG_HW_RANDOM_VIRTIO=y
> CONFIG_PINCTRL=y
> CONFIG_GPIOLIB=y
> CONFIG_GPIO_DWAPB=y
> CONFIG_GPIO_SIFIVE=y
> CONFIG_SOUND=y
> CONFIG_RTC_CLASS=y
> CONFIG_RTC_DRV_GOLDFISH=y
> CONFIG_DMADEVICES=y
> CONFIG_DW_AXI_DMAC=y
> CONFIG_VIRTIO_BALLOON=y
> CONFIG_VIRTIO_INPUT=y
> CONFIG_VIRTIO_MMIO=y
> CONFIG_GOLDFISH=y
> CONFIG_MAILBOX=y
> CONFIG_RPMSG_CTRL=y
> CONFIG_RPMSG_VIRTIO=y
> CONFIG_PM_DEVFREQ=y
> CONFIG_IIO=y
> CONFIG_LIBNVDIMM=y
> CONFIG_EXT4_FS=y
> CONFIG_EXT4_FS_POSIX_ACL=y
> CONFIG_EXT4_FS_SECURITY=y
> CONFIG_AUTOFS_FS=y
> CONFIG_ISO9660_FS=y
> CONFIG_JOLIET=y
> CONFIG_ZISOFS=y
> CONFIG_MSDOS_FS=y
> CONFIG_VFAT_FS=y
> CONFIG_PRINTK_TIME=y
> CONFIG_SCHED_STACK_END_CHECK=y
> # CONFIG_RCU_TRACE is not set
> # CONFIG_FTRACE is not set
> # CONFIG_RUNTIME_TESTING_MENU is not set
>
> I took the riscv/defconfig and added MD_RAID456 and its dependencies.
> So that the message wasn't too long, I started removing some unnecessary
> configs. Try this out and let me know if you encounter the issue.

I took the riscv/defconfig and set MD_RAID456=y, but didn't see this issue.
Since RAID6_PQ is selected by MD_RAID456, RAID6_PQ=y, and I got the
raid6 test result during kernel init.

[    0.317147] raid6: rvvx1    gen()    45 MB/s
[    0.390800] raid6: rvvx2    gen()    45 MB/s
[    0.459435] raid6: rvvx4    gen()    45 MB/s
[    0.527651] raid6: rvvx8    gen()    44 MB/s
[    0.596123] raid6: int64x8  gen()  1232 MB/s
[    0.664686] raid6: int64x4  gen()  2728 MB/s
[    0.733291] raid6: int64x2  gen()  3405 MB/s
[    0.801836] raid6: int64x1  gen()  2730 MB/s
[    0.801895] raid6: using algorithm int64x2 gen() 3405 MB/s
[    0.870379] raid6: .... xor() 493 MB/s, rmw enabled

Thanks,
Chunyan
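
A note for reading the long inline-assembly blocks in the patch below: each syndrome block implements, one byte lane at a time, the GF(2^8) multiply-by-2 step spelled out in the patch's own comments (w2 = MASK(wq); w1 = SHLBYTE(wq); w2 &= 0x1d; wq = w1 ^ w2 ^ wd; wp ^= wd). A minimal scalar sketch of that step, assuming those int.uc semantics (the helper names here are illustrative, not from the patch):

/* Multiply a syndrome byte by 2 in GF(2^8) with polynomial 0x11d:
 * an arithmetic shift of the sign bit builds the mask (vsra.vi ..., 7),
 * the value is shifted left by one (vsll.vi ..., 1), and 0x1d is xored
 * in for bytes whose top bit was set (vand.vx / vxor.vv).
 */
static inline unsigned char gf256_mul2(unsigned char wq)
{
	unsigned char mask = (wq & 0x80) ? 0xff : 0x00;	/* MASK(wq) */
	unsigned char shl  = (unsigned char)(wq << 1);	/* SHLBYTE(wq) */

	return shl ^ (mask & 0x1d);
}

/* One data disk's per-byte contribution to P (xor parity) and Q (RS syndrome). */
static inline void syndrome_step(unsigned char *wp, unsigned char *wq,
				 unsigned char wd)
{
	*wq = gf256_mul2(*wq) ^ wd;
	*wp ^= wd;
}

The rvvx2/rvvx4/rvvx8 variants unroll this same step over 2, 4 and 8 vector register groups per loop iteration, while the recovery code in recov_rvv.c instead performs the qmul/pbmul table lookups with vrgather.vv over split low/high-nibble tables.
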
diff mbox series

Patch

diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h
index 98030accf641..72ff44cca864 100644
--- a/include/linux/raid/pq.h
+++ b/include/linux/raid/pq.h
@@ -108,6 +108,10 @@  extern const struct raid6_calls raid6_vpermxor4;
 extern const struct raid6_calls raid6_vpermxor8;
 extern const struct raid6_calls raid6_lsx;
 extern const struct raid6_calls raid6_lasx;
+extern const struct raid6_calls raid6_rvvx1;
+extern const struct raid6_calls raid6_rvvx2;
+extern const struct raid6_calls raid6_rvvx4;
+extern const struct raid6_calls raid6_rvvx8;
 
 struct raid6_recov_calls {
 	void (*data2)(int, size_t, int, int, void **);
@@ -125,6 +129,7 @@  extern const struct raid6_recov_calls raid6_recov_s390xc;
 extern const struct raid6_recov_calls raid6_recov_neon;
 extern const struct raid6_recov_calls raid6_recov_lsx;
 extern const struct raid6_recov_calls raid6_recov_lasx;
+extern const struct raid6_recov_calls raid6_recov_rvv;
 
 extern const struct raid6_calls raid6_neonx1;
 extern const struct raid6_calls raid6_neonx2;
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
index 29127dd05d63..5be0a4e60ab1 100644
--- a/lib/raid6/Makefile
+++ b/lib/raid6/Makefile
@@ -10,6 +10,7 @@  raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o \
 raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o
 raid6_pq-$(CONFIG_S390) += s390vx8.o recov_s390xc.o
 raid6_pq-$(CONFIG_LOONGARCH) += loongarch_simd.o recov_loongarch_simd.o
+raid6_pq-$(CONFIG_RISCV_ISA_V) += rvv.o recov_rvv.o
 
 hostprogs	+= mktables
 
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index cd2e88ee1f14..99980ff5b985 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -80,6 +80,12 @@  const struct raid6_calls * const raid6_algos[] = {
 #ifdef CONFIG_CPU_HAS_LSX
 	&raid6_lsx,
 #endif
+#endif
+#ifdef CONFIG_RISCV_ISA_V
+	&raid6_rvvx1,
+	&raid6_rvvx2,
+	&raid6_rvvx4,
+	&raid6_rvvx8,
 #endif
 	&raid6_intx8,
 	&raid6_intx4,
@@ -115,6 +121,9 @@  const struct raid6_recov_calls *const raid6_recov_algos[] = {
 #ifdef CONFIG_CPU_HAS_LSX
 	&raid6_recov_lsx,
 #endif
+#endif
+#ifdef CONFIG_RISCV_ISA_V
+	&raid6_recov_rvv,
 #endif
 	&raid6_recov_intx1,
 	NULL
diff --git a/lib/raid6/recov_rvv.c b/lib/raid6/recov_rvv.c
new file mode 100644
index 000000000000..db271d2987c6
--- /dev/null
+++ b/lib/raid6/recov_rvv.c
@@ -0,0 +1,234 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2024 Institute of Software, CAS.
+ * Author: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
+ */
+
+#include <asm/simd.h>
+#include <asm/vector.h>
+#include <crypto/internal/simd.h>
+#include <linux/raid/pq.h>
+
+static int rvv_has_vector(void)
+{
+	return has_vector();
+}
+
+static void __raid6_2data_recov_rvv(int bytes, u8 *p, u8 *q, u8 *dp,
+			      u8 *dq, const u8 *pbmul,
+			      const u8 *qmul)
+{
+	asm volatile (
+		".option	push\n"
+		".option	arch,+v\n"
+		"vsetvli	x0, %[avl], e8, m1, ta, ma\n"
+		".option	pop\n"
+		: :
+		[avl]"r"(16)
+	);
+
+	/*
+	 * while ( bytes-- ) {
+	 *	uint8_t px, qx, db;
+	 *
+	 *	px	  = *p ^ *dp;
+	 *	qx	  = qmul[*q ^ *dq];
+	 *	*dq++ = db = pbmul[px] ^ qx;
+	 *	*dp++ = db ^ px;
+	 *	p++; q++;
+	 * }
+	 */
+	while (bytes) {
+		/*
+		 * v0:px, v1:dp,
+		 * v2:qx, v3:dq,
+		 * v4:vx, v5:vy,
+		 * v6:qm0, v7:qm1,
+		 * v8:pm0, v9:pm1,
+		 * v14:p/qm[vx], v15:p/qm[vy]
+		 */
+		asm volatile (
+			".option	push\n"
+			".option	arch,+v\n"
+			"vle8.v		v0, (%[px])\n"
+			"vle8.v		v1, (%[dp])\n"
+			"vxor.vv	v0, v0, v1\n"
+			"vle8.v		v2, (%[qx])\n"
+			"vle8.v		v3, (%[dq])\n"
+			"vxor.vv	v4, v2, v3\n"
+			"vsrl.vi	v5, v4, 4\n"
+			"vand.vi	v4, v4, 0xf\n"
+			"vle8.v		v6, (%[qm0])\n"
+			"vle8.v		v7, (%[qm1])\n"
+			"vrgather.vv	v14, v6, v4\n" /* v14 = qm[vx] */
+			"vrgather.vv	v15, v7, v5\n" /* v15 = qm[vy] */
+			"vxor.vv	v2, v14, v15\n" /* v2 = qmul[*q ^ *dq] */
+
+			"vsrl.vi	v5, v0, 4\n"
+			"vand.vi	v4, v0, 0xf\n"
+			"vle8.v		v8, (%[pm0])\n"
+			"vle8.v		v9, (%[pm1])\n"
+			"vrgather.vv	v14, v8, v4\n" /* v14 = pm[vx] */
+			"vrgather.vv	v15, v9, v5\n" /* v15 = pm[vy] */
+			"vxor.vv	v4, v14, v15\n" /* v4 = pbmul[px] */
+			"vxor.vv	v3, v4, v2\n" /* v3 = db = pbmul[px] ^ qx */
+			"vxor.vv	v1, v3, v0\n" /* v1 = db ^ px; */
+			"vse8.v		v3, (%[dq])\n"
+			"vse8.v		v1, (%[dp])\n"
+			".option	pop\n"
+			: :
+			[px]"r"(p),
+			[dp]"r"(dp),
+			[qx]"r"(q),
+			[dq]"r"(dq),
+			[qm0]"r"(qmul),
+			[qm1]"r"(qmul + 16),
+			[pm0]"r"(pbmul),
+			[pm1]"r"(pbmul + 16)
+			:);
+
+		bytes -= 16;
+		p += 16;
+		q += 16;
+		dp += 16;
+		dq += 16;
+	}
+}
+
+static void __raid6_datap_recov_rvv(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq,
+			      const uint8_t *qmul)
+{
+	asm volatile (
+		".option	push\n"
+		".option	arch,+v\n"
+		"vsetvli	x0, %[avl], e8, m1, ta, ma\n"
+		".option	pop\n"
+		: :
+		[avl]"r"(16)
+	);
+
+	/*
+	 * while (bytes--) {
+	 *  *p++ ^= *dq = qmul[*q ^ *dq];
+	 *  q++; dq++;
+	 * }
+	 */
+	while (bytes) {
+		/*
+		 * v0:vx, v1:vy,
+		 * v2:dq, v3:p,
+		 * v4:qm0, v5:qm1,
+		 * v10:m[vx], v11:m[vy]
+		 */
+		asm volatile (
+			".option	push\n"
+			".option	arch,+v\n"
+			"vle8.v		v0, (%[vx])\n"
+			"vle8.v		v2, (%[dq])\n"
+			"vxor.vv	v0, v0, v2\n"
+			"vsrl.vi	v1, v0, 4\n"
+			"vand.vi	v0, v0, 0xf\n"
+			"vle8.v		v4, (%[qm0])\n"
+			"vle8.v		v5, (%[qm1])\n"
+			"vrgather.vv	v10, v4, v0\n"
+			"vrgather.vv	v11, v5, v1\n"
+			"vxor.vv	v0, v10, v11\n"
+			"vle8.v		v1, (%[vy])\n"
+			"vxor.vv	v1, v0, v1\n"
+			"vse8.v		v0, (%[dq])\n"
+			"vse8.v		v1, (%[vy])\n"
+			".option	pop\n"
+			: :
+			[vx]"r"(q),
+			[vy]"r"(p),
+			[dq]"r"(dq),
+			[qm0]"r"(qmul),
+			[qm1]"r"(qmul + 16)
+			:);
+
+		bytes -= 16;
+		p += 16;
+		q += 16;
+		dq += 16;
+	}
+}
+
+
+static void raid6_2data_recov_rvv(int disks, size_t bytes, int faila,
+		int failb, void **ptrs)
+{
+	u8 *p, *q, *dp, *dq;
+	const u8 *pbmul;	/* P multiplier table for B data */
+	const u8 *qmul;		/* Q multiplier table (for both) */
+
+	p = (u8 *)ptrs[disks - 2];
+	q = (u8 *)ptrs[disks - 1];
+
+	/*
+	 * Compute syndrome with zero for the missing data pages
+	 * Use the dead data pages as temporary storage for
+	 * delta p and delta q
+	 */
+	dp = (u8 *)ptrs[faila];
+	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[disks - 2] = dp;
+	dq = (u8 *)ptrs[failb];
+	ptrs[failb] = (void *)raid6_empty_zero_page;
+	ptrs[disks - 1] = dq;
+
+	raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+	/* Restore pointer table */
+	ptrs[faila]     = dp;
+	ptrs[failb]     = dq;
+	ptrs[disks - 2] = p;
+	ptrs[disks - 1] = q;
+
+	/* Now, pick the proper data tables */
+	pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]];
+	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^
+					 raid6_gfexp[failb]]];
+
+	kernel_vector_begin();
+	__raid6_2data_recov_rvv(bytes, p, q, dp, dq, pbmul, qmul);
+	kernel_vector_end();
+}
+
+static void raid6_datap_recov_rvv(int disks, size_t bytes, int faila,
+		void **ptrs)
+{
+	u8 *p, *q, *dq;
+	const u8 *qmul;		/* Q multiplier table */
+
+	p = (u8 *)ptrs[disks - 2];
+	q = (u8 *)ptrs[disks - 1];
+
+	/*
+	 * Compute syndrome with zero for the missing data page
+	 * Use the dead data page as temporary storage for delta q
+	 */
+	dq = (u8 *)ptrs[faila];
+	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[disks - 1] = dq;
+
+	raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+	/* Restore pointer table */
+	ptrs[faila]     = dq;
+	ptrs[disks - 1] = q;
+
+	/* Now, pick the proper data tables */
+	qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
+
+	kernel_vector_begin();
+	__raid6_datap_recov_rvv(bytes, p, q, dq, qmul);
+	kernel_vector_end();
+}
+
+const struct raid6_recov_calls raid6_recov_rvv = {
+	.data2		= raid6_2data_recov_rvv,
+	.datap		= raid6_datap_recov_rvv,
+	.valid		= rvv_has_vector,
+	.name		= "rvv",
+	.priority	= 1,
+};
diff --git a/lib/raid6/rvv.c b/lib/raid6/rvv.c
new file mode 100644
index 000000000000..fd0ec33edb1e
--- /dev/null
+++ b/lib/raid6/rvv.c
@@ -0,0 +1,1269 @@ 
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * RAID-6 syndrome calculation using RISCV vector instructions
+ *
+ * Copyright 2024 Institute of Software, CAS.
+ * Author: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
+ *
+ * Based on neon.uc:
+ *	Copyright 2002-2004 H. Peter Anvin
+ */
+
+#include <asm/simd.h>
+#include <asm/vector.h>
+#include <crypto/internal/simd.h>
+#include <linux/raid/pq.h>
+#include <linux/types.h>
+
+#define NSIZE	(riscv_v_vsize / 32) /* NSIZE = vlenb */
+
+static int rvv_has_vector(void)
+{
+	return has_vector();
+}
+
+static void raid6_rvv1_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	int d, z, z0;
+	u8 *p, *q;
+
+	z0 = disks - 3;		/* Highest data disk */
+	p = dptr[z0+1];		/* XOR parity */
+	q = dptr[z0+2];		/* RS syndrome */
+
+	asm volatile (
+		".option	push\n"
+		".option	arch,+v\n"
+		"vsetvli	t0, x0, e8, m1, ta, ma\n"
+		".option	pop\n"
+	);
+
+	 /* v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 */
+	for (d = 0 ; d < bytes ; d += NSIZE*1) {
+		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+		asm volatile (
+			".option	push\n"
+			".option	arch,+v\n"
+			"vle8.v		v0, (%[wp0])\n"
+			"vle8.v		v1, (%[wp0])\n"
+			".option	pop\n"
+			: :
+			[wp0]"r"(&dptr[z0][d+0*NSIZE])
+		);
+
+		for (z = z0-1 ; z >= 0 ; z--) {
+			/*
+			 * w2$$ = MASK(wq$$);
+			 * w1$$ = SHLBYTE(wq$$);
+			 * w2$$ &= NBYTES(0x1d);
+			 * w1$$ ^= w2$$;
+			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
+			 * wq$$ = w1$$ ^ wd$$;
+			 * wp$$ ^= wd$$;
+			 */
+			asm volatile (
+				".option	push\n"
+				".option	arch,+v\n"
+				"vsra.vi	v2, v1, 7\n"
+				"vsll.vi	v3, v1, 1\n"
+				"vand.vx	v2, v2, %[x1d]\n"
+				"vxor.vv	v3, v3, v2\n"
+				"vle8.v		v2, (%[wd0])\n"
+				"vxor.vv	v1, v3, v2\n"
+				"vxor.vv	v0, v0, v2\n"
+				".option	pop\n"
+				: :
+				[wd0]"r"(&dptr[z][d+0*NSIZE]),
+				[x1d]"r"(0x1d)
+			);
+		}
+
+		/*
+		 * *(unative_t *)&p[d+NSIZE*$$] = wp$$;
+		 * *(unative_t *)&q[d+NSIZE*$$] = wq$$;
+		 */
+		asm volatile (
+			".option	push\n"
+			".option	arch,+v\n"
+			"vse8.v		v0, (%[wp0])\n"
+			"vse8.v		v1, (%[wq0])\n"
+			".option	pop\n"
+			: :
+			[wp0]"r"(&p[d+NSIZE*0]),
+			[wq0]"r"(&q[d+NSIZE*0])
+		);
+	}
+}
+
+static void raid6_rvv1_xor_syndrome_real(int disks, int start, int stop,
+				    unsigned long bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	z0 = stop;		/* P/Q right side optimization */
+	p = dptr[disks-2];	/* XOR parity */
+	q = dptr[disks-1];	/* RS syndrome */
+
+	asm volatile (
+		".option	push\n"
+		".option	arch,+v\n"
+		"vsetvli	t0, x0, e8, m1, ta, ma\n"
+		".option	pop\n"
+	);
+
+	/* v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 */
+	for (d = 0 ; d < bytes ; d += NSIZE*1) {
+		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+		asm volatile (
+			".option	push\n"
+			".option	arch,+v\n"
+			"vle8.v		v0, (%[wp0])\n"
+			"vle8.v		v1, (%[wp0])\n"
+			".option	pop\n"
+			: :
+			[wp0]"r"(&dptr[z0][d+0*NSIZE])
+		);
+
+		/* P/Q data pages */
+		for (z = z0-1 ; z >= start ; z--) {
+			/*
+			 * w2$$ = MASK(wq$$);
+			 * w1$$ = SHLBYTE(wq$$);
+			 * w2$$ &= NBYTES(0x1d);
+			 * w1$$ ^= w2$$;
+			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
+			 * wq$$ = w1$$ ^ wd$$;
+			 * wp$$ ^= wd$$;
+			 */
+			asm volatile (
+				".option	push\n"
+				".option	arch,+v\n"
+				"vsra.vi	v2, v1, 7\n"
+				"vsll.vi	v3, v1, 1\n"
+				"vand.vx	v2, v2, %[x1d]\n"
+				"vxor.vv	v3, v3, v2\n"
+				"vle8.v		v2, (%[wd0])\n"
+				"vxor.vv	v1, v3, v2\n"
+				"vxor.vv	v0, v0, v2\n"
+				".option	pop\n"
+				: :
+				[wd0]"r"(&dptr[z][d+0*NSIZE]),
+				[x1d]"r"(0x1d)
+			);
+		}
+
+		/* P/Q left side optimization */
+		for (z = start-1 ; z >= 0 ; z--) {
+			/*
+			 * w2$$ = MASK(wq$$);
+			 * w1$$ = SHLBYTE(wq$$);
+			 * w2$$ &= NBYTES(0x1d);
+			 * wq$$ = w1$$ ^ w2$$;
+			 */
+			asm volatile (
+				".option	push\n"
+				".option	arch,+v\n"
+				"vsra.vi	v2, v1, 7\n"
+				"vsll.vi	v3, v1, 1\n"
+				"vand.vx	v2, v2, %[x1d]\n"
+				"vxor.vv	v1, v3, v2\n"
+				".option	pop\n"
+				: :
+				[x1d]"r"(0x1d)
+			);
+		}
+
+		/*
+		 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
+		 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
+		 * v0:wp0, v1:wq0, v2:p0, v3:q0
+		 */
+		asm volatile (
+			".option	push\n"
+			".option	arch,+v\n"
+			"vle8.v		v2, (%[wp0])\n"
+			"vle8.v		v3, (%[wq0])\n"
+			"vxor.vv	v2, v2, v0\n"
+			"vxor.vv	v3, v3, v1\n"
+			"vse8.v		v2, (%[wp0])\n"
+			"vse8.v		v3, (%[wq0])\n"
+			".option	pop\n"
+			: :
+			[wp0]"r"(&p[d+NSIZE*0]),
+			[wq0]"r"(&q[d+NSIZE*0])
+		);
+	}
+}
+
+static void raid6_rvv2_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	int d, z, z0;
+	u8 *p, *q;
+
+	z0 = disks - 3;		/* Highest data disk */
+	p = dptr[z0+1];		/* XOR parity */
+	q = dptr[z0+2];		/* RS syndrome */
+
+	asm volatile (
+		".option	push\n"
+		".option	arch,+v\n"
+		"vsetvli	t0, x0, e8, m1, ta, ma\n"
+		".option	pop\n"
+	);
+
+	/*
+	 * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
+	 * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
+	 */
+	for (d = 0 ; d < bytes ; d += NSIZE*2) {
+		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+		asm volatile (
+			".option	push\n"
+			".option	arch,+v\n"
+			"vle8.v		v0, (%[wp0])\n"
+			"vle8.v		v1, (%[wp0])\n"
+			"vle8.v		v4, (%[wp1])\n"
+			"vle8.v		v5, (%[wp1])\n"
+			".option	pop\n"
+			: :
+			[wp0]"r"(&dptr[z0][d+0*NSIZE]),
+			[wp1]"r"(&dptr[z0][d+1*NSIZE])
+		);
+
+		for (z = z0-1 ; z >= 0 ; z--) {
+			/*
+			 * w2$$ = MASK(wq$$);
+			 * w1$$ = SHLBYTE(wq$$);
+			 * w2$$ &= NBYTES(0x1d);
+			 * w1$$ ^= w2$$;
+			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
+			 * wq$$ = w1$$ ^ wd$$;
+			 * wp$$ ^= wd$$;
+			 */
+			asm volatile (
+				".option	push\n"
+				".option	arch,+v\n"
+				"vsra.vi	v2, v1, 7\n"
+				"vsll.vi	v3, v1, 1\n"
+				"vand.vx	v2, v2, %[x1d]\n"
+				"vxor.vv	v3, v3, v2\n"
+				"vle8.v		v2, (%[wd0])\n"
+				"vxor.vv	v1, v3, v2\n"
+				"vxor.vv	v0, v0, v2\n"
+
+				"vsra.vi	v6, v5, 7\n"
+				"vsll.vi	v7, v5, 1\n"
+				"vand.vx	v6, v6, %[x1d]\n"
+				"vxor.vv	v7, v7, v6\n"
+				"vle8.v		v6, (%[wd1])\n"
+				"vxor.vv	v5, v7, v6\n"
+				"vxor.vv	v4, v4, v6\n"
+				".option	pop\n"
+				: :
+				[wd0]"r"(&dptr[z][d+0*NSIZE]),
+				[wd1]"r"(&dptr[z][d+1*NSIZE]),
+				[x1d]"r"(0x1d)
+			);
+		}
+
+		/*
+		 * *(unative_t *)&p[d+NSIZE*$$] = wp$$;
+		 * *(unative_t *)&q[d+NSIZE*$$] = wq$$;
+		 */
+		asm volatile (
+			".option	push\n"
+			".option	arch,+v\n"
+			"vse8.v		v0, (%[wp0])\n"
+			"vse8.v		v1, (%[wq0])\n"
+			"vse8.v		v4, (%[wp1])\n"
+			"vse8.v		v5, (%[wq1])\n"
+			".option	pop\n"
+			: :
+			[wp0]"r"(&p[d+NSIZE*0]),
+			[wq0]"r"(&q[d+NSIZE*0]),
+			[wp1]"r"(&p[d+NSIZE*1]),
+			[wq1]"r"(&q[d+NSIZE*1])
+		);
+	}
+}
+
+static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop,
+					 unsigned long bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	z0 = stop;		/* P/Q right side optimization */
+	p = dptr[disks-2];	/* XOR parity */
+	q = dptr[disks-1];	/* RS syndrome */
+
+	asm volatile (
+		".option	push\n"
+		".option	arch,+v\n"
+		"vsetvli	t0, x0, e8, m1, ta, ma\n"
+		".option	pop\n"
+	);
+
+	/*
+	 * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
+	 * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
+	 */
+	for (d = 0 ; d < bytes ; d += NSIZE*2) {
+		 /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+		asm volatile (
+			".option	push\n"
+			".option	arch,+v\n"
+			"vle8.v		v0, (%[wp0])\n"
+			"vle8.v		v1, (%[wp0])\n"
+			"vle8.v		v4, (%[wp1])\n"
+			"vle8.v		v5, (%[wp1])\n"
+			".option	pop\n"
+			: :
+			[wp0]"r"(&dptr[z0][d+0*NSIZE]),
+			[wp1]"r"(&dptr[z0][d+1*NSIZE])
+		);
+
+		/* P/Q data pages */
+		for (z = z0-1 ; z >= start ; z--) {
+			/*
+			 * w2$$ = MASK(wq$$);
+			 * w1$$ = SHLBYTE(wq$$);
+			 * w2$$ &= NBYTES(0x1d);
+			 * w1$$ ^= w2$$;
+			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
+			 * wq$$ = w1$$ ^ wd$$;
+			 * wp$$ ^= wd$$;
+			 */
+			asm volatile (
+				".option	push\n"
+				".option	arch,+v\n"
+				"vsra.vi	v2, v1, 7\n"
+				"vsll.vi	v3, v1, 1\n"
+				"vand.vx	v2, v2, %[x1d]\n"
+				"vxor.vv	v3, v3, v2\n"
+				"vle8.v		v2, (%[wd0])\n"
+				"vxor.vv	v1, v3, v2\n"
+				"vxor.vv	v0, v0, v2\n"
+
+				"vsra.vi	v6, v5, 7\n"
+				"vsll.vi	v7, v5, 1\n"
+				"vand.vx	v6, v6, %[x1d]\n"
+				"vxor.vv	v7, v7, v6\n"
+				"vle8.v		v6, (%[wd1])\n"
+				"vxor.vv	v5, v7, v6\n"
+				"vxor.vv	v4, v4, v6\n"
+				".option	pop\n"
+				: :
+				[wd0]"r"(&dptr[z][d+0*NSIZE]),
+				[wd1]"r"(&dptr[z][d+1*NSIZE]),
+				[x1d]"r"(0x1d)
+			);
+		}
+
+		/* P/Q left side optimization */
+		for (z = start-1 ; z >= 0 ; z--) {
+			/*
+			 * w2$$ = MASK(wq$$);
+			 * w1$$ = SHLBYTE(wq$$);
+			 * w2$$ &= NBYTES(0x1d);
+			 * wq$$ = w1$$ ^ w2$$;
+			 */
+			asm volatile (
+				".option	push\n"
+				".option	arch,+v\n"
+				"vsra.vi	v2, v1, 7\n"
+				"vsll.vi	v3, v1, 1\n"
+				"vand.vx	v2, v2, %[x1d]\n"
+				"vxor.vv	v1, v3, v2\n"
+
+				"vsra.vi	v6, v5, 7\n"
+				"vsll.vi	v7, v5, 1\n"
+				"vand.vx	v6, v6, %[x1d]\n"
+				"vxor.vv	v5, v7, v6\n"
+				".option	pop\n"
+				: :
+				[x1d]"r"(0x1d)
+			);
+		}
+
+		/*
+		 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
+		 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
+		 * v0:wp0, v1:wq0, v2:p0, v3:q0
+		 * v4:wp1, v5:wq1, v6:p1, v7:q1
+		 */
+		asm volatile (
+			".option	push\n"
+			".option	arch,+v\n"
+			"vle8.v		v2, (%[wp0])\n"
+			"vle8.v		v3, (%[wq0])\n"
+			"vxor.vv	v2, v2, v0\n"
+			"vxor.vv	v3, v3, v1\n"
+			"vse8.v		v2, (%[wp0])\n"
+			"vse8.v		v3, (%[wq0])\n"
+
+			"vle8.v		v6, (%[wp1])\n"
+			"vle8.v		v7, (%[wq1])\n"
+			"vxor.vv	v6, v6, v4\n"
+			"vxor.vv	v7, v7, v5\n"
+			"vse8.v		v6, (%[wp1])\n"
+			"vse8.v		v7, (%[wq1])\n"
+			".option	pop\n"
+			: :
+			[wp0]"r"(&p[d+NSIZE*0]),
+			[wq0]"r"(&q[d+NSIZE*0]),
+			[wp1]"r"(&p[d+NSIZE*1]),
+			[wq1]"r"(&q[d+NSIZE*1])
+		);
+	}
+}
+
+static void raid6_rvv4_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	int d, z, z0;
+	u8 *p, *q;
+
+	z0 = disks - 3;	/* Highest data disk */
+	p = dptr[z0+1];	/* XOR parity */
+	q = dptr[z0+2];	/* RS syndrome */
+
+	asm volatile (
+		".option	push\n"
+		".option	arch,+v\n"
+		"vsetvli	t0, x0, e8, m1, ta, ma\n"
+		".option	pop\n"
+	);
+
+	/*
+	 * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
+	 * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
+	 * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12
+	 * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13
+	 */
+	for (d = 0 ; d < bytes ; d += NSIZE*4) {
+		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+		asm volatile (
+			".option	push\n"
+			".option	arch,+v\n"
+			"vle8.v		v0, (%[wp0])\n"
+			"vle8.v		v1, (%[wp0])\n"
+			"vle8.v		v4, (%[wp1])\n"
+			"vle8.v		v5, (%[wp1])\n"
+			"vle8.v		v8, (%[wp2])\n"
+			"vle8.v		v9, (%[wp2])\n"
+			"vle8.v		v12, (%[wp3])\n"
+			"vle8.v		v13, (%[wp3])\n"
+			".option	pop\n"
+			: :
+			[wp0]"r"(&dptr[z0][d+0*NSIZE]),
+			[wp1]"r"(&dptr[z0][d+1*NSIZE]),
+			[wp2]"r"(&dptr[z0][d+2*NSIZE]),
+			[wp3]"r"(&dptr[z0][d+3*NSIZE])
+		);
+
+		for (z = z0-1 ; z >= 0 ; z--) {
+			/*
+			 * w2$$ = MASK(wq$$);
+			 * w1$$ = SHLBYTE(wq$$);
+			 * w2$$ &= NBYTES(0x1d);
+			 * w1$$ ^= w2$$;
+			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
+			 * wq$$ = w1$$ ^ wd$$;
+			 * wp$$ ^= wd$$;
+			 */
+			asm volatile (
+				".option	push\n"
+				".option	arch,+v\n"
+				"vsra.vi	v2, v1, 7\n"
+				"vsll.vi	v3, v1, 1\n"
+				"vand.vx	v2, v2, %[x1d]\n"
+				"vxor.vv	v3, v3, v2\n"
+				"vle8.v		v2, (%[wd0])\n"
+				"vxor.vv	v1, v3, v2\n"
+				"vxor.vv	v0, v0, v2\n"
+
+				"vsra.vi	v6, v5, 7\n"
+				"vsll.vi	v7, v5, 1\n"
+				"vand.vx	v6, v6, %[x1d]\n"
+				"vxor.vv	v7, v7, v6\n"
+				"vle8.v		v6, (%[wd1])\n"
+				"vxor.vv	v5, v7, v6\n"
+				"vxor.vv	v4, v4, v6\n"
+
+				"vsra.vi	v10, v9, 7\n"
+				"vsll.vi	v11, v9, 1\n"
+				"vand.vx	v10, v10, %[x1d]\n"
+				"vxor.vv	v11, v11, v10\n"
+				"vle8.v		v10, (%[wd2])\n"
+				"vxor.vv	v9, v11, v10\n"
+				"vxor.vv	v8, v8, v10\n"
+
+				"vsra.vi	v14, v13, 7\n"
+				"vsll.vi	v15, v13, 1\n"
+				"vand.vx	v14, v14, %[x1d]\n"
+				"vxor.vv	v15, v15, v14\n"
+				"vle8.v		v14, (%[wd3])\n"
+				"vxor.vv	v13, v15, v14\n"
+				"vxor.vv	v12, v12, v14\n"
+				".option	pop\n"
+				: :
+				[wd0]"r"(&dptr[z][d+0*NSIZE]),
+				[wd1]"r"(&dptr[z][d+1*NSIZE]),
+				[wd2]"r"(&dptr[z][d+2*NSIZE]),
+				[wd3]"r"(&dptr[z][d+3*NSIZE]),
+				[x1d]"r"(0x1d)
+			);
+		}
+
+		/*
+		 * *(unative_t *)&p[d+NSIZE*$$] = wp$$;
+		 * *(unative_t *)&q[d+NSIZE*$$] = wq$$;
+		 */
+		asm volatile (
+			".option	push\n"
+			".option	arch,+v\n"
+			"vse8.v		v0, (%[wp0])\n"
+			"vse8.v		v1, (%[wq0])\n"
+			"vse8.v		v4, (%[wp1])\n"
+			"vse8.v		v5, (%[wq1])\n"
+			"vse8.v		v8, (%[wp2])\n"
+			"vse8.v		v9, (%[wq2])\n"
+			"vse8.v		v12, (%[wp3])\n"
+			"vse8.v		v13, (%[wq3])\n"
+			".option	pop\n"
+			: :
+			[wp0]"r"(&p[d+NSIZE*0]),
+			[wq0]"r"(&q[d+NSIZE*0]),
+			[wp1]"r"(&p[d+NSIZE*1]),
+			[wq1]"r"(&q[d+NSIZE*1]),
+			[wp2]"r"(&p[d+NSIZE*2]),
+			[wq2]"r"(&q[d+NSIZE*2]),
+			[wp3]"r"(&p[d+NSIZE*3]),
+			[wq3]"r"(&q[d+NSIZE*3])
+		);
+	}
+}
+
+static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop,
+					unsigned long bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	z0 = stop;		/* P/Q right side optimization */
+	p = dptr[disks-2];	/* XOR parity */
+	q = dptr[disks-1];	/* RS syndrome */
+
+	asm volatile (
+		".option	push\n"
+		".option	arch,+v\n"
+		"vsetvli	t0, x0, e8, m1, ta, ma\n"
+		".option	pop\n"
+	);
+
+	/*
+	 * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
+	 * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
+	 * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12
+	 * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13
+	 */
+	for (d = 0 ; d < bytes ; d += NSIZE*4) {
+		 /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+		asm volatile (
+			".option	push\n"
+			".option	arch,+v\n"
+			"vle8.v		v0, (%[wp0])\n"
+			"vle8.v		v1, (%[wp0])\n"
+			"vle8.v		v4, (%[wp1])\n"
+			"vle8.v		v5, (%[wp1])\n"
+			"vle8.v		v8, (%[wp2])\n"
+			"vle8.v		v9, (%[wp2])\n"
+			"vle8.v		v12, (%[wp3])\n"
+			"vle8.v		v13, (%[wp3])\n"
+			".option	pop\n"
+			: :
+			[wp0]"r"(&dptr[z0][d+0*NSIZE]),
+			[wp1]"r"(&dptr[z0][d+1*NSIZE]),
+			[wp2]"r"(&dptr[z0][d+2*NSIZE]),
+			[wp3]"r"(&dptr[z0][d+3*NSIZE])
+		);
+
+		/* P/Q data pages */
+		for (z = z0-1 ; z >= start ; z--) {
+			/*
+			 * w2$$ = MASK(wq$$);
+			 * w1$$ = SHLBYTE(wq$$);
+			 * w2$$ &= NBYTES(0x1d);
+			 * w1$$ ^= w2$$;
+			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
+			 * wq$$ = w1$$ ^ wd$$;
+			 * wp$$ ^= wd$$;
+			 */
+			asm volatile (
+				".option	push\n"
+				".option	arch,+v\n"
+				"vsra.vi	v2, v1, 7\n"
+				"vsll.vi	v3, v1, 1\n"
+				"vand.vx	v2, v2, %[x1d]\n"
+				"vxor.vv	v3, v3, v2\n"
+				"vle8.v		v2, (%[wd0])\n"
+				"vxor.vv	v1, v3, v2\n"
+				"vxor.vv	v0, v0, v2\n"
+
+				"vsra.vi	v6, v5, 7\n"
+				"vsll.vi	v7, v5, 1\n"
+				"vand.vx	v6, v6, %[x1d]\n"
+				"vxor.vv	v7, v7, v6\n"
+				"vle8.v		v6, (%[wd1])\n"
+				"vxor.vv	v5, v7, v6\n"
+				"vxor.vv	v4, v4, v6\n"
+
+				"vsra.vi	v10, v9, 7\n"
+				"vsll.vi	v11, v9, 1\n"
+				"vand.vx	v10, v10, %[x1d]\n"
+				"vxor.vv	v11, v11, v10\n"
+				"vle8.v		v10, (%[wd2])\n"
+				"vxor.vv	v9, v11, v10\n"
+				"vxor.vv	v8, v8, v10\n"
+
+				"vsra.vi	v14, v13, 7\n"
+				"vsll.vi	v15, v13, 1\n"
+				"vand.vx	v14, v14, %[x1d]\n"
+				"vxor.vv	v15, v15, v14\n"
+				"vle8.v		v14, (%[wd3])\n"
+				"vxor.vv	v13, v15, v14\n"
+				"vxor.vv	v12, v12, v14\n"
+				".option	pop\n"
+				: :
+				[wd0]"r"(&dptr[z][d+0*NSIZE]),
+				[wd1]"r"(&dptr[z][d+1*NSIZE]),
+				[wd2]"r"(&dptr[z][d+2*NSIZE]),
+				[wd3]"r"(&dptr[z][d+3*NSIZE]),
+				[x1d]"r"(0x1d)
+			);
+		}
+
+		/* P/Q left side optimization */
+		for (z = start-1 ; z >= 0 ; z--) {
+			/*
+			 * w2$$ = MASK(wq$$);
+			 * w1$$ = SHLBYTE(wq$$);
+			 * w2$$ &= NBYTES(0x1d);
+			 * wq$$ = w1$$ ^ w2$$;
+			 */
+			asm volatile (
+				".option	push\n"
+				".option	arch,+v\n"
+				"vsra.vi	v2, v1, 7\n"
+				"vsll.vi	v3, v1, 1\n"
+				"vand.vx	v2, v2, %[x1d]\n"
+				"vxor.vv	v1, v3, v2\n"
+
+				"vsra.vi	v6, v5, 7\n"
+				"vsll.vi	v7, v5, 1\n"
+				"vand.vx	v6, v6, %[x1d]\n"
+				"vxor.vv	v5, v7, v6\n"
+
+				"vsra.vi	v10, v9, 7\n"
+				"vsll.vi	v11, v9, 1\n"
+				"vand.vx	v10, v10, %[x1d]\n"
+				"vxor.vv	v9, v11, v10\n"
+
+				"vsra.vi	v14, v13, 7\n"
+				"vsll.vi	v15, v13, 1\n"
+				"vand.vx	v14, v14, %[x1d]\n"
+				"vxor.vv	v13, v15, v14\n"
+				".option	pop\n"
+				: :
+				[x1d]"r"(0x1d)
+			);
+		}
+
+		/*
+		 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
+		 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
+		 * v0:wp0, v1:wq0, v2:p0, v3:q0
+		 * v4:wp1, v5:wq1, v6:p1, v7:q1
+		 * v8:wp2, v9:wq2, v10:p2, v11:q2
+		 * v12:wp3, v13:wq3, v14:p3, v15:q3
+		 */
+		asm volatile (
+			".option	push\n"
+			".option	arch,+v\n"
+			"vle8.v		v2, (%[wp0])\n"
+			"vle8.v		v3, (%[wq0])\n"
+			"vxor.vv	v2, v2, v0\n"
+			"vxor.vv	v3, v3, v1\n"
+			"vse8.v		v2, (%[wp0])\n"
+			"vse8.v		v3, (%[wq0])\n"
+
+			"vle8.v		v6, (%[wp1])\n"
+			"vle8.v		v7, (%[wq1])\n"
+			"vxor.vv	v6, v6, v4\n"
+			"vxor.vv	v7, v7, v5\n"
+			"vse8.v		v6, (%[wp1])\n"
+			"vse8.v		v7, (%[wq1])\n"
+
+			"vle8.v		v10, (%[wp2])\n"
+			"vle8.v		v11, (%[wq2])\n"
+			"vxor.vv	v10, v10, v8\n"
+			"vxor.vv	v11, v11, v9\n"
+			"vse8.v		v10, (%[wp2])\n"
+			"vse8.v		v11, (%[wq2])\n"
+
+			"vle8.v		v14, (%[wp3])\n"
+			"vle8.v		v15, (%[wq3])\n"
+			"vxor.vv	v14, v14, v12\n"
+			"vxor.vv	v15, v15, v13\n"
+			"vse8.v		v14, (%[wp3])\n"
+			"vse8.v		v15, (%[wq3])\n"
+			".option	pop\n"
+			: :
+			[wp0]"r"(&p[d+NSIZE*0]),
+			[wq0]"r"(&q[d+NSIZE*0]),
+			[wp1]"r"(&p[d+NSIZE*1]),
+			[wq1]"r"(&q[d+NSIZE*1]),
+			[wp2]"r"(&p[d+NSIZE*2]),
+			[wq2]"r"(&q[d+NSIZE*2]),
+			[wp3]"r"(&p[d+NSIZE*3]),
+			[wq3]"r"(&q[d+NSIZE*3])
+		);
+	}
+}
+
+static void raid6_rvv8_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	int d, z, z0;
+	u8 *p, *q;
+
+	z0 = disks - 3;	/* Highest data disk */
+	p = dptr[z0+1];	/* XOR parity */
+	q = dptr[z0+2];	/* RS syndrome */
+
+	asm volatile (
+		".option	push\n"
+		".option	arch,+v\n"
+		"vsetvli	t0, x0, e8, m1, ta, ma\n"
+		".option	pop\n"
+	);
+
+	/*
+	 * v0:wp0,   v1:wq0,  v2:wd0/w20,  v3:w10
+	 * v4:wp1,   v5:wq1,  v6:wd1/w21,  v7:w11
+	 * v8:wp2,   v9:wq2, v10:wd2/w22, v11:w12
+	 * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13
+	 * v16:wp4, v17:wq4, v18:wd4/w24, v19:w14
+	 * v20:wp5, v21:wq5, v22:wd5/w25, v23:w15
+	 * v24:wp6, v25:wq6, v26:wd6/w26, v27:w16
+	 * v28:wp7, v29:wq7, v30:wd7/w27, v31:w17
+	 */
+	for (d = 0; d < bytes; d += NSIZE*8) {
+		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+		asm volatile (
+			".option	push\n"
+			".option	arch,+v\n"
+			"vle8.v		v0, (%[wp0])\n"
+			"vle8.v		v1, (%[wp0])\n"
+			"vle8.v		v4, (%[wp1])\n"
+			"vle8.v		v5, (%[wp1])\n"
+			"vle8.v		v8, (%[wp2])\n"
+			"vle8.v		v9, (%[wp2])\n"
+			"vle8.v		v12, (%[wp3])\n"
+			"vle8.v		v13, (%[wp3])\n"
+			"vle8.v		v16, (%[wp4])\n"
+			"vle8.v		v17, (%[wp4])\n"
+			"vle8.v		v20, (%[wp5])\n"
+			"vle8.v		v21, (%[wp5])\n"
+			"vle8.v		v24, (%[wp6])\n"
+			"vle8.v		v25, (%[wp6])\n"
+			"vle8.v		v28, (%[wp7])\n"
+			"vle8.v		v29, (%[wp7])\n"
+			".option	pop\n"
+			: :
+			[wp0]"r"(&dptr[z0][d+0*NSIZE]),
+			[wp1]"r"(&dptr[z0][d+1*NSIZE]),
+			[wp2]"r"(&dptr[z0][d+2*NSIZE]),
+			[wp3]"r"(&dptr[z0][d+3*NSIZE]),
+			[wp4]"r"(&dptr[z0][d+4*NSIZE]),
+			[wp5]"r"(&dptr[z0][d+5*NSIZE]),
+			[wp6]"r"(&dptr[z0][d+6*NSIZE]),
+			[wp7]"r"(&dptr[z0][d+7*NSIZE])
+		);
+
+		for (z = z0-1; z >= 0; z--) {
+			/*
+			 * w2$$ = MASK(wq$$);
+			 * w1$$ = SHLBYTE(wq$$);
+			 * w2$$ &= NBYTES(0x1d);
+			 * w1$$ ^= w2$$;
+			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
+			 * wq$$ = w1$$ ^ wd$$;
+			 * wp$$ ^= wd$$;
+			 */
+			asm volatile (
+				".option	push\n"
+				".option	arch,+v\n"
+				"vsra.vi	v2, v1, 7\n"
+				"vsll.vi	v3, v1, 1\n"
+				"vand.vx	v2, v2, %[x1d]\n"
+				"vxor.vv	v3, v3, v2\n"
+				"vle8.v		v2, (%[wd0])\n"
+				"vxor.vv	v1, v3, v2\n"
+				"vxor.vv	v0, v0, v2\n"
+
+				"vsra.vi	v6, v5, 7\n"
+				"vsll.vi	v7, v5, 1\n"
+				"vand.vx	v6, v6, %[x1d]\n"
+				"vxor.vv	v7, v7, v6\n"
+				"vle8.v		v6, (%[wd1])\n"
+				"vxor.vv	v5, v7, v6\n"
+				"vxor.vv	v4, v4, v6\n"
+
+				"vsra.vi	v10, v9, 7\n"
+				"vsll.vi	v11, v9, 1\n"
+				"vand.vx	v10, v10, %[x1d]\n"
+				"vxor.vv	v11, v11, v10\n"
+				"vle8.v		v10, (%[wd2])\n"
+				"vxor.vv	v9, v11, v10\n"
+				"vxor.vv	v8, v8, v10\n"
+
+				"vsra.vi	v14, v13, 7\n"
+				"vsll.vi	v15, v13, 1\n"
+				"vand.vx	v14, v14, %[x1d]\n"
+				"vxor.vv	v15, v15, v14\n"
+				"vle8.v		v14, (%[wd3])\n"
+				"vxor.vv	v13, v15, v14\n"
+				"vxor.vv	v12, v12, v14\n"
+
+				"vsra.vi	v18, v17, 7\n"
+				"vsll.vi	v19, v17, 1\n"
+				"vand.vx	v18, v18, %[x1d]\n"
+				"vxor.vv	v19, v19, v18\n"
+				"vle8.v		v18, (%[wd4])\n"
+				"vxor.vv	v17, v19, v18\n"
+				"vxor.vv	v16, v16, v18\n"
+
+				"vsra.vi	v22, v21, 7\n"
+				"vsll.vi	v23, v21, 1\n"
+				"vand.vx	v22, v22, %[x1d]\n"
+				"vxor.vv	v23, v23, v22\n"
+				"vle8.v		v22, (%[wd5])\n"
+				"vxor.vv	v21, v23, v22\n"
+				"vxor.vv	v20, v20, v22\n"
+
+				"vsra.vi	v26, v25, 7\n"
+				"vsll.vi	v27, v25, 1\n"
+				"vand.vx	v26, v26, %[x1d]\n"
+				"vxor.vv	v27, v27, v26\n"
+				"vle8.v		v26, (%[wd6])\n"
+				"vxor.vv	v25, v27, v26\n"
+				"vxor.vv	v24, v24, v26\n"
+
+				"vsra.vi	v30, v29, 7\n"
+				"vsll.vi	v31, v29, 1\n"
+				"vand.vx	v30, v30, %[x1d]\n"
+				"vxor.vv	v31, v31, v30\n"
+				"vle8.v		v30, (%[wd7])\n"
+				"vxor.vv	v29, v31, v30\n"
+				"vxor.vv	v28, v28, v30\n"
+				".option	pop\n"
+				: :
+				[wd0]"r"(&dptr[z][d+0*NSIZE]),
+				[wd1]"r"(&dptr[z][d+1*NSIZE]),
+				[wd2]"r"(&dptr[z][d+2*NSIZE]),
+				[wd3]"r"(&dptr[z][d+3*NSIZE]),
+				[wd4]"r"(&dptr[z][d+4*NSIZE]),
+				[wd5]"r"(&dptr[z][d+5*NSIZE]),
+				[wd6]"r"(&dptr[z][d+6*NSIZE]),
+				[wd7]"r"(&dptr[z][d+7*NSIZE]),
+				[x1d]"r"(0x1d)
+			);
+		}
+
+		/*
+		 * *(unative_t *)&p[d+NSIZE*$$] = wp$$;
+		 * *(unative_t *)&q[d+NSIZE*$$] = wq$$;
+		 */
+		asm volatile (
+			".option	push\n"
+			".option	arch,+v\n"
+			"vse8.v		v0, (%[wp0])\n"
+			"vse8.v		v1, (%[wq0])\n"
+			"vse8.v		v4, (%[wp1])\n"
+			"vse8.v		v5, (%[wq1])\n"
+			"vse8.v		v8, (%[wp2])\n"
+			"vse8.v		v9, (%[wq2])\n"
+			"vse8.v		v12, (%[wp3])\n"
+			"vse8.v		v13, (%[wq3])\n"
+			"vse8.v		v16, (%[wp4])\n"
+			"vse8.v		v17, (%[wq4])\n"
+			"vse8.v		v20, (%[wp5])\n"
+			"vse8.v		v21, (%[wq5])\n"
+			"vse8.v		v24, (%[wp6])\n"
+			"vse8.v		v25, (%[wq6])\n"
+			"vse8.v		v28, (%[wp7])\n"
+			"vse8.v		v29, (%[wq7])\n"
+			".option	pop\n"
+			: :
+			[wp0]"r"(&p[d+NSIZE*0]),
+			[wq0]"r"(&q[d+NSIZE*0]),
+			[wp1]"r"(&p[d+NSIZE*1]),
+			[wq1]"r"(&q[d+NSIZE*1]),
+			[wp2]"r"(&p[d+NSIZE*2]),
+			[wq2]"r"(&q[d+NSIZE*2]),
+			[wp3]"r"(&p[d+NSIZE*3]),
+			[wq3]"r"(&q[d+NSIZE*3]),
+			[wp4]"r"(&p[d+NSIZE*4]),
+			[wq4]"r"(&q[d+NSIZE*4]),
+			[wp5]"r"(&p[d+NSIZE*5]),
+			[wq5]"r"(&q[d+NSIZE*5]),
+			[wp6]"r"(&p[d+NSIZE*6]),
+			[wq6]"r"(&q[d+NSIZE*6]),
+			[wp7]"r"(&p[d+NSIZE*7]),
+			[wq7]"r"(&q[d+NSIZE*7])
+		);
+	}
+}
+
+static void raid6_rvv8_xor_syndrome_real(int disks, int start, int stop,
+					unsigned long bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	z0 = stop;		/* P/Q right side optimization */
+	p = dptr[disks-2];	/* XOR parity */
+	q = dptr[disks-1];	/* RS syndrome */
+
+	asm volatile (
+		".option	push\n"
+		".option	arch,+v\n"
+		"vsetvli	t0, x0, e8, m1, ta, ma\n"
+		".option	pop\n"
+	);
+
+	/*
+	 * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
+	 * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
+	 * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12
+	 * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13
+	 * v16:wp4, v17:wq4, v18:wd4/w24, v19:w14
+	 * v20:wp5, v21:wq5, v22:wd5/w25, v23:w15
+	 * v24:wp6, v25:wq6, v26:wd6/w26, v27:w16
+	 * v28:wp7, v29:wq7, v30:wd7/w27, v31:w17
+	 */
+	for (d = 0; d < bytes; d += NSIZE*8) {
+		 /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+		asm volatile (
+			".option	push\n"
+			".option	arch,+v\n"
+			"vle8.v		v0, (%[wp0])\n"
+			"vle8.v		v1, (%[wp0])\n"
+			"vle8.v		v4, (%[wp1])\n"
+			"vle8.v		v5, (%[wp1])\n"
+			"vle8.v		v8, (%[wp2])\n"
+			"vle8.v		v9, (%[wp2])\n"
+			"vle8.v		v12, (%[wp3])\n"
+			"vle8.v		v13, (%[wp3])\n"
+			"vle8.v		v16, (%[wp4])\n"
+			"vle8.v		v17, (%[wp4])\n"
+			"vle8.v		v20, (%[wp5])\n"
+			"vle8.v		v21, (%[wp5])\n"
+			"vle8.v		v24, (%[wp6])\n"
+			"vle8.v		v25, (%[wp6])\n"
+			"vle8.v		v28, (%[wp7])\n"
+			"vle8.v		v29, (%[wp7])\n"
+			".option	pop\n"
+			: :
+			[wp0]"r"(&dptr[z0][d+0*NSIZE]),
+			[wp1]"r"(&dptr[z0][d+1*NSIZE]),
+			[wp2]"r"(&dptr[z0][d+2*NSIZE]),
+			[wp3]"r"(&dptr[z0][d+3*NSIZE]),
+			[wp4]"r"(&dptr[z0][d+4*NSIZE]),
+			[wp5]"r"(&dptr[z0][d+5*NSIZE]),
+			[wp6]"r"(&dptr[z0][d+6*NSIZE]),
+			[wp7]"r"(&dptr[z0][d+7*NSIZE])
+		);
+
+		/* P/Q data pages */
+		for (z = z0-1; z >= start; z--) {
+			/*
+			 * w2$$ = MASK(wq$$);
+			 * w1$$ = SHLBYTE(wq$$);
+			 * w2$$ &= NBYTES(0x1d);
+			 * w1$$ ^= w2$$;
+			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
+			 * wq$$ = w1$$ ^ wd$$;
+			 * wp$$ ^= wd$$;
+			 */
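+			/*
+			 * The vsra.vi/vsll.vi/vand.vx/vxor.vv sequence below is
+			 * the GF(2^8) multiply-by-2 from int.uc: the arithmetic
+			 * shift right by 7 spreads each byte's top bit into a
+			 * 0x00/0xff mask, which selects the reduction constant
+			 * 0x1d for the bytes that overflow on the left shift.
+			 */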
+			asm volatile (
+				".option	push\n"
+				".option	arch,+v\n"
+				"vsra.vi	v2, v1, 7\n"
+				"vsll.vi	v3, v1, 1\n"
+				"vand.vx	v2, v2, %[x1d]\n"
+				"vxor.vv	v3, v3, v2\n"
+				"vle8.v		v2, (%[wd0])\n"
+				"vxor.vv	v1, v3, v2\n"
+				"vxor.vv	v0, v0, v2\n"
+
+				"vsra.vi	v6, v5, 7\n"
+				"vsll.vi	v7, v5, 1\n"
+				"vand.vx	v6, v6, %[x1d]\n"
+				"vxor.vv	v7, v7, v6\n"
+				"vle8.v		v6, (%[wd1])\n"
+				"vxor.vv	v5, v7, v6\n"
+				"vxor.vv	v4, v4, v6\n"
+
+				"vsra.vi	v10, v9, 7\n"
+				"vsll.vi	v11, v9, 1\n"
+				"vand.vx	v10, v10, %[x1d]\n"
+				"vxor.vv	v11, v11, v10\n"
+				"vle8.v		v10, (%[wd2])\n"
+				"vxor.vv	v9, v11, v10\n"
+				"vxor.vv	v8, v8, v10\n"
+
+				"vsra.vi	v14, v13, 7\n"
+				"vsll.vi	v15, v13, 1\n"
+				"vand.vx	v14, v14, %[x1d]\n"
+				"vxor.vv	v15, v15, v14\n"
+				"vle8.v		v14, (%[wd3])\n"
+				"vxor.vv	v13, v15, v14\n"
+				"vxor.vv	v12, v12, v14\n"
+
+				"vsra.vi	v18, v17, 7\n"
+				"vsll.vi	v19, v17, 1\n"
+				"vand.vx	v18, v18, %[x1d]\n"
+				"vxor.vv	v19, v19, v18\n"
+				"vle8.v		v18, (%[wd4])\n"
+				"vxor.vv	v17, v19, v18\n"
+				"vxor.vv	v16, v16, v18\n"
+
+				"vsra.vi	v22, v21, 7\n"
+				"vsll.vi	v23, v21, 1\n"
+				"vand.vx	v22, v22, %[x1d]\n"
+				"vxor.vv	v23, v23, v22\n"
+				"vle8.v		v22, (%[wd5])\n"
+				"vxor.vv	v21, v23, v22\n"
+				"vxor.vv	v20, v20, v22\n"
+
+				"vsra.vi	v26, v25, 7\n"
+				"vsll.vi	v27, v25, 1\n"
+				"vand.vx	v26, v26, %[x1d]\n"
+				"vxor.vv	v27, v27, v26\n"
+				"vle8.v		v26, (%[wd6])\n"
+				"vxor.vv	v25, v27, v26\n"
+				"vxor.vv	v24, v24, v26\n"
+
+				"vsra.vi	v30, v29, 7\n"
+				"vsll.vi	v31, v29, 1\n"
+				"vand.vx	v30, v30, %[x1d]\n"
+				"vxor.vv	v31, v31, v30\n"
+				"vle8.v		v30, (%[wd7])\n"
+				"vxor.vv	v29, v31, v30\n"
+				"vxor.vv	v28, v28, v30\n"
+				".option	pop\n"
+				: :
+				[wd0]"r"(&dptr[z][d+0*NSIZE]),
+				[wd1]"r"(&dptr[z][d+1*NSIZE]),
+				[wd2]"r"(&dptr[z][d+2*NSIZE]),
+				[wd3]"r"(&dptr[z][d+3*NSIZE]),
+				[wd4]"r"(&dptr[z][d+4*NSIZE]),
+				[wd5]"r"(&dptr[z][d+5*NSIZE]),
+				[wd6]"r"(&dptr[z][d+6*NSIZE]),
+				[wd7]"r"(&dptr[z][d+7*NSIZE]),
+				[x1d]"r"(0x1d)
+			);
+		}
+
+		/* P/Q left side optimization */
+		for (z = start-1; z >= 0; z--) {
+			/*
+			 * w2$$ = MASK(wq$$);
+			 * w1$$ = SHLBYTE(wq$$);
+			 * w2$$ &= NBYTES(0x1d);
+			 * wq$$ = w1$$ ^ w2$$;
+			 */
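+			/*
+			 * Disks below 'start' are not part of this update, so
+			 * wp$$ stays as is; wq$$ only needs one more doubling
+			 * per remaining disk so it is scaled by the correct
+			 * power of the generator before being folded into q.
+			 */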
+			asm volatile (
+				".option	push\n"
+				".option	arch,+v\n"
+				"vsra.vi	v2, v1, 7\n"
+				"vsll.vi	v3, v1, 1\n"
+				"vand.vx	v2, v2, %[x1d]\n"
+				"vxor.vv	v1, v3, v2\n"
+
+				"vsra.vi	v6, v5, 7\n"
+				"vsll.vi	v7, v5, 1\n"
+				"vand.vx	v6, v6, %[x1d]\n"
+				"vxor.vv	v5, v7, v6\n"
+
+				"vsra.vi	v10, v9, 7\n"
+				"vsll.vi	v11, v9, 1\n"
+				"vand.vx	v10, v10, %[x1d]\n"
+				"vxor.vv	v9, v11, v10\n"
+
+				"vsra.vi	v14, v13, 7\n"
+				"vsll.vi	v15, v13, 1\n"
+				"vand.vx	v14, v14, %[x1d]\n"
+				"vxor.vv	v13, v15, v14\n"
+
+				"vsra.vi	v18, v17, 7\n"
+				"vsll.vi	v19, v17, 1\n"
+				"vand.vx	v18, v18, %[x1d]\n"
+				"vxor.vv	v17, v19, v18\n"
+
+				"vsra.vi	v22, v21, 7\n"
+				"vsll.vi	v23, v21, 1\n"
+				"vand.vx	v22, v22, %[x1d]\n"
+				"vxor.vv	v21, v23, v22\n"
+
+				"vsra.vi	v26, v25, 7\n"
+				"vsll.vi	v27, v25, 1\n"
+				"vand.vx	v26, v26, %[x1d]\n"
+				"vxor.vv	v25, v27, v26\n"
+
+				"vsra.vi	v30, v29, 7\n"
+				"vsll.vi	v31, v29, 1\n"
+				"vand.vx	v30, v30, %[x1d]\n"
+				"vxor.vv	v29, v31, v30\n"
+				".option	pop\n"
+				: :
+				[x1d]"r"(0x1d)
+			);
+		}
+
+		/*
+		 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
+		 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
+		 * v0:wp0, v1:wq0, v2:p0, v3:q0
+		 * v4:wp1, v5:wq1, v6:p1, v7:q1
+		 * v8:wp2, v9:wq2, v10:p2, v11:q2
+		 * v12:wp3, v13:wq3, v14:p3, v15:q3
+		 * v16:wp4, v17:wq4, v18:p4, v19:q4
+		 * v20:wp5, v21:wq5, v22:p5, v23:q5
+		 * v24:wp6, v25:wq6, v26:p6, v27:q6
+		 * v28:wp7, v29:wq7, v30:p7, v31:q7
+		 */
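+		/*
+		 * Unlike the gen_syndrome path, the accumulated wp$$/wq$$ are
+		 * folded into the existing P/Q blocks with a read-modify-write
+		 * rather than a plain store.
+		 */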
+		asm volatile (
+			".option	push\n"
+			".option	arch,+v\n"
+			"vle8.v		v2, (%[wp0])\n"
+			"vle8.v		v3, (%[wq0])\n"
+			"vxor.vv	v2, v2, v0\n"
+			"vxor.vv	v3, v3, v1\n"
+			"vse8.v		v2, (%[wp0])\n"
+			"vse8.v		v3, (%[wq0])\n"
+
+			"vle8.v		v6, (%[wp1])\n"
+			"vle8.v		v7, (%[wq1])\n"
+			"vxor.vv	v6, v6, v4\n"
+			"vxor.vv	v7, v7, v5\n"
+			"vse8.v		v6, (%[wp1])\n"
+			"vse8.v		v7, (%[wq1])\n"
+
+			"vle8.v		v10, (%[wp2])\n"
+			"vle8.v		v11, (%[wq2])\n"
+			"vxor.vv	v10, v10, v8\n"
+			"vxor.vv	v11, v11, v9\n"
+			"vse8.v		v10, (%[wp2])\n"
+			"vse8.v		v11, (%[wq2])\n"
+
+			"vle8.v		v14, (%[wp3])\n"
+			"vle8.v		v15, (%[wq3])\n"
+			"vxor.vv	v14, v14, v12\n"
+			"vxor.vv	v15, v15, v13\n"
+			"vse8.v		v14, (%[wp3])\n"
+			"vse8.v		v15, (%[wq3])\n"
+
+			"vle8.v		v18, (%[wp4])\n"
+			"vle8.v		v19, (%[wq4])\n"
+			"vxor.vv	v18, v18, v16\n"
+			"vxor.vv	v19, v19, v17\n"
+			"vse8.v		v18, (%[wp4])\n"
+			"vse8.v		v19, (%[wq4])\n"
+
+			"vle8.v		v22, (%[wp5])\n"
+			"vle8.v		v23, (%[wq5])\n"
+			"vxor.vv	v22, v22, v20\n"
+			"vxor.vv	v23, v23, v21\n"
+			"vse8.v		v22, (%[wp5])\n"
+			"vse8.v		v23, (%[wq5])\n"
+
+			"vle8.v		v26, (%[wp6])\n"
+			"vle8.v		v27, (%[wq6])\n"
+			"vxor.vv	v26, v26, v24\n"
+			"vxor.vv	v27, v27, v25\n"
+			"vse8.v		v26, (%[wp6])\n"
+			"vse8.v		v27, (%[wq6])\n"
+
+			"vle8.v		v30, (%[wp7])\n"
+			"vle8.v		v31, (%[wq7])\n"
+			"vxor.vv	v30, v30, v28\n"
+			"vxor.vv	v31, v31, v29\n"
+			"vse8.v		v30, (%[wp7])\n"
+			"vse8.v		v31, (%[wq7])\n"
+			".option	pop\n"
+			: :
+			[wp0]"r"(&p[d+NSIZE*0]),
+			[wq0]"r"(&q[d+NSIZE*0]),
+			[wp1]"r"(&p[d+NSIZE*1]),
+			[wq1]"r"(&q[d+NSIZE*1]),
+			[wp2]"r"(&p[d+NSIZE*2]),
+			[wq2]"r"(&q[d+NSIZE*2]),
+			[wp3]"r"(&p[d+NSIZE*3]),
+			[wq3]"r"(&q[d+NSIZE*3]),
+			[wp4]"r"(&p[d+NSIZE*4]),
+			[wq4]"r"(&q[d+NSIZE*4]),
+			[wp5]"r"(&p[d+NSIZE*5]),
+			[wq5]"r"(&q[d+NSIZE*5]),
+			[wp6]"r"(&p[d+NSIZE*6]),
+			[wq6]"r"(&q[d+NSIZE*6]),
+			[wp7]"r"(&p[d+NSIZE*7]),
+			[wq7]"r"(&q[d+NSIZE*7])
+		);
+	}
+}
+
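+/*
+ * The *_syndrome_real() routines above touch the V registers directly, so
+ * each exported entry point brackets the call with kernel_vector_begin()/
+ * kernel_vector_end() to enable in-kernel use of the vector unit around it.
+ * The resulting raid6_calls instances are named raid6_rvvx<N> after the
+ * unroll factor.
+ */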
+#define RAID6_RVV_WRAPPER(_n)						\
+	static void raid6_rvv ## _n ## _gen_syndrome(int disks,		\
+					size_t bytes, void **ptrs)	\
+	{								\
+		void raid6_rvv ## _n ## _gen_syndrome_real(int,	\
+						unsigned long, void**);	\
+		kernel_vector_begin();					\
+		raid6_rvv ## _n ## _gen_syndrome_real(disks,		\
+				(unsigned long)bytes, ptrs);		\
+		kernel_vector_end();					\
+	}								\
+	static void raid6_rvv ## _n ## _xor_syndrome(int disks,		\
+					int start, int stop,		\
+					size_t bytes, void **ptrs)	\
+	{								\
+		void raid6_rvv ## _n ## _xor_syndrome_real(int,	\
+				int, int, unsigned long, void**);	\
+		kernel_vector_begin();					\
+		raid6_rvv ## _n ## _xor_syndrome_real(disks,		\
+			start, stop, (unsigned long)bytes, ptrs);	\
+		kernel_vector_end();					\
+	}								\
+	struct raid6_calls const raid6_rvvx ## _n = {			\
+		raid6_rvv ## _n ## _gen_syndrome,			\
+		raid6_rvv ## _n ## _xor_syndrome,			\
+		rvv_has_vector,						\
+		"rvvx" #_n,						\
+		0							\
+	}
+
+RAID6_RVV_WRAPPER(1);
+RAID6_RVV_WRAPPER(2);
+RAID6_RVV_WRAPPER(4);
+RAID6_RVV_WRAPPER(8);