diff mbox series

[RFC,v0] RISCV: Report vector unaligned accesses hwprobe

Message ID 20240604162457.3757417-1-jesse@rivosinc.com (mailing list archive)
State Changes Requested
Headers show
Series [RFC,v0] RISCV: Report vector unaligned accesses hwprobe | expand

Checks

Context Check Description
conchuod/vmtest-for-next-PR fail PR summary
conchuod/patch-1-test-1 success .github/scripts/patches/tests/build_rv32_defconfig.sh
conchuod/patch-1-test-2 fail .github/scripts/patches/tests/build_rv64_clang_allmodconfig.sh
conchuod/patch-1-test-3 fail .github/scripts/patches/tests/build_rv64_gcc_allmodconfig.sh
conchuod/patch-1-test-4 success .github/scripts/patches/tests/build_rv64_nommu_k210_defconfig.sh
conchuod/patch-1-test-5 success .github/scripts/patches/tests/build_rv64_nommu_virt_defconfig.sh
conchuod/patch-1-test-6 warning .github/scripts/patches/tests/checkpatch.sh
conchuod/patch-1-test-7 success .github/scripts/patches/tests/dtb_warn_rv64.sh
conchuod/patch-1-test-8 success .github/scripts/patches/tests/header_inline.sh
conchuod/patch-1-test-9 success .github/scripts/patches/tests/kdoc.sh
conchuod/patch-1-test-10 success .github/scripts/patches/tests/module_param.sh
conchuod/patch-1-test-11 success .github/scripts/patches/tests/verify_fixes.sh
conchuod/patch-1-test-12 success .github/scripts/patches/tests/verify_signedoff.sh

Commit Message

Jesse Taube June 4, 2024, 4:24 p.m. UTC
Detect whether a system traps into the kernel on a vector unaligned access.
Add the result to a new key in hwprobe.

Signed-off-by: Jesse Taube <jesse@rivosinc.com>
---
 arch/riscv/include/asm/cpufeature.h        |  3 ++
 arch/riscv/include/asm/hwprobe.h           |  2 +-
 arch/riscv/include/uapi/asm/hwprobe.h      |  6 +++
 arch/riscv/kernel/sys_hwprobe.c            | 34 ++++++++++++
 arch/riscv/kernel/traps_misaligned.c       | 60 ++++++++++++++++++++++
 arch/riscv/kernel/unaligned_access_speed.c |  4 ++
 6 files changed, 108 insertions(+), 1 deletion(-)

Comments

Jesse Taube June 4, 2024, 4:42 p.m. UTC | #1
On 6/4/24 12:24, Jesse Taube wrote:
> Detect whether a system traps into the kernel on a vector unaligned access.
> Add the result to a new key in hwprobe.
> 
> Signed-off-by: Jesse Taube <jesse@rivosinc.com>
> ---
>   arch/riscv/include/asm/cpufeature.h        |  3 ++
>   arch/riscv/include/asm/hwprobe.h           |  2 +-
>   arch/riscv/include/uapi/asm/hwprobe.h      |  6 +++
>   arch/riscv/kernel/sys_hwprobe.c            | 34 ++++++++++++
>   arch/riscv/kernel/traps_misaligned.c       | 60 ++++++++++++++++++++++
>   arch/riscv/kernel/unaligned_access_speed.c |  4 ++
>   6 files changed, 108 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h
> index 347805446151..5ad69cf25b25 100644
> --- a/arch/riscv/include/asm/cpufeature.h
> +++ b/arch/riscv/include/asm/cpufeature.h
> @@ -35,9 +35,12 @@ void riscv_user_isa_enable(void);
>   
>   #if defined(CONFIG_RISCV_MISALIGNED)
>   bool check_unaligned_access_emulated_all_cpus(void);
> +bool check_vector_unaligned_access_all_cpus(void);
> +
>   void unaligned_emulation_finish(void);
>   bool unaligned_ctl_available(void);
>   DECLARE_PER_CPU(long, misaligned_access_speed);
> +DECLARE_PER_CPU(long, vector_misaligned_access);
>   #else
>   static inline bool unaligned_ctl_available(void)
>   {
> diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h
> index 630507dff5ea..150a9877b0af 100644
> --- a/arch/riscv/include/asm/hwprobe.h
> +++ b/arch/riscv/include/asm/hwprobe.h
> @@ -8,7 +8,7 @@
>   
>   #include <uapi/asm/hwprobe.h>
>   
> -#define RISCV_HWPROBE_MAX_KEY 6
> +#define RISCV_HWPROBE_MAX_KEY 7
>   
>   static inline bool riscv_hwprobe_key_is_valid(__s64 key)
>   {
> diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h
> index 060212331a03..4474e98d17bd 100644
> --- a/arch/riscv/include/uapi/asm/hwprobe.h
> +++ b/arch/riscv/include/uapi/asm/hwprobe.h
> @@ -68,6 +68,12 @@ struct riscv_hwprobe {
>   #define		RISCV_HWPROBE_MISALIGNED_UNSUPPORTED	(4 << 0)
>   #define		RISCV_HWPROBE_MISALIGNED_MASK		(7 << 0)
>   #define RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE	6
> +#define RISCV_HWPROBE_VEC_KEY_MISALIGNED_PERF	7

There were talks about combining vector and scalar speed for the
user-facing API into RISCV_HWPROBE_KEY_CPUPERF_0, but adding another key
seems easier.
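
For reference, user space would consume the new key roughly like this (a
minimal sketch against the uapi proposed here; error handling omitted):

    #include <asm/hwprobe.h>     /* RISCV_HWPROBE_* keys from this patch */
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <stdio.h>

    int main(void)
    {
        struct riscv_hwprobe pair = {
            .key = RISCV_HWPROBE_VEC_KEY_MISALIGNED_PERF,
        };

        /* cpusetsize == 0 and cpus == NULL query all online CPUs */
        if (syscall(__NR_riscv_hwprobe, &pair, 1, 0, NULL, 0) == 0)
            printf("vector misaligned perf: %llu\n",
                   (unsigned long long)pair.value);

        return 0;
    }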

> +#define		RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN		0
> +#define		RISCV_HWPROBE_VEC_MISALIGNED_EMULATED		1
> +#define		RISCV_HWPROBE_VEC_MISALIGNED_SLOW		2
> +#define		RISCV_HWPROBE_VEC_MISALIGNED_FAST		3
> +#define		RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED	4
>   /* Increase RISCV_HWPROBE_MAX_KEY when adding items. */
>   
>   /* Flags */
> diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c
> index b286b73e763e..ce641cc6e47a 100644
> --- a/arch/riscv/kernel/sys_hwprobe.c
> +++ b/arch/riscv/kernel/sys_hwprobe.c
> @@ -184,6 +184,36 @@ static u64 hwprobe_misaligned(const struct cpumask *cpus)
>   }
>   #endif
>   
> +#if defined(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS)
> +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
> +{
> +	int cpu;
> +	u64 perf = -1ULL;
> +
> +	for_each_cpu(cpu, cpus) {
> +		int this_perf = per_cpu(vector_misaligned_access, cpu);
> +
> +		if (perf == -1ULL)
> +			perf = this_perf;
> +
> +		if (perf != this_perf) {
> +			perf = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
> +			break;
> +		}
> +	}
> +
> +	if (perf == -1ULL)
> +		return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
> +
> +	return perf;
> +}
> +#else
> +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
> +{
> +	return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
> +}
> +#endif
> +
>   static void hwprobe_one_pair(struct riscv_hwprobe *pair,
>   			     const struct cpumask *cpus)
>   {
> @@ -211,6 +241,10 @@ static void hwprobe_one_pair(struct riscv_hwprobe *pair,
>   		pair->value = hwprobe_misaligned(cpus);
>   		break;
>   
> +	case RISCV_HWPROBE_VEC_KEY_MISALIGNED_PERF:
> +		pair->value = hwprobe_vec_misaligned(cpus);
> +		break;
> +
>   	case RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE:
>   		pair->value = 0;
>   		if (hwprobe_ext0_has(cpus, RISCV_HWPROBE_EXT_ZICBOZ))
> diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c
> index 2adb7c3e4dd5..0c07e990e9c5 100644
> --- a/arch/riscv/kernel/traps_misaligned.c
> +++ b/arch/riscv/kernel/traps_misaligned.c
> @@ -16,6 +16,7 @@
>   #include <asm/entry-common.h>
>   #include <asm/hwprobe.h>
>   #include <asm/cpufeature.h>
> +#include <asm/vector.h>
>   
>   #define INSN_MATCH_LB			0x3
>   #define INSN_MASK_LB			0x707f
> @@ -426,6 +427,14 @@ int handle_misaligned_load(struct pt_regs *regs)
>   	if (get_insn(regs, epc, &insn))
>   		return -1;
>   

Is this an appropriate way to check if there is vector misaligned
access? What if an unaligned vector load was called by the kernel before
this check?

> +#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
> +	if (*this_cpu_ptr(&vector_misaligned_access) == RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN) {
> +		*this_cpu_ptr(&vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
> +		regs->epc = epc + INSN_LEN(insn);
> +		return 0;
> +	}
> +#endif
> +
>   	regs->epc = 0;
>   
>   	if ((insn & INSN_MASK_LW) == INSN_MATCH_LW) {
> @@ -625,6 +634,57 @@ static bool check_unaligned_access_emulated(int cpu)
>   	return misaligned_emu_detected;
>   }
>   
> +#ifdef CONFIG_RISCV_ISA_V
> +static bool check_vector_unaligned_access(int cpu)
> +{
> +	long *mas_ptr = per_cpu_ptr(&vector_misaligned_access, cpu);
> +	struct riscv_isainfo *isainfo = &hart_isa[cpu];
> +	unsigned long tmp_var;
> +	bool misaligned_vec_suported;
> +
> +	if (!riscv_isa_extension_available(isainfo->isa, v))
> +		return false;
> +
> +	/* This case will only happen if a unaligned vector load
> +	 * was called by the kernel before this check
> +	 */
> +	if (*mas_ptr != RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN)
> +		return false;
> +
> +	kernel_vector_begin();
> +	__asm__ __volatile__ (
> +		".option push\n\t"
> +		".option arch, +v\n\t"
> +		"	li t1, 0x1\n"				//size
> +		"       vsetvli t0, t1, e16, m2, ta, ma\n\t"	// Vectors of 16b
> +		"       addi t0, %[ptr], 1\n\t"			// Misalign address
> +		"	vle16.v v0, (t0)\n\t"			// Load bytes
> +		".option pop\n\t"
> +		: : [ptr] "r" (&tmp_var) : "v0", "t0", "t1", "memory");
> +	kernel_vector_end();
> +
> +	misaligned_vec_suported = (*mas_ptr == RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN);
> +
> +	return misaligned_vec_suported;
> +}
> +#else
> +static bool check_vector_unaligned_access(int cpu)
> +{
> +	return false;
> +}
> +#endif
> +
> +bool check_vector_unaligned_access_all_cpus(void)
> +{
> +	int cpu;
> +
> +	for_each_online_cpu(cpu)
> +		if (!check_vector_unaligned_access(cpu))
> +			return false;
> +
> +	return true;
> +}
> +
>   bool check_unaligned_access_emulated_all_cpus(void)
>   {
>   	int cpu;
> diff --git a/arch/riscv/kernel/unaligned_access_speed.c b/arch/riscv/kernel/unaligned_access_speed.c
> index a9a6bcb02acf..92a84239beaa 100644
> --- a/arch/riscv/kernel/unaligned_access_speed.c
> +++ b/arch/riscv/kernel/unaligned_access_speed.c
> @@ -20,6 +20,7 @@
>   #define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)
>   
>   DEFINE_PER_CPU(long, misaligned_access_speed);
> +DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
>   
>   #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
>   static cpumask_t fast_misaligned_access;
> @@ -264,6 +265,8 @@ static int check_unaligned_access_all_cpus(void)
>   {
>   	bool all_cpus_emulated = check_unaligned_access_emulated_all_cpus();
>  

There were talks about Zicclsm, but Spike doesn't have support for Zicclsm
AFAIK. I was wondering whether I should add Zicclsm to cpufeature as well.

If anyone wants to run this, I tested with
spike --misaligned --isa=RV64IMAFDCV_zicntr_zihpm
--kernel=arch/riscv/boot/Image
opensbi/build/platform/generic/firmware/fw_jump.elf


Thanks,
Jesse Taube


> +	check_vector_unaligned_access_all_cpus();
> +
>   	if (!all_cpus_emulated)
>   		return check_unaligned_access_speed_all_cpus();
>   
> @@ -273,6 +276,7 @@ static int check_unaligned_access_all_cpus(void)
>   static int check_unaligned_access_all_cpus(void)
>   {
>   	check_unaligned_access_emulated_all_cpus();
> +	check_vector_unaligned_access_all_cpus();
>   
>   	return 0;
>   }
Clément Léger June 5, 2024, 7:55 a.m. UTC | #2
On 04/06/2024 18:42, Jesse Taube wrote:
> 
> 
> On 6/4/24 12:24, Jesse Taube wrote:
>> Detect whether a system traps into the kernel on a vector unaligned access.
>> Add the result to a new key in hwprobe.
>>
>> Signed-off-by: Jesse Taube <jesse@rivosinc.com>
>> ---
>>   arch/riscv/include/asm/cpufeature.h        |  3 ++
>>   arch/riscv/include/asm/hwprobe.h           |  2 +-
>>   arch/riscv/include/uapi/asm/hwprobe.h      |  6 +++
>>   arch/riscv/kernel/sys_hwprobe.c            | 34 ++++++++++++
>>   arch/riscv/kernel/traps_misaligned.c       | 60 ++++++++++++++++++++++
>>   arch/riscv/kernel/unaligned_access_speed.c |  4 ++
>>   6 files changed, 108 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/riscv/include/asm/cpufeature.h
>> b/arch/riscv/include/asm/cpufeature.h
>> index 347805446151..5ad69cf25b25 100644
>> --- a/arch/riscv/include/asm/cpufeature.h
>> +++ b/arch/riscv/include/asm/cpufeature.h
>> @@ -35,9 +35,12 @@ void riscv_user_isa_enable(void);
>>     #if defined(CONFIG_RISCV_MISALIGNED)
>>   bool check_unaligned_access_emulated_all_cpus(void);
>> +bool check_vector_unaligned_access_all_cpus(void);
>> +
>>   void unaligned_emulation_finish(void);
>>   bool unaligned_ctl_available(void);
>>   DECLARE_PER_CPU(long, misaligned_access_speed);
>> +DECLARE_PER_CPU(long, vector_misaligned_access);
>>   #else
>>   static inline bool unaligned_ctl_available(void)
>>   {
>> diff --git a/arch/riscv/include/asm/hwprobe.h
>> b/arch/riscv/include/asm/hwprobe.h
>> index 630507dff5ea..150a9877b0af 100644
>> --- a/arch/riscv/include/asm/hwprobe.h
>> +++ b/arch/riscv/include/asm/hwprobe.h
>> @@ -8,7 +8,7 @@
>>     #include <uapi/asm/hwprobe.h>
>>   -#define RISCV_HWPROBE_MAX_KEY 6
>> +#define RISCV_HWPROBE_MAX_KEY 7
>>     static inline bool riscv_hwprobe_key_is_valid(__s64 key)
>>   {
>> diff --git a/arch/riscv/include/uapi/asm/hwprobe.h
>> b/arch/riscv/include/uapi/asm/hwprobe.h
>> index 060212331a03..4474e98d17bd 100644
>> --- a/arch/riscv/include/uapi/asm/hwprobe.h
>> +++ b/arch/riscv/include/uapi/asm/hwprobe.h
>> @@ -68,6 +68,12 @@ struct riscv_hwprobe {
>>   #define        RISCV_HWPROBE_MISALIGNED_UNSUPPORTED    (4 << 0)
>>   #define        RISCV_HWPROBE_MISALIGNED_MASK        (7 << 0)
>>   #define RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE    6
>> +#define RISCV_HWPROBE_VEC_KEY_MISALIGNED_PERF    7
> 
> There were talks about combining vector and scalar speed for the
> user-facing API into RISCV_HWPROBE_KEY_CPUPERF_0, but adding another key
> seems easier.
> 
>> +#define        RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN        0
>> +#define        RISCV_HWPROBE_VEC_MISALIGNED_EMULATED        1
>> +#define        RISCV_HWPROBE_VEC_MISALIGNED_SLOW        2
>> +#define        RISCV_HWPROBE_VEC_MISALIGNED_FAST        3
>> +#define        RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED    4
>>   /* Increase RISCV_HWPROBE_MAX_KEY when adding items. */
>>     /* Flags */
>> diff --git a/arch/riscv/kernel/sys_hwprobe.c
>> b/arch/riscv/kernel/sys_hwprobe.c
>> index b286b73e763e..ce641cc6e47a 100644
>> --- a/arch/riscv/kernel/sys_hwprobe.c
>> +++ b/arch/riscv/kernel/sys_hwprobe.c
>> @@ -184,6 +184,36 @@ static u64 hwprobe_misaligned(const struct
>> cpumask *cpus)
>>   }
>>   #endif
>>   +#if defined(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS)
>> +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
>> +{
>> +    int cpu;
>> +    u64 perf = -1ULL;
>> +
>> +    for_each_cpu(cpu, cpus) {
>> +        int this_perf = per_cpu(vector_misaligned_access, cpu);
>> +
>> +        if (perf == -1ULL)
>> +            perf = this_perf;
>> +
>> +        if (perf != this_perf) {
>> +            perf = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
>> +            break;
>> +        }
>> +    }
>> +
>> +    if (perf == -1ULL)
>> +        return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
>> +
>> +    return perf;
>> +}
>> +#else
>> +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
>> +{
>> +    return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
>> +}
>> +#endif
>> +
>>   static void hwprobe_one_pair(struct riscv_hwprobe *pair,
>>                    const struct cpumask *cpus)
>>   {
>> @@ -211,6 +241,10 @@ static void hwprobe_one_pair(struct riscv_hwprobe
>> *pair,
>>           pair->value = hwprobe_misaligned(cpus);
>>           break;
>>   +    case RISCV_HWPROBE_VEC_KEY_MISALIGNED_PERF:
>> +        pair->value = hwprobe_vec_misaligned(cpus);
>> +        break;
>> +
>>       case RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE:
>>           pair->value = 0;
>>           if (hwprobe_ext0_has(cpus, RISCV_HWPROBE_EXT_ZICBOZ))
>> diff --git a/arch/riscv/kernel/traps_misaligned.c
>> b/arch/riscv/kernel/traps_misaligned.c
>> index 2adb7c3e4dd5..0c07e990e9c5 100644
>> --- a/arch/riscv/kernel/traps_misaligned.c
>> +++ b/arch/riscv/kernel/traps_misaligned.c
>> @@ -16,6 +16,7 @@
>>   #include <asm/entry-common.h>
>>   #include <asm/hwprobe.h>
>>   #include <asm/cpufeature.h>
>> +#include <asm/vector.h>
>>     #define INSN_MATCH_LB            0x3
>>   #define INSN_MASK_LB            0x707f
>> @@ -426,6 +427,14 @@ int handle_misaligned_load(struct pt_regs *regs)
>>       if (get_insn(regs, epc, &insn))
>>           return -1;
>>   
> 
> Is this an appropriate way to check if there is vector misaligned
> access? What if an unaligned vector load was called by the kernel before
> this check?
> 
>> +#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
>> +    if (*this_cpu_ptr(&vector_misaligned_access) ==
>> RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN) {
>> +        *this_cpu_ptr(&vector_misaligned_access) =
>> RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
>> +        regs->epc = epc + INSN_LEN(insn);
>> +        return 0;
>> +    }
>> +#endif

Since this code will be called for standard load/store misaligned
accesses, I guess this needs to check whether the faulty instruction
itself is a vector-related one before setting it to supported. I don't
know what the spec says about that, but I guess it should be checked
whether vector load/store and standard load/store could have different
behaviors with respect to misaligned accesses.

Regardless of that (unless I missed something), the emulation code
does not actually support vector load/store instructions.
So, support for misaligned vector load/store should be added in the
first place before reporting that it is actually "supported".
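
FWIW, a rough sketch of the kind of check I mean (macro and function
names are mine, not from the series): vector loads share the LOAD-FP
major opcode with scalar FP loads and are told apart by the width
field, so something like:

    #define INSN_OPCODE_MASK     0x7f
    #define INSN_OPCODE_LOAD_FP  0x07  /* shared by scalar FP and vector loads */

    static bool insn_is_vector_load(u32 insn)
    {
        u32 width = (insn >> 12) & 0x7;

        if ((insn & INSN_OPCODE_MASK) != INSN_OPCODE_LOAD_FP)
            return false;

        /* vector loads use width 0 and 5-7, scalar FP loads use 1-4 */
        return width == 0 || width >= 5;
    }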

Thanks,

Clément

>> +
>>       regs->epc = 0;
>>         if ((insn & INSN_MASK_LW) == INSN_MATCH_LW) {
>> @@ -625,6 +634,57 @@ static bool check_unaligned_access_emulated(int cpu)
>>       return misaligned_emu_detected;
>>   }
>>   +#ifdef CONFIG_RISCV_ISA_V
>> +static bool check_vector_unaligned_access(int cpu)
>> +{
>> +    long *mas_ptr = per_cpu_ptr(&vector_misaligned_access, cpu);
>> +    struct riscv_isainfo *isainfo = &hart_isa[cpu];
>> +    unsigned long tmp_var;
>> +    bool misaligned_vec_suported;
>> +
>> +    if (!riscv_isa_extension_available(isainfo->isa, v))
>> +        return false;
>> +
>> +    /* This case will only happen if a unaligned vector load
>> +     * was called by the kernel before this check
>> +     */
>> +    if (*mas_ptr != RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN)
>> +        return false;
>> +
>> +    kernel_vector_begin();
>> +    __asm__ __volatile__ (
>> +        ".option push\n\t"
>> +        ".option arch, +v\n\t"
>> +        "    li t1, 0x1\n"                //size
>> +        "       vsetvli t0, t1, e16, m2, ta, ma\n\t"    // Vectors of
>> 16b
>> +        "       addi t0, %[ptr], 1\n\t"            // Misalign address
>> +        "    vle16.v v0, (t0)\n\t"            // Load bytes
>> +        ".option pop\n\t"
>> +        : : [ptr] "r" (&tmp_var) : "v0", "t0", "t1", "memory");
>> +    kernel_vector_end();
>> +
>> +    misaligned_vec_suported = (*mas_ptr ==
>> RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN);
>> +
>> +    return misaligned_vec_suported;
>> +}
>> +#else
>> +static bool check_vector_unaligned_access(int cpu)
>> +{
>> +    return false;
>> +}
>> +#endif
>> +
>> +bool check_vector_unaligned_access_all_cpus(void)
>> +{
>> +    int cpu;
>> +
>> +    for_each_online_cpu(cpu)
>> +        if (!check_vector_unaligned_access(cpu))
>> +            return false;
>> +
>> +    return true;
>> +}
>> +
>>   bool check_unaligned_access_emulated_all_cpus(void)
>>   {
>>       int cpu;
>> diff --git a/arch/riscv/kernel/unaligned_access_speed.c
>> b/arch/riscv/kernel/unaligned_access_speed.c
>> index a9a6bcb02acf..92a84239beaa 100644
>> --- a/arch/riscv/kernel/unaligned_access_speed.c
>> +++ b/arch/riscv/kernel/unaligned_access_speed.c
>> @@ -20,6 +20,7 @@
>>   #define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)
>>     DEFINE_PER_CPU(long, misaligned_access_speed);
>> +DEFINE_PER_CPU(long, vector_misaligned_access) =
>> RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
>>     #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
>>   static cpumask_t fast_misaligned_access;
>> @@ -264,6 +265,8 @@ static int check_unaligned_access_all_cpus(void)
>>   {
>>       bool all_cpus_emulated =
>> check_unaligned_access_emulated_all_cpus();
>>  
> 
> There were talks about Zicclsm, but Spike doesn't have support for Zicclsm
> AFAIK. I was wondering whether I should add Zicclsm to cpufeature as
> well.
> 
> If anyone wants to run this, I tested with
> spike --misaligned --isa=RV64IMAFDCV_zicntr_zihpm
> --kernel=arch/riscv/boot/Image
> opensbi/build/platform/generic/firmware/fw_jump.elf

AFAIK, --misaligned tells Spike to actually handle misaligned load/store
in "hardware" and not generate a misaligned trap, so I guess you would
need to remove that flag and use a specific OpenSBI version that always
delegates the misaligned load/store traps if you want to be sure that the
kernel actually handles the vector misaligned load/store traps.

I previously made a small test utility to verify that the kernel
correctly gets the misaligned traps [1]. If it fails, then your kernel
probably does not get any traps :)

Link: https://github.com/clementleger/unaligned_test [1]
> 
> 
> Thanks,
> Jesse Taube
> 
> 
>> +    check_vector_unaligned_access_all_cpus();
>> +
>>       if (!all_cpus_emulated)
>>           return check_unaligned_access_speed_all_cpus();
>>   @@ -273,6 +276,7 @@ static int check_unaligned_access_all_cpus(void)
>>   static int check_unaligned_access_all_cpus(void)
>>   {
>>       check_unaligned_access_emulated_all_cpus();
>> +    check_vector_unaligned_access_all_cpus();
>>         return 0;
>>   }
Conor Dooley June 5, 2024, 3:45 p.m. UTC | #3
On Tue, Jun 04, 2024 at 12:42:10PM -0400, Jesse Taube wrote:
> On 6/4/24 12:24, Jesse Taube wrote:
> > diff --git a/arch/riscv/kernel/unaligned_access_speed.c b/arch/riscv/kernel/unaligned_access_speed.c
> > index a9a6bcb02acf..92a84239beaa 100644
> > --- a/arch/riscv/kernel/unaligned_access_speed.c
> > +++ b/arch/riscv/kernel/unaligned_access_speed.c
> > @@ -20,6 +20,7 @@
> >   #define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)
> >   DEFINE_PER_CPU(long, misaligned_access_speed);
> > +DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
> >   #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
> >   static cpumask_t fast_misaligned_access;
> > @@ -264,6 +265,8 @@ static int check_unaligned_access_all_cpus(void)
> >   {
> >   	bool all_cpus_emulated = check_unaligned_access_emulated_all_cpus();
> 
> There were talks about Zicclsm, but Spike doesn't have support for Zicclsm
> AFAIK.

Support for Zicclsm just means that it can perform misaligned loads and
stores to cache-coherent memory. I guess support in Spike would involve
setting that in its devicetree iff/when that's the case.

> I was wondering whether I should add Zicclsm to cpufeature as well.

Ye, please do add detection for Zicclsm. I think that should be fairly
straightforward to do, nothing too special to document.
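
(For illustration, the usual pattern would be something along these
lines - a sketch, with the extension id being a new addition to
arch/riscv/include/asm/hwcap.h:)

    /* arch/riscv/kernel/cpufeature.c, in the riscv_isa_ext[] table */
    __RISCV_ISA_EXT_DATA(zicclsm, RISCV_ISA_EXT_ZICCLSM),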

Cheers,
Conor.
Evan Green June 5, 2024, 3:54 p.m. UTC | #4
On Tue, Jun 4, 2024 at 9:25 AM Jesse Taube <jesse@rivosinc.com> wrote:
>
> Detect whether a system traps into the kernel on a vector unaligned access.
> Add the result to a new key in hwprobe.
>
> Signed-off-by: Jesse Taube <jesse@rivosinc.com>

For the scalar world, we wanted to know whether misaligned accesses
were faster or slower than equivalent byte accesses, so usermode could
know for something like memcpy which option had better bandwidth. Is
the motivation here the same, where we're going to use vector
registers for memcpy and we want to know which size load to use? Or
will usermode be consuming this info for a different purpose as well?
I know this is a basic question, but having the motivation helps me
get the right lens for reviewing it. Perhaps that should be added to
the commit message as well.
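
(To make the question concrete, the consumer I'd imagine looks something
like this sketch - the helper names and cached variable are made up:)

    #include <stddef.h>
    #include <stdint.h>

    /* hypothetical: cached RISCV_HWPROBE_VEC_KEY_MISALIGNED_PERF value */
    static uint64_t vec_misaligned_perf;

    static void *memcpy_vec(void *dst, const void *src, size_t n)
    {
        if (vec_misaligned_perf == RISCV_HWPROBE_VEC_MISALIGNED_FAST)
            return memcpy_vec_any_alignment(dst, src, n);  /* wide unaligned loads */

        /* copy head/tail bytewise, keep the vector body aligned */
        return memcpy_vec_aligned_body(dst, src, n);
    }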

> ---
>  arch/riscv/include/asm/cpufeature.h        |  3 ++
>  arch/riscv/include/asm/hwprobe.h           |  2 +-
>  arch/riscv/include/uapi/asm/hwprobe.h      |  6 +++
>  arch/riscv/kernel/sys_hwprobe.c            | 34 ++++++++++++
>  arch/riscv/kernel/traps_misaligned.c       | 60 ++++++++++++++++++++++
>  arch/riscv/kernel/unaligned_access_speed.c |  4 ++
>  6 files changed, 108 insertions(+), 1 deletion(-)
>
> diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h
> index 347805446151..5ad69cf25b25 100644
> --- a/arch/riscv/include/asm/cpufeature.h
> +++ b/arch/riscv/include/asm/cpufeature.h
> @@ -35,9 +35,12 @@ void riscv_user_isa_enable(void);
>
>  #if defined(CONFIG_RISCV_MISALIGNED)
>  bool check_unaligned_access_emulated_all_cpus(void);
> +bool check_vector_unaligned_access_all_cpus(void);
> +
>  void unaligned_emulation_finish(void);
>  bool unaligned_ctl_available(void);
>  DECLARE_PER_CPU(long, misaligned_access_speed);
> +DECLARE_PER_CPU(long, vector_misaligned_access);
>  #else
>  static inline bool unaligned_ctl_available(void)
>  {
> diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h
> index 630507dff5ea..150a9877b0af 100644
> --- a/arch/riscv/include/asm/hwprobe.h
> +++ b/arch/riscv/include/asm/hwprobe.h
> @@ -8,7 +8,7 @@
>
>  #include <uapi/asm/hwprobe.h>
>
> -#define RISCV_HWPROBE_MAX_KEY 6
> +#define RISCV_HWPROBE_MAX_KEY 7
>
>  static inline bool riscv_hwprobe_key_is_valid(__s64 key)
>  {
> diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h
> index 060212331a03..4474e98d17bd 100644
> --- a/arch/riscv/include/uapi/asm/hwprobe.h
> +++ b/arch/riscv/include/uapi/asm/hwprobe.h
> @@ -68,6 +68,12 @@ struct riscv_hwprobe {
>  #define                RISCV_HWPROBE_MISALIGNED_UNSUPPORTED    (4 << 0)
>  #define                RISCV_HWPROBE_MISALIGNED_MASK           (7 << 0)
>  #define RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE    6
> +#define RISCV_HWPROBE_VEC_KEY_MISALIGNED_PERF  7
> +#define                RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN            0
> +#define                RISCV_HWPROBE_VEC_MISALIGNED_EMULATED           1
> +#define                RISCV_HWPROBE_VEC_MISALIGNED_SLOW               2
> +#define                RISCV_HWPROBE_VEC_MISALIGNED_FAST               3
> +#define                RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED        4

This needs to be added to the documentation as well.

What value should be returned when V is not enabled in the kernel, or
V is not supported in the hardware? Currently in the code it would be
UNKNOWN, right? Is that what we want, or is it worth differentiating
"no support for V" from "I don't know the speed of misaligned loads"?
Maybe UNKNOWN is the right value, as there are other values to tell
you V is not enabled.


>  /* Increase RISCV_HWPROBE_MAX_KEY when adding items. */
>
>  /* Flags */
> diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c
> index b286b73e763e..ce641cc6e47a 100644
> --- a/arch/riscv/kernel/sys_hwprobe.c
> +++ b/arch/riscv/kernel/sys_hwprobe.c
> @@ -184,6 +184,36 @@ static u64 hwprobe_misaligned(const struct cpumask *cpus)
>  }
>  #endif
>
> +#if defined(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS)
> +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
> +{
> +       int cpu;
> +       u64 perf = -1ULL;
> +
> +       for_each_cpu(cpu, cpus) {
> +               int this_perf = per_cpu(vector_misaligned_access, cpu);
> +
> +               if (perf == -1ULL)
> +                       perf = this_perf;
> +
> +               if (perf != this_perf) {
> +                       perf = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
> +                       break;
> +               }
> +       }
> +
> +       if (perf == -1ULL)
> +               return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
> +
> +       return perf;
> +}
> +#else
> +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
> +{
> +       return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
> +}
> +#endif
> +
>  static void hwprobe_one_pair(struct riscv_hwprobe *pair,
>                              const struct cpumask *cpus)
>  {
> @@ -211,6 +241,10 @@ static void hwprobe_one_pair(struct riscv_hwprobe *pair,
>                 pair->value = hwprobe_misaligned(cpus);
>                 break;
>
> +       case RISCV_HWPROBE_VEC_KEY_MISALIGNED_PERF:
> +               pair->value = hwprobe_vec_misaligned(cpus);
> +               break;
> +
>         case RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE:
>                 pair->value = 0;
>                 if (hwprobe_ext0_has(cpus, RISCV_HWPROBE_EXT_ZICBOZ))
> diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c
> index 2adb7c3e4dd5..0c07e990e9c5 100644
> --- a/arch/riscv/kernel/traps_misaligned.c
> +++ b/arch/riscv/kernel/traps_misaligned.c
> @@ -16,6 +16,7 @@
>  #include <asm/entry-common.h>
>  #include <asm/hwprobe.h>
>  #include <asm/cpufeature.h>
> +#include <asm/vector.h>
>
>  #define INSN_MATCH_LB                  0x3
>  #define INSN_MASK_LB                   0x707f
> @@ -426,6 +427,14 @@ int handle_misaligned_load(struct pt_regs *regs)
>         if (get_insn(regs, epc, &insn))
>                 return -1;
>
> +#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
> +       if (*this_cpu_ptr(&vector_misaligned_access) == RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN) {
> +               *this_cpu_ptr(&vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;

Shouldn't this be EMULATED, given we were just delegated this trap? I
guess it depends on whether you're going to add support for actually
handling the misaligned vector trap, as Clément mentioned.

Scalar misaligned loads had a history to lean on since the specs were
always explicit that misaligned loads/store had to be supported one
way or another. So UNSUPPORTED was a future theoretical value. I
haven't dug through the specs yet, do you know what the story is for V
and misaligned loads? My sub-question is what the rules are for
detecting the difference between EMULATED and UNSUPPORTED.

> +               regs->epc = epc + INSN_LEN(insn);
> +               return 0;
> +       }
> +#endif
> +
>         regs->epc = 0;
>
>         if ((insn & INSN_MASK_LW) == INSN_MATCH_LW) {
> @@ -625,6 +634,57 @@ static bool check_unaligned_access_emulated(int cpu)
>         return misaligned_emu_detected;
>  }
>
> +#ifdef CONFIG_RISCV_ISA_V
> +static bool check_vector_unaligned_access(int cpu)
> +{
> +       long *mas_ptr = per_cpu_ptr(&vector_misaligned_access, cpu);
> +       struct riscv_isainfo *isainfo = &hart_isa[cpu];
> +       unsigned long tmp_var;
> +       bool misaligned_vec_suported;
> +
> +       if (!riscv_isa_extension_available(isainfo->isa, v))
> +               return false;
> +
> +       /* This case will only happen if a unaligned vector load
> +        * was called by the kernel before this check
> +        */
> +       if (*mas_ptr != RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN)
> +               return false;
> +
> +       kernel_vector_begin();
> +       __asm__ __volatile__ (
> +               ".option push\n\t"
> +               ".option arch, +v\n\t"
> +               "       li t1, 0x1\n"                           //size
> +               "       vsetvli t0, t1, e16, m2, ta, ma\n\t"    // Vectors of 16b
> +               "       addi t0, %[ptr], 1\n\t"                 // Misalign address
> +               "       vle16.v v0, (t0)\n\t"                   // Load bytes
> +               ".option pop\n\t"
> +               : : [ptr] "r" (&tmp_var) : "v0", "t0", "t1", "memory");
> +       kernel_vector_end();
> +
> +       misaligned_vec_suported = (*mas_ptr == RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN);
> +
> +       return misaligned_vec_suported;
> +}
> +#else
> +static bool check_vector_unaligned_access(int cpu)
> +{
> +       return false;
> +}
> +#endif
> +
> +bool check_vector_unaligned_access_all_cpus(void)
> +{
> +       int cpu;
> +
> +       for_each_online_cpu(cpu)
> +               if (!check_vector_unaligned_access(cpu))
> +                       return false;
> +
> +       return true;
> +}

These functions return a bool, but the bool is never checked. I'm
guessing that's because you're going to check it in a future patch
where you decide whether or not to probe for fast/slow?


> +
>  bool check_unaligned_access_emulated_all_cpus(void)
>  {
>         int cpu;
> diff --git a/arch/riscv/kernel/unaligned_access_speed.c b/arch/riscv/kernel/unaligned_access_speed.c
> index a9a6bcb02acf..92a84239beaa 100644
> --- a/arch/riscv/kernel/unaligned_access_speed.c
> +++ b/arch/riscv/kernel/unaligned_access_speed.c
> @@ -20,6 +20,7 @@
>  #define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)
>
>  DEFINE_PER_CPU(long, misaligned_access_speed);
> +DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
>
>  #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
>  static cpumask_t fast_misaligned_access;
> @@ -264,6 +265,8 @@ static int check_unaligned_access_all_cpus(void)
>  {
>         bool all_cpus_emulated = check_unaligned_access_emulated_all_cpus();
>
> +       check_vector_unaligned_access_all_cpus();
> +
>         if (!all_cpus_emulated)
>                 return check_unaligned_access_speed_all_cpus();
>
> @@ -273,6 +276,7 @@ static int check_unaligned_access_all_cpus(void)
>  static int check_unaligned_access_all_cpus(void)
>  {
>         check_unaligned_access_emulated_all_cpus();
> +       check_vector_unaligned_access_all_cpus();
>
>         return 0;
>  }
> --
> 2.43.0
>
Jesse Taube June 5, 2024, 4:19 p.m. UTC | #5
On 6/5/24 11:54, Evan Green wrote:
> On Tue, Jun 4, 2024 at 9:25 AM Jesse Taube <jesse@rivosinc.com> wrote:
>>
>> Detect whether a system traps into the kernel on a vector unaligned access.
>> Add the result to a new key in hwprobe.
>>
>> Signed-off-by: Jesse Taube <jesse@rivosinc.com>
> 
> For the scalar world, we wanted to know whether misaligned accesses
> were faster or slower than equivalent byte accesses, so usermode could
> know for something like memcpy which option had better bandwidth. 

Yes, that will be in another patch, which will set it to either fast or slow.
This patch will just detect whether we can run it.
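
(Roughly, the follow-up would mirror the scalar speed test - a sketch
with a made-up vec_copy() helper, not code from that patch:)

    static int check_vector_unaligned_access_speed(char *buf)
    {
        u64 start, aligned_ns, misaligned_ns;

        kernel_vector_begin();
        start = ktime_get_ns();
        vec_copy(buf, buf + MISALIGNED_BUFFER_SIZE / 2, MISALIGNED_COPY_SIZE);
        aligned_ns = ktime_get_ns() - start;

        start = ktime_get_ns();
        vec_copy(buf + 1, buf + MISALIGNED_BUFFER_SIZE / 2, MISALIGNED_COPY_SIZE);
        misaligned_ns = ktime_get_ns() - start;
        kernel_vector_end();

        /* "fast" if a misaligned copy is not much slower than an aligned one */
        return misaligned_ns < aligned_ns * 2 ?
               RISCV_HWPROBE_VEC_MISALIGNED_FAST :
               RISCV_HWPROBE_VEC_MISALIGNED_SLOW;
    }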

> Is
> the motivation here the same, where we're going to use vector
> registers for memcpy and we want to know which size load to use? Or
> will usermode be consuming this info for a different purpose as well?
> I know this is a basic question, but having the motivation helps me
> get the right lens for reviewing it. Perhaps that should be added to
> the commit message as well.
> 
>> ---
>>   arch/riscv/include/asm/cpufeature.h        |  3 ++
>>   arch/riscv/include/asm/hwprobe.h           |  2 +-
>>   arch/riscv/include/uapi/asm/hwprobe.h      |  6 +++
>>   arch/riscv/kernel/sys_hwprobe.c            | 34 ++++++++++++
>>   arch/riscv/kernel/traps_misaligned.c       | 60 ++++++++++++++++++++++
>>   arch/riscv/kernel/unaligned_access_speed.c |  4 ++
>>   6 files changed, 108 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h
>> index 347805446151..5ad69cf25b25 100644
>> --- a/arch/riscv/include/asm/cpufeature.h
>> +++ b/arch/riscv/include/asm/cpufeature.h
>> @@ -35,9 +35,12 @@ void riscv_user_isa_enable(void);
>>
>>   #if defined(CONFIG_RISCV_MISALIGNED)
>>   bool check_unaligned_access_emulated_all_cpus(void);
>> +bool check_vector_unaligned_access_all_cpus(void);
>> +
>>   void unaligned_emulation_finish(void);
>>   bool unaligned_ctl_available(void);
>>   DECLARE_PER_CPU(long, misaligned_access_speed);
>> +DECLARE_PER_CPU(long, vector_misaligned_access);
>>   #else
>>   static inline bool unaligned_ctl_available(void)
>>   {
>> diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h
>> index 630507dff5ea..150a9877b0af 100644
>> --- a/arch/riscv/include/asm/hwprobe.h
>> +++ b/arch/riscv/include/asm/hwprobe.h
>> @@ -8,7 +8,7 @@
>>
>>   #include <uapi/asm/hwprobe.h>
>>
>> -#define RISCV_HWPROBE_MAX_KEY 6
>> +#define RISCV_HWPROBE_MAX_KEY 7
>>
>>   static inline bool riscv_hwprobe_key_is_valid(__s64 key)
>>   {
>> diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h
>> index 060212331a03..4474e98d17bd 100644
>> --- a/arch/riscv/include/uapi/asm/hwprobe.h
>> +++ b/arch/riscv/include/uapi/asm/hwprobe.h
>> @@ -68,6 +68,12 @@ struct riscv_hwprobe {
>>   #define                RISCV_HWPROBE_MISALIGNED_UNSUPPORTED    (4 << 0)
>>   #define                RISCV_HWPROBE_MISALIGNED_MASK           (7 << 0)
>>   #define RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE    6
>> +#define RISCV_HWPROBE_VEC_KEY_MISALIGNED_PERF  7
>> +#define                RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN            0
>> +#define                RISCV_HWPROBE_VEC_MISALIGNED_EMULATED           1
>> +#define                RISCV_HWPROBE_VEC_MISALIGNED_SLOW               2
>> +#define                RISCV_HWPROBE_VEC_MISALIGNED_FAST               3
>> +#define                RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED        4
> 
> This needs to be added to the documentation as well.
> 
> What value should be returned when V is not enabled in the kernel, or
> V is not supported in the hardware? Currently in the code it would be
> UNKNOWN, right?

Yes. It would be trivial to set it to unsupported if V is not enabled.

> Is that what we want, or is it worth differentiating
> "no support for V" from "I don't know the speed of misaligned loads"?
> Maybe UNKNOWN is the right value, as there are other values to tell
> you V is not enabled.
> 
> 
>>   /* Increase RISCV_HWPROBE_MAX_KEY when adding items. */
>>
>>   /* Flags */
>> diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c
>> index b286b73e763e..ce641cc6e47a 100644
>> --- a/arch/riscv/kernel/sys_hwprobe.c
>> +++ b/arch/riscv/kernel/sys_hwprobe.c
>> @@ -184,6 +184,36 @@ static u64 hwprobe_misaligned(const struct cpumask *cpus)
>>   }
>>   #endif
>>
>> +#if defined(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS)
>> +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
>> +{
>> +       int cpu;
>> +       u64 perf = -1ULL;
>> +
>> +       for_each_cpu(cpu, cpus) {
>> +               int this_perf = per_cpu(vector_misaligned_access, cpu);
>> +
>> +               if (perf == -1ULL)
>> +                       perf = this_perf;
>> +
>> +               if (perf != this_perf) {
>> +                       perf = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
>> +                       break;
>> +               }
>> +       }
>> +
>> +       if (perf == -1ULL)
>> +               return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
>> +
>> +       return perf;
>> +}
>> +#else
>> +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
>> +{
>> +       return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
>> +}
>> +#endif
>> +
>>   static void hwprobe_one_pair(struct riscv_hwprobe *pair,
>>                               const struct cpumask *cpus)
>>   {
>> @@ -211,6 +241,10 @@ static void hwprobe_one_pair(struct riscv_hwprobe *pair,
>>                  pair->value = hwprobe_misaligned(cpus);
>>                  break;
>>
>> +       case RISCV_HWPROBE_VEC_KEY_MISALIGNED_PERF:
>> +               pair->value = hwprobe_vec_misaligned(cpus);
>> +               break;
>> +
>>          case RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE:
>>                  pair->value = 0;
>>                  if (hwprobe_ext0_has(cpus, RISCV_HWPROBE_EXT_ZICBOZ))
>> diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c
>> index 2adb7c3e4dd5..0c07e990e9c5 100644
>> --- a/arch/riscv/kernel/traps_misaligned.c
>> +++ b/arch/riscv/kernel/traps_misaligned.c
>> @@ -16,6 +16,7 @@
>>   #include <asm/entry-common.h>
>>   #include <asm/hwprobe.h>
>>   #include <asm/cpufeature.h>
>> +#include <asm/vector.h>
>>
>>   #define INSN_MATCH_LB                  0x3
>>   #define INSN_MASK_LB                   0x707f
>> @@ -426,6 +427,14 @@ int handle_misaligned_load(struct pt_regs *regs)
>>          if (get_insn(regs, epc, &insn))
>>                  return -1;
>>
>> +#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
>> +       if (*this_cpu_ptr(&vector_misaligned_access) == RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN) {
>> +               *this_cpu_ptr(&vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
> 
> Shouldn't this be EMULATED, given we were just delegated this trap? I
> guess it depends on whether you're going to add support for actually
> handling the misaligned vector trap, as Clément mentioned.

Yeah, I accidentally forgot to check if the trap was caused by a vector
instruction. I was not planning on emulating them; this is just a
one-time check.

> 
> Scalar misaligned loads had a history to lean on since the specs were
> always explicit that misaligned loads/store had to be supported one
> way or another. So UNSUPPORTED was a future theoretical value. I
> haven't dug through the specs yet, do you know what the story is for V
> and misaligned loads?

"
8. Vector Memory Alignment Constraints
  If an element accessed by a vector memory instruction is not naturally 
aligned to the size of the element, either the element is transferred 
successfully or an address misaligned exception is raised on that element.
Support for misaligned vector memory accesses is independent of an 
implementation’s support for misaligned scalar memory accesses.
"

That is, the access is only guaranteed not to trap if the load/store is
naturally aligned to the size of the element.
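
A minimal illustration (mine, not from the patch; assumes buf points at
an aligned buffer): buf + 1 is element-misaligned for e16 and may trap,
while the same address with e8 is always naturally aligned and must
succeed:

    kernel_vector_begin();
    __asm__ __volatile__ (
        ".option push\n\t"
        ".option arch, +v\n\t"
        "	vsetvli t0, %[n], e8, m1, ta, ma\n\t"
        "	vle8.v v0, (%[p])\n\t"	// 1-byte elements: never misaligned
        ".option pop\n\t"
        : : [n] "r" (2UL), [p] "r" (buf + 1) : "t0", "v0", "memory");
    kernel_vector_end();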

> My sub-question is what the rules are for
> detecting the difference between EMULATED and UNSUPPORTED.

For vector, EMULATED isn't used, though it can be added if I add emulation.
Ideally that would be done in the SBI.

> 
>> +               regs->epc = epc + INSN_LEN(insn);
>> +               return 0;
>> +       }
>> +#endif
>> +
>>          regs->epc = 0;
>>
>>          if ((insn & INSN_MASK_LW) == INSN_MATCH_LW) {
>> @@ -625,6 +634,57 @@ static bool check_unaligned_access_emulated(int cpu)
>>          return misaligned_emu_detected;
>>   }
>>
>> +#ifdef CONFIG_RISCV_ISA_V
>> +static bool check_vector_unaligned_access(int cpu)
>> +{
>> +       long *mas_ptr = per_cpu_ptr(&vector_misaligned_access, cpu);
>> +       struct riscv_isainfo *isainfo = &hart_isa[cpu];
>> +       unsigned long tmp_var;
>> +       bool misaligned_vec_suported;
>> +
>> +       if (!riscv_isa_extension_available(isainfo->isa, v))
>> +               return false;
>> +
>> +       /* This case will only happen if a unaligned vector load
>> +        * was called by the kernel before this check
>> +        */
>> +       if (*mas_ptr != RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN)
>> +               return false;
>> +
>> +       kernel_vector_begin();
>> +       __asm__ __volatile__ (
>> +               ".option push\n\t"
>> +               ".option arch, +v\n\t"
>> +               "       li t1, 0x1\n"                           //size
>> +               "       vsetvli t0, t1, e16, m2, ta, ma\n\t"    // Vectors of 16b
>> +               "       addi t0, %[ptr], 1\n\t"                 // Misalign address
>> +               "       vle16.v v0, (t0)\n\t"                   // Load bytes
>> +               ".option pop\n\t"
>> +               : : [ptr] "r" (&tmp_var) : "v0", "t0", "t1", "memory");
>> +       kernel_vector_end();
>> +
>> +       misaligned_vec_suported = (*mas_ptr == RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN);
>> +
>> +       return misaligned_vec_suported;
>> +}
>> +#else
>> +static bool check_vector_unaligned_access(int cpu)
>> +{
>> +       return false;
>> +}
>> +#endif
>> +
>> +bool check_vector_unaligned_access_all_cpus(void)
>> +{
>> +       int cpu;
>> +
>> +       for_each_online_cpu(cpu)
>> +               if (!check_vector_unaligned_access(cpu))
>> +                       return false;
>> +
>> +       return true;
>> +}
> 
> These functions return a bool, but the bool is never checked. I'm
> guessing that's because you're going to check it in a future patch
> where you decide whether or not to probe for fast/slow?

Yes. Relating to your first question, the values should have the same
meaning as scalar, so UNKNOWN would mean it's supported but we don't know
the speed.

> 
> 
>> +
>>   bool check_unaligned_access_emulated_all_cpus(void)
>>   {
>>          int cpu;
>> diff --git a/arch/riscv/kernel/unaligned_access_speed.c b/arch/riscv/kernel/unaligned_access_speed.c
>> index a9a6bcb02acf..92a84239beaa 100644
>> --- a/arch/riscv/kernel/unaligned_access_speed.c
>> +++ b/arch/riscv/kernel/unaligned_access_speed.c
>> @@ -20,6 +20,7 @@
>>   #define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)
>>
>>   DEFINE_PER_CPU(long, misaligned_access_speed);
>> +DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
>>
>>   #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
>>   static cpumask_t fast_misaligned_access;
>> @@ -264,6 +265,8 @@ static int check_unaligned_access_all_cpus(void)
>>   {
>>          bool all_cpus_emulated = check_unaligned_access_emulated_all_cpus();
>>
>> +       check_vector_unaligned_access_all_cpus();
>> +
>>          if (!all_cpus_emulated)
>>                  return check_unaligned_access_speed_all_cpus();
>>
>> @@ -273,6 +276,7 @@ static int check_unaligned_access_all_cpus(void)
>>   static int check_unaligned_access_all_cpus(void)
>>   {
>>          check_unaligned_access_emulated_all_cpus();
>> +       check_vector_unaligned_access_all_cpus();
>>
>>          return 0;
>>   }
>> --
>> 2.43.0
>>
Conor Dooley June 5, 2024, 4:23 p.m. UTC | #6
Yo,

Beat me to some things I wanted to say, so I'll leave my comments on top
of yours.

On Wed, Jun 05, 2024 at 08:54:21AM -0700, Evan Green wrote:
> On Tue, Jun 4, 2024 at 9:25 AM Jesse Taube <jesse@rivosinc.com> wrote:
> >
> > Detect whether a system traps into the kernel on a vector unaligned access.
> > Add the result to a new key in hwprobe.
> >
> > Signed-off-by: Jesse Taube <jesse@rivosinc.com>
> 
> For the scalar world, we wanted to know whether misaligned accesses
> were faster or slower than equivalent byte accesses, so usermode could
> know for something like memcpy which option had better bandwidth. Is
> the motivation here the same, where we're going to use vector
> registers for memcpy and we want to know which size load to use? Or
> will usermode be consuming this info for a different purpose as well?
> I know this is a basic question, but having the motivation helps me
> get the right lens for reviewing it. Perhaps that should be added to
> the commit message as well.

Motivation should always be in the commit message ;)

> >  arch/riscv/include/asm/cpufeature.h        |  3 ++
> >  arch/riscv/include/asm/hwprobe.h           |  2 +-
> >  arch/riscv/include/uapi/asm/hwprobe.h      |  6 +++
> >  arch/riscv/kernel/sys_hwprobe.c            | 34 ++++++++++++
> >  arch/riscv/kernel/traps_misaligned.c       | 60 ++++++++++++++++++++++
> >  arch/riscv/kernel/unaligned_access_speed.c |  4 ++
> >  6 files changed, 108 insertions(+), 1 deletion(-)
> >
> > diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h
> > index 347805446151..5ad69cf25b25 100644
> > --- a/arch/riscv/include/asm/cpufeature.h
> > +++ b/arch/riscv/include/asm/cpufeature.h
> > @@ -35,9 +35,12 @@ void riscv_user_isa_enable(void);
> >
> >  #if defined(CONFIG_RISCV_MISALIGNED)
> >  bool check_unaligned_access_emulated_all_cpus(void);
> > +bool check_vector_unaligned_access_all_cpus(void);
> > +
> >  void unaligned_emulation_finish(void);
> >  bool unaligned_ctl_available(void);
> >  DECLARE_PER_CPU(long, misaligned_access_speed);
> > +DECLARE_PER_CPU(long, vector_misaligned_access);
> >  #else
> >  static inline bool unaligned_ctl_available(void)
> >  {
> > diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h
> > index 630507dff5ea..150a9877b0af 100644
> > --- a/arch/riscv/include/asm/hwprobe.h
> > +++ b/arch/riscv/include/asm/hwprobe.h
> > @@ -8,7 +8,7 @@
> >
> >  #include <uapi/asm/hwprobe.h>
> >
> > -#define RISCV_HWPROBE_MAX_KEY 6
> > +#define RISCV_HWPROBE_MAX_KEY 7
> >
> >  static inline bool riscv_hwprobe_key_is_valid(__s64 key)
> >  {
> > diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h
> > index 060212331a03..4474e98d17bd 100644
> > --- a/arch/riscv/include/uapi/asm/hwprobe.h
> > +++ b/arch/riscv/include/uapi/asm/hwprobe.h
> > @@ -68,6 +68,12 @@ struct riscv_hwprobe {
> >  #define                RISCV_HWPROBE_MISALIGNED_UNSUPPORTED    (4 << 0)
> >  #define                RISCV_HWPROBE_MISALIGNED_MASK           (7 << 0)
> >  #define RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE    6
> > +#define RISCV_HWPROBE_VEC_KEY_MISALIGNED_PERF  7
> > +#define                RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN            0
> > +#define                RISCV_HWPROBE_VEC_MISALIGNED_EMULATED           1
> > +#define                RISCV_HWPROBE_VEC_MISALIGNED_SLOW               2
> > +#define                RISCV_HWPROBE_VEC_MISALIGNED_FAST               3
> > +#define                RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED        4
> 
> This needs to be added to the documentation as well.

+1

> What value should be returned when V is not enabled in the kernel, or
> V is not supported in the hardware? Currently in the code it would be
> UNKNOWN, right? Is that what we want, or is it worth differentiating
> "no support for V" from "I don't know the speed of misaligned loads"?
> Maybe UNKNOWN is the right value, as there are other values to tell
> you V is not enabled.

I think UNKNOWN is fine for !V, assuming definitions identical to scalar.
Realistically, we /should/ always know the value once we get as far as
userspace (ignoring the fact that this patch as implemented doesn't know),
either by probing for it or via config options like those that the
non-vector stuff has.

What I am not a fan of in this patch is how it makes UNKNOWN a proxy
for "SUPPORTED"; I think we should set SLOW as a minimum. Really we
should add probing and Kconfig options, and maybe emulation if there's
a strong case for it.
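
i.e. something like this at the end of the probe (a sketch of the
suggestion, not code from the series):

    /* the probe completed without trapping: report at least SLOW */
    if (*mas_ptr == RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN)
        *mas_ptr = RISCV_HWPROBE_VEC_MISALIGNED_SLOW;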

> >  /* Increase RISCV_HWPROBE_MAX_KEY when adding items. */
> >
> >  /* Flags */
> > diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c
> > index b286b73e763e..ce641cc6e47a 100644
> > --- a/arch/riscv/kernel/sys_hwprobe.c
> > +++ b/arch/riscv/kernel/sys_hwprobe.c
> > @@ -184,6 +184,36 @@ static u64 hwprobe_misaligned(const struct cpumask *cpus)
> >  }
> >  #endif
> >
> > +#if defined(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS)
> > +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
> > +{
> > +       int cpu;
> > +       u64 perf = -1ULL;
> > +
> > +       for_each_cpu(cpu, cpus) {
> > +               int this_perf = per_cpu(vector_misaligned_access, cpu);
> > +
> > +               if (perf == -1ULL)
> > +                       perf = this_perf;
> > +
> > +               if (perf != this_perf) {
> > +                       perf = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
> > +                       break;
> > +               }
> > +       }
> > +
> > +       if (perf == -1ULL)
> > +               return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
> > +
> > +       return perf;
> > +}
> > +#else
> > +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
> > +{
> > +       return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
> > +}
> > +#endif
> > +
> >  static void hwprobe_one_pair(struct riscv_hwprobe *pair,
> >                              const struct cpumask *cpus)
> >  {
> > @@ -211,6 +241,10 @@ static void hwprobe_one_pair(struct riscv_hwprobe *pair,
> >                 pair->value = hwprobe_misaligned(cpus);
> >                 break;
> >
> > +       case RISCV_HWPROBE_VEC_KEY_MISALIGNED_PERF:
> > +               pair->value = hwprobe_vec_misaligned(cpus);
> > +               break;
> > +
> >         case RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE:
> >                 pair->value = 0;
> >                 if (hwprobe_ext0_has(cpus, RISCV_HWPROBE_EXT_ZICBOZ))
> > diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c
> > index 2adb7c3e4dd5..0c07e990e9c5 100644
> > --- a/arch/riscv/kernel/traps_misaligned.c
> > +++ b/arch/riscv/kernel/traps_misaligned.c
> > @@ -16,6 +16,7 @@
> >  #include <asm/entry-common.h>
> >  #include <asm/hwprobe.h>
> >  #include <asm/cpufeature.h>
> > +#include <asm/vector.h>
> >
> >  #define INSN_MATCH_LB                  0x3
> >  #define INSN_MASK_LB                   0x707f
> > @@ -426,6 +427,14 @@ int handle_misaligned_load(struct pt_regs *regs)
> >         if (get_insn(regs, epc, &insn))
> >                 return -1;
> >
> > +#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
> > +       if (*this_cpu_ptr(&vector_misaligned_access) == RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN) {
> > +               *this_cpu_ptr(&vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
> 
> Shouldn't this be EMULATED, given we were just delegated this trap? I
> guess it depends on whether you're going to add support for actually
> handling the misaligned vector trap, as Clément mentioned.

Also, we should not actually set EMULATED if we don't actually emulate
it - this patch just advances epc, right? Also, is it even correct to
arbitrarily set this value on a misaligned trap - how do we know if this
was a vector access at this point? Finally, with how handle_misaligned_load()
works, aren't we gonna set the scalar misaligned access speed to EMULATED
during this trap even if the trapping instruction is a vector one? That seems
like it should be fixed in a patch of its own.

> Scalar misaligned loads had a history to lean on since the specs were
> always explicit that misaligned loads/store had to be supported one
> way or another. So UNSUPPORTED was a future theoretical value. I
> haven't dug through the specs yet, do you know what the story is for V
> and misaligned loads? My sub-question is what the rules are for
> detecting the difference between EMULATED and UNSUPPORTED.

I mean, if we emulate it, then it's EMULATED. If we don't emulate it and
nothing below us emulates it, then it is UNSUPPORTED. The one that has
question marks is EMULATED vs SLOW, where firmware emulation is
communicated to userspace as SLOW, because we can't really tell that it
is in fact EMULATED.

Cheers,
Conor.

> > +               regs->epc = epc + INSN_LEN(insn);
> > +               return 0;
> > +       }
> > +#endif
> > +
> >         regs->epc = 0;
> >
> >         if ((insn & INSN_MASK_LW) == INSN_MATCH_LW) {
> > @@ -625,6 +634,57 @@ static bool check_unaligned_access_emulated(int cpu)
> >         return misaligned_emu_detected;
> >  }
> >
> > +#ifdef CONFIG_RISCV_ISA_V
> > +static bool check_vector_unaligned_access(int cpu)
> > +{
> > +       long *mas_ptr = per_cpu_ptr(&vector_misaligned_access, cpu);
> > +       struct riscv_isainfo *isainfo = &hart_isa[cpu];
> > +       unsigned long tmp_var;
> > +       bool misaligned_vec_suported;
> > +
> > +       if (!riscv_isa_extension_available(isainfo->isa, v))
> > +               return false;
> > +
> > +       /* This case will only happen if a unaligned vector load
> > +        * was called by the kernel before this check
> > +        */
> > +       if (*mas_ptr != RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN)
> > +               return false;
> > +
> > +       kernel_vector_begin();
> > +       __asm__ __volatile__ (
> > +               ".option push\n\t"
> > +               ".option arch, +v\n\t"
> > +               "       li t1, 0x1\n"                           //size
> > +               "       vsetvli t0, t1, e16, m2, ta, ma\n\t"    // Vectors of 16b
> > +               "       addi t0, %[ptr], 1\n\t"                 // Misalign address
> > +               "       vle16.v v0, (t0)\n\t"                   // Load bytes
> > +               ".option pop\n\t"
> > +               : : [ptr] "r" (&tmp_var) : "v0", "t0", "t1", "memory");
> > +       kernel_vector_end();
> > +
> > +       misaligned_vec_suported = (*mas_ptr == RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN);
> > +
> > +       return misaligned_vec_suported;
> > +}
> > +#else
> > +static bool check_vector_unaligned_access(int cpu)
> > +{
> > +       return false;
> > +}
> > +#endif
> > +
> > +bool check_vector_unaligned_access_all_cpus(void)
> > +{
> > +       int cpu;
> > +
> > +       for_each_online_cpu(cpu)
> > +               if (!check_vector_unaligned_access(cpu))
> > +                       return false;
> > +
> > +       return true;
> > +}
> 
> These functions return a bool, but the bool is never checked. I'm
> guessing that's because you're going to check it in a future patch
> where you decide whether or not to probe for fast/slow?
> 
> 
> > +
> >  bool check_unaligned_access_emulated_all_cpus(void)
> >  {
> >         int cpu;
> > diff --git a/arch/riscv/kernel/unaligned_access_speed.c b/arch/riscv/kernel/unaligned_access_speed.c
> > index a9a6bcb02acf..92a84239beaa 100644
> > --- a/arch/riscv/kernel/unaligned_access_speed.c
> > +++ b/arch/riscv/kernel/unaligned_access_speed.c
> > @@ -20,6 +20,7 @@
> >  #define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)
> >
> >  DEFINE_PER_CPU(long, misaligned_access_speed);
> > +DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
> >
> >  #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
> >  static cpumask_t fast_misaligned_access;
> > @@ -264,6 +265,8 @@ static int check_unaligned_access_all_cpus(void)
> >  {
> >         bool all_cpus_emulated = check_unaligned_access_emulated_all_cpus();
> >
> > +       check_vector_unaligned_access_all_cpus();
> > +
> >         if (!all_cpus_emulated)
> >                 return check_unaligned_access_speed_all_cpus();
> >
> > @@ -273,6 +276,7 @@ static int check_unaligned_access_all_cpus(void)
> >  static int check_unaligned_access_all_cpus(void)
> >  {
> >         check_unaligned_access_emulated_all_cpus();
> > +       check_vector_unaligned_access_all_cpus();
> >
> >         return 0;
> >  }
> > --
> > 2.43.0
> >
>
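
The unused boolean return flagged above would presumably be consumed once
speed probing exists for vector accesses. A minimal sketch of such a caller,
assuming a future check_vector_unaligned_access_speed_all_cpus() (that name
is hypothetical, not part of this patch):

static int check_unaligned_access_all_cpus(void)
{
	bool all_cpus_emulated = check_unaligned_access_emulated_all_cpus();
	bool vec_supported = check_vector_unaligned_access_all_cpus();

	/* Hypothetical: only measure vector speed where misaligned
	 * vector accesses are handled in hardware on every CPU.
	 */
	if (vec_supported)
		check_vector_unaligned_access_speed_all_cpus();

	if (!all_cpus_emulated)
		return check_unaligned_access_speed_all_cpus();

	return 0;
}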
Conor Dooley June 5, 2024, 4:34 p.m. UTC | #7
> On Wed, Jun 05, 2024 at 08:54:21AM -0700, Evan Green wrote:
> > On Tue, Jun 4, 2024 at 9:25 AM Jesse Taube <jesse@rivosinc.com> wrote:
> > What value should be returned when V is not enabled in the kernel, or
> > V is not supported in the hardware? Currently in the code it would be
> > UNKNOWN, right? Is that what we want, or is it worth differentiating
> > "no support for V" from "I don't know the speed of misaligned loads"?
> > Maybe UNKNOWN is the right value, as there are other values to tell
> > you V is not enabled.
> 
> I think UNKNOWN is fine for !V, assuming definitions identical to the scalar ones.

I dunno, maybe we should set it to UNSUPPORTED in that case, but there's
probably some funny behaviour around the v prctl we might need to be
aware of. If the system has vector, we should probably figure out what
the actual speed is, but not report it while vector is disabled via the
prctl?
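
A rough sketch of that direction, reusing the existing has_vector() helper
(everything else below is an assumption, not part of this patch): report
UNSUPPORTED whenever the kernel has no vector support at all, and leave any
prctl-based masking as a separate policy decision:

static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
{
	int cpu;
	u64 perf = -1ULL;

	/* Assumed behaviour: without V in the kernel or hardware, the
	 * question of misaligned vector speed cannot arise at all.
	 */
	if (!has_vector())
		return RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;

	for_each_cpu(cpu, cpus) {
		int this_perf = per_cpu(vector_misaligned_access, cpu);

		if (perf == -1ULL)
			perf = this_perf;

		/* Mixed results across CPUs collapse to UNKNOWN. */
		if (perf != this_perf)
			return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
	}

	if (perf == -1ULL)
		return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;

	return perf;
}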

Patch

diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h
index 347805446151..5ad69cf25b25 100644
--- a/arch/riscv/include/asm/cpufeature.h
+++ b/arch/riscv/include/asm/cpufeature.h
@@ -35,9 +35,12 @@  void riscv_user_isa_enable(void);
 
 #if defined(CONFIG_RISCV_MISALIGNED)
 bool check_unaligned_access_emulated_all_cpus(void);
+bool check_vector_unaligned_access_all_cpus(void);
+
 void unaligned_emulation_finish(void);
 bool unaligned_ctl_available(void);
 DECLARE_PER_CPU(long, misaligned_access_speed);
+DECLARE_PER_CPU(long, vector_misaligned_access);
 #else
 static inline bool unaligned_ctl_available(void)
 {
diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h
index 630507dff5ea..150a9877b0af 100644
--- a/arch/riscv/include/asm/hwprobe.h
+++ b/arch/riscv/include/asm/hwprobe.h
@@ -8,7 +8,7 @@ 
 
 #include <uapi/asm/hwprobe.h>
 
-#define RISCV_HWPROBE_MAX_KEY 6
+#define RISCV_HWPROBE_MAX_KEY 7
 
 static inline bool riscv_hwprobe_key_is_valid(__s64 key)
 {
diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h
index 060212331a03..4474e98d17bd 100644
--- a/arch/riscv/include/uapi/asm/hwprobe.h
+++ b/arch/riscv/include/uapi/asm/hwprobe.h
@@ -68,6 +68,12 @@  struct riscv_hwprobe {
 #define		RISCV_HWPROBE_MISALIGNED_UNSUPPORTED	(4 << 0)
 #define		RISCV_HWPROBE_MISALIGNED_MASK		(7 << 0)
 #define RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE	6
+#define RISCV_HWPROBE_VEC_KEY_MISALIGNED_PERF	7
+#define		RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN		0
+#define		RISCV_HWPROBE_VEC_MISALIGNED_EMULATED		1
+#define		RISCV_HWPROBE_VEC_MISALIGNED_SLOW		2
+#define		RISCV_HWPROBE_VEC_MISALIGNED_FAST		3
+#define		RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED	4
 /* Increase RISCV_HWPROBE_MAX_KEY when adding items. */
 
 /* Flags */
diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c
index b286b73e763e..ce641cc6e47a 100644
--- a/arch/riscv/kernel/sys_hwprobe.c
+++ b/arch/riscv/kernel/sys_hwprobe.c
@@ -184,6 +184,36 @@  static u64 hwprobe_misaligned(const struct cpumask *cpus)
 }
 #endif
 
+#if defined(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS)
+static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
+{
+	int cpu;
+	u64 perf = -1ULL;
+
+	for_each_cpu(cpu, cpus) {
+		int this_perf = per_cpu(vector_misaligned_access, cpu);
+
+		if (perf == -1ULL)
+			perf = this_perf;
+
+		if (perf != this_perf) {
+			perf = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
+			break;
+		}
+	}
+
+	if (perf == -1ULL)
+		return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
+
+	return perf;
+}
+#else
+static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
+{
+	return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
+}
+#endif
+
 static void hwprobe_one_pair(struct riscv_hwprobe *pair,
 			     const struct cpumask *cpus)
 {
@@ -211,6 +241,10 @@  static void hwprobe_one_pair(struct riscv_hwprobe *pair,
 		pair->value = hwprobe_misaligned(cpus);
 		break;
 
+	case RISCV_HWPROBE_VEC_KEY_MISALIGNED_PERF:
+		pair->value = hwprobe_vec_misaligned(cpus);
+		break;
+
 	case RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE:
 		pair->value = 0;
 		if (hwprobe_ext0_has(cpus, RISCV_HWPROBE_EXT_ZICBOZ))
diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c
index 2adb7c3e4dd5..0c07e990e9c5 100644
--- a/arch/riscv/kernel/traps_misaligned.c
+++ b/arch/riscv/kernel/traps_misaligned.c
@@ -16,6 +16,7 @@ 
 #include <asm/entry-common.h>
 #include <asm/hwprobe.h>
 #include <asm/cpufeature.h>
+#include <asm/vector.h>
 
 #define INSN_MATCH_LB			0x3
 #define INSN_MASK_LB			0x707f
@@ -426,6 +427,14 @@  int handle_misaligned_load(struct pt_regs *regs)
 	if (get_insn(regs, epc, &insn))
 		return -1;
 
+#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
+	if (*this_cpu_ptr(&vector_misaligned_access) == RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN) {
+		*this_cpu_ptr(&vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
+		regs->epc = epc + INSN_LEN(insn);
+		return 0;
+	}
+#endif
+
 	regs->epc = 0;
 
 	if ((insn & INSN_MASK_LW) == INSN_MATCH_LW) {
@@ -625,6 +634,57 @@  static bool check_unaligned_access_emulated(int cpu)
 	return misaligned_emu_detected;
 }
 
+#ifdef CONFIG_RISCV_ISA_V
+static bool check_vector_unaligned_access(int cpu)
+{
+	long *mas_ptr = per_cpu_ptr(&vector_misaligned_access, cpu);
+	struct riscv_isainfo *isainfo = &hart_isa[cpu];
+	unsigned long tmp_var;
+	bool misaligned_vec_supported;
+
+	if (!riscv_isa_extension_available(isainfo->isa, v))
+		return false;
+
+	/* This case will only happen if an unaligned vector load
+	 * was executed by the kernel before this check
+	 */
+	if (*mas_ptr != RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN)
+		return false;
+
+	kernel_vector_begin();
+	__asm__ __volatile__ (
+		".option push\n\t"
+		".option arch, +v\n\t"
+		"	li t1, 0x1\n\t"				// One element
+		"       vsetvli t0, t1, e16, m2, ta, ma\n\t"	// 16-bit element vectors
+		"       addi t0, %[ptr], 1\n\t"			// Misalign the address
+		"	vle16.v v0, (t0)\n\t"			// Load one 16-bit element
+		".option pop\n\t"
+		: : [ptr] "r" (&tmp_var) : "v0", "t0", "t1", "memory");
+	kernel_vector_end();
+
+	misaligned_vec_supported = (*mas_ptr == RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN);
+
+	return misaligned_vec_supported;
+}
+#else
+static bool check_vector_unaligned_access(int cpu)
+{
+	return false;
+}
+#endif
+
+bool check_vector_unaligned_access_all_cpus(void)
+{
+	int cpu;
+
+	for_each_online_cpu(cpu)
+		if (!check_vector_unaligned_access(cpu))
+			return false;
+
+	return true;
+}
+
 bool check_unaligned_access_emulated_all_cpus(void)
 {
 	int cpu;
diff --git a/arch/riscv/kernel/unaligned_access_speed.c b/arch/riscv/kernel/unaligned_access_speed.c
index a9a6bcb02acf..92a84239beaa 100644
--- a/arch/riscv/kernel/unaligned_access_speed.c
+++ b/arch/riscv/kernel/unaligned_access_speed.c
@@ -20,6 +20,7 @@ 
 #define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)
 
 DEFINE_PER_CPU(long, misaligned_access_speed);
+DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
 
 #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
 static cpumask_t fast_misaligned_access;
@@ -264,6 +265,8 @@  static int check_unaligned_access_all_cpus(void)
 {
 	bool all_cpus_emulated = check_unaligned_access_emulated_all_cpus();
 
+	check_vector_unaligned_access_all_cpus();
+
 	if (!all_cpus_emulated)
 		return check_unaligned_access_speed_all_cpus();
 
@@ -273,6 +276,7 @@  static int check_unaligned_access_all_cpus(void)
 static int check_unaligned_access_all_cpus(void)
 {
 	check_unaligned_access_emulated_all_cpus();
+	check_vector_unaligned_access_all_cpus();
 
 	return 0;
 }
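
Once applied, the new key can be queried from userspace. A minimal sketch
(assumes updated uapi headers and a libc that exposes __NR_riscv_hwprobe;
there is no dedicated libc wrapper assumed here):

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/hwprobe.h>

int main(void)
{
	struct riscv_hwprobe pair = {
		.key = RISCV_HWPROBE_VEC_KEY_MISALIGNED_PERF,
	};

	/* cpusetsize == 0 and cpus == NULL query all online CPUs. */
	if (syscall(__NR_riscv_hwprobe, &pair, 1, 0, NULL, 0))
		return 1;

	switch (pair.value) {
	case RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED:
		puts("vector misaligned accesses: unsupported");
		break;
	case RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN:
		puts("vector misaligned accesses: unknown");
		break;
	case RISCV_HWPROBE_VEC_MISALIGNED_EMULATED:
		puts("vector misaligned accesses: emulated");
		break;
	default:
		printf("vector misaligned accesses: perf class %llu\n",
		       (unsigned long long)pair.value);
	}
	return 0;
}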