@@ -690,6 +690,7 @@ config THREAD_SIZE_ORDER
config RISCV_MISALIGNED
bool "Support misaligned load/store traps for kernel and userspace"
select SYSCTL_ARCH_UNALIGN_ALLOW
+ depends on !HAVE_EFFICIENT_UNALIGNED_ACCESS
default y
help
Say Y here if you want the kernel to embed support for misaligned
@@ -51,6 +51,7 @@ static inline bool check_unaligned_access_emulated(int cpu)
static inline void unaligned_emulation_finish(void) {}
#endif
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
DECLARE_PER_CPU(long, misaligned_access_speed);
DECLARE_STATIC_KEY_FALSE(fast_misaligned_access_speed_key);
@@ -59,6 +60,12 @@ static __always_inline bool has_fast_misaligned_accesses(void)
{
return static_branch_likely(&fast_misaligned_access_speed_key);
}
+#else
+static __always_inline bool has_fast_misaligned_accesses(void)
+{
+ return true;
+}
+#endif
unsigned long riscv_get_elf_hwcap(void);
new file mode 100644
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2024 Rivos, Inc
+ */
+
+#ifndef _ASM_MISALIGNED_ACCESS_SPEED_H
+#define _ASM_MISALIGNED_ACCESS_SPEED_H
+
+#include <linux/compiler_attributes.h>
+#include <linux/jump_label.h>
+#include <linux/percpu.h>
+#include <linux/types.h>
+
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+DECLARE_PER_CPU(long, misaligned_access_speed);
+
+DECLARE_STATIC_KEY_FALSE(fast_misaligned_access_speed_key);
+
+static __always_inline bool has_fast_misaligned_accesses(void)
+{
+ return static_branch_likely(&fast_misaligned_access_speed_key);
+}
+#else
+static __always_inline bool has_fast_misaligned_accesses(void)
+{
+ return static_branch_likely(&fast_misaligned_access_speed_key);
+}
+#endif
+#endif
@@ -62,6 +62,9 @@ obj-y += tests/
obj-$(CONFIG_MMU) += vdso.o vdso/
obj-$(CONFIG_RISCV_MISALIGNED) += traps_misaligned.o
+ifneq ($(RISCV_EFFICIENT_UNALIGNED_ACCESS), y)
+obj-y += misaligned_access_speed.o
+endif
obj-$(CONFIG_FPU) += fpu.o
obj-$(CONFIG_RISCV_ISA_V) += vector.o
obj-$(CONFIG_RISCV_ISA_V) += kernel_mode_vector.o
@@ -11,7 +11,6 @@
#include <linux/cpu.h>
#include <linux/cpuhotplug.h>
#include <linux/ctype.h>
-#include <linux/jump_label.h>
#include <linux/log2.h>
#include <linux/memory.h>
#include <linux/module.h>
@@ -21,20 +20,12 @@
#include <asm/cacheflush.h>
#include <asm/cpufeature.h>
#include <asm/hwcap.h>
-#include <asm/hwprobe.h>
#include <asm/patch.h>
#include <asm/processor.h>
#include <asm/vector.h>
-#include "copy-unaligned.h"
-
#define NUM_ALPHA_EXTS ('z' - 'a' + 1)
-#define MISALIGNED_ACCESS_JIFFIES_LG2 1
-#define MISALIGNED_BUFFER_SIZE 0x4000
-#define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE)
-#define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)
-
unsigned long elf_hwcap __read_mostly;
/* Host ISA bitmap */
@@ -43,11 +34,6 @@ static DECLARE_BITMAP(riscv_isa, RISCV_ISA_EXT_MAX) __read_mostly;
/* Per-cpu ISA extensions. */
struct riscv_isainfo hart_isa[NR_CPUS];
-/* Performance information */
-DEFINE_PER_CPU(long, misaligned_access_speed);
-
-static cpumask_t fast_misaligned_access;
-
/**
* riscv_isa_extension_base() - Get base extension word
*
@@ -706,247 +692,6 @@ unsigned long riscv_get_elf_hwcap(void)
return hwcap;
}
-static int check_unaligned_access(void *param)
-{
- int cpu = smp_processor_id();
- u64 start_cycles, end_cycles;
- u64 word_cycles;
- u64 byte_cycles;
- int ratio;
- unsigned long start_jiffies, now;
- struct page *page = param;
- void *dst;
- void *src;
- long speed = RISCV_HWPROBE_MISALIGNED_SLOW;
-
- if (check_unaligned_access_emulated(cpu))
- return 0;
-
- /* Make an unaligned destination buffer. */
- dst = (void *)((unsigned long)page_address(page) | 0x1);
- /* Unalign src as well, but differently (off by 1 + 2 = 3). */
- src = dst + (MISALIGNED_BUFFER_SIZE / 2);
- src += 2;
- word_cycles = -1ULL;
- /* Do a warmup. */
- __riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
- preempt_disable();
- start_jiffies = jiffies;
- while ((now = jiffies) == start_jiffies)
- cpu_relax();
-
- /*
- * For a fixed amount of time, repeatedly try the function, and take
- * the best time in cycles as the measurement.
- */
- while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
- start_cycles = get_cycles64();
- /* Ensure the CSR read can't reorder WRT to the copy. */
- mb();
- __riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
- /* Ensure the copy ends before the end time is snapped. */
- mb();
- end_cycles = get_cycles64();
- if ((end_cycles - start_cycles) < word_cycles)
- word_cycles = end_cycles - start_cycles;
- }
-
- byte_cycles = -1ULL;
- __riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
- start_jiffies = jiffies;
- while ((now = jiffies) == start_jiffies)
- cpu_relax();
-
- while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
- start_cycles = get_cycles64();
- mb();
- __riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
- mb();
- end_cycles = get_cycles64();
- if ((end_cycles - start_cycles) < byte_cycles)
- byte_cycles = end_cycles - start_cycles;
- }
-
- preempt_enable();
-
- /* Don't divide by zero. */
- if (!word_cycles || !byte_cycles) {
- pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned access speed\n",
- cpu);
-
- return 0;
- }
-
- if (word_cycles < byte_cycles)
- speed = RISCV_HWPROBE_MISALIGNED_FAST;
-
- ratio = div_u64((byte_cycles * 100), word_cycles);
- pr_info("cpu%d: Ratio of byte access time to unaligned word access is %d.%02d, unaligned accesses are %s\n",
- cpu,
- ratio / 100,
- ratio % 100,
- (speed == RISCV_HWPROBE_MISALIGNED_FAST) ? "fast" : "slow");
-
- per_cpu(misaligned_access_speed, cpu) = speed;
-
- /*
- * Set the value of fast_misaligned_access of a CPU. These operations
- * are atomic to avoid race conditions.
- */
- if (speed == RISCV_HWPROBE_MISALIGNED_FAST)
- cpumask_set_cpu(cpu, &fast_misaligned_access);
- else
- cpumask_clear_cpu(cpu, &fast_misaligned_access);
-
- return 0;
-}
-
-static void check_unaligned_access_nonboot_cpu(void *param)
-{
- unsigned int cpu = smp_processor_id();
- struct page **pages = param;
-
- if (smp_processor_id() != 0)
- check_unaligned_access(pages[cpu]);
-}
-
-DEFINE_STATIC_KEY_FALSE(fast_misaligned_access_speed_key);
-
-static void modify_unaligned_access_branches(cpumask_t *mask, int weight)
-{
- if (cpumask_weight(mask) == weight)
- static_branch_enable_cpuslocked(&fast_misaligned_access_speed_key);
- else
- static_branch_disable_cpuslocked(&fast_misaligned_access_speed_key);
-}
-
-static void set_unaligned_access_static_branches_except_cpu(int cpu)
-{
- /*
- * Same as set_unaligned_access_static_branches, except excludes the
- * given CPU from the result. When a CPU is hotplugged into an offline
- * state, this function is called before the CPU is set to offline in
- * the cpumask, and thus the CPU needs to be explicitly excluded.
- */
-
- cpumask_t fast_except_me;
-
- cpumask_and(&fast_except_me, &fast_misaligned_access, cpu_online_mask);
- cpumask_clear_cpu(cpu, &fast_except_me);
-
- modify_unaligned_access_branches(&fast_except_me, num_online_cpus() - 1);
-}
-
-static void set_unaligned_access_static_branches(void)
-{
- /*
- * This will be called after check_unaligned_access_all_cpus so the
- * result of unaligned access speed for all CPUs will be available.
- *
- * To avoid the number of online cpus changing between reading
- * cpu_online_mask and calling num_online_cpus, cpus_read_lock must be
- * held before calling this function.
- */
-
- cpumask_t fast_and_online;
-
- cpumask_and(&fast_and_online, &fast_misaligned_access, cpu_online_mask);
-
- modify_unaligned_access_branches(&fast_and_online, num_online_cpus());
-}
-
-static int lock_and_set_unaligned_access_static_branch(void)
-{
- cpus_read_lock();
- set_unaligned_access_static_branches();
- cpus_read_unlock();
-
- return 0;
-}
-
-arch_initcall_sync(lock_and_set_unaligned_access_static_branch);
-
-static int riscv_online_cpu(unsigned int cpu)
-{
- static struct page *buf;
-
- /* We are already set since the last check */
- if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_UNKNOWN)
- goto exit;
-
- buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
- if (!buf) {
- pr_warn("Allocation failure, not measuring misaligned performance\n");
- return -ENOMEM;
- }
-
- check_unaligned_access(buf);
- __free_pages(buf, MISALIGNED_BUFFER_ORDER);
-
-exit:
- set_unaligned_access_static_branches();
-
- return 0;
-}
-
-static int riscv_offline_cpu(unsigned int cpu)
-{
- set_unaligned_access_static_branches_except_cpu(cpu);
-
- return 0;
-}
-
-/* Measure unaligned access on all CPUs present at boot in parallel. */
-static int check_unaligned_access_all_cpus(void)
-{
- unsigned int cpu;
- unsigned int cpu_count = num_possible_cpus();
- struct page **bufs = kzalloc(cpu_count * sizeof(struct page *),
- GFP_KERNEL);
-
- if (!bufs) {
- pr_warn("Allocation failure, not measuring misaligned performance\n");
- return 0;
- }
-
- /*
- * Allocate separate buffers for each CPU so there's no fighting over
- * cache lines.
- */
- for_each_cpu(cpu, cpu_online_mask) {
- bufs[cpu] = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
- if (!bufs[cpu]) {
- pr_warn("Allocation failure, not measuring misaligned performance\n");
- goto out;
- }
- }
-
- /* Check everybody except 0, who stays behind to tend jiffies. */
- on_each_cpu(check_unaligned_access_nonboot_cpu, bufs, 1);
-
- /* Check core 0. */
- smp_call_on_cpu(0, check_unaligned_access, bufs[0], true);
-
- /*
- * Setup hotplug callbacks for any new CPUs that come online or go
- * offline.
- */
- cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
- riscv_online_cpu, riscv_offline_cpu);
-
-out:
- unaligned_emulation_finish();
- for_each_cpu(cpu, cpu_online_mask) {
- if (bufs[cpu])
- __free_pages(bufs[cpu], MISALIGNED_BUFFER_ORDER);
- }
-
- kfree(bufs);
- return 0;
-}
-
-arch_initcall(check_unaligned_access_all_cpus);
-
void riscv_user_isa_enable(void)
{
if (riscv_cpu_has_extension_unlikely(smp_processor_id(), RISCV_ISA_EXT_ZICBOZ))
new file mode 100644
@@ -0,0 +1,265 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2024 Rivos Inc.
+ */
+
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/jump_label.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/types.h>
+#include <asm/cpufeature.h>
+#include <asm/hwprobe.h>
+
+#include "copy-unaligned.h"
+
+#define MISALIGNED_ACCESS_JIFFIES_LG2 1
+#define MISALIGNED_BUFFER_SIZE 0x4000
+#define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE)
+#define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)
+
+DEFINE_PER_CPU(long, misaligned_access_speed);
+
+static cpumask_t fast_misaligned_access;
+
+static int check_unaligned_access(void *param)
+{
+ int cpu = smp_processor_id();
+ u64 start_cycles, end_cycles;
+ u64 word_cycles;
+ u64 byte_cycles;
+ int ratio;
+ unsigned long start_jiffies, now;
+ struct page *page = param;
+ void *dst;
+ void *src;
+ long speed = RISCV_HWPROBE_MISALIGNED_SLOW;
+
+ if (check_unaligned_access_emulated(cpu))
+ return 0;
+
+ /* Make an unaligned destination buffer. */
+ dst = (void *)((unsigned long)page_address(page) | 0x1);
+ /* Unalign src as well, but differently (off by 1 + 2 = 3). */
+ src = dst + (MISALIGNED_BUFFER_SIZE / 2);
+ src += 2;
+ word_cycles = -1ULL;
+ /* Do a warmup. */
+ __riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
+ preempt_disable();
+ start_jiffies = jiffies;
+ while ((now = jiffies) == start_jiffies)
+ cpu_relax();
+
+ /*
+ * For a fixed amount of time, repeatedly try the function, and take
+ * the best time in cycles as the measurement.
+ */
+ while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
+ start_cycles = get_cycles64();
+ /* Ensure the CSR read can't reorder WRT to the copy. */
+ mb();
+ __riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
+ /* Ensure the copy ends before the end time is snapped. */
+ mb();
+ end_cycles = get_cycles64();
+ if ((end_cycles - start_cycles) < word_cycles)
+ word_cycles = end_cycles - start_cycles;
+ }
+
+ byte_cycles = -1ULL;
+ __riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
+ start_jiffies = jiffies;
+ while ((now = jiffies) == start_jiffies)
+ cpu_relax();
+
+ while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
+ start_cycles = get_cycles64();
+ mb();
+ __riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
+ mb();
+ end_cycles = get_cycles64();
+ if ((end_cycles - start_cycles) < byte_cycles)
+ byte_cycles = end_cycles - start_cycles;
+ }
+
+ preempt_enable();
+
+ /* Don't divide by zero. */
+ if (!word_cycles || !byte_cycles) {
+ pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned access speed\n",
+ cpu);
+
+ return 0;
+ }
+
+ if (word_cycles < byte_cycles)
+ speed = RISCV_HWPROBE_MISALIGNED_FAST;
+
+ ratio = div_u64((byte_cycles * 100), word_cycles);
+ pr_info("cpu%d: Ratio of byte access time to unaligned word access is %d.%02d, unaligned accesses are %s\n",
+ cpu,
+ ratio / 100,
+ ratio % 100,
+ (speed == RISCV_HWPROBE_MISALIGNED_FAST) ? "fast" : "slow");
+
+ per_cpu(misaligned_access_speed, cpu) = speed;
+
+ /*
+ * Set the value of fast_misaligned_access of a CPU. These operations
+ * are atomic to avoid race conditions.
+ */
+ if (speed == RISCV_HWPROBE_MISALIGNED_FAST)
+ cpumask_set_cpu(cpu, &fast_misaligned_access);
+ else
+ cpumask_clear_cpu(cpu, &fast_misaligned_access);
+
+ return 0;
+}
+
+static void check_unaligned_access_nonboot_cpu(void *param)
+{
+ unsigned int cpu = smp_processor_id();
+ struct page **pages = param;
+
+ if (smp_processor_id() != 0)
+ check_unaligned_access(pages[cpu]);
+}
+
+DEFINE_STATIC_KEY_FALSE(fast_misaligned_access_speed_key);
+
+static void modify_unaligned_access_branches(cpumask_t *mask, int weight)
+{
+ if (cpumask_weight(mask) == weight)
+ static_branch_enable_cpuslocked(&fast_misaligned_access_speed_key);
+ else
+ static_branch_disable_cpuslocked(&fast_misaligned_access_speed_key);
+}
+
+static void set_unaligned_access_static_branches_except_cpu(int cpu)
+{
+ /*
+ * Same as set_unaligned_access_static_branches, except excludes the
+ * given CPU from the result. When a CPU is hotplugged into an offline
+ * state, this function is called before the CPU is set to offline in
+ * the cpumask, and thus the CPU needs to be explicitly excluded.
+ */
+
+ cpumask_t fast_except_me;
+
+ cpumask_and(&fast_except_me, &fast_misaligned_access, cpu_online_mask);
+ cpumask_clear_cpu(cpu, &fast_except_me);
+
+ modify_unaligned_access_branches(&fast_except_me, num_online_cpus() - 1);
+}
+
+static void set_unaligned_access_static_branches(void)
+{
+ /*
+ * This will be called after check_unaligned_access_all_cpus so the
+ * result of unaligned access speed for all CPUs will be available.
+ *
+ * To avoid the number of online cpus changing between reading
+ * cpu_online_mask and calling num_online_cpus, cpus_read_lock must be
+ * held before calling this function.
+ */
+
+ cpumask_t fast_and_online;
+
+ cpumask_and(&fast_and_online, &fast_misaligned_access, cpu_online_mask);
+
+ modify_unaligned_access_branches(&fast_and_online, num_online_cpus());
+}
+
+static int lock_and_set_unaligned_access_static_branch(void)
+{
+ cpus_read_lock();
+ set_unaligned_access_static_branches();
+ cpus_read_unlock();
+
+ return 0;
+}
+
+arch_initcall_sync(lock_and_set_unaligned_access_static_branch);
+
+static int riscv_online_cpu(unsigned int cpu)
+{
+ static struct page *buf;
+
+ /* We are already set since the last check */
+ if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_UNKNOWN)
+ goto exit;
+
+ buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
+ if (!buf) {
+ pr_warn("Allocation failure, not measuring misaligned performance\n");
+ return -ENOMEM;
+ }
+
+ check_unaligned_access(buf);
+ __free_pages(buf, MISALIGNED_BUFFER_ORDER);
+
+exit:
+ set_unaligned_access_static_branches();
+
+ return 0;
+}
+
+static int riscv_offline_cpu(unsigned int cpu)
+{
+ set_unaligned_access_static_branches_except_cpu(cpu);
+
+ return 0;
+}
+
+/* Measure unaligned access on all CPUs present at boot in parallel. */
+static int check_unaligned_access_all_cpus(void)
+{
+ unsigned int cpu;
+ unsigned int cpu_count = num_possible_cpus();
+ struct page **bufs = kzalloc(cpu_count * sizeof(struct page *),
+ GFP_KERNEL);
+
+ if (!bufs) {
+ pr_warn("Allocation failure, not measuring misaligned performance\n");
+ return 0;
+ }
+
+ /*
+ * Allocate separate buffers for each CPU so there's no fighting over
+ * cache lines.
+ */
+ for_each_cpu(cpu, cpu_online_mask) {
+ bufs[cpu] = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
+ if (!bufs[cpu]) {
+ pr_warn("Allocation failure, not measuring misaligned performance\n");
+ goto out;
+ }
+ }
+
+ /* Check everybody except 0, who stays behind to tend jiffies. */
+ on_each_cpu(check_unaligned_access_nonboot_cpu, bufs, 1);
+
+ /* Check core 0. */
+ smp_call_on_cpu(0, check_unaligned_access, bufs[0], true);
+
+ /*
+ * Setup hotplug callbacks for any new CPUs that come online or go
+ * offline.
+ */
+ cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
+ riscv_online_cpu, riscv_offline_cpu);
+
+out:
+ unaligned_emulation_finish();
+ for_each_cpu(cpu, cpu_online_mask) {
+ if (bufs[cpu])
+ __free_pages(bufs[cpu], MISALIGNED_BUFFER_ORDER);
+ }
+
+ kfree(bufs);
+ return 0;
+}
+
+arch_initcall(check_unaligned_access_all_cpus);
@@ -149,6 +149,7 @@ static bool hwprobe_ext0_has(const struct cpumask *cpus, unsigned long ext)
static u64 hwprobe_misaligned(const struct cpumask *cpus)
{
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
int cpu;
u64 perf = -1ULL;
@@ -168,6 +169,9 @@ static u64 hwprobe_misaligned(const struct cpumask *cpus)
return RISCV_HWPROBE_MISALIGNED_UNKNOWN;
return perf;
+#else
+ return RISCV_HWPROBE_MISALIGNED_FAST;
+#endif
}
static void hwprobe_one_pair(struct riscv_hwprobe *pair,
When CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is selected, the cpus can be set to have fast misaligned access without needing to probe. Signed-off-by: Charlie Jenkins <charlie@rivosinc.com> --- arch/riscv/Kconfig | 1 + arch/riscv/include/asm/cpufeature.h | 7 + arch/riscv/include/asm/misaligned_access_speed.h | 29 +++ arch/riscv/kernel/Makefile | 3 + arch/riscv/kernel/cpufeature.c | 255 ---------------------- arch/riscv/kernel/misaligned_access_speed.c | 265 +++++++++++++++++++++++ arch/riscv/kernel/sys_hwprobe.c | 4 + 7 files changed, 309 insertions(+), 255 deletions(-)