@@ -74,6 +74,8 @@ extern void sve_load_from_fpsimd_state(struct user_fpsimd_state const *state,
unsigned long vq_minus_1);
extern unsigned int sve_get_vl(void);
+extern void sve_set_vq(unsigned long vq_minus_1);
+
struct arm64_cpu_capabilities;
extern void sve_kernel_enable(const struct arm64_cpu_capabilities *__unused);
@@ -48,6 +48,11 @@ SYM_FUNC_START(sve_get_vl)
ret
SYM_FUNC_END(sve_get_vl)
+SYM_FUNC_START(sve_set_vq)
+ sve_load_vq x0, x1, x2
+ ret
+SYM_FUNC_END(sve_set_vq)
+
/*
* Load SVE state from FPSIMD state.
*
@@ -994,10 +994,10 @@ void fpsimd_release_task(struct task_struct *dead_task)
/*
* Trapped SVE access
*
- * Storage is allocated for the full SVE state, the current FPSIMD
- * register contents are migrated across, and TIF_SVE_EXEC is set so that
- * the SVE access trap will be disabled the next time this task
- * reaches ret_to_user.
+ * Storage is allocated for the full SVE state so that the code
+ * running subsequently has somewhere to save the SVE registers to. We
+ * then rely on ret_to_user to actually convert the FPSIMD registers
+ * to SVE state by flushing as required.
*
* TIF_SVE_EXEC should be clear on entry: otherwise,
* fpsimd_restore_current_state() would have disabled the SVE access
@@ -1016,15 +1016,26 @@ void do_sve_acc(unsigned int esr, struct pt_regs *regs)
get_cpu_fpsimd_context();
- fpsimd_save();
-
- /* Force ret_to_user to reload the registers: */
- fpsimd_flush_task_state(current);
-
- fpsimd_to_sve(current);
+ /*
+ * We shouldn't trap if we can execute SVE instructions and
+ * there should be no SVE state if that is the case.
+ */
if (test_and_set_thread_flag(TIF_SVE_EXEC))
- WARN_ON(1); /* SVE access shouldn't have trapped */
- set_thread_flag(TIF_SVE_FULL_REGS);
+ WARN_ON(1);
+ if (test_and_clear_thread_flag(TIF_SVE_FULL_REGS))
+ WARN_ON(1);
+
+ /*
+ * When the FPSIMD state is loaded:
+ * - The return path (see fpsimd_restore_current_state) requires
+ * the vector length to be loaded beforehand.
+ * - We need to rebind the task to the CPU so the newly allocated
+ * SVE state is used when the task is saved.
+ */
+ if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) {
+ sve_set_vq(sve_vq_from_vl(current->thread.sve_vl) - 1);
+ fpsimd_bind_task_to_cpu();
+ }
put_cpu_fpsimd_context();
}
When we take an SVE access trap only the subset of the SVE Z0-Z31 registers shared with the FPSIMD V0-V31 registers is valid; the rest of the bits in the SVE registers must be cleared before returning to userspace. Currently we do this by saving the current FPSIMD register state to the task struct and then using that to initialize the copy of the SVE registers in the task struct so they can be loaded from there into the registers. This requires a lot more memory access than we need. The newly added TIF_SVE_FULL_REGS can be used to reduce this overhead - instead of doing the conversion immediately we can set only TIF_SVE_EXEC and not TIF_SVE_FULL_REGS. This means that until we return to userspace we only need to store the FPSIMD registers and if (as should be the common case) the hardware still has the task state and does not need that to be reloaded from the task struct we can do the initialization of the SVE state entirely in registers. In the event that we do need to reload the registers from the task struct only the FPSIMD subset needs to be loaded from memory. If the FPSIMD state is loaded then we need to set the vector length. This is because the vector length is only set when loading from memory; the expectation is that the vector length is set when TIF_SVE_EXEC is set. We also need to rebind the task to the CPU so the newly allocated SVE state is used when the task is saved. This is based on earlier work by Julien Grall implementing a similar idea. Signed-off-by: Mark Brown <broonie@kernel.org> --- arch/arm64/include/asm/fpsimd.h | 2 ++ arch/arm64/kernel/entry-fpsimd.S | 5 +++++ arch/arm64/kernel/fpsimd.c | 35 +++++++++++++++++++++----------- 3 files changed, 30 insertions(+), 12 deletions(-)