
[1/3] arm64: start using 'asm goto' for get_user() when available

Message ID 20240709161022.1035500-2-torvalds@linux-foundation.org (mailing list archive)
State New, archived
Series [1/3] arm64: start using 'asm goto' for get_user() when available

Commit Message

Linus Torvalds July 9, 2024, 4:01 p.m. UTC
This generates noticeably better code with compilers that support it,
since we don't need to test the error register etc, the exception just
jumps to the error handling directly.
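
For illustration, a minimal sketch of the two code shapes (simplified
from the patch below, not the exact kernel macros):

	/* Error-register form: the fast path must still test __err. */
	int __err = 0;
	asm volatile("1:	ldtr	%1, [%2]\n"
		     "2:\n"
		     _ASM_EXTABLE_UACCESS_ERR_ZERO(1b, 2b, %w0, %w1)
		     : "+r" (__err), "=r" (val)
		     : "r" (ptr));
	if (__err)
		goto efault;

	/* 'asm goto' form: a fault jumps straight to the error label via
	 * the exception table, so the fast path has no error test. */
	asm_goto_output("1:	ldtr	%0, [%1]\n"
		_ASM_EXTABLE_UACCESS(1b, %l2)
		: "=r" (val)
		: "r" (ptr) : : efault);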

Note that this also marks SW_TTBR0_PAN incompatible with KCSAN support,
since KCSAN wants to save and restore the user access state.

KCSAN and SW_TTBR0_PAN were probably always incompatible, but it became
obvious only when implementing the unsafe user access functions.  At
that point the default empty user_access_save/restore() functions
weren't sufficient.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm64/Kconfig               |   1 +
 arch/arm64/include/asm/uaccess.h | 121 +++++++++++++++++++++++++------
 arch/arm64/kernel/mte.c          |  12 ++-
 3 files changed, 104 insertions(+), 30 deletions(-)

Comments

Mark Rutland July 17, 2024, 4:22 p.m. UTC | #1
Hi Linus,

As a heads-up, this is tickling a GCC bug with asm goto outputs in GCC
versions prior to 14.1.0, and the existing asm_goto_output() workaround
doesn't seem to be sufficient. More details on that below, and I've added
LKML and the usual folk to CC, including compiler folk who looked into
this area before. The gist is that in the paths where we jump to a goto
label, GCC may erroneously omit assignments to variables which get
assigned an output of the asm in the non-branching case.

The big question is: can we actually work around this, or do we need to
bite the bullet and say that we need GCC 14.1.0 or later?

I spotted that while preparing a fixup for a thinko with
_ASM_EXTABLE_##type##ACCESS_ERR(), where we put the error code (-EFAULT)
into the value register. That *should* be benign, since in the faulting
case we subsequently assign 0, but due to the compiler bug GCC discards
that later assignment...

On Tue, Jul 09, 2024 at 09:01:59AM -0700, Linus Torvalds wrote:
> -#define __get_mem_asm(load, reg, x, addr, err, type)			\
> +#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
> +#define __get_mem_asm(load, reg, x, addr, label, type)			\
> +	asm_goto_output(						\
> +	"1:	" load "	" reg "0, [%1]\n"			\
> +	_ASM_EXTABLE_##type##ACCESS_ERR(1b, %l2, %w0)			\
> +	: "=r" (x)							\
> +	: "r" (addr) : : label)

Ignoring the compiler bug, the extable entry should be:

	_ASM_EXTABLE_##type##ACCESS(1b, %l2)

... since we don't have an error variable, and we don't intend to move
-EFAULT into the value destination register (which the extable fixup
handler will do for the 'error' value register).
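
With that change, the goto-based __get_mem_asm() would read (a sketch
of the fixup being referred to, combining the hunk quoted above with
the corrected extable entry):

	#define __get_mem_asm(load, reg, x, addr, label, type)		\
		asm_goto_output(					\
		"1:	" load "	" reg "0, [%1]\n"		\
		_ASM_EXTABLE_##type##ACCESS(1b, %l2)			\
		: "=r" (x)						\
		: "r" (addr) : : label)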

I'll send a fixup for that, but the more pressing issue is
miscompilation, which seems to be the issue fixed in GCC in:

  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113921

... which we tried to work around in commits:

  4356e9f841f7fbb9 ("work around gcc bugs with 'asm goto' with outputs")
  68fb3ca0e408e00d ("update workarounds for gcc "asm goto" issue")

... and which we currently try to work around with the
asm_goto_output() macro:

| #define asm_goto_output(x...) \
|         do { asm volatile goto(x); asm (""); } while (0)

... but testing indicates that's insufficient on arm64 and x86_64, and
even with that I see GCC erroneously omitting assignments to variables
in the 'goto' paths, where those are assigned an output in the non-goto
path.

As a reduced test case I have:

| #define asm_goto_output(x...) \
|         do { asm volatile goto(x); asm (""); } while (0)
| 
| #define __good_or_bad(__val, __key)                                     \
| do {                                                                    \
|         __label__ __failed;                                             \
|         unsigned long __tmp;                                            \
|         asm_goto_output(                                                \
|         "       cbnz    %[key], %l[__failed]\n"                         \
|         "       mov     %[val], #0x900d\n"                              \
|         : [val] "=r" (__tmp)                                            \
|         : [key] "r" (__key)                                             \
|         :                                                               \
|         : __failed);                                                    \
|         (__val) = __tmp;                                                \
|         break;                                                          \
| __failed:                                                               \
|         (__val) = 0xbad;                                                \
| } while (0)
| 
| unsigned long get_val(unsigned long key);
| unsigned long get_val(unsigned long key)
| {
|         unsigned long val = 0xbad;
| 
|         __good_or_bad(val, key);
| 
|         return val;
| }

... which GCC 13.2.0 (at -O2) compiles to:

| 	cbnz    x0, .Lfailed
| 	mov     x0, #0x900d
| .Lfailed:
| 	ret

... where the assignment to '__val' in the '__failed' case has been
omitted.

If I add an 'asm("");' immediately after the '__failed' label, before the
assignment, GCC 13.2.0 generates:

| 	cbnz    x0, .Lfailed
| 	mov     x0, #0x900d
| 	ret
| .Lfailed:
| 	mov     x0, #0xbad
| 	ret

... where the assignment is retained as we'd hope.
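
That is, with the tail of the reduced test case changed to:

| __failed:                                                               \
|         asm("");        /* dummy asm at the branch target */            \
|         (__val) = 0xbad;                                                \
| } while (0)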

GCC 14.1.0 generates the latter sequence regardless of the presence of an asm
after the __failed label.

Can anyone from the GCC side comment as to whether placing the dummy asm("")
block after the goto labels is a sufficient workaround, or whether that's just
avoiding the issue by chance?

Further examples below.

With current mainline, building the following:

| #include <linux/uaccess.h>
| #include <linux/types.h>
| 
| noinline unsigned long test_get_user(unsigned long __user *ptr);
| noinline unsigned long test_get_user(unsigned long __user *ptr)
| {
|         unsigned long val;
| 
|         if (!access_ok(ptr, sizeof(*ptr)))
|                 return 0xdead;
|     
|         if (get_user(val, ptr))
|                 val = 0x900d;
|     
|         return val;
| }

GCC 13.2.0, arm64 defconfig + ARM64_TAGGED_ADDR_ABI=n generates:

|         mov     x1, #0xffffffffffff8
|         cmp     x0, x1
|         b.hi    .Laccess_not_ok
|         and     x0, x0, #0xff7fffffffffffff
|         ldtr    x0, [x0]
| .Lextable_fixup:
|         ret
| .Laccess_not_ok:
|         mov     x0, #0xdead
|         ret

... entirely omitting the assignment to 'val' in the error path.

GCC 14.1.0, arm64 defconfig + ARM64_TAGGED_ADDR_ABI=n generates:

|         mov     x1, #0xffffffffffff8
|         cmp     x0, x1
|         b.hi    .Laccess_not_ok
|         and     x0, x0, #0xff7fffffffffffff
|         ldtr    x0, [x0]
|         ret
| .Laccess_not_ok:
|         mov     x0, #0xdead
|         ret
| .Lextable_fixup:
|         mov     x0, #0x900d
|         ret

... with the expected assignment to 'val' in the error path.

On x86 we don't see this for regular get_user() because that calls
out-of-line asm rather than using asm goto with outputs. However
unsafe_get_user() usage does get miscompiled on both arm64 and x86_64.
In the unsafe_* case we generally don't manipulate the value register in
the error path, so we largely don't notice, but this is fragile.

With current mainline, building the following:

| #include <linux/uaccess.h>
| #include <linux/types.h>
| 
| noinline unsigned long test_unsafe_get_user(unsigned long __user *ptr);
| noinline unsigned long test_unsafe_get_user(unsigned long __user *ptr)
| {
|         unsigned long val;
| 
|         unsafe_get_user(val, ptr, Efault);
|         return val;
| 
| Efault:
|         val = 0x900d;
|         return val;
| }

GCC 13.2.0, arm64 defconfig generates:

|         and     x0, x0, #0xff7fffffffffffff
|         ldtr    x0, [x0]
| .Lextable_fixup:
|         ret

GCC 13.2.0, x86_64 defconfig + MITIGATION_RETPOLINE=n generates:

|         endbr64
|         mov    (%rdi),%rax
| .Lextable_fixup:
|         ret

... omitting the assignment to 'val' in the error path.

GCC 14.1.0, arm64 defconfig generates:

|         and     x0, x0, #0xff7fffffffffffff
|         ldtr    x0, [x0]
|         ret
| .Lextable_fixup:
|         mov     x0, #0x900d                     // #36877
|         ret

GCC 14.1.0, x86_64 defconfig + MITIGATION_RETPOLINE=n generates:

|         endbr64
|         mov    (%rdi),%rax
|         ret
| .Lextable_fixup:
|         mov    $0x900d,%eax
|         ret

... with the expected assignment to 'val' in the error path.

Mark.
Mark Rutland July 17, 2024, 4:28 p.m. UTC | #2
[resending as I messed up the LKML address; sorry!]

Linus Torvalds July 17, 2024, 5:54 p.m. UTC | #3
On Wed, 17 Jul 2024 at 09:23, Mark Rutland <mark.rutland@arm.com> wrote:
>
> As a heads-up, this is tickling a GCC bug with asm goto outputs in GCC
> versions prior to 14.1.0, and the existing asm_goto_output() workaround
> doesn't seem to be sufficient.

Uhhuh.

I've done all my testing with gcc version 13.3.1, but that is one of
the fixed versions.

> I spotted that while preparing a fixup for a thinko with
> _ASM_EXTABLE_##type##ACCESS_ERR(),

Ack. Your fix looks obviously correct to me, but yeah, the deeper
issue is the compiler bug interaction.

> | #define asm_goto_output(x...) \
> |         do { asm volatile goto(x); asm (""); } while (0)
>
> ... but testing indicates that's insufficient on arm64 and x86_64, and
> even with that I see GCC erroneously omitting assignments to variables
> in the 'goto' paths, where those are assigned an output in the non-goto
> path.
>
> As a reduced test case I have: [ snip snip ]
>
> ... which GCC 13.2.0 (at -O2) compiles to:
>
> |       cbnz    x0, .Lfailed
> |       mov     x0, #0x900d
> | .Lfailed:
> |       ret

Yeah, that test-case works fine for me, and I get the expected result:

  get_val:
  .LFB0:
        .cfi_startproc
        cbnz    x0, .L3
        mov     x0, #0x900d
        ret
        .p2align 2,,3
  .L3:
  .L2:
        mov     x0, 2989
        ret
        .cfi_endproc

but as mentioned, I have one of the gcc versions that doesn't need
GCC_ASM_GOTO_OUTPUT_WORKAROUND:

  # Fixed in GCC 14, 13.3, 12.4 and 11.5

so my gcc-13.3.1 doesn't have the issue.

> Can anyone from the GCC side comment as to whether placing the dummy asm("")
> block after the goto labels is a sufficient workaround, or whether that's just
> avoiding the issue by chance?

I have to say that the workaround of putting the empty asm on the
target label rather than next to the "asm goto" itself would seem to
be more sensible than our current workaround, but the problem is that
the target is entirely separate.

The current workaround was very much a "this works in practice in the
(very few) cases we saw" and works syntactically.

And yes, due to how arm64 inlines get_user(), arm64 ends up seeing a
lot more effects of this.

Hmm. I was thinking that we could change the "asm_goto_output()" macro
to take the exception label as the first argument, and create some
kind of trampoline for the exception case with the workaround doing
something like

    asm goto(/* for real */ ... : temp_label);
    if (0) { temp_label: asm volatile(""); goto real_label; }
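
Expanded into an actual macro, that idea might look something like this
(a hypothetical sketch with made-up names, assuming the asm has exactly
one goto label):

    #define asm_goto_output_wrapped(real_label, x...)           \
    do {                                                        \
            __label__ __agow_tramp;                             \
            /* Append the trampoline as the asm's goto label. */\
            asm volatile goto(x : __agow_tramp);                \
            break;                                              \
    __agow_tramp:                                               \
            /* Dummy asm at the branch target, then hop on. */  \
            asm volatile("");                                   \
            goto real_label;                                    \
    } while (0)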

but it turns out that we actually have cases of multiple output labels
on x86: see __vmcs_readl() in arch/x86/kvm/vmx/vmx_ops.h.

So doing that kind of wrapping looks non-trivial.

Maybe we just need to give up on the workaround and say that "asm goto
with outputs only works when GCC_ASM_GOTO_OUTPUT_WORKAROUND is not set".

The set of broken gcc versions will get progressively smaller as time goes on.

                Linus
Linus Torvalds July 17, 2024, 6:29 p.m. UTC | #4
On Wed, 17 Jul 2024 at 09:28, Mark Rutland <mark.rutland@arm.com> wrote:
>
> [resending as I messed up the LKML address; sorry!]

Bah. And my reply was to your original.

I'll just leave the lore link to the reply (that did go to the arm
list successfully) here for anybody that only gets lkml...

    https://lore.kernel.org/all/CAHk-=wghzGt7J9XaQgcmLniYrQMtoXGcv+FvdGcyspkb+FxUsw@mail.gmail.com/

but the gist of it probably boils down to the final few sentences:

 "Maybe we just need to give up on the workaround and say that "asm goto
  with outputs only works when GCC_ASM_GOTO_OUTPUT_WORKAROUND is not set".

  The set of broken gcc versions will get progressively smaller as time goes on."

which is sad, but not the end of the world.

Unless the gcc people can maybe come up with some variation of our
current workaround that actually works (because the "workaround at
target label" model looks very inconvenient).

           Linus

Patch

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 5d91259ee7b5..b6e8920364de 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1649,6 +1649,7 @@  config RODATA_FULL_DEFAULT_ENABLED
 
 config ARM64_SW_TTBR0_PAN
 	bool "Emulate Privileged Access Never using TTBR0_EL1 switching"
+	depends on !KCSAN
 	help
 	  Enabling this option prevents the kernel from accessing
 	  user-space memory directly by pointing TTBR0_EL1 to a reserved
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index 14be5000c5a0..3e721f73bcaf 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -184,29 +184,40 @@  static inline void __user *__uaccess_mask_ptr(const void __user *ptr)
  * The "__xxx_error" versions set the third argument to -EFAULT if an error
  * occurs, and leave it unchanged on success.
  */
-#define __get_mem_asm(load, reg, x, addr, err, type)			\
+#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
+#define __get_mem_asm(load, reg, x, addr, label, type)			\
+	asm_goto_output(						\
+	"1:	" load "	" reg "0, [%1]\n"			\
+	_ASM_EXTABLE_##type##ACCESS_ERR(1b, %l2, %w0)			\
+	: "=r" (x)							\
+	: "r" (addr) : : label)
+#else
+#define __get_mem_asm(load, reg, x, addr, label, type) do {		\
+	int __gma_err = 0;						\
 	asm volatile(							\
 	"1:	" load "	" reg "1, [%2]\n"			\
 	"2:\n"								\
 	_ASM_EXTABLE_##type##ACCESS_ERR_ZERO(1b, 2b, %w0, %w1)		\
-	: "+r" (err), "=r" (x)						\
-	: "r" (addr))
+	: "+r" (__gma_err), "=r" (x)					\
+	: "r" (addr));							\
+	if (__gma_err) goto label; } while (0)
+#endif
 
-#define __raw_get_mem(ldr, x, ptr, err, type)					\
+#define __raw_get_mem(ldr, x, ptr, label, type)					\
 do {										\
 	unsigned long __gu_val;							\
 	switch (sizeof(*(ptr))) {						\
 	case 1:									\
-		__get_mem_asm(ldr "b", "%w", __gu_val, (ptr), (err), type);	\
+		__get_mem_asm(ldr "b", "%w", __gu_val, (ptr), label, type);	\
 		break;								\
 	case 2:									\
-		__get_mem_asm(ldr "h", "%w", __gu_val, (ptr), (err), type);	\
+		__get_mem_asm(ldr "h", "%w", __gu_val, (ptr), label, type);	\
 		break;								\
 	case 4:									\
-		__get_mem_asm(ldr, "%w", __gu_val, (ptr), (err), type);		\
+		__get_mem_asm(ldr, "%w", __gu_val, (ptr), label, type);		\
 		break;								\
 	case 8:									\
-		__get_mem_asm(ldr, "%x",  __gu_val, (ptr), (err), type);	\
+		__get_mem_asm(ldr, "%x",  __gu_val, (ptr), label, type);	\
 		break;								\
 	default:								\
 		BUILD_BUG();							\
@@ -219,27 +230,34 @@  do {										\
  * uaccess_ttbr0_disable(). As `x` and `ptr` could contain blocking functions,
  * we must evaluate these outside of the critical section.
  */
-#define __raw_get_user(x, ptr, err)					\
+#define __raw_get_user(x, ptr, label)					\
 do {									\
 	__typeof__(*(ptr)) __user *__rgu_ptr = (ptr);			\
 	__typeof__(x) __rgu_val;					\
 	__chk_user_ptr(ptr);						\
-									\
-	uaccess_ttbr0_enable();						\
-	__raw_get_mem("ldtr", __rgu_val, __rgu_ptr, err, U);		\
-	uaccess_ttbr0_disable();					\
-									\
-	(x) = __rgu_val;						\
+	do {								\
+		__label__ __rgu_failed;					\
+		uaccess_ttbr0_enable();					\
+		__raw_get_mem("ldtr", __rgu_val, __rgu_ptr, __rgu_failed, U);	\
+		uaccess_ttbr0_disable();				\
+		(x) = __rgu_val;					\
+		break;							\
+	__rgu_failed:							\
+		uaccess_ttbr0_disable();				\
+		goto label;						\
+	} while (0);							\
 } while (0)
 
 #define __get_user_error(x, ptr, err)					\
 do {									\
+	__label__ __gu_failed;						\
 	__typeof__(*(ptr)) __user *__p = (ptr);				\
 	might_fault();							\
 	if (access_ok(__p, sizeof(*__p))) {				\
 		__p = uaccess_mask_ptr(__p);				\
-		__raw_get_user((x), __p, (err));			\
+		__raw_get_user((x), __p, __gu_failed);			\
 	} else {							\
+	__gu_failed:							\
 		(x) = (__force __typeof__(x))0; (err) = -EFAULT;	\
 	}								\
 } while (0)
@@ -262,15 +280,18 @@  do {									\
 do {									\
 	__typeof__(dst) __gkn_dst = (dst);				\
 	__typeof__(src) __gkn_src = (src);				\
-	int __gkn_err = 0;						\
+	do { 								\
+		__label__ __gkn_label;					\
 									\
-	__mte_enable_tco_async();					\
-	__raw_get_mem("ldr", *((type *)(__gkn_dst)),			\
-		      (__force type *)(__gkn_src), __gkn_err, K);	\
-	__mte_disable_tco_async();					\
-									\
-	if (unlikely(__gkn_err))					\
+		__mte_enable_tco_async();				\
+		__raw_get_mem("ldr", *((type *)(__gkn_dst)),		\
+		      (__force type *)(__gkn_src), __gkn_label, K);	\
+		__mte_disable_tco_async();				\
+		break;							\
+	__gkn_label:							\
+		__mte_disable_tco_async();				\
 		goto err_label;						\
+	} while (0);							\
 } while (0)
 
 #define __put_mem_asm(store, reg, x, addr, err, type)			\
@@ -381,6 +402,60 @@  extern unsigned long __must_check __arch_copy_to_user(void __user *to, const voi
 	__actu_ret;							\
 })
 
+static __must_check __always_inline bool user_access_begin(const void __user *ptr, size_t len)
+{
+	if (unlikely(!access_ok(ptr,len)))
+		return 0;
+	uaccess_ttbr0_enable();
+	return 1;
+}
+#define user_access_begin(a,b)	user_access_begin(a,b)
+#define user_access_end()	uaccess_ttbr0_disable()
+
+/*
+ * The arm64 inline asms should learn about asm goto, and we should
+ * teach user_access_begin() about address masking.
+ */
+#define unsafe_put_user(x, ptr, label)	do {				\
+	int __upu_err = 0;						\
+	__raw_put_mem("sttr", x, uaccess_mask_ptr(ptr), __upu_err, U);	\
+	if (__upu_err) goto label;				\
+} while (0)
+
+#define unsafe_get_user(x, ptr, label) \
+	__raw_get_mem("ldtr", x, uaccess_mask_ptr(ptr), label, U)
+
+/*
+ * KCSAN uses these to save and restore ttbr state.
+ * We do not support KCSAN with ARM64_SW_TTBR0_PAN, so
+ * they are no-ops.
+ */
+static inline unsigned long user_access_save(void) { return 0; }
+static inline void user_access_restore(unsigned long enabled) { }
+
+/*
+ * We want the unsafe accessors to always be inlined and use
+ * the error labels - thus the macro games.
+ */
+#define unsafe_copy_loop(dst, src, len, type, label)				\
+	while (len >= sizeof(type)) {						\
+		unsafe_put_user(*(type *)(src),(type __user *)(dst),label);	\
+		dst += sizeof(type);						\
+		src += sizeof(type);						\
+		len -= sizeof(type);						\
+	}
+
+#define unsafe_copy_to_user(_dst,_src,_len,label)			\
+do {									\
+	char __user *__ucu_dst = (_dst);				\
+	const char *__ucu_src = (_src);					\
+	size_t __ucu_len = (_len);					\
+	unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u64, label);	\
+	unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u32, label);	\
+	unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u16, label);	\
+	unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u8, label);	\
+} while (0)
+
 #define INLINE_COPY_TO_USER
 #define INLINE_COPY_FROM_USER
 
diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c
index dcdcccd40891..6174671be7c1 100644
--- a/arch/arm64/kernel/mte.c
+++ b/arch/arm64/kernel/mte.c
@@ -582,12 +582,9 @@  subsys_initcall(register_mte_tcf_preferred_sysctl);
 size_t mte_probe_user_range(const char __user *uaddr, size_t size)
 {
 	const char __user *end = uaddr + size;
-	int err = 0;
 	char val;
 
-	__raw_get_user(val, uaddr, err);
-	if (err)
-		return size;
+	__raw_get_user(val, uaddr, efault);
 
 	uaddr = PTR_ALIGN(uaddr, MTE_GRANULE_SIZE);
 	while (uaddr < end) {
@@ -595,12 +592,13 @@  size_t mte_probe_user_range(const char __user *uaddr, size_t size)
 		 * A read is sufficient for mte, the caller should have probed
 		 * for the pte write permission if required.
 		 */
-		__raw_get_user(val, uaddr, err);
-		if (err)
-			return end - uaddr;
+		__raw_get_user(val, uaddr, efault);
 		uaddr += MTE_GRANULE_SIZE;
 	}
 	(void)val;
 
 	return 0;
+
+efault:
+	return end - uaddr;
 }
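
For reference, a typical caller of the new unsafe accessors would be
structured along these lines (a hypothetical example, not part of the
patch):

	static long sum_two_words(unsigned long __user *uptr, unsigned long *out)
	{
		unsigned long a, b;

		if (!user_access_begin(uptr, 2 * sizeof(*uptr)))
			return -EFAULT;
		/* On a fault, the extable fixup jumps straight to Efault. */
		unsafe_get_user(a, &uptr[0], Efault);
		unsafe_get_user(b, &uptr[1], Efault);
		user_access_end();

		*out = a + b;
		return 0;

	Efault:
		user_access_end();
		return -EFAULT;
	}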