[8/8] arm64: Rewrite __arch_clear_user()

Message ID 76a1700b0316b50fb5881da603f2daf3c81468f4.1620738177.git.robin.murphy@arm.com (mailing list archive)
State: New, archived
Series: arm64: String function updates

Commit Message

Robin Murphy May 11, 2021, 4:12 p.m. UTC
Now that we're always using STTR variants rather than abstracting two
different addressing modes, the user_ldst macro here is frankly more
obfuscating than helpful. Rewrite __arch_clear_user() with regular
USER() annotations so that it's clearer what's going on, and take the
opportunity to minimise the branchiness in the most common paths, which
also allows the exception fixup to return a more accurate result.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
---
 arch/arm64/lib/clear_user.S | 42 +++++++++++++++++++------------------
 1 file changed, 22 insertions(+), 20 deletions(-)
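
In short, the rewrite keeps a second pointer x2 = dst + size so that the
bulk loop can finish with a single, possibly overlapping 8-byte store
taken from the end of the buffer, handles sub-8-byte tails by testing
the low bits of the size rather than a chain of add/branch pairs, and
lets the exception fixup derive its return value as x2 - x0.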

Comments

Mark Rutland May 12, 2021, 10:48 a.m. UTC | #1
On Tue, May 11, 2021 at 05:12:38PM +0100, Robin Murphy wrote:
> Now that we're always using STTR variants rather than abstracting two
> different addressing modes, the user_ldst macro here is frankly more
> obfuscating than helpful.

FWIW, I completely agree; the user_ldst macros are a historical artifact
and I'm happy to see them go!

> Rewrite __arch_clear_user() with regular
> USER() annotations so that it's clearer what's going on, and take the
> opportunity to minimise the branchiness in the most common paths, which
> also allows the exception fixup to return a more accurate result.

IIUC this isn't always accurate for the {4,2,1}-byte cases; example
below. I'm not sure whether that's intentional since the commit message
says "more accurate" rather than "accurate".

> 
> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
> ---
>  arch/arm64/lib/clear_user.S | 42 +++++++++++++++++++------------------
>  1 file changed, 22 insertions(+), 20 deletions(-)
> 
> diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S
> index af9afcbec92c..1005345b4066 100644
> --- a/arch/arm64/lib/clear_user.S
> +++ b/arch/arm64/lib/clear_user.S
> @@ -1,12 +1,9 @@
>  /* SPDX-License-Identifier: GPL-2.0-only */
>  /*
> - * Based on arch/arm/lib/clear_user.S
> - *
> - * Copyright (C) 2012 ARM Ltd.
> + * Copyright (C) 2021 Arm Ltd.
>   */
> -#include <linux/linkage.h>
>  
> -#include <asm/asm-uaccess.h>
> +#include <linux/linkage.h>
>  #include <asm/assembler.h>
>  
>  	.text
> @@ -19,25 +16,30 @@
>   *
>   * Alignment fixed up by hardware.
>   */
> +	.p2align 4
>  SYM_FUNC_START(__arch_clear_user)

Say we're called with size in x1 == 0x7

> -	mov	x2, x1			// save the size for fixup return
> +	add	x2, x0, x1
>  	subs	x1, x1, #8
>  	b.mi	2f

... here we'll skip to the 4-byte case at 2f ...

>  1:
> -user_ldst 9f, sttr, xzr, x0, 8
> +USER(9f, sttr	xzr, [x0])
> +	add	x0, x0, #8
>  	subs	x1, x1, #8
> -	b.pl	1b
> -2:	adds	x1, x1, #4
> -	b.mi	3f
> -user_ldst 9f, sttr, wzr, x0, 4
> -	sub	x1, x1, #4
> -3:	adds	x1, x1, #2
> -	b.mi	4f
> -user_ldst 9f, sttrh, wzr, x0, 2
> -	sub	x1, x1, #2
> -4:	adds	x1, x1, #1
> -	b.mi	5f
> -user_ldst 9f, sttrb, wzr, x0, 0
> +	b.hi	1b
> +USER(9f, sttr	xzr, [x2, #-8])
> +	mov	x0, #0
> +	ret
> +
> +2:	tbz	x1, #2, 3f

... bit 2 is non-zero, so we continue ...

> +USER(9f, sttr	wzr, [x0])

... and if this faults, the fixup will report the correct address ...

> +USER(9f, sttr	wzr, [x2, #-4])

... but if this faults, the fixup handler will report that we didn't
copy all 7 bytes, rather than just the last 3, since we didn't update x0
after the first 4-byte STTR.
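
(Concretely, with x0 still at the start and x2 = start + 7, the fixup
computes x2 - x0 = 7, whereas only x2 - (x0 + 4) = 3 bytes actually
remain uncleared at that point.)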

We could update x0 inline, or add separate fixup handlers to account for
that out-of-line.

If we think that under-estimating is fine, I reckon it'd be worth a
comment to make that clear.

Thanks,
Mark.

> +	mov	x0, #0
> +	ret
> +
> +3:	tbz	x1, #1, 4f
> +USER(9f, sttrh	wzr, [x0])
> +4:	tbz	x1, #0, 5f
> +USER(9f, sttrb	wzr, [x2, #-1])
>  5:	mov	x0, #0
>  	ret
>  SYM_FUNC_END(__arch_clear_user)
> @@ -45,6 +47,6 @@ EXPORT_SYMBOL(__arch_clear_user)
>  
>  	.section .fixup,"ax"
>  	.align	2
> -9:	mov	x0, x2			// return the original size
> +9:	sub	x0, x2, x0
>  	ret
>  	.previous
> -- 
> 2.21.0.dirty
> 
> 
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
Robin Murphy May 12, 2021, 11:31 a.m. UTC | #2
On 2021-05-12 11:48, Mark Rutland wrote:
> On Tue, May 11, 2021 at 05:12:38PM +0100, Robin Murphy wrote:
>> Now that we're always using STTR variants rather than abstracting two
>> different addressing modes, the user_ldst macro here is frankly more
>> obfuscating than helpful.
> 
> FWIW, I completely agree; the user_ldst macros are a historical artifact
> and I'm happy to see them go!
> 
>> Rewrite __arch_clear_user() with regular
>> USER() annotations so that it's clearer what's going on, and take the
>> opportunity to minimise the branchiness in the most common paths, which
>> also allows the exception fixup to return a more accurate result.
> 
> IIUC this isn't always accurate for the {4,2,1}-byte cases; example
> below. I'm not sure whether that's intentional since the commit message
> says "more accurate" rather than "accurate".

Indeed, the "more" was definitely significant :)

>> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
>> ---
>>   arch/arm64/lib/clear_user.S | 42 +++++++++++++++++++------------------
>>   1 file changed, 22 insertions(+), 20 deletions(-)
>>
>> diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S
>> index af9afcbec92c..1005345b4066 100644
>> --- a/arch/arm64/lib/clear_user.S
>> +++ b/arch/arm64/lib/clear_user.S
>> @@ -1,12 +1,9 @@
>>   /* SPDX-License-Identifier: GPL-2.0-only */
>>   /*
>> - * Based on arch/arm/lib/clear_user.S
>> - *
>> - * Copyright (C) 2012 ARM Ltd.
>> + * Copyright (C) 2021 Arm Ltd.
>>    */
>> -#include <linux/linkage.h>
>>   
>> -#include <asm/asm-uaccess.h>
>> +#include <linux/linkage.h>
>>   #include <asm/assembler.h>
>>   
>>   	.text
>> @@ -19,25 +16,30 @@
>>    *
>>    * Alignment fixed up by hardware.
>>    */
>> +	.p2align 4
>>   SYM_FUNC_START(__arch_clear_user)
> 
> Say we're called with size in x1 == 0x7
> 
>> -	mov	x2, x1			// save the size for fixup return
>> +	add	x2, x0, x1
>>   	subs	x1, x1, #8
>>   	b.mi	2f
> 
> ... here we'll skip to the 4-byte case at 2f ...
> 
>>   1:
>> -user_ldst 9f, sttr, xzr, x0, 8
>> +USER(9f, sttr	xzr, [x0])
>> +	add	x0, x0, #8
>>   	subs	x1, x1, #8
>> -	b.pl	1b
>> -2:	adds	x1, x1, #4
>> -	b.mi	3f
>> -user_ldst 9f, sttr, wzr, x0, 4
>> -	sub	x1, x1, #4
>> -3:	adds	x1, x1, #2
>> -	b.mi	4f
>> -user_ldst 9f, sttrh, wzr, x0, 2
>> -	sub	x1, x1, #2
>> -4:	adds	x1, x1, #1
>> -	b.mi	5f
>> -user_ldst 9f, sttrb, wzr, x0, 0
>> +	b.hi	1b
>> +USER(9f, sttr	xzr, [x2, #-8])
>> +	mov	x0, #0
>> +	ret
>> +
>> +2:	tbz	x1, #2, 3f
> 
> ... bit 2 is non-zero, so we continue ...
> 
>> +USER(9f, sttr	wzr, [x0])
> 
> ... and if this faults, the fixup will report the correct address ...
> 
>> +USER(9f, sttr	wzr, [x2, #-4])
> 
> ... but if this faults, the fixup handler will report that we didn't
> copy all 7 bytes, rather than just the last 3, since we didn't update x0
> after the first 4-byte STTR.
> 
> We could update x0 inline, or add separate fixup handlers to account for
> that out-of-line.
> 
> If we think that under-estimating is fine, I reckon it'd be worth a
> comment to make that clear.

Indeed for smaller amounts there's no change in fixup behaviour at all, 
but I have to assume that underestimating by up to 100% is probably OK 
since we've been underestimating by fully 100% for nearly 10 years now. 
I don't believe it's worth having any more complexity than necessary for 
the fault case - grepping for clear_user() usage suggests that nobody 
really cares about the return value beyond whether it's zero or not, so 
the minor "improvement" here is more of a nice-to-have TBH.
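For instance, the typical call site is just something like
if (clear_user(uaddr, len)) return -EFAULT; (an illustrative shape
rather than any particular caller), with the exact count thrown away.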

The existing comment doesn't actually explain anything either, which is 
why I didn't replace it, but I'm happy to add something if you like.

Cheers,
Robin.

> 
> Thanks,
> Mark.
> 
>> +	mov	x0, #0
>> +	ret
>> +
>> +3:	tbz	x1, #1, 4f
>> +USER(9f, sttrh	wzr, [x0])
>> +4:	tbz	x1, #0, 5f
>> +USER(9f, sttrb	wzr, [x2, #-1])
>>   5:	mov	x0, #0
>>   	ret
>>   SYM_FUNC_END(__arch_clear_user)
>> @@ -45,6 +47,6 @@ EXPORT_SYMBOL(__arch_clear_user)
>>   
>>   	.section .fixup,"ax"
>>   	.align	2
>> -9:	mov	x0, x2			// return the original size
>> +9:	sub	x0, x2, x0
>>   	ret
>>   	.previous
>> -- 
>> 2.21.0.dirty
>>
>>
>> _______________________________________________
>> linux-arm-kernel mailing list
>> linux-arm-kernel@lists.infradead.org
>> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
Mark Rutland May 12, 2021, 1:06 p.m. UTC | #3
On Wed, May 12, 2021 at 12:31:39PM +0100, Robin Murphy wrote:
> On 2021-05-12 11:48, Mark Rutland wrote:
> > On Tue, May 11, 2021 at 05:12:38PM +0100, Robin Murphy wrote:
> > > Rewrite __arch_clear_user() with regular
> > > USER() annotations so that it's clearer what's going on, and take the
> > > opportunity to minimise the branchiness in the most common paths, which
> > > also allows the exception fixup to return a more accurate result.
> > 
> > IIUC this isn't always accurate for the {4,2,1}-byte cases; example
> > below. I'm not sure whether that's intentional since the commit message
> > says "more accurate" rather than "accurate".
> 
> Indeed, the "more" was definitely significant :)

:)

> > If we think that under-estimating is fine, I reckon it'd be worth a
> > comment to make that clear.
> 
> Indeed for smaller amounts there's no change in fixup behaviour at all, but
> I have to assume that underestimating by up to 100% is probably OK since
> we've been underestimating by fully 100% for nearly 10 years now. I don't
> believe it's worth having any more complexity than necessary for the fault
> case - grepping for clear_user() usage suggests that nobody really cares
> about the return value beyond whether it's zero or not, so the minor
> "improvement" here is more of a nice-to-have TBH.
> 
> The existing comment doesn't actually explain anything either, which is why
> I didn't replace it, but I'm happy to add something if you like.

I don't have strong feelings either way, but I think that we should at
least document this, since that'll at least save us rehashing the same
point in future. :)

That said, IIUC to make this always accurate we only need two ADDs (diff
below). Since those will only be executed at most once each, I suspect
they won't have a measurable impact in practice.

So maybe it's worth adding them to avoid any risk that someone needs
this to be accurate in future?

Mark.

---->8----
diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S
index 1005345b4066..7ef553ec2677 100644
--- a/arch/arm64/lib/clear_user.S
+++ b/arch/arm64/lib/clear_user.S
@@ -32,12 +32,14 @@ USER(9f, sttr       xzr, [x2, #-8])
 
 2:     tbz     x1, #2, 3f
 USER(9f, sttr  wzr, [x0])
+       add     x0, x0, #4
 USER(9f, sttr  wzr, [x2, #-4])
        mov     x0, #0
        ret
 
 3:     tbz     x1, #1, 4f
 USER(9f, sttrh wzr, [x0])
+       add     x0, x0, #2
 4:     tbz     x1, #0, 5f
 USER(9f, sttrb wzr, [x2, #-1])
 5:     mov     x0, #0
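
(With those two ADDs, a fault on the second word store in the size-7
example above would report x2 - (x0 + 4) = 3, and a fault on the final
byte store of a 3-byte clear would report x2 - (x0 + 2) = 1.)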
Robin Murphy May 12, 2021, 1:51 p.m. UTC | #4
On 2021-05-12 14:06, Mark Rutland wrote:
> On Wed, May 12, 2021 at 12:31:39PM +0100, Robin Murphy wrote:
>> On 2021-05-12 11:48, Mark Rutland wrote:
>>> On Tue, May 11, 2021 at 05:12:38PM +0100, Robin Murphy wrote:
>>>> Rewrite __arch_clear_user() with regular
>>>> USER() annotations so that it's clearer what's going on, and take the
>>>> opportunity to minimise the branchiness in the most common paths, which
>>>> also allows the exception fixup to return a more accurate result.
>>>
>>> IIUC this isn't always accurate for the {4,2,1}-byte cases; example
>>> below. I'm not sure whether that's intentional since the commit message
>>> says "more accurate" rather than "accurate".
>>
>> Indeed, the "more" was definitely significant :)
> 
> :)
> 
>>> If we think that under-estimating is fine, I reckon it'd be worth a
>>> comment to make that clear.
>>
>> Indeed for smaller amounts there's no change in fixup behaviour at all, but
>> I have to assume that underestimating by up to 100% is probably OK since
>> we've been underestimating by fully 100% for nearly 10 years now. I don't
>> believe it's worth having any more complexity than necessary for the fault
>> case - grepping for clear_user() usage suggests that nobody really cares
>> about the return value beyond whether it's zero or not, so the minor
>> "improvement" here is more of a nice-to-have TBH.
>>
>> The existing comment doesn't actually explain anything either, which is why
>> I didn't replace it, but I'm happy to add something if you like.
> 
> I don't have strong feelings either way, but I think that we should at
> least document this, since that'll at least save us rehashing the same
> point in future. :)
> 
> That said, IIUC to make this always accurate we only need two ADDs (diff
> below). Since those will only be executed at most once each, I suspect
> they won't have a measureable impact in practice.
> 
> So maybe it's worth adding them to avoid any risk that someone needs
> this to be accurate in future?

Hmm, now that you've caused me to ponder it some more, it can in fact be 
achieved with just two extra ADDs _out of line_, and still neatly enough 
that I'm now definitely going to do that. Thanks for the push!
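
For illustration only - this is a sketch of one possible arrangement
rather than the actual follow-up patch, the label numbers are made up,
and it may not be the exact two-ADD form Robin has in mind - the second
word store could take a fixup entry that accounts for the word already
written at [x0], while the final byte store gets its own entry, since
the only sizes that reach it (1 and 3) both leave exactly one byte
uncleared on a fault:

USER(9f, sttr	wzr, [x0])
USER(8f, sttr	wzr, [x2, #-4])
	...
USER(7f, sttrb	wzr, [x2, #-1])
	...
	.section .fixup,"ax"
	.align	2
7:	mov	x0, #1		// byte store only runs for sizes 1 and 3
	ret
8:	add	x0, x0, #4	// the word at [x0] was already cleared
9:	sub	x0, x2, x0
	ret
	.previous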

Robin.

> 
> Mark.
> 
> ---->8----
> diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S
> index 1005345b4066..7ef553ec2677 100644
> --- a/arch/arm64/lib/clear_user.S
> +++ b/arch/arm64/lib/clear_user.S
> @@ -32,12 +32,14 @@ USER(9f, sttr       xzr, [x2, #-8])
>   
>   2:     tbz     x1, #2, 3f
>   USER(9f, sttr  wzr, [x0])
> +       add     x0, x0, #4
>   USER(9f, sttr  wzr, [x2, #-4])
>          mov     x0, #0
>          ret
>   
>   3:     tbz     x1, #1, 4f
>   USER(9f, sttrh wzr, [x0])
> +       add     x0, x0, #2
>   4:     tbz     x1, #0, 5f
>   USER(9f, sttrb wzr, [x2, #-1])
>   5:     mov     x0, #0
>

Patch

diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S
index af9afcbec92c..1005345b4066 100644
--- a/arch/arm64/lib/clear_user.S
+++ b/arch/arm64/lib/clear_user.S
@@ -1,12 +1,9 @@ 
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
- * Based on arch/arm/lib/clear_user.S
- *
- * Copyright (C) 2012 ARM Ltd.
+ * Copyright (C) 2021 Arm Ltd.
  */
-#include <linux/linkage.h>
 
-#include <asm/asm-uaccess.h>
+#include <linux/linkage.h>
 #include <asm/assembler.h>
 
 	.text
@@ -19,25 +16,30 @@ 
  *
  * Alignment fixed up by hardware.
  */
+	.p2align 4
 SYM_FUNC_START(__arch_clear_user)
-	mov	x2, x1			// save the size for fixup return
+	add	x2, x0, x1
 	subs	x1, x1, #8
 	b.mi	2f
 1:
-user_ldst 9f, sttr, xzr, x0, 8
+USER(9f, sttr	xzr, [x0])
+	add	x0, x0, #8
 	subs	x1, x1, #8
-	b.pl	1b
-2:	adds	x1, x1, #4
-	b.mi	3f
-user_ldst 9f, sttr, wzr, x0, 4
-	sub	x1, x1, #4
-3:	adds	x1, x1, #2
-	b.mi	4f
-user_ldst 9f, sttrh, wzr, x0, 2
-	sub	x1, x1, #2
-4:	adds	x1, x1, #1
-	b.mi	5f
-user_ldst 9f, sttrb, wzr, x0, 0
+	b.hi	1b
+USER(9f, sttr	xzr, [x2, #-8])
+	mov	x0, #0
+	ret
+
+2:	tbz	x1, #2, 3f
+USER(9f, sttr	wzr, [x0])
+USER(9f, sttr	wzr, [x2, #-4])
+	mov	x0, #0
+	ret
+
+3:	tbz	x1, #1, 4f
+USER(9f, sttrh	wzr, [x0])
+4:	tbz	x1, #0, 5f
+USER(9f, sttrb	wzr, [x2, #-1])
 5:	mov	x0, #0
 	ret
 SYM_FUNC_END(__arch_clear_user)
@@ -45,6 +47,6 @@  EXPORT_SYMBOL(__arch_clear_user)
 
 	.section .fixup,"ax"
 	.align	2
-9:	mov	x0, x2			// return the original size
+9:	sub	x0, x2, x0
 	ret
 	.previous