Message ID | 76a1700b0316b50fb5881da603f2daf3c81468f4.1620738177.git.robin.murphy@arm.com (mailing list archive)
---|---
State | New, archived
Series | arm64: String function updates
On Tue, May 11, 2021 at 05:12:38PM +0100, Robin Murphy wrote:
> Now that we're always using STTR variants rather than abstracting two
> different addressing modes, the user_ldst macro here is frankly more
> obfuscating than helpful.

FWIW, I completely agree; the user_ldst macros are a historical
artifact and I'm happy to see them go!

> Rewrite __arch_clear_user() with regular
> USER() annotations so that it's clearer what's going on, and take the
> opportunity to minimise the branchiness in the most common paths, which
> also allows the exception fixup to return a more accurate result.

IIUC this isn't always accurate for the {4,2,1}-byte cases; example
below. I'm not sure whether that's intentional, since the commit
message says "more accurate" rather than "accurate".

> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
> ---
>  arch/arm64/lib/clear_user.S | 42 +++++++++++++++++++------------------
>  1 file changed, 22 insertions(+), 20 deletions(-)
>
> diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S
> index af9afcbec92c..1005345b4066 100644
> --- a/arch/arm64/lib/clear_user.S
> +++ b/arch/arm64/lib/clear_user.S
> @@ -1,12 +1,9 @@
>  /* SPDX-License-Identifier: GPL-2.0-only */
>  /*
> - * Based on arch/arm/lib/clear_user.S
> - *
> - * Copyright (C) 2012 ARM Ltd.
> + * Copyright (C) 2021 Arm Ltd.
>   */
> -#include <linux/linkage.h>
>
> -#include <asm/asm-uaccess.h>
> +#include <linux/linkage.h>
>  #include <asm/assembler.h>
>
>  	.text
> @@ -19,25 +16,30 @@
>   *
>   * Alignment fixed up by hardware.
>   */
> +	.p2align 4
>  SYM_FUNC_START(__arch_clear_user)

Say we're called with size in x1 == 0x7 ...

> -	mov	x2, x1			// save the size for fixup return
> +	add	x2, x0, x1
>  	subs	x1, x1, #8
>  	b.mi	2f

... here we'll skip to the 4-byte case at 2f ...

> 1:
> -user_ldst 9f, sttr, xzr, x0, 8
> +USER(9f, sttr xzr, [x0])
> +	add	x0, x0, #8
>  	subs	x1, x1, #8
> -	b.pl	1b
> -2:	adds	x1, x1, #4
> -	b.mi	3f
> -user_ldst 9f, sttr, wzr, x0, 4
> -	sub	x1, x1, #4
> -3:	adds	x1, x1, #2
> -	b.mi	4f
> -user_ldst 9f, sttrh, wzr, x0, 2
> -	sub	x1, x1, #2
> -4:	adds	x1, x1, #1
> -	b.mi	5f
> -user_ldst 9f, sttrb, wzr, x0, 0
> +	b.hi	1b
> +USER(9f, sttr xzr, [x2, #-8])
> +	mov	x0, #0
> +	ret
> +
> +2:	tbz	x1, #2, 3f

... bit 2 is non-zero, so we continue ...

> +USER(9f, sttr wzr, [x0])

... and if this faults, the fixup will report the correct count ...

> +USER(9f, sttr wzr, [x2, #-4])

... but if this faults, the fixup handler will report that we didn't
clear all 7 bytes, rather than just the last 3, since we didn't update
x0 after the first 4-byte STTR.

We could update x0 inline, or add separate fixup handlers to account
for that out-of-line.

If we think that under-estimating is fine, I reckon it'd be worth a
comment to make that clear.

Thanks,
Mark.

> +	mov	x0, #0
> +	ret
> +
> +3:	tbz	x1, #1, 4f
> +USER(9f, sttrh wzr, [x0])
> +4:	tbz	x1, #0, 5f
> +USER(9f, sttrb wzr, [x2, #-1])
> 5:	mov	x0, #0
>  	ret
> SYM_FUNC_END(__arch_clear_user)
> @@ -45,6 +47,6 @@ EXPORT_SYMBOL(__arch_clear_user)
>
>  	.section .fixup,"ax"
>  	.align	2
> -9:	mov	x0, x2			// return the original size
> +9:	sub	x0, x2, x0
>  	ret
>  	.previous
> --
> 2.21.0.dirty
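To make the under-reporting concrete, here is a toy C model of the fixup arithmetic for the 7-byte case Mark describes (not kernel code: cleared_report and the fault_on_second_word knob are made-up names standing in for the second 4-byte STTR faulting):

#include <stdio.h>

static unsigned long cleared_report(unsigned long start, unsigned long size,
				    int fault_on_second_word)
{
	unsigned long x0 = start;		/* cursor, as in the asm */
	unsigned long x2 = start + size;	/* end pointer */

	if (size & 4) {
		/* first 4-byte store at [x0] succeeds, but x0 is not advanced */
		if (fault_on_second_word)
			return x2 - x0;	/* fixup math: reports all 7 bytes uncleared */
		/* second 4-byte store at [x2 - 4] succeeds */
	}
	/* 2- and 1-byte tails elided */
	return 0;
}

int main(void)
{
	/* size == 7, second word faults: prints 7, though only 3 bytes remain */
	printf("%lu\n", cleared_report(0x1000, 7, 1));
	return 0;
}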
On 2021-05-12 11:48, Mark Rutland wrote:
> On Tue, May 11, 2021 at 05:12:38PM +0100, Robin Murphy wrote:
>> Now that we're always using STTR variants rather than abstracting two
>> different addressing modes, the user_ldst macro here is frankly more
>> obfuscating than helpful.
>
> FWIW, I completely agree; the user_ldst macros are a historical
> artifact and I'm happy to see them go!
>
>> Rewrite __arch_clear_user() with regular
>> USER() annotations so that it's clearer what's going on, and take the
>> opportunity to minimise the branchiness in the most common paths, which
>> also allows the exception fixup to return a more accurate result.
>
> IIUC this isn't always accurate for the {4,2,1}-byte cases; example
> below. I'm not sure whether that's intentional, since the commit
> message says "more accurate" rather than "accurate".

Indeed, the "more" was definitely significant :)

[quoted diff and 7-byte walkthrough trimmed]

> We could update x0 inline, or add separate fixup handlers to account
> for that out-of-line.
>
> If we think that under-estimating is fine, I reckon it'd be worth a
> comment to make that clear.

Indeed for smaller amounts there's no change in fixup behaviour at all,
but I have to assume that underestimating by up to 100% is probably OK,
since we've been underestimating by fully 100% for nearly 10 years now.
I don't believe it's worth having any more complexity than necessary
for the fault case - grepping for clear_user() usage suggests that
nobody really cares about the return value beyond whether it's zero or
not, so the minor "improvement" here is more of a nice-to-have TBH.

The existing comment doesn't actually explain anything either, which is
why I didn't replace it, but I'm happy to add something if you like.

Cheers,
Robin.
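Robin's grep observation corresponds to the common caller shape below (illustrative only: zero_user_buffer is a hypothetical name, though the clear_user() return convention of "bytes not cleared" is real). The exact count is discarded; only zero versus non-zero matters:

#include <linux/uaccess.h>

/* Hypothetical caller: any non-zero return from clear_user() simply
 * becomes -EFAULT, so the precise count of uncleared bytes is unused. */
static int zero_user_buffer(void __user *buf, size_t len)
{
	if (clear_user(buf, len))	/* non-zero: some bytes were not cleared */
		return -EFAULT;
	return 0;
}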
On Wed, May 12, 2021 at 12:31:39PM +0100, Robin Murphy wrote:
> On 2021-05-12 11:48, Mark Rutland wrote:
>> On Tue, May 11, 2021 at 05:12:38PM +0100, Robin Murphy wrote:
>>> Rewrite __arch_clear_user() with regular
>>> USER() annotations so that it's clearer what's going on, and take the
>>> opportunity to minimise the branchiness in the most common paths, which
>>> also allows the exception fixup to return a more accurate result.
>>
>> IIUC this isn't always accurate for the {4,2,1}-byte cases; example
>> below. I'm not sure whether that's intentional, since the commit
>> message says "more accurate" rather than "accurate".
>
> Indeed, the "more" was definitely significant :)

:)

>> If we think that under-estimating is fine, I reckon it'd be worth a
>> comment to make that clear.
>
> Indeed for smaller amounts there's no change in fixup behaviour at all,
> but I have to assume that underestimating by up to 100% is probably OK,
> since we've been underestimating by fully 100% for nearly 10 years now.
> I don't believe it's worth having any more complexity than necessary
> for the fault case - grepping for clear_user() usage suggests that
> nobody really cares about the return value beyond whether it's zero or
> not, so the minor "improvement" here is more of a nice-to-have TBH.
>
> The existing comment doesn't actually explain anything either, which is
> why I didn't replace it, but I'm happy to add something if you like.

I don't have strong feelings either way, but I think that we should at
least document this, since that'll save us rehashing the same point in
future. :)

That said, IIUC to make this always accurate we only need two ADDs
(diff below). Since those will only be executed at most once each, I
suspect they won't have a measurable impact in practice.

So maybe it's worth adding them, to avoid any risk that someone needs
this to be accurate in future?

Mark.

---->8----
diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S
index 1005345b4066..7ef553ec2677 100644
--- a/arch/arm64/lib/clear_user.S
+++ b/arch/arm64/lib/clear_user.S
@@ -32,12 +32,14 @@ USER(9f, sttr xzr, [x2, #-8])
 
 2:	tbz	x1, #2, 3f
 USER(9f, sttr wzr, [x0])
+	add	x0, x0, #4
 USER(9f, sttr wzr, [x2, #-4])
 	mov	x0, #0
 	ret
 
 3:	tbz	x1, #1, 4f
 USER(9f, sttrh wzr, [x0])
+	add	x0, x0, #2
 4:	tbz	x1, #0, 5f
 USER(9f, sttrb wzr, [x2, #-1])
 5:	mov	x0, #0
On 2021-05-12 14:06, Mark Rutland wrote:
[...]
> I don't have strong feelings either way, but I think that we should at
> least document this, since that'll save us rehashing the same point in
> future. :)
>
> That said, IIUC to make this always accurate we only need two ADDs
> (diff below). Since those will only be executed at most once each, I
> suspect they won't have a measurable impact in practice.
>
> So maybe it's worth adding them, to avoid any risk that someone needs
> this to be accurate in future?

Hmm, now that you've caused me to ponder it some more, it can in fact
be achieved with just two extra ADDs _out of line_, and still neatly
enough that I'm now definitely going to do that. Thanks for the push!

Robin.

[quoted diff trimmed]
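In the same toy-model terms as the C sketch after Mark's first reply, the effect of the extra adjustment is the same whether it is done inline (Mark's diff) or folded into out-of-line fixup handlers (Robin's plan); this model is hypothetical and not the code that was eventually merged:

static unsigned long cleared_report_exact(unsigned long start,
					  unsigned long size,
					  int fault_on_second_word)
{
	unsigned long x0 = start, x2 = start + size;

	if (size & 4) {
		/* first 4-byte store succeeds... */
		x0 += 4;		/* ...and the count is adjusted for it */
		if (fault_on_second_word)
			return x2 - x0;	/* size == 7 now reports exactly 3 */
		/* second 4-byte store at [x2 - 4] succeeds */
	}
	/* 2- and 1-byte tails elided */
	return 0;
}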
diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S
index af9afcbec92c..1005345b4066 100644
--- a/arch/arm64/lib/clear_user.S
+++ b/arch/arm64/lib/clear_user.S
@@ -1,12 +1,9 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
- * Based on arch/arm/lib/clear_user.S
- *
- * Copyright (C) 2012 ARM Ltd.
+ * Copyright (C) 2021 Arm Ltd.
  */
-#include <linux/linkage.h>
 
-#include <asm/asm-uaccess.h>
+#include <linux/linkage.h>
 #include <asm/assembler.h>
 
 	.text
@@ -19,25 +16,30 @@
  *
  * Alignment fixed up by hardware.
  */
+	.p2align 4
 SYM_FUNC_START(__arch_clear_user)
-	mov	x2, x1			// save the size for fixup return
+	add	x2, x0, x1
 	subs	x1, x1, #8
 	b.mi	2f
 1:
-user_ldst 9f, sttr, xzr, x0, 8
+USER(9f, sttr xzr, [x0])
+	add	x0, x0, #8
 	subs	x1, x1, #8
-	b.pl	1b
-2:	adds	x1, x1, #4
-	b.mi	3f
-user_ldst 9f, sttr, wzr, x0, 4
-	sub	x1, x1, #4
-3:	adds	x1, x1, #2
-	b.mi	4f
-user_ldst 9f, sttrh, wzr, x0, 2
-	sub	x1, x1, #2
-4:	adds	x1, x1, #1
-	b.mi	5f
-user_ldst 9f, sttrb, wzr, x0, 0
+	b.hi	1b
+USER(9f, sttr xzr, [x2, #-8])
+	mov	x0, #0
+	ret
+
+2:	tbz	x1, #2, 3f
+USER(9f, sttr wzr, [x0])
+USER(9f, sttr wzr, [x2, #-4])
+	mov	x0, #0
+	ret
+
+3:	tbz	x1, #1, 4f
+USER(9f, sttrh wzr, [x0])
+4:	tbz	x1, #0, 5f
+USER(9f, sttrb wzr, [x2, #-1])
 5:	mov	x0, #0
 	ret
 SYM_FUNC_END(__arch_clear_user)
@@ -45,6 +47,6 @@ EXPORT_SYMBOL(__arch_clear_user)
 
 	.section .fixup,"ax"
 	.align	2
-9:	mov	x0, x2			// return the original size
+9:	sub	x0, x2, x0
 	ret
 	.previous
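For readers less fluent in arm64 assembly, a plain-C sketch of the store pattern the patch implements (fault handling ignored; clear_pattern is a made-up name, not kernel code). The key trick is that the trailing stores deliberately overlap earlier ones instead of branching on an exact remainder:

#include <stddef.h>
#include <string.h>

static void clear_pattern(unsigned char *p, size_t n)
{
	unsigned char *end = p + n;	/* x2 in the assembly */

	if (n >= 8) {
		while ((size_t)(end - p) > 8) {	/* the b.hi 1b loop */
			memset(p, 0, 8);
			p += 8;
		}
		memset(end - 8, 0, 8);	/* sttr xzr, [x2, #-8]: overlaps the loop's tail */
		return;
	}
	if (n & 4) {			/* tbz x1, #2, 3f */
		memset(p, 0, 4);	/* sttr wzr, [x0] */
		memset(end - 4, 0, 4);	/* sttr wzr, [x2, #-4]: may overlap */
		return;
	}
	if (n & 2)			/* tbz x1, #1, 4f */
		memset(p, 0, 2);	/* sttrh wzr, [x0] */
	if (n & 1)			/* tbz x1, #0, 5f */
		end[-1] = 0;		/* sttrb wzr, [x2, #-1] */
}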
Now that we're always using STTR variants rather than abstracting two
different addressing modes, the user_ldst macro here is frankly more
obfuscating than helpful. Rewrite __arch_clear_user() with regular
USER() annotations so that it's clearer what's going on, and take the
opportunity to minimise the branchiness in the most common paths, which
also allows the exception fixup to return a more accurate result.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
---
 arch/arm64/lib/clear_user.S | 42 +++++++++++++++++++------------------
 1 file changed, 22 insertions(+), 20 deletions(-)