diff mbox series

[2/2] ARM: copypage: do not use naked functions

Message ID 20181015222621.14673-1-stefan@agner.ch (mailing list archive)
State New, archived
Headers show
Series [1/2] ARM: copypage-fa: add kto and kfrom to input operands list | expand

Commit Message

Stefan Agner Oct. 15, 2018, 10:26 p.m. UTC
GCC documentation says naked functions should only use basic ASM
syntax. The extended ASM or mixture of basic ASM and "C" code is
not guaranteed. Currently it seems to work though.

Furthermore with Clang using parameters in extended asm in a
naked function is not supported:
  arch/arm/mm/copypage-v4wb.c:47:9: error: parameter references not
  allowed in naked functions
        : "r" (kto), "r" (kfrom), "I" (PAGE_SIZE / 64));
               ^

Use a regular function to be more portable. Also use volatile asm
to avoid unsolicited optimizations.

Tested with qemu versatileab machine and versatile_defconfig and
qemu mainstone machine using pxa_defconfig compiled with GCC 7.2.1
and Clang 7.0.

Link: https://github.com/ClangBuiltLinux/linux/issues/90
Reported-by: Joel Stanley <joel@jms.id.au>
Signed-off-by: Stefan Agner <stefan@agner.ch>
---
 arch/arm/mm/copypage-fa.c       | 17 +++++++++++------
 arch/arm/mm/copypage-feroceon.c | 17 +++++++++++------
 arch/arm/mm/copypage-v4mc.c     | 14 +++++++++-----
 arch/arm/mm/copypage-v4wb.c     | 17 +++++++++++------
 arch/arm/mm/copypage-v4wt.c     | 17 +++++++++++------
 arch/arm/mm/copypage-xsc3.c     | 17 +++++++++++------
 arch/arm/mm/copypage-xscale.c   | 13 ++++++++-----
 7 files changed, 72 insertions(+), 40 deletions(-)

Comments

Nicolas Pitre Oct. 15, 2018, 10:35 p.m. UTC | #1
On Tue, 16 Oct 2018, Stefan Agner wrote:

> GCC documentation says naked functions should only use basic ASM
> syntax. The extended ASM or mixture of basic ASM and "C" code is
> not guaranteed. Currently it seems to work though.
> 
> Furthermore with Clang using parameters in extended asm in a
> naked function is not supported:
>   arch/arm/mm/copypage-v4wb.c:47:9: error: parameter references not
>   allowed in naked functions
>         : "r" (kto), "r" (kfrom), "I" (PAGE_SIZE / 64));
>                ^
> 
> Use a regular function to be more portable. Also use volatile asm
> to avoid unsolicited optimizations.
> 
> Tested with qemu versatileab machine and versatile_defconfig and
> qemu mainstone machine using pxa_defconfig compiled with GCC 7.2.1
> and Clang 7.0.
> 
> Link: https://github.com/ClangBuiltLinux/linux/issues/90
> Reported-by: Joel Stanley <joel@jms.id.au>
> Signed-off-by: Stefan Agner <stefan@agner.ch>
> ---
>  arch/arm/mm/copypage-fa.c       | 17 +++++++++++------
>  arch/arm/mm/copypage-feroceon.c | 17 +++++++++++------
>  arch/arm/mm/copypage-v4mc.c     | 14 +++++++++-----
>  arch/arm/mm/copypage-v4wb.c     | 17 +++++++++++------
>  arch/arm/mm/copypage-v4wt.c     | 17 +++++++++++------
>  arch/arm/mm/copypage-xsc3.c     | 17 +++++++++++------
>  arch/arm/mm/copypage-xscale.c   | 13 ++++++++-----
>  7 files changed, 72 insertions(+), 40 deletions(-)
> 
> diff --git a/arch/arm/mm/copypage-fa.c b/arch/arm/mm/copypage-fa.c
> index ec6501308c60..33ccd396bf99 100644
> --- a/arch/arm/mm/copypage-fa.c
> +++ b/arch/arm/mm/copypage-fa.c
> @@ -17,11 +17,16 @@
>  /*
>   * Faraday optimised copy_user_page
>   */
> -static void __naked
> -fa_copy_user_page(void *kto, const void *kfrom)
> +static void fa_copy_user_page(void *kto, const void *kfrom)
>  {
> -	asm("\
> -	stmfd	sp!, {r4, lr}			@ 2\n\
> +	register void *r0 asm("r0") = kto;
> +	register const void *r1 asm("r1") = kfrom;
> +
> +	asm(
> +	__asmeq("%0", "r0")
> +	__asmeq("%1", "r1")
> +	"\
> +	stmfd	sp!, {r4}			@ 2\n\
>  	mov	r2, %2				@ 1\n\
>  1:	ldmia	r1!, {r3, r4, ip, lr}		@ 4\n\
>  	stmia	r0, {r3, r4, ip, lr}		@ 4\n\
> @@ -34,9 +39,9 @@ fa_copy_user_page(void *kto, const void *kfrom)
>  	subs	r2, r2, #1			@ 1\n\
>  	bne	1b				@ 1\n\
>  	mcr	p15, 0, r2, c7, c10, 4		@ 1   drain WB\n\
> -	ldmfd	sp!, {r4, pc}			@ 3"
> +	ldmfd	sp!, {r4}			@ 3"
>  	:
> -	: "r" (kto), "r" (kfrom), "I" (PAGE_SIZE / 32));
> +	: "r" (r0), "r" (r1), "I" (PAGE_SIZE / 32));

This is still wrong as you list r0 and r1 in the input operand list 
where they must remain constant but the code does modify them. You 
should list them in the output operand list with the "&" attribute. Also 
r2 should be listed in the clobbered list.


Nicolas
Russell King (Oracle) Oct. 15, 2018, 10:41 p.m. UTC | #2
On Mon, Oct 15, 2018 at 06:35:33PM -0400, Nicolas Pitre wrote:
> On Tue, 16 Oct 2018, Stefan Agner wrote:
> 
> > GCC documentation says naked functions should only use basic ASM
> > syntax. The extended ASM or mixture of basic ASM and "C" code is
> > not guaranteed. Currently it seems to work though.
> > 
> > Furthermore with Clang using parameters in extended asm in a
> > naked function is not supported:
> >   arch/arm/mm/copypage-v4wb.c:47:9: error: parameter references not
> >   allowed in naked functions
> >         : "r" (kto), "r" (kfrom), "I" (PAGE_SIZE / 64));
> >                ^
> > 
> > Use a regular function to be more portable. Also use volatile asm
> > to avoid unsolicited optimizations.
> > 
> > Tested with qemu versatileab machine and versatile_defconfig and
> > qemu mainstone machine using pxa_defconfig compiled with GCC 7.2.1
> > and Clang 7.0.
> > 
> > Link: https://github.com/ClangBuiltLinux/linux/issues/90
> > Reported-by: Joel Stanley <joel@jms.id.au>
> > Signed-off-by: Stefan Agner <stefan@agner.ch>
> > ---
> >  arch/arm/mm/copypage-fa.c       | 17 +++++++++++------
> >  arch/arm/mm/copypage-feroceon.c | 17 +++++++++++------
> >  arch/arm/mm/copypage-v4mc.c     | 14 +++++++++-----
> >  arch/arm/mm/copypage-v4wb.c     | 17 +++++++++++------
> >  arch/arm/mm/copypage-v4wt.c     | 17 +++++++++++------
> >  arch/arm/mm/copypage-xsc3.c     | 17 +++++++++++------
> >  arch/arm/mm/copypage-xscale.c   | 13 ++++++++-----
> >  7 files changed, 72 insertions(+), 40 deletions(-)
> > 
> > diff --git a/arch/arm/mm/copypage-fa.c b/arch/arm/mm/copypage-fa.c
> > index ec6501308c60..33ccd396bf99 100644
> > --- a/arch/arm/mm/copypage-fa.c
> > +++ b/arch/arm/mm/copypage-fa.c
> > @@ -17,11 +17,16 @@
> >  /*
> >   * Faraday optimised copy_user_page
> >   */
> > -static void __naked
> > -fa_copy_user_page(void *kto, const void *kfrom)
> > +static void fa_copy_user_page(void *kto, const void *kfrom)
> >  {
> > -	asm("\
> > -	stmfd	sp!, {r4, lr}			@ 2\n\
> > +	register void *r0 asm("r0") = kto;
> > +	register const void *r1 asm("r1") = kfrom;
> > +
> > +	asm(
> > +	__asmeq("%0", "r0")
> > +	__asmeq("%1", "r1")
> > +	"\
> > +	stmfd	sp!, {r4}			@ 2\n\
> >  	mov	r2, %2				@ 1\n\
> >  1:	ldmia	r1!, {r3, r4, ip, lr}		@ 4\n\
> >  	stmia	r0, {r3, r4, ip, lr}		@ 4\n\
> > @@ -34,9 +39,9 @@ fa_copy_user_page(void *kto, const void *kfrom)
> >  	subs	r2, r2, #1			@ 1\n\
> >  	bne	1b				@ 1\n\
> >  	mcr	p15, 0, r2, c7, c10, 4		@ 1   drain WB\n\
> > -	ldmfd	sp!, {r4, pc}			@ 3"
> > +	ldmfd	sp!, {r4}			@ 3"
> >  	:
> > -	: "r" (kto), "r" (kfrom), "I" (PAGE_SIZE / 32));
> > +	: "r" (r0), "r" (r1), "I" (PAGE_SIZE / 32));
> 
> This is still wrong as you list r0 and r1 in the input operand list 
> where they must remain constant but the code does modify them. You 
> should list them in the output operand list with the "&" attribute. Also 
> r2 should be listed in the clobbered list.

Either we keep these as naked functions (and, if Clang wants to
try to inline naked functions which makes no sense, also mark them
as noinline) or we make them proper functions and also add (eg) r4
to the clobber list and get rid of the stacking of that register
along with LR/PC.

Having this half-way house which will generate worse code is not
acceptable.
Stefan Agner Oct. 15, 2018, 10:51 p.m. UTC | #3
On 16.10.2018 00:41, Russell King - ARM Linux wrote:
> On Mon, Oct 15, 2018 at 06:35:33PM -0400, Nicolas Pitre wrote:
>> On Tue, 16 Oct 2018, Stefan Agner wrote:
>>
>> > GCC documentation says naked functions should only use basic ASM
>> > syntax. The extended ASM or mixture of basic ASM and "C" code is
>> > not guaranteed. Currently it seems to work though.
>> >
>> > Furthermore with Clang using parameters in extended asm in a
>> > naked function is not supported:
>> >   arch/arm/mm/copypage-v4wb.c:47:9: error: parameter references not
>> >   allowed in naked functions
>> >         : "r" (kto), "r" (kfrom), "I" (PAGE_SIZE / 64));
>> >                ^
>> >
>> > Use a regular function to be more portable. Also use volatile asm
>> > to avoid unsolicited optimizations.
>> >
>> > Tested with qemu versatileab machine and versatile_defconfig and
>> > qemu mainstone machine using pxa_defconfig compiled with GCC 7.2.1
>> > and Clang 7.0.
>> >
>> > Link: https://github.com/ClangBuiltLinux/linux/issues/90
>> > Reported-by: Joel Stanley <joel@jms.id.au>
>> > Signed-off-by: Stefan Agner <stefan@agner.ch>
>> > ---
>> >  arch/arm/mm/copypage-fa.c       | 17 +++++++++++------
>> >  arch/arm/mm/copypage-feroceon.c | 17 +++++++++++------
>> >  arch/arm/mm/copypage-v4mc.c     | 14 +++++++++-----
>> >  arch/arm/mm/copypage-v4wb.c     | 17 +++++++++++------
>> >  arch/arm/mm/copypage-v4wt.c     | 17 +++++++++++------
>> >  arch/arm/mm/copypage-xsc3.c     | 17 +++++++++++------
>> >  arch/arm/mm/copypage-xscale.c   | 13 ++++++++-----
>> >  7 files changed, 72 insertions(+), 40 deletions(-)
>> >
>> > diff --git a/arch/arm/mm/copypage-fa.c b/arch/arm/mm/copypage-fa.c
>> > index ec6501308c60..33ccd396bf99 100644
>> > --- a/arch/arm/mm/copypage-fa.c
>> > +++ b/arch/arm/mm/copypage-fa.c
>> > @@ -17,11 +17,16 @@
>> >  /*
>> >   * Faraday optimised copy_user_page
>> >   */
>> > -static void __naked
>> > -fa_copy_user_page(void *kto, const void *kfrom)
>> > +static void fa_copy_user_page(void *kto, const void *kfrom)
>> >  {
>> > -	asm("\
>> > -	stmfd	sp!, {r4, lr}			@ 2\n\
>> > +	register void *r0 asm("r0") = kto;
>> > +	register const void *r1 asm("r1") = kfrom;
>> > +
>> > +	asm(
>> > +	__asmeq("%0", "r0")
>> > +	__asmeq("%1", "r1")
>> > +	"\
>> > +	stmfd	sp!, {r4}			@ 2\n\
>> >  	mov	r2, %2				@ 1\n\
>> >  1:	ldmia	r1!, {r3, r4, ip, lr}		@ 4\n\
>> >  	stmia	r0, {r3, r4, ip, lr}		@ 4\n\
>> > @@ -34,9 +39,9 @@ fa_copy_user_page(void *kto, const void *kfrom)
>> >  	subs	r2, r2, #1			@ 1\n\
>> >  	bne	1b				@ 1\n\
>> >  	mcr	p15, 0, r2, c7, c10, 4		@ 1   drain WB\n\
>> > -	ldmfd	sp!, {r4, pc}			@ 3"
>> > +	ldmfd	sp!, {r4}			@ 3"
>> >  	:
>> > -	: "r" (kto), "r" (kfrom), "I" (PAGE_SIZE / 32));
>> > +	: "r" (r0), "r" (r1), "I" (PAGE_SIZE / 32));
>>
>> This is still wrong as you list r0 and r1 in the input operand list
>> where they must remain constant but the code does modify them. You
>> should list them in the output operand list with the "&" attribute. Also
>> r2 should be listed in the clobbered list.
> 
> Either we keep these as naked functions (and, if Clang wants to
> try to inline naked functions which makes no sense, also mark them
> as noinline) or we make them proper functions and also add (eg) r4
> to the clobber list and get rid of the stacking of that register
> along with LR/PC.

Clang does not inline naked functions, at least that is what a quick
look at the disassembled code shows when compiling with 9a40ac86152c
reverted.

> 
> Having this half-way house which will generate worse code is not
> acceptable.

For Clang reverting 9a40ac86152c ("ARM: 6164/1: Add kto and kfrom to
input operands list.") is a solution...

I guess the question is why that commit was necessary back then... Do we
break something by reverting it?

--
Stefan
Nicolas Pitre Oct. 15, 2018, 10:54 p.m. UTC | #4
On Mon, 15 Oct 2018, Russell King - ARM Linux wrote:

> On Mon, Oct 15, 2018 at 06:35:33PM -0400, Nicolas Pitre wrote:
> > On Tue, 16 Oct 2018, Stefan Agner wrote:
> > 
> > > GCC documentation says naked functions should only use basic ASM
> > > syntax. The extended ASM or mixture of basic ASM and "C" code is
> > > not guaranteed. Currently it seems to work though.
> > > 
> > > Furthermore with Clang using parameters in extended asm in a
> > > naked function is not supported:
> > >   arch/arm/mm/copypage-v4wb.c:47:9: error: parameter references not
> > >   allowed in naked functions
> > >         : "r" (kto), "r" (kfrom), "I" (PAGE_SIZE / 64));
> > >                ^
> > > 
> > > Use a regular function to be more portable. Also use volatile asm
> > > to avoid unsolicited optimizations.
> > > 
> > > Tested with qemu versatileab machine and versatile_defconfig and
> > > qemu mainstone machine using pxa_defconfig compiled with GCC 7.2.1
> > > and Clang 7.0.
> > > 
> > > Link: https://github.com/ClangBuiltLinux/linux/issues/90
> > > Reported-by: Joel Stanley <joel@jms.id.au>
> > > Signed-off-by: Stefan Agner <stefan@agner.ch>
> > > ---
> > >  arch/arm/mm/copypage-fa.c       | 17 +++++++++++------
> > >  arch/arm/mm/copypage-feroceon.c | 17 +++++++++++------
> > >  arch/arm/mm/copypage-v4mc.c     | 14 +++++++++-----
> > >  arch/arm/mm/copypage-v4wb.c     | 17 +++++++++++------
> > >  arch/arm/mm/copypage-v4wt.c     | 17 +++++++++++------
> > >  arch/arm/mm/copypage-xsc3.c     | 17 +++++++++++------
> > >  arch/arm/mm/copypage-xscale.c   | 13 ++++++++-----
> > >  7 files changed, 72 insertions(+), 40 deletions(-)
> > > 
> > > diff --git a/arch/arm/mm/copypage-fa.c b/arch/arm/mm/copypage-fa.c
> > > index ec6501308c60..33ccd396bf99 100644
> > > --- a/arch/arm/mm/copypage-fa.c
> > > +++ b/arch/arm/mm/copypage-fa.c
> > > @@ -17,11 +17,16 @@
> > >  /*
> > >   * Faraday optimised copy_user_page
> > >   */
> > > -static void __naked
> > > -fa_copy_user_page(void *kto, const void *kfrom)
> > > +static void fa_copy_user_page(void *kto, const void *kfrom)
> > >  {
> > > -	asm("\
> > > -	stmfd	sp!, {r4, lr}			@ 2\n\
> > > +	register void *r0 asm("r0") = kto;
> > > +	register const void *r1 asm("r1") = kfrom;
> > > +
> > > +	asm(
> > > +	__asmeq("%0", "r0")
> > > +	__asmeq("%1", "r1")
> > > +	"\
> > > +	stmfd	sp!, {r4}			@ 2\n\
> > >  	mov	r2, %2				@ 1\n\
> > >  1:	ldmia	r1!, {r3, r4, ip, lr}		@ 4\n\
> > >  	stmia	r0, {r3, r4, ip, lr}		@ 4\n\
> > > @@ -34,9 +39,9 @@ fa_copy_user_page(void *kto, const void *kfrom)
> > >  	subs	r2, r2, #1			@ 1\n\
> > >  	bne	1b				@ 1\n\
> > >  	mcr	p15, 0, r2, c7, c10, 4		@ 1   drain WB\n\
> > > -	ldmfd	sp!, {r4, pc}			@ 3"
> > > +	ldmfd	sp!, {r4}			@ 3"
> > >  	:
> > > -	: "r" (kto), "r" (kfrom), "I" (PAGE_SIZE / 32));
> > > +	: "r" (r0), "r" (r1), "I" (PAGE_SIZE / 32));
> > 
> > This is still wrong as you list r0 and r1 in the input operand list 
> > where they must remain constant but the code does modify them. You 
> > should list them in the output operand list with the "&" attribute. Also 
> > r2 should be listed in the clobbered list.
> 
> Either we keep these as naked functions (and, if Clang wants to
> try to inline naked functions which makes no sense, also mark them
> as noinline) or we make them proper functions and also add (eg) r4
> to the clobber list and get rid of the stacking of that register
> along with LR/PC.

Yes, indeed.

I'd say: remove the naked stuff, and let the compiler do the 
prologue/epilogue itself (or inline it for that matter). And don't force 
pointers and counter into particular registers. This way r0-r3 could be 
used as temporaries since they're probably already clobbered by the call 
to kmap_atomic() anyway. That is likely to be better than forcing ip/lr 
as temporaries.


Nicolas
Russell King (Oracle) Oct. 15, 2018, 11:02 p.m. UTC | #5
On Mon, Oct 15, 2018 at 06:54:49PM -0400, Nicolas Pitre wrote:
> On Mon, 15 Oct 2018, Russell King - ARM Linux wrote:
> 
> > On Mon, Oct 15, 2018 at 06:35:33PM -0400, Nicolas Pitre wrote:
> > > On Tue, 16 Oct 2018, Stefan Agner wrote:
> > > 
> > > > GCC documentation says naked functions should only use basic ASM
> > > > syntax. The extended ASM or mixture of basic ASM and "C" code is
> > > > not guaranteed. Currently it seems to work though.
> > > > 
> > > > Furthermore with Clang using parameters in extended asm in a
> > > > naked function is not supported:
> > > >   arch/arm/mm/copypage-v4wb.c:47:9: error: parameter references not
> > > >   allowed in naked functions
> > > >         : "r" (kto), "r" (kfrom), "I" (PAGE_SIZE / 64));
> > > >                ^
> > > > 
> > > > Use a regular function to be more portable. Also use volatile asm
> > > > to avoid unsolicited optimizations.
> > > > 
> > > > Tested with qemu versatileab machine and versatile_defconfig and
> > > > qemu mainstone machine using pxa_defconfig compiled with GCC 7.2.1
> > > > and Clang 7.0.
> > > > 
> > > > Link: https://github.com/ClangBuiltLinux/linux/issues/90
> > > > Reported-by: Joel Stanley <joel@jms.id.au>
> > > > Signed-off-by: Stefan Agner <stefan@agner.ch>
> > > > ---
> > > >  arch/arm/mm/copypage-fa.c       | 17 +++++++++++------
> > > >  arch/arm/mm/copypage-feroceon.c | 17 +++++++++++------
> > > >  arch/arm/mm/copypage-v4mc.c     | 14 +++++++++-----
> > > >  arch/arm/mm/copypage-v4wb.c     | 17 +++++++++++------
> > > >  arch/arm/mm/copypage-v4wt.c     | 17 +++++++++++------
> > > >  arch/arm/mm/copypage-xsc3.c     | 17 +++++++++++------
> > > >  arch/arm/mm/copypage-xscale.c   | 13 ++++++++-----
> > > >  7 files changed, 72 insertions(+), 40 deletions(-)
> > > > 
> > > > diff --git a/arch/arm/mm/copypage-fa.c b/arch/arm/mm/copypage-fa.c
> > > > index ec6501308c60..33ccd396bf99 100644
> > > > --- a/arch/arm/mm/copypage-fa.c
> > > > +++ b/arch/arm/mm/copypage-fa.c
> > > > @@ -17,11 +17,16 @@
> > > >  /*
> > > >   * Faraday optimised copy_user_page
> > > >   */
> > > > -static void __naked
> > > > -fa_copy_user_page(void *kto, const void *kfrom)
> > > > +static void fa_copy_user_page(void *kto, const void *kfrom)
> > > >  {
> > > > -	asm("\
> > > > -	stmfd	sp!, {r4, lr}			@ 2\n\
> > > > +	register void *r0 asm("r0") = kto;
> > > > +	register const void *r1 asm("r1") = kfrom;
> > > > +
> > > > +	asm(
> > > > +	__asmeq("%0", "r0")
> > > > +	__asmeq("%1", "r1")
> > > > +	"\
> > > > +	stmfd	sp!, {r4}			@ 2\n\
> > > >  	mov	r2, %2				@ 1\n\
> > > >  1:	ldmia	r1!, {r3, r4, ip, lr}		@ 4\n\
> > > >  	stmia	r0, {r3, r4, ip, lr}		@ 4\n\
> > > > @@ -34,9 +39,9 @@ fa_copy_user_page(void *kto, const void *kfrom)
> > > >  	subs	r2, r2, #1			@ 1\n\
> > > >  	bne	1b				@ 1\n\
> > > >  	mcr	p15, 0, r2, c7, c10, 4		@ 1   drain WB\n\
> > > > -	ldmfd	sp!, {r4, pc}			@ 3"
> > > > +	ldmfd	sp!, {r4}			@ 3"
> > > >  	:
> > > > -	: "r" (kto), "r" (kfrom), "I" (PAGE_SIZE / 32));
> > > > +	: "r" (r0), "r" (r1), "I" (PAGE_SIZE / 32));
> > > 
> > > This is still wrong as you list r0 and r1 in the input operand list 
> > > where they must remain constant but the code does modify them. You 
> > > should list them in the output operand list with the "&" attribute. Also 
> > > r2 should be listed in the clobbered list.
> > 
> > Either we keep these as naked functions (and, if Clang wants to
> > try to inline naked functions which makes no sense, also mark them
> > as noinline) or we make them proper functions and also add (eg) r4
> > to the clobber list and get rid of the stacking of that register
> > along with LR/PC.
> 
> Yes, indeed.
> 
> I'd say: remove the naked stuff, and let the compiler do the 
> prologue/epilogue itself (or inline it for that matter). And don't force 
> pointers and counter into particular registers. This way r0-r3 could be 
> used as temporaries since they're probably already clobbered by the call 
> to kmap_atomic() anyway. That is likely to be better than forcing ip/lr 
> as temporaries.

That doesn't work for the general case - which is where the functions
are called via function pointers, and so are never inlined.  For these,
the current code is optimal, and I suspect the compiler will do worse
with it.

For the two instances (v4wb and mc) that don't follow that pattern,
you may be right, but I'd want to see the result of the changes.
Nicolas Pitre Oct. 15, 2018, 11:27 p.m. UTC | #6
On Tue, 16 Oct 2018, Stefan Agner wrote:

> On 16.10.2018 00:41, Russell King - ARM Linux wrote:
> > On Mon, Oct 15, 2018 at 06:35:33PM -0400, Nicolas Pitre wrote:
> >> On Tue, 16 Oct 2018, Stefan Agner wrote:
> >>
> >> > GCC documentation says naked functions should only use basic ASM
> >> > syntax. The extended ASM or mixture of basic ASM and "C" code is
> >> > not guaranteed. Currently it seems to work though.
> >> >
> >> > Furthermore with Clang using parameters in extended asm in a
> >> > naked function is not supported:
> >> >   arch/arm/mm/copypage-v4wb.c:47:9: error: parameter references not
> >> >   allowed in naked functions
> >> >         : "r" (kto), "r" (kfrom), "I" (PAGE_SIZE / 64));
> >> >                ^
> >> >
> >> > Use a regular function to be more portable. Also use volatile asm
> >> > to avoid unsolicited optimizations.
> >> >
> >> > Tested with qemu versatileab machine and versatile_defconfig and
> >> > qemu mainstone machine using pxa_defconfig compiled with GCC 7.2.1
> >> > and Clang 7.0.
> >> >
> >> > Link: https://github.com/ClangBuiltLinux/linux/issues/90
> >> > Reported-by: Joel Stanley <joel@jms.id.au>
> >> > Signed-off-by: Stefan Agner <stefan@agner.ch>
> >> > ---
> >> >  arch/arm/mm/copypage-fa.c       | 17 +++++++++++------
> >> >  arch/arm/mm/copypage-feroceon.c | 17 +++++++++++------
> >> >  arch/arm/mm/copypage-v4mc.c     | 14 +++++++++-----
> >> >  arch/arm/mm/copypage-v4wb.c     | 17 +++++++++++------
> >> >  arch/arm/mm/copypage-v4wt.c     | 17 +++++++++++------
> >> >  arch/arm/mm/copypage-xsc3.c     | 17 +++++++++++------
> >> >  arch/arm/mm/copypage-xscale.c   | 13 ++++++++-----
> >> >  7 files changed, 72 insertions(+), 40 deletions(-)
> >> >
> >> > diff --git a/arch/arm/mm/copypage-fa.c b/arch/arm/mm/copypage-fa.c
> >> > index ec6501308c60..33ccd396bf99 100644
> >> > --- a/arch/arm/mm/copypage-fa.c
> >> > +++ b/arch/arm/mm/copypage-fa.c
> >> > @@ -17,11 +17,16 @@
> >> >  /*
> >> >   * Faraday optimised copy_user_page
> >> >   */
> >> > -static void __naked
> >> > -fa_copy_user_page(void *kto, const void *kfrom)
> >> > +static void fa_copy_user_page(void *kto, const void *kfrom)
> >> >  {
> >> > -	asm("\
> >> > -	stmfd	sp!, {r4, lr}			@ 2\n\
> >> > +	register void *r0 asm("r0") = kto;
> >> > +	register const void *r1 asm("r1") = kfrom;
> >> > +
> >> > +	asm(
> >> > +	__asmeq("%0", "r0")
> >> > +	__asmeq("%1", "r1")
> >> > +	"\
> >> > +	stmfd	sp!, {r4}			@ 2\n\
> >> >  	mov	r2, %2				@ 1\n\
> >> >  1:	ldmia	r1!, {r3, r4, ip, lr}		@ 4\n\
> >> >  	stmia	r0, {r3, r4, ip, lr}		@ 4\n\
> >> > @@ -34,9 +39,9 @@ fa_copy_user_page(void *kto, const void *kfrom)
> >> >  	subs	r2, r2, #1			@ 1\n\
> >> >  	bne	1b				@ 1\n\
> >> >  	mcr	p15, 0, r2, c7, c10, 4		@ 1   drain WB\n\
> >> > -	ldmfd	sp!, {r4, pc}			@ 3"
> >> > +	ldmfd	sp!, {r4}			@ 3"
> >> >  	:
> >> > -	: "r" (kto), "r" (kfrom), "I" (PAGE_SIZE / 32));
> >> > +	: "r" (r0), "r" (r1), "I" (PAGE_SIZE / 32));
> >>
> >> This is still wrong as you list r0 and r1 in the input operand list
> >> where they must remain constant but the code does modify them. You
> >> should list them in the output operand list with the "&" attribute. Also
> >> r2 should be listed in the clobbered list.
> > 
> > Either we keep these as naked functions (and, if Clang wants to
> > try to inline naked functions which makes no sense, also mark them
> > as noinline) or we make them proper functions and also add (eg) r4
> > to the clobber list and get rid of the stacking of that register
> > along with LR/PC.
> 
> Clang does not inline naked functions, at least that is what a quick
> look at the disassembled code shows when compiling with 9a40ac86152c
> reverted.

It's hard to see what that commit was actually fixing, but the operands 
usage is wrong as explained already. Maybe the generated code has been 
OK for all those years but that is due to luck rather than correctness.

> > Having this half-way house which will generate worse code is not
> > acceptable.
> 
> For Clang reverting 9a40ac86152c ("ARM: 6164/1: Add kto and kfrom to
> input operands list.") is a solution...
> 
> I guess the question is why that commit was necessary back then... Do we
> break something by reverting it?

No idea. Maybe Russell remembers?
Maybe digging into the mailing list archive might tell.


Nicolas
Russell King (Oracle) Oct. 16, 2018, 8:33 a.m. UTC | #7
On Mon, Oct 15, 2018 at 07:27:43PM -0400, Nicolas Pitre wrote:
> It's hard to see what that commit was actually fixing, but the operands 
> usage is wrong as explained already. Maybe the generated code has been 
> OK for all those years but that is due to luck rather than correctness.
...
> No idea. Maybe Russell remembers?
> Maybe digging into the mailing list archive might tell.

I found this as a reply to the patch by Mikael Pettersson:

I've tested and verified that this bit enables a gcc-4.5 compiled kernel
to boot on TS-119 (Kirkwood) when combined with my fix for __naked.
With neither or only one of the patches applied, the kernel oopses hard
in copy_user_page() as it tries to start /sbin/init.
...
- the asm() bodies of these __naked functions have inadequate input
  parameter constraints, in particular they fail to declare any
  dependencies on the functions' formal parameters; gcc-4.5 sees this
  and skips the parameter setup before calling these functions, causing
  runtime crashes; Khem's patch (this one) fixes that
  (copypage-xscale.c already had correct asm() constraints so it works
  with only the __naked fix, these other copypage-*.c files need both
  patches to work)

So, while wrong according to the GCC manual, it's fixing a bug that is present
with gcc-4.5 and who-knows what other GCC versions.  Reverting the
commit has the chance to cause regressions with GCC.

It looks like any change here needs to be validated on a range of
GCC versions, because there are versions of GCC known not to follow
its manual!
Stefan Agner Oct. 16, 2018, 12:09 p.m. UTC | #8
On 16.10.2018 10:33, Russell King - ARM Linux wrote:
> On Mon, Oct 15, 2018 at 07:27:43PM -0400, Nicolas Pitre wrote:
>> It's hard to see what that commit was actually fixing, but the operands
>> usage is wrong as explained already. Maybe the generated code has been
>> OK for all those years but that is due to luck rather than correctness.
> ...
>> No idea. Maybe Russell remembers?
>> Maybe digging into the mailing list archive might tell.
> 
> I found this as a reply to the patch by Mikael Pettersson:
> 
> I've tested and verified that this bit enables a gcc-4.5 compiled kernel
> to boot on TS-119 (Kirkwood) when combined with my fix for __naked.
> With neither or only one of the patches applied, the kernel oopses hard
> in copy_user_page() as it tries to start /sbin/init.
> ...
> - the asm() bodies of these __naked functions have inadequate input
>   parameter constraints, in particular they fail to declare any
>   dependencies on the functions' formal parameters; gcc-4.5 sees this
>   and skips the parameter setup before calling these functions, causing
>   runtime crashes; Khem's patch (this one) fixes that
>   (copypage-xscale.c already had correct asm() constraints so it works
>   with only the __naked fix, these other copypage-*.c files need both
>   patches to work)
> 
> So, while wrong according to the GCC manual, it's fixing a bug that is present
> with gcc-4.5 and who-knows what other GCC versions.  Reverting the
> commit has the chance to cause regressions with GCC.

The build system requires at least GCC 4.6 currently, so we do not have
to deal with 4.5.

> 
> It looks like any change here needs to be validated on a range of
> GCC versions, because there are versions of GCC known not to follow
> its manual!

The commit message as well as the above message sounds more like it was
a newly introduced behavior in 4.5. I would suggest to at least check
4.6 to make sure it has been corrected.

--
Stefan
diff mbox series

Patch

diff --git a/arch/arm/mm/copypage-fa.c b/arch/arm/mm/copypage-fa.c
index ec6501308c60..33ccd396bf99 100644
--- a/arch/arm/mm/copypage-fa.c
+++ b/arch/arm/mm/copypage-fa.c
@@ -17,11 +17,16 @@ 
 /*
  * Faraday optimised copy_user_page
  */
-static void __naked
-fa_copy_user_page(void *kto, const void *kfrom)
+static void fa_copy_user_page(void *kto, const void *kfrom)
 {
-	asm("\
-	stmfd	sp!, {r4, lr}			@ 2\n\
+	register void *r0 asm("r0") = kto;
+	register const void *r1 asm("r1") = kfrom;
+
+	asm(
+	__asmeq("%0", "r0")
+	__asmeq("%1", "r1")
+	"\
+	stmfd	sp!, {r4}			@ 2\n\
 	mov	r2, %2				@ 1\n\
 1:	ldmia	r1!, {r3, r4, ip, lr}		@ 4\n\
 	stmia	r0, {r3, r4, ip, lr}		@ 4\n\
@@ -34,9 +39,9 @@  fa_copy_user_page(void *kto, const void *kfrom)
 	subs	r2, r2, #1			@ 1\n\
 	bne	1b				@ 1\n\
 	mcr	p15, 0, r2, c7, c10, 4		@ 1   drain WB\n\
-	ldmfd	sp!, {r4, pc}			@ 3"
+	ldmfd	sp!, {r4}			@ 3"
 	:
-	: "r" (kto), "r" (kfrom), "I" (PAGE_SIZE / 32));
+	: "r" (r0), "r" (r1), "I" (PAGE_SIZE / 32));
 }
 
 void fa_copy_user_highpage(struct page *to, struct page *from,
diff --git a/arch/arm/mm/copypage-feroceon.c b/arch/arm/mm/copypage-feroceon.c
index 49ee0c1a7209..71c3b938493a 100644
--- a/arch/arm/mm/copypage-feroceon.c
+++ b/arch/arm/mm/copypage-feroceon.c
@@ -13,11 +13,16 @@ 
 #include <linux/init.h>
 #include <linux/highmem.h>
 
-static void __naked
-feroceon_copy_user_page(void *kto, const void *kfrom)
+static void feroceon_copy_user_page(void *kto, const void *kfrom)
 {
-	asm("\
-	stmfd	sp!, {r4-r9, lr}		\n\
+	register void *r0 asm("r0") = kto;
+	register const void *r1 asm("r1") = kfrom;
+
+	asm volatile(
+	__asmeq("%0", "r0")
+	__asmeq("%1", "r1")
+	"\
+	stmfd	sp!, {r4-r9}			\n\
 	mov	ip, %2				\n\
 1:	mov	lr, r1				\n\
 	ldmia	r1!, {r2 - r9}			\n\
@@ -62,9 +67,9 @@  feroceon_copy_user_page(void *kto, const void *kfrom)
 	add	r0, r0, #32			\n\
 	bne	1b				\n\
 	mcr	p15, 0, ip, c7, c10, 4		@ drain WB\n\
-	ldmfd	sp!, {r4-r9, pc}"
+	ldmfd	sp!, {r4-r9}"
 	:
-	: "r" (kto), "r" (kfrom), "I" (PAGE_SIZE));
+	: "r" (r0), "r" (r1), "I" (PAGE_SIZE));
 }
 
 void feroceon_copy_user_highpage(struct page *to, struct page *from,
diff --git a/arch/arm/mm/copypage-v4mc.c b/arch/arm/mm/copypage-v4mc.c
index 0224416cba3c..85a81bc67912 100644
--- a/arch/arm/mm/copypage-v4mc.c
+++ b/arch/arm/mm/copypage-v4mc.c
@@ -40,11 +40,15 @@  static DEFINE_RAW_SPINLOCK(minicache_lock);
  * instruction.  If your processor does not supply this, you have to write your
  * own copy_user_highpage that does the right thing.
  */
-static void __naked
-mc_copy_user_page(void *from, void *to)
+static void mc_copy_user_page(void *from, void *to)
 {
+	register void *r0 asm("r0") = from;
+	register void *r1 asm("r1") = to;
+
 	asm volatile(
-	"stmfd	sp!, {r4, lr}			@ 2\n\
+	__asmeq("%0", "r0")
+	__asmeq("%1", "r1")
+	"stmfd	sp!, {r4}			@ 2\n\
 	mov	r4, %2				@ 1\n\
 	ldmia	%0!, {r2, r3, ip, lr}		@ 4\n\
 1:	mcr	p15, 0, %1, c7, c6, 1		@ 1   invalidate D line\n\
@@ -59,9 +63,9 @@  mc_copy_user_page(void *from, void *to)
 	stmia	%1!, {r2, r3, ip, lr}		@ 4\n\
 	ldmneia	%0!, {r2, r3, ip, lr}		@ 4\n\
 	bne	1b				@ 1\n\
-	ldmfd	sp!, {r4, pc}			@ 3"
+	ldmfd	sp!, {r4}			@ 3"
 	:
-	: "r" (from), "r" (to), "I" (PAGE_SIZE / 64));
+	: "r" (r0), "r" (r1), "I" (PAGE_SIZE / 64));
 }
 
 void v4_mc_copy_user_highpage(struct page *to, struct page *from,
diff --git a/arch/arm/mm/copypage-v4wb.c b/arch/arm/mm/copypage-v4wb.c
index 067d0fdd630c..dd518bf30a97 100644
--- a/arch/arm/mm/copypage-v4wb.c
+++ b/arch/arm/mm/copypage-v4wb.c
@@ -22,11 +22,16 @@ 
  * instruction.  If your processor does not supply this, you have to write your
  * own copy_user_highpage that does the right thing.
  */
-static void __naked
-v4wb_copy_user_page(void *kto, const void *kfrom)
+static void v4wb_copy_user_page(void *kto, const void *kfrom)
 {
-	asm("\
-	stmfd	sp!, {r4, lr}			@ 2\n\
+	register void *r0 asm("r0") = kto;
+	register const void *r1 asm("r1") = kfrom;
+
+	asm volatile(
+	__asmeq("%0", "r0")
+	__asmeq("%1", "r1")
+	"\
+	stmfd	sp!, {r4}			@ 2\n\
 	mov	r2, %2				@ 1\n\
 	ldmia	r1!, {r3, r4, ip, lr}		@ 4\n\
 1:	mcr	p15, 0, r0, c7, c6, 1		@ 1   invalidate D line\n\
@@ -42,9 +47,9 @@  v4wb_copy_user_page(void *kto, const void *kfrom)
 	ldmneia	r1!, {r3, r4, ip, lr}		@ 4\n\
 	bne	1b				@ 1\n\
 	mcr	p15, 0, r1, c7, c10, 4		@ 1   drain WB\n\
-	ldmfd	 sp!, {r4, pc}			@ 3"
+	ldmfd	 sp!, {r4}			@ 3"
 	:
-	: "r" (kto), "r" (kfrom), "I" (PAGE_SIZE / 64));
+	: "r" (r0), "r" (r1), "I" (PAGE_SIZE / 64));
 }
 
 void v4wb_copy_user_highpage(struct page *to, struct page *from,
diff --git a/arch/arm/mm/copypage-v4wt.c b/arch/arm/mm/copypage-v4wt.c
index b85c5da2e510..d397ac123300 100644
--- a/arch/arm/mm/copypage-v4wt.c
+++ b/arch/arm/mm/copypage-v4wt.c
@@ -20,11 +20,16 @@ 
  * dirty data in the cache.  However, we do have to ensure that
  * subsequent reads are up to date.
  */
-static void __naked
-v4wt_copy_user_page(void *kto, const void *kfrom)
+static void v4wt_copy_user_page(void *kto, const void *kfrom)
 {
-	asm("\
-	stmfd	sp!, {r4, lr}			@ 2\n\
+	register void *r0 asm("r0") = kto;
+	register const void *r1 asm("r1") = kfrom;
+
+	asm volatile(
+	__asmeq("%0", "r0")
+	__asmeq("%1", "r1")
+	"\
+	stmfd	sp!, {r4}			@ 2\n\
 	mov	r2, %2				@ 1\n\
 	ldmia	r1!, {r3, r4, ip, lr}		@ 4\n\
 1:	stmia	r0!, {r3, r4, ip, lr}		@ 4\n\
@@ -38,9 +43,9 @@  v4wt_copy_user_page(void *kto, const void *kfrom)
 	ldmneia	r1!, {r3, r4, ip, lr}		@ 4\n\
 	bne	1b				@ 1\n\
 	mcr	p15, 0, r2, c7, c7, 0		@ flush ID cache\n\
-	ldmfd	sp!, {r4, pc}			@ 3"
+	ldmfd	sp!, {r4}			@ 3"
 	:
-	: "r" (kto), "r" (kfrom), "I" (PAGE_SIZE / 64));
+	: "r" (r0), "r" (r1), "I" (PAGE_SIZE / 64));
 }
 
 void v4wt_copy_user_highpage(struct page *to, struct page *from,
diff --git a/arch/arm/mm/copypage-xsc3.c b/arch/arm/mm/copypage-xsc3.c
index 03a2042aced5..6a60465b52e1 100644
--- a/arch/arm/mm/copypage-xsc3.c
+++ b/arch/arm/mm/copypage-xsc3.c
@@ -29,11 +29,16 @@ 
  * if we eventually end up using our copied page.
  *
  */
-static void __naked
-xsc3_mc_copy_user_page(void *kto, const void *kfrom)
+static void xsc3_mc_copy_user_page(void *kto, const void *kfrom)
 {
-	asm("\
-	stmfd	sp!, {r4, r5, lr}		\n\
+	register void *r0 asm("r0") = kto;
+	register const void *r1 asm("r1") = kfrom;
+
+	asm volatile(
+	__asmeq("%0", "r0")
+	__asmeq("%1", "r1")
+	"\
+	stmfd	sp!, {r4, r5}			\n\
 	mov	lr, %2				\n\
 						\n\
 	pld	[r1, #0]			\n\
@@ -65,9 +70,9 @@  xsc3_mc_copy_user_page(void *kto, const void *kfrom)
 	bgt	1b				\n\
 	beq	2b				\n\
 						\n\
-	ldmfd	sp!, {r4, r5, pc}"
+	ldmfd	sp!, {r4, r5}"
 	:
-	: "r" (kto), "r" (kfrom), "I" (PAGE_SIZE / 64 - 1));
+	: "r" (r0), "r" (r1), "I" (PAGE_SIZE / 64 - 1));
 }
 
 void xsc3_mc_copy_user_highpage(struct page *to, struct page *from,
diff --git a/arch/arm/mm/copypage-xscale.c b/arch/arm/mm/copypage-xscale.c
index 97972379f4d6..e508e99311a0 100644
--- a/arch/arm/mm/copypage-xscale.c
+++ b/arch/arm/mm/copypage-xscale.c
@@ -36,15 +36,18 @@  static DEFINE_RAW_SPINLOCK(minicache_lock);
  * Dcache aliasing issue.  The writes will be forwarded to the write buffer,
  * and merged as appropriate.
  */
-static void __naked
-mc_copy_user_page(void *from, void *to)
+static void mc_copy_user_page(void *from, void *to)
 {
+	register void *r0 asm("r0") = from;
+	register void *r1 asm("r1") = to;
 	/*
 	 * Strangely enough, best performance is achieved
 	 * when prefetching destination as well.  (NP)
 	 */
 	asm volatile(
-	"stmfd	sp!, {r4, r5, lr}		\n\
+	__asmeq("%0", "r0")
+	__asmeq("%1", "r1")
+	"stmfd	sp!, {r4, r5}			\n\
 	mov	lr, %2				\n\
 	pld	[r0, #0]			\n\
 	pld	[r0, #32]			\n\
@@ -79,9 +82,9 @@  mc_copy_user_page(void *from, void *to)
 	mcr	p15, 0, ip, c7, c6, 1		@ invalidate D line\n\
 	bgt	1b				\n\
 	beq	2b				\n\
-	ldmfd	sp!, {r4, r5, pc}		"
+	ldmfd	sp!, {r4, r5}		"
 	:
-	: "r" (from), "r" (to), "I" (PAGE_SIZE / 64 - 1));
+	: "r" (r0), "r" (r1), "I" (PAGE_SIZE / 64 - 1));
 }
 
 void xscale_mc_copy_user_highpage(struct page *to, struct page *from,