
tools/nolibc: add support for N64 and N32 ABIs

Message ID 20250212-nolibc-mips-n32-v1-1-6892e58d1321@weissschuh.net (mailing list archive)
State New
Series tools/nolibc: add support for N64 and N32 ABIs

Commit Message

Thomas Weißschuh Feb. 12, 2025, 6:49 p.m. UTC
Add support for the MIPS 64-bit N64 and ILP32 N32 ABIs.

In addition to different byte orders and ABIs there are also different
releases of the MIPS architecture. To avoid blowing up the test matrix,
only add a subset of all possible test combinations.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
---
 tools/include/nolibc/arch-mips.h            | 119 ++++++++++++++++++++++++----
 tools/testing/selftests/nolibc/Makefile     |  28 ++++++-
 tools/testing/selftests/nolibc/run-tests.sh |   2 +-
 3 files changed, 131 insertions(+), 18 deletions(-)


---
base-commit: 16681bea9a80080765c98b545ad74c17de2d513c
change-id: 20231105-nolibc-mips-n32-234901bd910d

Best regards,

Comments

Willy Tarreau Feb. 16, 2025, 9:49 a.m. UTC | #1
On Wed, Feb 12, 2025 at 07:49:53PM +0100, Thomas Weißschuh wrote:
> +#if defined(_ABIO32)
> +
>  #define _NOLIBC_SYSCALL_CLOBBERLIST \
>  	"memory", "cc", "at", "v1", "hi", "lo", \
>  	"t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9"
> +#define _NOLIBC_SYSCALL_STACK_RESERVE "addiu $sp, $sp, -32\n"
> +#define _NOLIBC_SYSCALL_STACK_UNRESERVE "addiu $sp, $sp, 32\n"
> +
> +#elif defined(_ABIN32) || defined(_ABI64)
> +
> +/* binutils, GCC and clang disagree about register aliases, use numbers instead. */

Is this often encountered despite this ? I guess it can cause portability
issues :-/

> +#if defined(_ABIO32)
> +
>  #define my_syscall5(num, arg1, arg2, arg3, arg4, arg5)                        \
>  ({                                                                            \
>  	register long _num __asm__ ("v0") = (num);                            \
(...)
> @@ -178,6 +201,50 @@
>  	_arg4 ? -_num : _num;                                                 \
>  })
>  
> +#else
> +

Here you should indicate which ABI is covered by this #else, because one
has to go up to previous definitions to figure it's _ABIN32 and _ABI64.

> +#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5)                        \
> +({                                                                            \
> +	register long _num __asm__ ("v0") = (num);                            \
> +	register long _arg1 __asm__ ("$4") = (long)(arg1);                    \
> +	register long _arg2 __asm__ ("$5") = (long)(arg2);                    \
> +	register long _arg3 __asm__ ("$6") = (long)(arg3);                    \
> +	register long _arg4 __asm__ ("$7") = (long)(arg4);                    \
> +	register long _arg5 __asm__ ("$8") = (long)(arg5);                    \
(...)
> @@ -190,13 +257,33 @@ void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector __
>  		"1:\n"
>  		".cpload $ra\n"
>  		"move  $a0, $sp\n"       /* save stack pointer to $a0, as arg1 of _start_c */
> +
> +#if defined(_ABIO32)
>  		"addiu $sp, $sp, -4\n"   /* space for .cprestore to store $gp              */
>  		".cprestore 0\n"
>  		"li    $t0, -8\n"
>  		"and   $sp, $sp, $t0\n"  /* $sp must be 8-byte aligned                     */
>  		"addiu $sp, $sp, -16\n"  /* the callee expects to save a0..a3 there        */
> -		"lui $t9, %hi(_start_c)\n" /* ABI requires current function address in $t9 */
> +#else

So same here. I think you should do it for all #else since you're generally
grouping 2 ABIs vs one between a #if and a #else and it's not trivial to
figure what a #else covers, like below.

> +		"daddiu $sp, $sp, -8\n"  /* space for .cprestore to store $gp              */
> +		".cpsetup $ra, 0, 1b\n"
> +		"li    $t0, -16\n"
> +		"and   $sp, $sp, $t0\n"  /* $sp must be 16-byte aligned                    */
> +#endif
> +
> +		/* ABI requires current function address in $t9 */
> +#if defined(_ABIO32) || defined(_ABIN32)
> +		"lui $t9, %hi(_start_c)\n"
>  		"ori $t9, %lo(_start_c)\n"
> +#else

This one indeed covers only _ABI64

> +		"lui  $t9, %highest(_start_c)\n"
> +		"ori  $t9, %higher(_start_c)\n"
> +		"dsll $t9, 0x10\n"
> +		"ori  $t9, %hi(_start_c)\n"
> +		"dsll $t9, 0x10\n"
> +		"ori  $t9, %lo(_start_c)\n"
> +#endif

With the tiny details above, this looks fine. It's great that syscall
numbers didn't change so that you can cover an extra arch with only a
few ifdefs. I have not tested but I guess you did :-) So that's OK for
me:

Acked-by: Willy Tarreau <w@1wt.eu>

Thanks!
Willy
Maciej W. Rozycki Feb. 16, 2025, 3:41 p.m. UTC | #2
On Wed, 12 Feb 2025, Thomas Weißschuh wrote:

> diff --git a/tools/include/nolibc/arch-mips.h b/tools/include/nolibc/arch-mips.h
> index 753a8ed2cf695f0b5eac4b5e4d317fdb383ebf93..638520a3427a985fdbd5f5a49b55853bbadeee75 100644
> --- a/tools/include/nolibc/arch-mips.h
> +++ b/tools/include/nolibc/arch-mips.h
> @@ -190,13 +257,33 @@ void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector __
>  		"1:\n"
>  		".cpload $ra\n"
>  		"move  $a0, $sp\n"       /* save stack pointer to $a0, as arg1 of _start_c */
> +
> +#if defined(_ABIO32)
>  		"addiu $sp, $sp, -4\n"   /* space for .cprestore to store $gp              */
>  		".cprestore 0\n"
>  		"li    $t0, -8\n"
>  		"and   $sp, $sp, $t0\n"  /* $sp must be 8-byte aligned                     */
>  		"addiu $sp, $sp, -16\n"  /* the callee expects to save a0..a3 there        */
> -		"lui $t9, %hi(_start_c)\n" /* ABI requires current function address in $t9 */
> +#else
> +		"daddiu $sp, $sp, -8\n"  /* space for .cprestore to store $gp              */
> +		".cpsetup $ra, 0, 1b\n"
> +		"li    $t0, -16\n"
> +		"and   $sp, $sp, $t0\n"  /* $sp must be 16-byte aligned                    */
> +#endif

 Why is this code breaking stack alignment just to have to fix it up two 
instructions down the line?  Or is it that the incoming $sp is not aligned 
in the first place (in which case we're having a deeper problem).

> +
> +		/* ABI requires current function address in $t9 */
> +#if defined(_ABIO32) || defined(_ABIN32)
> +		"lui $t9, %hi(_start_c)\n"
>  		"ori $t9, %lo(_start_c)\n"
> +#else
> +		"lui  $t9, %highest(_start_c)\n"
> +		"ori  $t9, %higher(_start_c)\n"
> +		"dsll $t9, 0x10\n"
> +		"ori  $t9, %hi(_start_c)\n"
> +		"dsll $t9, 0x10\n"
> +		"ori  $t9, %lo(_start_c)\n"

 This could be optimised using a temporary (e.g. $at, but I guess any will 
do as I gather we don't have any ABI abnormalities here).

> +#endif
> +
>  		"jalr $t9\n"             /* transfer to c runtime
> */
>  		" nop\n"                 /* delayed slot

 On an unrelated matter JALR above ought to be JAL (or otherwise there's 
no point in using the .cprestore pseudo-op).  And I fail to see why this 
code has to be "noreorder" (except for the .cpload piece, of course), it's 
just asking for trouble.

  Maciej
Thomas Weißschuh Feb. 17, 2025, 9:26 p.m. UTC | #3
On 2025-02-16 10:49:38+0100, Willy Tarreau wrote:
> On Wed, Feb 12, 2025 at 07:49:53PM +0100, Thomas Weißschuh wrote:
> > +#if defined(_ABIO32)
> > +
> >  #define _NOLIBC_SYSCALL_CLOBBERLIST \
> >  	"memory", "cc", "at", "v1", "hi", "lo", \
> >  	"t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9"
> > +#define _NOLIBC_SYSCALL_STACK_RESERVE "addiu $sp, $sp, -32\n"
> > +#define _NOLIBC_SYSCALL_STACK_UNRESERVE "addiu $sp, $sp, 32\n"
> > +
> > +#elif defined(_ABIN32) || defined(_ABI64)
> > +
> > +/* binutils, GCC and clang disagree about register aliases, use numbers instead. */
> 
> Is this often encountered despite this ? I guess it can cause portability
> issues :-/

No idea. It's the first time I saw something like this.

> > +#if defined(_ABIO32)
> > +
> >  #define my_syscall5(num, arg1, arg2, arg3, arg4, arg5)                        \
> >  ({                                                                            \
> >  	register long _num __asm__ ("v0") = (num);                            \
> (...)
> > @@ -178,6 +201,50 @@
> >  	_arg4 ? -_num : _num;                                                 \
> >  })
> >  
> > +#else
> > +
> 
> Here you should indicate which ABI is covered by this #else, because one
> has to go up to previous definitions to figure it's _ABIN32 and _ABI64.

Ack.

> > +#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5)                        \
> > +({                                                                            \
> > +	register long _num __asm__ ("v0") = (num);                            \
> > +	register long _arg1 __asm__ ("$4") = (long)(arg1);                    \
> > +	register long _arg2 __asm__ ("$5") = (long)(arg2);                    \
> > +	register long _arg3 __asm__ ("$6") = (long)(arg3);                    \
> > +	register long _arg4 __asm__ ("$7") = (long)(arg4);                    \
> > +	register long _arg5 __asm__ ("$8") = (long)(arg5);                    \
> (...)
> > @@ -190,13 +257,33 @@ void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector __
> >  		"1:\n"
> >  		".cpload $ra\n"
> >  		"move  $a0, $sp\n"       /* save stack pointer to $a0, as arg1 of _start_c */
> > +
> > +#if defined(_ABIO32)
> >  		"addiu $sp, $sp, -4\n"   /* space for .cprestore to store $gp              */
> >  		".cprestore 0\n"
> >  		"li    $t0, -8\n"
> >  		"and   $sp, $sp, $t0\n"  /* $sp must be 8-byte aligned                     */
> >  		"addiu $sp, $sp, -16\n"  /* the callee expects to save a0..a3 there        */
> > -		"lui $t9, %hi(_start_c)\n" /* ABI requires current function address in $t9 */
> > +#else
> 
> So same here. I think you should do it for all #else since you're generally
> grouping 2 ABIs vs one between a #if and a #else and it's not trivial to
> figure what a #else covers, like below.

Ack.

> > +		"daddiu $sp, $sp, -8\n"  /* space for .cprestore to store $gp              */
> > +		".cpsetup $ra, 0, 1b\n"
> > +		"li    $t0, -16\n"
> > +		"and   $sp, $sp, $t0\n"  /* $sp must be 16-byte aligned                    */
> > +#endif
> > +
> > +		/* ABI requires current function address in $t9 */
> > +#if defined(_ABIO32) || defined(_ABIN32)
> > +		"lui $t9, %hi(_start_c)\n"
> >  		"ori $t9, %lo(_start_c)\n"
> > +#else
> 
> This one indeed covers only _ABI64

(That is intentional)

> > +		"lui  $t9, %highest(_start_c)\n"
> > +		"ori  $t9, %higher(_start_c)\n"
> > +		"dsll $t9, 0x10\n"
> > +		"ori  $t9, %hi(_start_c)\n"
> > +		"dsll $t9, 0x10\n"
> > +		"ori  $t9, %lo(_start_c)\n"
> > +#endif
> 
> With the tiny details above, this looks fine. It's great that syscall
> numbers didn't change so that you can cover an extra arch with only a
> few ifdefs. I have not tested but I guess you did :-) So that's OK for
> me:

The syscall numbers are different, but the UAPI headers also detect the
ABI in use and select the correct numbers.
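
For reference, the dispatch in the uapi asm/unistd.h looks roughly like
this (simplified sketch from memory, not the verbatim header):

    #if _MIPS_SIM == _MIPS_SIM_ABI32      /* o32: __NR_* in the 4000 range */
    # include <asm/unistd_o32.h>
    #elif _MIPS_SIM == _MIPS_SIM_ABI64    /* n64: 5000 range */
    # include <asm/unistd_n64.h>
    #elif _MIPS_SIM == _MIPS_SIM_NABI32   /* n32: 6000 range */
    # include <asm/unistd_n32.h>
    #endif

so nolibc can keep using the same __NR_* names unchanged.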

> Acked-by: Willy Tarreau <w@1wt.eu>

Thanks!
Thomas Weißschuh Feb. 17, 2025, 9:41 p.m. UTC | #4
Hi Maciej,

thanks for your feedback!

On 2025-02-16 15:41:55+0000, Maciej W. Rozycki wrote:
> On Wed, 12 Feb 2025, Thomas Weißschuh wrote:
> > diff --git a/tools/include/nolibc/arch-mips.h b/tools/include/nolibc/arch-mips.h
> > index 753a8ed2cf695f0b5eac4b5e4d317fdb383ebf93..638520a3427a985fdbd5f5a49b55853bbadeee75 100644
> > --- a/tools/include/nolibc/arch-mips.h
> > +++ b/tools/include/nolibc/arch-mips.h
> > @@ -190,13 +257,33 @@ void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector __
> >  		"1:\n"
> >  		".cpload $ra\n"
> >  		"move  $a0, $sp\n"       /* save stack pointer to $a0, as arg1 of _start_c */
> > +
> > +#if defined(_ABIO32)
> >  		"addiu $sp, $sp, -4\n"   /* space for .cprestore to store $gp              */
> >  		".cprestore 0\n"
> >  		"li    $t0, -8\n"
> >  		"and   $sp, $sp, $t0\n"  /* $sp must be 8-byte aligned                     */
> >  		"addiu $sp, $sp, -16\n"  /* the callee expects to save a0..a3 there        */
> > -		"lui $t9, %hi(_start_c)\n" /* ABI requires current function address in $t9 */
> > +#else
> > +		"daddiu $sp, $sp, -8\n"  /* space for .cprestore to store $gp              */
> > +		".cpsetup $ra, 0, 1b\n"
> > +		"li    $t0, -16\n"
> > +		"and   $sp, $sp, $t0\n"  /* $sp must be 16-byte aligned                    */
> > +#endif
> 
>  Why is this code breaking stack alignment just to have to fix it up two 
> instructions down the line?  Or is it that the incoming $sp is not aligned 
> in the first place (in which case we're having a deeper problem).

nolibc itself does not assume that $sp is aligned.
Maybe Willy can explain the historical background.

The System V ABI MIPS supplement [0] says the following:

The registers listed below have the specified contents at process entry:
	...

	$sp The stack pointer holds the address of the bottom of the stack, which
	must be doubleword (8 byte) aligned.
	...

However "process entry" is main(), while this code is running before main.

The kernel always aligns the stack to a multiple of 16 bytes.
See the usage of STACK_ROUND() in fs/binfmt_elf.c.

So I guess we could remove the manual alignment.
(At least for alignments of 16 bytes and less)
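
For reference, the rounding there is roughly the following (paraphrased,
stack-grows-down case; the real macro lives in fs/binfmt_elf.c):

    #define STACK_ROUND(sp, items) \
        (((unsigned long) (sp - items)) &~ 15UL)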

> > +
> > +		/* ABI requires current function address in $t9 */
> > +#if defined(_ABIO32) || defined(_ABIN32)
> > +		"lui $t9, %hi(_start_c)\n"
> >  		"ori $t9, %lo(_start_c)\n"
> > +#else
> > +		"lui  $t9, %highest(_start_c)\n"
> > +		"ori  $t9, %higher(_start_c)\n"
> > +		"dsll $t9, 0x10\n"
> > +		"ori  $t9, %hi(_start_c)\n"
> > +		"dsll $t9, 0x10\n"
> > +		"ori  $t9, %lo(_start_c)\n"
> 
>  This could be optimised using a temporary (e.g. $at, but I guess any will 
> do as I gather we don't have any ABI abnormalities here).

clang rejects manual usage of $at without ".set noat".
So $t0 is simpler.

> > +#endif
> > +
> >  		"jalr $t9\n"             /* transfer to c runtime
> > */
> >  		" nop\n"                 /* delayed slot
> 
>  On an unrelated matter JALR above ought to be JAL (or otherwise there's 
> no point in using the .cprestore pseudo-op).  And I fail to see why this 
> code has to be "noreorder" (except for the .cpload piece, of course), it's 
> just asking for trouble.

Thanks for the hints.

Without "noreorder", is the manually addition of the delayed slot "nop"
still necessary?
These points also apply to the existing O32 implementation, right?
If so I'll make a proper series out of it.

[0] https://refspecs.linuxfoundation.org/elf/mipsabi.pdf


Thomas
Maciej W. Rozycki Feb. 17, 2025, 11:23 p.m. UTC | #5
Hi Thomas,

> thanks for your feedback!

 You're welcome.  Sadly, little MIPS-fu seems to still be available nowadays,
and for me it's often too easy to miss MIPS-related topics in the mailing list
flood.

> >  Why is this code breaking stack alignment just to have to fix it up two 
> > instructions down the line?  Or is it that the incoming $sp is not aligned 
> > in the first place (in which case we're having a deeper problem).
> 
> nolibc itself does not assume that $sp is aligned.
> Maybe Willy can explain the historical background.

 I'm all ears.

> The System V ABI MIPS supplement [0] says the following:
> 
> The registers listed below have the specified contents at process entry:
> 	...
> 
> 	$sp The stack pointer holds the address of the bottom of the stack, which
> 	must be doubleword (8 byte) aligned.
> 	...
> 
> However "process entry" is main(), while this code is running before main.

 Umm, no, process entry definitely is not main(); if you refer to the 
somewhat obsolete and inaccurate MIPS psABI, then please note that the 
paragraph right above your quote says:

$2	A non-zero value specifies a function pointer the application 
	should register with atexit(BA_OS).  If $2 contains zero, no 
	action is required.

and one immediately below it says:

$31	The return address register is set to zero so that programs that 
	search backward through stack frames (stack backtracing) recognize 
	the last stack frame, that is, a stack frame with a zero in the 
        saved $31 slot.

and there is more on the initial process stack earlier on, including the 
location of the auxiliary vector.  All of it makes it clear that it's not main() 
this specification refers to, but the entry point from the OS kernel (then 
of course you're aware already that it's o32 it talks about; n64/n32 requires 
16-byte alignment, but then again we only have secondary references [1][2], in this 
case for SGI IRIX).

> The kernel always aligns the stack to a multiple of 16 bytes.
> See the usage of STACK_ROUND() in fs/binfmt_elf.c.
> 
> So I guess we could remove the manual alignment.
> (At least for alignments of 16 bytes and less)

 I think they all need to go then, but then stack pointer adjustments have 
to be made in multiples of the alignment required by the psABI of course.

> > > +
> > > +		/* ABI requires current function address in $t9 */
> > > +#if defined(_ABIO32) || defined(_ABIN32)
> > > +		"lui $t9, %hi(_start_c)\n"
> > >  		"ori $t9, %lo(_start_c)\n"
> > > +#else
> > > +		"lui  $t9, %highest(_start_c)\n"
> > > +		"ori  $t9, %higher(_start_c)\n"
> > > +		"dsll $t9, 0x10\n"
> > > +		"ori  $t9, %hi(_start_c)\n"
> > > +		"dsll $t9, 0x10\n"
> > > +		"ori  $t9, %lo(_start_c)\n"
> > 
> >  This could be optimised using a temporary (e.g. $at, but I guess any will 
> > do as I gather we don't have any ABI abnormalities here).
> 
> clang rejects manual usage of $at without ".set noat".
> So $t0 is simpler.

 It's always `.set at' that's required to use $at by hand; it has been that 
way since long before clang was even thought of.  Or you could use LA and DLA macros 
for o32/n32 and n64 respectively for the assembler to do all this stuff 
for you in the default `.set reorder' mode (assuming that clang is not 
completely broken here).
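
 Something along these lines (illustrative only, untested):

    /* let the assembler expand the address load, using $at as scratch */
    "dla  $t9, _start_c\n"   /* n64; plain "la $t9, _start_c" for o32/n32 */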

> > > +#endif
> > > +
> > >  		"jalr $t9\n"             /* transfer to c runtime
> > > */
> > >  		" nop\n"                 /* delayed slot
> > 
> >  On an unrelated matter JALR above ought to be JAL (or otherwise there's 
> > no point in using the .cprestore pseudo-op).  And I fail to see why this 
> > code has to be "noreorder" (except for the .cpload piece, of course), it's 
> > just asking for trouble.
> 
> Thanks for the hints.
> 
> Without "noreorder", is the manually addition of the delayed slot "nop"
> still necessary?

 It's not.  It's for the `.set noreorder' mode only, to fill the branch 
delay slot by hand.  Otherwise it'll become just a useless instruction 
following the call sequence and executed after return.
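
 In other words, in the default mode the call boils down to (sketch, untested):

    "jal  _start_c\n"        /* assembler schedules the delay slot, no nop */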

> These points also apply to the existing O32 implementation, right?

 Correct.  Sadly it's the first time I see this code.

 Overall I find it a bit of a chimera: it uses `.set noreorder' and 
explicit relocations on one hand, and then high-level assembly `.cpload' 
and `.cprestore' pseudo-ops on the other, effectively mixing the two 
styles of assembly.

 The pseudo-ops come from times when using assembly macros was the norm 
and are there to support that coding model where macros rely on these 
pseudo-ops, and before the use of explicit relocations became the norm at 
least for GCC.  In the absence of assembly macros you can write code 
expansions for these pseudo-ops by hand, just as what GCC does nowadays 
(in the `-mexplicit-relocs' mode, which is usually the default).

 But due to architecture variations it's very hard to write handcoded 
assembly in the `.set noreorder' mode: you need to take care of all the 
data dependencies that appear due to the lack of interlocking between 
pipeline stages, which results in code that either works correctly 
everywhere, but is suboptimal for newer architecture revisions, or code 
that works better with newer architecture revisions, but is actually 
broken with earlier ones.

 About the only typical case where you do want to use `.set noreorder' is 
to schedule a branch delay slot by hand due to a data anti-dependency 
between the two instructions.  Patchable/self-modifying code is a less 
frequent case.  And I had literally one case in my entire career where I 
wanted to actually jump to a branch delay slot instruction (it's still 
there in arch/mips/include/asm/div64.h, in do_div64_32(); see label #2).

 Also it appears no code in this function actually relies on $gp having 
been set up, so perhaps this stuff can just be discarded in the first 
place?

References:

[1] "MIPSpro 64-Bit Porting and Transition Guide", 
    <https://irix7.com/techpubs/007-2391-006.pdf>

[2] "MIPSpro N32 ABI Handbook", 
    <https://irix7.com/techpubs/007-2816-004.pdf>

  Maciej
Willy Tarreau Feb. 18, 2025, 4:52 a.m. UTC | #6
Hi Maciej,

On Mon, Feb 17, 2025 at 11:23:11PM +0000, Maciej W. Rozycki wrote:
> > >  Why is this code breaking stack alignment just to have to fix it up two 
> > > instructions down the line?  Or is it that the incoming $sp is not aligned 
> > > in the first place (in which case we're having a deeper problem).
> > 
> > nolibc itself does not assume that $sp is aligned.
> > Maybe Willy can explain the historical background.
> 
>  I'm all ears.

I had a look, that's interesting. Actually this started in the very
early i386 code in nolibc, where we already use the stack in the entry
code, and simply fix it up before calling main. Then we continued this
with x86_64 (which also uses the stack and needs to fix it up), then
arm (where we use and fix the stack), then the MIPS entry code was
simply written based on the same construct, even though it does not use
the stack and thus doesn't actually need the fixup. Here are the links, this
predates kernel inclusion:

    arm: https://github.com/wtarreau/nolibc/commit/af968b1
   mips: https://github.com/wtarreau/nolibc/commit/e04cd25

Thus we can shave 4 more bytes from the MIPS entry code ;-)

> > > > +#endif
> > > > +
> > > >  		"jalr $t9\n"             /* transfer to c runtime
> > > > */
> > > >  		" nop\n"                 /* delayed slot
> > > 
> > >  On an unrelated matter JALR above ought to be JAL (or otherwise there's 
> > > no point in using the .cprestore pseudo-op).  And I fail to see why this 
> > > code has to be "noreorder" (except for the .cpload piece, of course), it's 
> > > just asking for trouble.
> > 
> > Thanks for the hints.
> > 
> > Without "noreorder", is the manually addition of the delayed slot "nop"
> > still necessary?
> 
>  It's not.  It's for the `.set noreorder' mode only, to fill the branch 
> delay slot by hand.  Otherwise it'll become just a useless instruction 
> following the call sequence and executed after return.

Similarly I had ".set noreorder" and the nop in the initial code from 2017.
I remember that it took me a long while to get that init code to work on
my MIPS boxes, and it's extremely likely that I found the noreorder trick
long after I placed the nop and did not remove it.

> > These points also apply to the existing O32 implementation, right?
> 
>  Correct.  Sadly it's the first time I see this code.
> 
>  Overall I find it a bit of a chimera: it uses `.set noreorder' and 
> explicit relocations on one hand, and then high-level assembly `.cpload' 
> and `.cprestore' pseudo-ops on the other, effectively mixing the two 
> styles of assembly.
> 
>  The pseudo-ops come from times when using assembly macros was the norm 
> and are there to support that coding model where macros rely on these 
> pseudo-ops, and before the use of explicit relocations became the norm at 
> least for GCC.  In the absence of assembly macros you can write code 
> expansions for these pseudo-ops by hand, just as what GCC does nowadays 
> (in the `-mexplicit-relocs' mode, which is usually the default).
> 
>  But due to architecture variations it's very hard to write handcoded 
> assembly in the `.set noreorder' mode: you need to take care of all the 
> data dependencies that appear due to the lack of interlocking between 
> pipeline stages, which results in code that either works correctly 
> everywhere, but is suboptimal for newer architecture revisions, or code 
> that works better with newer architecture revisions, but is actually 
> broken with earlier ones.
> 
>  About the only typical case where you do want to use `.set noreorder' is 
> to schedule a branch delay slot by hand due to a data anti-dependency 
> between the two instructions.  Patchable/self-modifying code is a less 
> frequent case.  And I had literally one case in my entire career where I 
> wanted to actually jump to a branch delay slot instruction (it's still 
> there in arch/mips/include/asm/div64.h, in do_div64_32(); see label #2).
> 
>  Also it appears no code in this function actually relies on $gp having 
> been set up, so perhaps this stuff can just be discarded in the first 
> place?

All of this is very helpful. For example we did face an issue with
noreorder here, which then required adding ".set push" and ".set pop"
as it was polluting the rest of the code:

   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=184177c3d6e0

But I'm pretty sure that I didn't invent this ".set noreorder", and if I
had found how to get rid of it, I'd have happily done so. As often with such
code, it was produced by trial and error, and it's very possible that one
solution was added to cover a problem caused by another one, and was
not optimal. Back then, I considered the code OK enough once my
preinit program worked fine on all my machines (mostly the mips-24Kc and
mips-1004Kc boxes that I have daily access to).

In any case that's something that's easy to try again if we want to clean
that up to normalize it.

Thanks!
Willy

Patch

diff --git a/tools/include/nolibc/arch-mips.h b/tools/include/nolibc/arch-mips.h
index 753a8ed2cf695f0b5eac4b5e4d317fdb383ebf93..638520a3427a985fdbd5f5a49b55853bbadeee75 100644
--- a/tools/include/nolibc/arch-mips.h
+++ b/tools/include/nolibc/arch-mips.h
@@ -10,7 +10,7 @@ 
 #include "compiler.h"
 #include "crt.h"
 
-#if !defined(_ABIO32)
+#if !defined(_ABIO32) && !defined(_ABIN32) && !defined(_ABI64)
 #error Unsupported MIPS ABI
 #endif
 
@@ -32,11 +32,32 @@ 
  *   - the arguments are cast to long and assigned into the target registers
  *     which are then simply passed as registers to the asm code, so that we
  *     don't have to experience issues with register constraints.
+ *
+ * Syscalls for MIPS ABI N32, same as ABI O32 with the following differences :
+ *   - arguments are in a0, a1, a2, a3, t0, t1, t2, t3.
+ *     t0..t3 are also known as a4..a7.
+ *   - stack is 16-byte aligned
  */
 
+#if defined(_ABIO32)
+
 #define _NOLIBC_SYSCALL_CLOBBERLIST \
 	"memory", "cc", "at", "v1", "hi", "lo", \
 	"t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9"
+#define _NOLIBC_SYSCALL_STACK_RESERVE "addiu $sp, $sp, -32\n"
+#define _NOLIBC_SYSCALL_STACK_UNRESERVE "addiu $sp, $sp, 32\n"
+
+#elif defined(_ABIN32) || defined(_ABI64)
+
+/* binutils, GCC and clang disagree about register aliases, use numbers instead. */
+#define _NOLIBC_SYSCALL_CLOBBERLIST \
+	"memory", "cc", "at", "v1", \
+	"10", "11", "12", "13", "14", "15", "24", "25"
+
+#define _NOLIBC_SYSCALL_STACK_RESERVE
+#define _NOLIBC_SYSCALL_STACK_UNRESERVE
+
+#endif
 
 #define my_syscall0(num)                                                      \
 ({                                                                            \
@@ -44,9 +65,9 @@ 
 	register long _arg4 __asm__ ("a3");                                   \
 									      \
 	__asm__ volatile (                                                    \
-		"addiu $sp, $sp, -32\n"                                       \
+		_NOLIBC_SYSCALL_STACK_RESERVE                                 \
 		"syscall\n"                                                   \
-		"addiu $sp, $sp, 32\n"                                        \
+		_NOLIBC_SYSCALL_STACK_UNRESERVE                               \
 		: "=r"(_num), "=r"(_arg4)                                     \
 		: "r"(_num)                                                   \
 		: _NOLIBC_SYSCALL_CLOBBERLIST                                 \
@@ -61,9 +82,9 @@ 
 	register long _arg4 __asm__ ("a3");                                   \
 									      \
 	__asm__ volatile (                                                    \
-		"addiu $sp, $sp, -32\n"                                       \
+		_NOLIBC_SYSCALL_STACK_RESERVE                                 \
 		"syscall\n"                                                   \
-		"addiu $sp, $sp, 32\n"                                        \
+		_NOLIBC_SYSCALL_STACK_UNRESERVE                               \
 		: "=r"(_num), "=r"(_arg4)                                     \
 		: "0"(_num),                                                  \
 		  "r"(_arg1)                                                  \
@@ -80,9 +101,9 @@ 
 	register long _arg4 __asm__ ("a3");                                   \
 									      \
 	__asm__ volatile (                                                    \
-		"addiu $sp, $sp, -32\n"                                       \
+		_NOLIBC_SYSCALL_STACK_RESERVE                                 \
 		"syscall\n"                                                   \
-		"addiu $sp, $sp, 32\n"                                        \
+		_NOLIBC_SYSCALL_STACK_UNRESERVE                               \
 		: "=r"(_num), "=r"(_arg4)                                     \
 		: "0"(_num),                                                  \
 		  "r"(_arg1), "r"(_arg2)                                      \
@@ -100,9 +121,9 @@ 
 	register long _arg4 __asm__ ("a3");                                   \
 									      \
 	__asm__ volatile (                                                    \
-		"addiu $sp, $sp, -32\n"                                       \
+		_NOLIBC_SYSCALL_STACK_RESERVE                                 \
 		"syscall\n"                                                   \
-		"addiu $sp, $sp, 32\n"                                        \
+		_NOLIBC_SYSCALL_STACK_UNRESERVE                               \
 		: "=r"(_num), "=r"(_arg4)                                     \
 		: "0"(_num),                                                  \
 		  "r"(_arg1), "r"(_arg2), "r"(_arg3)                          \
@@ -120,9 +141,9 @@ 
 	register long _arg4 __asm__ ("a3") = (long)(arg4);                    \
 									      \
 	__asm__ volatile (                                                    \
-		"addiu $sp, $sp, -32\n"                                       \
+		_NOLIBC_SYSCALL_STACK_RESERVE                                 \
 		"syscall\n"                                                   \
-		"addiu $sp, $sp, 32\n"                                        \
+		_NOLIBC_SYSCALL_STACK_UNRESERVE                               \
 		: "=r" (_num), "=r"(_arg4)                                    \
 		: "0"(_num),                                                  \
 		  "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4)              \
@@ -131,6 +152,8 @@ 
 	_arg4 ? -_num : _num;                                                 \
 })
 
+#if defined(_ABIO32)
+
 #define my_syscall5(num, arg1, arg2, arg3, arg4, arg5)                        \
 ({                                                                            \
 	register long _num __asm__ ("v0") = (num);                            \
@@ -141,10 +164,10 @@ 
 	register long _arg5 = (long)(arg5);                                   \
 									      \
 	__asm__ volatile (                                                    \
-		"addiu $sp, $sp, -32\n"                                       \
+		_NOLIBC_SYSCALL_STACK_RESERVE                                 \
 		"sw %7, 16($sp)\n"                                            \
 		"syscall\n"                                                   \
-		"addiu $sp, $sp, 32\n"                                        \
+		_NOLIBC_SYSCALL_STACK_UNRESERVE                               \
 		: "=r" (_num), "=r"(_arg4)                                    \
 		: "0"(_num),                                                  \
 		  "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5)  \
@@ -164,11 +187,11 @@ 
 	register long _arg6 = (long)(arg6);                                   \
 									      \
 	__asm__ volatile (                                                    \
-		"addiu $sp, $sp, -32\n"                                       \
+		_NOLIBC_SYSCALL_STACK_RESERVE                                 \
 		"sw %7, 16($sp)\n"                                            \
 		"sw %8, 20($sp)\n"                                            \
 		"syscall\n"                                                   \
-		"addiu $sp, $sp, 32\n"                                        \
+		_NOLIBC_SYSCALL_STACK_UNRESERVE                               \
 		: "=r" (_num), "=r"(_arg4)                                    \
 		: "0"(_num),                                                  \
 		  "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \
@@ -178,6 +201,50 @@ 
 	_arg4 ? -_num : _num;                                                 \
 })
 
+#else
+
+#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5)                        \
+({                                                                            \
+	register long _num __asm__ ("v0") = (num);                            \
+	register long _arg1 __asm__ ("$4") = (long)(arg1);                    \
+	register long _arg2 __asm__ ("$5") = (long)(arg2);                    \
+	register long _arg3 __asm__ ("$6") = (long)(arg3);                    \
+	register long _arg4 __asm__ ("$7") = (long)(arg4);                    \
+	register long _arg5 __asm__ ("$8") = (long)(arg5);                    \
+									      \
+	__asm__ volatile (                                                    \
+		"syscall\n"                                                   \
+		: "=r" (_num), "=r"(_arg4)                                    \
+		: "0"(_num),                                                  \
+		  "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5)  \
+		: _NOLIBC_SYSCALL_CLOBBERLIST                                 \
+	);                                                                    \
+	_arg4 ? -_num : _num;                                                 \
+})
+
+#define my_syscall6(num, arg1, arg2, arg3, arg4, arg5, arg6)                  \
+({                                                                            \
+	register long _num __asm__ ("v0")  = (num);                           \
+	register long _arg1 __asm__ ("$4") = (long)(arg1);                    \
+	register long _arg2 __asm__ ("$5") = (long)(arg2);                    \
+	register long _arg3 __asm__ ("$6") = (long)(arg3);                    \
+	register long _arg4 __asm__ ("$7") = (long)(arg4);                    \
+	register long _arg5 __asm__ ("$8") = (long)(arg5);                    \
+	register long _arg6 __asm__ ("$9") = (long)(arg6);                    \
+									      \
+	__asm__ volatile (                                                    \
+		"syscall\n"                                                   \
+		: "=r" (_num), "=r"(_arg4)                                    \
+		: "0"(_num),                                                  \
+		  "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \
+		  "r"(_arg6)                                                  \
+		: _NOLIBC_SYSCALL_CLOBBERLIST                                 \
+	);                                                                    \
+	_arg4 ? -_num : _num;                                                 \
+})
+
+#endif
+
 /* startup code, note that it's called __start on MIPS */
 void __start(void);
 void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector __start(void)
@@ -190,13 +257,33 @@  void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector __
 		"1:\n"
 		".cpload $ra\n"
 		"move  $a0, $sp\n"       /* save stack pointer to $a0, as arg1 of _start_c */
+
+#if defined(_ABIO32)
 		"addiu $sp, $sp, -4\n"   /* space for .cprestore to store $gp              */
 		".cprestore 0\n"
 		"li    $t0, -8\n"
 		"and   $sp, $sp, $t0\n"  /* $sp must be 8-byte aligned                     */
 		"addiu $sp, $sp, -16\n"  /* the callee expects to save a0..a3 there        */
-		"lui $t9, %hi(_start_c)\n" /* ABI requires current function address in $t9 */
+#else
+		"daddiu $sp, $sp, -8\n"  /* space for .cprestore to store $gp              */
+		".cpsetup $ra, 0, 1b\n"
+		"li    $t0, -16\n"
+		"and   $sp, $sp, $t0\n"  /* $sp must be 16-byte aligned                    */
+#endif
+
+		/* ABI requires current function address in $t9 */
+#if defined(_ABIO32) || defined(_ABIN32)
+		"lui $t9, %hi(_start_c)\n"
 		"ori $t9, %lo(_start_c)\n"
+#else
+		"lui  $t9, %highest(_start_c)\n"
+		"ori  $t9, %higher(_start_c)\n"
+		"dsll $t9, 0x10\n"
+		"ori  $t9, %hi(_start_c)\n"
+		"dsll $t9, 0x10\n"
+		"ori  $t9, %lo(_start_c)\n"
+#endif
+
 		"jalr $t9\n"             /* transfer to c runtime                          */
 		" nop\n"                 /* delayed slot                                   */
 		".set pop\n"
diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile
index 983985b7529b65b7ce4a00c28f3f915d83974eea..2dec6ab9596c974b6aac439685e17f5c10a76948 100644
--- a/tools/testing/selftests/nolibc/Makefile
+++ b/tools/testing/selftests/nolibc/Makefile
@@ -52,6 +52,10 @@  ARCH_ppc64       = powerpc
 ARCH_ppc64le     = powerpc
 ARCH_mips32le    = mips
 ARCH_mips32be    = mips
+ARCH_mipsn32le   = mips
+ARCH_mipsn32be   = mips
+ARCH_mips64le    = mips
+ARCH_mips64be    = mips
 ARCH_riscv32     = riscv
 ARCH_riscv64     = riscv
 ARCH            := $(or $(ARCH_$(XARCH)),$(XARCH))
@@ -64,6 +68,10 @@  IMAGE_arm64      = arch/arm64/boot/Image
 IMAGE_arm        = arch/arm/boot/zImage
 IMAGE_mips32le   = vmlinuz
 IMAGE_mips32be   = vmlinuz
+IMAGE_mipsn32le  = vmlinuz
+IMAGE_mipsn32be  = vmlinuz
+IMAGE_mips64le   = vmlinuz
+IMAGE_mips64be   = vmlinuz
 IMAGE_ppc        = vmlinux
 IMAGE_ppc64      = vmlinux
 IMAGE_ppc64le    = arch/powerpc/boot/zImage
@@ -83,6 +91,10 @@  DEFCONFIG_arm64      = defconfig
 DEFCONFIG_arm        = multi_v7_defconfig
 DEFCONFIG_mips32le   = malta_defconfig
 DEFCONFIG_mips32be   = malta_defconfig generic/eb.config
+DEFCONFIG_mipsn32le  = malta_defconfig generic/64r2.config
+DEFCONFIG_mipsn32be  = malta_defconfig generic/64r6.config generic/eb.config
+DEFCONFIG_mips64le   = malta_defconfig generic/64r6.config
+DEFCONFIG_mips64be   = malta_defconfig generic/64r2.config generic/eb.config
 DEFCONFIG_ppc        = pmac32_defconfig
 DEFCONFIG_ppc64      = powernv_be_defconfig
 DEFCONFIG_ppc64le    = powernv_defconfig
@@ -105,7 +117,11 @@  QEMU_ARCH_x86        = x86_64
 QEMU_ARCH_arm64      = aarch64
 QEMU_ARCH_arm        = arm
 QEMU_ARCH_mips32le   = mipsel  # works with malta_defconfig
-QEMU_ARCH_mips32be  = mips
+QEMU_ARCH_mips32be   = mips
+QEMU_ARCH_mipsn32le  = mips64el
+QEMU_ARCH_mipsn32be  = mips64
+QEMU_ARCH_mips64le   = mips64el
+QEMU_ARCH_mips64be   = mips64
 QEMU_ARCH_ppc        = ppc
 QEMU_ARCH_ppc64      = ppc64
 QEMU_ARCH_ppc64le    = ppc64
@@ -117,6 +133,8 @@  QEMU_ARCH_loongarch  = loongarch64
 QEMU_ARCH            = $(QEMU_ARCH_$(XARCH))
 
 QEMU_ARCH_USER_ppc64le = ppc64le
+QEMU_ARCH_USER_mipsn32le = mipsn32el
+QEMU_ARCH_USER_mipsn32be = mipsn32
 QEMU_ARCH_USER         = $(or $(QEMU_ARCH_USER_$(XARCH)),$(QEMU_ARCH_$(XARCH)))
 
 QEMU_BIOS_DIR = /usr/share/edk2/
@@ -134,6 +152,10 @@  QEMU_ARGS_arm64      = -M virt -cpu cortex-a53 -append "panic=-1 $(TEST:%=NOLIBC
 QEMU_ARGS_arm        = -M virt -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)"
 QEMU_ARGS_mips32le   = -M malta -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)"
 QEMU_ARGS_mips32be   = -M malta -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)"
+QEMU_ARGS_mipsn32le  = -M malta -cpu 5KEc -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)"
+QEMU_ARGS_mipsn32be  = -M malta -cpu I6400 -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)"
+QEMU_ARGS_mips64le   = -M malta -cpu I6400 -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)"
+QEMU_ARGS_mips64be   = -M malta -cpu 5KEc -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)"
 QEMU_ARGS_ppc        = -M g3beige -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)"
 QEMU_ARGS_ppc64      = -M powernv -append "console=hvc0 panic=-1 $(TEST:%=NOLIBC_TEST=%)"
 QEMU_ARGS_ppc64le    = -M powernv -append "console=hvc0 panic=-1 $(TEST:%=NOLIBC_TEST=%)"
@@ -161,6 +183,10 @@  CFLAGS_ppc64le = -m64 -mlittle-endian -mno-vsx $(call cc-option,-mabi=elfv2)
 CFLAGS_s390 = -m64
 CFLAGS_mips32le = -EL -mabi=32 -fPIC
 CFLAGS_mips32be = -EB -mabi=32
+CFLAGS_mipsn32le = -EL -mabi=n32 -fPIC -march=mips64r2
+CFLAGS_mipsn32be = -EB -mabi=n32 -march=mips64r6
+CFLAGS_mips64le = -EL -mabi=64 -march=mips64r6
+CFLAGS_mips64be = -EB -mabi=64 -march=mips64r2
 CFLAGS_STACKPROTECTOR ?= $(call cc-option,-mstack-protector-guard=global $(call cc-option,-fstack-protector-all))
 CFLAGS  ?= -Os -fno-ident -fno-asynchronous-unwind-tables -std=c89 -W -Wall -Wextra \
 		$(call cc-option,-fno-stack-protector) $(call cc-option,-Wmissing-prototypes) \
diff --git a/tools/testing/selftests/nolibc/run-tests.sh b/tools/testing/selftests/nolibc/run-tests.sh
index 6db01115276888bc89f6ec5532153c37e55c83d3..f0f3890fb5fa8196cd33aa8681ed30b00d8f474e 100755
--- a/tools/testing/selftests/nolibc/run-tests.sh
+++ b/tools/testing/selftests/nolibc/run-tests.sh
@@ -20,7 +20,7 @@  llvm=
 all_archs=(
 	i386 x86_64
 	arm64 arm
-	mips32le mips32be
+	mips32le mips32be mipsn32le mipsn32be mips64le mips64be
 	ppc ppc64 ppc64le
 	riscv32 riscv64
 	s390