diff mbox series

[v3,02/11] tools/nolibc: add new crt.h with _start_c

Message ID ef5b9900a84bdbbc59eb4319e3260a6e29d24f68.1689150149.git.falcon@tinylab.org (mailing list archive)
State New
Headers show
Series tools/nolibc: shrink arch support | expand

Commit Message

Zhangjin Wu July 12, 2023, 9:17 a.m. UTC
As the environ and _auxv support added for nolibc, the assembly _start
function becomes more and more complex and therefore makes the porting
of nolibc to new architectures harder and harder.

To simplify portability, this C version of _start_c() is added to do
most of the assembly start operations in C, which reduces the complexity
a lot and will eventually simplify the porting of nolibc to the new
architectures.

The new _start_c() only requires a stack pointer argument, it will find
argv, envp and _auxv for us, and then call main(), finally, it exit()
with main's return status. With this new _start_c(), the future new
architectures only require to add very few assembly instructions.

As suggested by Thomas, users may use a different signature of main
(e.g. void main(void)), a _nolibc_main alias is added for main to
silence the warning about potential conflicting types.

Suggested-by: Thomas Weißschuh <linux@weissschuh.net>
Link: https://lore.kernel.org/lkml/90fdd255-32f4-4caf-90ff-06456b53dac3@t-8ch.de/
Signed-off-by: Zhangjin Wu <falcon@tinylab.org>
---
 tools/include/nolibc/Makefile |  1 +
 tools/include/nolibc/crt.h    | 59 +++++++++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+)
 create mode 100644 tools/include/nolibc/crt.h

Comments

Thomas Weißschuh July 12, 2023, 8:39 p.m. UTC | #1
On 2023-07-12 17:17:39+0800, Zhangjin Wu wrote:
> As the environ and _auxv support added for nolibc, the assembly _start
> function becomes more and more complex and therefore makes the porting
> of nolibc to new architectures harder and harder.
> 
> To simplify portability, this C version of _start_c() is added to do
> most of the assembly start operations in C, which reduces the complexity
> a lot and will eventually simplify the porting of nolibc to the new
> architectures.
> 
> The new _start_c() only requires a stack pointer argument, it will find
> argv, envp and _auxv for us, and then call main(), finally, it exit()
> with main's return status. With this new _start_c(), the future new
> architectures only require to add very few assembly instructions.
> 
> As suggested by Thomas, users may use a different signature of main
> (e.g. void main(void)), a _nolibc_main alias is added for main to
> silence the warning about potential conflicting types.
> 
> Suggested-by: Thomas Weißschuh <linux@weissschuh.net>
> Link: https://lore.kernel.org/lkml/90fdd255-32f4-4caf-90ff-06456b53dac3@t-8ch.de/
> Signed-off-by: Zhangjin Wu <falcon@tinylab.org>
> ---
>  tools/include/nolibc/Makefile |  1 +
>  tools/include/nolibc/crt.h    | 59 +++++++++++++++++++++++++++++++++++
>  2 files changed, 60 insertions(+)
>  create mode 100644 tools/include/nolibc/crt.h
> 
> diff --git a/tools/include/nolibc/Makefile b/tools/include/nolibc/Makefile
> index 64d67b080744..909b6eb500fe 100644
> --- a/tools/include/nolibc/Makefile
> +++ b/tools/include/nolibc/Makefile
> @@ -27,6 +27,7 @@ nolibc_arch := $(patsubst arm64,aarch64,$(ARCH))
>  arch_file := arch-$(nolibc_arch).h
>  all_files := \
>  		compiler.h \
> +		crt.h \
>  		ctype.h \
>  		errno.h \
>  		nolibc.h \
> diff --git a/tools/include/nolibc/crt.h b/tools/include/nolibc/crt.h
> new file mode 100644
> index 000000000000..f9db2389acd2
> --- /dev/null
> +++ b/tools/include/nolibc/crt.h
> @@ -0,0 +1,59 @@
> +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */
> +/*
> + * C Run Time support for NOLIBC
> + * Copyright (C) 2023 Zhangjin Wu <falcon@tinylab.org>
> + */
> +
> +#ifndef _NOLIBC_CRT_H
> +#define _NOLIBC_CRT_H
> +
> +char **environ __attribute__((weak));
> +const unsigned long *_auxv __attribute__((weak));
> +
> +typedef int (_nolibc_main_fn)(int, char **, char **);

What's the advantage of the typedef over using the pointer type inline?

> +static void exit(int);
> +
> +void _start_c(long *sp)
> +{
> +	int argc, i;
> +	char **argv;
> +	char **envp;
> +	/* silence potential warning: conflicting types for 'main' */
> +	_nolibc_main_fn _nolibc_main __asm__ ("main");

What about the stackprotector initialization?
It would really fit great into this series.

> +
> +	/*
> +	 * sp  :  argc          <-- argument count, required by main()
> +	 * argv:  argv[0]       <-- argument vector, required by main()
> +	 *        argv[1]
> +	 *        ...
> +	 *        argv[argc-1]
> +	 *        null
> +	 * envp:  envp[0]       <-- environment variables, required by main() and getenv()
> +	 *        envp[1]
> +	 *        ...
> +	 *        null
> +	 * _auxv: auxv[0]       <-- auxiliary vector, required by getauxval()
> +	 *        auxv[1]
> +	 *        ...
> +	 *        null
> +	 */
> +
> +	/* assign argc and argv */
> +	argc = sp[0];
> +	argv = (void *)(sp + 1);

Bit of a weird mismatch between array syntax and pointer arithmetic.

> +
> +	/* find envp */
> +	envp = argv + argc + 1;
> +	environ = envp;

Is envp really needed? Could just be assigned directly to environ.

> +
> +	/* find auxv */
> +	i = 0;
> +	while (envp[i])
> +		i++;
> +	_auxv = (void *)(envp + i + 1);

Could be simplified a bit:

_auxv = (void *) envp;
while (_auxv)
	_auxv++;

> +
> +	/* go to application */
> +	exit(_nolibc_main(argc, argv, envp));
> +}
> +
> +#endif /* _NOLIBC_CRT_H */
> -- 
> 2.25.1
>
Zhangjin Wu July 13, 2023, 6:12 a.m. UTC | #2
Hi, Thomas

> On 2023-07-12 17:17:39+0800, Zhangjin Wu wrote:
> > As the environ and _auxv support added for nolibc, the assembly _start
> > function becomes more and more complex and therefore makes the porting
> > of nolibc to new architectures harder and harder.
> > 
> > To simplify portability, this C version of _start_c() is added to do
> > most of the assembly start operations in C, which reduces the complexity
> > a lot and will eventually simplify the porting of nolibc to the new
> > architectures.
> > 
> > The new _start_c() only requires a stack pointer argument, it will find
> > argv, envp and _auxv for us, and then call main(), finally, it exit()
> > with main's return status. With this new _start_c(), the future new
> > architectures only require to add very few assembly instructions.
> > 
> > As suggested by Thomas, users may use a different signature of main
> > (e.g. void main(void)), a _nolibc_main alias is added for main to
> > silence the warning about potential conflicting types.
> > 
> > Suggested-by: Thomas Weißschuh <linux@weissschuh.net>
> > Link: https://lore.kernel.org/lkml/90fdd255-32f4-4caf-90ff-06456b53dac3@t-8ch.de/
> > Signed-off-by: Zhangjin Wu <falcon@tinylab.org>
> > ---
> >  tools/include/nolibc/Makefile |  1 +
> >  tools/include/nolibc/crt.h    | 59 +++++++++++++++++++++++++++++++++++
> >  2 files changed, 60 insertions(+)
> >  create mode 100644 tools/include/nolibc/crt.h
> > 
> > diff --git a/tools/include/nolibc/Makefile b/tools/include/nolibc/Makefile
> > index 64d67b080744..909b6eb500fe 100644
> > --- a/tools/include/nolibc/Makefile
> > +++ b/tools/include/nolibc/Makefile
> > @@ -27,6 +27,7 @@ nolibc_arch := $(patsubst arm64,aarch64,$(ARCH))
> >  arch_file := arch-$(nolibc_arch).h
> >  all_files := \
> >  		compiler.h \
> > +		crt.h \
> >  		ctype.h \
> >  		errno.h \
> >  		nolibc.h \
> > diff --git a/tools/include/nolibc/crt.h b/tools/include/nolibc/crt.h
> > new file mode 100644
> > index 000000000000..f9db2389acd2
> > --- /dev/null
> > +++ b/tools/include/nolibc/crt.h
> > @@ -0,0 +1,59 @@
> > +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */
> > +/*
> > + * C Run Time support for NOLIBC
> > + * Copyright (C) 2023 Zhangjin Wu <falcon@tinylab.org>
> > + */
> > +
> > +#ifndef _NOLIBC_CRT_H
> > +#define _NOLIBC_CRT_H
> > +
> > +char **environ __attribute__((weak));
> > +const unsigned long *_auxv __attribute__((weak));
> > +
> > +typedef int (_nolibc_main_fn)(int, char **, char **);
> 
> What's the advantage of the typedef over using the pointer type inline?
>

With the extra comment added, this is not required anymore, will remove
this typedef.

> > +static void exit(int);
> > +
> > +void _start_c(long *sp)
> > +{
> > +	int argc, i;
> > +	char **argv;
> > +	char **envp;
> > +	/* silence potential warning: conflicting types for 'main' */
> > +	_nolibc_main_fn _nolibc_main __asm__ ("main");
> 
> What about the stackprotector initialization?
> It would really fit great into this series.
>

Ok, which gcc version supports stackprotector? seems the test even skip
on gcc 10, I will find one to verify the code change.

> > +
> > +	/*
> > +	 * sp  :  argc          <-- argument count, required by main()
> > +	 * argv:  argv[0]       <-- argument vector, required by main()
> > +	 *        argv[1]
> > +	 *        ...
> > +	 *        argv[argc-1]
> > +	 *        null
> > +	 * envp:  envp[0]       <-- environment variables, required by main() and getenv()
> > +	 *        envp[1]
> > +	 *        ...
> > +	 *        null
> > +	 * _auxv: auxv[0]       <-- auxiliary vector, required by getauxval()
> > +	 *        auxv[1]
> > +	 *        ...
> > +	 *        null
> > +	 */
> > +
> > +	/* assign argc and argv */
> > +	argc = sp[0];
> > +	argv = (void *)(sp + 1);
> 
> Bit of a weird mismatch between array syntax and pointer arithmetic.
>

This 'argc = *sp;' may be better ;-)

> > +
> > +	/* find envp */
> > +	envp = argv + argc + 1;
> > +	environ = envp;
> 
> Is envp really needed? Could just be assigned directly to environ.
>

Ok, let's save one variable, envp is used to be consistent with the
frequenty declaration of main().

> > +
> > +	/* find auxv */
> > +	i = 0;
> > +	while (envp[i])
> > +		i++;
> > +	_auxv = (void *)(envp + i + 1);
> 
> Could be simplified a bit:
> 
> _auxv = (void *) envp;
> while (_auxv)
> 	_auxv++;
>

Yeah, it is better, but needs a little change.

    _auxv = (void *) envp;
    while (*_auxv)
	_auxv++;
    _auxv++;

Thanks,
Zhangjin

> > +
> > +	/* go to application */
> > +	exit(_nolibc_main(argc, argv, envp));
> > +}
> > +
> > +#endif /* _NOLIBC_CRT_H */
> > -- 
> > 2.25.1
> >
Zhangjin Wu July 13, 2023, 6:23 a.m. UTC | #3
Hi, Thomas

> > On 2023-07-12 17:17:39+0800, Zhangjin Wu wrote:
> [...]
> > > +static void exit(int);
> > > +
> > > +void _start_c(long *sp)
> > > +{
> > > +	int argc, i;
> > > +	char **argv;
> > > +	char **envp;
> > > +	/* silence potential warning: conflicting types for 'main' */
> > > +	_nolibc_main_fn _nolibc_main __asm__ ("main");
> > 
> > What about the stackprotector initialization?
> > It would really fit great into this series.
> >
> 
> Ok, which gcc version supports stackprotector? seems the test even skip
> on gcc 10, I will find one to verify the code change.
>

Thomas, please ignore this question, I forgot some options in my own script.

Best regards,
Zhangjin

> > > +
> > > +	/*
> > > +	 * sp  :  argc          <-- argument count, required by main()
> > > +	 * argv:  argv[0]       <-- argument vector, required by main()
> > > +	 *        argv[1]
> > > +	 *        ...
> > > +	 *        argv[argc-1]
> > > +	 *        null
> > > +	 * envp:  envp[0]       <-- environment variables, required by main() and getenv()
> > > +	 *        envp[1]
> > > +	 *        ...
> > > +	 *        null
> > > +	 * _auxv: auxv[0]       <-- auxiliary vector, required by getauxval()
> > > +	 *        auxv[1]
> > > +	 *        ...
> > > +	 *        null
> > > +	 */
> > > +
> > > +	/* assign argc and argv */
> > > +	argc = sp[0];
> > > +	argv = (void *)(sp + 1);
> > 
> > Bit of a weird mismatch between array syntax and pointer arithmetic.
> >
Thomas Weißschuh July 13, 2023, 6:24 a.m. UTC | #4
On 2023-07-13 14:12:27+0800, Zhangjin Wu wrote:
> > On 2023-07-12 17:17:39+0800, Zhangjin Wu wrote:

> [..]

> > > diff --git a/tools/include/nolibc/crt.h b/tools/include/nolibc/crt.h
> > > new file mode 100644
> > > index 000000000000..f9db2389acd2
> > > --- /dev/null
> > > +++ b/tools/include/nolibc/crt.h
> > > @@ -0,0 +1,59 @@
> > > +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */
> > > +/*
> > > + * C Run Time support for NOLIBC
> > > + * Copyright (C) 2023 Zhangjin Wu <falcon@tinylab.org>
> > > + */
> > > +
> > > +#ifndef _NOLIBC_CRT_H
> > > +#define _NOLIBC_CRT_H
> > > +
> > > +char **environ __attribute__((weak));
> > > +const unsigned long *_auxv __attribute__((weak));
> > > +
> > > +typedef int (_nolibc_main_fn)(int, char **, char **);
> > 
> > What's the advantage of the typedef over using the pointer type inline?
> >
> 
> With the extra comment added, this is not required anymore, will remove
> this typedef.
> 
> > > +static void exit(int);
> > > +
> > > +void _start_c(long *sp)
> > > +{
> > > +	int argc, i;
> > > +	char **argv;
> > > +	char **envp;
> > > +	/* silence potential warning: conflicting types for 'main' */
> > > +	_nolibc_main_fn _nolibc_main __asm__ ("main");
> > 
> > What about the stackprotector initialization?
> > It would really fit great into this series.
> >
> 
> Ok, which gcc version supports stackprotector? seems the test even skip
> on gcc 10, I will find one to verify the code change.

For crosscompilation I use the crosstools from kernel.org at
https://mirrors.edge.kernel.org/pub/tools/crosstool/

It seems to be at least in their gcc 9.5.0.
And also in their 11.2 which I use mostly at the moment.

> > > +	/*
> > > +	 * sp  :  argc          <-- argument count, required by main()
> > > +	 * argv:  argv[0]       <-- argument vector, required by main()
> > > +	 *        argv[1]
> > > +	 *        ...
> > > +	 *        argv[argc-1]
> > > +	 *        null
> > > +	 * envp:  envp[0]       <-- environment variables, required by main() and getenv()
> > > +	 *        envp[1]
> > > +	 *        ...
> > > +	 *        null
> > > +	 * _auxv: auxv[0]       <-- auxiliary vector, required by getauxval()
> > > +	 *        auxv[1]
> > > +	 *        ...
> > > +	 *        null
> > > +	 */
> > > +
> > > +	/* assign argc and argv */
> > > +	argc = sp[0];
> > > +	argv = (void *)(sp + 1);
> > 
> > Bit of a weird mismatch between array syntax and pointer arithmetic.
> >
> 
> This 'argc = *sp;' may be better ;-)
> 
> > > +
> > > +	/* find envp */
> > > +	envp = argv + argc + 1;
> > > +	environ = envp;
> > 
> > Is envp really needed? Could just be assigned directly to environ.
> >
> 
> Ok, let's save one variable, envp is used to be consistent with the
> frequenty declaration of main().
> 
> > > +
> > > +	/* find auxv */
> > > +	i = 0;
> > > +	while (envp[i])
> > > +		i++;
> > > +	_auxv = (void *)(envp + i + 1);
> > 
> > Could be simplified a bit:
> > 
> > _auxv = (void *) envp;
> > while (_auxv)
> > 	_auxv++;
> >
> 
> Yeah, it is better, but needs a little change.
> 
>     _auxv = (void *) envp;
>     while (*_auxv)
> 	_auxv++;
>     _auxv++;

Good catch, thanks.

> [..]

Thomas
Willy Tarreau July 13, 2023, 6:34 a.m. UTC | #5
Hi Zhangjin,

I haven't reviewed the rest yet but regarding this point:

On Thu, Jul 13, 2023 at 02:12:27PM +0800, Zhangjin Wu wrote:
> > > +	/* find auxv */
> > > +	i = 0;
> > > +	while (envp[i])
> > > +		i++;
> > > +	_auxv = (void *)(envp + i + 1);
> > 
> > Could be simplified a bit:
> > 
> > _auxv = (void *) envp;
> > while (_auxv)
> > 	_auxv++;
> >
> 
> Yeah, it is better, but needs a little change.
> 
>     _auxv = (void *) envp;
>     while (*_auxv)
> 	_auxv++;
>     _auxv++;

Or just:

    _auxv = (void*)environ;
    while (*_auxv++)
         ;

or:

    for (_auxv = (void*)environ; *_auxv++; )
         ;

Please also have a look at the output code, because at low optimization
levels, compilers sometimes produce a better code with a local variable
than with a global variable in a loop. Thus I wouldn't be that much
surprised if at -O0 or -O1 you'd see slightly more compact code using:

       /* find envp */
       environ = argv + argc + 1;
        
       /* find auxv */
       for (auxv = environ; *auxv++)
                ;
       _auxv = auxv;

than:
       /* find envp */
       envp = argv + argc + 1;
       environ = envp;
        
       /* find auxv */
       for (_auxv = environ; *_auxv++)
                ;

Since it's going to become generic code, it's worth running a few tests
to see how to best polish it.

Thanks,
Willy
Zhangjin Wu July 14, 2023, 5:58 a.m. UTC | #6
Hi, Willy, Thomas

> Hi Zhangjin,
>
> I haven't reviewed the rest yet but regarding this point:
>
> On Thu, Jul 13, 2023 at 02:12:27PM +0800, Zhangjin Wu wrote:
> > > > +	/* find auxv */
> > > > +	i = 0;
> > > > +	while (envp[i])
> > > > +		i++;
> > > > +	_auxv = (void *)(envp + i + 1);
> > >
> > > Could be simplified a bit:
> > >
> > > _auxv = (void *) envp;
> > > while (_auxv)
> > > 	_auxv++;
> > >
> >
> > Yeah, it is better, but needs a little change.
> >
> >     _auxv = (void *) envp;
> >     while (*_auxv)
> > 	_auxv++;
> >     _auxv++;
>
> Or just:
>
>     _auxv = (void*)environ;
>     while (*_auxv++)
>          ;
>
> or:
>
>     for (_auxv = (void*)environ; *_auxv++; )
>          ;
>
> Please also have a look at the output code, because at low optimization
> levels, compilers sometimes produce a better code with a local variable
> than with a global variable in a loop. Thus I wouldn't be that much
> surprised if at -O0 or -O1 you'd see slightly more compact code using:
>

Very good suggestion, I did find some interesting results, after
removing some local variables, although the text size shrinks, but the
total size is the same with -O0/1/2/3/s.

Here is the diff with -O0/1/2/3/s (a REPORT_SIZE macro may be required for
this):

    $ diff -Nubr startup.old.txt startup.new.txt
    --- startup.old.txt	2023-07-14 10:22:45.990413661 +0800
    +++ startup.new.txt	2023-07-14 10:22:52.278869146 +0800
    @@ -2,34 +2,34 @@
     sudo strip -s nolibc-test
     size nolibc-test
        text    data     bss     dec     hex filename
    -  46872      88      80   47040    b7c0 nolibc-test
    +  46836      88      80   47004    b79c nolibc-test
     wc -c nolibc-test
     58016 nolibc-test
      O1:
     sudo strip -s nolibc-test
     size nolibc-test
        text    data     bss     dec     hex filename
    -  27298      88      72   27458    6b42 nolibc-test
    +  27287      88      72   27447    6b37 nolibc-test
     wc -c nolibc-test
     37536 nolibc-test
      O2:
     sudo strip -s nolibc-test
     size nolibc-test
        text    data     bss     dec     hex filename
    -  25688      88      72   25848    64f8 nolibc-test
    +  25672      88      72   25832    64e8 nolibc-test
     wc -c nolibc-test
     37536 nolibc-test
      O3:
     sudo strip -s nolibc-test
     size nolibc-test
        text    data     bss     dec     hex filename
    -  44020      88      72   44180    ac94 nolibc-test
    +  44004      88      72   44164    ac84 nolibc-test
     wc -c nolibc-test
     53920 nolibc-test
      Os:
     sudo strip -s nolibc-test
     size nolibc-test
        text    data     bss     dec     hex filename
    -  20887      88      72   21047    5237 nolibc-test
    +  20884      88      72   21044    5234 nolibc-test
     wc -c nolibc-test
     33440 nolibc-test

The code with local variables:

    void _start_c(long *sp)
    {
    	int argc, i;
    	char **argv;
    	char **envp;
    	/* silence potential warning: conflicting types for 'main' */
    	int _nolibc_main(int, char **, char **) __asm__ ("main");
    
    	/*
    	 * sp  :    argc          <-- argument count, required by main()
    	 * argv:    argv[0]       <-- argument vector, required by main()
    	 *          argv[1]
    	 *          ...
    	 *          argv[argc-1]
    	 *          null
    	 * environ: environ[0]    <-- environment variables, required by main() and getenv()
    	 *          environ[1]
    	 *          ...
    	 *          null
    	 * _auxv:   _auxv[0]      <-- auxiliary vector, required by getauxval()
    	 *          _auxv[1]
    	 *          ...
    	 *          null
    	 */
    
    	/* assign argc and argv */
    	argc = *sp;
    	argv = (void *)(sp + 1);
    
    	/* find environ */
    	environ = envp = argv + argc + 1;
    
    	/* find _auxv */
    	for (i = 0; *(envp + i); i++)
    		;
    	_auxv = (void *)(envp + i + 1);
    
    	/* go to application */
    	exit(_nolibc_main(argc, argv, envp));
    }

The code with global variables:

    void _start_c(long *sp)
    {
    	int argc;
    	char **argv;
    	/* silence potential warning: conflicting types for 'main' */
    	int _nolibc_main(int, char **, char **) __asm__ ("main");

    	/*
    	 * sp  :    argc          <-- argument count, required by main()
    	 * argv:    argv[0]       <-- argument vector, required by main()
    	 *          argv[1]
    	 *          ...
    	 *          argv[argc-1]
    	 *          null
    	 * environ: environ[0]    <-- environment variables, required by main() and getenv()
    	 *          environ[1]
    	 *          ...
    	 *          null
    	 * _auxv:   _auxv[0]      <-- auxiliary vector, required by getauxval()
    	 *          _auxv[1]
    	 *          ...
    	 *          null
    	 */

    	/* assign argc and argv */
    	argc = *sp;
    	argv = (void *)(sp + 1);

    	/* find environ */
    	environ = argv + argc + 1;

    	/* find _auxv */
    	for (_auxv = (void *)environ; *_auxv++;)
    		;

    	/* go to application */
    	exit(_nolibc_main(argc, argv, environ));
    }

Which one do you prefer? the one with local variables may be more readable (not
that much), the one with global variables has smaller text size (and therefore
smaller memory footprint).

BTW, just found an arch-<ARCH>.h bug with -O0, seems the
'optimize("omit-frame-pointer")' attribute not really work as expected with
-O0. It uses frame pointer for _start eventually and breaks the stack pointer
variable (a push 'rbp' inserted at the begging of _start, so, the real rsp has
an offset), so, therefore, it is not able to get the right argc, argv, environ
and _auxv with -O0 currently. A solution is reverting _start to pure assembly.

For the above tests, I manually reverted the arch-x86_64.h to:

    __asm__(
            ".text\n"
            ".weak _start\n"
            "_start:\n"
    #ifdef _NOLIBC_STACKPROTECTOR
            "call __stack_chk_init\n" /* initialize stack protector                      */
    #endif
            "xor  %ebp, %ebp\n"       /* zero the stack frame                            */
            "mov  %rsp, %rdi\n"       /* save stack pointer to %rdi, as arg1 of _start_c */
            "and  $-16, %rsp\n"       /* %rsp must be 16-byte aligned before call        */
            "call _start_c\n"         /* transfer to c runtime                           */
            "hlt\n"                   /* ensure it does not return                       */
    );


'man gcc' shows:

    Most optimizations are completely disabled at -O0 or if an -O level is not set on the command line, even if individual optimization flags are specified.

To want -O0 work again, since now we have C _start_c, is it ok for us to revert
the commit 7f8548589661 ("tools/nolibc: make compiler and assembler agree on
the section around _start") and the later __no_stack_protector changes?

At the same time, to verify such issues, as Thomas suggested, in this series,
we may need to add more startup tests to verify argc, argv, environ, _auxv, do
we need to add a standalone run_startup (or run_crt) test entry just like
run_syscall? or, let's simply add more in the run_stdlib, like the environ test
added by Thomas.  seems, the argc test is necessary one currently missing (argc
>= 1):

    CASE_TEST(argc);               EXPECT_GE(1, test_argc, 1); break;

As we summarized before, the related test cases are:

argv0:

    CASE_TEST(chmod_argv0);       EXPECT_SYSZR(1, chmod(test_argv0, 0555)); break;
    CASE_TEST(chroot_exe);        EXPECT_SYSER(1, chroot(test_argv0), -1, ENOTDIR); break;

environ:

    CASE_TEST(chdir_root);        EXPECT_SYSZR(1, chdir("/")); chdir(getenv("PWD")); break;
    CASE_TEST(environ);            EXPECT_PTREQ(1, environ, test_envp); break;
    CASE_TEST(getenv_TERM);        EXPECT_STRNZ(1, getenv("TERM")); break;
    CASE_TEST(getenv_blah);        EXPECT_STRZR(1, getenv("blah")); break;

auxv:

    CASE_TEST(getpagesize);       EXPECT_SYSZR(1, test_getpagesize()); break;

The above tests are in different test group and are not aimed to startup test,
we'd better add a run_startup (or run_crt) test group before any other tests,
it is a requiremnt of the others, we at least have these ones:

    +int run_startup(int min, int max)
    +{
    +       int test;
    +       int tmp;
    +       int ret = 0;
    +
    +       for (test = min; test >= 0 && test <= max; test++) {
    +               int llen = 0; /* line length */
    +
    +               /* avoid leaving empty lines below, this will insert holes into
    +                * test numbers.
    +                */
    +               switch (test + __LINE__ + 1) {
    +               CASE_TEST(argc);               EXPECT_GE(1, test_argc, 1); break;
    +               CASE_TEST(argv_addr);          EXPECT_PTRNZ(1, test_argv); break;
    +               CASE_TEST(argv_total);         EXPECT_EQ(1, environ - test_argv - 1, test_argc); break;
    +               CASE_TEST(argv0_addr);         EXPECT_PTRNZ(1, argv0); break;
    +               CASE_TEST(argv0_str);          EXPECT_STRNZ(1, argv0); break;
    +               CASE_TEST(argv0_len);          EXPECT_GE(1, strlen(argv0), 1); break;
    +               CASE_TEST(environ_addr);       EXPECT_PTRNZ(1, environ); break;
    +               CASE_TEST(environ_envp);       EXPECT_PTREQ(1, environ, test_envp); break;
    +               CASE_TEST(environ_total);      EXPECT_GE(1, (void *)_auxv - (void *)environ - 1, 1); break;
    +               CASE_TEST(_auxv_addr);         EXPECT_PTRNZ(1, _auxv); break;
    +               case __LINE__:
    +                       return ret; /* must be last */
    +               /* note: do not set any defaults so as to permit holes above */
    +               }
    +       }
    +       return ret;
    +}

Any more?

>        /* find envp */
>        environ = argv + argc + 1;
>
>        /* find auxv */
>        for (auxv = environ; *auxv++)
>                 ;
>        _auxv = auxv;
>
> than:
>        /* find envp */
>        envp = argv + argc + 1;
>        environ = envp;
>
>        /* find auxv */
>        for (_auxv = environ; *_auxv++)
>                 ;
>

Great, `*_auxv++` is a very good trick to avoid duplicated _auxv++, although it
is a little hard to read (not that hard in reality) ;-)

> Since it's going to become generic code, it's worth running a few tests
> to see how to best polish it.
>

Yes, I focused on the big shrinking itself but forgot to polish the _start_c
itself ;-)

Best regards,
Zhangjin

> Thanks,
> Willy
Thomas Weißschuh July 14, 2023, 6:34 a.m. UTC | #7
On 2023-07-14 13:58:13+0800, Zhangjin Wu wrote:

> [..]

> Which one do you prefer? the one with local variables may be more readable (not
> that much), the one with global variables has smaller text size (and therefore
> smaller memory footprint).

The one with local variables. But not by much.

> BTW, just found an arch-<ARCH>.h bug with -O0, seems the
> 'optimize("omit-frame-pointer")' attribute not really work as expected with
> -O0. It uses frame pointer for _start eventually and breaks the stack pointer
> variable (a push 'rbp' inserted at the begging of _start, so, the real rsp has
> an offset), so, therefore, it is not able to get the right argc, argv, environ
> and _auxv with -O0 currently. A solution is reverting _start to pure assembly.
> 
> For the above tests, I manually reverted the arch-x86_64.h to:
> 
>     __asm__(
>             ".text\n"
>             ".weak _start\n"
>             "_start:\n"
>     #ifdef _NOLIBC_STACKPROTECTOR
>             "call __stack_chk_init\n" /* initialize stack protector                      */
>     #endif
>             "xor  %ebp, %ebp\n"       /* zero the stack frame                            */
>             "mov  %rsp, %rdi\n"       /* save stack pointer to %rdi, as arg1 of _start_c */
>             "and  $-16, %rsp\n"       /* %rsp must be 16-byte aligned before call        */
>             "call _start_c\n"         /* transfer to c runtime                           */
>             "hlt\n"                   /* ensure it does not return                       */
>     );
> 
> 
> 'man gcc' shows:
> 
>     Most optimizations are completely disabled at -O0 or if an -O level is not set on the command line, even if individual optimization flags are specified.
> 
> To want -O0 work again, since now we have C _start_c, is it ok for us to revert
> the commit 7f8548589661 ("tools/nolibc: make compiler and assembler agree on
> the section around _start") and the later __no_stack_protector changes?

This commit explicitly mentions being tested with -O0 on x86_64.
I was also not able to reproduce the issue.

Before doing any reverts I think some more investigation is in order.
Can you provide exact reproduction steps?

> At the same time, to verify such issues, as Thomas suggested, in this series,
> we may need to add more startup tests to verify argc, argv, environ, _auxv, do
> we need to add a standalone run_startup (or run_crt) test entry just like
> run_syscall? or, let's simply add more in the run_stdlib, like the environ test
> added by Thomas.  seems, the argc test is necessary one currently missing (argc
> >= 1):
> 
>     CASE_TEST(argc);               EXPECT_GE(1, test_argc, 1); break;
> 
> As we summarized before, the related test cases are:
> 
> argv0:
> 
>     CASE_TEST(chmod_argv0);       EXPECT_SYSZR(1, chmod(test_argv0, 0555)); break;
>     CASE_TEST(chroot_exe);        EXPECT_SYSER(1, chroot(test_argv0), -1, ENOTDIR); break;
> 
> environ:
> 
>     CASE_TEST(chdir_root);        EXPECT_SYSZR(1, chdir("/")); chdir(getenv("PWD")); break;
>     CASE_TEST(environ);            EXPECT_PTREQ(1, environ, test_envp); break;
>     CASE_TEST(getenv_TERM);        EXPECT_STRNZ(1, getenv("TERM")); break;
>     CASE_TEST(getenv_blah);        EXPECT_STRZR(1, getenv("blah")); break;
> 
> auxv:
> 
>     CASE_TEST(getpagesize);       EXPECT_SYSZR(1, test_getpagesize()); break;
> 
> The above tests are in different test group and are not aimed to startup test,
> we'd better add a run_startup (or run_crt) test group before any other tests,
> it is a requiremnt of the others, we at least have these ones:
> 
>     +int run_startup(int min, int max)
>     +{
>     +       int test;
>     +       int tmp;
>     +       int ret = 0;
>     +
>     +       for (test = min; test >= 0 && test <= max; test++) {
>     +               int llen = 0; /* line length */
>     +
>     +               /* avoid leaving empty lines below, this will insert holes into
>     +                * test numbers.
>     +                */
>     +               switch (test + __LINE__ + 1) {
>     +               CASE_TEST(argc);               EXPECT_GE(1, test_argc, 1); break;
>     +               CASE_TEST(argv_addr);          EXPECT_PTRNZ(1, test_argv); break;
>     +               CASE_TEST(argv_total);         EXPECT_EQ(1, environ - test_argv - 1, test_argc); break;
>     +               CASE_TEST(argv0_addr);         EXPECT_PTRNZ(1, argv0); break;
>     +               CASE_TEST(argv0_str);          EXPECT_STRNZ(1, argv0); break;
>     +               CASE_TEST(argv0_len);          EXPECT_GE(1, strlen(argv0), 1); break;
>     +               CASE_TEST(environ_addr);       EXPECT_PTRNZ(1, environ); break;
>     +               CASE_TEST(environ_envp);       EXPECT_PTREQ(1, environ, test_envp); break;
>     +               CASE_TEST(environ_total);      EXPECT_GE(1, (void *)_auxv - (void *)environ - 1, 1); break;
>     +               CASE_TEST(_auxv_addr);         EXPECT_PTRNZ(1, _auxv); break;
>     +               case __LINE__:
>     +                       return ret; /* must be last */
>     +               /* note: do not set any defaults so as to permit holes above */
>     +               }
>     +       }
>     +       return ret;
>     +}
> 
> Any more?

My original idea was to have tests that exec /proc/self/exe or argv0.
This way we can actually pass and validate arbitrary argc, argv and
environ values.

But looking at your list, that should be enough.

> [..]
Zhangjin Wu July 14, 2023, 9:47 a.m. UTC | #8
> On 2023-07-14 13:58:13+0800, Zhangjin Wu wrote:
>
> > [..]
>
> > Which one do you prefer? the one with local variables may be more readable (not
> > that much), the one with global variables has smaller text size (and therefore
> > smaller memory footprint).
>
> The one with local variables. But not by much.
>
> > BTW, just found an arch-<ARCH>.h bug with -O0, seems the
> > 'optimize("omit-frame-pointer")' attribute not really work as expected with
> > -O0. It uses frame pointer for _start eventually and breaks the stack pointer
> > variable (a push 'rbp' inserted at the begging of _start, so, the real rsp has
> > an offset), so, therefore, it is not able to get the right argc, argv, environ
> > and _auxv with -O0 currently. A solution is reverting _start to pure assembly.
> >
> > For the above tests, I manually reverted the arch-x86_64.h to:
> >
> >     __asm__(
> >             ".text\n"
> >             ".weak _start\n"
> >             "_start:\n"
> >     #ifdef _NOLIBC_STACKPROTECTOR
> >             "call __stack_chk_init\n" /* initialize stack protector                      */
> >     #endif
> >             "xor  %ebp, %ebp\n"       /* zero the stack frame                            */
> >             "mov  %rsp, %rdi\n"       /* save stack pointer to %rdi, as arg1 of _start_c */
> >             "and  $-16, %rsp\n"       /* %rsp must be 16-byte aligned before call        */
> >             "call _start_c\n"         /* transfer to c runtime                           */
> >             "hlt\n"                   /* ensure it does not return                       */
> >     );
> >
> >
> > 'man gcc' shows:
> >
> >     Most optimizations are completely disabled at -O0 or if an -O level is not set on the command line, even if individual optimization flags are specified.
> >
> > To want -O0 work again, since now we have C _start_c, is it ok for us to revert
> > the commit 7f8548589661 ("tools/nolibc: make compiler and assembler agree on
> > the section around _start") and the later __no_stack_protector changes?
>
> This commit explicitly mentions being tested with -O0 on x86_64.

Yeah, I was worried about that the old tests didn't use any of the startup
variables, but the getpagesize test may be a very old test, It uses _auxv and
should fail If -O0 not work.

> I was also not able to reproduce the issue.
>

Thanks very much for your 'reproduce' result, It is so weird, just
rechecked the toolchain, 13.1.0 from https://mirrors.edge.kernel.org/ is
ok, gcc 9, gcc 10.3 not work.

But even in the page of 13.1.0 [1], we still see this line:

    Most optimizations are completely disabled at -O0 or if an -O level is not set on the command line, even if individual optimization flags are specified.

Not sure if "individual optimization flags" also means the optimize()
flags in gcc attributes. or the doc is not updated yet?

And further found gcc 11.1.0 is ok, gcc 10.4 still not work, so, gcc
11.1.0 may changed something to let the "individual optimization flags"
work with -O0.

We may need to at least document this issue in some files, -O0 is not such a
frequently-used option, not sure if we still need -O0 work with the older gcc <
11.1.0 ;-)

Willy, I'm not sure if the issues solved by the commit 7f8548589661
("tools/nolibc: make compiler and assembler agree on the section around
_start") still exist after we using _start_c()?

Thomas, because we plan to move the stackprotector init to _start_c(), If using
pure assembly _start, we may also not need the __no_stack_protector macro too?

Welcome more discussion. 

[1]: https://gcc.gnu.org/onlinedocs/gcc-13.1.0/gcc/Optimize-Options.html

> Before doing any reverts I think some more investigation is in order.
> Can you provide exact reproduction steps?
>

some startup variables related tests failed with gcc 9 and gcc 10 (even with
gcc 10.4.0):

    $ x86_64-linux-gcc --version
    x86_64-linux-gcc (GCC) 10.4.0
    Copyright (C) 2020 Free Software Foundation, Inc.
    This is free software; see the source for copying conditions.  There is NO
    warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

    $ make run-user CROSS_COMPILE=x86_64-linux-
    0 argc = 0                                                      [FAIL]
    3 argv_total = 0                                                [FAIL]
    4 argv0_addr = <0x1>                                            [FAIL]
    5 argv0_str = <(null)>                                          [FAIL]
    6 argv0_len = 0                                                 [FAIL]
    14 chmod_argv0 = -1 EFAULT                                      [FAIL]
    19 chroot_exe = -1 EFAULT  != (-1 ENOTDIR)                      [FAIL]
    0 getenv_TERM = <(null)>                                        [FAIL]

> > At the same time, to verify such issues, as Thomas suggested, in this series,
> > we may need to add more startup tests to verify argc, argv, environ, _auxv, do
> > we need to add a standalone run_startup (or run_crt) test entry just like
> > run_syscall? or, let's simply add more in the run_stdlib, like the environ test
> > added by Thomas.  seems, the argc test is necessary one currently missing (argc
> > >= 1):
> >
> >     CASE_TEST(argc);               EXPECT_GE(1, test_argc, 1); break;
> >
> > As we summarized before, the related test cases are:
> >
> > argv0:
> >
> >     CASE_TEST(chmod_argv0);       EXPECT_SYSZR(1, chmod(test_argv0, 0555)); break;
> >     CASE_TEST(chroot_exe);        EXPECT_SYSER(1, chroot(test_argv0), -1, ENOTDIR); break;
> >
> > environ:
> >
> >     CASE_TEST(chdir_root);        EXPECT_SYSZR(1, chdir("/")); chdir(getenv("PWD")); break;
> >     CASE_TEST(environ);            EXPECT_PTREQ(1, environ, test_envp); break;
> >     CASE_TEST(getenv_TERM);        EXPECT_STRNZ(1, getenv("TERM")); break;
> >     CASE_TEST(getenv_blah);        EXPECT_STRZR(1, getenv("blah")); break;
> >
> > auxv:
> >
> >     CASE_TEST(getpagesize);       EXPECT_SYSZR(1, test_getpagesize()); break;
> >
> > The above tests are in different test group and are not aimed to startup test,
> > we'd better add a run_startup (or run_crt) test group before any other tests,
> > it is a requiremnt of the others, we at least have these ones:
> >
> >     +int run_startup(int min, int max)
> >     +{
> >     +       int test;
> >     +       int tmp;
> >     +       int ret = 0;
> >     +
> >     +       for (test = min; test >= 0 && test <= max; test++) {
> >     +               int llen = 0; /* line length */
> >     +
> >     +               /* avoid leaving empty lines below, this will insert holes into
> >     +                * test numbers.
> >     +                */
> >     +               switch (test + __LINE__ + 1) {
> >     +               CASE_TEST(argc);               EXPECT_GE(1, test_argc, 1); break;
> >     +               CASE_TEST(argv_addr);          EXPECT_PTRNZ(1, test_argv); break;
> >     +               CASE_TEST(argv_total);         EXPECT_EQ(1, environ - test_argv - 1, test_argc); break;
> >     +               CASE_TEST(argv0_addr);         EXPECT_PTRNZ(1, argv0); break;
> >     +               CASE_TEST(argv0_str);          EXPECT_STRNZ(1, argv0); break;
> >     +               CASE_TEST(argv0_len);          EXPECT_GE(1, strlen(argv0), 1); break;
> >     +               CASE_TEST(environ_addr);       EXPECT_PTRNZ(1, environ); break;
> >     +               CASE_TEST(environ_envp);       EXPECT_PTREQ(1, environ, test_envp); break;
> >     +               CASE_TEST(environ_total);      EXPECT_GE(1, (void *)_auxv - (void *)environ - 1, 1); break;
> >     +               CASE_TEST(_auxv_addr);         EXPECT_PTRNZ(1, _auxv); break;
> >     +               case __LINE__:
> >     +                       return ret; /* must be last */
> >     +               /* note: do not set any defaults so as to permit holes above */
> >     +               }
> >     +       }
> >     +       return ret;
> >     +}
> >
> > Any more?
>
> My original idea was to have tests that exec /proc/self/exe or argv0.
> This way we can actually pass and validate arbitrary argc, argv and
> environ values.
>
> But looking at your list, that should be enough.
>

Ok.

Best regards,
Zhangjin

> > [..]
Thomas Weißschuh July 15, 2023, 7:24 a.m. UTC | #9
On 2023-07-14 17:47:23+0800, Zhangjin Wu wrote:
> > On 2023-07-14 13:58:13+0800, Zhangjin Wu wrote:

> [..]

> > I was also not able to reproduce the issue.
> >
> 
> Thanks very much for your 'reproduce' result, It is so weird, just
> rechecked the toolchain, 13.1.0 from https://mirrors.edge.kernel.org/ is
> ok, gcc 9, gcc 10.3 not work.
> 
> But even in the page of 13.1.0 [1], we still see this line:
> 
>     Most optimizations are completely disabled at -O0 or if an -O level is not set on the command line, even if individual optimization flags are specified.
> 
> Not sure if "individual optimization flags" also means the optimize()
> flags in gcc attributes. or the doc is not updated yet?
> 
> And further found gcc 11.1.0 is ok, gcc 10.4 still not work, so, gcc
> 11.1.0 may changed something to let the "individual optimization flags"
> work with -O0.
> 
> We may need to at least document this issue in some files, -O0 is not such a
> frequently-used option, not sure if we still need -O0 work with the older gcc <
> 11.1.0 ;-)

It seems we can avoid the issue by enforcing optimizations for _start:

diff --git a/tools/include/nolibc/arch-x86_64.h b/tools/include/nolibc/arch-x86_64.h
index f5614a67f05a..b9d8b8861dc4 100644
--- a/tools/include/nolibc/arch-x86_64.h
+++ b/tools/include/nolibc/arch-x86_64.h
@@ -161,12 +161,9 @@
  * 2) The deepest stack frame should be zero (the %rbp).
  *
  */
-void __attribute__((weak, noreturn, optimize("omit-frame-pointer"))) __no_stack_protector _start(void)
+void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_stack_protector _start(void)

> 
> Willy, I'm not sure if the issues solved by the commit 7f8548589661
> ("tools/nolibc: make compiler and assembler agree on the section around
> _start") still exist after we using _start_c()?
> 
> Thomas, because we plan to move the stackprotector init to _start_c(), If using
> pure assembly _start, we may also not need the __no_stack_protector macro too?

It would probably not needed anymore in this case.

Thomas
Zhangjin Wu July 15, 2023, 9:23 a.m. UTC | #10
> On 2023-07-14 17:47:23+0800, Zhangjin Wu wrote:
> > > On 2023-07-14 13:58:13+0800, Zhangjin Wu wrote:
> 
> > [..]
> 
> > > I was also not able to reproduce the issue.
> > >
> > 
> > Thanks very much for your 'reproduce' result, It is so weird, just
> > rechecked the toolchain, 13.1.0 from https://mirrors.edge.kernel.org/ is
> > ok, gcc 9, gcc 10.3 not work.
> > 
> > But even in the page of 13.1.0 [1], we still see this line:
> > 
> >     Most optimizations are completely disabled at -O0 or if an -O level is not set on the command line, even if individual optimization flags are specified.
> > 
> > Not sure if "individual optimization flags" also means the optimize()
> > flags in gcc attributes. or the doc is not updated yet?
> > 
> > And further found gcc 11.1.0 is ok, gcc 10.4 still not work, so, gcc
> > 11.1.0 may changed something to let the "individual optimization flags"
> > work with -O0.
> > 
> > We may need to at least document this issue in some files, -O0 is not such a
> > frequently-used option, not sure if we still need -O0 work with the older gcc <
> > 11.1.0 ;-)
> 
> It seems we can avoid the issue by enforcing optimizations for _start:
> 
> diff --git a/tools/include/nolibc/arch-x86_64.h b/tools/include/nolibc/arch-x86_64.h
> index f5614a67f05a..b9d8b8861dc4 100644
> --- a/tools/include/nolibc/arch-x86_64.h
> +++ b/tools/include/nolibc/arch-x86_64.h
> @@ -161,12 +161,9 @@
>   * 2) The deepest stack frame should be zero (the %rbp).
>   *
>   */
> -void __attribute__((weak, noreturn, optimize("omit-frame-pointer"))) __no_stack_protector _start(void)
> +void __attribute__((weak, noreturn, optimize("Os", "omit-frame-pointer"))) __no_stack_protector _start(void)
>

Great, it works and it is minimal enough ;-)

Thanks very much.

> > 
> > Willy, I'm not sure if the issues solved by the commit 7f8548589661
> > ("tools/nolibc: make compiler and assembler agree on the section around
> > _start") still exist after we using _start_c()?
> > 
> > Thomas, because we plan to move the stackprotector init to _start_c(), If using
> > pure assembly _start, we may also not need the __no_stack_protector macro too?
> 
> It would probably not needed anymore in this case.
>

Yeah, but let's reserve it as-is for we have the working
omit-frame-pointer now.

Best regards,
Zhangjin

> Thomas
Willy Tarreau July 15, 2023, 9:57 a.m. UTC | #11
Hi Zhangjin,

On Fri, Jul 14, 2023 at 01:58:13PM +0800, Zhangjin Wu wrote:
> > or:
> >
> >     for (_auxv = (void*)environ; *_auxv++; )
> >          ;
> >
> > Please also have a look at the output code, because at low optimization
> > levels, compilers sometimes produce a better code with a local variable
> > than with a global variable in a loop. Thus I wouldn't be that much
> > surprised if at -O0 or -O1 you'd see slightly more compact code using:
> >
> 
> Very good suggestion, I did find some interesting results, after
> removing some local variables, although the text size shrinks, but the
> total size is the same with -O0/1/2/3/s.
> 
> Here is the diff with -O0/1/2/3/s (a REPORT_SIZE macro may be required for
> this):
> 
>     $ diff -Nubr startup.old.txt startup.new.txt
>     --- startup.old.txt	2023-07-14 10:22:45.990413661 +0800
>     +++ startup.new.txt	2023-07-14 10:22:52.278869146 +0800
>     @@ -2,34 +2,34 @@
>      sudo strip -s nolibc-test
>      size nolibc-test
>         text    data     bss     dec     hex filename
>     -  46872      88      80   47040    b7c0 nolibc-test
>     +  46836      88      80   47004    b79c nolibc-test
(...)

I meant only checking the function's size, not the whole program :-)

Here's what I'm having for the function:

  $ for opt in O0 O1 O2 Os O3; do
      echo "## $opt"
      for file in global local local2; do
        gcc -$opt -c startc-$file.c
      done
      nm -o --size startc-*.o | grep _start_c
      echo
    done
  
  ## O0
  startc-global.o:0000000000000089 T _start_c
  startc-local.o:00000000000000ad T _start_c
  startc-local2.o:0000000000000090 T _start_c
  
  ## O1
  startc-global.o:0000000000000048 T _start_c
  startc-local.o:0000000000000054 T _start_c
  startc-local2.o:000000000000003a T _start_c
  
  ## O2
  startc-global.o:0000000000000044 T _start_c
  startc-local.o:000000000000004d T _start_c
  startc-local2.o:0000000000000040 T _start_c
  
  ## Os
  startc-global.o:0000000000000041 T _start_c
  startc-local.o:0000000000000040 T _start_c
  startc-local2.o:000000000000003a T _start_c
  
  ## O3
  startc-global.o:0000000000000044 T _start_c
  startc-local.o:000000000000004d T _start_c
  startc-local2.o:0000000000000040 T _start_c

In local2 that's always smaller I've just used a local auxv variable
instead of iterating over the global one, and it's cheaper than using
an index since 1) the instructions are shorter, and 2) the value is
already available, no need to initialize one register to zero:

    void _start_c(long *sp)   
    {                 
        long argc;                                 
        char **argv;                                    
        char **envp;                                    
        const unsigned long *auxv;                                    
        /* silence potential warning: conflicting types for 'main' */
        int _nolibc_main(int, char **, char **) __asm__ ("main");
                                                        
        /*                    
         * sp  :    argc          <-- argument count, required by main()
         * argv:    argv[0]       <-- argument vector, required by main()
         *          argv[1]                             
         *          ...                                 
         *          argv[argc-1]                     
         *          null
         * environ: environ[0]    <-- environment variables, required by main() and getenv()
         *          environ[1]
         *          ...
         *          null
         * _auxv:   _auxv[0]      <-- auxiliary vector, required by getauxval()
         *          _auxv[1]
         *          ...
         *          null
         */

        /* assign argc and argv */
        argc = *sp;
        argv = (void *)(sp + 1);

        /* find environ */
        environ = envp = argv + argc + 1;

        /* find _auxv */
        for (auxv = (void*)envp; *auxv++; )
                ;
        _auxv = auxv;

        /* go to application */
        exit(_nolibc_main(argc, argv, envp));
    }

I'm not showing how ugly it is at -O1, as I suspected the global version
does indeed perform a write to _auxv for each turn. At -Os it's
interesting:

In the global version it looks like this (rdx has envp):

  13:   48 8d 42 08             lea    0x8(%rdx),%rax
  17:   48 89 15 00 00 00 00    mov    %rdx,0x0(%rip)        # environ
  1e:   48 89 c7                mov    %rax,%rdi
  21:   48 83 c0 08             add    $0x8,%rax
  25:   48 83 78 f0 00          cmpq   $0x0,-0x10(%rax)
  2a:   75 f2                   jne    1e <_start_c+0x1e>

In your local version with the index, it looks like this (rdx has envp):

  13:   31 c0                   xor    %eax,%eax
  15:   48 89 15 00 00 00 00    mov    %rdx,0x0(%rip)        # environ
  1c:   48 ff c0                inc    %rax
  1f:   48 83 7c c2 f8 00       cmpq   $0x0,-0x8(%rdx,%rax,8)
  25:   75 f5                   jne    1c <_start_c+0x1c>
  27:   48 8d 04 c2             lea    (%rdx,%rax,8),%rax    # rax now has auxv

In the one without index it's like this:

  13:   48 89 15 00 00 00 00    mov    %rdx,0x0(%rip)        # environ
  1a:   48 89 d0                mov    %rdx,%rax
  1d:   48 83 c0 08             add    $0x8,%rax
  21:   48 83 78 f8 00          cmpq   $0x0,-0x8(%rax)
  26:   75 f5                   jne    1d <_start_c+0x1d>

Finally, since argc is used in pointer computation while being taken from
a long*, it experiences a double conversion while doing so. Storing it as
a long avoids this saving 3 extra bytes.

> Which one do you prefer? the one with local variables may be more readable (not
> that much), the one with global variables has smaller text size (and therefore
> smaller memory footprint).

The smaller one with local variables and no index ;-)

> BTW, just found an arch-<ARCH>.h bug with -O0, seems the
> 'optimize("omit-frame-pointer")' attribute not really work as expected with
> -O0. It uses frame pointer for _start eventually and breaks the stack pointer
> variable (a push 'rbp' inserted at the begging of _start, so, the real rsp has
> an offset), so, therefore, it is not able to get the right argc, argv, environ
> and _auxv with -O0 currently. A solution is reverting _start to pure assembly.

I didn't notice this one.

> 'man gcc' shows:
> 
>     Most optimizations are completely disabled at -O0 or if an -O level is
>     not set on the command line, even if individual optimization flags are
>     specified.

Indeed, that's pretty clear!

> To want -O0 work again, since now we have C _start_c, is it ok for us to revert
> the commit 7f8548589661 ("tools/nolibc: make compiler and assembler agree on
> the section around _start") and the later __no_stack_protector changes?

As Thomas said in somewhere else in the thread, let's analyze deeper first.
Maybe passing optimize("O") in addition will be sufficient.

> At the same time, to verify such issues, as Thomas suggested, in this series,
> we may need to add more startup tests to verify argc, argv, environ, _auxv, do

Yep!

> we need to add a standalone run_startup (or run_crt) test entry just like
> run_syscall? or, let's simply add more in the run_stdlib, like the environ test
> added by Thomas.  seems, the argc test is necessary one currently missing (argc
> >= 1):

Yes it could be a good idea to add a startup test category. Some of the
stuff we've placed into "stdlib" are more about startup code (provided
by the lib) that tends to be very sensitive to the architecture and
optimizations.
 

>     CASE_TEST(argc);               EXPECT_GE(1, test_argc, 1); break;
> 
> As we summarized before, the related test cases are:
> 
> argv0:
> 
>     CASE_TEST(chmod_argv0);       EXPECT_SYSZR(1, chmod(test_argv0, 0555)); break;
>     CASE_TEST(chroot_exe);        EXPECT_SYSER(1, chroot(test_argv0), -1, ENOTDIR); break;
> 
> environ:
> 
>     CASE_TEST(chdir_root);        EXPECT_SYSZR(1, chdir("/")); chdir(getenv("PWD")); break;
>     CASE_TEST(environ);            EXPECT_PTREQ(1, environ, test_envp); break;
>     CASE_TEST(getenv_TERM);        EXPECT_STRNZ(1, getenv("TERM")); break;
>     CASE_TEST(getenv_blah);        EXPECT_STRZR(1, getenv("blah")); break;
> 
> auxv:
> 
>     CASE_TEST(getpagesize);       EXPECT_SYSZR(1, test_getpagesize()); break;
> 
> The above tests are in different test group and are not aimed to startup test,
> we'd better add a run_startup (or run_crt) test group before any other tests,
> it is a requiremnt of the others, we at least have these ones:
> 
>     +int run_startup(int min, int max)
>     +{
>     +       int test;
>     +       int tmp;
>     +       int ret = 0;
>     +
>     +       for (test = min; test >= 0 && test <= max; test++) {
>     +               int llen = 0; /* line length */
>     +
>     +               /* avoid leaving empty lines below, this will insert holes into
>     +                * test numbers.
>     +                */
>     +               switch (test + __LINE__ + 1) {
>     +               CASE_TEST(argc);               EXPECT_GE(1, test_argc, 1); break;
>     +               CASE_TEST(argv_addr);          EXPECT_PTRNZ(1, test_argv); break;
>     +               CASE_TEST(argv_total);         EXPECT_EQ(1, environ - test_argv - 1, test_argc); break;
>     +               CASE_TEST(argv0_addr);         EXPECT_PTRNZ(1, argv0); break;
>     +               CASE_TEST(argv0_str);          EXPECT_STRNZ(1, argv0); break;
>     +               CASE_TEST(argv0_len);          EXPECT_GE(1, strlen(argv0), 1); break;
>     +               CASE_TEST(environ_addr);       EXPECT_PTRNZ(1, environ); break;
>     +               CASE_TEST(environ_envp);       EXPECT_PTREQ(1, environ, test_envp); break;
>     +               CASE_TEST(environ_total);      EXPECT_GE(1, (void *)_auxv - (void *)environ - 1, 1); break;
>     +               CASE_TEST(_auxv_addr);         EXPECT_PTRNZ(1, _auxv); break;
>     +               case __LINE__:
>     +                       return ret; /* must be last */
>     +               /* note: do not set any defaults so as to permit holes above */
>     +               }
>     +       }
>     +       return ret;
>     +}
> 
> Any more?

I quickly glanced over this but I tend to like it, indeed.

Thanks!
Willy
diff mbox series

Patch

diff --git a/tools/include/nolibc/Makefile b/tools/include/nolibc/Makefile
index 64d67b080744..909b6eb500fe 100644
--- a/tools/include/nolibc/Makefile
+++ b/tools/include/nolibc/Makefile
@@ -27,6 +27,7 @@  nolibc_arch := $(patsubst arm64,aarch64,$(ARCH))
 arch_file := arch-$(nolibc_arch).h
 all_files := \
 		compiler.h \
+		crt.h \
 		ctype.h \
 		errno.h \
 		nolibc.h \
diff --git a/tools/include/nolibc/crt.h b/tools/include/nolibc/crt.h
new file mode 100644
index 000000000000..f9db2389acd2
--- /dev/null
+++ b/tools/include/nolibc/crt.h
@@ -0,0 +1,59 @@ 
+/* SPDX-License-Identifier: LGPL-2.1 OR MIT */
+/*
+ * C Run Time support for NOLIBC
+ * Copyright (C) 2023 Zhangjin Wu <falcon@tinylab.org>
+ */
+
+#ifndef _NOLIBC_CRT_H
+#define _NOLIBC_CRT_H
+
+char **environ __attribute__((weak));
+const unsigned long *_auxv __attribute__((weak));
+
+typedef int (_nolibc_main_fn)(int, char **, char **);
+static void exit(int);
+
+void _start_c(long *sp)
+{
+	int argc, i;
+	char **argv;
+	char **envp;
+	/* silence potential warning: conflicting types for 'main' */
+	_nolibc_main_fn _nolibc_main __asm__ ("main");
+
+	/*
+	 * sp  :  argc          <-- argument count, required by main()
+	 * argv:  argv[0]       <-- argument vector, required by main()
+	 *        argv[1]
+	 *        ...
+	 *        argv[argc-1]
+	 *        null
+	 * envp:  envp[0]       <-- environment variables, required by main() and getenv()
+	 *        envp[1]
+	 *        ...
+	 *        null
+	 * _auxv: auxv[0]       <-- auxiliary vector, required by getauxval()
+	 *        auxv[1]
+	 *        ...
+	 *        null
+	 */
+
+	/* assign argc and argv */
+	argc = sp[0];
+	argv = (void *)(sp + 1);
+
+	/* find envp */
+	envp = argv + argc + 1;
+	environ = envp;
+
+	/* find auxv */
+	i = 0;
+	while (envp[i])
+		i++;
+	_auxv = (void *)(envp + i + 1);
+
+	/* go to application */
+	exit(_nolibc_main(argc, argv, envp));
+}
+
+#endif /* _NOLIBC_CRT_H */