diff mbox series

[v3,1/3] riscv: mm: Use hint address in mmap if available

Message ID 20240130-use_mmap_hint_address-v3-1-8a655cfa8bcb@rivosinc.com (mailing list archive)
State Accepted
Commit b5b4287accd702f562a49a60b10dbfaf7d40270f
Headers show
Series riscv: mm: Extend mappable memory up to hint address | expand

Checks

Context Check Description
conchuod/vmtest-fixes-PR fail merge-conflict

Commit Message

Charlie Jenkins Jan. 31, 2024, 1:07 a.m. UTC
On riscv it is guaranteed that the address returned by mmap is less than
the hint address. Allow mmap to return an address all the way up to
addr, if provided, rather than just up to the lower address space.

This provides a performance benefit as well, allowing mmap to exit after
checking that the address is in range rather than searching for a valid
address.

It is possible to provide an address that uses at most the same number
of bits, however it is significantly more computationally expensive to
provide that number rather than setting the max to be the hint address.
There is the instruction clz/clzw in Zbb that returns the highest set bit
which could be used to performantly implement this, but it would still
be slower than the current implementation. At worst case, half of the
address would not be able to be allocated when a hint address is
provided.

Signed-off-by: Charlie Jenkins <charlie@rivosinc.com>
---
 arch/riscv/include/asm/processor.h | 27 +++++++++++----------------
 1 file changed, 11 insertions(+), 16 deletions(-)

Comments

Yangyu Chen Jan. 31, 2024, 2:41 p.m. UTC | #1
On Tue, 2024-01-30 at 17:07 -0800, Charlie Jenkins wrote:
> On riscv it is guaranteed that the address returned by mmap is less
> than
> the hint address. Allow mmap to return an address all the way up to
> addr, if provided, rather than just up to the lower address space.
> 
> This provides a performance benefit as well, allowing mmap to exit
> after
> checking that the address is in range rather than searching for a
> valid
> address.
> 
> It is possible to provide an address that uses at most the same
> number
> of bits, however it is significantly more computationally expensive
> to
> provide that number rather than setting the max to be the hint
> address.
> There is the instruction clz/clzw in Zbb that returns the highest set
> bit
> which could be used to performantly implement this, but it would
> still
> be slower than the current implementation. At worst case, half of the
> address would not be able to be allocated when a hint address is
> provided.
> 
> Signed-off-by: Charlie Jenkins <charlie@rivosinc.com>
> ---
>  arch/riscv/include/asm/processor.h | 27 +++++++++++----------------
>  1 file changed, 11 insertions(+), 16 deletions(-)
> 
> diff --git a/arch/riscv/include/asm/processor.h
> b/arch/riscv/include/asm/processor.h
> index f19f861cda54..8ece7a8f0e18 100644
> --- a/arch/riscv/include/asm/processor.h
> +++ b/arch/riscv/include/asm/processor.h
> @@ -14,22 +14,16 @@
>  
>  #include <asm/ptrace.h>
>  
> -#ifdef CONFIG_64BIT
> -#define DEFAULT_MAP_WINDOW	(UL(1) << (MMAP_VA_BITS - 1))
> -#define STACK_TOP_MAX		TASK_SIZE_64
> -
>  #define arch_get_mmap_end(addr, len, flags)			\
>  ({								\
>  	unsigned long
> mmap_end;					\
>  	typeof(addr) _addr = (addr);				\
> -	if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) &&
> is_compat_task())) \
> +	if ((_addr) == 0 ||					\
> +	    (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||	\
> +	    ((_addr + len) > BIT(VA_BITS -
> 1)))			\
>  		mmap_end = STACK_TOP_MAX;			\
> -	else if ((_addr) >= VA_USER_SV57)			\
> -		mmap_end = STACK_TOP_MAX;			\
> -	else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >=
> VA_BITS_SV48)) \
> -		mmap_end = VA_USER_SV48;			\
>  	else							\
> -		mmap_end = VA_USER_SV39;			\
> +		mmap_end = (_addr + len);			\
>  	mmap_end;						\
>  })
>  
> @@ -39,17 +33,18 @@
>  	typeof(addr) _addr = (addr);				\
>  	typeof(base) _base = (base);				\
>  	unsigned long rnd_gap = DEFAULT_MAP_WINDOW - (_base);	\
> -	if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) &&
> is_compat_task())) \
> +	if ((_addr) == 0 ||					\
> +	    (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||	\
> +	    ((_addr + len) > BIT(VA_BITS -
> 1)))			\
>  		mmap_base = (_base);				\
> -	else if (((_addr) >= VA_USER_SV57) && (VA_BITS >=
> VA_BITS_SV57)) \
> -		mmap_base = VA_USER_SV57 - rnd_gap;		\
> -	else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >=
> VA_BITS_SV48)) \
> -		mmap_base = VA_USER_SV48 - rnd_gap;		\
>  	else							\
> -		mmap_base = VA_USER_SV39 - rnd_gap;		\
> +		mmap_base = (_addr + len) - rnd_gap;		\
>  	mmap_base;						\
>  })
>  
> +#ifdef CONFIG_64BIT
> +#define DEFAULT_MAP_WINDOW	(UL(1) << (MMAP_VA_BITS - 1))
> +#define STACK_TOP_MAX		TASK_SIZE_64
>  #else
>  #define DEFAULT_MAP_WINDOW	TASK_SIZE
>  #define STACK_TOP_MAX		TASK_SIZE
> 

I have carefully tested your patch on qemu with sv57. A bug that needs
to be solved is that mmap with the same hint address without MAP_FIXED
set will fail the second time.

Userspace code to reproduce the bug:

#include <sys/mman.h>
#include <stdio.h>
#include <stdint.h>

void test(char *addr) {
    char *res = mmap(addr, 4096, PROT_READ | PROT_WRITE, MAP_ANONYMOUS
| MAP_PRIVATE, -1, 0);
    printf("hint %p got %p.\n", addr, res);
}

int main (void) {
    test(1<<30);
    test(1<<30);
    test(1<<30);
    return 0;
}

output:

hint 0x40000000 got 0x40000000.
hint 0x40000000 got 0xffffffffffffffff.
hint 0x40000000 got 0xffffffffffffffff.

output on x86:

hint 0x40000000 got 0x40000000.
hint 0x40000000 got 0x7f9171363000.
hint 0x40000000 got 0x7f9171362000.

It may need to implement a special arch_get_unmapped_area and
arch_get_unmapped_area_topdown function.
Yangyu Chen Jan. 31, 2024, 3:59 p.m. UTC | #2
On Wed, 2024-01-31 at 22:41 +0800, Yangyu Chen wrote:
> On Tue, 2024-01-30 at 17:07 -0800, Charlie Jenkins wrote:
> > On riscv it is guaranteed that the address returned by mmap is less
> > than
> > the hint address. Allow mmap to return an address all the way up to
> > addr, if provided, rather than just up to the lower address space.
> > 
> > This provides a performance benefit as well, allowing mmap to exit
> > after
> > checking that the address is in range rather than searching for a
> > valid
> > address.
> > 
> > It is possible to provide an address that uses at most the same
> > number
> > of bits, however it is significantly more computationally expensive
> > to
> > provide that number rather than setting the max to be the hint
> > address.
> > There is the instruction clz/clzw in Zbb that returns the highest
> > set
> > bit
> > which could be used to performantly implement this, but it would
> > still
> > be slower than the current implementation. At worst case, half of
> > the
> > address would not be able to be allocated when a hint address is
> > provided.
> > 
> > Signed-off-by: Charlie Jenkins <charlie@rivosinc.com>
> > ---
> >  arch/riscv/include/asm/processor.h | 27 +++++++++++---------------
> > -
> >  1 file changed, 11 insertions(+), 16 deletions(-)
> > 
> > diff --git a/arch/riscv/include/asm/processor.h
> > b/arch/riscv/include/asm/processor.h
> > index f19f861cda54..8ece7a8f0e18 100644
> > --- a/arch/riscv/include/asm/processor.h
> > +++ b/arch/riscv/include/asm/processor.h
> > @@ -14,22 +14,16 @@
> >  
> >  #include <asm/ptrace.h>
> >  
> > -#ifdef CONFIG_64BIT
> > -#define DEFAULT_MAP_WINDOW	(UL(1) << (MMAP_VA_BITS - 1))
> > -#define STACK_TOP_MAX		TASK_SIZE_64
> > -
> >  #define arch_get_mmap_end(addr, len, flags)			\
> >  ({								\
> >  	unsigned long
> > mmap_end;					\
> >  	typeof(addr) _addr = (addr);				\
> > -	if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) &&
> > is_compat_task())) \
> > +	if ((_addr) == 0 ||					\
> > +	    (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||	\
> > +	    ((_addr + len) > BIT(VA_BITS -
> > 1)))			\
> >  		mmap_end = STACK_TOP_MAX;			\
> > -	else if ((_addr) >= VA_USER_SV57)			\
> > -		mmap_end = STACK_TOP_MAX;			\
> > -	else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >=
> > VA_BITS_SV48)) \
> > -		mmap_end = VA_USER_SV48;			\
> >  	else							\
> > -		mmap_end = VA_USER_SV39;			\
> > +		mmap_end = (_addr + len);			\
> >  	mmap_end;						\
> >  })
> >  
> > @@ -39,17 +33,18 @@
> >  	typeof(addr) _addr = (addr);				\
> >  	typeof(base) _base = (base);				\
> >  	unsigned long rnd_gap = DEFAULT_MAP_WINDOW - (_base);	\
> > -	if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) &&
> > is_compat_task())) \
> > +	if ((_addr) == 0 ||					\
> > +	    (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||	\
> > +	    ((_addr + len) > BIT(VA_BITS -
> > 1)))			\
> >  		mmap_base = (_base);				\
> > -	else if (((_addr) >= VA_USER_SV57) && (VA_BITS >=
> > VA_BITS_SV57)) \
> > -		mmap_base = VA_USER_SV57 - rnd_gap;		\
> > -	else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >=
> > VA_BITS_SV48)) \
> > -		mmap_base = VA_USER_SV48 - rnd_gap;		\
> >  	else							\
> > -		mmap_base = VA_USER_SV39 - rnd_gap;		\
> > +		mmap_base = (_addr + len) - rnd_gap;		\
> >  	mmap_base;						\
> >  })
> >  
> > +#ifdef CONFIG_64BIT
> > +#define DEFAULT_MAP_WINDOW	(UL(1) << (MMAP_VA_BITS - 1))
> > +#define STACK_TOP_MAX		TASK_SIZE_64
> >  #else
> >  #define DEFAULT_MAP_WINDOW	TASK_SIZE
> >  #define STACK_TOP_MAX		TASK_SIZE
> > 
> 
> I have carefully tested your patch on qemu with sv57. A bug that
> needs
> to be solved is that mmap with the same hint address without
> MAP_FIXED
> set will fail the second time.
> 
> Userspace code to reproduce the bug:
> 
> #include <sys/mman.h>
> #include <stdio.h>
> #include <stdint.h>
> 
> void test(char *addr) {
>     char *res = mmap(addr, 4096, PROT_READ | PROT_WRITE,
> MAP_ANONYMOUS
> > MAP_PRIVATE, -1, 0);
>     printf("hint %p got %p.\n", addr, res);
> }
> 
> int main (void) {
>     test(1<<30);
>     test(1<<30);
>     test(1<<30);
>     return 0;
> }
> 
> output:
> 
> hint 0x40000000 got 0x40000000.
> hint 0x40000000 got 0xffffffffffffffff.
> hint 0x40000000 got 0xffffffffffffffff.
> 
> output on x86:
> 
> hint 0x40000000 got 0x40000000.
> hint 0x40000000 got 0x7f9171363000.
> hint 0x40000000 got 0x7f9171362000.
> 
> It may need to implement a special arch_get_unmapped_area and
> arch_get_unmapped_area_topdown function.
> 

This is because hint address < rnd_gap. I have tried to let mmap_base =
min((_addr + len), (base) + TASK_SIZE - DEFAULT_MAP_WINDOW). However it
does not work for bottom-up while ulimit -s is unlimited. You said this
behavior is expected from patch v2 review. However it brings a new
regression even on sv39 systems.

I still don't know the reason why use addr+len as the upper-bound. I
think solution like x86/arm64/powerpc provide two address space switch
based on whether hint address above the default map window is enough.
Charlie Jenkins Feb. 2, 2024, 2:28 a.m. UTC | #3
On Wed, Jan 31, 2024 at 11:59:43PM +0800, Yangyu Chen wrote:
> On Wed, 2024-01-31 at 22:41 +0800, Yangyu Chen wrote:
> > On Tue, 2024-01-30 at 17:07 -0800, Charlie Jenkins wrote:
> > > On riscv it is guaranteed that the address returned by mmap is less
> > > than
> > > the hint address. Allow mmap to return an address all the way up to
> > > addr, if provided, rather than just up to the lower address space.
> > > 
> > > This provides a performance benefit as well, allowing mmap to exit
> > > after
> > > checking that the address is in range rather than searching for a
> > > valid
> > > address.
> > > 
> > > It is possible to provide an address that uses at most the same
> > > number
> > > of bits, however it is significantly more computationally expensive
> > > to
> > > provide that number rather than setting the max to be the hint
> > > address.
> > > There is the instruction clz/clzw in Zbb that returns the highest
> > > set
> > > bit
> > > which could be used to performantly implement this, but it would
> > > still
> > > be slower than the current implementation. At worst case, half of
> > > the
> > > address would not be able to be allocated when a hint address is
> > > provided.
> > > 
> > > Signed-off-by: Charlie Jenkins <charlie@rivosinc.com>
> > > ---
> > >  arch/riscv/include/asm/processor.h | 27 +++++++++++---------------
> > > -
> > >  1 file changed, 11 insertions(+), 16 deletions(-)
> > > 
> > > diff --git a/arch/riscv/include/asm/processor.h
> > > b/arch/riscv/include/asm/processor.h
> > > index f19f861cda54..8ece7a8f0e18 100644
> > > --- a/arch/riscv/include/asm/processor.h
> > > +++ b/arch/riscv/include/asm/processor.h
> > > @@ -14,22 +14,16 @@
> > >  
> > >  #include <asm/ptrace.h>
> > >  
> > > -#ifdef CONFIG_64BIT
> > > -#define DEFAULT_MAP_WINDOW	(UL(1) << (MMAP_VA_BITS - 1))
> > > -#define STACK_TOP_MAX		TASK_SIZE_64
> > > -
> > >  #define arch_get_mmap_end(addr, len, flags)			\
> > >  ({								\
> > >  	unsigned long
> > > mmap_end;					\
> > >  	typeof(addr) _addr = (addr);				\
> > > -	if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) &&
> > > is_compat_task())) \
> > > +	if ((_addr) == 0 ||					\
> > > +	    (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||	\
> > > +	    ((_addr + len) > BIT(VA_BITS -
> > > 1)))			\
> > >  		mmap_end = STACK_TOP_MAX;			\
> > > -	else if ((_addr) >= VA_USER_SV57)			\
> > > -		mmap_end = STACK_TOP_MAX;			\
> > > -	else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >=
> > > VA_BITS_SV48)) \
> > > -		mmap_end = VA_USER_SV48;			\
> > >  	else							\
> > > -		mmap_end = VA_USER_SV39;			\
> > > +		mmap_end = (_addr + len);			\
> > >  	mmap_end;						\
> > >  })
> > >  
> > > @@ -39,17 +33,18 @@
> > >  	typeof(addr) _addr = (addr);				\
> > >  	typeof(base) _base = (base);				\
> > >  	unsigned long rnd_gap = DEFAULT_MAP_WINDOW - (_base);	\
> > > -	if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) &&
> > > is_compat_task())) \
> > > +	if ((_addr) == 0 ||					\
> > > +	    (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||	\
> > > +	    ((_addr + len) > BIT(VA_BITS -
> > > 1)))			\
> > >  		mmap_base = (_base);				\
> > > -	else if (((_addr) >= VA_USER_SV57) && (VA_BITS >=
> > > VA_BITS_SV57)) \
> > > -		mmap_base = VA_USER_SV57 - rnd_gap;		\
> > > -	else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >=
> > > VA_BITS_SV48)) \
> > > -		mmap_base = VA_USER_SV48 - rnd_gap;		\
> > >  	else							\
> > > -		mmap_base = VA_USER_SV39 - rnd_gap;		\
> > > +		mmap_base = (_addr + len) - rnd_gap;		\
> > >  	mmap_base;						\
> > >  })
> > >  
> > > +#ifdef CONFIG_64BIT
> > > +#define DEFAULT_MAP_WINDOW	(UL(1) << (MMAP_VA_BITS - 1))
> > > +#define STACK_TOP_MAX		TASK_SIZE_64
> > >  #else
> > >  #define DEFAULT_MAP_WINDOW	TASK_SIZE
> > >  #define STACK_TOP_MAX		TASK_SIZE
> > > 
> > 
> > I have carefully tested your patch on qemu with sv57. A bug that
> > needs
> > to be solved is that mmap with the same hint address without
> > MAP_FIXED
> > set will fail the second time.
> > 
> > Userspace code to reproduce the bug:
> > 
> > #include <sys/mman.h>
> > #include <stdio.h>
> > #include <stdint.h>
> > 
> > void test(char *addr) {
> >     char *res = mmap(addr, 4096, PROT_READ | PROT_WRITE,
> > MAP_ANONYMOUS
> > > MAP_PRIVATE, -1, 0);
> >     printf("hint %p got %p.\n", addr, res);
> > }
> > 
> > int main (void) {
> >     test(1<<30);
> >     test(1<<30);
> >     test(1<<30);
> >     return 0;
> > }
> > 
> > output:
> > 
> > hint 0x40000000 got 0x40000000.
> > hint 0x40000000 got 0xffffffffffffffff.
> > hint 0x40000000 got 0xffffffffffffffff.
> > 
> > output on x86:
> > 
> > hint 0x40000000 got 0x40000000.
> > hint 0x40000000 got 0x7f9171363000.
> > hint 0x40000000 got 0x7f9171362000.
> > 
> > It may need to implement a special arch_get_unmapped_area and
> > arch_get_unmapped_area_topdown function.
> > 
> 
> This is because hint address < rnd_gap. I have tried to let mmap_base =
> min((_addr + len), (base) + TASK_SIZE - DEFAULT_MAP_WINDOW). However it
> does not work for bottom-up while ulimit -s is unlimited. You said this
> behavior is expected from patch v2 review. However it brings a new
> regression even on sv39 systems.
> 
> I still don't know the reason why use addr+len as the upper-bound. I
> think solution like x86/arm64/powerpc provide two address space switch
> based on whether hint address above the default map window is enough.
> 

Yep this is expected. It is up to the maintainers to decide.

- Charlie
Palmer Dabbelt March 22, 2024, 2:06 p.m. UTC | #4
On Thu, 01 Feb 2024 18:28:06 PST (-0800), Charlie Jenkins wrote:
> On Wed, Jan 31, 2024 at 11:59:43PM +0800, Yangyu Chen wrote:
>> On Wed, 2024-01-31 at 22:41 +0800, Yangyu Chen wrote:
>> > On Tue, 2024-01-30 at 17:07 -0800, Charlie Jenkins wrote:
>> > > On riscv it is guaranteed that the address returned by mmap is less
>> > > than
>> > > the hint address. Allow mmap to return an address all the way up to
>> > > addr, if provided, rather than just up to the lower address space.
>> > > 
>> > > This provides a performance benefit as well, allowing mmap to exit
>> > > after
>> > > checking that the address is in range rather than searching for a
>> > > valid
>> > > address.
>> > > 
>> > > It is possible to provide an address that uses at most the same
>> > > number
>> > > of bits, however it is significantly more computationally expensive
>> > > to
>> > > provide that number rather than setting the max to be the hint
>> > > address.
>> > > There is the instruction clz/clzw in Zbb that returns the highest
>> > > set
>> > > bit
>> > > which could be used to performantly implement this, but it would
>> > > still
>> > > be slower than the current implementation. At worst case, half of
>> > > the
>> > > address would not be able to be allocated when a hint address is
>> > > provided.
>> > > 
>> > > Signed-off-by: Charlie Jenkins <charlie@rivosinc.com>
>> > > ---
>> > >  arch/riscv/include/asm/processor.h | 27 +++++++++++---------------
>> > > -
>> > >  1 file changed, 11 insertions(+), 16 deletions(-)
>> > > 
>> > > diff --git a/arch/riscv/include/asm/processor.h
>> > > b/arch/riscv/include/asm/processor.h
>> > > index f19f861cda54..8ece7a8f0e18 100644
>> > > --- a/arch/riscv/include/asm/processor.h
>> > > +++ b/arch/riscv/include/asm/processor.h
>> > > @@ -14,22 +14,16 @@
>> > >  
>> > >  #include <asm/ptrace.h>
>> > >  
>> > > -#ifdef CONFIG_64BIT
>> > > -#define DEFAULT_MAP_WINDOW	(UL(1) << (MMAP_VA_BITS - 1))
>> > > -#define STACK_TOP_MAX		TASK_SIZE_64
>> > > -
>> > >  #define arch_get_mmap_end(addr, len, flags)			\
>> > >  ({								\
>> > >  	unsigned long
>> > > mmap_end;					\
>> > >  	typeof(addr) _addr = (addr);				\
>> > > -	if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) &&
>> > > is_compat_task())) \
>> > > +	if ((_addr) == 0 ||					\
>> > > +	    (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||	\
>> > > +	    ((_addr + len) > BIT(VA_BITS -
>> > > 1)))			\
>> > >  		mmap_end = STACK_TOP_MAX;			\
>> > > -	else if ((_addr) >= VA_USER_SV57)			\
>> > > -		mmap_end = STACK_TOP_MAX;			\
>> > > -	else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >=
>> > > VA_BITS_SV48)) \
>> > > -		mmap_end = VA_USER_SV48;			\
>> > >  	else							\
>> > > -		mmap_end = VA_USER_SV39;			\
>> > > +		mmap_end = (_addr + len);			\
>> > >  	mmap_end;						\
>> > >  })
>> > >  
>> > > @@ -39,17 +33,18 @@
>> > >  	typeof(addr) _addr = (addr);				\
>> > >  	typeof(base) _base = (base);				\
>> > >  	unsigned long rnd_gap = DEFAULT_MAP_WINDOW - (_base);	\
>> > > -	if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) &&
>> > > is_compat_task())) \
>> > > +	if ((_addr) == 0 ||					\
>> > > +	    (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||	\
>> > > +	    ((_addr + len) > BIT(VA_BITS -
>> > > 1)))			\
>> > >  		mmap_base = (_base);				\
>> > > -	else if (((_addr) >= VA_USER_SV57) && (VA_BITS >=
>> > > VA_BITS_SV57)) \
>> > > -		mmap_base = VA_USER_SV57 - rnd_gap;		\
>> > > -	else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >=
>> > > VA_BITS_SV48)) \
>> > > -		mmap_base = VA_USER_SV48 - rnd_gap;		\
>> > >  	else							\
>> > > -		mmap_base = VA_USER_SV39 - rnd_gap;		\
>> > > +		mmap_base = (_addr + len) - rnd_gap;		\
>> > >  	mmap_base;						\
>> > >  })
>> > >  
>> > > +#ifdef CONFIG_64BIT
>> > > +#define DEFAULT_MAP_WINDOW	(UL(1) << (MMAP_VA_BITS - 1))
>> > > +#define STACK_TOP_MAX		TASK_SIZE_64
>> > >  #else
>> > >  #define DEFAULT_MAP_WINDOW	TASK_SIZE
>> > >  #define STACK_TOP_MAX		TASK_SIZE
>> > > 
>> > 
>> > I have carefully tested your patch on qemu with sv57. A bug that
>> > needs
>> > to be solved is that mmap with the same hint address without
>> > MAP_FIXED
>> > set will fail the second time.
>> > 
>> > Userspace code to reproduce the bug:
>> > 
>> > #include <sys/mman.h>
>> > #include <stdio.h>
>> > #include <stdint.h>
>> > 
>> > void test(char *addr) {
>> >     char *res = mmap(addr, 4096, PROT_READ | PROT_WRITE,
>> > MAP_ANONYMOUS
>> > > MAP_PRIVATE, -1, 0);
>> >     printf("hint %p got %p.\n", addr, res);
>> > }
>> > 
>> > int main (void) {
>> >     test(1<<30);
>> >     test(1<<30);
>> >     test(1<<30);
>> >     return 0;
>> > }
>> > 
>> > output:
>> > 
>> > hint 0x40000000 got 0x40000000.
>> > hint 0x40000000 got 0xffffffffffffffff.
>> > hint 0x40000000 got 0xffffffffffffffff.
>> > 
>> > output on x86:
>> > 
>> > hint 0x40000000 got 0x40000000.
>> > hint 0x40000000 got 0x7f9171363000.
>> > hint 0x40000000 got 0x7f9171362000.
>> > 
>> > It may need to implement a special arch_get_unmapped_area and
>> > arch_get_unmapped_area_topdown function.
>> > 
>> 
>> This is because hint address < rnd_gap. I have tried to let mmap_base =
>> min((_addr + len), (base) + TASK_SIZE - DEFAULT_MAP_WINDOW). However it
>> does not work for bottom-up while ulimit -s is unlimited. You said this
>> behavior is expected from patch v2 review. However it brings a new
>> regression even on sv39 systems.
>> 
>> I still don't know the reason why use addr+len as the upper-bound. I
>> think solution like x86/arm64/powerpc provide two address space switch
>> based on whether hint address above the default map window is enough.
>> 
>
> Yep this is expected. It is up to the maintainers to decide.

Sorry I forgot to reply to this, I had a buffer sitting around somewhere 
but I must have lost it.

I think Charlie's approach is the right way to go.  Putting my userspace 
hat on, I'd much rather have my allocations fail rather than silently 
ignore the hint when there's memory pressure.

If there's some real use case that needs these low hints to be silently 
ignored under VA pressure then we can try and figure something out that 
makes those applications work.

>
> - Charlie
Levi Zim Aug. 18, 2024, 3:34 p.m. UTC | #5
On 2024-03-22 22:06, Palmer Dabbelt wrote:
> On Thu, 01 Feb 2024 18:28:06 PST (-0800), Charlie Jenkins wrote:
>> On Wed, Jan 31, 2024 at 11:59:43PM +0800, Yangyu Chen wrote:
>>> On Wed, 2024-01-31 at 22:41 +0800, Yangyu Chen wrote:
>>> > On Tue, 2024-01-30 at 17:07 -0800, Charlie Jenkins wrote:
>>> > > On riscv it is guaranteed that the address returned by mmap is less
>>> > > than
>>> > > the hint address. Allow mmap to return an address all the way up to
>>> > > addr, if provided, rather than just up to the lower address space.
>>> > > > > This provides a performance benefit as well, allowing mmap 
>>> to exit
>>> > > after
>>> > > checking that the address is in range rather than searching for a
>>> > > valid
>>> > > address.
>>> > > > > It is possible to provide an address that uses at most the same
>>> > > number
>>> > > of bits, however it is significantly more computationally expensive
>>> > > to
>>> > > provide that number rather than setting the max to be the hint
>>> > > address.
>>> > > There is the instruction clz/clzw in Zbb that returns the highest
>>> > > set
>>> > > bit
>>> > > which could be used to performantly implement this, but it would
>>> > > still
>>> > > be slower than the current implementation. At worst case, half of
>>> > > the
>>> > > address would not be able to be allocated when a hint address is
>>> > > provided.
>>> > > > > Signed-off-by: Charlie Jenkins <charlie@rivosinc.com>
>>> > > ---
>>> > >  arch/riscv/include/asm/processor.h | 27 +++++++++++---------------
>>> > > -
>>> > >  1 file changed, 11 insertions(+), 16 deletions(-)
>>> > > > > diff --git a/arch/riscv/include/asm/processor.h
>>> > > b/arch/riscv/include/asm/processor.h
>>> > > index f19f861cda54..8ece7a8f0e18 100644
>>> > > --- a/arch/riscv/include/asm/processor.h
>>> > > +++ b/arch/riscv/include/asm/processor.h
>>> > > @@ -14,22 +14,16 @@
>>> > >
>>> > >  #include <asm/ptrace.h>
>>> > >
>>> > > -#ifdef CONFIG_64BIT
>>> > > -#define DEFAULT_MAP_WINDOW    (UL(1) << (MMAP_VA_BITS - 1))
>>> > > -#define STACK_TOP_MAX        TASK_SIZE_64
>>> > > -
>>> > >  #define arch_get_mmap_end(addr, len, flags)            \
>>> > >  ({                                \
>>> > >      unsigned long
>>> > > mmap_end;                    \
>>> > >      typeof(addr) _addr = (addr);                \
>>> > > -    if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) &&
>>> > > is_compat_task())) \
>>> > > +    if ((_addr) == 0 ||                    \
>>> > > +        (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||    \
>>> > > +        ((_addr + len) > BIT(VA_BITS -
>>> > > 1)))            \
>>> > >          mmap_end = STACK_TOP_MAX;            \
>>> > > -    else if ((_addr) >= VA_USER_SV57) \
>>> > > -        mmap_end = STACK_TOP_MAX;            \
>>> > > -    else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >=
>>> > > VA_BITS_SV48)) \
>>> > > -        mmap_end = VA_USER_SV48;            \
>>> > >      else                            \
>>> > > -        mmap_end = VA_USER_SV39;            \
>>> > > +        mmap_end = (_addr + len);            \
>>> > >      mmap_end;                        \
>>> > >  })
>>> > >
>>> > > @@ -39,17 +33,18 @@
>>> > >      typeof(addr) _addr = (addr);                \
>>> > >      typeof(base) _base = (base);                \
>>> > >      unsigned long rnd_gap = DEFAULT_MAP_WINDOW - (_base);    \
>>> > > -    if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) &&
>>> > > is_compat_task())) \
>>> > > +    if ((_addr) == 0 ||                    \
>>> > > +        (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||    \
>>> > > +        ((_addr + len) > BIT(VA_BITS -
>>> > > 1)))            \
>>> > >          mmap_base = (_base);                \
>>> > > -    else if (((_addr) >= VA_USER_SV57) && (VA_BITS >=
>>> > > VA_BITS_SV57)) \
>>> > > -        mmap_base = VA_USER_SV57 - rnd_gap; \
>>> > > -    else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >=
>>> > > VA_BITS_SV48)) \
>>> > > -        mmap_base = VA_USER_SV48 - rnd_gap; \
>>> > >      else                            \
>>> > > -        mmap_base = VA_USER_SV39 - rnd_gap; \
>>> > > +        mmap_base = (_addr + len) - rnd_gap; \
>>> > >      mmap_base;                        \
>>> > >  })
>>> > >
>>> > > +#ifdef CONFIG_64BIT
>>> > > +#define DEFAULT_MAP_WINDOW    (UL(1) << (MMAP_VA_BITS - 1))
>>> > > +#define STACK_TOP_MAX        TASK_SIZE_64
>>> > >  #else
>>> > >  #define DEFAULT_MAP_WINDOW    TASK_SIZE
>>> > >  #define STACK_TOP_MAX        TASK_SIZE
>>> > > > > I have carefully tested your patch on qemu with sv57. A bug 
>>> that
>>> > needs
>>> > to be solved is that mmap with the same hint address without
>>> > MAP_FIXED
>>> > set will fail the second time.
>>> > > Userspace code to reproduce the bug:
>>> > > #include <sys/mman.h>
>>> > #include <stdio.h>
>>> > #include <stdint.h>
>>> > > void test(char *addr) {
>>> >     char *res = mmap(addr, 4096, PROT_READ | PROT_WRITE,
>>> > MAP_ANONYMOUS
>>> > > MAP_PRIVATE, -1, 0);
>>> >     printf("hint %p got %p.\n", addr, res);
>>> > }
>>> > > int main (void) {
>>> >     test(1<<30);
>>> >     test(1<<30);
>>> >     test(1<<30);
>>> >     return 0;
>>> > }
>>> > > output:
>>> > > hint 0x40000000 got 0x40000000.
>>> > hint 0x40000000 got 0xffffffffffffffff.
>>> > hint 0x40000000 got 0xffffffffffffffff.
>>> > > output on x86:
>>> > > hint 0x40000000 got 0x40000000.
>>> > hint 0x40000000 got 0x7f9171363000.
>>> > hint 0x40000000 got 0x7f9171362000.
>>> > > It may need to implement a special arch_get_unmapped_area and
>>> > arch_get_unmapped_area_topdown function.
>>> >
>>> This is because hint address < rnd_gap. I have tried to let mmap_base =
>>> min((_addr + len), (base) + TASK_SIZE - DEFAULT_MAP_WINDOW). However it
>>> does not work for bottom-up while ulimit -s is unlimited. You said this
>>> behavior is expected from patch v2 review. However it brings a new
>>> regression even on sv39 systems.
>>>
>>> I still don't know the reason why use addr+len as the upper-bound. I
>>> think solution like x86/arm64/powerpc provide two address space switch
>>> based on whether hint address above the default map window is enough.
>>>
>>
>> Yep this is expected. It is up to the maintainers to decide.
>
> Sorry I forgot to reply to this, I had a buffer sitting around 
> somewhere but I must have lost it.
>
> I think Charlie's approach is the right way to go.  Putting my 
> userspace hat on, I'd much rather have my allocations fail rather than 
> silently ignore the hint when there's memory pressure.
>
> If there's some real use case that needs these low hints to be 
> silently ignored under VA pressure then we can try and figure 
> something out that makes those applications work.

I suspect this has broke chromium and all other software that depends on 
chromium on riscv64. See also 
https://github.com/riscv-forks/electron/issues/4

>
>>
>> - Charlie
>
> _______________________________________________
> linux-riscv mailing list
> linux-riscv@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-riscv
Levi Zim Aug. 19, 2024, 5:55 a.m. UTC | #6
On 2024-03-22 22:06, Palmer Dabbelt wrote:
> On Thu, 01 Feb 2024 18:28:06 PST (-0800), Charlie Jenkins wrote:
>> On Wed, Jan 31, 2024 at 11:59:43PM +0800, Yangyu Chen wrote:
>>> On Wed, 2024-01-31 at 22:41 +0800, Yangyu Chen wrote:
>>> > On Tue, 2024-01-30 at 17:07 -0800, Charlie Jenkins wrote:
>>> > > On riscv it is guaranteed that the address returned by mmap is less
>>> > > than
>>> > > the hint address. Allow mmap to return an address all the way up to
>>> > > addr, if provided, rather than just up to the lower address space.
>>> > > > > This provides a performance benefit as well, allowing mmap 
>>> to exit
>>> > > after
>>> > > checking that the address is in range rather than searching for a
>>> > > valid
>>> > > address.
>>> > > > > It is possible to provide an address that uses at most the same
>>> > > number
>>> > > of bits, however it is significantly more computationally expensive
>>> > > to
>>> > > provide that number rather than setting the max to be the hint
>>> > > address.
>>> > > There is the instruction clz/clzw in Zbb that returns the highest
>>> > > set
>>> > > bit
>>> > > which could be used to performantly implement this, but it would
>>> > > still
>>> > > be slower than the current implementation. At worst case, half of
>>> > > the
>>> > > address would not be able to be allocated when a hint address is
>>> > > provided.
>>> > > > > Signed-off-by: Charlie Jenkins <charlie@rivosinc.com>
>>> > > ---
>>> > >  arch/riscv/include/asm/processor.h | 27 +++++++++++---------------
>>> > > -
>>> > >  1 file changed, 11 insertions(+), 16 deletions(-)
>>> > > > > diff --git a/arch/riscv/include/asm/processor.h
>>> > > b/arch/riscv/include/asm/processor.h
>>> > > index f19f861cda54..8ece7a8f0e18 100644
>>> > > --- a/arch/riscv/include/asm/processor.h
>>> > > +++ b/arch/riscv/include/asm/processor.h
>>> > > @@ -14,22 +14,16 @@
>>> > >
>>> > >  #include <asm/ptrace.h>
>>> > >
>>> > > -#ifdef CONFIG_64BIT
>>> > > -#define DEFAULT_MAP_WINDOW    (UL(1) << (MMAP_VA_BITS - 1))
>>> > > -#define STACK_TOP_MAX        TASK_SIZE_64
>>> > > -
>>> > >  #define arch_get_mmap_end(addr, len, flags)            \
>>> > >  ({                                \
>>> > >      unsigned long
>>> > > mmap_end;                    \
>>> > >      typeof(addr) _addr = (addr);                \
>>> > > -    if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) &&
>>> > > is_compat_task())) \
>>> > > +    if ((_addr) == 0 ||                    \
>>> > > +        (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||    \
>>> > > +        ((_addr + len) > BIT(VA_BITS -
>>> > > 1)))            \
>>> > >          mmap_end = STACK_TOP_MAX;            \
>>> > > -    else if ((_addr) >= VA_USER_SV57) \
>>> > > -        mmap_end = STACK_TOP_MAX;            \
>>> > > -    else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >=
>>> > > VA_BITS_SV48)) \
>>> > > -        mmap_end = VA_USER_SV48;            \
>>> > >      else                            \
>>> > > -        mmap_end = VA_USER_SV39;            \
>>> > > +        mmap_end = (_addr + len);            \
>>> > >      mmap_end;                        \
>>> > >  })
>>> > >
>>> > > @@ -39,17 +33,18 @@
>>> > >      typeof(addr) _addr = (addr);                \
>>> > >      typeof(base) _base = (base);                \
>>> > >      unsigned long rnd_gap = DEFAULT_MAP_WINDOW - (_base);    \
>>> > > -    if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) &&
>>> > > is_compat_task())) \
>>> > > +    if ((_addr) == 0 ||                    \
>>> > > +        (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||    \
>>> > > +        ((_addr + len) > BIT(VA_BITS -
>>> > > 1)))            \
>>> > >          mmap_base = (_base);                \
>>> > > -    else if (((_addr) >= VA_USER_SV57) && (VA_BITS >=
>>> > > VA_BITS_SV57)) \
>>> > > -        mmap_base = VA_USER_SV57 - rnd_gap; \
>>> > > -    else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >=
>>> > > VA_BITS_SV48)) \
>>> > > -        mmap_base = VA_USER_SV48 - rnd_gap; \
>>> > >      else                            \
>>> > > -        mmap_base = VA_USER_SV39 - rnd_gap; \
>>> > > +        mmap_base = (_addr + len) - rnd_gap; \
>>> > >      mmap_base;                        \
>>> > >  })
>>> > >
>>> > > +#ifdef CONFIG_64BIT
>>> > > +#define DEFAULT_MAP_WINDOW    (UL(1) << (MMAP_VA_BITS - 1))
>>> > > +#define STACK_TOP_MAX        TASK_SIZE_64
>>> > >  #else
>>> > >  #define DEFAULT_MAP_WINDOW    TASK_SIZE
>>> > >  #define STACK_TOP_MAX        TASK_SIZE
>>> > > > > I have carefully tested your patch on qemu with sv57. A bug 
>>> that
>>> > needs
>>> > to be solved is that mmap with the same hint address without
>>> > MAP_FIXED
>>> > set will fail the second time.
>>> > > Userspace code to reproduce the bug:
>>> > > #include <sys/mman.h>
>>> > #include <stdio.h>
>>> > #include <stdint.h>
>>> > > void test(char *addr) {
>>> >     char *res = mmap(addr, 4096, PROT_READ | PROT_WRITE,
>>> > MAP_ANONYMOUS
>>> > > MAP_PRIVATE, -1, 0);
>>> >     printf("hint %p got %p.\n", addr, res);
>>> > }
>>> > > int main (void) {
>>> >     test(1<<30);
>>> >     test(1<<30);
>>> >     test(1<<30);
>>> >     return 0;
>>> > }
>>> > > output:
>>> > > hint 0x40000000 got 0x40000000.
>>> > hint 0x40000000 got 0xffffffffffffffff.
>>> > hint 0x40000000 got 0xffffffffffffffff.
>>> > > output on x86:
>>> > > hint 0x40000000 got 0x40000000.
>>> > hint 0x40000000 got 0x7f9171363000.
>>> > hint 0x40000000 got 0x7f9171362000.
>>> > > It may need to implement a special arch_get_unmapped_area and
>>> > arch_get_unmapped_area_topdown function.
>>> >
>>> This is because hint address < rnd_gap. I have tried to let mmap_base =
>>> min((_addr + len), (base) + TASK_SIZE - DEFAULT_MAP_WINDOW). However it
>>> does not work for bottom-up while ulimit -s is unlimited. You said this
>>> behavior is expected from patch v2 review. However it brings a new
>>> regression even on sv39 systems.
>>>
>>> I still don't know the reason why use addr+len as the upper-bound. I
>>> think solution like x86/arm64/powerpc provide two address space switch
>>> based on whether hint address above the default map window is enough.
>>>
>>
>> Yep this is expected. It is up to the maintainers to decide.
>
> Sorry I forgot to reply to this, I had a buffer sitting around 
> somewhere but I must have lost it.
>
> I think Charlie's approach is the right way to go.  Putting my 
> userspace hat on, I'd much rather have my allocations fail rather than 
> silently ignore the hint when there's memory pressure.
>
> If there's some real use case that needs these low hints to be 
> silently ignored under VA pressure then we can try and figure 
> something out that makes those applications work.

I could confirm that this patch has broken chromium's partition 
allocator on riscv64. The minimal reproduction I use is chromium-mmap.c:

#include <stdio.h>
#include <sys/mman.h>

int main() {
     void* expected = (void*)0x400000000;
     void* addr = mmap(expected, 17179869184, PROT_NONE, 
MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
     if (addr != expected) {
         printf("Not expected address: %p != %p\n", addr, expected);
     }
     expected = (void*)0x3fffff000;
     addr = mmap(expected, 17179873280, PROT_NONE, 
MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
     if (addr != expected) {
         printf("Not expected address: %p != %p\n", addr, expected);
     }
     return 0;
}

The second mmap fails with ENOMEM. Manually reverting this commit fixes 
the issue for me. So I think it's clearly a regression and breaks 
userspace.

See also https://github.com/riscv-forks/electron/issues/4

>>
>> - Charlie

Sincerely,
Levi
Charlie Jenkins Aug. 19, 2024, 5 p.m. UTC | #7
On Mon, Aug 19, 2024 at 01:55:57PM +0800, Levi Zim wrote:
> On 2024-03-22 22:06, Palmer Dabbelt wrote:
> > On Thu, 01 Feb 2024 18:28:06 PST (-0800), Charlie Jenkins wrote:
> > > On Wed, Jan 31, 2024 at 11:59:43PM +0800, Yangyu Chen wrote:
> > > > On Wed, 2024-01-31 at 22:41 +0800, Yangyu Chen wrote:
> > > > > On Tue, 2024-01-30 at 17:07 -0800, Charlie Jenkins wrote:
> > > > > > On riscv it is guaranteed that the address returned by mmap is less
> > > > > > than
> > > > > > the hint address. Allow mmap to return an address all the way up to
> > > > > > addr, if provided, rather than just up to the lower address space.
> > > > > > > > This provides a performance benefit as well, allowing
> > > > mmap to exit
> > > > > > after
> > > > > > checking that the address is in range rather than searching for a
> > > > > > valid
> > > > > > address.
> > > > > > > > It is possible to provide an address that uses at most the same
> > > > > > number
> > > > > > of bits, however it is significantly more computationally expensive
> > > > > > to
> > > > > > provide that number rather than setting the max to be the hint
> > > > > > address.
> > > > > > There is the instruction clz/clzw in Zbb that returns the highest
> > > > > > set
> > > > > > bit
> > > > > > which could be used to performantly implement this, but it would
> > > > > > still
> > > > > > be slower than the current implementation. At worst case, half of
> > > > > > the
> > > > > > address would not be able to be allocated when a hint address is
> > > > > > provided.
> > > > > > > > Signed-off-by: Charlie Jenkins <charlie@rivosinc.com>
> > > > > > ---
> > > > > >  arch/riscv/include/asm/processor.h | 27 +++++++++++---------------
> > > > > > -
> > > > > >  1 file changed, 11 insertions(+), 16 deletions(-)
> > > > > > > > diff --git a/arch/riscv/include/asm/processor.h
> > > > > > b/arch/riscv/include/asm/processor.h
> > > > > > index f19f861cda54..8ece7a8f0e18 100644
> > > > > > --- a/arch/riscv/include/asm/processor.h
> > > > > > +++ b/arch/riscv/include/asm/processor.h
> > > > > > @@ -14,22 +14,16 @@
> > > > > >
> > > > > >  #include <asm/ptrace.h>
> > > > > >
> > > > > > -#ifdef CONFIG_64BIT
> > > > > > -#define DEFAULT_MAP_WINDOW    (UL(1) << (MMAP_VA_BITS - 1))
> > > > > > -#define STACK_TOP_MAX        TASK_SIZE_64
> > > > > > -
> > > > > >  #define arch_get_mmap_end(addr, len, flags)            \
> > > > > >  ({                                \
> > > > > >      unsigned long
> > > > > > mmap_end;                    \
> > > > > >      typeof(addr) _addr = (addr);                \
> > > > > > -    if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) &&
> > > > > > is_compat_task())) \
> > > > > > +    if ((_addr) == 0 ||                    \
> > > > > > +        (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||    \
> > > > > > +        ((_addr + len) > BIT(VA_BITS -
> > > > > > 1)))            \
> > > > > >          mmap_end = STACK_TOP_MAX;            \
> > > > > > -    else if ((_addr) >= VA_USER_SV57) \
> > > > > > -        mmap_end = STACK_TOP_MAX;            \
> > > > > > -    else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >=
> > > > > > VA_BITS_SV48)) \
> > > > > > -        mmap_end = VA_USER_SV48;            \
> > > > > >      else                            \
> > > > > > -        mmap_end = VA_USER_SV39;            \
> > > > > > +        mmap_end = (_addr + len);            \
> > > > > >      mmap_end;                        \
> > > > > >  })
> > > > > >
> > > > > > @@ -39,17 +33,18 @@
> > > > > >      typeof(addr) _addr = (addr);                \
> > > > > >      typeof(base) _base = (base);                \
> > > > > >      unsigned long rnd_gap = DEFAULT_MAP_WINDOW - (_base);    \
> > > > > > -    if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) &&
> > > > > > is_compat_task())) \
> > > > > > +    if ((_addr) == 0 ||                    \
> > > > > > +        (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||    \
> > > > > > +        ((_addr + len) > BIT(VA_BITS -
> > > > > > 1)))            \
> > > > > >          mmap_base = (_base);                \
> > > > > > -    else if (((_addr) >= VA_USER_SV57) && (VA_BITS >=
> > > > > > VA_BITS_SV57)) \
> > > > > > -        mmap_base = VA_USER_SV57 - rnd_gap; \
> > > > > > -    else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >=
> > > > > > VA_BITS_SV48)) \
> > > > > > -        mmap_base = VA_USER_SV48 - rnd_gap; \
> > > > > >      else                            \
> > > > > > -        mmap_base = VA_USER_SV39 - rnd_gap; \
> > > > > > +        mmap_base = (_addr + len) - rnd_gap; \
> > > > > >      mmap_base;                        \
> > > > > >  })
> > > > > >
> > > > > > +#ifdef CONFIG_64BIT
> > > > > > +#define DEFAULT_MAP_WINDOW    (UL(1) << (MMAP_VA_BITS - 1))
> > > > > > +#define STACK_TOP_MAX        TASK_SIZE_64
> > > > > >  #else
> > > > > >  #define DEFAULT_MAP_WINDOW    TASK_SIZE
> > > > > >  #define STACK_TOP_MAX        TASK_SIZE
> > > > > > > > I have carefully tested your patch on qemu with sv57. A
> > > > bug that
> > > > > needs
> > > > > to be solved is that mmap with the same hint address without
> > > > > MAP_FIXED
> > > > > set will fail the second time.
> > > > > > Userspace code to reproduce the bug:
> > > > > > #include <sys/mman.h>
> > > > > #include <stdio.h>
> > > > > #include <stdint.h>
> > > > > > void test(char *addr) {
> > > > >     char *res = mmap(addr, 4096, PROT_READ | PROT_WRITE,
> > > > > MAP_ANONYMOUS
> > > > > > MAP_PRIVATE, -1, 0);
> > > > >     printf("hint %p got %p.\n", addr, res);
> > > > > }
> > > > > > int main (void) {
> > > > >     test(1<<30);
> > > > >     test(1<<30);
> > > > >     test(1<<30);
> > > > >     return 0;
> > > > > }
> > > > > > output:
> > > > > > hint 0x40000000 got 0x40000000.
> > > > > hint 0x40000000 got 0xffffffffffffffff.
> > > > > hint 0x40000000 got 0xffffffffffffffff.
> > > > > > output on x86:
> > > > > > hint 0x40000000 got 0x40000000.
> > > > > hint 0x40000000 got 0x7f9171363000.
> > > > > hint 0x40000000 got 0x7f9171362000.
> > > > > > It may need to implement a special arch_get_unmapped_area and
> > > > > arch_get_unmapped_area_topdown function.
> > > > >
> > > > This is because hint address < rnd_gap. I have tried to let mmap_base =
> > > > min((_addr + len), (base) + TASK_SIZE - DEFAULT_MAP_WINDOW). However it
> > > > does not work for bottom-up while ulimit -s is unlimited. You said this
> > > > behavior is expected from patch v2 review. However it brings a new
> > > > regression even on sv39 systems.
> > > > 
> > > > I still don't know the reason why use addr+len as the upper-bound. I
> > > > think solution like x86/arm64/powerpc provide two address space switch
> > > > based on whether hint address above the default map window is enough.
> > > > 
> > > 
> > > Yep this is expected. It is up to the maintainers to decide.
> > 
> > Sorry I forgot to reply to this, I had a buffer sitting around somewhere
> > but I must have lost it.
> > 
> > I think Charlie's approach is the right way to go.  Putting my userspace
> > hat on, I'd much rather have my allocations fail rather than silently
> > ignore the hint when there's memory pressure.
> > 
> > If there's some real use case that needs these low hints to be silently
> > ignored under VA pressure then we can try and figure something out that
> > makes those applications work.
> 
> I could confirm that this patch has broken chromium's partition allocator on
> riscv64. The minimal reproduction I use is chromium-mmap.c:
> 
> #include <stdio.h>
> #include <sys/mman.h>
> 
> int main() {
>     void* expected = (void*)0x400000000;
>     void* addr = mmap(expected, 17179869184, PROT_NONE,
> MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
>     if (addr != expected) {

It is not valid to assume that the address returned by mmap will be the
hint address. If the hint address is not available, mmap will return a
different address.

>         printf("Not expected address: %p != %p\n", addr, expected);
>     }
>     expected = (void*)0x3fffff000;
>     addr = mmap(expected, 17179873280, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS,
> -1, 0);
>     if (addr != expected) {
>         printf("Not expected address: %p != %p\n", addr, expected);
>     }
>     return 0;
> }
> 
> The second mmap fails with ENOMEM. Manually reverting this commit fixes the
> issue for me. So I think it's clearly a regression and breaks userspace.
> 

The issue here is that overlapping memory is being requested. This
second mmap will never be able to provide an address at 0x3fffff000 with
a size of 0x400001000 since mmap just provided an address at 0x400000000
with a size of 0x400000000.

Before this patch, this request causes mmap to return a completely
arbitrary value. There is no reason to use a hint address in this manner
because the hint can never be respected. Since an arbitrary address is
desired, a hint of zero should be used.

This patch causes the behavior to be more deterministic. Instead of
providing an arbitrary address, it causes the address to be less than or
equal to the hint address. This allows for applications to make
assumptions about the returned address.

This code is unfortunately relying on the previously mostly undefined
behavior of the hint address in mmap. The goal of this patch is to help
developers have more consistent mmap behavior, but maybe it is necessary
to hide this behavior behind an mmap flag.

- Charlie

> See also https://github.com/riscv-forks/electron/issues/4
> 
> > > 
> > > - Charlie
> 
> Sincerely,
> Levi
>
Levi Zim Aug. 20, 2024, 1:58 a.m. UTC | #8
On 2024-08-20 01:00, Charlie Jenkins wrote:
> On Mon, Aug 19, 2024 at 01:55:57PM +0800, Levi Zim wrote:
>> On 2024-03-22 22:06, Palmer Dabbelt wrote:
>>> On Thu, 01 Feb 2024 18:28:06 PST (-0800), Charlie Jenkins wrote:
>>>> On Wed, Jan 31, 2024 at 11:59:43PM +0800, Yangyu Chen wrote:
>>>>> On Wed, 2024-01-31 at 22:41 +0800, Yangyu Chen wrote:
>>>>>> On Tue, 2024-01-30 at 17:07 -0800, Charlie Jenkins wrote:
>>>>>>> On riscv it is guaranteed that the address returned by mmap is less
>>>>>>> than
>>>>>>> the hint address. Allow mmap to return an address all the way up to
>>>>>>> addr, if provided, rather than just up to the lower address space.
>>>>>>>>> This provides a performance benefit as well, allowing
>>>>> mmap to exit
>>>>>>> after
>>>>>>> checking that the address is in range rather than searching for a
>>>>>>> valid
>>>>>>> address.
>>>>>>>>> It is possible to provide an address that uses at most the same
>>>>>>> number
>>>>>>> of bits, however it is significantly more computationally expensive
>>>>>>> to
>>>>>>> provide that number rather than setting the max to be the hint
>>>>>>> address.
>>>>>>> There is the instruction clz/clzw in Zbb that returns the highest
>>>>>>> set
>>>>>>> bit
>>>>>>> which could be used to performantly implement this, but it would
>>>>>>> still
>>>>>>> be slower than the current implementation. At worst case, half of
>>>>>>> the
>>>>>>> address would not be able to be allocated when a hint address is
>>>>>>> provided.
>>>>>>>>> Signed-off-by: Charlie Jenkins<charlie@rivosinc.com>
>>>>>>> ---
>>>>>>>   arch/riscv/include/asm/processor.h | 27 +++++++++++---------------
>>>>>>> -
>>>>>>>   1 file changed, 11 insertions(+), 16 deletions(-)
>>>>>>>>> diff --git a/arch/riscv/include/asm/processor.h
>>>>>>> b/arch/riscv/include/asm/processor.h
>>>>>>> index f19f861cda54..8ece7a8f0e18 100644
>>>>>>> --- a/arch/riscv/include/asm/processor.h
>>>>>>> +++ b/arch/riscv/include/asm/processor.h
>>>>>>> @@ -14,22 +14,16 @@
>>>>>>>
>>>>>>>   #include <asm/ptrace.h>
>>>>>>>
>>>>>>> -#ifdef CONFIG_64BIT
>>>>>>> -#define DEFAULT_MAP_WINDOW    (UL(1) << (MMAP_VA_BITS - 1))
>>>>>>> -#define STACK_TOP_MAX        TASK_SIZE_64
>>>>>>> -
>>>>>>>   #define arch_get_mmap_end(addr, len, flags)            \
>>>>>>>   ({                                \
>>>>>>>       unsigned long
>>>>>>> mmap_end;                    \
>>>>>>>       typeof(addr) _addr = (addr);                \
>>>>>>> -    if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) &&
>>>>>>> is_compat_task())) \
>>>>>>> +    if ((_addr) == 0 ||                    \
>>>>>>> +        (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||    \
>>>>>>> +        ((_addr + len) > BIT(VA_BITS -
>>>>>>> 1)))            \
>>>>>>>           mmap_end = STACK_TOP_MAX;            \
>>>>>>> -    else if ((_addr) >= VA_USER_SV57) \
>>>>>>> -        mmap_end = STACK_TOP_MAX;            \
>>>>>>> -    else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >=
>>>>>>> VA_BITS_SV48)) \
>>>>>>> -        mmap_end = VA_USER_SV48;            \
>>>>>>>       else                            \
>>>>>>> -        mmap_end = VA_USER_SV39;            \
>>>>>>> +        mmap_end = (_addr + len);            \
>>>>>>>       mmap_end;                        \
>>>>>>>   })
>>>>>>>
>>>>>>> @@ -39,17 +33,18 @@
>>>>>>>       typeof(addr) _addr = (addr);                \
>>>>>>>       typeof(base) _base = (base);                \
>>>>>>>       unsigned long rnd_gap = DEFAULT_MAP_WINDOW - (_base);    \
>>>>>>> -    if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) &&
>>>>>>> is_compat_task())) \
>>>>>>> +    if ((_addr) == 0 ||                    \
>>>>>>> +        (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||    \
>>>>>>> +        ((_addr + len) > BIT(VA_BITS -
>>>>>>> 1)))            \
>>>>>>>           mmap_base = (_base);                \
>>>>>>> -    else if (((_addr) >= VA_USER_SV57) && (VA_BITS >=
>>>>>>> VA_BITS_SV57)) \
>>>>>>> -        mmap_base = VA_USER_SV57 - rnd_gap; \
>>>>>>> -    else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >=
>>>>>>> VA_BITS_SV48)) \
>>>>>>> -        mmap_base = VA_USER_SV48 - rnd_gap; \
>>>>>>>       else                            \
>>>>>>> -        mmap_base = VA_USER_SV39 - rnd_gap; \
>>>>>>> +        mmap_base = (_addr + len) - rnd_gap; \
>>>>>>>       mmap_base;                        \
>>>>>>>   })
>>>>>>>
>>>>>>> +#ifdef CONFIG_64BIT
>>>>>>> +#define DEFAULT_MAP_WINDOW    (UL(1) << (MMAP_VA_BITS - 1))
>>>>>>> +#define STACK_TOP_MAX        TASK_SIZE_64
>>>>>>>   #else
>>>>>>>   #define DEFAULT_MAP_WINDOW    TASK_SIZE
>>>>>>>   #define STACK_TOP_MAX        TASK_SIZE
>>>>>>>>> I have carefully tested your patch on qemu with sv57. A
>>>>> bug that
>>>>>> needs
>>>>>> to be solved is that mmap with the same hint address without
>>>>>> MAP_FIXED
>>>>>> set will fail the second time.
>>>>>>> Userspace code to reproduce the bug:
>>>>>>> #include <sys/mman.h>
>>>>>> #include <stdio.h>
>>>>>> #include <stdint.h>
>>>>>>> void test(char *addr) {
>>>>>>      char *res = mmap(addr, 4096, PROT_READ | PROT_WRITE,
>>>>>> MAP_ANONYMOUS
>>>>>>> MAP_PRIVATE, -1, 0);
>>>>>>      printf("hint %p got %p.\n", addr, res);
>>>>>> }
>>>>>>> int main (void) {
>>>>>>      test(1<<30);
>>>>>>      test(1<<30);
>>>>>>      test(1<<30);
>>>>>>      return 0;
>>>>>> }
>>>>>>> output:
>>>>>>> hint 0x40000000 got 0x40000000.
>>>>>> hint 0x40000000 got 0xffffffffffffffff.
>>>>>> hint 0x40000000 got 0xffffffffffffffff.
>>>>>>> output on x86:
>>>>>>> hint 0x40000000 got 0x40000000.
>>>>>> hint 0x40000000 got 0x7f9171363000.
>>>>>> hint 0x40000000 got 0x7f9171362000.
>>>>>>> It may need to implement a special arch_get_unmapped_area and
>>>>>> arch_get_unmapped_area_topdown function.
>>>>>>
>>>>> This is because hint address < rnd_gap. I have tried to let mmap_base =
>>>>> min((_addr + len), (base) + TASK_SIZE - DEFAULT_MAP_WINDOW). However it
>>>>> does not work for bottom-up while ulimit -s is unlimited. You said this
>>>>> behavior is expected from patch v2 review. However it brings a new
>>>>> regression even on sv39 systems.
>>>>>
>>>>> I still don't know the reason why use addr+len as the upper-bound. I
>>>>> think solution like x86/arm64/powerpc provide two address space switch
>>>>> based on whether hint address above the default map window is enough.
>>>>>
>>>> Yep this is expected. It is up to the maintainers to decide.
>>> Sorry I forgot to reply to this, I had a buffer sitting around somewhere
>>> but I must have lost it.
>>>
>>> I think Charlie's approach is the right way to go.  Putting my userspace
>>> hat on, I'd much rather have my allocations fail rather than silently
>>> ignore the hint when there's memory pressure.
>>>
>>> If there's some real use case that needs these low hints to be silently
>>> ignored under VA pressure then we can try and figure something out that
>>> makes those applications work.
>> I could confirm that this patch has broken chromium's partition allocator on
>> riscv64. The minimal reproduction I use is chromium-mmap.c:
>>
>> #include <stdio.h>
>> #include <sys/mman.h>
>>
>> int main() {
>>      void* expected = (void*)0x400000000;
>>      void* addr = mmap(expected, 17179869184, PROT_NONE,
>> MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
>>      if (addr != expected) {
> It is not valid to assume that the address returned by mmap will be the
> hint address. If the hint address is not available, mmap will return a
> different address.

Oh, sorry I didn't make it clear what is the expected behavior.
The printf here is solely for debugging purpose and I don't mean that
chromium expect it will get the hint address. The expected behavior is 
that both the two mmap calls will succeed.

>>          printf("Not expected address: %p != %p\n", addr, expected);
>>      }
>>      expected = (void*)0x3fffff000;
>>      addr = mmap(expected, 17179873280, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS,
>> -1, 0);
>>      if (addr != expected) {
>>          printf("Not expected address: %p != %p\n", addr, expected);
>>      }
>>      return 0;
>> }
>>
>> The second mmap fails with ENOMEM. Manually reverting this commit fixes the
>> issue for me. So I think it's clearly a regression and breaks userspace.
>>
> The issue here is that overlapping memory is being requested. This
> second mmap will never be able to provide an address at 0x3fffff000 with
> a size of 0x400001000 since mmap just provided an address at 0x400000000
> with a size of 0x400000000.
>
> Before this patch, this request causes mmap to return a completely
> arbitrary value. There is no reason to use a hint address in this manner
> because the hint can never be respected. Since an arbitrary address is
> desired, a hint of zero should be used.
>
> This patch causes the behavior to be more deterministic. Instead of
> providing an arbitrary address, it causes the address to be less than or
> equal to the hint address. This allows for applications to make
> assumptions about the returned address.

About the overlap, of course the partition allocator's request for 
overlapped vma seems unreasonable.

But I still don't quite understand why mmap cannot use an address higher 
than the hint address.
The hint address, after all, is a hint, not a requirement.

Quoting the man page:

>    If another mapping already exists there, the kernel picks
>         a new address that may or may not depend on the hint.  The
>         address of the new mapping is returned as the result of the call.

So for casual programmers that only reads man page but not architecture 
specific kernel documentation, the current behavior of mmap on riscv64 
failing on overlapped address ranges are quite surprising IMO.

And quoting the man page again about the errno:

>       ENOMEM No memory is available.
>
>       ENOMEM The process's maximum number of mappings would have been
>              exceeded.  This error can also occur for munmap(), when
>              unmapping a region in the middle of an existing mapping,
>              since this results in two smaller mappings on either side
>              of the region being unmapped.
>
>       ENOMEM (since Linux 4.7) The process's RLIMIT_DATA limit,
>              described in getrlimit(2), would have been exceeded.
>
>       ENOMEM We don't like addr, because it exceeds the virtual address
>              space of the CPU.
>

There's no matching description for the ENOMEM returned here.
I would suggest removing "because it exceeds the virtual address
space of the CPU." from the last item if the ENOMEM behavior here
is expected.

> This code is unfortunately relying on the previously mostly undefined
> behavior of the hint address in mmap.

Although I haven't read the code of chromium's partition allocator to 
judge whether it should
be improved or fixed for riscv64, I do know that the kernel "don't break 
userspace" and "never EVER blame the user programs".

> The goal of this patch is to help
> developers have more consistent mmap behavior, but maybe it is necessary
> to hide this behavior behind an mmap flag.

Thank you for helping to shape a more consistent mmap behavior.
I think this should be fixed ASAP either by allowing the hint address to 
be ignored
(as suggested by the Linux man page), or hide this behavior behind an 
mmap flag as you said.

> - Charlie
>
>> See alsohttps://github.com/riscv-forks/electron/issues/4
>>
>>>> - Charlie
>> Sincerely,
>> Levi
>>

I accidentally introduced some HTML into this reply so this reply is 
resent as plain text.

Sincerely,
Levi
Charlie Jenkins Aug. 20, 2024, 2:43 a.m. UTC | #9
On Tue, Aug 20, 2024 at 09:48:50AM +0800, Levi Zim wrote:
> On 2024-08-20 01:00, Charlie Jenkins wrote:
> > On Mon, Aug 19, 2024 at 01:55:57PM +0800, Levi Zim wrote:
> > > On 2024-03-22 22:06, Palmer Dabbelt wrote:
> > > > On Thu, 01 Feb 2024 18:28:06 PST (-0800), Charlie Jenkins wrote:
> > > > > On Wed, Jan 31, 2024 at 11:59:43PM +0800, Yangyu Chen wrote:
> > > > > > On Wed, 2024-01-31 at 22:41 +0800, Yangyu Chen wrote:
> > > > > > > On Tue, 2024-01-30 at 17:07 -0800, Charlie Jenkins wrote:
> > > > > > > > On riscv it is guaranteed that the address returned by mmap is less
> > > > > > > > than
> > > > > > > > the hint address. Allow mmap to return an address all the way up to
> > > > > > > > addr, if provided, rather than just up to the lower address space.
> > > > > > > > > > This provides a performance benefit as well, allowing
> > > > > > mmap to exit
> > > > > > > > after
> > > > > > > > checking that the address is in range rather than searching for a
> > > > > > > > valid
> > > > > > > > address.
> > > > > > > > > > It is possible to provide an address that uses at most the same
> > > > > > > > number
> > > > > > > > of bits, however it is significantly more computationally expensive
> > > > > > > > to
> > > > > > > > provide that number rather than setting the max to be the hint
> > > > > > > > address.
> > > > > > > > There is the instruction clz/clzw in Zbb that returns the highest
> > > > > > > > set
> > > > > > > > bit
> > > > > > > > which could be used to performantly implement this, but it would
> > > > > > > > still
> > > > > > > > be slower than the current implementation. At worst case, half of
> > > > > > > > the
> > > > > > > > address would not be able to be allocated when a hint address is
> > > > > > > > provided.
> > > > > > > > > > Signed-off-by: Charlie Jenkins<charlie@rivosinc.com>
> > > > > > > > ---
> > > > > > > >   arch/riscv/include/asm/processor.h | 27 +++++++++++---------------
> > > > > > > > -
> > > > > > > >   1 file changed, 11 insertions(+), 16 deletions(-)
> > > > > > > > > > diff --git a/arch/riscv/include/asm/processor.h
> > > > > > > > b/arch/riscv/include/asm/processor.h
> > > > > > > > index f19f861cda54..8ece7a8f0e18 100644
> > > > > > > > --- a/arch/riscv/include/asm/processor.h
> > > > > > > > +++ b/arch/riscv/include/asm/processor.h
> > > > > > > > @@ -14,22 +14,16 @@
> > > > > > > > 
> > > > > > > >   #include <asm/ptrace.h>
> > > > > > > > 
> > > > > > > > -#ifdef CONFIG_64BIT
> > > > > > > > -#define DEFAULT_MAP_WINDOW    (UL(1) << (MMAP_VA_BITS - 1))
> > > > > > > > -#define STACK_TOP_MAX        TASK_SIZE_64
> > > > > > > > -
> > > > > > > >   #define arch_get_mmap_end(addr, len, flags)            \
> > > > > > > >   ({                                \
> > > > > > > >       unsigned long
> > > > > > > > mmap_end;                    \
> > > > > > > >       typeof(addr) _addr = (addr);                \
> > > > > > > > -    if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) &&
> > > > > > > > is_compat_task())) \
> > > > > > > > +    if ((_addr) == 0 ||                    \
> > > > > > > > +        (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||    \
> > > > > > > > +        ((_addr + len) > BIT(VA_BITS -
> > > > > > > > 1)))            \
> > > > > > > >           mmap_end = STACK_TOP_MAX;            \
> > > > > > > > -    else if ((_addr) >= VA_USER_SV57) \
> > > > > > > > -        mmap_end = STACK_TOP_MAX;            \
> > > > > > > > -    else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >=
> > > > > > > > VA_BITS_SV48)) \
> > > > > > > > -        mmap_end = VA_USER_SV48;            \
> > > > > > > >       else                            \
> > > > > > > > -        mmap_end = VA_USER_SV39;            \
> > > > > > > > +        mmap_end = (_addr + len);            \
> > > > > > > >       mmap_end;                        \
> > > > > > > >   })
> > > > > > > > 
> > > > > > > > @@ -39,17 +33,18 @@
> > > > > > > >       typeof(addr) _addr = (addr);                \
> > > > > > > >       typeof(base) _base = (base);                \
> > > > > > > >       unsigned long rnd_gap = DEFAULT_MAP_WINDOW - (_base);    \
> > > > > > > > -    if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) &&
> > > > > > > > is_compat_task())) \
> > > > > > > > +    if ((_addr) == 0 ||                    \
> > > > > > > > +        (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||    \
> > > > > > > > +        ((_addr + len) > BIT(VA_BITS -
> > > > > > > > 1)))            \
> > > > > > > >           mmap_base = (_base);                \
> > > > > > > > -    else if (((_addr) >= VA_USER_SV57) && (VA_BITS >=
> > > > > > > > VA_BITS_SV57)) \
> > > > > > > > -        mmap_base = VA_USER_SV57 - rnd_gap; \
> > > > > > > > -    else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >=
> > > > > > > > VA_BITS_SV48)) \
> > > > > > > > -        mmap_base = VA_USER_SV48 - rnd_gap; \
> > > > > > > >       else                            \
> > > > > > > > -        mmap_base = VA_USER_SV39 - rnd_gap; \
> > > > > > > > +        mmap_base = (_addr + len) - rnd_gap; \
> > > > > > > >       mmap_base;                        \
> > > > > > > >   })
> > > > > > > > 
> > > > > > > > +#ifdef CONFIG_64BIT
> > > > > > > > +#define DEFAULT_MAP_WINDOW    (UL(1) << (MMAP_VA_BITS - 1))
> > > > > > > > +#define STACK_TOP_MAX        TASK_SIZE_64
> > > > > > > >   #else
> > > > > > > >   #define DEFAULT_MAP_WINDOW    TASK_SIZE
> > > > > > > >   #define STACK_TOP_MAX        TASK_SIZE
> > > > > > > > > > I have carefully tested your patch on qemu with sv57. A
> > > > > > bug that
> > > > > > > needs
> > > > > > > to be solved is that mmap with the same hint address without
> > > > > > > MAP_FIXED
> > > > > > > set will fail the second time.
> > > > > > > > Userspace code to reproduce the bug:
> > > > > > > > #include <sys/mman.h>
> > > > > > > #include <stdio.h>
> > > > > > > #include <stdint.h>
> > > > > > > > void test(char *addr) {
> > > > > > >      char *res = mmap(addr, 4096, PROT_READ | PROT_WRITE,
> > > > > > > MAP_ANONYMOUS
> > > > > > > > MAP_PRIVATE, -1, 0);
> > > > > > >      printf("hint %p got %p.\n", addr, res);
> > > > > > > }
> > > > > > > > int main (void) {
> > > > > > >      test(1<<30);
> > > > > > >      test(1<<30);
> > > > > > >      test(1<<30);
> > > > > > >      return 0;
> > > > > > > }
> > > > > > > > output:
> > > > > > > > hint 0x40000000 got 0x40000000.
> > > > > > > hint 0x40000000 got 0xffffffffffffffff.
> > > > > > > hint 0x40000000 got 0xffffffffffffffff.
> > > > > > > > output on x86:
> > > > > > > > hint 0x40000000 got 0x40000000.
> > > > > > > hint 0x40000000 got 0x7f9171363000.
> > > > > > > hint 0x40000000 got 0x7f9171362000.
> > > > > > > > It may need to implement a special arch_get_unmapped_area and
> > > > > > > arch_get_unmapped_area_topdown function.
> > > > > > > 
> > > > > > This is because hint address < rnd_gap. I have tried to let mmap_base =
> > > > > > min((_addr + len), (base) + TASK_SIZE - DEFAULT_MAP_WINDOW). However it
> > > > > > does not work for bottom-up while ulimit -s is unlimited. You said this
> > > > > > behavior is expected from patch v2 review. However it brings a new
> > > > > > regression even on sv39 systems.
> > > > > > 
> > > > > > I still don't know the reason why use addr+len as the upper-bound. I
> > > > > > think solution like x86/arm64/powerpc provide two address space switch
> > > > > > based on whether hint address above the default map window is enough.
> > > > > > 
> > > > > Yep this is expected. It is up to the maintainers to decide.
> > > > Sorry I forgot to reply to this, I had a buffer sitting around somewhere
> > > > but I must have lost it.
> > > > 
> > > > I think Charlie's approach is the right way to go.  Putting my userspace
> > > > hat on, I'd much rather have my allocations fail rather than silently
> > > > ignore the hint when there's memory pressure.
> > > > 
> > > > If there's some real use case that needs these low hints to be silently
> > > > ignored under VA pressure then we can try and figure something out that
> > > > makes those applications work.
> > > I could confirm that this patch has broken chromium's partition allocator on
> > > riscv64. The minimal reproduction I use is chromium-mmap.c:
> > > 
> > > #include <stdio.h>
> > > #include <sys/mman.h>
> > > 
> > > int main() {
> > >      void* expected = (void*)0x400000000;
> > >      void* addr = mmap(expected, 17179869184, PROT_NONE,
> > > MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
> > >      if (addr != expected) {
> > It is not valid to assume that the address returned by mmap will be the
> > hint address. If the hint address is not available, mmap will return a
> > different address.
> 
> Oh, sorry I didn't make it clear what is the expected behavior.
> The printf here is solely for debugging purpose and I don't mean that
> chromium expect it will get the hint address. The expected behavior is that
> both the two mmap calls will succeed.
> 
> > >          printf("Not expected address: %p != %p\n", addr, expected);
> > >      }
> > >      expected = (void*)0x3fffff000;
> > >      addr = mmap(expected, 17179873280, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS,
> > > -1, 0);
> > >      if (addr != expected) {
> > >          printf("Not expected address: %p != %p\n", addr, expected);
> > >      }
> > >      return 0;
> > > }
> > > 
> > > The second mmap fails with ENOMEM. Manually reverting this commit fixes the
> > > issue for me. So I think it's clearly a regression and breaks userspace.
> > > 
> > The issue here is that overlapping memory is being requested. This
> > second mmap will never be able to provide an address at 0x3fffff000 with
> > a size of 0x400001000 since mmap just provided an address at 0x400000000
> > with a size of 0x400000000.
> > 
> > Before this patch, this request causes mmap to return a completely
> > arbitrary value. There is no reason to use a hint address in this manner
> > because the hint can never be respected. Since an arbitrary address is
> > desired, a hint of zero should be used.
> > 
> > This patch causes the behavior to be more deterministic. Instead of
> > providing an arbitrary address, it causes the address to be less than or
> > equal to the hint address. This allows for applications to make
> > assumptions about the returned address.
> 
> About the overlap, of course the partition allocator's request for
> overlapped vma seems unreasonable.
> 
> But I still don't quite understand why mmap cannot use an address higher
> than the hint address.
> The hint address, after all, is a hint, not a requirement.

Yes that is fair. A "hint" that does not guarantee anything is
useless so architectures have abused the term quite a bit.

> 
> Quoting the man page:
> 
> >    If another mapping already exists there, the kernel picks
> >         a new address that may or may not depend on the hint.  The
> >         address of the new mapping is returned as the result of the call.
> So for casual programmers that only reads man page but not architecture
> specific kernel
> documentation, the current behavior of mmap on riscv64 failing on overlapped
> address ranges
> are quite surprising IMO.

The man pages for riscv are in desperate need of attention. I have
submitted a couple of updates to them recently, but there is a lot more
work to be done to help developers.

> 
> And quoting the man page again about the errno:
> 
> >       ENOMEM No memory is available.
> > 
> >       ENOMEM The process's maximum number of mappings would have been
> >              exceeded.  This error can also occur for munmap(), when
> >              unmapping a region in the middle of an existing mapping,
> >              since this results in two smaller mappings on either side
> >              of the region being unmapped.
> > 
> >       ENOMEM (since Linux 4.7) The process's RLIMIT_DATA limit,
> >              described in getrlimit(2), would have been exceeded.
> > 
> >       ENOMEM We don't like addr, because it exceeds the virtual address
> >              space of the CPU.
> > 
> 
> There's no matching description for the ENOMEM returned here.
> I would suggest removing "because it exceeds the virtual address
> space of the CPU." from the last item if the ENOMEM behavior here
> is expected.

This ENOMEM means something like "no memory available in the requested
region".

> 
> > This code is unfortunately relying on the previously mostly undefined
> > behavior of the hint address in mmap.
> Although I haven't read the code of chromium's partition allocator to judge
> whether it should
> be improved or fixed for riscv64, I do know that the kernel "don't break
> userspace" and
> "never EVER blame the user programs".

The hint address design of mmap is a tricky one because it is largely
implementation defined and what the man pages say is not how it is
implemented in most architectures!

> > The goal of this patch is to help
> > developers have more consistent mmap behavior, but maybe it is necessary
> > to hide this behavior behind an mmap flag.
> Thank you for helping to shape a more consistent mmap behavior.
> I think this should be fixed ASAP either by allowing the hint address to be
> ignored
> (as suggested by the Linux man page), or hide this behavior behind an mmap
> flag as you said.

Having a flag could also lead to a generic way of defining this
behavior. Other architectures do not provide a way for applications to
guarantee that some number of bits are left unused in a virtual address,
and that was one of the motivating design goals here.

- Charlie

> 
> > - Charlie
> > 
> > > See alsohttps://github.com/riscv-forks/electron/issues/4
> > > 
> > > > > - Charlie
> > > Sincerely,
> > > Levi
> > >
Palmer Dabbelt Aug. 21, 2024, 10:17 p.m. UTC | #10
On Mon, 19 Aug 2024 18:58:18 PDT (-0700), rsworktech@outlook.com wrote:
> On 2024-08-20 01:00, Charlie Jenkins wrote:
>> On Mon, Aug 19, 2024 at 01:55:57PM +0800, Levi Zim wrote:
>>> On 2024-03-22 22:06, Palmer Dabbelt wrote:
>>>> On Thu, 01 Feb 2024 18:28:06 PST (-0800), Charlie Jenkins wrote:
>>>>> On Wed, Jan 31, 2024 at 11:59:43PM +0800, Yangyu Chen wrote:
>>>>>> On Wed, 2024-01-31 at 22:41 +0800, Yangyu Chen wrote:
>>>>>>> On Tue, 2024-01-30 at 17:07 -0800, Charlie Jenkins wrote:
>>>>>>>> On riscv it is guaranteed that the address returned by mmap is less
>>>>>>>> than
>>>>>>>> the hint address. Allow mmap to return an address all the way up to
>>>>>>>> addr, if provided, rather than just up to the lower address space.
>>>>>>>>>> This provides a performance benefit as well, allowing
>>>>>> mmap to exit
>>>>>>>> after
>>>>>>>> checking that the address is in range rather than searching for a
>>>>>>>> valid
>>>>>>>> address.
>>>>>>>>>> It is possible to provide an address that uses at most the same
>>>>>>>> number
>>>>>>>> of bits, however it is significantly more computationally expensive
>>>>>>>> to
>>>>>>>> provide that number rather than setting the max to be the hint
>>>>>>>> address.
>>>>>>>> There is the instruction clz/clzw in Zbb that returns the highest
>>>>>>>> set
>>>>>>>> bit
>>>>>>>> which could be used to performantly implement this, but it would
>>>>>>>> still
>>>>>>>> be slower than the current implementation. At worst case, half of
>>>>>>>> the
>>>>>>>> address would not be able to be allocated when a hint address is
>>>>>>>> provided.
>>>>>>>>>> Signed-off-by: Charlie Jenkins<charlie@rivosinc.com>
>>>>>>>> ---
>>>>>>>>   arch/riscv/include/asm/processor.h | 27 +++++++++++---------------
>>>>>>>> -
>>>>>>>>   1 file changed, 11 insertions(+), 16 deletions(-)
>>>>>>>>>> diff --git a/arch/riscv/include/asm/processor.h
>>>>>>>> b/arch/riscv/include/asm/processor.h
>>>>>>>> index f19f861cda54..8ece7a8f0e18 100644
>>>>>>>> --- a/arch/riscv/include/asm/processor.h
>>>>>>>> +++ b/arch/riscv/include/asm/processor.h
>>>>>>>> @@ -14,22 +14,16 @@
>>>>>>>>
>>>>>>>>   #include <asm/ptrace.h>
>>>>>>>>
>>>>>>>> -#ifdef CONFIG_64BIT
>>>>>>>> -#define DEFAULT_MAP_WINDOW    (UL(1) << (MMAP_VA_BITS - 1))
>>>>>>>> -#define STACK_TOP_MAX        TASK_SIZE_64
>>>>>>>> -
>>>>>>>>   #define arch_get_mmap_end(addr, len, flags)            \
>>>>>>>>   ({                                \
>>>>>>>>       unsigned long
>>>>>>>> mmap_end;                    \
>>>>>>>>       typeof(addr) _addr = (addr);                \
>>>>>>>> -    if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) &&
>>>>>>>> is_compat_task())) \
>>>>>>>> +    if ((_addr) == 0 ||                    \
>>>>>>>> +        (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||    \
>>>>>>>> +        ((_addr + len) > BIT(VA_BITS -
>>>>>>>> 1)))            \
>>>>>>>>           mmap_end = STACK_TOP_MAX;            \
>>>>>>>> -    else if ((_addr) >= VA_USER_SV57) \
>>>>>>>> -        mmap_end = STACK_TOP_MAX;            \
>>>>>>>> -    else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >=
>>>>>>>> VA_BITS_SV48)) \
>>>>>>>> -        mmap_end = VA_USER_SV48;            \
>>>>>>>>       else                            \
>>>>>>>> -        mmap_end = VA_USER_SV39;            \
>>>>>>>> +        mmap_end = (_addr + len);            \
>>>>>>>>       mmap_end;                        \
>>>>>>>>   })
>>>>>>>>
>>>>>>>> @@ -39,17 +33,18 @@
>>>>>>>>       typeof(addr) _addr = (addr);                \
>>>>>>>>       typeof(base) _base = (base);                \
>>>>>>>>       unsigned long rnd_gap = DEFAULT_MAP_WINDOW - (_base);    \
>>>>>>>> -    if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) &&
>>>>>>>> is_compat_task())) \
>>>>>>>> +    if ((_addr) == 0 ||                    \
>>>>>>>> +        (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||    \
>>>>>>>> +        ((_addr + len) > BIT(VA_BITS -
>>>>>>>> 1)))            \
>>>>>>>>           mmap_base = (_base);                \
>>>>>>>> -    else if (((_addr) >= VA_USER_SV57) && (VA_BITS >=
>>>>>>>> VA_BITS_SV57)) \
>>>>>>>> -        mmap_base = VA_USER_SV57 - rnd_gap; \
>>>>>>>> -    else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >=
>>>>>>>> VA_BITS_SV48)) \
>>>>>>>> -        mmap_base = VA_USER_SV48 - rnd_gap; \
>>>>>>>>       else                            \
>>>>>>>> -        mmap_base = VA_USER_SV39 - rnd_gap; \
>>>>>>>> +        mmap_base = (_addr + len) - rnd_gap; \
>>>>>>>>       mmap_base;                        \
>>>>>>>>   })
>>>>>>>>
>>>>>>>> +#ifdef CONFIG_64BIT
>>>>>>>> +#define DEFAULT_MAP_WINDOW    (UL(1) << (MMAP_VA_BITS - 1))
>>>>>>>> +#define STACK_TOP_MAX        TASK_SIZE_64
>>>>>>>>   #else
>>>>>>>>   #define DEFAULT_MAP_WINDOW    TASK_SIZE
>>>>>>>>   #define STACK_TOP_MAX        TASK_SIZE
>>>>>>>>>> I have carefully tested your patch on qemu with sv57. A
>>>>>> bug that
>>>>>>> needs
>>>>>>> to be solved is that mmap with the same hint address without
>>>>>>> MAP_FIXED
>>>>>>> set will fail the second time.
>>>>>>>> Userspace code to reproduce the bug:
>>>>>>>> #include <sys/mman.h>
>>>>>>> #include <stdio.h>
>>>>>>> #include <stdint.h>
>>>>>>>> void test(char *addr) {
>>>>>>>      char *res = mmap(addr, 4096, PROT_READ | PROT_WRITE,
>>>>>>> MAP_ANONYMOUS
>>>>>>>> MAP_PRIVATE, -1, 0);
>>>>>>>      printf("hint %p got %p.\n", addr, res);
>>>>>>> }
>>>>>>>> int main (void) {
>>>>>>>      test(1<<30);
>>>>>>>      test(1<<30);
>>>>>>>      test(1<<30);
>>>>>>>      return 0;
>>>>>>> }
>>>>>>>> output:
>>>>>>>> hint 0x40000000 got 0x40000000.
>>>>>>> hint 0x40000000 got 0xffffffffffffffff.
>>>>>>> hint 0x40000000 got 0xffffffffffffffff.
>>>>>>>> output on x86:
>>>>>>>> hint 0x40000000 got 0x40000000.
>>>>>>> hint 0x40000000 got 0x7f9171363000.
>>>>>>> hint 0x40000000 got 0x7f9171362000.
>>>>>>>> It may need to implement a special arch_get_unmapped_area and
>>>>>>> arch_get_unmapped_area_topdown function.
>>>>>>>
>>>>>> This is because hint address < rnd_gap. I have tried to let mmap_base =
>>>>>> min((_addr + len), (base) + TASK_SIZE - DEFAULT_MAP_WINDOW). However it
>>>>>> does not work for bottom-up while ulimit -s is unlimited. You said this
>>>>>> behavior is expected from patch v2 review. However it brings a new
>>>>>> regression even on sv39 systems.
>>>>>>
>>>>>> I still don't know the reason why use addr+len as the upper-bound. I
>>>>>> think solution like x86/arm64/powerpc provide two address space switch
>>>>>> based on whether hint address above the default map window is enough.
>>>>>>
>>>>> Yep this is expected. It is up to the maintainers to decide.
>>>> Sorry I forgot to reply to this, I had a buffer sitting around somewhere
>>>> but I must have lost it.
>>>>
>>>> I think Charlie's approach is the right way to go.  Putting my userspace
>>>> hat on, I'd much rather have my allocations fail rather than silently
>>>> ignore the hint when there's memory pressure.
>>>>
>>>> If there's some real use case that needs these low hints to be silently
>>>> ignored under VA pressure then we can try and figure something out that
>>>> makes those applications work.
>>> I could confirm that this patch has broken chromium's partition allocator on
>>> riscv64. The minimal reproduction I use is chromium-mmap.c:
>>>
>>> #include <stdio.h>
>>> #include <sys/mman.h>
>>>
>>> int main() {
>>>      void* expected = (void*)0x400000000;
>>>      void* addr = mmap(expected, 17179869184, PROT_NONE,
>>> MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
>>>      if (addr != expected) {
>> It is not valid to assume that the address returned by mmap will be the
>> hint address. If the hint address is not available, mmap will return a
>> different address.
>
> Oh, sorry I didn't make it clear what is the expected behavior.
> The printf here is solely for debugging purpose and I don't mean that
> chromium expect it will get the hint address. The expected behavior is
> that both the two mmap calls will succeed.
>
>>>          printf("Not expected address: %p != %p\n", addr, expected);
>>>      }
>>>      expected = (void*)0x3fffff000;
>>>      addr = mmap(expected, 17179873280, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS,
>>> -1, 0);
>>>      if (addr != expected) {
>>>          printf("Not expected address: %p != %p\n", addr, expected);
>>>      }
>>>      return 0;
>>> }
>>>
>>> The second mmap fails with ENOMEM. Manually reverting this commit fixes the
>>> issue for me. So I think it's clearly a regression and breaks userspace.
>>>
>> The issue here is that overlapping memory is being requested. This
>> second mmap will never be able to provide an address at 0x3fffff000 with
>> a size of 0x400001000 since mmap just provided an address at 0x400000000
>> with a size of 0x400000000.
>>
>> Before this patch, this request causes mmap to return a completely
>> arbitrary value. There is no reason to use a hint address in this manner
>> because the hint can never be respected. Since an arbitrary address is
>> desired, a hint of zero should be used.
>>
>> This patch causes the behavior to be more deterministic. Instead of
>> providing an arbitrary address, it causes the address to be less than or
>> equal to the hint address. This allows for applications to make
>> assumptions about the returned address.
>
> About the overlap, of course the partition allocator's request for
> overlapped vma seems unreasonable.
>
> But I still don't quite understand why mmap cannot use an address higher
> than the hint address.
> The hint address, after all, is a hint, not a requirement.
>
> Quoting the man page:
>
>>    If another mapping already exists there, the kernel picks
>>         a new address that may or may not depend on the hint.  The
>>         address of the new mapping is returned as the result of the call.
>
> So for casual programmers that only reads man page but not architecture
> specific kernel documentation, the current behavior of mmap on riscv64
> failing on overlapped address ranges are quite surprising IMO.
>
> And quoting the man page again about the errno:
>
>>       ENOMEM No memory is available.
>>
>>       ENOMEM The process's maximum number of mappings would have been
>>              exceeded.  This error can also occur for munmap(), when
>>              unmapping a region in the middle of an existing mapping,
>>              since this results in two smaller mappings on either side
>>              of the region being unmapped.
>>
>>       ENOMEM (since Linux 4.7) The process's RLIMIT_DATA limit,
>>              described in getrlimit(2), would have been exceeded.
>>
>>       ENOMEM We don't like addr, because it exceeds the virtual address
>>              space of the CPU.
>>
>
> There's no matching description for the ENOMEM returned here.
> I would suggest removing "because it exceeds the virtual address
> space of the CPU." from the last item if the ENOMEM behavior here
> is expected.
>
>> This code is unfortunately relying on the previously mostly undefined
>> behavior of the hint address in mmap.
>
> Although I haven't read the code of chromium's partition allocator to
> judge whether it should
> be improved or fixed for riscv64, I do know that the kernel "don't break
> userspace" and "never EVER blame the user programs".

Ya, sorry for breaking stuff.

The goal here was to move to the mmap flag behavor similar to what arm64 
and x86 have, as that was done in a way that didn't appear to break 
userspace -- or at least any real userspace programs.  IIRC that first 
test was pretty broken (it actually depended on the hint address), but 
sounds like that's not the case.

I think maybe this is just luck: we didn't chunk the address space up, 
we're just hinting on every bit, so we're just more likely to hit the 
exhaustion.  Doesn't really matter, though, as if it's breaking stuff so 
we've got to deal with it.

Charlie and I are just talking, and best we can come up with is to move 
to the behavior where we fall back to larger allocation regions when 
there's no space in the smaller allocation region.  Charlie's going to 
try and throw together a patch for that, hopefully it'll sort things 
out.

>> The goal of this patch is to help
>> developers have more consistent mmap behavior, but maybe it is necessary
>> to hide this behavior behind an mmap flag.
>
> Thank you for helping to shape a more consistent mmap behavior.
> I think this should be fixed ASAP either by allowing the hint address to
> be ignored
> (as suggested by the Linux man page), or hide this behavior behind an
> mmap flag as you said.
>
>> - Charlie
>>
>>> See alsohttps://github.com/riscv-forks/electron/issues/4
>>>
>>>>> - Charlie
>>> Sincerely,
>>> Levi
>>>
>
> I accidentally introduced some HTML into this reply so this reply is
> resent as plain text.
>
> Sincerely,
> Levi
Yangyu Chen Aug. 22, 2024, 2:51 a.m. UTC | #11
> On Aug 22, 2024, at 06:17, Palmer Dabbelt <palmer@dabbelt.com> wrote:
> 
> On Mon, 19 Aug 2024 18:58:18 PDT (-0700), rsworktech@outlook.com wrote:
>> On 2024-08-20 01:00, Charlie Jenkins wrote:
>>> On Mon, Aug 19, 2024 at 01:55:57PM +0800, Levi Zim wrote:
>>>> On 2024-03-22 22:06, Palmer Dabbelt wrote:
>>>>> On Thu, 01 Feb 2024 18:28:06 PST (-0800), Charlie Jenkins wrote:
>>>>>> On Wed, Jan 31, 2024 at 11:59:43PM +0800, Yangyu Chen wrote:
>>>>>>> On Wed, 2024-01-31 at 22:41 +0800, Yangyu Chen wrote:
>>>>>>>> On Tue, 2024-01-30 at 17:07 -0800, Charlie Jenkins wrote:
>>>>>>>>> On riscv it is guaranteed that the address returned by mmap is less
>>>>>>>>> than
>>>>>>>>> the hint address. Allow mmap to return an address all the way up to
>>>>>>>>> addr, if provided, rather than just up to the lower address space.
>>>>>>>>>>> This provides a performance benefit as well, allowing
>>>>>>> mmap to exit
>>>>>>>>> after
>>>>>>>>> checking that the address is in range rather than searching for a
>>>>>>>>> valid
>>>>>>>>> address.
>>>>>>>>>>> It is possible to provide an address that uses at most the same
>>>>>>>>> number
>>>>>>>>> of bits, however it is significantly more computationally expensive
>>>>>>>>> to
>>>>>>>>> provide that number rather than setting the max to be the hint
>>>>>>>>> address.
>>>>>>>>> There is the instruction clz/clzw in Zbb that returns the highest
>>>>>>>>> set
>>>>>>>>> bit
>>>>>>>>> which could be used to performantly implement this, but it would
>>>>>>>>> still
>>>>>>>>> be slower than the current implementation. At worst case, half of
>>>>>>>>> the
>>>>>>>>> address would not be able to be allocated when a hint address is
>>>>>>>>> provided.
>>>>>>>>>>> Signed-off-by: Charlie Jenkins<charlie@rivosinc.com>
>>>>>>>>> ---
>>>>>>>>>  arch/riscv/include/asm/processor.h | 27 +++++++++++---------------
>>>>>>>>> -
>>>>>>>>>  1 file changed, 11 insertions(+), 16 deletions(-)
>>>>>>>>>>> diff --git a/arch/riscv/include/asm/processor.h
>>>>>>>>> b/arch/riscv/include/asm/processor.h
>>>>>>>>> index f19f861cda54..8ece7a8f0e18 100644
>>>>>>>>> --- a/arch/riscv/include/asm/processor.h
>>>>>>>>> +++ b/arch/riscv/include/asm/processor.h
>>>>>>>>> @@ -14,22 +14,16 @@
>>>>>>>>> 
>>>>>>>>>  #include <asm/ptrace.h>
>>>>>>>>> 
>>>>>>>>> -#ifdef CONFIG_64BIT
>>>>>>>>> -#define DEFAULT_MAP_WINDOW    (UL(1) << (MMAP_VA_BITS - 1))
>>>>>>>>> -#define STACK_TOP_MAX        TASK_SIZE_64
>>>>>>>>> -
>>>>>>>>>  #define arch_get_mmap_end(addr, len, flags)            \
>>>>>>>>>  ({                                \
>>>>>>>>>      unsigned long
>>>>>>>>> mmap_end;                    \
>>>>>>>>>      typeof(addr) _addr = (addr);                \
>>>>>>>>> -    if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) &&
>>>>>>>>> is_compat_task())) \
>>>>>>>>> +    if ((_addr) == 0 ||                    \
>>>>>>>>> +        (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||    \
>>>>>>>>> +        ((_addr + len) > BIT(VA_BITS -
>>>>>>>>> 1)))            \
>>>>>>>>>          mmap_end = STACK_TOP_MAX;            \
>>>>>>>>> -    else if ((_addr) >= VA_USER_SV57) \
>>>>>>>>> -        mmap_end = STACK_TOP_MAX;            \
>>>>>>>>> -    else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >=
>>>>>>>>> VA_BITS_SV48)) \
>>>>>>>>> -        mmap_end = VA_USER_SV48;            \
>>>>>>>>>      else                            \
>>>>>>>>> -        mmap_end = VA_USER_SV39;            \
>>>>>>>>> +        mmap_end = (_addr + len);            \
>>>>>>>>>      mmap_end;                        \
>>>>>>>>>  })
>>>>>>>>> 
>>>>>>>>> @@ -39,17 +33,18 @@
>>>>>>>>>      typeof(addr) _addr = (addr);                \
>>>>>>>>>      typeof(base) _base = (base);                \
>>>>>>>>>      unsigned long rnd_gap = DEFAULT_MAP_WINDOW - (_base);    \
>>>>>>>>> -    if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) &&
>>>>>>>>> is_compat_task())) \
>>>>>>>>> +    if ((_addr) == 0 ||                    \
>>>>>>>>> +        (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||    \
>>>>>>>>> +        ((_addr + len) > BIT(VA_BITS -
>>>>>>>>> 1)))            \
>>>>>>>>>          mmap_base = (_base);                \
>>>>>>>>> -    else if (((_addr) >= VA_USER_SV57) && (VA_BITS >=
>>>>>>>>> VA_BITS_SV57)) \
>>>>>>>>> -        mmap_base = VA_USER_SV57 - rnd_gap; \
>>>>>>>>> -    else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >=
>>>>>>>>> VA_BITS_SV48)) \
>>>>>>>>> -        mmap_base = VA_USER_SV48 - rnd_gap; \
>>>>>>>>>      else                            \
>>>>>>>>> -        mmap_base = VA_USER_SV39 - rnd_gap; \
>>>>>>>>> +        mmap_base = (_addr + len) - rnd_gap; \
>>>>>>>>>      mmap_base;                        \
>>>>>>>>>  })
>>>>>>>>> 
>>>>>>>>> +#ifdef CONFIG_64BIT
>>>>>>>>> +#define DEFAULT_MAP_WINDOW    (UL(1) << (MMAP_VA_BITS - 1))
>>>>>>>>> +#define STACK_TOP_MAX        TASK_SIZE_64
>>>>>>>>>  #else
>>>>>>>>>  #define DEFAULT_MAP_WINDOW    TASK_SIZE
>>>>>>>>>  #define STACK_TOP_MAX        TASK_SIZE
>>>>>>>>>>> I have carefully tested your patch on qemu with sv57. A
>>>>>>> bug that
>>>>>>>> needs
>>>>>>>> to be solved is that mmap with the same hint address without
>>>>>>>> MAP_FIXED
>>>>>>>> set will fail the second time.
>>>>>>>>> Userspace code to reproduce the bug:
>>>>>>>>> #include <sys/mman.h>
>>>>>>>> #include <stdio.h>
>>>>>>>> #include <stdint.h>
>>>>>>>>> void test(char *addr) {
>>>>>>>>     char *res = mmap(addr, 4096, PROT_READ | PROT_WRITE,
>>>>>>>> MAP_ANONYMOUS
>>>>>>>>> MAP_PRIVATE, -1, 0);
>>>>>>>>     printf("hint %p got %p.\n", addr, res);
>>>>>>>> }
>>>>>>>>> int main (void) {
>>>>>>>>     test(1<<30);
>>>>>>>>     test(1<<30);
>>>>>>>>     test(1<<30);
>>>>>>>>     return 0;
>>>>>>>> }
>>>>>>>>> output:
>>>>>>>>> hint 0x40000000 got 0x40000000.
>>>>>>>> hint 0x40000000 got 0xffffffffffffffff.
>>>>>>>> hint 0x40000000 got 0xffffffffffffffff.
>>>>>>>>> output on x86:
>>>>>>>>> hint 0x40000000 got 0x40000000.
>>>>>>>> hint 0x40000000 got 0x7f9171363000.
>>>>>>>> hint 0x40000000 got 0x7f9171362000.
>>>>>>>>> It may need to implement a special arch_get_unmapped_area and
>>>>>>>> arch_get_unmapped_area_topdown function.
>>>>>>>> 
>>>>>>> This is because hint address < rnd_gap. I have tried to let mmap_base =
>>>>>>> min((_addr + len), (base) + TASK_SIZE - DEFAULT_MAP_WINDOW). However it
>>>>>>> does not work for bottom-up while ulimit -s is unlimited. You said this
>>>>>>> behavior is expected from patch v2 review. However it brings a new
>>>>>>> regression even on sv39 systems.
>>>>>>> 
>>>>>>> I still don't know the reason why use addr+len as the upper-bound. I
>>>>>>> think solution like x86/arm64/powerpc provide two address space switch
>>>>>>> based on whether hint address above the default map window is enough.
>>>>>>> 
>>>>>> Yep this is expected. It is up to the maintainers to decide.
>>>>> Sorry I forgot to reply to this, I had a buffer sitting around somewhere
>>>>> but I must have lost it.
>>>>> 
>>>>> I think Charlie's approach is the right way to go.  Putting my userspace
>>>>> hat on, I'd much rather have my allocations fail rather than silently
>>>>> ignore the hint when there's memory pressure.
>>>>> 
>>>>> If there's some real use case that needs these low hints to be silently
>>>>> ignored under VA pressure then we can try and figure something out that
>>>>> makes those applications work.
>>>> I could confirm that this patch has broken chromium's partition allocator on
>>>> riscv64. The minimal reproduction I use is chromium-mmap.c:
>>>> 
>>>> #include <stdio.h>
>>>> #include <sys/mman.h>
>>>> 
>>>> int main() {
>>>>     void* expected = (void*)0x400000000;
>>>>     void* addr = mmap(expected, 17179869184, PROT_NONE,
>>>> MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
>>>>     if (addr != expected) {
>>> It is not valid to assume that the address returned by mmap will be the
>>> hint address. If the hint address is not available, mmap will return a
>>> different address.
>> 
>> Oh, sorry I didn't make it clear what is the expected behavior.
>> The printf here is solely for debugging purpose and I don't mean that
>> chromium expect it will get the hint address. The expected behavior is
>> that both the two mmap calls will succeed.
>> 
>>>>         printf("Not expected address: %p != %p\n", addr, expected);
>>>>     }
>>>>     expected = (void*)0x3fffff000;
>>>>     addr = mmap(expected, 17179873280, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS,
>>>> -1, 0);
>>>>     if (addr != expected) {
>>>>         printf("Not expected address: %p != %p\n", addr, expected);
>>>>     }
>>>>     return 0;
>>>> }
>>>> 
>>>> The second mmap fails with ENOMEM. Manually reverting this commit fixes the
>>>> issue for me. So I think it's clearly a regression and breaks userspace.
>>>> 
>>> The issue here is that overlapping memory is being requested. This
>>> second mmap will never be able to provide an address at 0x3fffff000 with
>>> a size of 0x400001000 since mmap just provided an address at 0x400000000
>>> with a size of 0x400000000.
>>> 
>>> Before this patch, this request causes mmap to return a completely
>>> arbitrary value. There is no reason to use a hint address in this manner
>>> because the hint can never be respected. Since an arbitrary address is
>>> desired, a hint of zero should be used.
>>> 
>>> This patch causes the behavior to be more deterministic. Instead of
>>> providing an arbitrary address, it causes the address to be less than or
>>> equal to the hint address. This allows for applications to make
>>> assumptions about the returned address.
>> 
>> About the overlap, of course the partition allocator's request for
>> overlapped vma seems unreasonable.
>> 
>> But I still don't quite understand why mmap cannot use an address higher
>> than the hint address.
>> The hint address, after all, is a hint, not a requirement.
>> 
>> Quoting the man page:
>> 
>>>   If another mapping already exists there, the kernel picks
>>>        a new address that may or may not depend on the hint.  The
>>>        address of the new mapping is returned as the result of the call.
>> 
>> So for casual programmers that only reads man page but not architecture
>> specific kernel documentation, the current behavior of mmap on riscv64
>> failing on overlapped address ranges are quite surprising IMO.
>> 
>> And quoting the man page again about the errno:
>> 
>>>       ENOMEM No memory is available.
>>> 
>>>       ENOMEM The process's maximum number of mappings would have been
>>>              exceeded.  This error can also occur for munmap(), when
>>>              unmapping a region in the middle of an existing mapping,
>>>              since this results in two smaller mappings on either side
>>>              of the region being unmapped.
>>> 
>>>       ENOMEM (since Linux 4.7) The process's RLIMIT_DATA limit,
>>>              described in getrlimit(2), would have been exceeded.
>>> 
>>>       ENOMEM We don't like addr, because it exceeds the virtual address
>>>              space of the CPU.
>>> 
>> 
>> There's no matching description for the ENOMEM returned here.
>> I would suggest removing "because it exceeds the virtual address
>> space of the CPU." from the last item if the ENOMEM behavior here
>> is expected.
>> 
>>> This code is unfortunately relying on the previously mostly undefined
>>> behavior of the hint address in mmap.
>> 
>> Although I haven't read the code of chromium's partition allocator to
>> judge whether it should
>> be improved or fixed for riscv64, I do know that the kernel "don't break
>> userspace" and "never EVER blame the user programs".
> 
> Ya, sorry for breaking stuff.
> 
> The goal here was to move to the mmap flag behavor similar to what arm64 and x86 have, as that was done in a way that didn't appear to break userspace -- or at least any real userspace programs.  IIRC that first test was pretty broken (it actually depended on the hint address), but sounds like that's not the case.
> 
> I think maybe this is just luck: we didn't chunk the address space up, we're just hinting on every bit, so we're just more likely to hit the exhaustion.  Doesn't really matter, though, as if it's breaking stuff so we've got to deal with it.
> 
> Charlie and I are just talking, and best we can come up with is to move to the behavior where we fall back to larger allocation regions when there's no space in the smaller allocation region.  


For this solution, the only difference from the mmap behavior of
x86 and aarch64 is that we will first try to allocate some memory
from an address less or equal to the request address + size. But
for most cases, I think there is no need to do that, especially for
those addresses < BIT(47), as most program works fine on x86-64,
which has 47bit available userspace address space to use. And for
that program that wants an address < BIT(32), we already have
MAP_32BIT now.

I think we can just fix like that patch:
https://lore.kernel.org/lkml/tencent_B2D0435BC011135736262764B511994F4805@qq.com/

> Charlie's going to try and throw together a patch for that, hopefully it'll sort things out.
> 
>>> The goal of this patch is to help
>>> developers have more consistent mmap behavior, but maybe it is necessary
>>> to hide this behavior behind an mmap flag.
>> 
>> Thank you for helping to shape a more consistent mmap behavior.
>> I think this should be fixed ASAP either by allowing the hint address to
>> be ignored
>> (as suggested by the Linux man page), or hide this behavior behind an
>> mmap flag as you said.
>> 
>>> - Charlie
>>> 
>>>> See alsohttps://github.com/riscv-forks/electron/issues/4
>>>> 
>>>>>> - Charlie
>>>> Sincerely,
>>>> Levi
>>>> 
>> 
>> I accidentally introduced some HTML into this reply so this reply is
>> resent as plain text.
>> 
>> Sincerely,
>> Levi
Charlie Jenkins Aug. 23, 2024, 4:39 a.m. UTC | #12
On Thu, Aug 22, 2024 at 10:51:54AM +0800, Yangyu Chen wrote:
> 
> 
> > On Aug 22, 2024, at 06:17, Palmer Dabbelt <palmer@dabbelt.com> wrote:
> > 
> > On Mon, 19 Aug 2024 18:58:18 PDT (-0700), rsworktech@outlook.com wrote:
> >> On 2024-08-20 01:00, Charlie Jenkins wrote:
> >>> On Mon, Aug 19, 2024 at 01:55:57PM +0800, Levi Zim wrote:
> >>>> On 2024-03-22 22:06, Palmer Dabbelt wrote:
> >>>>> On Thu, 01 Feb 2024 18:28:06 PST (-0800), Charlie Jenkins wrote:
> >>>>>> On Wed, Jan 31, 2024 at 11:59:43PM +0800, Yangyu Chen wrote:
> >>>>>>> On Wed, 2024-01-31 at 22:41 +0800, Yangyu Chen wrote:
> >>>>>>>> On Tue, 2024-01-30 at 17:07 -0800, Charlie Jenkins wrote:
> >>>>>>>>> On riscv it is guaranteed that the address returned by mmap is less
> >>>>>>>>> than
> >>>>>>>>> the hint address. Allow mmap to return an address all the way up to
> >>>>>>>>> addr, if provided, rather than just up to the lower address space.
> >>>>>>>>>>> This provides a performance benefit as well, allowing
> >>>>>>> mmap to exit
> >>>>>>>>> after
> >>>>>>>>> checking that the address is in range rather than searching for a
> >>>>>>>>> valid
> >>>>>>>>> address.
> >>>>>>>>>>> It is possible to provide an address that uses at most the same
> >>>>>>>>> number
> >>>>>>>>> of bits, however it is significantly more computationally expensive
> >>>>>>>>> to
> >>>>>>>>> provide that number rather than setting the max to be the hint
> >>>>>>>>> address.
> >>>>>>>>> There is the instruction clz/clzw in Zbb that returns the highest
> >>>>>>>>> set
> >>>>>>>>> bit
> >>>>>>>>> which could be used to performantly implement this, but it would
> >>>>>>>>> still
> >>>>>>>>> be slower than the current implementation. At worst case, half of
> >>>>>>>>> the
> >>>>>>>>> address would not be able to be allocated when a hint address is
> >>>>>>>>> provided.
> >>>>>>>>>>> Signed-off-by: Charlie Jenkins<charlie@rivosinc.com>
> >>>>>>>>> ---
> >>>>>>>>>  arch/riscv/include/asm/processor.h | 27 +++++++++++---------------
> >>>>>>>>> -
> >>>>>>>>>  1 file changed, 11 insertions(+), 16 deletions(-)
> >>>>>>>>>>> diff --git a/arch/riscv/include/asm/processor.h
> >>>>>>>>> b/arch/riscv/include/asm/processor.h
> >>>>>>>>> index f19f861cda54..8ece7a8f0e18 100644
> >>>>>>>>> --- a/arch/riscv/include/asm/processor.h
> >>>>>>>>> +++ b/arch/riscv/include/asm/processor.h
> >>>>>>>>> @@ -14,22 +14,16 @@
> >>>>>>>>> 
> >>>>>>>>>  #include <asm/ptrace.h>
> >>>>>>>>> 
> >>>>>>>>> -#ifdef CONFIG_64BIT
> >>>>>>>>> -#define DEFAULT_MAP_WINDOW    (UL(1) << (MMAP_VA_BITS - 1))
> >>>>>>>>> -#define STACK_TOP_MAX        TASK_SIZE_64
> >>>>>>>>> -
> >>>>>>>>>  #define arch_get_mmap_end(addr, len, flags)            \
> >>>>>>>>>  ({                                \
> >>>>>>>>>      unsigned long
> >>>>>>>>> mmap_end;                    \
> >>>>>>>>>      typeof(addr) _addr = (addr);                \
> >>>>>>>>> -    if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) &&
> >>>>>>>>> is_compat_task())) \
> >>>>>>>>> +    if ((_addr) == 0 ||                    \
> >>>>>>>>> +        (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||    \
> >>>>>>>>> +        ((_addr + len) > BIT(VA_BITS -
> >>>>>>>>> 1)))            \
> >>>>>>>>>          mmap_end = STACK_TOP_MAX;            \
> >>>>>>>>> -    else if ((_addr) >= VA_USER_SV57) \
> >>>>>>>>> -        mmap_end = STACK_TOP_MAX;            \
> >>>>>>>>> -    else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >=
> >>>>>>>>> VA_BITS_SV48)) \
> >>>>>>>>> -        mmap_end = VA_USER_SV48;            \
> >>>>>>>>>      else                            \
> >>>>>>>>> -        mmap_end = VA_USER_SV39;            \
> >>>>>>>>> +        mmap_end = (_addr + len);            \
> >>>>>>>>>      mmap_end;                        \
> >>>>>>>>>  })
> >>>>>>>>> 
> >>>>>>>>> @@ -39,17 +33,18 @@
> >>>>>>>>>      typeof(addr) _addr = (addr);                \
> >>>>>>>>>      typeof(base) _base = (base);                \
> >>>>>>>>>      unsigned long rnd_gap = DEFAULT_MAP_WINDOW - (_base);    \
> >>>>>>>>> -    if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) &&
> >>>>>>>>> is_compat_task())) \
> >>>>>>>>> +    if ((_addr) == 0 ||                    \
> >>>>>>>>> +        (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||    \
> >>>>>>>>> +        ((_addr + len) > BIT(VA_BITS -
> >>>>>>>>> 1)))            \
> >>>>>>>>>          mmap_base = (_base);                \
> >>>>>>>>> -    else if (((_addr) >= VA_USER_SV57) && (VA_BITS >=
> >>>>>>>>> VA_BITS_SV57)) \
> >>>>>>>>> -        mmap_base = VA_USER_SV57 - rnd_gap; \
> >>>>>>>>> -    else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >=
> >>>>>>>>> VA_BITS_SV48)) \
> >>>>>>>>> -        mmap_base = VA_USER_SV48 - rnd_gap; \
> >>>>>>>>>      else                            \
> >>>>>>>>> -        mmap_base = VA_USER_SV39 - rnd_gap; \
> >>>>>>>>> +        mmap_base = (_addr + len) - rnd_gap; \
> >>>>>>>>>      mmap_base;                        \
> >>>>>>>>>  })
> >>>>>>>>> 
> >>>>>>>>> +#ifdef CONFIG_64BIT
> >>>>>>>>> +#define DEFAULT_MAP_WINDOW    (UL(1) << (MMAP_VA_BITS - 1))
> >>>>>>>>> +#define STACK_TOP_MAX        TASK_SIZE_64
> >>>>>>>>>  #else
> >>>>>>>>>  #define DEFAULT_MAP_WINDOW    TASK_SIZE
> >>>>>>>>>  #define STACK_TOP_MAX        TASK_SIZE
> >>>>>>>>>>> I have carefully tested your patch on qemu with sv57. A
> >>>>>>> bug that
> >>>>>>>> needs
> >>>>>>>> to be solved is that mmap with the same hint address without
> >>>>>>>> MAP_FIXED
> >>>>>>>> set will fail the second time.
> >>>>>>>>> Userspace code to reproduce the bug:
> >>>>>>>>> #include <sys/mman.h>
> >>>>>>>> #include <stdio.h>
> >>>>>>>> #include <stdint.h>
> >>>>>>>>> void test(char *addr) {
> >>>>>>>>     char *res = mmap(addr, 4096, PROT_READ | PROT_WRITE,
> >>>>>>>> MAP_ANONYMOUS
> >>>>>>>>> MAP_PRIVATE, -1, 0);
> >>>>>>>>     printf("hint %p got %p.\n", addr, res);
> >>>>>>>> }
> >>>>>>>>> int main (void) {
> >>>>>>>>     test(1<<30);
> >>>>>>>>     test(1<<30);
> >>>>>>>>     test(1<<30);
> >>>>>>>>     return 0;
> >>>>>>>> }
> >>>>>>>>> output:
> >>>>>>>>> hint 0x40000000 got 0x40000000.
> >>>>>>>> hint 0x40000000 got 0xffffffffffffffff.
> >>>>>>>> hint 0x40000000 got 0xffffffffffffffff.
> >>>>>>>>> output on x86:
> >>>>>>>>> hint 0x40000000 got 0x40000000.
> >>>>>>>> hint 0x40000000 got 0x7f9171363000.
> >>>>>>>> hint 0x40000000 got 0x7f9171362000.
> >>>>>>>>> It may need to implement a special arch_get_unmapped_area and
> >>>>>>>> arch_get_unmapped_area_topdown function.
> >>>>>>>> 
> >>>>>>> This is because hint address < rnd_gap. I have tried to let mmap_base =
> >>>>>>> min((_addr + len), (base) + TASK_SIZE - DEFAULT_MAP_WINDOW). However it
> >>>>>>> does not work for bottom-up while ulimit -s is unlimited. You said this
> >>>>>>> behavior is expected from patch v2 review. However it brings a new
> >>>>>>> regression even on sv39 systems.
> >>>>>>> 
> >>>>>>> I still don't know the reason why use addr+len as the upper-bound. I
> >>>>>>> think solution like x86/arm64/powerpc provide two address space switch
> >>>>>>> based on whether hint address above the default map window is enough.
> >>>>>>> 
> >>>>>> Yep this is expected. It is up to the maintainers to decide.
> >>>>> Sorry I forgot to reply to this, I had a buffer sitting around somewhere
> >>>>> but I must have lost it.
> >>>>> 
> >>>>> I think Charlie's approach is the right way to go.  Putting my userspace
> >>>>> hat on, I'd much rather have my allocations fail rather than silently
> >>>>> ignore the hint when there's memory pressure.
> >>>>> 
> >>>>> If there's some real use case that needs these low hints to be silently
> >>>>> ignored under VA pressure then we can try and figure something out that
> >>>>> makes those applications work.
> >>>> I could confirm that this patch has broken chromium's partition allocator on
> >>>> riscv64. The minimal reproduction I use is chromium-mmap.c:
> >>>> 
> >>>> #include <stdio.h>
> >>>> #include <sys/mman.h>
> >>>> 
> >>>> int main() {
> >>>>     void* expected = (void*)0x400000000;
> >>>>     void* addr = mmap(expected, 17179869184, PROT_NONE,
> >>>> MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
> >>>>     if (addr != expected) {
> >>> It is not valid to assume that the address returned by mmap will be the
> >>> hint address. If the hint address is not available, mmap will return a
> >>> different address.
> >> 
> >> Oh, sorry I didn't make it clear what is the expected behavior.
> >> The printf here is solely for debugging purpose and I don't mean that
> >> chromium expect it will get the hint address. The expected behavior is
> >> that both the two mmap calls will succeed.
> >> 
> >>>>         printf("Not expected address: %p != %p\n", addr, expected);
> >>>>     }
> >>>>     expected = (void*)0x3fffff000;
> >>>>     addr = mmap(expected, 17179873280, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS,
> >>>> -1, 0);
> >>>>     if (addr != expected) {
> >>>>         printf("Not expected address: %p != %p\n", addr, expected);
> >>>>     }
> >>>>     return 0;
> >>>> }
> >>>> 
> >>>> The second mmap fails with ENOMEM. Manually reverting this commit fixes the
> >>>> issue for me. So I think it's clearly a regression and breaks userspace.
> >>>> 
> >>> The issue here is that overlapping memory is being requested. This
> >>> second mmap will never be able to provide an address at 0x3fffff000 with
> >>> a size of 0x400001000 since mmap just provided an address at 0x400000000
> >>> with a size of 0x400000000.
> >>> 
> >>> Before this patch, this request causes mmap to return a completely
> >>> arbitrary value. There is no reason to use a hint address in this manner
> >>> because the hint can never be respected. Since an arbitrary address is
> >>> desired, a hint of zero should be used.
> >>> 
> >>> This patch causes the behavior to be more deterministic. Instead of
> >>> providing an arbitrary address, it causes the address to be less than or
> >>> equal to the hint address. This allows for applications to make
> >>> assumptions about the returned address.
> >> 
> >> About the overlap, of course the partition allocator's request for
> >> overlapped vma seems unreasonable.
> >> 
> >> But I still don't quite understand why mmap cannot use an address higher
> >> than the hint address.
> >> The hint address, after all, is a hint, not a requirement.
> >> 
> >> Quoting the man page:
> >> 
> >>>   If another mapping already exists there, the kernel picks
> >>>        a new address that may or may not depend on the hint.  The
> >>>        address of the new mapping is returned as the result of the call.
> >> 
> >> So for casual programmers that only reads man page but not architecture
> >> specific kernel documentation, the current behavior of mmap on riscv64
> >> failing on overlapped address ranges are quite surprising IMO.
> >> 
> >> And quoting the man page again about the errno:
> >> 
> >>>       ENOMEM No memory is available.
> >>> 
> >>>       ENOMEM The process's maximum number of mappings would have been
> >>>              exceeded.  This error can also occur for munmap(), when
> >>>              unmapping a region in the middle of an existing mapping,
> >>>              since this results in two smaller mappings on either side
> >>>              of the region being unmapped.
> >>> 
> >>>       ENOMEM (since Linux 4.7) The process's RLIMIT_DATA limit,
> >>>              described in getrlimit(2), would have been exceeded.
> >>> 
> >>>       ENOMEM We don't like addr, because it exceeds the virtual address
> >>>              space of the CPU.
> >>> 
> >> 
> >> There's no matching description for the ENOMEM returned here.
> >> I would suggest removing "because it exceeds the virtual address
> >> space of the CPU." from the last item if the ENOMEM behavior here
> >> is expected.
> >> 
> >>> This code is unfortunately relying on the previously mostly undefined
> >>> behavior of the hint address in mmap.
> >> 
> >> Although I haven't read the code of chromium's partition allocator to
> >> judge whether it should
> >> be improved or fixed for riscv64, I do know that the kernel "don't break
> >> userspace" and "never EVER blame the user programs".
> > 
> > Ya, sorry for breaking stuff.
> > 
> > The goal here was to move to the mmap flag behavor similar to what arm64 and x86 have, as that was done in a way that didn't appear to break userspace -- or at least any real userspace programs.  IIRC that first test was pretty broken (it actually depended on the hint address), but sounds like that's not the case.
> > 
> > I think maybe this is just luck: we didn't chunk the address space up, we're just hinting on every bit, so we're just more likely to hit the exhaustion.  Doesn't really matter, though, as if it's breaking stuff so we've got to deal with it.
> > 
> > Charlie and I are just talking, and best we can come up with is to move to the behavior where we fall back to larger allocation regions when there's no space in the smaller allocation region.  
> 
> 
> For this solution, the only difference from the mmap behavior of
> x86 and aarch64 is that we will first try to allocate some memory
> from an address less or equal to the request address + size. But
> for most cases, I think there is no need to do that, especially for
> those addresses < BIT(47), as most program works fine on x86-64,
> which has 47bit available userspace address space to use. And for
> that program that wants an address < BIT(32), we already have
> MAP_32BIT now.
> 
> I think we can just fix like that patch:
> https://lore.kernel.org/lkml/tencent_B2D0435BC011135736262764B511994F4805@qq.com/

This patch does not satisfy the requirement of having the ability to guarantee
that mmap returns an address that is less than the hint address. This
patch only allows an address to be less than the DEFAULT_MAP_WINDOW
which is 32 bits on sv32, 39 bits on sv39, and 48 bits on sv48 or sv57.

This patch also again falls into the trap of using the hint address to
forcefully restrict the address space. I agree with Levi that it is not
very good behavior to have a "hint" cause mmap to fail if conforming to
the hint isn't possible. Instead, I believe it to be more logical to try
to allocate at the hint address, otherwise give a random address.

The current behavior can then be maintained through the flag
MAP_BELOW_HINT. This way the user explicitly selects that they want mmap
to fail if an address could not be found within the hint address
constraints.

- Charlie

> 
> > Charlie's going to try and throw together a patch for that, hopefully it'll sort things out.
> > 
> >>> The goal of this patch is to help
> >>> developers have more consistent mmap behavior, but maybe it is necessary
> >>> to hide this behavior behind an mmap flag.
> >> 
> >> Thank you for helping to shape a more consistent mmap behavior.
> >> I think this should be fixed ASAP either by allowing the hint address to
> >> be ignored
> >> (as suggested by the Linux man page), or hide this behavior behind an
> >> mmap flag as you said.
> >> 
> >>> - Charlie
> >>> 
> >>>> See alsohttps://github.com/riscv-forks/electron/issues/4
> >>>> 
> >>>>>> - Charlie
> >>>> Sincerely,
> >>>> Levi
> >>>> 
> >> 
> >> I accidentally introduced some HTML into this reply so this reply is
> >> resent as plain text.
> >> 
> >> Sincerely,
> >> Levi
> 
>
Yangyu Chen Aug. 23, 2024, 5:28 a.m. UTC | #13
> On Aug 23, 2024, at 12:39, Charlie Jenkins <charlie@rivosinc.com> wrote:
> 
> On Thu, Aug 22, 2024 at 10:51:54AM +0800, Yangyu Chen wrote:
>> 
>> 
>>> On Aug 22, 2024, at 06:17, Palmer Dabbelt <palmer@dabbelt.com> wrote:
>>> 
>>> On Mon, 19 Aug 2024 18:58:18 PDT (-0700), rsworktech@outlook.com wrote:
>>>> On 2024-08-20 01:00, Charlie Jenkins wrote:
>>>>> On Mon, Aug 19, 2024 at 01:55:57PM +0800, Levi Zim wrote:
>>>>>> On 2024-03-22 22:06, Palmer Dabbelt wrote:
>>>>>>> On Thu, 01 Feb 2024 18:28:06 PST (-0800), Charlie Jenkins wrote:
>>>>>>>> On Wed, Jan 31, 2024 at 11:59:43PM +0800, Yangyu Chen wrote:
>>>>>>>>> On Wed, 2024-01-31 at 22:41 +0800, Yangyu Chen wrote:
>>>>>>>>>> On Tue, 2024-01-30 at 17:07 -0800, Charlie Jenkins wrote:
>>>>>>>>>>> On riscv it is guaranteed that the address returned by mmap is less
>>>>>>>>>>> than
>>>>>>>>>>> the hint address. Allow mmap to return an address all the way up to
>>>>>>>>>>> addr, if provided, rather than just up to the lower address space.
>>>>>>>>>>>>> This provides a performance benefit as well, allowing
>>>>>>>>> mmap to exit
>>>>>>>>>>> after
>>>>>>>>>>> checking that the address is in range rather than searching for a
>>>>>>>>>>> valid
>>>>>>>>>>> address.
>>>>>>>>>>>>> It is possible to provide an address that uses at most the same
>>>>>>>>>>> number
>>>>>>>>>>> of bits, however it is significantly more computationally expensive
>>>>>>>>>>> to
>>>>>>>>>>> provide that number rather than setting the max to be the hint
>>>>>>>>>>> address.
>>>>>>>>>>> There is the instruction clz/clzw in Zbb that returns the highest
>>>>>>>>>>> set
>>>>>>>>>>> bit
>>>>>>>>>>> which could be used to performantly implement this, but it would
>>>>>>>>>>> still
>>>>>>>>>>> be slower than the current implementation. At worst case, half of
>>>>>>>>>>> the
>>>>>>>>>>> address would not be able to be allocated when a hint address is
>>>>>>>>>>> provided.
>>>>>>>>>>>>> Signed-off-by: Charlie Jenkins<charlie@rivosinc.com>
>>>>>>>>>>> ---
>>>>>>>>>>> arch/riscv/include/asm/processor.h | 27 +++++++++++---------------
>>>>>>>>>>> -
>>>>>>>>>>> 1 file changed, 11 insertions(+), 16 deletions(-)
>>>>>>>>>>>>> diff --git a/arch/riscv/include/asm/processor.h
>>>>>>>>>>> b/arch/riscv/include/asm/processor.h
>>>>>>>>>>> index f19f861cda54..8ece7a8f0e18 100644
>>>>>>>>>>> --- a/arch/riscv/include/asm/processor.h
>>>>>>>>>>> +++ b/arch/riscv/include/asm/processor.h
>>>>>>>>>>> @@ -14,22 +14,16 @@
>>>>>>>>>>> 
>>>>>>>>>>> #include <asm/ptrace.h>
>>>>>>>>>>> 
>>>>>>>>>>> -#ifdef CONFIG_64BIT
>>>>>>>>>>> -#define DEFAULT_MAP_WINDOW    (UL(1) << (MMAP_VA_BITS - 1))
>>>>>>>>>>> -#define STACK_TOP_MAX        TASK_SIZE_64
>>>>>>>>>>> -
>>>>>>>>>>> #define arch_get_mmap_end(addr, len, flags)            \
>>>>>>>>>>> ({                                \
>>>>>>>>>>>     unsigned long
>>>>>>>>>>> mmap_end;                    \
>>>>>>>>>>>     typeof(addr) _addr = (addr);                \
>>>>>>>>>>> -    if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) &&
>>>>>>>>>>> is_compat_task())) \
>>>>>>>>>>> +    if ((_addr) == 0 ||                    \
>>>>>>>>>>> +        (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||    \
>>>>>>>>>>> +        ((_addr + len) > BIT(VA_BITS -
>>>>>>>>>>> 1)))            \
>>>>>>>>>>>         mmap_end = STACK_TOP_MAX;            \
>>>>>>>>>>> -    else if ((_addr) >= VA_USER_SV57) \
>>>>>>>>>>> -        mmap_end = STACK_TOP_MAX;            \
>>>>>>>>>>> -    else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >=
>>>>>>>>>>> VA_BITS_SV48)) \
>>>>>>>>>>> -        mmap_end = VA_USER_SV48;            \
>>>>>>>>>>>     else                            \
>>>>>>>>>>> -        mmap_end = VA_USER_SV39;            \
>>>>>>>>>>> +        mmap_end = (_addr + len);            \
>>>>>>>>>>>     mmap_end;                        \
>>>>>>>>>>> })
>>>>>>>>>>> 
>>>>>>>>>>> @@ -39,17 +33,18 @@
>>>>>>>>>>>     typeof(addr) _addr = (addr);                \
>>>>>>>>>>>     typeof(base) _base = (base);                \
>>>>>>>>>>>     unsigned long rnd_gap = DEFAULT_MAP_WINDOW - (_base);    \
>>>>>>>>>>> -    if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) &&
>>>>>>>>>>> is_compat_task())) \
>>>>>>>>>>> +    if ((_addr) == 0 ||                    \
>>>>>>>>>>> +        (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||    \
>>>>>>>>>>> +        ((_addr + len) > BIT(VA_BITS -
>>>>>>>>>>> 1)))            \
>>>>>>>>>>>         mmap_base = (_base);                \
>>>>>>>>>>> -    else if (((_addr) >= VA_USER_SV57) && (VA_BITS >=
>>>>>>>>>>> VA_BITS_SV57)) \
>>>>>>>>>>> -        mmap_base = VA_USER_SV57 - rnd_gap; \
>>>>>>>>>>> -    else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >=
>>>>>>>>>>> VA_BITS_SV48)) \
>>>>>>>>>>> -        mmap_base = VA_USER_SV48 - rnd_gap; \
>>>>>>>>>>>     else                            \
>>>>>>>>>>> -        mmap_base = VA_USER_SV39 - rnd_gap; \
>>>>>>>>>>> +        mmap_base = (_addr + len) - rnd_gap; \
>>>>>>>>>>>     mmap_base;                        \
>>>>>>>>>>> })
>>>>>>>>>>> 
>>>>>>>>>>> +#ifdef CONFIG_64BIT
>>>>>>>>>>> +#define DEFAULT_MAP_WINDOW    (UL(1) << (MMAP_VA_BITS - 1))
>>>>>>>>>>> +#define STACK_TOP_MAX        TASK_SIZE_64
>>>>>>>>>>> #else
>>>>>>>>>>> #define DEFAULT_MAP_WINDOW    TASK_SIZE
>>>>>>>>>>> #define STACK_TOP_MAX        TASK_SIZE
>>>>>>>>>>>>> I have carefully tested your patch on qemu with sv57. A
>>>>>>>>> bug that
>>>>>>>>>> needs
>>>>>>>>>> to be solved is that mmap with the same hint address without
>>>>>>>>>> MAP_FIXED
>>>>>>>>>> set will fail the second time.
>>>>>>>>>>> Userspace code to reproduce the bug:
>>>>>>>>>>> #include <sys/mman.h>
>>>>>>>>>> #include <stdio.h>
>>>>>>>>>> #include <stdint.h>
>>>>>>>>>>> void test(char *addr) {
>>>>>>>>>>    char *res = mmap(addr, 4096, PROT_READ | PROT_WRITE,
>>>>>>>>>> MAP_ANONYMOUS
>>>>>>>>>>> MAP_PRIVATE, -1, 0);
>>>>>>>>>>    printf("hint %p got %p.\n", addr, res);
>>>>>>>>>> }
>>>>>>>>>>> int main (void) {
>>>>>>>>>>    test(1<<30);
>>>>>>>>>>    test(1<<30);
>>>>>>>>>>    test(1<<30);
>>>>>>>>>>    return 0;
>>>>>>>>>> }
>>>>>>>>>>> output:
>>>>>>>>>>> hint 0x40000000 got 0x40000000.
>>>>>>>>>> hint 0x40000000 got 0xffffffffffffffff.
>>>>>>>>>> hint 0x40000000 got 0xffffffffffffffff.
>>>>>>>>>>> output on x86:
>>>>>>>>>>> hint 0x40000000 got 0x40000000.
>>>>>>>>>> hint 0x40000000 got 0x7f9171363000.
>>>>>>>>>> hint 0x40000000 got 0x7f9171362000.
>>>>>>>>>>> It may need to implement a special arch_get_unmapped_area and
>>>>>>>>>> arch_get_unmapped_area_topdown function.
>>>>>>>>>> 
>>>>>>>>> This is because hint address < rnd_gap. I have tried to let mmap_base =
>>>>>>>>> min((_addr + len), (base) + TASK_SIZE - DEFAULT_MAP_WINDOW). However it
>>>>>>>>> does not work for bottom-up while ulimit -s is unlimited. You said this
>>>>>>>>> behavior is expected from patch v2 review. However it brings a new
>>>>>>>>> regression even on sv39 systems.
>>>>>>>>> 
>>>>>>>>> I still don't know the reason why use addr+len as the upper-bound. I
>>>>>>>>> think solution like x86/arm64/powerpc provide two address space switch
>>>>>>>>> based on whether hint address above the default map window is enough.
>>>>>>>>> 
>>>>>>>> Yep this is expected. It is up to the maintainers to decide.
>>>>>>> Sorry I forgot to reply to this, I had a buffer sitting around somewhere
>>>>>>> but I must have lost it.
>>>>>>> 
>>>>>>> I think Charlie's approach is the right way to go.  Putting my userspace
>>>>>>> hat on, I'd much rather have my allocations fail rather than silently
>>>>>>> ignore the hint when there's memory pressure.
>>>>>>> 
>>>>>>> If there's some real use case that needs these low hints to be silently
>>>>>>> ignored under VA pressure then we can try and figure something out that
>>>>>>> makes those applications work.
>>>>>> I could confirm that this patch has broken chromium's partition allocator on
>>>>>> riscv64. The minimal reproduction I use is chromium-mmap.c:
>>>>>> 
>>>>>> #include <stdio.h>
>>>>>> #include <sys/mman.h>
>>>>>> 
>>>>>> int main() {
>>>>>>    void* expected = (void*)0x400000000;
>>>>>>    void* addr = mmap(expected, 17179869184, PROT_NONE,
>>>>>> MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
>>>>>>    if (addr != expected) {
>>>>> It is not valid to assume that the address returned by mmap will be the
>>>>> hint address. If the hint address is not available, mmap will return a
>>>>> different address.
>>>> 
>>>> Oh, sorry I didn't make it clear what is the expected behavior.
>>>> The printf here is solely for debugging purpose and I don't mean that
>>>> chromium expect it will get the hint address. The expected behavior is
>>>> that both the two mmap calls will succeed.
>>>> 
>>>>>>        printf("Not expected address: %p != %p\n", addr, expected);
>>>>>>    }
>>>>>>    expected = (void*)0x3fffff000;
>>>>>>    addr = mmap(expected, 17179873280, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS,
>>>>>> -1, 0);
>>>>>>    if (addr != expected) {
>>>>>>        printf("Not expected address: %p != %p\n", addr, expected);
>>>>>>    }
>>>>>>    return 0;
>>>>>> }
>>>>>> 
>>>>>> The second mmap fails with ENOMEM. Manually reverting this commit fixes the
>>>>>> issue for me. So I think it's clearly a regression and breaks userspace.
>>>>>> 
>>>>> The issue here is that overlapping memory is being requested. This
>>>>> second mmap will never be able to provide an address at 0x3fffff000 with
>>>>> a size of 0x400001000 since mmap just provided an address at 0x400000000
>>>>> with a size of 0x400000000.
>>>>> 
>>>>> Before this patch, this request causes mmap to return a completely
>>>>> arbitrary value. There is no reason to use a hint address in this manner
>>>>> because the hint can never be respected. Since an arbitrary address is
>>>>> desired, a hint of zero should be used.
>>>>> 
>>>>> This patch causes the behavior to be more deterministic. Instead of
>>>>> providing an arbitrary address, it causes the address to be less than or
>>>>> equal to the hint address. This allows for applications to make
>>>>> assumptions about the returned address.
>>>> 
>>>> About the overlap, of course the partition allocator's request for
>>>> overlapped vma seems unreasonable.
>>>> 
>>>> But I still don't quite understand why mmap cannot use an address higher
>>>> than the hint address.
>>>> The hint address, after all, is a hint, not a requirement.
>>>> 
>>>> Quoting the man page:
>>>> 
>>>>>  If another mapping already exists there, the kernel picks
>>>>>       a new address that may or may not depend on the hint.  The
>>>>>       address of the new mapping is returned as the result of the call.
>>>> 
>>>> So for casual programmers that only reads man page but not architecture
>>>> specific kernel documentation, the current behavior of mmap on riscv64
>>>> failing on overlapped address ranges are quite surprising IMO.
>>>> 
>>>> And quoting the man page again about the errno:
>>>> 
>>>>>      ENOMEM No memory is available.
>>>>> 
>>>>>      ENOMEM The process's maximum number of mappings would have been
>>>>>             exceeded.  This error can also occur for munmap(), when
>>>>>             unmapping a region in the middle of an existing mapping,
>>>>>             since this results in two smaller mappings on either side
>>>>>             of the region being unmapped.
>>>>> 
>>>>>      ENOMEM (since Linux 4.7) The process's RLIMIT_DATA limit,
>>>>>             described in getrlimit(2), would have been exceeded.
>>>>> 
>>>>>      ENOMEM We don't like addr, because it exceeds the virtual address
>>>>>             space of the CPU.
>>>>> 
>>>> 
>>>> There's no matching description for the ENOMEM returned here.
>>>> I would suggest removing "because it exceeds the virtual address
>>>> space of the CPU." from the last item if the ENOMEM behavior here
>>>> is expected.
>>>> 
>>>>> This code is unfortunately relying on the previously mostly undefined
>>>>> behavior of the hint address in mmap.
>>>> 
>>>> Although I haven't read the code of chromium's partition allocator to
>>>> judge whether it should
>>>> be improved or fixed for riscv64, I do know that the kernel "don't break
>>>> userspace" and "never EVER blame the user programs".
>>> 
>>> Ya, sorry for breaking stuff.
>>> 
>>> The goal here was to move to the mmap flag behavor similar to what arm64 and x86 have, as that was done in a way that didn't appear to break userspace -- or at least any real userspace programs.  IIRC that first test was pretty broken (it actually depended on the hint address), but sounds like that's not the case.
>>> 
>>> I think maybe this is just luck: we didn't chunk the address space up, we're just hinting on every bit, so we're just more likely to hit the exhaustion.  Doesn't really matter, though, as if it's breaking stuff so we've got to deal with it.
>>> 
>>> Charlie and I are just talking, and best we can come up with is to move to the behavior where we fall back to larger allocation regions when there's no space in the smaller allocation region. 
>> 
>> 
>> For this solution, the only difference from the mmap behavior of
>> x86 and aarch64 is that we will first try to allocate some memory
>> from an address less or equal to the request address + size. But
>> for most cases, I think there is no need to do that, especially for
>> those addresses < BIT(47), as most program works fine on x86-64,
>> which has 47bit available userspace address space to use. And for
>> that program that wants an address < BIT(32), we already have
>> MAP_32BIT now.
>> 
>> I think we can just fix like that patch:
>> https://lore.kernel.org/lkml/tencent_B2D0435BC011135736262764B511994F4805@qq.com/
> 
> This patch does not satisfy the requirement of having the ability to guarantee
> that mmap returns an address that is less than the hint address.

Indeed. My intuition is to remove it and align it with x86 and aarch64.

> This
> patch only allows an address to be less than the DEFAULT_MAP_WINDOW
> which is 32 bits on sv32, 39 bits on sv39, and 48 bits on sv48 or sv57.
> 
> This patch also again falls into the trap of using the hint address to
> forcefully restrict the address space.

Indeed. However, x86 and aarch64 also use this behavior to restrict
va >= BIT(47) by default unless we have the hint address larger
than BIT(47).

> I agree with Levi that it is not
> very good behavior to have a "hint" cause mmap to fail if conforming to
> the hint isn't possible. Instead, I believe it to be more logical to try
> to allocate at the hint address, otherwise give a random address.
> 

I also agree with this.

> The current behavior can then be maintained through the flag
> MAP_BELOW_HINT. This way the user explicitly selects that they want mmap
> to fail if an address could not be found within the hint address
> constraints.
> 

I think restricting the addresses with the MAP_BELOW_HINT flag
would be the best choice. However, it remains a problem: What should
the behavior be when there is no MAP_BELOW_HINT? I think we can
fallback to Sv48 on the Sv57 machine by default to align with x86
and aarch64.

> - Charlie
> 
>> 
>>> Charlie's going to try and throw together a patch for that, hopefully it'll sort things out.
>>> 
>>>>> The goal of this patch is to help
>>>>> developers have more consistent mmap behavior, but maybe it is necessary
>>>>> to hide this behavior behind an mmap flag.
>>>> 
>>>> Thank you for helping to shape a more consistent mmap behavior.
>>>> I think this should be fixed ASAP either by allowing the hint address to
>>>> be ignored
>>>> (as suggested by the Linux man page), or hide this behavior behind an
>>>> mmap flag as you said.
>>>> 
>>>>> - Charlie
>>>>> 
>>>>>> See alsohttps://github.com/riscv-forks/electron/issues/4
>>>>>> 
>>>>>>>> - Charlie
>>>>>> Sincerely,
>>>>>> Levi
>>>>>> 
>>>> 
>>>> I accidentally introduced some HTML into this reply so this reply is
>>>> resent as plain text.
>>>> 
>>>> Sincerely,
>>>> Levi
Charlie Jenkins Aug. 23, 2024, 5:57 a.m. UTC | #14
On Fri, Aug 23, 2024 at 01:28:18PM +0800, Yangyu Chen wrote:
> 
> 
> > On Aug 23, 2024, at 12:39, Charlie Jenkins <charlie@rivosinc.com> wrote:
> > 
> > On Thu, Aug 22, 2024 at 10:51:54AM +0800, Yangyu Chen wrote:
> >> 
> >> 
> >>> On Aug 22, 2024, at 06:17, Palmer Dabbelt <palmer@dabbelt.com> wrote:
> >>> 
> >>> On Mon, 19 Aug 2024 18:58:18 PDT (-0700), rsworktech@outlook.com wrote:
> >>>> On 2024-08-20 01:00, Charlie Jenkins wrote:
> >>>>> On Mon, Aug 19, 2024 at 01:55:57PM +0800, Levi Zim wrote:
> >>>>>> On 2024-03-22 22:06, Palmer Dabbelt wrote:
> >>>>>>> On Thu, 01 Feb 2024 18:28:06 PST (-0800), Charlie Jenkins wrote:
> >>>>>>>> On Wed, Jan 31, 2024 at 11:59:43PM +0800, Yangyu Chen wrote:
> >>>>>>>>> On Wed, 2024-01-31 at 22:41 +0800, Yangyu Chen wrote:
> >>>>>>>>>> On Tue, 2024-01-30 at 17:07 -0800, Charlie Jenkins wrote:
> >>>>>>>>>>> On riscv it is guaranteed that the address returned by mmap is less
> >>>>>>>>>>> than
> >>>>>>>>>>> the hint address. Allow mmap to return an address all the way up to
> >>>>>>>>>>> addr, if provided, rather than just up to the lower address space.
> >>>>>>>>>>>>> This provides a performance benefit as well, allowing
> >>>>>>>>> mmap to exit
> >>>>>>>>>>> after
> >>>>>>>>>>> checking that the address is in range rather than searching for a
> >>>>>>>>>>> valid
> >>>>>>>>>>> address.
> >>>>>>>>>>>>> It is possible to provide an address that uses at most the same
> >>>>>>>>>>> number
> >>>>>>>>>>> of bits, however it is significantly more computationally expensive
> >>>>>>>>>>> to
> >>>>>>>>>>> provide that number rather than setting the max to be the hint
> >>>>>>>>>>> address.
> >>>>>>>>>>> There is the instruction clz/clzw in Zbb that returns the highest
> >>>>>>>>>>> set
> >>>>>>>>>>> bit
> >>>>>>>>>>> which could be used to performantly implement this, but it would
> >>>>>>>>>>> still
> >>>>>>>>>>> be slower than the current implementation. At worst case, half of
> >>>>>>>>>>> the
> >>>>>>>>>>> address would not be able to be allocated when a hint address is
> >>>>>>>>>>> provided.
> >>>>>>>>>>>>> Signed-off-by: Charlie Jenkins<charlie@rivosinc.com>
> >>>>>>>>>>> ---
> >>>>>>>>>>> arch/riscv/include/asm/processor.h | 27 +++++++++++---------------
> >>>>>>>>>>> -
> >>>>>>>>>>> 1 file changed, 11 insertions(+), 16 deletions(-)
> >>>>>>>>>>>>> diff --git a/arch/riscv/include/asm/processor.h
> >>>>>>>>>>> b/arch/riscv/include/asm/processor.h
> >>>>>>>>>>> index f19f861cda54..8ece7a8f0e18 100644
> >>>>>>>>>>> --- a/arch/riscv/include/asm/processor.h
> >>>>>>>>>>> +++ b/arch/riscv/include/asm/processor.h
> >>>>>>>>>>> @@ -14,22 +14,16 @@
> >>>>>>>>>>> 
> >>>>>>>>>>> #include <asm/ptrace.h>
> >>>>>>>>>>> 
> >>>>>>>>>>> -#ifdef CONFIG_64BIT
> >>>>>>>>>>> -#define DEFAULT_MAP_WINDOW    (UL(1) << (MMAP_VA_BITS - 1))
> >>>>>>>>>>> -#define STACK_TOP_MAX        TASK_SIZE_64
> >>>>>>>>>>> -
> >>>>>>>>>>> #define arch_get_mmap_end(addr, len, flags)            \
> >>>>>>>>>>> ({                                \
> >>>>>>>>>>>     unsigned long
> >>>>>>>>>>> mmap_end;                    \
> >>>>>>>>>>>     typeof(addr) _addr = (addr);                \
> >>>>>>>>>>> -    if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) &&
> >>>>>>>>>>> is_compat_task())) \
> >>>>>>>>>>> +    if ((_addr) == 0 ||                    \
> >>>>>>>>>>> +        (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||    \
> >>>>>>>>>>> +        ((_addr + len) > BIT(VA_BITS -
> >>>>>>>>>>> 1)))            \
> >>>>>>>>>>>         mmap_end = STACK_TOP_MAX;            \
> >>>>>>>>>>> -    else if ((_addr) >= VA_USER_SV57) \
> >>>>>>>>>>> -        mmap_end = STACK_TOP_MAX;            \
> >>>>>>>>>>> -    else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >=
> >>>>>>>>>>> VA_BITS_SV48)) \
> >>>>>>>>>>> -        mmap_end = VA_USER_SV48;            \
> >>>>>>>>>>>     else                            \
> >>>>>>>>>>> -        mmap_end = VA_USER_SV39;            \
> >>>>>>>>>>> +        mmap_end = (_addr + len);            \
> >>>>>>>>>>>     mmap_end;                        \
> >>>>>>>>>>> })
> >>>>>>>>>>> 
> >>>>>>>>>>> @@ -39,17 +33,18 @@
> >>>>>>>>>>>     typeof(addr) _addr = (addr);                \
> >>>>>>>>>>>     typeof(base) _base = (base);                \
> >>>>>>>>>>>     unsigned long rnd_gap = DEFAULT_MAP_WINDOW - (_base);    \
> >>>>>>>>>>> -    if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) &&
> >>>>>>>>>>> is_compat_task())) \
> >>>>>>>>>>> +    if ((_addr) == 0 ||                    \
> >>>>>>>>>>> +        (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||    \
> >>>>>>>>>>> +        ((_addr + len) > BIT(VA_BITS -
> >>>>>>>>>>> 1)))            \
> >>>>>>>>>>>         mmap_base = (_base);                \
> >>>>>>>>>>> -    else if (((_addr) >= VA_USER_SV57) && (VA_BITS >=
> >>>>>>>>>>> VA_BITS_SV57)) \
> >>>>>>>>>>> -        mmap_base = VA_USER_SV57 - rnd_gap; \
> >>>>>>>>>>> -    else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >=
> >>>>>>>>>>> VA_BITS_SV48)) \
> >>>>>>>>>>> -        mmap_base = VA_USER_SV48 - rnd_gap; \
> >>>>>>>>>>>     else                            \
> >>>>>>>>>>> -        mmap_base = VA_USER_SV39 - rnd_gap; \
> >>>>>>>>>>> +        mmap_base = (_addr + len) - rnd_gap; \
> >>>>>>>>>>>     mmap_base;                        \
> >>>>>>>>>>> })
> >>>>>>>>>>> 
> >>>>>>>>>>> +#ifdef CONFIG_64BIT
> >>>>>>>>>>> +#define DEFAULT_MAP_WINDOW    (UL(1) << (MMAP_VA_BITS - 1))
> >>>>>>>>>>> +#define STACK_TOP_MAX        TASK_SIZE_64
> >>>>>>>>>>> #else
> >>>>>>>>>>> #define DEFAULT_MAP_WINDOW    TASK_SIZE
> >>>>>>>>>>> #define STACK_TOP_MAX        TASK_SIZE
> >>>>>>>>>>>>> I have carefully tested your patch on qemu with sv57. A
> >>>>>>>>> bug that
> >>>>>>>>>> needs
> >>>>>>>>>> to be solved is that mmap with the same hint address without
> >>>>>>>>>> MAP_FIXED
> >>>>>>>>>> set will fail the second time.
> >>>>>>>>>>> Userspace code to reproduce the bug:
> >>>>>>>>>>> #include <sys/mman.h>
> >>>>>>>>>> #include <stdio.h>
> >>>>>>>>>> #include <stdint.h>
> >>>>>>>>>>> void test(char *addr) {
> >>>>>>>>>>    char *res = mmap(addr, 4096, PROT_READ | PROT_WRITE,
> >>>>>>>>>> MAP_ANONYMOUS
> >>>>>>>>>>> MAP_PRIVATE, -1, 0);
> >>>>>>>>>>    printf("hint %p got %p.\n", addr, res);
> >>>>>>>>>> }
> >>>>>>>>>>> int main (void) {
> >>>>>>>>>>    test(1<<30);
> >>>>>>>>>>    test(1<<30);
> >>>>>>>>>>    test(1<<30);
> >>>>>>>>>>    return 0;
> >>>>>>>>>> }
> >>>>>>>>>>> output:
> >>>>>>>>>>> hint 0x40000000 got 0x40000000.
> >>>>>>>>>> hint 0x40000000 got 0xffffffffffffffff.
> >>>>>>>>>> hint 0x40000000 got 0xffffffffffffffff.
> >>>>>>>>>>> output on x86:
> >>>>>>>>>>> hint 0x40000000 got 0x40000000.
> >>>>>>>>>> hint 0x40000000 got 0x7f9171363000.
> >>>>>>>>>> hint 0x40000000 got 0x7f9171362000.
> >>>>>>>>>>> It may need to implement a special arch_get_unmapped_area and
> >>>>>>>>>> arch_get_unmapped_area_topdown function.
> >>>>>>>>>> 
> >>>>>>>>> This is because hint address < rnd_gap. I have tried to let mmap_base =
> >>>>>>>>> min((_addr + len), (base) + TASK_SIZE - DEFAULT_MAP_WINDOW). However it
> >>>>>>>>> does not work for bottom-up while ulimit -s is unlimited. You said this
> >>>>>>>>> behavior is expected from patch v2 review. However it brings a new
> >>>>>>>>> regression even on sv39 systems.
> >>>>>>>>> 
> >>>>>>>>> I still don't know the reason why use addr+len as the upper-bound. I
> >>>>>>>>> think solution like x86/arm64/powerpc provide two address space switch
> >>>>>>>>> based on whether hint address above the default map window is enough.
> >>>>>>>>> 
> >>>>>>>> Yep this is expected. It is up to the maintainers to decide.
> >>>>>>> Sorry I forgot to reply to this, I had a buffer sitting around somewhere
> >>>>>>> but I must have lost it.
> >>>>>>> 
> >>>>>>> I think Charlie's approach is the right way to go.  Putting my userspace
> >>>>>>> hat on, I'd much rather have my allocations fail rather than silently
> >>>>>>> ignore the hint when there's memory pressure.
> >>>>>>> 
> >>>>>>> If there's some real use case that needs these low hints to be silently
> >>>>>>> ignored under VA pressure then we can try and figure something out that
> >>>>>>> makes those applications work.
> >>>>>> I could confirm that this patch has broken chromium's partition allocator on
> >>>>>> riscv64. The minimal reproduction I use is chromium-mmap.c:
> >>>>>> 
> >>>>>> #include <stdio.h>
> >>>>>> #include <sys/mman.h>
> >>>>>> 
> >>>>>> int main() {
> >>>>>>    void* expected = (void*)0x400000000;
> >>>>>>    void* addr = mmap(expected, 17179869184, PROT_NONE,
> >>>>>> MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
> >>>>>>    if (addr != expected) {
> >>>>> It is not valid to assume that the address returned by mmap will be the
> >>>>> hint address. If the hint address is not available, mmap will return a
> >>>>> different address.
> >>>> 
> >>>> Oh, sorry I didn't make it clear what is the expected behavior.
> >>>> The printf here is solely for debugging purpose and I don't mean that
> >>>> chromium expect it will get the hint address. The expected behavior is
> >>>> that both the two mmap calls will succeed.
> >>>> 
> >>>>>>        printf("Not expected address: %p != %p\n", addr, expected);
> >>>>>>    }
> >>>>>>    expected = (void*)0x3fffff000;
> >>>>>>    addr = mmap(expected, 17179873280, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS,
> >>>>>> -1, 0);
> >>>>>>    if (addr != expected) {
> >>>>>>        printf("Not expected address: %p != %p\n", addr, expected);
> >>>>>>    }
> >>>>>>    return 0;
> >>>>>> }
> >>>>>> 
> >>>>>> The second mmap fails with ENOMEM. Manually reverting this commit fixes the
> >>>>>> issue for me. So I think it's clearly a regression and breaks userspace.
> >>>>>> 
> >>>>> The issue here is that overlapping memory is being requested. This
> >>>>> second mmap will never be able to provide an address at 0x3fffff000 with
> >>>>> a size of 0x400001000 since mmap just provided an address at 0x400000000
> >>>>> with a size of 0x400000000.
> >>>>> 
> >>>>> Before this patch, this request causes mmap to return a completely
> >>>>> arbitrary value. There is no reason to use a hint address in this manner
> >>>>> because the hint can never be respected. Since an arbitrary address is
> >>>>> desired, a hint of zero should be used.
> >>>>> 
> >>>>> This patch causes the behavior to be more deterministic. Instead of
> >>>>> providing an arbitrary address, it causes the address to be less than or
> >>>>> equal to the hint address. This allows for applications to make
> >>>>> assumptions about the returned address.
> >>>> 
> >>>> About the overlap, of course the partition allocator's request for
> >>>> overlapped vma seems unreasonable.
> >>>> 
> >>>> But I still don't quite understand why mmap cannot use an address higher
> >>>> than the hint address.
> >>>> The hint address, after all, is a hint, not a requirement.
> >>>> 
> >>>> Quoting the man page:
> >>>> 
> >>>>>  If another mapping already exists there, the kernel picks
> >>>>>       a new address that may or may not depend on the hint.  The
> >>>>>       address of the new mapping is returned as the result of the call.
> >>>> 
> >>>> So for casual programmers that only reads man page but not architecture
> >>>> specific kernel documentation, the current behavior of mmap on riscv64
> >>>> failing on overlapped address ranges are quite surprising IMO.
> >>>> 
> >>>> And quoting the man page again about the errno:
> >>>> 
> >>>>>      ENOMEM No memory is available.
> >>>>> 
> >>>>>      ENOMEM The process's maximum number of mappings would have been
> >>>>>             exceeded.  This error can also occur for munmap(), when
> >>>>>             unmapping a region in the middle of an existing mapping,
> >>>>>             since this results in two smaller mappings on either side
> >>>>>             of the region being unmapped.
> >>>>> 
> >>>>>      ENOMEM (since Linux 4.7) The process's RLIMIT_DATA limit,
> >>>>>             described in getrlimit(2), would have been exceeded.
> >>>>> 
> >>>>>      ENOMEM We don't like addr, because it exceeds the virtual address
> >>>>>             space of the CPU.
> >>>>> 
> >>>> 
> >>>> There's no matching description for the ENOMEM returned here.
> >>>> I would suggest removing "because it exceeds the virtual address
> >>>> space of the CPU." from the last item if the ENOMEM behavior here
> >>>> is expected.
> >>>> 
> >>>>> This code is unfortunately relying on the previously mostly undefined
> >>>>> behavior of the hint address in mmap.
> >>>> 
> >>>> Although I haven't read the code of chromium's partition allocator to
> >>>> judge whether it should
> >>>> be improved or fixed for riscv64, I do know that the kernel "don't break
> >>>> userspace" and "never EVER blame the user programs".
> >>> 
> >>> Ya, sorry for breaking stuff.
> >>> 
> >>> The goal here was to move to the mmap flag behavor similar to what arm64 and x86 have, as that was done in a way that didn't appear to break userspace -- or at least any real userspace programs.  IIRC that first test was pretty broken (it actually depended on the hint address), but sounds like that's not the case.
> >>> 
> >>> I think maybe this is just luck: we didn't chunk the address space up, we're just hinting on every bit, so we're just more likely to hit the exhaustion.  Doesn't really matter, though, as if it's breaking stuff so we've got to deal with it.
> >>> 
> >>> Charlie and I are just talking, and best we can come up with is to move to the behavior where we fall back to larger allocation regions when there's no space in the smaller allocation region. 
> >> 
> >> 
> >> For this solution, the only difference from the mmap behavior of
> >> x86 and aarch64 is that we will first try to allocate some memory
> >> from an address less or equal to the request address + size. But
> >> for most cases, I think there is no need to do that, especially for
> >> those addresses < BIT(47), as most program works fine on x86-64,
> >> which has 47bit available userspace address space to use. And for
> >> that program that wants an address < BIT(32), we already have
> >> MAP_32BIT now.
> >> 
> >> I think we can just fix like that patch:
> >> https://lore.kernel.org/lkml/tencent_B2D0435BC011135736262764B511994F4805@qq.com/
> > 
> > This patch does not satisfy the requirement of having the ability to guarantee
> > that mmap returns an address that is less than the hint address.
> 
> Indeed. My intuition is to remove it and align it with x86 and aarch64.
> 
> > This
> > patch only allows an address to be less than the DEFAULT_MAP_WINDOW
> > which is 32 bits on sv32, 39 bits on sv39, and 48 bits on sv48 or sv57.
> > 
> > This patch also again falls into the trap of using the hint address to
> > forcefully restrict the address space.
> 
> Indeed. However, x86 and aarch64 also use this behavior to restrict
> va >= BIT(47) by default unless we have the hint address larger
> than BIT(47).
> 
> > I agree with Levi that it is not
> > very good behavior to have a "hint" cause mmap to fail if conforming to
> > the hint isn't possible. Instead, I believe it to be more logical to try
> > to allocate at the hint address, otherwise give a random address.
> > 
> 
> I also agree with this.
> 
> > The current behavior can then be maintained through the flag
> > MAP_BELOW_HINT. This way the user explicitly selects that they want mmap
> > to fail if an address could not be found within the hint address
> > constraints.
> > 
> 
> I think restricting the addresses with the MAP_BELOW_HINT flag
> would be the best choice. However, it remains a problem: What should
> the behavior be when there is no MAP_BELOW_HINT? I think we can
> fallback to Sv48 on the Sv57 machine by default to align with x86
> and aarch64.

Although that is the behavior on other architectures, I am hesitant to
follow it because it is a somewhat arbitrary restriction. With a generic
flag that can force mmap to provide exactly the number of bits that an
application needs, there is no need for this restriction on riscv. It
may cause problems for applications running on sv57 hardware, however:

1. sv57 hardware does not exist yet

2. A hint address would still be required if following the same behavior
as other architectures.
	a. It would aid in the porting of an application to sv57
	hardware, but I am not sure that forcing this restriction is
	worth having this one piece of parity. Applications using the
	proposed generic flag would work as expected on all
	architectures as well.

> 
> > - Charlie
> > 
> >> 
> >>> Charlie's going to try and throw together a patch for that, hopefully it'll sort things out.
> >>> 
> >>>>> The goal of this patch is to help
> >>>>> developers have more consistent mmap behavior, but maybe it is necessary
> >>>>> to hide this behavior behind an mmap flag.
> >>>> 
> >>>> Thank you for helping to shape a more consistent mmap behavior.
> >>>> I think this should be fixed ASAP either by allowing the hint address to
> >>>> be ignored
> >>>> (as suggested by the Linux man page), or hide this behavior behind an
> >>>> mmap flag as you said.
> >>>> 
> >>>>> - Charlie
> >>>>> 
> >>>>>> See alsohttps://github.com/riscv-forks/electron/issues/4
> >>>>>> 
> >>>>>>>> - Charlie
> >>>>>> Sincerely,
> >>>>>> Levi
> >>>>>> 
> >>>> 
> >>>> I accidentally introduced some HTML into this reply so this reply is
> >>>> resent as plain text.
> >>>> 
> >>>> Sincerely,
> >>>> Levi
> 
>
Yangyu Chen Aug. 23, 2024, 6:55 a.m. UTC | #15
> On Aug 23, 2024, at 13:57, Charlie Jenkins <charlie@rivosinc.com> wrote:
> 
> On Fri, Aug 23, 2024 at 01:28:18PM +0800, Yangyu Chen wrote:
>> 
>> 
>>> On Aug 23, 2024, at 12:39, Charlie Jenkins <charlie@rivosinc.com> wrote:
>>> 
>>> On Thu, Aug 22, 2024 at 10:51:54AM +0800, Yangyu Chen wrote:
>>>> 
>>>> 
>>>>> On Aug 22, 2024, at 06:17, Palmer Dabbelt <palmer@dabbelt.com> wrote:
>>>>> 
>>>>> On Mon, 19 Aug 2024 18:58:18 PDT (-0700), rsworktech@outlook.com wrote:
>>>>>> On 2024-08-20 01:00, Charlie Jenkins wrote:
>>>>>>> On Mon, Aug 19, 2024 at 01:55:57PM +0800, Levi Zim wrote:
>>>>>>>> On 2024-03-22 22:06, Palmer Dabbelt wrote:
>>>>>>>>> On Thu, 01 Feb 2024 18:28:06 PST (-0800), Charlie Jenkins wrote:
>>>>>>>>>> On Wed, Jan 31, 2024 at 11:59:43PM +0800, Yangyu Chen wrote:
>>>>>>>>>>> On Wed, 2024-01-31 at 22:41 +0800, Yangyu Chen wrote:
>>>>>>>>>>>> On Tue, 2024-01-30 at 17:07 -0800, Charlie Jenkins wrote:
>>>>>>>>>>>>> On riscv it is guaranteed that the address returned by mmap is less
>>>>>>>>>>>>> than
>>>>>>>>>>>>> the hint address. Allow mmap to return an address all the way up to
>>>>>>>>>>>>> addr, if provided, rather than just up to the lower address space.
>>>>>>>>>>>>>>> This provides a performance benefit as well, allowing
>>>>>>>>>>> mmap to exit
>>>>>>>>>>>>> after
>>>>>>>>>>>>> checking that the address is in range rather than searching for a
>>>>>>>>>>>>> valid
>>>>>>>>>>>>> address.
>>>>>>>>>>>>>>> It is possible to provide an address that uses at most the same
>>>>>>>>>>>>> number
>>>>>>>>>>>>> of bits, however it is significantly more computationally expensive
>>>>>>>>>>>>> to
>>>>>>>>>>>>> provide that number rather than setting the max to be the hint
>>>>>>>>>>>>> address.
>>>>>>>>>>>>> There is the instruction clz/clzw in Zbb that returns the highest
>>>>>>>>>>>>> set
>>>>>>>>>>>>> bit
>>>>>>>>>>>>> which could be used to performantly implement this, but it would
>>>>>>>>>>>>> still
>>>>>>>>>>>>> be slower than the current implementation. At worst case, half of
>>>>>>>>>>>>> the
>>>>>>>>>>>>> address would not be able to be allocated when a hint address is
>>>>>>>>>>>>> provided.
>>>>>>>>>>>>>>> Signed-off-by: Charlie Jenkins<charlie@rivosinc.com>
>>>>>>>>>>>>> ---
>>>>>>>>>>>>> arch/riscv/include/asm/processor.h | 27 +++++++++++---------------
>>>>>>>>>>>>> -
>>>>>>>>>>>>> 1 file changed, 11 insertions(+), 16 deletions(-)
>>>>>>>>>>>>>>> diff --git a/arch/riscv/include/asm/processor.h
>>>>>>>>>>>>> b/arch/riscv/include/asm/processor.h
>>>>>>>>>>>>> index f19f861cda54..8ece7a8f0e18 100644
>>>>>>>>>>>>> --- a/arch/riscv/include/asm/processor.h
>>>>>>>>>>>>> +++ b/arch/riscv/include/asm/processor.h
>>>>>>>>>>>>> @@ -14,22 +14,16 @@
>>>>>>>>>>>>> 
>>>>>>>>>>>>> #include <asm/ptrace.h>
>>>>>>>>>>>>> 
>>>>>>>>>>>>> -#ifdef CONFIG_64BIT
>>>>>>>>>>>>> -#define DEFAULT_MAP_WINDOW    (UL(1) << (MMAP_VA_BITS - 1))
>>>>>>>>>>>>> -#define STACK_TOP_MAX        TASK_SIZE_64
>>>>>>>>>>>>> -
>>>>>>>>>>>>> #define arch_get_mmap_end(addr, len, flags)            \
>>>>>>>>>>>>> ({                                \
>>>>>>>>>>>>>    unsigned long
>>>>>>>>>>>>> mmap_end;                    \
>>>>>>>>>>>>>    typeof(addr) _addr = (addr);                \
>>>>>>>>>>>>> -    if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) &&
>>>>>>>>>>>>> is_compat_task())) \
>>>>>>>>>>>>> +    if ((_addr) == 0 ||                    \
>>>>>>>>>>>>> +        (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||    \
>>>>>>>>>>>>> +        ((_addr + len) > BIT(VA_BITS -
>>>>>>>>>>>>> 1)))            \
>>>>>>>>>>>>>        mmap_end = STACK_TOP_MAX;            \
>>>>>>>>>>>>> -    else if ((_addr) >= VA_USER_SV57) \
>>>>>>>>>>>>> -        mmap_end = STACK_TOP_MAX;            \
>>>>>>>>>>>>> -    else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >=
>>>>>>>>>>>>> VA_BITS_SV48)) \
>>>>>>>>>>>>> -        mmap_end = VA_USER_SV48;            \
>>>>>>>>>>>>>    else                            \
>>>>>>>>>>>>> -        mmap_end = VA_USER_SV39;            \
>>>>>>>>>>>>> +        mmap_end = (_addr + len);            \
>>>>>>>>>>>>>    mmap_end;                        \
>>>>>>>>>>>>> })
>>>>>>>>>>>>> 
>>>>>>>>>>>>> @@ -39,17 +33,18 @@
>>>>>>>>>>>>>    typeof(addr) _addr = (addr);                \
>>>>>>>>>>>>>    typeof(base) _base = (base);                \
>>>>>>>>>>>>>    unsigned long rnd_gap = DEFAULT_MAP_WINDOW - (_base);    \
>>>>>>>>>>>>> -    if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) &&
>>>>>>>>>>>>> is_compat_task())) \
>>>>>>>>>>>>> +    if ((_addr) == 0 ||                    \
>>>>>>>>>>>>> +        (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||    \
>>>>>>>>>>>>> +        ((_addr + len) > BIT(VA_BITS -
>>>>>>>>>>>>> 1)))            \
>>>>>>>>>>>>>        mmap_base = (_base);                \
>>>>>>>>>>>>> -    else if (((_addr) >= VA_USER_SV57) && (VA_BITS >=
>>>>>>>>>>>>> VA_BITS_SV57)) \
>>>>>>>>>>>>> -        mmap_base = VA_USER_SV57 - rnd_gap; \
>>>>>>>>>>>>> -    else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >=
>>>>>>>>>>>>> VA_BITS_SV48)) \
>>>>>>>>>>>>> -        mmap_base = VA_USER_SV48 - rnd_gap; \
>>>>>>>>>>>>>    else                            \
>>>>>>>>>>>>> -        mmap_base = VA_USER_SV39 - rnd_gap; \
>>>>>>>>>>>>> +        mmap_base = (_addr + len) - rnd_gap; \
>>>>>>>>>>>>>    mmap_base;                        \
>>>>>>>>>>>>> })
>>>>>>>>>>>>> 
>>>>>>>>>>>>> +#ifdef CONFIG_64BIT
>>>>>>>>>>>>> +#define DEFAULT_MAP_WINDOW    (UL(1) << (MMAP_VA_BITS - 1))
>>>>>>>>>>>>> +#define STACK_TOP_MAX        TASK_SIZE_64
>>>>>>>>>>>>> #else
>>>>>>>>>>>>> #define DEFAULT_MAP_WINDOW    TASK_SIZE
>>>>>>>>>>>>> #define STACK_TOP_MAX        TASK_SIZE
>>>>>>>>>>>>>>> I have carefully tested your patch on qemu with sv57. A
>>>>>>>>>>> bug that
>>>>>>>>>>>> needs
>>>>>>>>>>>> to be solved is that mmap with the same hint address without
>>>>>>>>>>>> MAP_FIXED
>>>>>>>>>>>> set will fail the second time.
>>>>>>>>>>>>> Userspace code to reproduce the bug:
>>>>>>>>>>>>> #include <sys/mman.h>
>>>>>>>>>>>> #include <stdio.h>
>>>>>>>>>>>> #include <stdint.h>
>>>>>>>>>>>>> void test(char *addr) {
>>>>>>>>>>>>   char *res = mmap(addr, 4096, PROT_READ | PROT_WRITE,
>>>>>>>>>>>> MAP_ANONYMOUS
>>>>>>>>>>>>> MAP_PRIVATE, -1, 0);
>>>>>>>>>>>>   printf("hint %p got %p.\n", addr, res);
>>>>>>>>>>>> }
>>>>>>>>>>>>> int main (void) {
>>>>>>>>>>>>   test(1<<30);
>>>>>>>>>>>>   test(1<<30);
>>>>>>>>>>>>   test(1<<30);
>>>>>>>>>>>>   return 0;
>>>>>>>>>>>> }
>>>>>>>>>>>>> output:
>>>>>>>>>>>>> hint 0x40000000 got 0x40000000.
>>>>>>>>>>>> hint 0x40000000 got 0xffffffffffffffff.
>>>>>>>>>>>> hint 0x40000000 got 0xffffffffffffffff.
>>>>>>>>>>>>> output on x86:
>>>>>>>>>>>>> hint 0x40000000 got 0x40000000.
>>>>>>>>>>>> hint 0x40000000 got 0x7f9171363000.
>>>>>>>>>>>> hint 0x40000000 got 0x7f9171362000.
>>>>>>>>>>>>> It may need to implement a special arch_get_unmapped_area and
>>>>>>>>>>>> arch_get_unmapped_area_topdown function.
>>>>>>>>>>>> 
>>>>>>>>>>> This is because hint address < rnd_gap. I have tried to let mmap_base =
>>>>>>>>>>> min((_addr + len), (base) + TASK_SIZE - DEFAULT_MAP_WINDOW). However it
>>>>>>>>>>> does not work for bottom-up while ulimit -s is unlimited. You said this
>>>>>>>>>>> behavior is expected from patch v2 review. However it brings a new
>>>>>>>>>>> regression even on sv39 systems.
>>>>>>>>>>> 
>>>>>>>>>>> I still don't know the reason why use addr+len as the upper-bound. I
>>>>>>>>>>> think solution like x86/arm64/powerpc provide two address space switch
>>>>>>>>>>> based on whether hint address above the default map window is enough.
>>>>>>>>>>> 
>>>>>>>>>> Yep this is expected. It is up to the maintainers to decide.
>>>>>>>>> Sorry I forgot to reply to this, I had a buffer sitting around somewhere
>>>>>>>>> but I must have lost it.
>>>>>>>>> 
>>>>>>>>> I think Charlie's approach is the right way to go.  Putting my userspace
>>>>>>>>> hat on, I'd much rather have my allocations fail rather than silently
>>>>>>>>> ignore the hint when there's memory pressure.
>>>>>>>>> 
>>>>>>>>> If there's some real use case that needs these low hints to be silently
>>>>>>>>> ignored under VA pressure then we can try and figure something out that
>>>>>>>>> makes those applications work.
>>>>>>>> I could confirm that this patch has broken chromium's partition allocator on
>>>>>>>> riscv64. The minimal reproduction I use is chromium-mmap.c:
>>>>>>>> 
>>>>>>>> #include <stdio.h>
>>>>>>>> #include <sys/mman.h>
>>>>>>>> 
>>>>>>>> int main() {
>>>>>>>>   void* expected = (void*)0x400000000;
>>>>>>>>   void* addr = mmap(expected, 17179869184, PROT_NONE,
>>>>>>>> MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
>>>>>>>>   if (addr != expected) {
>>>>>>> It is not valid to assume that the address returned by mmap will be the
>>>>>>> hint address. If the hint address is not available, mmap will return a
>>>>>>> different address.
>>>>>> 
>>>>>> Oh, sorry I didn't make it clear what is the expected behavior.
>>>>>> The printf here is solely for debugging purpose and I don't mean that
>>>>>> chromium expect it will get the hint address. The expected behavior is
>>>>>> that both the two mmap calls will succeed.
>>>>>> 
>>>>>>>>       printf("Not expected address: %p != %p\n", addr, expected);
>>>>>>>>   }
>>>>>>>>   expected = (void*)0x3fffff000;
>>>>>>>>   addr = mmap(expected, 17179873280, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS,
>>>>>>>> -1, 0);
>>>>>>>>   if (addr != expected) {
>>>>>>>>       printf("Not expected address: %p != %p\n", addr, expected);
>>>>>>>>   }
>>>>>>>>   return 0;
>>>>>>>> }
>>>>>>>> 
>>>>>>>> The second mmap fails with ENOMEM. Manually reverting this commit fixes the
>>>>>>>> issue for me. So I think it's clearly a regression and breaks userspace.
>>>>>>>> 
>>>>>>> The issue here is that overlapping memory is being requested. This
>>>>>>> second mmap will never be able to provide an address at 0x3fffff000 with
>>>>>>> a size of 0x400001000 since mmap just provided an address at 0x400000000
>>>>>>> with a size of 0x400000000.
>>>>>>> 
>>>>>>> Before this patch, this request causes mmap to return a completely
>>>>>>> arbitrary value. There is no reason to use a hint address in this manner
>>>>>>> because the hint can never be respected. Since an arbitrary address is
>>>>>>> desired, a hint of zero should be used.
>>>>>>> 
>>>>>>> This patch causes the behavior to be more deterministic. Instead of
>>>>>>> providing an arbitrary address, it causes the address to be less than or
>>>>>>> equal to the hint address. This allows for applications to make
>>>>>>> assumptions about the returned address.
>>>>>> 
>>>>>> About the overlap, of course the partition allocator's request for
>>>>>> overlapped vma seems unreasonable.
>>>>>> 
>>>>>> But I still don't quite understand why mmap cannot use an address higher
>>>>>> than the hint address.
>>>>>> The hint address, after all, is a hint, not a requirement.
>>>>>> 
>>>>>> Quoting the man page:
>>>>>> 
>>>>>>> If another mapping already exists there, the kernel picks
>>>>>>>      a new address that may or may not depend on the hint.  The
>>>>>>>      address of the new mapping is returned as the result of the call.
>>>>>> 
>>>>>> So for casual programmers that only reads man page but not architecture
>>>>>> specific kernel documentation, the current behavior of mmap on riscv64
>>>>>> failing on overlapped address ranges are quite surprising IMO.
>>>>>> 
>>>>>> And quoting the man page again about the errno:
>>>>>> 
>>>>>>>     ENOMEM No memory is available.
>>>>>>> 
>>>>>>>     ENOMEM The process's maximum number of mappings would have been
>>>>>>>            exceeded.  This error can also occur for munmap(), when
>>>>>>>            unmapping a region in the middle of an existing mapping,
>>>>>>>            since this results in two smaller mappings on either side
>>>>>>>            of the region being unmapped.
>>>>>>> 
>>>>>>>     ENOMEM (since Linux 4.7) The process's RLIMIT_DATA limit,
>>>>>>>            described in getrlimit(2), would have been exceeded.
>>>>>>> 
>>>>>>>     ENOMEM We don't like addr, because it exceeds the virtual address
>>>>>>>            space of the CPU.
>>>>>>> 
>>>>>> 
>>>>>> There's no matching description for the ENOMEM returned here.
>>>>>> I would suggest removing "because it exceeds the virtual address
>>>>>> space of the CPU." from the last item if the ENOMEM behavior here
>>>>>> is expected.
>>>>>> 
>>>>>>> This code is unfortunately relying on the previously mostly undefined
>>>>>>> behavior of the hint address in mmap.
>>>>>> 
>>>>>> Although I haven't read the code of chromium's partition allocator to
>>>>>> judge whether it should
>>>>>> be improved or fixed for riscv64, I do know that the kernel "don't break
>>>>>> userspace" and "never EVER blame the user programs".
>>>>> 
>>>>> Ya, sorry for breaking stuff.
>>>>> 
>>>>> The goal here was to move to the mmap flag behavor similar to what arm64 and x86 have, as that was done in a way that didn't appear to break userspace -- or at least any real userspace programs.  IIRC that first test was pretty broken (it actually depended on the hint address), but sounds like that's not the case.
>>>>> 
>>>>> I think maybe this is just luck: we didn't chunk the address space up, we're just hinting on every bit, so we're just more likely to hit the exhaustion.  Doesn't really matter, though, as if it's breaking stuff so we've got to deal with it.
>>>>> 
>>>>> Charlie and I are just talking, and best we can come up with is to move to the behavior where we fall back to larger allocation regions when there's no space in the smaller allocation region.
>>>> 
>>>> 
>>>> For this solution, the only difference from the mmap behavior of
>>>> x86 and aarch64 is that we will first try to allocate some memory
>>>> from an address less or equal to the request address + size. But
>>>> for most cases, I think there is no need to do that, especially for
>>>> those addresses < BIT(47), as most program works fine on x86-64,
>>>> which has 47bit available userspace address space to use. And for
>>>> that program that wants an address < BIT(32), we already have
>>>> MAP_32BIT now.
>>>> 
>>>> I think we can just fix like that patch:
>>>> https://lore.kernel.org/lkml/tencent_B2D0435BC011135736262764B511994F4805@qq.com/
>>> 
>>> This patch does not satisfy the requirement of having the ability to guarantee
>>> that mmap returns an address that is less than the hint address.
>> 
>> Indeed. My intuition is to remove it and align it with x86 and aarch64.
>> 
>>> This
>>> patch only allows an address to be less than the DEFAULT_MAP_WINDOW
>>> which is 32 bits on sv32, 39 bits on sv39, and 48 bits on sv48 or sv57.
>>> 
>>> This patch also again falls into the trap of using the hint address to
>>> forcefully restrict the address space.
>> 
>> Indeed. However, x86 and aarch64 also use this behavior to restrict
>> va >= BIT(47) by default unless we have the hint address larger
>> than BIT(47).
>> 
>>> I agree with Levi that it is not
>>> very good behavior to have a "hint" cause mmap to fail if conforming to
>>> the hint isn't possible. Instead, I believe it to be more logical to try
>>> to allocate at the hint address, otherwise give a random address.
>>> 
>> 
>> I also agree with this.
>> 
>>> The current behavior can then be maintained through the flag
>>> MAP_BELOW_HINT. This way the user explicitly selects that they want mmap
>>> to fail if an address could not be found within the hint address
>>> constraints.
>>> 
>> 
>> I think restricting the addresses with the MAP_BELOW_HINT flag
>> would be the best choice. However, it remains a problem: What should
>> the behavior be when there is no MAP_BELOW_HINT? I think we can
>> fallback to Sv48 on the Sv57 machine by default to align with x86
>> and aarch64.
> 
> Although that is the behavior on other architectures, I am hesitant to
> follow it because it is a somewhat arbitrary restriction. With a generic
> flag that can force mmap to provide exactly the number of bits that an
> application needs, there is no need for this restriction on riscv. It
> may cause problems for applications running on sv57 hardware, however:
> 
> 1. sv57 hardware does not exist yet
> 

Note that we have QEMU, which uses Sv57 by default. If the mmap
returns an address >= BIT(47) with hint address == NULL, many QEMU
users, such as some distro package builders, may need to deal with
some problems.

> 2. A hint address would still be required if following the same behavior
> as other architectures.
> a. It would aid in the porting of an application to sv57
> hardware, but I am not sure that forcing this restriction is
> worth having this one piece of parity. Applications using the
> proposed generic flag would work as expected on all
> architectures as well.
> 
>> 
>>> - Charlie
>>> 
>>>> 
>>>>> Charlie's going to try and throw together a patch for that, hopefully it'll sort things out.
>>>>> 
>>>>>>> The goal of this patch is to help
>>>>>>> developers have more consistent mmap behavior, but maybe it is necessary
>>>>>>> to hide this behavior behind an mmap flag.
>>>>>> 
>>>>>> Thank you for helping to shape a more consistent mmap behavior.
>>>>>> I think this should be fixed ASAP either by allowing the hint address to
>>>>>> be ignored
>>>>>> (as suggested by the Linux man page), or hide this behavior behind an
>>>>>> mmap flag as you said.
>>>>>> 
>>>>>>> - Charlie
>>>>>>> 
>>>>>>>> See alsohttps://github.com/riscv-forks/electron/issues/4
>>>>>>>> 
>>>>>>>>>> - Charlie
>>>>>>>> Sincerely,
>>>>>>>> Levi
>>>>>>>> 
>>>>>> 
>>>>>> I accidentally introduced some HTML into this reply so this reply is
>>>>>> resent as plain text.
>>>>>> 
>>>>>> Sincerely,
>>>>>> Levi
>> 
>>
Charlie Jenkins Aug. 26, 2024, 4:30 p.m. UTC | #16
On Fri, Aug 23, 2024 at 02:55:15PM +0800, Yangyu Chen wrote:
> 
> 
> > On Aug 23, 2024, at 13:57, Charlie Jenkins <charlie@rivosinc.com> wrote:
> > 
> > On Fri, Aug 23, 2024 at 01:28:18PM +0800, Yangyu Chen wrote:
> >> 
> >> 
> >>> On Aug 23, 2024, at 12:39, Charlie Jenkins <charlie@rivosinc.com> wrote:
> >>> 
> >>> On Thu, Aug 22, 2024 at 10:51:54AM +0800, Yangyu Chen wrote:
> >>>> 
> >>>> 
> >>>>> On Aug 22, 2024, at 06:17, Palmer Dabbelt <palmer@dabbelt.com> wrote:
> >>>>> 
> >>>>> On Mon, 19 Aug 2024 18:58:18 PDT (-0700), rsworktech@outlook.com wrote:
> >>>>>> On 2024-08-20 01:00, Charlie Jenkins wrote:
> >>>>>>> On Mon, Aug 19, 2024 at 01:55:57PM +0800, Levi Zim wrote:
> >>>>>>>> On 2024-03-22 22:06, Palmer Dabbelt wrote:
> >>>>>>>>> On Thu, 01 Feb 2024 18:28:06 PST (-0800), Charlie Jenkins wrote:
> >>>>>>>>>> On Wed, Jan 31, 2024 at 11:59:43PM +0800, Yangyu Chen wrote:
> >>>>>>>>>>> On Wed, 2024-01-31 at 22:41 +0800, Yangyu Chen wrote:
> >>>>>>>>>>>> On Tue, 2024-01-30 at 17:07 -0800, Charlie Jenkins wrote:
> >>>>>>>>>>>>> On riscv it is guaranteed that the address returned by mmap is less
> >>>>>>>>>>>>> than
> >>>>>>>>>>>>> the hint address. Allow mmap to return an address all the way up to
> >>>>>>>>>>>>> addr, if provided, rather than just up to the lower address space.
> >>>>>>>>>>>>>>> This provides a performance benefit as well, allowing
> >>>>>>>>>>> mmap to exit
> >>>>>>>>>>>>> after
> >>>>>>>>>>>>> checking that the address is in range rather than searching for a
> >>>>>>>>>>>>> valid
> >>>>>>>>>>>>> address.
> >>>>>>>>>>>>>>> It is possible to provide an address that uses at most the same
> >>>>>>>>>>>>> number
> >>>>>>>>>>>>> of bits, however it is significantly more computationally expensive
> >>>>>>>>>>>>> to
> >>>>>>>>>>>>> provide that number rather than setting the max to be the hint
> >>>>>>>>>>>>> address.
> >>>>>>>>>>>>> There is the instruction clz/clzw in Zbb that returns the highest
> >>>>>>>>>>>>> set
> >>>>>>>>>>>>> bit
> >>>>>>>>>>>>> which could be used to performantly implement this, but it would
> >>>>>>>>>>>>> still
> >>>>>>>>>>>>> be slower than the current implementation. At worst case, half of
> >>>>>>>>>>>>> the
> >>>>>>>>>>>>> address would not be able to be allocated when a hint address is
> >>>>>>>>>>>>> provided.
> >>>>>>>>>>>>>>> Signed-off-by: Charlie Jenkins<charlie@rivosinc.com>
> >>>>>>>>>>>>> ---
> >>>>>>>>>>>>> arch/riscv/include/asm/processor.h | 27 +++++++++++---------------
> >>>>>>>>>>>>> -
> >>>>>>>>>>>>> 1 file changed, 11 insertions(+), 16 deletions(-)
> >>>>>>>>>>>>>>> diff --git a/arch/riscv/include/asm/processor.h
> >>>>>>>>>>>>> b/arch/riscv/include/asm/processor.h
> >>>>>>>>>>>>> index f19f861cda54..8ece7a8f0e18 100644
> >>>>>>>>>>>>> --- a/arch/riscv/include/asm/processor.h
> >>>>>>>>>>>>> +++ b/arch/riscv/include/asm/processor.h
> >>>>>>>>>>>>> @@ -14,22 +14,16 @@
> >>>>>>>>>>>>> 
> >>>>>>>>>>>>> #include <asm/ptrace.h>
> >>>>>>>>>>>>> 
> >>>>>>>>>>>>> -#ifdef CONFIG_64BIT
> >>>>>>>>>>>>> -#define DEFAULT_MAP_WINDOW    (UL(1) << (MMAP_VA_BITS - 1))
> >>>>>>>>>>>>> -#define STACK_TOP_MAX        TASK_SIZE_64
> >>>>>>>>>>>>> -
> >>>>>>>>>>>>> #define arch_get_mmap_end(addr, len, flags)            \
> >>>>>>>>>>>>> ({                                \
> >>>>>>>>>>>>>    unsigned long
> >>>>>>>>>>>>> mmap_end;                    \
> >>>>>>>>>>>>>    typeof(addr) _addr = (addr);                \
> >>>>>>>>>>>>> -    if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) &&
> >>>>>>>>>>>>> is_compat_task())) \
> >>>>>>>>>>>>> +    if ((_addr) == 0 ||                    \
> >>>>>>>>>>>>> +        (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||    \
> >>>>>>>>>>>>> +        ((_addr + len) > BIT(VA_BITS -
> >>>>>>>>>>>>> 1)))            \
> >>>>>>>>>>>>>        mmap_end = STACK_TOP_MAX;            \
> >>>>>>>>>>>>> -    else if ((_addr) >= VA_USER_SV57) \
> >>>>>>>>>>>>> -        mmap_end = STACK_TOP_MAX;            \
> >>>>>>>>>>>>> -    else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >=
> >>>>>>>>>>>>> VA_BITS_SV48)) \
> >>>>>>>>>>>>> -        mmap_end = VA_USER_SV48;            \
> >>>>>>>>>>>>>    else                            \
> >>>>>>>>>>>>> -        mmap_end = VA_USER_SV39;            \
> >>>>>>>>>>>>> +        mmap_end = (_addr + len);            \
> >>>>>>>>>>>>>    mmap_end;                        \
> >>>>>>>>>>>>> })
> >>>>>>>>>>>>> 
> >>>>>>>>>>>>> @@ -39,17 +33,18 @@
> >>>>>>>>>>>>>    typeof(addr) _addr = (addr);                \
> >>>>>>>>>>>>>    typeof(base) _base = (base);                \
> >>>>>>>>>>>>>    unsigned long rnd_gap = DEFAULT_MAP_WINDOW - (_base);    \
> >>>>>>>>>>>>> -    if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) &&
> >>>>>>>>>>>>> is_compat_task())) \
> >>>>>>>>>>>>> +    if ((_addr) == 0 ||                    \
> >>>>>>>>>>>>> +        (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||    \
> >>>>>>>>>>>>> +        ((_addr + len) > BIT(VA_BITS -
> >>>>>>>>>>>>> 1)))            \
> >>>>>>>>>>>>>        mmap_base = (_base);                \
> >>>>>>>>>>>>> -    else if (((_addr) >= VA_USER_SV57) && (VA_BITS >=
> >>>>>>>>>>>>> VA_BITS_SV57)) \
> >>>>>>>>>>>>> -        mmap_base = VA_USER_SV57 - rnd_gap; \
> >>>>>>>>>>>>> -    else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >=
> >>>>>>>>>>>>> VA_BITS_SV48)) \
> >>>>>>>>>>>>> -        mmap_base = VA_USER_SV48 - rnd_gap; \
> >>>>>>>>>>>>>    else                            \
> >>>>>>>>>>>>> -        mmap_base = VA_USER_SV39 - rnd_gap; \
> >>>>>>>>>>>>> +        mmap_base = (_addr + len) - rnd_gap; \
> >>>>>>>>>>>>>    mmap_base;                        \
> >>>>>>>>>>>>> })
> >>>>>>>>>>>>> 
> >>>>>>>>>>>>> +#ifdef CONFIG_64BIT
> >>>>>>>>>>>>> +#define DEFAULT_MAP_WINDOW    (UL(1) << (MMAP_VA_BITS - 1))
> >>>>>>>>>>>>> +#define STACK_TOP_MAX        TASK_SIZE_64
> >>>>>>>>>>>>> #else
> >>>>>>>>>>>>> #define DEFAULT_MAP_WINDOW    TASK_SIZE
> >>>>>>>>>>>>> #define STACK_TOP_MAX        TASK_SIZE
> >>>>>>>>>>>>>>> I have carefully tested your patch on qemu with sv57. A
> >>>>>>>>>>> bug that
> >>>>>>>>>>>> needs
> >>>>>>>>>>>> to be solved is that mmap with the same hint address without
> >>>>>>>>>>>> MAP_FIXED
> >>>>>>>>>>>> set will fail the second time.
> >>>>>>>>>>>>> Userspace code to reproduce the bug:
> >>>>>>>>>>>>> #include <sys/mman.h>
> >>>>>>>>>>>> #include <stdio.h>
> >>>>>>>>>>>> #include <stdint.h>
> >>>>>>>>>>>>> void test(char *addr) {
> >>>>>>>>>>>>   char *res = mmap(addr, 4096, PROT_READ | PROT_WRITE,
> >>>>>>>>>>>> MAP_ANONYMOUS
> >>>>>>>>>>>>> MAP_PRIVATE, -1, 0);
> >>>>>>>>>>>>   printf("hint %p got %p.\n", addr, res);
> >>>>>>>>>>>> }
> >>>>>>>>>>>>> int main (void) {
> >>>>>>>>>>>>   test(1<<30);
> >>>>>>>>>>>>   test(1<<30);
> >>>>>>>>>>>>   test(1<<30);
> >>>>>>>>>>>>   return 0;
> >>>>>>>>>>>> }
> >>>>>>>>>>>>> output:
> >>>>>>>>>>>>> hint 0x40000000 got 0x40000000.
> >>>>>>>>>>>> hint 0x40000000 got 0xffffffffffffffff.
> >>>>>>>>>>>> hint 0x40000000 got 0xffffffffffffffff.
> >>>>>>>>>>>>> output on x86:
> >>>>>>>>>>>>> hint 0x40000000 got 0x40000000.
> >>>>>>>>>>>> hint 0x40000000 got 0x7f9171363000.
> >>>>>>>>>>>> hint 0x40000000 got 0x7f9171362000.
> >>>>>>>>>>>>> It may need to implement a special arch_get_unmapped_area and
> >>>>>>>>>>>> arch_get_unmapped_area_topdown function.
> >>>>>>>>>>>> 
> >>>>>>>>>>> This is because hint address < rnd_gap. I have tried to let mmap_base =
> >>>>>>>>>>> min((_addr + len), (base) + TASK_SIZE - DEFAULT_MAP_WINDOW). However it
> >>>>>>>>>>> does not work for bottom-up while ulimit -s is unlimited. You said this
> >>>>>>>>>>> behavior is expected from patch v2 review. However it brings a new
> >>>>>>>>>>> regression even on sv39 systems.
> >>>>>>>>>>> 
> >>>>>>>>>>> I still don't know the reason why use addr+len as the upper-bound. I
> >>>>>>>>>>> think solution like x86/arm64/powerpc provide two address space switch
> >>>>>>>>>>> based on whether hint address above the default map window is enough.
> >>>>>>>>>>> 
> >>>>>>>>>> Yep this is expected. It is up to the maintainers to decide.
> >>>>>>>>> Sorry I forgot to reply to this, I had a buffer sitting around somewhere
> >>>>>>>>> but I must have lost it.
> >>>>>>>>> 
> >>>>>>>>> I think Charlie's approach is the right way to go.  Putting my userspace
> >>>>>>>>> hat on, I'd much rather have my allocations fail rather than silently
> >>>>>>>>> ignore the hint when there's memory pressure.
> >>>>>>>>> 
> >>>>>>>>> If there's some real use case that needs these low hints to be silently
> >>>>>>>>> ignored under VA pressure then we can try and figure something out that
> >>>>>>>>> makes those applications work.
> >>>>>>>> I could confirm that this patch has broken chromium's partition allocator on
> >>>>>>>> riscv64. The minimal reproduction I use is chromium-mmap.c:
> >>>>>>>> 
> >>>>>>>> #include <stdio.h>
> >>>>>>>> #include <sys/mman.h>
> >>>>>>>> 
> >>>>>>>> int main() {
> >>>>>>>>   void* expected = (void*)0x400000000;
> >>>>>>>>   void* addr = mmap(expected, 17179869184, PROT_NONE,
> >>>>>>>> MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
> >>>>>>>>   if (addr != expected) {
> >>>>>>> It is not valid to assume that the address returned by mmap will be the
> >>>>>>> hint address. If the hint address is not available, mmap will return a
> >>>>>>> different address.
> >>>>>> 
> >>>>>> Oh, sorry I didn't make it clear what is the expected behavior.
> >>>>>> The printf here is solely for debugging purpose and I don't mean that
> >>>>>> chromium expect it will get the hint address. The expected behavior is
> >>>>>> that both the two mmap calls will succeed.
> >>>>>> 
> >>>>>>>>       printf("Not expected address: %p != %p\n", addr, expected);
> >>>>>>>>   }
> >>>>>>>>   expected = (void*)0x3fffff000;
> >>>>>>>>   addr = mmap(expected, 17179873280, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS,
> >>>>>>>> -1, 0);
> >>>>>>>>   if (addr != expected) {
> >>>>>>>>       printf("Not expected address: %p != %p\n", addr, expected);
> >>>>>>>>   }
> >>>>>>>>   return 0;
> >>>>>>>> }
> >>>>>>>> 
> >>>>>>>> The second mmap fails with ENOMEM. Manually reverting this commit fixes the
> >>>>>>>> issue for me. So I think it's clearly a regression and breaks userspace.
> >>>>>>>> 
> >>>>>>> The issue here is that overlapping memory is being requested. This
> >>>>>>> second mmap will never be able to provide an address at 0x3fffff000 with
> >>>>>>> a size of 0x400001000 since mmap just provided an address at 0x400000000
> >>>>>>> with a size of 0x400000000.
> >>>>>>> 
> >>>>>>> Before this patch, this request causes mmap to return a completely
> >>>>>>> arbitrary value. There is no reason to use a hint address in this manner
> >>>>>>> because the hint can never be respected. Since an arbitrary address is
> >>>>>>> desired, a hint of zero should be used.
> >>>>>>> 
> >>>>>>> This patch causes the behavior to be more deterministic. Instead of
> >>>>>>> providing an arbitrary address, it causes the address to be less than or
> >>>>>>> equal to the hint address. This allows for applications to make
> >>>>>>> assumptions about the returned address.
> >>>>>> 
> >>>>>> About the overlap, of course the partition allocator's request for
> >>>>>> overlapped vma seems unreasonable.
> >>>>>> 
> >>>>>> But I still don't quite understand why mmap cannot use an address higher
> >>>>>> than the hint address.
> >>>>>> The hint address, after all, is a hint, not a requirement.
> >>>>>> 
> >>>>>> Quoting the man page:
> >>>>>> 
> >>>>>>> If another mapping already exists there, the kernel picks
> >>>>>>>      a new address that may or may not depend on the hint.  The
> >>>>>>>      address of the new mapping is returned as the result of the call.
> >>>>>> 
> >>>>>> So for casual programmers that only reads man page but not architecture
> >>>>>> specific kernel documentation, the current behavior of mmap on riscv64
> >>>>>> failing on overlapped address ranges are quite surprising IMO.
> >>>>>> 
> >>>>>> And quoting the man page again about the errno:
> >>>>>> 
> >>>>>>>     ENOMEM No memory is available.
> >>>>>>> 
> >>>>>>>     ENOMEM The process's maximum number of mappings would have been
> >>>>>>>            exceeded.  This error can also occur for munmap(), when
> >>>>>>>            unmapping a region in the middle of an existing mapping,
> >>>>>>>            since this results in two smaller mappings on either side
> >>>>>>>            of the region being unmapped.
> >>>>>>> 
> >>>>>>>     ENOMEM (since Linux 4.7) The process's RLIMIT_DATA limit,
> >>>>>>>            described in getrlimit(2), would have been exceeded.
> >>>>>>> 
> >>>>>>>     ENOMEM We don't like addr, because it exceeds the virtual address
> >>>>>>>            space of the CPU.
> >>>>>>> 
> >>>>>> 
> >>>>>> There's no matching description for the ENOMEM returned here.
> >>>>>> I would suggest removing "because it exceeds the virtual address
> >>>>>> space of the CPU." from the last item if the ENOMEM behavior here
> >>>>>> is expected.
> >>>>>> 
> >>>>>>> This code is unfortunately relying on the previously mostly undefined
> >>>>>>> behavior of the hint address in mmap.
> >>>>>> 
> >>>>>> Although I haven't read the code of chromium's partition allocator to
> >>>>>> judge whether it should
> >>>>>> be improved or fixed for riscv64, I do know that the kernel "don't break
> >>>>>> userspace" and "never EVER blame the user programs".
> >>>>> 
> >>>>> Ya, sorry for breaking stuff.
> >>>>> 
> >>>>> The goal here was to move to the mmap flag behavor similar to what arm64 and x86 have, as that was done in a way that didn't appear to break userspace -- or at least any real userspace programs.  IIRC that first test was pretty broken (it actually depended on the hint address), but sounds like that's not the case.
> >>>>> 
> >>>>> I think maybe this is just luck: we didn't chunk the address space up, we're just hinting on every bit, so we're just more likely to hit the exhaustion.  Doesn't really matter, though, as if it's breaking stuff so we've got to deal with it.
> >>>>> 
> >>>>> Charlie and I are just talking, and best we can come up with is to move to the behavior where we fall back to larger allocation regions when there's no space in the smaller allocation region.
> >>>> 
> >>>> 
> >>>> For this solution, the only difference from the mmap behavior of
> >>>> x86 and aarch64 is that we will first try to allocate some memory
> >>>> from an address less or equal to the request address + size. But
> >>>> for most cases, I think there is no need to do that, especially for
> >>>> those addresses < BIT(47), as most program works fine on x86-64,
> >>>> which has 47bit available userspace address space to use. And for
> >>>> that program that wants an address < BIT(32), we already have
> >>>> MAP_32BIT now.
> >>>> 
> >>>> I think we can just fix like that patch:
> >>>> https://lore.kernel.org/lkml/tencent_B2D0435BC011135736262764B511994F4805@qq.com/
> >>> 
> >>> This patch does not satisfy the requirement of having the ability to guarantee
> >>> that mmap returns an address that is less than the hint address.
> >> 
> >> Indeed. My intuition is to remove it and align it with x86 and aarch64.
> >> 
> >>> This
> >>> patch only allows an address to be less than the DEFAULT_MAP_WINDOW
> >>> which is 32 bits on sv32, 39 bits on sv39, and 48 bits on sv48 or sv57.
> >>> 
> >>> This patch also again falls into the trap of using the hint address to
> >>> forcefully restrict the address space.
> >> 
> >> Indeed. However, x86 and aarch64 also use this behavior to restrict
> >> va >= BIT(47) by default unless we have the hint address larger
> >> than BIT(47).
> >> 
> >>> I agree with Levi that it is not
> >>> very good behavior to have a "hint" cause mmap to fail if conforming to
> >>> the hint isn't possible. Instead, I believe it to be more logical to try
> >>> to allocate at the hint address, otherwise give a random address.
> >>> 
> >> 
> >> I also agree with this.
> >> 
> >>> The current behavior can then be maintained through the flag
> >>> MAP_BELOW_HINT. This way the user explicitly selects that they want mmap
> >>> to fail if an address could not be found within the hint address
> >>> constraints.
> >>> 
> >> 
> >> I think restricting the addresses with the MAP_BELOW_HINT flag
> >> would be the best choice. However, it remains a problem: What should
> >> the behavior be when there is no MAP_BELOW_HINT? I think we can
> >> fallback to Sv48 on the Sv57 machine by default to align with x86
> >> and aarch64.
> > 
> > Although that is the behavior on other architectures, I am hesitant to
> > follow it because it is a somewhat arbitrary restriction. With a generic
> > flag that can force mmap to provide exactly the number of bits that an
> > application needs, there is no need for this restriction on riscv. It
> > may cause problems for applications running on sv57 hardware, however:
> > 
> > 1. sv57 hardware does not exist yet
> > 
> 
> Note that we have QEMU, which uses Sv57 by default. If the mmap
> returns an address >= BIT(47) with hint address == NULL, many QEMU
> users, such as some distro package builders, may need to deal with
> some problems.

Yes that is true. However that was an existing problem before any of
these patches went in. I think the best solution is to have an explicit
flag as you have suggested with similar behavor to MAP_32BIT to solve
this.

- Charlie

> 
> > 2. A hint address would still be required if following the same behavior
> > as other architectures.
> > a. It would aid in the porting of an application to sv57
> > hardware, but I am not sure that forcing this restriction is
> > worth having this one piece of parity. Applications using the
> > proposed generic flag would work as expected on all
> > architectures as well.
> > 
> >> 
> >>> - Charlie
> >>> 
> >>>> 
> >>>>> Charlie's going to try and throw together a patch for that, hopefully it'll sort things out.
> >>>>> 
> >>>>>>> The goal of this patch is to help
> >>>>>>> developers have more consistent mmap behavior, but maybe it is necessary
> >>>>>>> to hide this behavior behind an mmap flag.
> >>>>>> 
> >>>>>> Thank you for helping to shape a more consistent mmap behavior.
> >>>>>> I think this should be fixed ASAP either by allowing the hint address to
> >>>>>> be ignored
> >>>>>> (as suggested by the Linux man page), or hide this behavior behind an
> >>>>>> mmap flag as you said.
> >>>>>> 
> >>>>>>> - Charlie
> >>>>>>> 
> >>>>>>>> See alsohttps://github.com/riscv-forks/electron/issues/4
> >>>>>>>> 
> >>>>>>>>>> - Charlie
> >>>>>>>> Sincerely,
> >>>>>>>> Levi
> >>>>>>>> 
> >>>>>> 
> >>>>>> I accidentally introduced some HTML into this reply so this reply is
> >>>>>> resent as plain text.
> >>>>>> 
> >>>>>> Sincerely,
> >>>>>> Levi
> >> 
> >> 
>
diff mbox series

Patch

diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h
index f19f861cda54..8ece7a8f0e18 100644
--- a/arch/riscv/include/asm/processor.h
+++ b/arch/riscv/include/asm/processor.h
@@ -14,22 +14,16 @@ 
 
 #include <asm/ptrace.h>
 
-#ifdef CONFIG_64BIT
-#define DEFAULT_MAP_WINDOW	(UL(1) << (MMAP_VA_BITS - 1))
-#define STACK_TOP_MAX		TASK_SIZE_64
-
 #define arch_get_mmap_end(addr, len, flags)			\
 ({								\
 	unsigned long mmap_end;					\
 	typeof(addr) _addr = (addr);				\
-	if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) && is_compat_task())) \
+	if ((_addr) == 0 ||					\
+	    (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||	\
+	    ((_addr + len) > BIT(VA_BITS - 1)))			\
 		mmap_end = STACK_TOP_MAX;			\
-	else if ((_addr) >= VA_USER_SV57)			\
-		mmap_end = STACK_TOP_MAX;			\
-	else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >= VA_BITS_SV48)) \
-		mmap_end = VA_USER_SV48;			\
 	else							\
-		mmap_end = VA_USER_SV39;			\
+		mmap_end = (_addr + len);			\
 	mmap_end;						\
 })
 
@@ -39,17 +33,18 @@ 
 	typeof(addr) _addr = (addr);				\
 	typeof(base) _base = (base);				\
 	unsigned long rnd_gap = DEFAULT_MAP_WINDOW - (_base);	\
-	if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) && is_compat_task())) \
+	if ((_addr) == 0 ||					\
+	    (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||	\
+	    ((_addr + len) > BIT(VA_BITS - 1)))			\
 		mmap_base = (_base);				\
-	else if (((_addr) >= VA_USER_SV57) && (VA_BITS >= VA_BITS_SV57)) \
-		mmap_base = VA_USER_SV57 - rnd_gap;		\
-	else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >= VA_BITS_SV48)) \
-		mmap_base = VA_USER_SV48 - rnd_gap;		\
 	else							\
-		mmap_base = VA_USER_SV39 - rnd_gap;		\
+		mmap_base = (_addr + len) - rnd_gap;		\
 	mmap_base;						\
 })
 
+#ifdef CONFIG_64BIT
+#define DEFAULT_MAP_WINDOW	(UL(1) << (MMAP_VA_BITS - 1))
+#define STACK_TOP_MAX		TASK_SIZE_64
 #else
 #define DEFAULT_MAP_WINDOW	TASK_SIZE
 #define STACK_TOP_MAX		TASK_SIZE