diff mbox series

[v3,1/3] riscv: mm: Use hint address in mmap if available

Message ID 20240130-use_mmap_hint_address-v3-1-8a655cfa8bcb@rivosinc.com (mailing list archive)
State Accepted
Commit b5b4287accd702f562a49a60b10dbfaf7d40270f
Headers show
Series riscv: mm: Extend mappable memory up to hint address | expand

Commit Message

Charlie Jenkins Jan. 31, 2024, 1:07 a.m. UTC
On riscv it is guaranteed that the address returned by mmap is less than
the hint address. Allow mmap to return an address all the way up to
addr, if provided, rather than just up to the lower address space.

This provides a performance benefit as well, allowing mmap to exit after
checking that the address is in range rather than searching for a valid
address.

It is possible to provide an address that uses at most the same number
of bits; however, it is significantly more computationally expensive to
compute that bound than to set the max to be the hint address. The
clz/clzw instructions in Zbb count leading zero bits, which identifies
the highest set bit and could be used to implement this efficiently, but
it would still be slower than the current implementation. In the worst
case, half of the address space would not be able to be allocated when a
hint address is provided.

Signed-off-by: Charlie Jenkins <charlie@rivosinc.com>
---
 arch/riscv/include/asm/processor.h | 27 +++++++++++----------------
 1 file changed, 11 insertions(+), 16 deletions(-)

Comments

Yangyu Chen Jan. 31, 2024, 2:41 p.m. UTC | #1
On Tue, 2024-01-30 at 17:07 -0800, Charlie Jenkins wrote:
> On riscv it is guaranteed that the address returned by mmap is less
> than
> the hint address. Allow mmap to return an address all the way up to
> addr, if provided, rather than just up to the lower address space.
> 
> This provides a performance benefit as well, allowing mmap to exit
> after
> checking that the address is in range rather than searching for a
> valid
> address.
> 
> It is possible to provide an address that uses at most the same
> number
> of bits, however it is significantly more computationally expensive
> to
> provide that number rather than setting the max to be the hint
> address.
> There is the instruction clz/clzw in Zbb that returns the highest set
> bit
> which could be used to performantly implement this, but it would
> still
> be slower than the current implementation. At worst case, half of the
> address would not be able to be allocated when a hint address is
> provided.
> 
> Signed-off-by: Charlie Jenkins <charlie@rivosinc.com>
> ---
>  arch/riscv/include/asm/processor.h | 27 +++++++++++----------------
>  1 file changed, 11 insertions(+), 16 deletions(-)
> 
> diff --git a/arch/riscv/include/asm/processor.h
> b/arch/riscv/include/asm/processor.h
> index f19f861cda54..8ece7a8f0e18 100644
> --- a/arch/riscv/include/asm/processor.h
> +++ b/arch/riscv/include/asm/processor.h
> @@ -14,22 +14,16 @@
>  
>  #include <asm/ptrace.h>
>  
> -#ifdef CONFIG_64BIT
> -#define DEFAULT_MAP_WINDOW	(UL(1) << (MMAP_VA_BITS - 1))
> -#define STACK_TOP_MAX		TASK_SIZE_64
> -
>  #define arch_get_mmap_end(addr, len, flags)			\
>  ({								\
>  	unsigned long
> mmap_end;					\
>  	typeof(addr) _addr = (addr);				\
> -	if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) &&
> is_compat_task())) \
> +	if ((_addr) == 0 ||					\
> +	    (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||	\
> +	    ((_addr + len) > BIT(VA_BITS -
> 1)))			\
>  		mmap_end = STACK_TOP_MAX;			\
> -	else if ((_addr) >= VA_USER_SV57)			\
> -		mmap_end = STACK_TOP_MAX;			\
> -	else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >=
> VA_BITS_SV48)) \
> -		mmap_end = VA_USER_SV48;			\
>  	else							\
> -		mmap_end = VA_USER_SV39;			\
> +		mmap_end = (_addr + len);			\
>  	mmap_end;						\
>  })
>  
> @@ -39,17 +33,18 @@
>  	typeof(addr) _addr = (addr);				\
>  	typeof(base) _base = (base);				\
>  	unsigned long rnd_gap = DEFAULT_MAP_WINDOW - (_base);	\
> -	if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) &&
> is_compat_task())) \
> +	if ((_addr) == 0 ||					\
> +	    (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||	\
> +	    ((_addr + len) > BIT(VA_BITS -
> 1)))			\
>  		mmap_base = (_base);				\
> -	else if (((_addr) >= VA_USER_SV57) && (VA_BITS >=
> VA_BITS_SV57)) \
> -		mmap_base = VA_USER_SV57 - rnd_gap;		\
> -	else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >=
> VA_BITS_SV48)) \
> -		mmap_base = VA_USER_SV48 - rnd_gap;		\
>  	else							\
> -		mmap_base = VA_USER_SV39 - rnd_gap;		\
> +		mmap_base = (_addr + len) - rnd_gap;		\
>  	mmap_base;						\
>  })
>  
> +#ifdef CONFIG_64BIT
> +#define DEFAULT_MAP_WINDOW	(UL(1) << (MMAP_VA_BITS - 1))
> +#define STACK_TOP_MAX		TASK_SIZE_64
>  #else
>  #define DEFAULT_MAP_WINDOW	TASK_SIZE
>  #define STACK_TOP_MAX		TASK_SIZE
> 

I have carefully tested your patch on qemu with sv57. A bug that needs
to be solved is that mmap with the same hint address without MAP_FIXED
set will fail the second time.

Userspace code to reproduce the bug:

#include <sys/mman.h>
#include <stdio.h>
#include <stdint.h>

/*
 * Request one 4096-byte anonymous, private, read/write mapping, passing
 * addr as the placement hint (MAP_FIXED is not set), then print the hint
 * next to the value mmap returned.
 *
 * The return value is printed as-is; it is not compared against MAP_FAILED.
 */
void test(char *addr) {
    const int prot = PROT_READ | PROT_WRITE;
    const int flags = MAP_ANONYMOUS | MAP_PRIVATE;
    char *mapped = mmap(addr, 4096, prot, flags, -1, 0);

    printf("hint %p got %p.\n", addr, mapped);
}

/*
 * Issue the same mmap hint (1 << 30) three times in a row so the printed
 * results show what each successive request returns.
 */
int main (void) {
    /*
     * C has no implicit int -> pointer conversion, so build the hint with
     * an explicit cast through uintptr_t instead of passing 1<<30 directly.
     */
    char *hint = (char *)((uintptr_t)1 << 30);

    test(hint);
    test(hint);
    test(hint);
    return 0;
}

output:

hint 0x40000000 got 0x40000000.
hint 0x40000000 got 0xffffffffffffffff.
hint 0x40000000 got 0xffffffffffffffff.

output on x86:

hint 0x40000000 got 0x40000000.
hint 0x40000000 got 0x7f9171363000.
hint 0x40000000 got 0x7f9171362000.

It may be necessary to implement special arch_get_unmapped_area and
arch_get_unmapped_area_topdown functions.
Yangyu Chen Jan. 31, 2024, 3:59 p.m. UTC | #2
On Wed, 2024-01-31 at 22:41 +0800, Yangyu Chen wrote:
> On Tue, 2024-01-30 at 17:07 -0800, Charlie Jenkins wrote:
> > On riscv it is guaranteed that the address returned by mmap is less
> > than
> > the hint address. Allow mmap to return an address all the way up to
> > addr, if provided, rather than just up to the lower address space.
> > 
> > This provides a performance benefit as well, allowing mmap to exit
> > after
> > checking that the address is in range rather than searching for a
> > valid
> > address.
> > 
> > It is possible to provide an address that uses at most the same
> > number
> > of bits, however it is significantly more computationally expensive
> > to
> > provide that number rather than setting the max to be the hint
> > address.
> > There is the instruction clz/clzw in Zbb that returns the highest
> > set
> > bit
> > which could be used to performantly implement this, but it would
> > still
> > be slower than the current implementation. At worst case, half of
> > the
> > address would not be able to be allocated when a hint address is
> > provided.
> > 
> > Signed-off-by: Charlie Jenkins <charlie@rivosinc.com>
> > ---
> >  arch/riscv/include/asm/processor.h | 27 +++++++++++---------------
> > -
> >  1 file changed, 11 insertions(+), 16 deletions(-)
> > 
> > diff --git a/arch/riscv/include/asm/processor.h
> > b/arch/riscv/include/asm/processor.h
> > index f19f861cda54..8ece7a8f0e18 100644
> > --- a/arch/riscv/include/asm/processor.h
> > +++ b/arch/riscv/include/asm/processor.h
> > @@ -14,22 +14,16 @@
> >  
> >  #include <asm/ptrace.h>
> >  
> > -#ifdef CONFIG_64BIT
> > -#define DEFAULT_MAP_WINDOW	(UL(1) << (MMAP_VA_BITS - 1))
> > -#define STACK_TOP_MAX		TASK_SIZE_64
> > -
> >  #define arch_get_mmap_end(addr, len, flags)			\
> >  ({								\
> >  	unsigned long
> > mmap_end;					\
> >  	typeof(addr) _addr = (addr);				\
> > -	if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) &&
> > is_compat_task())) \
> > +	if ((_addr) == 0 ||					\
> > +	    (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||	\
> > +	    ((_addr + len) > BIT(VA_BITS -
> > 1)))			\
> >  		mmap_end = STACK_TOP_MAX;			\
> > -	else if ((_addr) >= VA_USER_SV57)			\
> > -		mmap_end = STACK_TOP_MAX;			\
> > -	else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >=
> > VA_BITS_SV48)) \
> > -		mmap_end = VA_USER_SV48;			\
> >  	else							\
> > -		mmap_end = VA_USER_SV39;			\
> > +		mmap_end = (_addr + len);			\
> >  	mmap_end;						\
> >  })
> >  
> > @@ -39,17 +33,18 @@
> >  	typeof(addr) _addr = (addr);				\
> >  	typeof(base) _base = (base);				\
> >  	unsigned long rnd_gap = DEFAULT_MAP_WINDOW - (_base);	\
> > -	if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) &&
> > is_compat_task())) \
> > +	if ((_addr) == 0 ||					\
> > +	    (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||	\
> > +	    ((_addr + len) > BIT(VA_BITS -
> > 1)))			\
> >  		mmap_base = (_base);				\
> > -	else if (((_addr) >= VA_USER_SV57) && (VA_BITS >=
> > VA_BITS_SV57)) \
> > -		mmap_base = VA_USER_SV57 - rnd_gap;		\
> > -	else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >=
> > VA_BITS_SV48)) \
> > -		mmap_base = VA_USER_SV48 - rnd_gap;		\
> >  	else							\
> > -		mmap_base = VA_USER_SV39 - rnd_gap;		\
> > +		mmap_base = (_addr + len) - rnd_gap;		\
> >  	mmap_base;						\
> >  })
> >  
> > +#ifdef CONFIG_64BIT
> > +#define DEFAULT_MAP_WINDOW	(UL(1) << (MMAP_VA_BITS - 1))
> > +#define STACK_TOP_MAX		TASK_SIZE_64
> >  #else
> >  #define DEFAULT_MAP_WINDOW	TASK_SIZE
> >  #define STACK_TOP_MAX		TASK_SIZE
> > 
> 
> I have carefully tested your patch on qemu with sv57. A bug that
> needs
> to be solved is that mmap with the same hint address without
> MAP_FIXED
> set will fail the second time.
> 
> Userspace code to reproduce the bug:
> 
> #include <sys/mman.h>
> #include <stdio.h>
> #include <stdint.h>
> 
> void test(char *addr) {
>     char *res = mmap(addr, 4096, PROT_READ | PROT_WRITE,
> MAP_ANONYMOUS
> > MAP_PRIVATE, -1, 0);
>     printf("hint %p got %p.\n", addr, res);
> }
> 
> int main (void) {
>     test(1<<30);
>     test(1<<30);
>     test(1<<30);
>     return 0;
> }
> 
> output:
> 
> hint 0x40000000 got 0x40000000.
> hint 0x40000000 got 0xffffffffffffffff.
> hint 0x40000000 got 0xffffffffffffffff.
> 
> output on x86:
> 
> hint 0x40000000 got 0x40000000.
> hint 0x40000000 got 0x7f9171363000.
> hint 0x40000000 got 0x7f9171362000.
> 
> It may need to implement a special arch_get_unmapped_area and
> arch_get_unmapped_area_topdown function.
> 

This is because the hint address is less than rnd_gap. I tried setting
mmap_base = min((_addr + len), (base) + TASK_SIZE - DEFAULT_MAP_WINDOW).
However, that does not work for bottom-up allocation when ulimit -s is
unlimited. You said in the patch v2 review that this behavior is
expected. However, it introduces a new regression even on sv39 systems.

I still don't understand why addr+len is used as the upper bound. I
think a solution like the one on x86/arm64/powerpc, which switches
between two address spaces based on whether the hint address is above
the default map window, is enough.
Charlie Jenkins Feb. 2, 2024, 2:28 a.m. UTC | #3
On Wed, Jan 31, 2024 at 11:59:43PM +0800, Yangyu Chen wrote:
> On Wed, 2024-01-31 at 22:41 +0800, Yangyu Chen wrote:
> > On Tue, 2024-01-30 at 17:07 -0800, Charlie Jenkins wrote:
> > > On riscv it is guaranteed that the address returned by mmap is less
> > > than
> > > the hint address. Allow mmap to return an address all the way up to
> > > addr, if provided, rather than just up to the lower address space.
> > > 
> > > This provides a performance benefit as well, allowing mmap to exit
> > > after
> > > checking that the address is in range rather than searching for a
> > > valid
> > > address.
> > > 
> > > It is possible to provide an address that uses at most the same
> > > number
> > > of bits, however it is significantly more computationally expensive
> > > to
> > > provide that number rather than setting the max to be the hint
> > > address.
> > > There is the instruction clz/clzw in Zbb that returns the highest
> > > set
> > > bit
> > > which could be used to performantly implement this, but it would
> > > still
> > > be slower than the current implementation. At worst case, half of
> > > the
> > > address would not be able to be allocated when a hint address is
> > > provided.
> > > 
> > > Signed-off-by: Charlie Jenkins <charlie@rivosinc.com>
> > > ---
> > >  arch/riscv/include/asm/processor.h | 27 +++++++++++---------------
> > > -
> > >  1 file changed, 11 insertions(+), 16 deletions(-)
> > > 
> > > diff --git a/arch/riscv/include/asm/processor.h
> > > b/arch/riscv/include/asm/processor.h
> > > index f19f861cda54..8ece7a8f0e18 100644
> > > --- a/arch/riscv/include/asm/processor.h
> > > +++ b/arch/riscv/include/asm/processor.h
> > > @@ -14,22 +14,16 @@
> > >  
> > >  #include <asm/ptrace.h>
> > >  
> > > -#ifdef CONFIG_64BIT
> > > -#define DEFAULT_MAP_WINDOW	(UL(1) << (MMAP_VA_BITS - 1))
> > > -#define STACK_TOP_MAX		TASK_SIZE_64
> > > -
> > >  #define arch_get_mmap_end(addr, len, flags)			\
> > >  ({								\
> > >  	unsigned long
> > > mmap_end;					\
> > >  	typeof(addr) _addr = (addr);				\
> > > -	if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) &&
> > > is_compat_task())) \
> > > +	if ((_addr) == 0 ||					\
> > > +	    (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||	\
> > > +	    ((_addr + len) > BIT(VA_BITS -
> > > 1)))			\
> > >  		mmap_end = STACK_TOP_MAX;			\
> > > -	else if ((_addr) >= VA_USER_SV57)			\
> > > -		mmap_end = STACK_TOP_MAX;			\
> > > -	else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >=
> > > VA_BITS_SV48)) \
> > > -		mmap_end = VA_USER_SV48;			\
> > >  	else							\
> > > -		mmap_end = VA_USER_SV39;			\
> > > +		mmap_end = (_addr + len);			\
> > >  	mmap_end;						\
> > >  })
> > >  
> > > @@ -39,17 +33,18 @@
> > >  	typeof(addr) _addr = (addr);				\
> > >  	typeof(base) _base = (base);				\
> > >  	unsigned long rnd_gap = DEFAULT_MAP_WINDOW - (_base);	\
> > > -	if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) &&
> > > is_compat_task())) \
> > > +	if ((_addr) == 0 ||					\
> > > +	    (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||	\
> > > +	    ((_addr + len) > BIT(VA_BITS -
> > > 1)))			\
> > >  		mmap_base = (_base);				\
> > > -	else if (((_addr) >= VA_USER_SV57) && (VA_BITS >=
> > > VA_BITS_SV57)) \
> > > -		mmap_base = VA_USER_SV57 - rnd_gap;		\
> > > -	else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >=
> > > VA_BITS_SV48)) \
> > > -		mmap_base = VA_USER_SV48 - rnd_gap;		\
> > >  	else							\
> > > -		mmap_base = VA_USER_SV39 - rnd_gap;		\
> > > +		mmap_base = (_addr + len) - rnd_gap;		\
> > >  	mmap_base;						\
> > >  })
> > >  
> > > +#ifdef CONFIG_64BIT
> > > +#define DEFAULT_MAP_WINDOW	(UL(1) << (MMAP_VA_BITS - 1))
> > > +#define STACK_TOP_MAX		TASK_SIZE_64
> > >  #else
> > >  #define DEFAULT_MAP_WINDOW	TASK_SIZE
> > >  #define STACK_TOP_MAX		TASK_SIZE
> > > 
> > 
> > I have carefully tested your patch on qemu with sv57. A bug that
> > needs
> > to be solved is that mmap with the same hint address without
> > MAP_FIXED
> > set will fail the second time.
> > 
> > Userspace code to reproduce the bug:
> > 
> > #include <sys/mman.h>
> > #include <stdio.h>
> > #include <stdint.h>
> > 
> > void test(char *addr) {
> >     char *res = mmap(addr, 4096, PROT_READ | PROT_WRITE,
> > MAP_ANONYMOUS
> > > MAP_PRIVATE, -1, 0);
> >     printf("hint %p got %p.\n", addr, res);
> > }
> > 
> > int main (void) {
> >     test(1<<30);
> >     test(1<<30);
> >     test(1<<30);
> >     return 0;
> > }
> > 
> > output:
> > 
> > hint 0x40000000 got 0x40000000.
> > hint 0x40000000 got 0xffffffffffffffff.
> > hint 0x40000000 got 0xffffffffffffffff.
> > 
> > output on x86:
> > 
> > hint 0x40000000 got 0x40000000.
> > hint 0x40000000 got 0x7f9171363000.
> > hint 0x40000000 got 0x7f9171362000.
> > 
> > It may need to implement a special arch_get_unmapped_area and
> > arch_get_unmapped_area_topdown function.
> > 
> 
> This is because hint address < rnd_gap. I have tried to let mmap_base =
> min((_addr + len), (base) + TASK_SIZE - DEFAULT_MAP_WINDOW). However it
> does not work for bottom-up while ulimit -s is unlimited. You said this
> behavior is expected from patch v2 review. However it brings a new
> regression even on sv39 systems.
> 
> I still don't know the reason why use addr+len as the upper-bound. I
> think solution like x86/arm64/powerpc provide two address space switch
> based on whether hint address above the default map window is enough.
> 

Yep this is expected. It is up to the maintainers to decide.

- Charlie
Palmer Dabbelt March 22, 2024, 2:06 p.m. UTC | #4
On Thu, 01 Feb 2024 18:28:06 PST (-0800), Charlie Jenkins wrote:
> On Wed, Jan 31, 2024 at 11:59:43PM +0800, Yangyu Chen wrote:
>> On Wed, 2024-01-31 at 22:41 +0800, Yangyu Chen wrote:
>> > On Tue, 2024-01-30 at 17:07 -0800, Charlie Jenkins wrote:
>> > > On riscv it is guaranteed that the address returned by mmap is less
>> > > than
>> > > the hint address. Allow mmap to return an address all the way up to
>> > > addr, if provided, rather than just up to the lower address space.
>> > > 
>> > > This provides a performance benefit as well, allowing mmap to exit
>> > > after
>> > > checking that the address is in range rather than searching for a
>> > > valid
>> > > address.
>> > > 
>> > > It is possible to provide an address that uses at most the same
>> > > number
>> > > of bits, however it is significantly more computationally expensive
>> > > to
>> > > provide that number rather than setting the max to be the hint
>> > > address.
>> > > There is the instruction clz/clzw in Zbb that returns the highest
>> > > set
>> > > bit
>> > > which could be used to performantly implement this, but it would
>> > > still
>> > > be slower than the current implementation. At worst case, half of
>> > > the
>> > > address would not be able to be allocated when a hint address is
>> > > provided.
>> > > 
>> > > Signed-off-by: Charlie Jenkins <charlie@rivosinc.com>
>> > > ---
>> > >  arch/riscv/include/asm/processor.h | 27 +++++++++++---------------
>> > > -
>> > >  1 file changed, 11 insertions(+), 16 deletions(-)
>> > > 
>> > > diff --git a/arch/riscv/include/asm/processor.h
>> > > b/arch/riscv/include/asm/processor.h
>> > > index f19f861cda54..8ece7a8f0e18 100644
>> > > --- a/arch/riscv/include/asm/processor.h
>> > > +++ b/arch/riscv/include/asm/processor.h
>> > > @@ -14,22 +14,16 @@
>> > >  
>> > >  #include <asm/ptrace.h>
>> > >  
>> > > -#ifdef CONFIG_64BIT
>> > > -#define DEFAULT_MAP_WINDOW	(UL(1) << (MMAP_VA_BITS - 1))
>> > > -#define STACK_TOP_MAX		TASK_SIZE_64
>> > > -
>> > >  #define arch_get_mmap_end(addr, len, flags)			\
>> > >  ({								\
>> > >  	unsigned long
>> > > mmap_end;					\
>> > >  	typeof(addr) _addr = (addr);				\
>> > > -	if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) &&
>> > > is_compat_task())) \
>> > > +	if ((_addr) == 0 ||					\
>> > > +	    (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||	\
>> > > +	    ((_addr + len) > BIT(VA_BITS -
>> > > 1)))			\
>> > >  		mmap_end = STACK_TOP_MAX;			\
>> > > -	else if ((_addr) >= VA_USER_SV57)			\
>> > > -		mmap_end = STACK_TOP_MAX;			\
>> > > -	else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >=
>> > > VA_BITS_SV48)) \
>> > > -		mmap_end = VA_USER_SV48;			\
>> > >  	else							\
>> > > -		mmap_end = VA_USER_SV39;			\
>> > > +		mmap_end = (_addr + len);			\
>> > >  	mmap_end;						\
>> > >  })
>> > >  
>> > > @@ -39,17 +33,18 @@
>> > >  	typeof(addr) _addr = (addr);				\
>> > >  	typeof(base) _base = (base);				\
>> > >  	unsigned long rnd_gap = DEFAULT_MAP_WINDOW - (_base);	\
>> > > -	if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) &&
>> > > is_compat_task())) \
>> > > +	if ((_addr) == 0 ||					\
>> > > +	    (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||	\
>> > > +	    ((_addr + len) > BIT(VA_BITS -
>> > > 1)))			\
>> > >  		mmap_base = (_base);				\
>> > > -	else if (((_addr) >= VA_USER_SV57) && (VA_BITS >=
>> > > VA_BITS_SV57)) \
>> > > -		mmap_base = VA_USER_SV57 - rnd_gap;		\
>> > > -	else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >=
>> > > VA_BITS_SV48)) \
>> > > -		mmap_base = VA_USER_SV48 - rnd_gap;		\
>> > >  	else							\
>> > > -		mmap_base = VA_USER_SV39 - rnd_gap;		\
>> > > +		mmap_base = (_addr + len) - rnd_gap;		\
>> > >  	mmap_base;						\
>> > >  })
>> > >  
>> > > +#ifdef CONFIG_64BIT
>> > > +#define DEFAULT_MAP_WINDOW	(UL(1) << (MMAP_VA_BITS - 1))
>> > > +#define STACK_TOP_MAX		TASK_SIZE_64
>> > >  #else
>> > >  #define DEFAULT_MAP_WINDOW	TASK_SIZE
>> > >  #define STACK_TOP_MAX		TASK_SIZE
>> > > 
>> > 
>> > I have carefully tested your patch on qemu with sv57. A bug that
>> > needs
>> > to be solved is that mmap with the same hint address without
>> > MAP_FIXED
>> > set will fail the second time.
>> > 
>> > Userspace code to reproduce the bug:
>> > 
>> > #include <sys/mman.h>
>> > #include <stdio.h>
>> > #include <stdint.h>
>> > 
>> > void test(char *addr) {
>> >     char *res = mmap(addr, 4096, PROT_READ | PROT_WRITE,
>> > MAP_ANONYMOUS
>> > > MAP_PRIVATE, -1, 0);
>> >     printf("hint %p got %p.\n", addr, res);
>> > }
>> > 
>> > int main (void) {
>> >     test(1<<30);
>> >     test(1<<30);
>> >     test(1<<30);
>> >     return 0;
>> > }
>> > 
>> > output:
>> > 
>> > hint 0x40000000 got 0x40000000.
>> > hint 0x40000000 got 0xffffffffffffffff.
>> > hint 0x40000000 got 0xffffffffffffffff.
>> > 
>> > output on x86:
>> > 
>> > hint 0x40000000 got 0x40000000.
>> > hint 0x40000000 got 0x7f9171363000.
>> > hint 0x40000000 got 0x7f9171362000.
>> > 
>> > It may need to implement a special arch_get_unmapped_area and
>> > arch_get_unmapped_area_topdown function.
>> > 
>> 
>> This is because hint address < rnd_gap. I have tried to let mmap_base =
>> min((_addr + len), (base) + TASK_SIZE - DEFAULT_MAP_WINDOW). However it
>> does not work for bottom-up while ulimit -s is unlimited. You said this
>> behavior is expected from patch v2 review. However it brings a new
>> regression even on sv39 systems.
>> 
>> I still don't know the reason why use addr+len as the upper-bound. I
>> think solution like x86/arm64/powerpc provide two address space switch
>> based on whether hint address above the default map window is enough.
>> 
>
> Yep this is expected. It is up to the maintainers to decide.

Sorry I forgot to reply to this, I had a buffer sitting around somewhere 
but I must have lost it.

I think Charlie's approach is the right way to go.  Putting my userspace
hat on, I'd much rather have my allocations fail than have the hint
silently ignored when there's memory pressure.

If there's some real use case that needs these low hints to be silently 
ignored under VA pressure then we can try and figure something out that 
makes those applications work.

>
> - Charlie
diff mbox series

Patch

diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h
index f19f861cda54..8ece7a8f0e18 100644
--- a/arch/riscv/include/asm/processor.h
+++ b/arch/riscv/include/asm/processor.h
@@ -14,22 +14,16 @@ 
 
 #include <asm/ptrace.h>
 
-#ifdef CONFIG_64BIT
-#define DEFAULT_MAP_WINDOW	(UL(1) << (MMAP_VA_BITS - 1))
-#define STACK_TOP_MAX		TASK_SIZE_64
-
 #define arch_get_mmap_end(addr, len, flags)			\
 ({								\
 	unsigned long mmap_end;					\
 	typeof(addr) _addr = (addr);				\
-	if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) && is_compat_task())) \
+	if ((_addr) == 0 ||					\
+	    (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||	\
+	    ((_addr + len) > BIT(VA_BITS - 1)))			\
 		mmap_end = STACK_TOP_MAX;			\
-	else if ((_addr) >= VA_USER_SV57)			\
-		mmap_end = STACK_TOP_MAX;			\
-	else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >= VA_BITS_SV48)) \
-		mmap_end = VA_USER_SV48;			\
 	else							\
-		mmap_end = VA_USER_SV39;			\
+		mmap_end = (_addr + len);			\
 	mmap_end;						\
 })
 
@@ -39,17 +33,18 @@ 
 	typeof(addr) _addr = (addr);				\
 	typeof(base) _base = (base);				\
 	unsigned long rnd_gap = DEFAULT_MAP_WINDOW - (_base);	\
-	if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) && is_compat_task())) \
+	if ((_addr) == 0 ||					\
+	    (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) ||	\
+	    ((_addr + len) > BIT(VA_BITS - 1)))			\
 		mmap_base = (_base);				\
-	else if (((_addr) >= VA_USER_SV57) && (VA_BITS >= VA_BITS_SV57)) \
-		mmap_base = VA_USER_SV57 - rnd_gap;		\
-	else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >= VA_BITS_SV48)) \
-		mmap_base = VA_USER_SV48 - rnd_gap;		\
 	else							\
-		mmap_base = VA_USER_SV39 - rnd_gap;		\
+		mmap_base = (_addr + len) - rnd_gap;		\
 	mmap_base;						\
 })
 
+#ifdef CONFIG_64BIT
+#define DEFAULT_MAP_WINDOW	(UL(1) << (MMAP_VA_BITS - 1))
+#define STACK_TOP_MAX		TASK_SIZE_64
 #else
 #define DEFAULT_MAP_WINDOW	TASK_SIZE
 #define STACK_TOP_MAX		TASK_SIZE