
[1/5] io_uring: Adjust mapping wrt architecture aliasing requirements

Message ID 20230314171641.10542-2-axboe@kernel.dk (mailing list archive)
State New
Series User mapped provided buffer rings

Commit Message

Jens Axboe March 14, 2023, 5:16 p.m. UTC
From: Helge Deller <deller@gmx.de>

Some architectures have memory cache aliasing requirements (e.g. parisc)
if memory is shared between userspace and kernel. This patch fixes the
kernel to return an aliased address when asked by userspace via mmap().

Signed-off-by: Helge Deller <deller@gmx.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/io_uring.c | 51 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)
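
For context, a rough sketch of the constraint the patch hands to
vm_unmapped_area() (an interpretation of the patch, not text from it): the
chosen user address must share its page-granular offset within a SHM_COLOUR-
or SHMLBA-sized window with the kernel buffer, so both mappings land on the
same colour of a virtually indexed cache.  A small self-contained
illustration, assuming the parisc value SHM_COLOUR = 0x00400000 and 4 KiB
pages:

	#include <stdio.h>

	int main(void)
	{
		unsigned long shm_colour = 0x00400000UL;            /* parisc SHM_COLOUR */
		unsigned long page_mask  = ~0xfffUL;                /* 4 KiB pages */
		unsigned long mask = page_mask & (shm_colour - 1);  /* 0x003ff000 */
		unsigned long ptr  = 0x12345678UL;                  /* example kernel buffer address */
		unsigned long addr = 0x00745000UL;                  /* candidate page-aligned user address */

		/* same cache colour <=> the masked bits match */
		printf("mask=%#lx ptr colour=%#lx addr colour=%#lx ok=%d\n",
		       mask, ptr & mask, addr & mask, (ptr & mask) == (addr & mask));
		return 0;
	}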

Comments

matoro July 12, 2023, 4:43 a.m. UTC | #1
On 2023-03-14 13:16, Jens Axboe wrote:
> From: Helge Deller <deller@gmx.de>
> 
> Some architectures have memory cache aliasing requirements (e.g. 
> parisc)
> if memory is shared between userspace and kernel. This patch fixes the
> kernel to return an aliased address when asked by userspace via mmap().
> 
> Signed-off-by: Helge Deller <deller@gmx.de>
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>  io_uring/io_uring.c | 51 +++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 51 insertions(+)
> 
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index 722624b6d0dc..3adecebbac71 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -72,6 +72,7 @@
>  #include <linux/io_uring.h>
>  #include <linux/audit.h>
>  #include <linux/security.h>
> +#include <asm/shmparam.h>
> 
>  #define CREATE_TRACE_POINTS
>  #include <trace/events/io_uring.h>
> @@ -3317,6 +3318,54 @@ static __cold int io_uring_mmap(struct file 
> *file, struct vm_area_struct *vma)
>  	return remap_pfn_range(vma, vma->vm_start, pfn, sz, 
> vma->vm_page_prot);
>  }
> 
> +static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp,
> +			unsigned long addr, unsigned long len,
> +			unsigned long pgoff, unsigned long flags)
> +{
> +	const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);
> +	struct vm_unmapped_area_info info;
> +	void *ptr;
> +
> +	/*
> +	 * Do not allow to map to user-provided address to avoid breaking the
> +	 * aliasing rules. Userspace is not able to guess the offset address 
> of
> +	 * kernel kmalloc()ed memory area.
> +	 */
> +	if (addr)
> +		return -EINVAL;
> +
> +	ptr = io_uring_validate_mmap_request(filp, pgoff, len);
> +	if (IS_ERR(ptr))
> +		return -ENOMEM;
> +
> +	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
> +	info.length = len;
> +	info.low_limit = max(PAGE_SIZE, mmap_min_addr);
> +	info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
> +#ifdef SHM_COLOUR
> +	info.align_mask = PAGE_MASK & (SHM_COLOUR - 1UL);
> +#else
> +	info.align_mask = PAGE_MASK & (SHMLBA - 1UL);
> +#endif
> +	info.align_offset = (unsigned long) ptr;
> +
> +	/*
> +	 * A failed mmap() very likely causes application failure,
> +	 * so fall back to the bottom-up function here. This scenario
> +	 * can happen with large stack limits and large mmap()
> +	 * allocations.
> +	 */
> +	addr = vm_unmapped_area(&info);
> +	if (offset_in_page(addr)) {
> +		info.flags = 0;
> +		info.low_limit = TASK_UNMAPPED_BASE;
> +		info.high_limit = mmap_end;
> +		addr = vm_unmapped_area(&info);
> +	}
> +
> +	return addr;
> +}
> +
>  #else /* !CONFIG_MMU */
> 
>  static int io_uring_mmap(struct file *file, struct vm_area_struct 
> *vma)
> @@ -3529,6 +3578,8 @@ static const struct file_operations io_uring_fops 
> = {
>  #ifndef CONFIG_MMU
>  	.get_unmapped_area = io_uring_nommu_get_unmapped_area,
>  	.mmap_capabilities = io_uring_nommu_mmap_capabilities,
> +#else
> +	.get_unmapped_area = io_uring_mmu_get_unmapped_area,
>  #endif
>  	.poll		= io_uring_poll,
>  #ifdef CONFIG_PROC_FS

Hi Jens, Helge - I've bisected a regression with io_uring on ia64 to 
this patch in 6.4.  Unfortunately this breaks userspace programs using 
io_uring; the easiest one to test is cmake with an io_uring-enabled 
libuv (i.e., libuv >= 1.45.0), which will hang.

I am aware that ia64 is in a vulnerable place right now, which is why I am 
keeping the spread of this limited.  Since this clearly involves 
architecture-specific changes for parisc, is there any chance of looking 
at what is required to do the same for ia64?  I looked at 
0ef36bd2b37815719e31a72d2beecc28ca8ecd26 ("parisc: change value of 
SHMLBA from 0x00400000 to PAGE_SIZE") and tried to replicate the SHMLBA 
-> SHM_COLOUR change, but it made no difference.
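
(That is, giving ia64 a SHM_COLOUR definition analogous to parisc's -- the 
attempt was roughly of this shape, exact form approximate:)

	/* arch/ia64/include/asm/shmparam.h -- sketch of the attempted change */
	#define SHM_COLOUR	SHMLBA		/* 0x100000 on ia64 */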

If hardware is necessary for testing, I can provide it, including remote 
BMC access for restarts/kernel debugging.  Any takers?

$ git bisect log
git bisect start
# status: waiting for both good and bad commits
# good: [eceb0b18ae34b399856a2dd1eee8c18b2341e6f0] Linux 6.3.12
git bisect good eceb0b18ae34b399856a2dd1eee8c18b2341e6f0
# status: waiting for bad commit, 1 good commit known
# bad: [59377679473491963a599bfd51cc9877492312ee] Linux 6.4.1
git bisect bad 59377679473491963a599bfd51cc9877492312ee
# good: [457391b0380335d5e9a5babdec90ac53928b23b4] Linux 6.3
git bisect good 457391b0380335d5e9a5babdec90ac53928b23b4
# bad: [cb6fe2ceb667eb78f252d473b03deb23999ab1cf] Merge tag 
'devicetree-for-6.4-2' of 
git://git.kernel.org/pub/scm/linux/kernel/git/robh/linux
git bisect bad cb6fe2ceb667eb78f252d473b03deb23999ab1cf
# good: [f5468bec213ec2ad3f2724e3f1714b3bc7bf1515] Merge tag 
'regmap-v6.4' of 
git://git.kernel.org/pub/scm/linux/kernel/git/broonie/regmap
git bisect good f5468bec213ec2ad3f2724e3f1714b3bc7bf1515
# good: [207296f1a03bfead0110ffc4f192f242100ce4ff] netfilter: nf_tables: 
allow to create netdev chain without device
git bisect good 207296f1a03bfead0110ffc4f192f242100ce4ff
# good: [85d7ab2463822a4ab096c0b7b59feec962552572] Merge tag 
'for-6.4-tag' of 
git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
git bisect good 85d7ab2463822a4ab096c0b7b59feec962552572
# bad: [b68ee1c6131c540a62ecd443be89c406401df091] Merge tag 'scsi-misc' 
of git://git.kernel.org/pub/scm/linux/kernel/git/jejb/scsi
git bisect bad b68ee1c6131c540a62ecd443be89c406401df091
# bad: [48dc810012a6b4f4ba94073d6b7edb4f76edeb72] Merge tag 
'for-6.4/dm-changes' of 
git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
git bisect bad 48dc810012a6b4f4ba94073d6b7edb4f76edeb72
# bad: [5b9a7bb72fddbc5247f56ede55d485fab7abdf92] Merge tag 
'for-6.4/io_uring-2023-04-21' of git://git.kernel.dk/linux
git bisect bad 5b9a7bb72fddbc5247f56ede55d485fab7abdf92
# good: [5c7ecada25d2086aee607ff7deb69e77faa4aa92] Merge tag 
'f2fs-for-6.4-rc1' of 
git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs
git bisect good 5c7ecada25d2086aee607ff7deb69e77faa4aa92
# bad: [6e7248adf8f7adb5e36ec1e91efcc85a83bf8aeb] io_uring: refactor 
io_cqring_wake()
git bisect bad 6e7248adf8f7adb5e36ec1e91efcc85a83bf8aeb
# bad: [2ad57931db641f3de627023afb8147a8ec0b41dc] io_uring: rename 
trace_io_uring_submit_sqe() tracepoint
git bisect bad 2ad57931db641f3de627023afb8147a8ec0b41dc
# bad: [efba1a9e653e107577a48157b5424878c46f2285] io_uring: Move from 
hlist to io_wq_work_node
git bisect bad efba1a9e653e107577a48157b5424878c46f2285
# bad: [ba56b63242d12df088ed9a701cad320e6b306dfe] io_uring/kbuf: move 
pinning of provided buffer ring into helper
git bisect bad ba56b63242d12df088ed9a701cad320e6b306dfe
# good: [d4755e15386c38e4ae532ace5acc29fbfaee42e7] io_uring: avoid 
hashing O_DIRECT writes if the filesystem doesn't need it
git bisect good d4755e15386c38e4ae532ace5acc29fbfaee42e7
# bad: [d808459b2e31bd5123a14258a7a529995db974c8] io_uring: Adjust 
mapping wrt architecture aliasing requirements
git bisect bad d808459b2e31bd5123a14258a7a529995db974c8
# first bad commit: [d808459b2e31bd5123a14258a7a529995db974c8] io_uring: 
Adjust mapping wrt architecture aliasing requirements
Helge Deller July 12, 2023, 4:24 p.m. UTC | #2
Hi Matoro,

* matoro <matoro_mailinglist_kernel@matoro.tk>:
> On 2023-03-14 13:16, Jens Axboe wrote:
> > From: Helge Deller <deller@gmx.de>
> >
> > Some architectures have memory cache aliasing requirements (e.g. parisc)
> > if memory is shared between userspace and kernel. This patch fixes the
> > kernel to return an aliased address when asked by userspace via mmap().
> >
> > Signed-off-by: Helge Deller <deller@gmx.de>
> > Signed-off-by: Jens Axboe <axboe@kernel.dk>
> > ---
> >  io_uring/io_uring.c | 51 +++++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 51 insertions(+)
> >
> > diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> > index 722624b6d0dc..3adecebbac71 100644
> > --- a/io_uring/io_uring.c
> > +++ b/io_uring/io_uring.c
> > @@ -72,6 +72,7 @@
> >  #include <linux/io_uring.h>
> >  #include <linux/audit.h>
> >  #include <linux/security.h>
> > +#include <asm/shmparam.h>
> >
> >  #define CREATE_TRACE_POINTS
> >  #include <trace/events/io_uring.h>
> > @@ -3317,6 +3318,54 @@ static __cold int io_uring_mmap(struct file
> > *file, struct vm_area_struct *vma)
> >  	return remap_pfn_range(vma, vma->vm_start, pfn, sz,
> > vma->vm_page_prot);
> >  }
> >
> > +static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp,
> > +			unsigned long addr, unsigned long len,
> > +			unsigned long pgoff, unsigned long flags)
> > +{
> > +	const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);
> > +	struct vm_unmapped_area_info info;
> > +	void *ptr;
> > +
> > +	/*
> > +	 * Do not allow to map to user-provided address to avoid breaking the
> > +	 * aliasing rules. Userspace is not able to guess the offset address
> > of
> > +	 * kernel kmalloc()ed memory area.
> > +	 */
> > +	if (addr)
> > +		return -EINVAL;
> > +
> > +	ptr = io_uring_validate_mmap_request(filp, pgoff, len);
> > +	if (IS_ERR(ptr))
> > +		return -ENOMEM;
> > +
> > +	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
> > +	info.length = len;
> > +	info.low_limit = max(PAGE_SIZE, mmap_min_addr);
> > +	info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
> > +#ifdef SHM_COLOUR
> > +	info.align_mask = PAGE_MASK & (SHM_COLOUR - 1UL);
> > +#else
> > +	info.align_mask = PAGE_MASK & (SHMLBA - 1UL);
> > +#endif
> > +	info.align_offset = (unsigned long) ptr;
> > +
> > +	/*
> > +	 * A failed mmap() very likely causes application failure,
> > +	 * so fall back to the bottom-up function here. This scenario
> > +	 * can happen with large stack limits and large mmap()
> > +	 * allocations.
> > +	 */
> > +	addr = vm_unmapped_area(&info);
> > +	if (offset_in_page(addr)) {
> > +		info.flags = 0;
> > +		info.low_limit = TASK_UNMAPPED_BASE;
> > +		info.high_limit = mmap_end;
> > +		addr = vm_unmapped_area(&info);
> > +	}
> > +
> > +	return addr;
> > +}
> > +
> >  #else /* !CONFIG_MMU */
> >
> >  static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
> > @@ -3529,6 +3578,8 @@ static const struct file_operations io_uring_fops
> > = {
> >  #ifndef CONFIG_MMU
> >  	.get_unmapped_area = io_uring_nommu_get_unmapped_area,
> >  	.mmap_capabilities = io_uring_nommu_mmap_capabilities,
> > +#else
> > +	.get_unmapped_area = io_uring_mmu_get_unmapped_area,
> >  #endif
> >  	.poll		= io_uring_poll,
> >  #ifdef CONFIG_PROC_FS
>
> Hi Jens, Helge - I've bisected a regression with io_uring on ia64 to this
> patch in 6.4.  Unfortunately this breaks userspace programs using io_uring,
> the easiest one to test is cmake with an io_uring enabled libuv (i.e., libuv
> >= 1.45.0) which will hang.
>
> I am aware that ia64 is in a vulnerable place right now which I why I am
> keeping this spread limited.  Since this clearly involves
> architecture-specific changes for parisc,

it isn't so much architecture-specific... (just one ifdef)

> is there any chance of looking at
> what is required to do the same for ia64?  I looked at
> 0ef36bd2b37815719e31a72d2beecc28ca8ecd26 ("parisc: change value of SHMLBA
> from 0x00400000 to PAGE_SIZE") and tried to replicate the SHMLBA ->
> SHM_COLOUR change, but it made no difference.
>
> If hardware is necessary for testing, I can provide it, including remote BMC
> access for restarts/kernel debugging.  Any takers?

I won't have time to test myself, but maybe you could test?

Basically we should try to find out why io_uring_mmu_get_unmapped_area()
doesn't return valid addresses, while arch_get_unmapped_area()
[in arch/ia64/kernel/sys_ia64.c] does.

You could apply this patch first:
It introduces a memory leak (as it requests memory twice), but maybe we
get an idea?
The ia64 arch_get_unmapped_area() searches for memory from the bottom
(flags=0), while the io_uring function tries top-down first. Maybe that's
the problem. And I don't understand the offset_in_page() check right
now.
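
(For reference, one reading of that check, stated as an assumption rather 
than as the io_uring authors' intent: vm_unmapped_area() returns either a 
page-aligned address or a negative errno value cast to unsigned long, and 
an errno such as -ENOMEM has non-zero low bits, so offset_in_page(addr) is 
a compact "did the top-down search fail?" test.  Sketch of the idiom:)

	addr = vm_unmapped_area(&info);
	if (offset_in_page(addr)) {
		/* addr is really something like (unsigned long)-ENOMEM,
		 * i.e. IS_ERR_VALUE(addr) -- fall back to bottom-up */
	}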

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 3bca7a79efda..93b1964d2bbb 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3431,13 +3431,17 @@ static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp,
 	 * can happen with large stack limits and large mmap()
 	 * allocations.
 	 */
+/* compare to arch_get_unmapped_area() in arch/ia64/kernel/sys_ia64.c */
 	addr = vm_unmapped_area(&info);
-	if (offset_in_page(addr)) {
+printk("io_uring_mmu_get_unmapped_area() address 1 is: %px\n", addr);
+	addr = NULL;
+	if (!addr) {
 		info.flags = 0;
 		info.low_limit = TASK_UNMAPPED_BASE;
 		info.high_limit = mmap_end;
 		addr = vm_unmapped_area(&info);
 	}
+printk("io_uring_mmu_get_unmapped_area() returns address %px\n", addr);

 	return addr;
 }


Another option is to disable the call to io_uring_mmu_get_unmapped_area()
with the next patch. Maybe you could add printks() to ia64's arch_get_unmapped_area()
and check what it returns there?

@@ -3654,6 +3658,8 @@ static const struct file_operations io_uring_fops = {
 #ifndef CONFIG_MMU
 	.get_unmapped_area = io_uring_nommu_get_unmapped_area,
 	.mmap_capabilities = io_uring_nommu_mmap_capabilities,
+#elif 0    /* IS_ENABLED(CONFIG_IA64) */
+	.get_unmapped_area = NULL,
 #else
 	.get_unmapped_area = io_uring_mmu_get_unmapped_area,
 #endif

Helge
matoro July 12, 2023, 5:28 p.m. UTC | #3
On 2023-07-12 12:24, Helge Deller wrote:
> Hi Matoro,
> 
> * matoro <matoro_mailinglist_kernel@matoro.tk>:
>> On 2023-03-14 13:16, Jens Axboe wrote:
>> > From: Helge Deller <deller@gmx.de>
>> >
>> > Some architectures have memory cache aliasing requirements (e.g. parisc)
>> > if memory is shared between userspace and kernel. This patch fixes the
>> > kernel to return an aliased address when asked by userspace via mmap().
>> >
>> > Signed-off-by: Helge Deller <deller@gmx.de>
>> > Signed-off-by: Jens Axboe <axboe@kernel.dk>
>> > ---
>> >  io_uring/io_uring.c | 51 +++++++++++++++++++++++++++++++++++++++++++++
>> >  1 file changed, 51 insertions(+)
>> >
>> > diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
>> > index 722624b6d0dc..3adecebbac71 100644
>> > --- a/io_uring/io_uring.c
>> > +++ b/io_uring/io_uring.c
>> > @@ -72,6 +72,7 @@
>> >  #include <linux/io_uring.h>
>> >  #include <linux/audit.h>
>> >  #include <linux/security.h>
>> > +#include <asm/shmparam.h>
>> >
>> >  #define CREATE_TRACE_POINTS
>> >  #include <trace/events/io_uring.h>
>> > @@ -3317,6 +3318,54 @@ static __cold int io_uring_mmap(struct file
>> > *file, struct vm_area_struct *vma)
>> >  	return remap_pfn_range(vma, vma->vm_start, pfn, sz,
>> > vma->vm_page_prot);
>> >  }
>> >
>> > +static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp,
>> > +			unsigned long addr, unsigned long len,
>> > +			unsigned long pgoff, unsigned long flags)
>> > +{
>> > +	const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);
>> > +	struct vm_unmapped_area_info info;
>> > +	void *ptr;
>> > +
>> > +	/*
>> > +	 * Do not allow to map to user-provided address to avoid breaking the
>> > +	 * aliasing rules. Userspace is not able to guess the offset address
>> > of
>> > +	 * kernel kmalloc()ed memory area.
>> > +	 */
>> > +	if (addr)
>> > +		return -EINVAL;
>> > +
>> > +	ptr = io_uring_validate_mmap_request(filp, pgoff, len);
>> > +	if (IS_ERR(ptr))
>> > +		return -ENOMEM;
>> > +
>> > +	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
>> > +	info.length = len;
>> > +	info.low_limit = max(PAGE_SIZE, mmap_min_addr);
>> > +	info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
>> > +#ifdef SHM_COLOUR
>> > +	info.align_mask = PAGE_MASK & (SHM_COLOUR - 1UL);
>> > +#else
>> > +	info.align_mask = PAGE_MASK & (SHMLBA - 1UL);
>> > +#endif
>> > +	info.align_offset = (unsigned long) ptr;
>> > +
>> > +	/*
>> > +	 * A failed mmap() very likely causes application failure,
>> > +	 * so fall back to the bottom-up function here. This scenario
>> > +	 * can happen with large stack limits and large mmap()
>> > +	 * allocations.
>> > +	 */
>> > +	addr = vm_unmapped_area(&info);
>> > +	if (offset_in_page(addr)) {
>> > +		info.flags = 0;
>> > +		info.low_limit = TASK_UNMAPPED_BASE;
>> > +		info.high_limit = mmap_end;
>> > +		addr = vm_unmapped_area(&info);
>> > +	}
>> > +
>> > +	return addr;
>> > +}
>> > +
>> >  #else /* !CONFIG_MMU */
>> >
>> >  static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
>> > @@ -3529,6 +3578,8 @@ static const struct file_operations io_uring_fops
>> > = {
>> >  #ifndef CONFIG_MMU
>> >  	.get_unmapped_area = io_uring_nommu_get_unmapped_area,
>> >  	.mmap_capabilities = io_uring_nommu_mmap_capabilities,
>> > +#else
>> > +	.get_unmapped_area = io_uring_mmu_get_unmapped_area,
>> >  #endif
>> >  	.poll		= io_uring_poll,
>> >  #ifdef CONFIG_PROC_FS
>> 
>> Hi Jens, Helge - I've bisected a regression with io_uring on ia64 to 
>> this
>> patch in 6.4.  Unfortunately this breaks userspace programs using 
>> io_uring,
>> the easiest one to test is cmake with an io_uring enabled libuv (i.e., 
>> libuv
>> >= 1.45.0) which will hang.
>> 
>> I am aware that ia64 is in a vulnerable place right now which I why I 
>> am
>> keeping this spread limited.  Since this clearly involves
>> architecture-specific changes for parisc,
> 
> it isn't so much architecture-specific... (just one ifdef)
> 
>> is there any chance of looking at
>> what is required to do the same for ia64?  I looked at
>> 0ef36bd2b37815719e31a72d2beecc28ca8ecd26 ("parisc: change value of 
>> SHMLBA
>> from 0x00400000 to PAGE_SIZE") and tried to replicate the SHMLBA ->
>> SHM_COLOUR change, but it made no difference.
>> 
>> If hardware is necessary for testing, I can provide it, including 
>> remote BMC
>> access for restarts/kernel debugging.  Any takers?
> 
> I won't have time to test myself, but maybe you could test?
> 
> Basically we should try to find out why 
> io_uring_mmu_get_unmapped_area()
> doesn't return valid addresses, while arch_get_unmapped_area()
> [in arch/ia64/kernel/sys_ia64.c] does.
> 
> You could apply this patch first:
> It introduces a memory leak (as it requests memory twice), but maybe we
> get an idea?
> The ia64 arch_get_unmapped_area() searches for memory from bottom
> (flags=0), while io_uring function tries top-down first. Maybe that's
> the problem. And I don't understand the offset_in_page() check right
> now.
> 
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index 3bca7a79efda..93b1964d2bbb 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -3431,13 +3431,17 @@ static unsigned long 
> io_uring_mmu_get_unmapped_area(struct file *filp,
>  	 * can happen with large stack limits and large mmap()
>  	 * allocations.
>  	 */
> +/* compare to arch_get_unmapped_area() in arch/ia64/kernel/sys_ia64.c 
> */
>  	addr = vm_unmapped_area(&info);
> -	if (offset_in_page(addr)) {
> +printk("io_uring_mmu_get_unmapped_area() address 1 is: %px\n", addr);
> +	addr = NULL;
> +	if (!addr) {
>  		info.flags = 0;
>  		info.low_limit = TASK_UNMAPPED_BASE;
>  		info.high_limit = mmap_end;
>  		addr = vm_unmapped_area(&info);
>  	}
> +printk("io_uring_mmu_get_unmapped_area() returns address %px\n", 
> addr);
> 
>  	return addr;
>  }
> 
> 
> Another option is to disable the call to 
> io_uring_nommu_get_unmapped_area())
> with the next patch. Maybe you could add printks() to ia64's 
> arch_get_unmapped_area()
> and check what it returns there?
> 
> @@ -3654,6 +3658,8 @@ static const struct file_operations io_uring_fops 
> = {
>  #ifndef CONFIG_MMU
>  	.get_unmapped_area = io_uring_nommu_get_unmapped_area,
>  	.mmap_capabilities = io_uring_nommu_mmap_capabilities,
> +#elif 0    /* IS_ENABLED(CONFIG_IA64) */
> +	.get_unmapped_area = NULL,
>  #else
>  	.get_unmapped_area = io_uring_mmu_get_unmapped_area,
>  #endif
> 
> Helge

Thanks Helge.  Sample output from that first patch:

[Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area() address 1 
is: 1ffffffffff40000
[Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area() returns 
address 2000000001e40000
[Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area() address 1 
is: 1ffffffffff20000
[Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area() returns 
address 2000000001f20000
[Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area() address 1 
is: 1ffffffffff30000
[Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area() returns 
address 2000000001f30000
[Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area() address 1 
is: 1ffffffffff90000
[Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area() returns 
address 2000000001f90000

This pattern seems to be pretty stable.  I tried instead just directly 
returning the result of a call to arch_get_unmapped_area() at the end of 
the function, and it seems similar:

[Wed Jul 12 13:27:07 2023] io_uring_mmu_get_unmapped_area() would return 
address 1ffffffffffd0000
[Wed Jul 12 13:27:07 2023] but arch_get_unmapped_area() would return 
address 2000000001f00000
[Wed Jul 12 13:27:07 2023] io_uring_mmu_get_unmapped_area() would return 
address 1ffffffffff00000
[Wed Jul 12 13:27:07 2023] but arch_get_unmapped_area() would return 
address 1ffffffffff00000
[Wed Jul 12 13:27:07 2023] io_uring_mmu_get_unmapped_area() would return 
address 1fffffffffe20000
[Wed Jul 12 13:27:07 2023] but arch_get_unmapped_area() would return 
address 2000000002000000
[Wed Jul 12 13:27:07 2023] io_uring_mmu_get_unmapped_area() would return 
address 1fffffffffe30000
[Wed Jul 12 13:27:07 2023] but arch_get_unmapped_area() would return 
address 2000000002100000

Is that enough of a clue to go on?
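
For reference, a rough sketch of that second experiment (approximate, not 
the exact hunk; it assumes arch_get_unmapped_area() is directly callable 
from io_uring/io_uring.c), appended at the end of 
io_uring_mmu_get_unmapped_area():

	printk("io_uring_mmu_get_unmapped_area() would return address %lx\n", addr);
	/* hand the same request to the bottom-up arch allocator instead */
	addr = arch_get_unmapped_area(filp, 0, len, pgoff, flags);
	printk("but arch_get_unmapped_area() would return address %lx\n", addr);
	return addr;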
Helge Deller July 12, 2023, 7:05 p.m. UTC | #4
On 7/12/23 19:28, matoro wrote:
> On 2023-07-12 12:24, Helge Deller wrote:
>> Hi Matoro,
>>
>> * matoro <matoro_mailinglist_kernel@matoro.tk>:
>>> On 2023-03-14 13:16, Jens Axboe wrote:
>>> > From: Helge Deller <deller@gmx.de>
>>> >
>>> > Some architectures have memory cache aliasing requirements (e.g. parisc)
>>> > if memory is shared between userspace and kernel. This patch fixes the
>>> > kernel to return an aliased address when asked by userspace via mmap().
>>> >
>>> > Signed-off-by: Helge Deller <deller@gmx.de>
>>> > Signed-off-by: Jens Axboe <axboe@kernel.dk>
>>> > ---
>>> >  io_uring/io_uring.c | 51 +++++++++++++++++++++++++++++++++++++++++++++
>>> >  1 file changed, 51 insertions(+)
>>> >
>>> > diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
>>> > index 722624b6d0dc..3adecebbac71 100644
>>> > --- a/io_uring/io_uring.c
>>> > +++ b/io_uring/io_uring.c
>>> > @@ -72,6 +72,7 @@
>>> >  #include <linux/io_uring.h>
>>> >  #include <linux/audit.h>
>>> >  #include <linux/security.h>
>>> > +#include <asm/shmparam.h>
>>> >
>>> >  #define CREATE_TRACE_POINTS
>>> >  #include <trace/events/io_uring.h>
>>> > @@ -3317,6 +3318,54 @@ static __cold int io_uring_mmap(struct file
>>> > *file, struct vm_area_struct *vma)
>>> >      return remap_pfn_range(vma, vma->vm_start, pfn, sz,
>>> > vma->vm_page_prot);
>>> >  }
>>> >
>>> > +static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp,
>>> > +            unsigned long addr, unsigned long len,
>>> > +            unsigned long pgoff, unsigned long flags)
>>> > +{
>>> > +    const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);
>>> > +    struct vm_unmapped_area_info info;
>>> > +    void *ptr;
>>> > +
>>> > +    /*
>>> > +     * Do not allow to map to user-provided address to avoid breaking the
>>> > +     * aliasing rules. Userspace is not able to guess the offset address
>>> > of
>>> > +     * kernel kmalloc()ed memory area.
>>> > +     */
>>> > +    if (addr)
>>> > +        return -EINVAL;
>>> > +
>>> > +    ptr = io_uring_validate_mmap_request(filp, pgoff, len);
>>> > +    if (IS_ERR(ptr))
>>> > +        return -ENOMEM;
>>> > +
>>> > +    info.flags = VM_UNMAPPED_AREA_TOPDOWN;
>>> > +    info.length = len;
>>> > +    info.low_limit = max(PAGE_SIZE, mmap_min_addr);
>>> > +    info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
>>> > +#ifdef SHM_COLOUR
>>> > +    info.align_mask = PAGE_MASK & (SHM_COLOUR - 1UL);
>>> > +#else
>>> > +    info.align_mask = PAGE_MASK & (SHMLBA - 1UL);
>>> > +#endif
>>> > +    info.align_offset = (unsigned long) ptr;
>>> > +
>>> > +    /*
>>> > +     * A failed mmap() very likely causes application failure,
>>> > +     * so fall back to the bottom-up function here. This scenario
>>> > +     * can happen with large stack limits and large mmap()
>>> > +     * allocations.
>>> > +     */
>>> > +    addr = vm_unmapped_area(&info);
>>> > +    if (offset_in_page(addr)) {
>>> > +        info.flags = 0;
>>> > +        info.low_limit = TASK_UNMAPPED_BASE;
>>> > +        info.high_limit = mmap_end;
>>> > +        addr = vm_unmapped_area(&info);
>>> > +    }
>>> > +
>>> > +    return addr;
>>> > +}
>>> > +
>>> >  #else /* !CONFIG_MMU */
>>> >
>>> >  static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
>>> > @@ -3529,6 +3578,8 @@ static const struct file_operations io_uring_fops
>>> > = {
>>> >  #ifndef CONFIG_MMU
>>> >      .get_unmapped_area = io_uring_nommu_get_unmapped_area,
>>> >      .mmap_capabilities = io_uring_nommu_mmap_capabilities,
>>> > +#else
>>> > +    .get_unmapped_area = io_uring_mmu_get_unmapped_area,
>>> >  #endif
>>> >      .poll        = io_uring_poll,
>>> >  #ifdef CONFIG_PROC_FS
>>>
>>> Hi Jens, Helge - I've bisected a regression with io_uring on ia64 to this
>>> patch in 6.4.  Unfortunately this breaks userspace programs using io_uring,
>>> the easiest one to test is cmake with an io_uring enabled libuv (i.e., libuv
>>> >= 1.45.0) which will hang.
>>>
>>> I am aware that ia64 is in a vulnerable place right now which I why I am
>>> keeping this spread limited.  Since this clearly involves
>>> architecture-specific changes for parisc,
>>
>> it isn't so much architecture-specific... (just one ifdef)
>>
>>> is there any chance of looking at
>>> what is required to do the same for ia64?  I looked at
>>> 0ef36bd2b37815719e31a72d2beecc28ca8ecd26 ("parisc: change value of SHMLBA
>>> from 0x00400000 to PAGE_SIZE") and tried to replicate the SHMLBA ->
>>> SHM_COLOUR change, but it made no difference.
>>>
>>> If hardware is necessary for testing, I can provide it, including remote BMC
>>> access for restarts/kernel debugging.  Any takers?
>>
>> I won't have time to test myself, but maybe you could test?
>>
>> Basically we should try to find out why io_uring_mmu_get_unmapped_area()
>> doesn't return valid addresses, while arch_get_unmapped_area()
>> [in arch/ia64/kernel/sys_ia64.c] does.
>>
>> You could apply this patch first:
>> It introduces a memory leak (as it requests memory twice), but maybe we
>> get an idea?
>> The ia64 arch_get_unmapped_area() searches for memory from bottom
>> (flags=0), while io_uring function tries top-down first. Maybe that's
>> the problem. And I don't understand the offset_in_page() check right
>> now.
>>
>> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
>> index 3bca7a79efda..93b1964d2bbb 100644
>> --- a/io_uring/io_uring.c
>> +++ b/io_uring/io_uring.c
>> @@ -3431,13 +3431,17 @@ static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp,
>>       * can happen with large stack limits and large mmap()
>>       * allocations.
>>       */
>> +/* compare to arch_get_unmapped_area() in arch/ia64/kernel/sys_ia64.c */
>>      addr = vm_unmapped_area(&info);
>> -    if (offset_in_page(addr)) {
>> +printk("io_uring_mmu_get_unmapped_area() address 1 is: %px\n", addr);
>> +    addr = NULL;
>> +    if (!addr) {
>>          info.flags = 0;
>>          info.low_limit = TASK_UNMAPPED_BASE;
>>          info.high_limit = mmap_end;
>>          addr = vm_unmapped_area(&info);
>>      }
>> +printk("io_uring_mmu_get_unmapped_area() returns address %px\n", addr);
>>
>>      return addr;
>>  }
>>
>>
>> Another option is to disable the call to io_uring_nommu_get_unmapped_area())
>> with the next patch. Maybe you could add printks() to ia64's arch_get_unmapped_area()
>> and check what it returns there?
>>
>> @@ -3654,6 +3658,8 @@ static const struct file_operations io_uring_fops = {
>>  #ifndef CONFIG_MMU
>>      .get_unmapped_area = io_uring_nommu_get_unmapped_area,
>>      .mmap_capabilities = io_uring_nommu_mmap_capabilities,
>> +#elif 0    /* IS_ENABLED(CONFIG_IA64) */
>> +    .get_unmapped_area = NULL,
>>  #else
>>      .get_unmapped_area = io_uring_mmu_get_unmapped_area,
>>  #endif
>>
>> Helge
>
> Thanks Helge.  Sample output from that first patch:
>
> [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area() address 1 is: 1ffffffffff40000
> [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area() returns address 2000000001e40000
> [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area() address 1 is: 1ffffffffff20000
> [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area() returns address 2000000001f20000
> [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area() address 1 is: 1ffffffffff30000
> [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area() returns address 2000000001f30000
> [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area() address 1 is: 1ffffffffff90000
> [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area() returns address 2000000001f90000
>
> This pattern seems to be pretty stable, I tried instead just directly returning the result of a call to arch_get_unmapped_area() at the end of the function and it seems similar:
>
> [Wed Jul 12 13:27:07 2023] io_uring_mmu_get_unmapped_area() would return address 1ffffffffffd0000
> [Wed Jul 12 13:27:07 2023] but arch_get_unmapped_area() would return address 2000000001f00000
> [Wed Jul 12 13:27:07 2023] io_uring_mmu_get_unmapped_area() would return address 1ffffffffff00000
> [Wed Jul 12 13:27:07 2023] but arch_get_unmapped_area() would return address 1ffffffffff00000
> [Wed Jul 12 13:27:07 2023] io_uring_mmu_get_unmapped_area() would return address 1fffffffffe20000
> [Wed Jul 12 13:27:07 2023] but arch_get_unmapped_area() would return address 2000000002000000
> [Wed Jul 12 13:27:07 2023] io_uring_mmu_get_unmapped_area() would return address 1fffffffffe30000
> [Wed Jul 12 13:27:07 2023] but arch_get_unmapped_area() would return address 2000000002100000
>
> Is that enough of a clue to go on?

SHMLBA on ia64 is 0x100000:
arch/ia64/include/asm/shmparam.h:#define        SHMLBA  (1024*1024)
but the values returned by io_uring_mmu_get_unmapped_area() do not fulfill this.

So, probably ia64's SHMLBA isn't pulled in correctly in io_uring/io_uring.c.
Check value of this line:
	info.align_mask = PAGE_MASK & (SHMLBA - 1UL);

You could also add
#define SHM_COLOUR  0x100000
in front of the
	#ifdef SHM_COLOUR
(define SHM_COLOUR in io_uring/kbuf.c too).
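
i.e. roughly this (untested sketch; the value is just ia64's SHMLBA from above):

	/* debug hack: force the SHM_COLOUR branch with ia64's SHMLBA value */
	#define SHM_COLOUR 0x100000
	#ifdef SHM_COLOUR
		info.align_mask = PAGE_MASK & (SHM_COLOUR - 1UL);
	#else
		info.align_mask = PAGE_MASK & (SHMLBA - 1UL);
	#endif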

Helge
Helge Deller July 12, 2023, 8:30 p.m. UTC | #5
On 7/12/23 21:05, Helge Deller wrote:
> On 7/12/23 19:28, matoro wrote:
>> On 2023-07-12 12:24, Helge Deller wrote:
>>> Hi Matoro,
>>>
>>> * matoro <matoro_mailinglist_kernel@matoro.tk>:
>>>> On 2023-03-14 13:16, Jens Axboe wrote:
>>>> > From: Helge Deller <deller@gmx.de>
>>>> >
>>>> > Some architectures have memory cache aliasing requirements (e.g. parisc)
>>>> > if memory is shared between userspace and kernel. This patch fixes the
>>>> > kernel to return an aliased address when asked by userspace via mmap().
>>>> >
>>>> > Signed-off-by: Helge Deller <deller@gmx.de>
>>>> > Signed-off-by: Jens Axboe <axboe@kernel.dk>
>>>> > ---
>>>> >  io_uring/io_uring.c | 51 +++++++++++++++++++++++++++++++++++++++++++++
>>>> >  1 file changed, 51 insertions(+)
>>>> >
>>>> > diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
>>>> > index 722624b6d0dc..3adecebbac71 100644
>>>> > --- a/io_uring/io_uring.c
>>>> > +++ b/io_uring/io_uring.c
>>>> > @@ -72,6 +72,7 @@
>>>> >  #include <linux/io_uring.h>
>>>> >  #include <linux/audit.h>
>>>> >  #include <linux/security.h>
>>>> > +#include <asm/shmparam.h>
>>>> >
>>>> >  #define CREATE_TRACE_POINTS
>>>> >  #include <trace/events/io_uring.h>
>>>> > @@ -3317,6 +3318,54 @@ static __cold int io_uring_mmap(struct file
>>>> > *file, struct vm_area_struct *vma)
>>>> >      return remap_pfn_range(vma, vma->vm_start, pfn, sz,
>>>> > vma->vm_page_prot);
>>>> >  }
>>>> >
>>>> > +static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp,
>>>> > +            unsigned long addr, unsigned long len,
>>>> > +            unsigned long pgoff, unsigned long flags)
>>>> > +{
>>>> > +    const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);
>>>> > +    struct vm_unmapped_area_info info;
>>>> > +    void *ptr;
>>>> > +
>>>> > +    /*
>>>> > +     * Do not allow to map to user-provided address to avoid breaking the
>>>> > +     * aliasing rules. Userspace is not able to guess the offset address
>>>> > of
>>>> > +     * kernel kmalloc()ed memory area.
>>>> > +     */
>>>> > +    if (addr)
>>>> > +        return -EINVAL;
>>>> > +
>>>> > +    ptr = io_uring_validate_mmap_request(filp, pgoff, len);
>>>> > +    if (IS_ERR(ptr))
>>>> > +        return -ENOMEM;
>>>> > +
>>>> > +    info.flags = VM_UNMAPPED_AREA_TOPDOWN;
>>>> > +    info.length = len;
>>>> > +    info.low_limit = max(PAGE_SIZE, mmap_min_addr);
>>>> > +    info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
>>>> > +#ifdef SHM_COLOUR
>>>> > +    info.align_mask = PAGE_MASK & (SHM_COLOUR - 1UL);
>>>> > +#else
>>>> > +    info.align_mask = PAGE_MASK & (SHMLBA - 1UL);
>>>> > +#endif
>>>> > +    info.align_offset = (unsigned long) ptr;
>>>> > +
>>>> > +    /*
>>>> > +     * A failed mmap() very likely causes application failure,
>>>> > +     * so fall back to the bottom-up function here. This scenario
>>>> > +     * can happen with large stack limits and large mmap()
>>>> > +     * allocations.
>>>> > +     */
>>>> > +    addr = vm_unmapped_area(&info);
>>>> > +    if (offset_in_page(addr)) {
>>>> > +        info.flags = 0;
>>>> > +        info.low_limit = TASK_UNMAPPED_BASE;
>>>> > +        info.high_limit = mmap_end;
>>>> > +        addr = vm_unmapped_area(&info);
>>>> > +    }
>>>> > +
>>>> > +    return addr;
>>>> > +}
>>>> > +
>>>> >  #else /* !CONFIG_MMU */
>>>> >
>>>> >  static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
>>>> > @@ -3529,6 +3578,8 @@ static const struct file_operations io_uring_fops
>>>> > = {
>>>> >  #ifndef CONFIG_MMU
>>>> >      .get_unmapped_area = io_uring_nommu_get_unmapped_area,
>>>> >      .mmap_capabilities = io_uring_nommu_mmap_capabilities,
>>>> > +#else
>>>> > +    .get_unmapped_area = io_uring_mmu_get_unmapped_area,
>>>> >  #endif
>>>> >      .poll        = io_uring_poll,
>>>> >  #ifdef CONFIG_PROC_FS
>>>>
>>>> Hi Jens, Helge - I've bisected a regression with io_uring on ia64 to this
>>>> patch in 6.4.  Unfortunately this breaks userspace programs using io_uring,
>>>> the easiest one to test is cmake with an io_uring enabled libuv (i.e., libuv
>>>> >= 1.45.0) which will hang.
>>>>
>>>> I am aware that ia64 is in a vulnerable place right now which I why I am
>>>> keeping this spread limited.  Since this clearly involves
>>>> architecture-specific changes for parisc,
>>>
>>> it isn't so much architecture-specific... (just one ifdef)
>>>
>>>> is there any chance of looking at
>>>> what is required to do the same for ia64?  I looked at
>>>> 0ef36bd2b37815719e31a72d2beecc28ca8ecd26 ("parisc: change value of SHMLBA
>>>> from 0x00400000 to PAGE_SIZE") and tried to replicate the SHMLBA ->
>>>> SHM_COLOUR change, but it made no difference.
>>>>
>>>> If hardware is necessary for testing, I can provide it, including remote BMC
>>>> access for restarts/kernel debugging.  Any takers?
>>>
>>> I won't have time to test myself, but maybe you could test?
>>>
>>> Basically we should try to find out why io_uring_mmu_get_unmapped_area()
>>> doesn't return valid addresses, while arch_get_unmapped_area()
>>> [in arch/ia64/kernel/sys_ia64.c] does.
>>>
>>> You could apply this patch first:
>>> It introduces a memory leak (as it requests memory twice), but maybe we
>>> get an idea?
>>> The ia64 arch_get_unmapped_area() searches for memory from bottom
>>> (flags=0), while io_uring function tries top-down first. Maybe that's
>>> the problem. And I don't understand the offset_in_page() check right
>>> now.
>>>
>>> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
>>> index 3bca7a79efda..93b1964d2bbb 100644
>>> --- a/io_uring/io_uring.c
>>> +++ b/io_uring/io_uring.c
>>> @@ -3431,13 +3431,17 @@ static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp,
>>>       * can happen with large stack limits and large mmap()
>>>       * allocations.
>>>       */
>>> +/* compare to arch_get_unmapped_area() in arch/ia64/kernel/sys_ia64.c */
>>>      addr = vm_unmapped_area(&info);
>>> -    if (offset_in_page(addr)) {
>>> +printk("io_uring_mmu_get_unmapped_area() address 1 is: %px\n", addr);
>>> +    addr = NULL;
>>> +    if (!addr) {
>>>          info.flags = 0;
>>>          info.low_limit = TASK_UNMAPPED_BASE;
>>>          info.high_limit = mmap_end;
>>>          addr = vm_unmapped_area(&info);
>>>      }
>>> +printk("io_uring_mmu_get_unmapped_area() returns address %px\n", addr);
>>>
>>>      return addr;
>>>  }
>>>
>>>
>>> Another option is to disable the call to io_uring_nommu_get_unmapped_area())
>>> with the next patch. Maybe you could add printks() to ia64's arch_get_unmapped_area()
>>> and check what it returns there?
>>>
>>> @@ -3654,6 +3658,8 @@ static const struct file_operations io_uring_fops = {
>>>  #ifndef CONFIG_MMU
>>>      .get_unmapped_area = io_uring_nommu_get_unmapped_area,
>>>      .mmap_capabilities = io_uring_nommu_mmap_capabilities,
>>> +#elif 0    /* IS_ENABLED(CONFIG_IA64) */
>>> +    .get_unmapped_area = NULL,
>>>  #else
>>>      .get_unmapped_area = io_uring_mmu_get_unmapped_area,
>>>  #endif
>>>
>>> Helge
>>
>> Thanks Helge.  Sample output from that first patch:
>>
>> [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area() address 1 is: 1ffffffffff40000
>> [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area() returns address 2000000001e40000
>> [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area() address 1 is: 1ffffffffff20000
>> [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area() returns address 2000000001f20000
>> [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area() address 1 is: 1ffffffffff30000
>> [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area() returns address 2000000001f30000
>> [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area() address 1 is: 1ffffffffff90000
>> [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area() returns address 2000000001f90000
>>
>> This pattern seems to be pretty stable, I tried instead just directly returning the result of a call to arch_get_unmapped_area() at the end of the function and it seems similar:
>>
>> [Wed Jul 12 13:27:07 2023] io_uring_mmu_get_unmapped_area() would return address 1ffffffffffd0000
>> [Wed Jul 12 13:27:07 2023] but arch_get_unmapped_area() would return address 2000000001f00000
>> [Wed Jul 12 13:27:07 2023] io_uring_mmu_get_unmapped_area() would return address 1ffffffffff00000
>> [Wed Jul 12 13:27:07 2023] but arch_get_unmapped_area() would return address 1ffffffffff00000
>> [Wed Jul 12 13:27:07 2023] io_uring_mmu_get_unmapped_area() would return address 1fffffffffe20000
>> [Wed Jul 12 13:27:07 2023] but arch_get_unmapped_area() would return address 2000000002000000
>> [Wed Jul 12 13:27:07 2023] io_uring_mmu_get_unmapped_area() would return address 1fffffffffe30000
>> [Wed Jul 12 13:27:07 2023] but arch_get_unmapped_area() would return address 2000000002100000
>>
>> Is that enough of a clue to go on?
>
> SHMLBA on ia64 is 0x100000:
> arch/ia64/include/asm/shmparam.h:#define        SHMLBA  (1024*1024)
> but the values returned by io_uring_mmu_get_unmapped_area() does not fullfill this.
>
> So, probably ia64's SHMLBA isn't pulled in correctly in io_uring/io_uring.c.
> Check value of this line:
>      info.align_mask = PAGE_MASK & (SHMLBA - 1UL);
>
> You could also add
> #define SHM_COLOUR  0x100000
> in front of the
>      #ifdef SHM_COLOUR
> (define SHM_COLOUR in io_uring/kbuf.c too).

What is the value of PAGE_SIZE and "ptr" on your machine?
For 4k page size I get:
SHMLBA - 1  ->             FFFFF
PAGE_MASK   ->  FFFFFFFFFFFFF000
so,
info.align_mask = PAGE_MASK & (SHMLBA - 1UL) = 0xFF000;
You could try to set info.align_mask = 0xfffff;

Helge
matoro July 13, 2023, 12:35 a.m. UTC | #6
On 2023-07-12 16:30, Helge Deller wrote:
> On 7/12/23 21:05, Helge Deller wrote:
>> On 7/12/23 19:28, matoro wrote:
>>> On 2023-07-12 12:24, Helge Deller wrote:
>>>> Hi Matoro,
>>>> 
>>>> * matoro <matoro_mailinglist_kernel@matoro.tk>:
>>>>> On 2023-03-14 13:16, Jens Axboe wrote:
>>>>> > From: Helge Deller <deller@gmx.de>
>>>>> >
>>>>> > Some architectures have memory cache aliasing requirements (e.g. parisc)
>>>>> > if memory is shared between userspace and kernel. This patch fixes the
>>>>> > kernel to return an aliased address when asked by userspace via mmap().
>>>>> >
>>>>> > Signed-off-by: Helge Deller <deller@gmx.de>
>>>>> > Signed-off-by: Jens Axboe <axboe@kernel.dk>
>>>>> > ---
>>>>> >  io_uring/io_uring.c | 51 +++++++++++++++++++++++++++++++++++++++++++++
>>>>> >  1 file changed, 51 insertions(+)
>>>>> >
>>>>> > diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
>>>>> > index 722624b6d0dc..3adecebbac71 100644
>>>>> > --- a/io_uring/io_uring.c
>>>>> > +++ b/io_uring/io_uring.c
>>>>> > @@ -72,6 +72,7 @@
>>>>> >  #include <linux/io_uring.h>
>>>>> >  #include <linux/audit.h>
>>>>> >  #include <linux/security.h>
>>>>> > +#include <asm/shmparam.h>
>>>>> >
>>>>> >  #define CREATE_TRACE_POINTS
>>>>> >  #include <trace/events/io_uring.h>
>>>>> > @@ -3317,6 +3318,54 @@ static __cold int io_uring_mmap(struct file
>>>>> > *file, struct vm_area_struct *vma)
>>>>> >      return remap_pfn_range(vma, vma->vm_start, pfn, sz,
>>>>> > vma->vm_page_prot);
>>>>> >  }
>>>>> >
>>>>> > +static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp,
>>>>> > +            unsigned long addr, unsigned long len,
>>>>> > +            unsigned long pgoff, unsigned long flags)
>>>>> > +{
>>>>> > +    const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);
>>>>> > +    struct vm_unmapped_area_info info;
>>>>> > +    void *ptr;
>>>>> > +
>>>>> > +    /*
>>>>> > +     * Do not allow to map to user-provided address to avoid breaking the
>>>>> > +     * aliasing rules. Userspace is not able to guess the offset address
>>>>> > of
>>>>> > +     * kernel kmalloc()ed memory area.
>>>>> > +     */
>>>>> > +    if (addr)
>>>>> > +        return -EINVAL;
>>>>> > +
>>>>> > +    ptr = io_uring_validate_mmap_request(filp, pgoff, len);
>>>>> > +    if (IS_ERR(ptr))
>>>>> > +        return -ENOMEM;
>>>>> > +
>>>>> > +    info.flags = VM_UNMAPPED_AREA_TOPDOWN;
>>>>> > +    info.length = len;
>>>>> > +    info.low_limit = max(PAGE_SIZE, mmap_min_addr);
>>>>> > +    info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
>>>>> > +#ifdef SHM_COLOUR
>>>>> > +    info.align_mask = PAGE_MASK & (SHM_COLOUR - 1UL);
>>>>> > +#else
>>>>> > +    info.align_mask = PAGE_MASK & (SHMLBA - 1UL);
>>>>> > +#endif
>>>>> > +    info.align_offset = (unsigned long) ptr;
>>>>> > +
>>>>> > +    /*
>>>>> > +     * A failed mmap() very likely causes application failure,
>>>>> > +     * so fall back to the bottom-up function here. This scenario
>>>>> > +     * can happen with large stack limits and large mmap()
>>>>> > +     * allocations.
>>>>> > +     */
>>>>> > +    addr = vm_unmapped_area(&info);
>>>>> > +    if (offset_in_page(addr)) {
>>>>> > +        info.flags = 0;
>>>>> > +        info.low_limit = TASK_UNMAPPED_BASE;
>>>>> > +        info.high_limit = mmap_end;
>>>>> > +        addr = vm_unmapped_area(&info);
>>>>> > +    }
>>>>> > +
>>>>> > +    return addr;
>>>>> > +}
>>>>> > +
>>>>> >  #else /* !CONFIG_MMU */
>>>>> >
>>>>> >  static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
>>>>> > @@ -3529,6 +3578,8 @@ static const struct file_operations io_uring_fops
>>>>> > = {
>>>>> >  #ifndef CONFIG_MMU
>>>>> >      .get_unmapped_area = io_uring_nommu_get_unmapped_area,
>>>>> >      .mmap_capabilities = io_uring_nommu_mmap_capabilities,
>>>>> > +#else
>>>>> > +    .get_unmapped_area = io_uring_mmu_get_unmapped_area,
>>>>> >  #endif
>>>>> >      .poll        = io_uring_poll,
>>>>> >  #ifdef CONFIG_PROC_FS
>>>>> 
>>>>> Hi Jens, Helge - I've bisected a regression with io_uring on ia64 
>>>>> to this
>>>>> patch in 6.4.  Unfortunately this breaks userspace programs using 
>>>>> io_uring,
>>>>> the easiest one to test is cmake with an io_uring enabled libuv 
>>>>> (i.e., libuv
>>>>> >= 1.45.0) which will hang.
>>>>> 
>>>>> I am aware that ia64 is in a vulnerable place right now which I why 
>>>>> I am
>>>>> keeping this spread limited.  Since this clearly involves
>>>>> architecture-specific changes for parisc,
>>>> 
>>>> it isn't so much architecture-specific... (just one ifdef)
>>>> 
>>>>> is there any chance of looking at
>>>>> what is required to do the same for ia64?  I looked at
>>>>> 0ef36bd2b37815719e31a72d2beecc28ca8ecd26 ("parisc: change value of 
>>>>> SHMLBA
>>>>> from 0x00400000 to PAGE_SIZE") and tried to replicate the SHMLBA ->
>>>>> SHM_COLOUR change, but it made no difference.
>>>>> 
>>>>> If hardware is necessary for testing, I can provide it, including 
>>>>> remote BMC
>>>>> access for restarts/kernel debugging.  Any takers?
>>>> 
>>>> I won't have time to test myself, but maybe you could test?
>>>> 
>>>> Basically we should try to find out why 
>>>> io_uring_mmu_get_unmapped_area()
>>>> doesn't return valid addresses, while arch_get_unmapped_area()
>>>> [in arch/ia64/kernel/sys_ia64.c] does.
>>>> 
>>>> You could apply this patch first:
>>>> It introduces a memory leak (as it requests memory twice), but maybe 
>>>> we
>>>> get an idea?
>>>> The ia64 arch_get_unmapped_area() searches for memory from bottom
>>>> (flags=0), while io_uring function tries top-down first. Maybe 
>>>> that's
>>>> the problem. And I don't understand the offset_in_page() check right
>>>> now.
>>>> 
>>>> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
>>>> index 3bca7a79efda..93b1964d2bbb 100644
>>>> --- a/io_uring/io_uring.c
>>>> +++ b/io_uring/io_uring.c
>>>> @@ -3431,13 +3431,17 @@ static unsigned long 
>>>> io_uring_mmu_get_unmapped_area(struct file *filp,
>>>>       * can happen with large stack limits and large mmap()
>>>>       * allocations.
>>>>       */
>>>> +/* compare to arch_get_unmapped_area() in 
>>>> arch/ia64/kernel/sys_ia64.c */
>>>>      addr = vm_unmapped_area(&info);
>>>> -    if (offset_in_page(addr)) {
>>>> +printk("io_uring_mmu_get_unmapped_area() address 1 is: %px\n", 
>>>> addr);
>>>> +    addr = NULL;
>>>> +    if (!addr) {
>>>>          info.flags = 0;
>>>>          info.low_limit = TASK_UNMAPPED_BASE;
>>>>          info.high_limit = mmap_end;
>>>>          addr = vm_unmapped_area(&info);
>>>>      }
>>>> +printk("io_uring_mmu_get_unmapped_area() returns address %px\n", 
>>>> addr);
>>>> 
>>>>      return addr;
>>>>  }
>>>> 
>>>> 
>>>> Another option is to disable the call to 
>>>> io_uring_nommu_get_unmapped_area())
>>>> with the next patch. Maybe you could add printks() to ia64's 
>>>> arch_get_unmapped_area()
>>>> and check what it returns there?
>>>> 
>>>> @@ -3654,6 +3658,8 @@ static const struct file_operations 
>>>> io_uring_fops = {
>>>>  #ifndef CONFIG_MMU
>>>>      .get_unmapped_area = io_uring_nommu_get_unmapped_area,
>>>>      .mmap_capabilities = io_uring_nommu_mmap_capabilities,
>>>> +#elif 0    /* IS_ENABLED(CONFIG_IA64) */
>>>> +    .get_unmapped_area = NULL,
>>>>  #else
>>>>      .get_unmapped_area = io_uring_mmu_get_unmapped_area,
>>>>  #endif
>>>> 
>>>> Helge
>>> 
>>> Thanks Helge.  Sample output from that first patch:
>>> 
>>> [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area() address 1 
>>> is: 1ffffffffff40000
>>> [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area() returns 
>>> address 2000000001e40000
>>> [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area() address 1 
>>> is: 1ffffffffff20000
>>> [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area() returns 
>>> address 2000000001f20000
>>> [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area() address 1 
>>> is: 1ffffffffff30000
>>> [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area() returns 
>>> address 2000000001f30000
>>> [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area() address 1 
>>> is: 1ffffffffff90000
>>> [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area() returns 
>>> address 2000000001f90000
>>> 
>>> This pattern seems to be pretty stable, I tried instead just directly 
>>> returning the result of a call to arch_get_unmapped_area() at the end 
>>> of the function and it seems similar:
>>> 
>>> [Wed Jul 12 13:27:07 2023] io_uring_mmu_get_unmapped_area() would 
>>> return address 1ffffffffffd0000
>>> [Wed Jul 12 13:27:07 2023] but arch_get_unmapped_area() would return 
>>> address 2000000001f00000
>>> [Wed Jul 12 13:27:07 2023] io_uring_mmu_get_unmapped_area() would 
>>> return address 1ffffffffff00000
>>> [Wed Jul 12 13:27:07 2023] but arch_get_unmapped_area() would return 
>>> address 1ffffffffff00000
>>> [Wed Jul 12 13:27:07 2023] io_uring_mmu_get_unmapped_area() would 
>>> return address 1fffffffffe20000
>>> [Wed Jul 12 13:27:07 2023] but arch_get_unmapped_area() would return 
>>> address 2000000002000000
>>> [Wed Jul 12 13:27:07 2023] io_uring_mmu_get_unmapped_area() would 
>>> return address 1fffffffffe30000
>>> [Wed Jul 12 13:27:07 2023] but arch_get_unmapped_area() would return 
>>> address 2000000002100000
>>> 
>>> Is that enough of a clue to go on?
>> 
>> SHMLBA on ia64 is 0x100000:
>> arch/ia64/include/asm/shmparam.h:#define        SHMLBA  (1024*1024)
>> but the values returned by io_uring_mmu_get_unmapped_area() does not 
>> fullfill this.
>> 
>> So, probably ia64's SHMLBA isn't pulled in correctly in 
>> io_uring/io_uring.c.
>> Check value of this line:
>>      info.align_mask = PAGE_MASK & (SHMLBA - 1UL);
>> 
>> You could also add
>> #define SHM_COLOUR  0x100000
>> in front of the
>>      #ifdef SHM_COLOUR
>> (define SHM_COLOUR in io_uring/kbuf.c too).
> 
> What is the value of PAGE_SIZE and "ptr" on your machine?
> For 4k page size I get:
> SHMLBA -1   ->        FFFFF
> PAGE_MASK   -> FFFFFFFFF000
> so,
> info.align_mask = PAGE_MASK & (SHMLBA - 1UL) = 0xFF000;
> You could try to set nfo.align_mask = 0xfffff;
> 
> Helge

Using 64KiB (65536) PAGE_SIZE here.  64-bit pointers.
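
With that page size the same computation as above gives (assuming 64-bit 
unsigned long):

	SHMLBA - 1  ->             FFFFF
	PAGE_MASK   ->  FFFFFFFFFFFF0000
	info.align_mask = PAGE_MASK & (SHMLBA - 1UL) = 0xF0000;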

Tried both #define SHM_COLOUR 0x100000 as well as info.align_mask = 
0xFFFFF, but both of them changed the problem from 100% 
reproducible to intermittent.

After inspecting the output I observed that it hangs only when the first 
allocation returns an address below 0x2000000000000000, and the second 
returns an address above it.  When both addresses are above it, it does 
not hang.  Examples:

When it works:
$ cmake --version
cmake version 3.26.4

CMake suite maintained and supported by Kitware (kitware.com/cmake).
$ dmesg --color=always -T | tail -n 4
[Wed Jul 12 20:32:37 2023] io_uring_mmu_get_unmapped_area() would return 
address 1fffffffffe20000
[Wed Jul 12 20:32:37 2023] but arch_get_unmapped_area() would return 
address 2000000002000000
[Wed Jul 12 20:32:37 2023] io_uring_mmu_get_unmapped_area() would return 
address 1fffffffffe50000
[Wed Jul 12 20:32:37 2023] but arch_get_unmapped_area() would return 
address 2000000002100000


When it hangs:
$ cmake --version
cmake version 3.26.4

CMake suite maintained and supported by Kitware (kitware.com/cmake).
^C
$ dmesg --color=always -T | tail -n 4
[Wed Jul 12 20:33:12 2023] io_uring_mmu_get_unmapped_area() would return 
address 1ffffffffff00000
[Wed Jul 12 20:33:12 2023] but arch_get_unmapped_area() would return 
address 1ffffffffff00000
[Wed Jul 12 20:33:12 2023] io_uring_mmu_get_unmapped_area() would return 
address 1fffffffffe60000
[Wed Jul 12 20:33:12 2023] but arch_get_unmapped_area() would return 
address 2000000001f00000

Is io_uring_mmu_get_unmapped_area supposed to always return addresses 
above 0x2000000000000000?  Any reason why it sometimes does not?
Helge Deller July 13, 2023, 7:27 a.m. UTC | #7
* matoro <matoro_mailinglist_kernel@matoro.tk>:
> On 2023-07-12 16:30, Helge Deller wrote:
> > On 7/12/23 21:05, Helge Deller wrote:
> > > On 7/12/23 19:28, matoro wrote:
> > > > On 2023-07-12 12:24, Helge Deller wrote:
> > > > > Hi Matoro,
> > > > >
> > > > > * matoro <matoro_mailinglist_kernel@matoro.tk>:
> > > > > > On 2023-03-14 13:16, Jens Axboe wrote:
> > > > > > > From: Helge Deller <deller@gmx.de>
> > > > > > >
> > > > > > > Some architectures have memory cache aliasing requirements (e.g. parisc)
> > > > > > > if memory is shared between userspace and kernel. This patch fixes the
> > > > > > > kernel to return an aliased address when asked by userspace via mmap().
> > > > > > >
> > > > > > > Signed-off-by: Helge Deller <deller@gmx.de>
> > > > > > > Signed-off-by: Jens Axboe <axboe@kernel.dk>
> > > > > > > ---
> > > > > > >  io_uring/io_uring.c | 51 +++++++++++++++++++++++++++++++++++++++++++++
> > > > > > >  1 file changed, 51 insertions(+)
> > > > > > >
> > > > > > > diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> > > > > > > index 722624b6d0dc..3adecebbac71 100644
> > > > > > > --- a/io_uring/io_uring.c
> > > > > > > +++ b/io_uring/io_uring.c
> > > > > > > @@ -72,6 +72,7 @@
> > > > > > >  #include <linux/io_uring.h>
> > > > > > >  #include <linux/audit.h>
> > > > > > >  #include <linux/security.h>
> > > > > > > +#include <asm/shmparam.h>
> > > > > > >
> > > > > > >  #define CREATE_TRACE_POINTS
> > > > > > >  #include <trace/events/io_uring.h>
> > > > > > > @@ -3317,6 +3318,54 @@ static __cold int io_uring_mmap(struct file
> > > > > > > *file, struct vm_area_struct *vma)
> > > > > > >      return remap_pfn_range(vma, vma->vm_start, pfn, sz,
> > > > > > > vma->vm_page_prot);
> > > > > > >  }
> > > > > > >
> > > > > > > +static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp,
> > > > > > > +            unsigned long addr, unsigned long len,
> > > > > > > +            unsigned long pgoff, unsigned long flags)
> > > > > > > +{
> > > > > > > +    const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);
> > > > > > > +    struct vm_unmapped_area_info info;
> > > > > > > +    void *ptr;
> > > > > > > +
> > > > > > > +    /*
> > > > > > > +     * Do not allow to map to user-provided address to avoid breaking the
> > > > > > > +     * aliasing rules. Userspace is not able to guess the offset address
> > > > > > > of
> > > > > > > +     * kernel kmalloc()ed memory area.
> > > > > > > +     */
> > > > > > > +    if (addr)
> > > > > > > +        return -EINVAL;
> > > > > > > +
> > > > > > > +    ptr = io_uring_validate_mmap_request(filp, pgoff, len);
> > > > > > > +    if (IS_ERR(ptr))
> > > > > > > +        return -ENOMEM;
> > > > > > > +
> > > > > > > +    info.flags = VM_UNMAPPED_AREA_TOPDOWN;
> > > > > > > +    info.length = len;
> > > > > > > +    info.low_limit = max(PAGE_SIZE, mmap_min_addr);
> > > > > > > +    info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
> > > > > > > +#ifdef SHM_COLOUR
> > > > > > > +    info.align_mask = PAGE_MASK & (SHM_COLOUR - 1UL);
> > > > > > > +#else
> > > > > > > +    info.align_mask = PAGE_MASK & (SHMLBA - 1UL);
> > > > > > > +#endif
> > > > > > > +    info.align_offset = (unsigned long) ptr;
> > > > > > > +
> > > > > > > +    /*
> > > > > > > +     * A failed mmap() very likely causes application failure,
> > > > > > > +     * so fall back to the bottom-up function here. This scenario
> > > > > > > +     * can happen with large stack limits and large mmap()
> > > > > > > +     * allocations.
> > > > > > > +     */
> > > > > > > +    addr = vm_unmapped_area(&info);
> > > > > > > +    if (offset_in_page(addr)) {
> > > > > > > +        info.flags = 0;
> > > > > > > +        info.low_limit = TASK_UNMAPPED_BASE;
> > > > > > > +        info.high_limit = mmap_end;
> > > > > > > +        addr = vm_unmapped_area(&info);
> > > > > > > +    }
> > > > > > > +
> > > > > > > +    return addr;
> > > > > > > +}
> > > > > > > +
> > > > > > >  #else /* !CONFIG_MMU */
> > > > > > >
> > > > > > >  static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
> > > > > > > @@ -3529,6 +3578,8 @@ static const struct file_operations io_uring_fops
> > > > > > > = {
> > > > > > >  #ifndef CONFIG_MMU
> > > > > > >      .get_unmapped_area = io_uring_nommu_get_unmapped_area,
> > > > > > >      .mmap_capabilities = io_uring_nommu_mmap_capabilities,
> > > > > > > +#else
> > > > > > > +    .get_unmapped_area = io_uring_mmu_get_unmapped_area,
> > > > > > >  #endif
> > > > > > >      .poll        = io_uring_poll,
> > > > > > >  #ifdef CONFIG_PROC_FS
> > > > > >
> > > > > > Hi Jens, Helge - I've bisected a regression with
> > > > > > io_uring on ia64 to this
> > > > > > patch in 6.4.  Unfortunately this breaks userspace
> > > > > > programs using io_uring;
> > > > > > the easiest one to test is cmake with an io_uring
> > > > > > enabled libuv (i.e., libuv
> > > > > > >= 1.45.0) which will hang.
> > > > > >
> > > > > > I am aware that ia64 is in a vulnerable place right now
> > > > > > which is why I am
> > > > > > keeping this spread limited.  Since this clearly involves
> > > > > > architecture-specific changes for parisc,
> > > > >
> > > > > it isn't so much architecture-specific... (just one ifdef)
> > > > >
> > > > > > is there any chance of looking at
> > > > > > what is required to do the same for ia64?  I looked at
> > > > > > 0ef36bd2b37815719e31a72d2beecc28ca8ecd26 ("parisc:
> > > > > > change value of SHMLBA
> > > > > > from 0x00400000 to PAGE_SIZE") and tried to replicate the SHMLBA ->
> > > > > > SHM_COLOUR change, but it made no difference.
> > > > > >
> > > > > > If hardware is necessary for testing, I can provide it,
> > > > > > including remote BMC
> > > > > > access for restarts/kernel debugging.  Any takers?
> > > > >
> > > > > I won't have time to test myself, but maybe you could test?
> > > > >
> > > > > Basically we should try to find out why
> > > > > io_uring_mmu_get_unmapped_area()
> > > > > doesn't return valid addresses, while arch_get_unmapped_area()
> > > > > [in arch/ia64/kernel/sys_ia64.c] does.
> > > > >
> > > > > You could apply this patch first:
> > > > > It introduces a memory leak (as it requests memory twice),
> > > > > but maybe we
> > > > > get an idea?
> > > > > The ia64 arch_get_unmapped_area() searches for memory from bottom
> > > > > (flags=0), while the io_uring function tries top-down first.
> > > > > Maybe that's
> > > > > the problem. And I don't understand the offset_in_page() check right
> > > > > now.
> > > > >
> > > > > diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> > > > > index 3bca7a79efda..93b1964d2bbb 100644
> > > > > --- a/io_uring/io_uring.c
> > > > > +++ b/io_uring/io_uring.c
> > > > > @@ -3431,13 +3431,17 @@ static unsigned long
> > > > > io_uring_mmu_get_unmapped_area(struct file *filp,
> > > > >       * can happen with large stack limits and large mmap()
> > > > >       * allocations.
> > > > >       */
> > > > > +/* compare to arch_get_unmapped_area() in
> > > > > arch/ia64/kernel/sys_ia64.c */
> > > > >      addr = vm_unmapped_area(&info);
> > > > > -    if (offset_in_page(addr)) {
> > > > > +printk("io_uring_mmu_get_unmapped_area() address 1 is:
> > > > > %px\n", addr);
> > > > > +    addr = NULL;
> > > > > +    if (!addr) {
> > > > >          info.flags = 0;
> > > > >          info.low_limit = TASK_UNMAPPED_BASE;
> > > > >          info.high_limit = mmap_end;
> > > > >          addr = vm_unmapped_area(&info);
> > > > >      }
> > > > > +printk("io_uring_mmu_get_unmapped_area() returns address
> > > > > %px\n", addr);
> > > > >
> > > > >      return addr;
> > > > >  }
> > > > >
> > > > >
> > > > > Another option is to disable the call to
> > > > > io_uring_mmu_get_unmapped_area()
> > > > > with the next patch. Maybe you could add printks() to ia64's
> > > > > arch_get_unmapped_area()
> > > > > and check what it returns there?
> > > > >
> > > > > @@ -3654,6 +3658,8 @@ static const struct file_operations
> > > > > io_uring_fops = {
> > > > >  #ifndef CONFIG_MMU
> > > > >      .get_unmapped_area = io_uring_nommu_get_unmapped_area,
> > > > >      .mmap_capabilities = io_uring_nommu_mmap_capabilities,
> > > > > +#elif 0    /* IS_ENABLED(CONFIG_IA64) */
> > > > > +    .get_unmapped_area = NULL,
> > > > >  #else
> > > > >      .get_unmapped_area = io_uring_mmu_get_unmapped_area,
> > > > >  #endif
> > > > >
> > > > > Helge
> > > >
> > > > Thanks Helge.  Sample output from that first patch:
> > > >
> > > > [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area()
> > > > address 1 is: 1ffffffffff40000
> > > > [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area()
> > > > returns address 2000000001e40000
> > > > [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area()
> > > > address 1 is: 1ffffffffff20000
> > > > [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area()
> > > > returns address 2000000001f20000
> > > > [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area()
> > > > address 1 is: 1ffffffffff30000
> > > > [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area()
> > > > returns address 2000000001f30000
> > > > [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area()
> > > > address 1 is: 1ffffffffff90000
> > > > [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area()
> > > > returns address 2000000001f90000
> > > >
> > > > This pattern seems to be pretty stable; I tried instead just
> > > > directly returning the result of a call to
> > > > arch_get_unmapped_area() at the end of the function and it seems
> > > > similar:
> > > >
> > > > [Wed Jul 12 13:27:07 2023] io_uring_mmu_get_unmapped_area()
> > > > would return address 1ffffffffffd0000
> > > > [Wed Jul 12 13:27:07 2023] but arch_get_unmapped_area() would
> > > > return address 2000000001f00000
> > > > [Wed Jul 12 13:27:07 2023] io_uring_mmu_get_unmapped_area()
> > > > would return address 1ffffffffff00000
> > > > [Wed Jul 12 13:27:07 2023] but arch_get_unmapped_area() would
> > > > return address 1ffffffffff00000
> > > > [Wed Jul 12 13:27:07 2023] io_uring_mmu_get_unmapped_area()
> > > > would return address 1fffffffffe20000
> > > > [Wed Jul 12 13:27:07 2023] but arch_get_unmapped_area() would
> > > > return address 2000000002000000
> > > > [Wed Jul 12 13:27:07 2023] io_uring_mmu_get_unmapped_area()
> > > > would return address 1fffffffffe30000
> > > > [Wed Jul 12 13:27:07 2023] but arch_get_unmapped_area() would
> > > > return address 2000000002100000
> > > >
> > > > Is that enough of a clue to go on?
> > >
> > > SHMLBA on ia64 is 0x100000:
> > > arch/ia64/include/asm/shmparam.h:#define        SHMLBA  (1024*1024)
> > > but the values returned by io_uring_mmu_get_unmapped_area() do not
> > > fulfill this.
> > >
> > > So, probably ia64's SHMLBA isn't pulled in correctly in
> > > io_uring/io_uring.c.
> > > Check value of this line:
> > >      info.align_mask = PAGE_MASK & (SHMLBA - 1UL);
> > >
> > > You could also add
> > > #define SHM_COLOUR  0x100000
> > > in front of the
> > >      #ifdef SHM_COLOUR
> > > (define SHM_COLOUR in io_uring/kbuf.c too).
> >
> > What is the value of PAGE_SIZE and "ptr" on your machine?
> > For 4k page size I get:
> > SHMLBA -1   ->        FFFFF
> > PAGE_MASK   -> FFFFFFFFF000
> > so,
> > info.align_mask = PAGE_MASK & (SHMLBA - 1UL) = 0xFF000;
> > You could try to set info.align_mask = 0xfffff;
> >
> > Helge
>
> Using 64KiB (65536) PAGE_SIZE here.  64-bit pointers.
>
> Tried both #define SHM_COLOUR 0x100000 and info.align_mask = 0xFFFFF, but
> both of them changed the problem from 100% reproducible to intermittent.
>
> After inspecting the output I observed that it hangs only when the first
> allocation returns an address below 0x2000000000000000, and the second
> returns an address above it.  When both addresses are above it, it does not
> hang.  Examples:
>
> When it works:
> $ cmake --version
> cmake version 3.26.4
>
> CMake suite maintained and supported by Kitware (kitware.com/cmake).
> $ dmesg --color=always -T | tail -n 4
> [Wed Jul 12 20:32:37 2023] io_uring_mmu_get_unmapped_area() would return
> address 1fffffffffe20000
> [Wed Jul 12 20:32:37 2023] but arch_get_unmapped_area() would return address
> 2000000002000000
> [Wed Jul 12 20:32:37 2023] io_uring_mmu_get_unmapped_area() would return
> address 1fffffffffe50000
> [Wed Jul 12 20:32:37 2023] but arch_get_unmapped_area() would return address
> 2000000002100000
>
>
> When it hangs:
> $ cmake --version
> cmake version 3.26.4
>
> CMake suite maintained and supported by Kitware (kitware.com/cmake).
> ^C
> $ dmesg --color=always -T | tail -n 4
> [Wed Jul 12 20:33:12 2023] io_uring_mmu_get_unmapped_area() would return
> address 1ffffffffff00000
> [Wed Jul 12 20:33:12 2023] but arch_get_unmapped_area() would return address
> 1ffffffffff00000
> [Wed Jul 12 20:33:12 2023] io_uring_mmu_get_unmapped_area() would return
> address 1fffffffffe60000
> [Wed Jul 12 20:33:12 2023] but arch_get_unmapped_area() would return address
> 2000000001f00000
>
> Is io_uring_mmu_get_unmapped_area supposed to always return addresses above
> 0x2000000000000000?

Yes, with the patch below.

> Any reason why it is not doing so sometimes?

It depends on the parameters for vm_unmapped_area(). Specifically
info.flags=0.

Try this patch:

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 3bca7a79efda..b259794ab53b 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3429,10 +3429,13 @@ static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp,
 	 * A failed mmap() very likely causes application failure,
 	 * so fall back to the bottom-up function here. This scenario
 	 * can happen with large stack limits and large mmap()
-	 * allocations.
+	 * allocations. Use bottom-up on IA64 for correct aliasing.
 	 */
-	addr = vm_unmapped_area(&info);
-	if (offset_in_page(addr)) {
+	if (IS_ENABLED(CONFIG_IA64))
+		addr = NULL;
+	else
+		addr = vm_unmapped_area(&info);
+	if (!addr) {
 		info.flags = 0;
 		info.low_limit = TASK_UNMAPPED_BASE;
 		info.high_limit = mmap_end;

Helge
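
As a worked example of the align_mask arithmetic discussed above (a sketch
for illustration only, not taken from the thread; the EX_* names are made
up): with the 64 KiB page size and the 1 MiB ia64 SHMLBA reported here, the
mask that io_uring_mmu_get_unmapped_area() computes covers only four bits of
cache colour:

/* Illustration only: assumes 64 KiB pages and SHMLBA = 1 MiB (ia64). */
#define EX_PAGE_SHIFT	16				/* 64 KiB pages */
#define EX_PAGE_MASK	(~((1UL << EX_PAGE_SHIFT) - 1))	/* ...ffff0000 */
#define EX_SHMLBA	(1024 * 1024)			/* ia64 SHMLBA */

/* PAGE_MASK & (SHMLBA - 1UL) == 0xf0000 here, vs. 0xff000 with 4 KiB pages */
static const unsigned long ex_align_mask = EX_PAGE_MASK & (EX_SHMLBA - 1UL);
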
matoro July 13, 2023, 11:57 p.m. UTC | #8
On 2023-07-13 03:27, Helge Deller wrote:
> [...]
> Try this patch:
> 
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index 3bca7a79efda..b259794ab53b 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -3429,10 +3429,13 @@ static unsigned long 
> io_uring_mmu_get_unmapped_area(struct file *filp,
>  	 * A failed mmap() very likely causes application failure,
>  	 * so fall back to the bottom-up function here. This scenario
>  	 * can happen with large stack limits and large mmap()
> -	 * allocations.
> +	 * allocations. Use bottom-up on IA64 for correct aliasing.
>  	 */
> -	addr = vm_unmapped_area(&info);
> -	if (offset_in_page(addr)) {
> +	if (IS_ENABLED(CONFIG_IA64))
> +		addr = NULL;
> +	else
> +		addr = vm_unmapped_area(&info);
> +	if (!addr) {
>  		info.flags = 0;
>  		info.low_limit = TASK_UNMAPPED_BASE;
>  		info.high_limit = mmap_end;
> 
> Helge

This patch does do the trick, but I am a little unsure if it's the right 
one to go in:

* Adding an arch-specific conditional feels like a bad hack; why is it not
working with the other vm_unmapped_area_info settings?
* What happened to the offset_in_page check for other arches?
Helge Deller July 16, 2023, 6:54 a.m. UTC | #9
* matoro <matoro_mailinglist_kernel@matoro.tk>:
> [...]
>
> This patch does do the trick, but I am a little unsure if it's the right one
> to go in:
>
> * Adding an arch-specific conditional feels like a bad hack; why is it not
> working with the other vm_unmapped_area_info settings?

because it tries to map below TASK_UNMAPPED_BASE, for which (I assume) IA-64
has different aliasing/caching rules. There are some comments in the arch/ia64
files, but I'm not an IA-64 expert...

> * What happened to the offset_in_page check for other arches?

I thought it wasn't necessary.
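
For context on that check, a minimal sketch (not from any patch in this
thread; ex_gua_failed() is a made-up helper): vm_unmapped_area() returns
either a page-aligned address or a negative errno cast to unsigned long, and
errno values are never page-aligned, so a non-zero page offset is how the
original code detected failure before falling back to the bottom-up search.

/* Sketch only: what the dropped offset_in_page() test was checking. */
static inline bool ex_gua_failed(unsigned long addr)
{
	/* error returns (-ENOMEM etc.) are never page-aligned; successes always are */
	return offset_in_page(addr) != 0;
}
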

But below is another (and much better) approach, which you may test.
I see quite some errors with the liburing testcases on hppa, but I think
they are not related to this function.

Can you test and report back?

Helge


From 457f2c2db984bc159119bfb4426d9dc6c2779ed6 Mon Sep 17 00:00:00 2001
From: Helge Deller <deller@gmx.de>
Date: Sun, 16 Jul 2023 08:45:17 +0200
Subject: [PATCH] io_uring: Adjust mapping wrt architecture aliasing
 requirements

When mapping memory to userspace, use the architecture-provided
get_unmapped_area() function instead of io_uring's own copy, which fails
on IA-64 since that architecture doesn't allow mappings below
TASK_UNMAPPED_BASE.

Additionally make sure to flag the requested memory as MAP_SHARED so
that any architecture-specific aliasing rules will be applied.

Reported-by: matoro <matoro_mailinglist_kernel@matoro.tk>
Signed-off-by: Helge Deller <deller@gmx.de>

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 3bca7a79efda..2e7dd93e45d0 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3398,48 +3398,27 @@ static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp,
 			unsigned long addr, unsigned long len,
 			unsigned long pgoff, unsigned long flags)
 {
-	const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);
-	struct vm_unmapped_area_info info;
 	void *ptr;

 	/*
 	 * Do not allow to map to user-provided address to avoid breaking the
-	 * aliasing rules. Userspace is not able to guess the offset address of
-	 * kernel kmalloc()ed memory area.
+	 * aliasing rules of various architectures. Userspace is not able to
+	 * guess the offset address of kernel kmalloc()ed memory area.
 	 */
-	if (addr)
+	if (addr | (flags & MAP_FIXED))
 		return -EINVAL;

+	/*
+	 * The requested memory region is required to be shared between kernel
+	 * and userspace application.
+	 */
+	flags |= MAP_SHARED;
+
 	ptr = io_uring_validate_mmap_request(filp, pgoff, len);
 	if (IS_ERR(ptr))
 		return -ENOMEM;

-	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
-	info.length = len;
-	info.low_limit = max(PAGE_SIZE, mmap_min_addr);
-	info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
-#ifdef SHM_COLOUR
-	info.align_mask = PAGE_MASK & (SHM_COLOUR - 1UL);
-#else
-	info.align_mask = PAGE_MASK & (SHMLBA - 1UL);
-#endif
-	info.align_offset = (unsigned long) ptr;
-
-	/*
-	 * A failed mmap() very likely causes application failure,
-	 * so fall back to the bottom-up function here. This scenario
-	 * can happen with large stack limits and large mmap()
-	 * allocations.
-	 */
-	addr = vm_unmapped_area(&info);
-	if (offset_in_page(addr)) {
-		info.flags = 0;
-		info.low_limit = TASK_UNMAPPED_BASE;
-		info.high_limit = mmap_end;
-		addr = vm_unmapped_area(&info);
-	}
-
-	return addr;
+	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
 }

 #else /* !CONFIG_MMU */
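
The MAP_SHARED flag is what makes the delegation above work: the
architecture get_unmapped_area() hooks typically colour shared mappings to
SHMLBA themselves, so io_uring no longer needs its own SHM_COLOUR/SHMLBA
handling. A rough sketch of that kind of logic (illustrative only, not
copied from any architecture; ex_colour_shared() is a made-up name):

/*
 * Illustration only: roughly what an arch get_unmapped_area() hook does for
 * MAP_SHARED requests, so all mappings of the same pages share a colour.
 */
static unsigned long ex_colour_shared(unsigned long addr, unsigned long pgoff,
				      unsigned long flags)
{
	if (flags & MAP_SHARED)
		/* round up to an SHMLBA boundary, then add the colour implied by pgoff */
		addr = ALIGN(addr, SHMLBA) + ((pgoff << PAGE_SHIFT) & (SHMLBA - 1));
	return addr;
}
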
matoro July 16, 2023, 6:03 p.m. UTC | #10
On 2023-07-16 02:54, Helge Deller wrote:
> * matoro <matoro_mailinglist_kernel@matoro.tk>:
>> On 2023-07-13 03:27, Helge Deller wrote:
>> > * matoro <matoro_mailinglist_kernel@matoro.tk>:
>> > > On 2023-07-12 16:30, Helge Deller wrote:
>> > > > On 7/12/23 21:05, Helge Deller wrote:
>> > > > > On 7/12/23 19:28, matoro wrote:
>> > > > > > On 2023-07-12 12:24, Helge Deller wrote:
>> > > > > > > Hi Matoro,
>> > > > > > >
>> > > > > > > * matoro <matoro_mailinglist_kernel@matoro.tk>:
>> > > > > > > > On 2023-03-14 13:16, Jens Axboe wrote:
>> > > > > > > > > From: Helge Deller <deller@gmx.de>
>> > > > > > > > >
>> > > > > > > > > Some architectures have memory cache aliasing requirements (e.g. parisc)
>> > > > > > > > > if memory is shared between userspace and kernel. This patch fixes the
>> > > > > > > > > kernel to return an aliased address when asked by userspace via mmap().
>> > > > > > > > >
>> > > > > > > > > Signed-off-by: Helge Deller <deller@gmx.de>
>> > > > > > > > > Signed-off-by: Jens Axboe <axboe@kernel.dk>
>> > > > > > > > > ---
>> > > > > > > > >  io_uring/io_uring.c | 51 +++++++++++++++++++++++++++++++++++++++++++++
>> > > > > > > > >  1 file changed, 51 insertions(+)
>> > > > > > > > >
>> > > > > > > > > diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
>> > > > > > > > > index 722624b6d0dc..3adecebbac71 100644
>> > > > > > > > > --- a/io_uring/io_uring.c
>> > > > > > > > > +++ b/io_uring/io_uring.c
>> > > > > > > > > @@ -72,6 +72,7 @@
>> > > > > > > > >  #include <linux/io_uring.h>
>> > > > > > > > >  #include <linux/audit.h>
>> > > > > > > > >  #include <linux/security.h>
>> > > > > > > > > +#include <asm/shmparam.h>
>> > > > > > > > >
>> > > > > > > > >  #define CREATE_TRACE_POINTS
>> > > > > > > > >  #include <trace/events/io_uring.h>
>> > > > > > > > > @@ -3317,6 +3318,54 @@ static __cold int io_uring_mmap(struct file
>> > > > > > > > > *file, struct vm_area_struct *vma)
>> > > > > > > > >      return remap_pfn_range(vma, vma->vm_start, pfn, sz,
>> > > > > > > > > vma->vm_page_prot);
>> > > > > > > > >  }
>> > > > > > > > >
>> > > > > > > > > +static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp,
>> > > > > > > > > +            unsigned long addr, unsigned long len,
>> > > > > > > > > +            unsigned long pgoff, unsigned long flags)
>> > > > > > > > > +{
>> > > > > > > > > +    const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);
>> > > > > > > > > +    struct vm_unmapped_area_info info;
>> > > > > > > > > +    void *ptr;
>> > > > > > > > > +
>> > > > > > > > > +    /*
>> > > > > > > > > +     * Do not allow to map to user-provided address to avoid breaking the
>> > > > > > > > > +     * aliasing rules. Userspace is not able to guess the offset address
>> > > > > > > > > of
>> > > > > > > > > +     * kernel kmalloc()ed memory area.
>> > > > > > > > > +     */
>> > > > > > > > > +    if (addr)
>> > > > > > > > > +        return -EINVAL;
>> > > > > > > > > +
>> > > > > > > > > +    ptr = io_uring_validate_mmap_request(filp, pgoff, len);
>> > > > > > > > > +    if (IS_ERR(ptr))
>> > > > > > > > > +        return -ENOMEM;
>> > > > > > > > > +
>> > > > > > > > > +    info.flags = VM_UNMAPPED_AREA_TOPDOWN;
>> > > > > > > > > +    info.length = len;
>> > > > > > > > > +    info.low_limit = max(PAGE_SIZE, mmap_min_addr);
>> > > > > > > > > +    info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
>> > > > > > > > > +#ifdef SHM_COLOUR
>> > > > > > > > > +    info.align_mask = PAGE_MASK & (SHM_COLOUR - 1UL);
>> > > > > > > > > +#else
>> > > > > > > > > +    info.align_mask = PAGE_MASK & (SHMLBA - 1UL);
>> > > > > > > > > +#endif
>> > > > > > > > > +    info.align_offset = (unsigned long) ptr;
>> > > > > > > > > +
>> > > > > > > > > +    /*
>> > > > > > > > > +     * A failed mmap() very likely causes application failure,
>> > > > > > > > > +     * so fall back to the bottom-up function here. This scenario
>> > > > > > > > > +     * can happen with large stack limits and large mmap()
>> > > > > > > > > +     * allocations.
>> > > > > > > > > +     */
>> > > > > > > > > +    addr = vm_unmapped_area(&info);
>> > > > > > > > > +    if (offset_in_page(addr)) {
>> > > > > > > > > +        info.flags = 0;
>> > > > > > > > > +        info.low_limit = TASK_UNMAPPED_BASE;
>> > > > > > > > > +        info.high_limit = mmap_end;
>> > > > > > > > > +        addr = vm_unmapped_area(&info);
>> > > > > > > > > +    }
>> > > > > > > > > +
>> > > > > > > > > +    return addr;
>> > > > > > > > > +}
>> > > > > > > > > +
>> > > > > > > > >  #else /* !CONFIG_MMU */
>> > > > > > > > >
>> > > > > > > > >  static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
>> > > > > > > > > @@ -3529,6 +3578,8 @@ static const struct file_operations io_uring_fops
>> > > > > > > > > = {
>> > > > > > > > >  #ifndef CONFIG_MMU
>> > > > > > > > >      .get_unmapped_area = io_uring_nommu_get_unmapped_area,
>> > > > > > > > >      .mmap_capabilities = io_uring_nommu_mmap_capabilities,
>> > > > > > > > > +#else
>> > > > > > > > > +    .get_unmapped_area = io_uring_mmu_get_unmapped_area,
>> > > > > > > > >  #endif
>> > > > > > > > >      .poll        = io_uring_poll,
>> > > > > > > > >  #ifdef CONFIG_PROC_FS
>> > > > > > > >
>> > > > > > > > Hi Jens, Helge - I've bisected a regression with
>> > > > > > > > io_uring on ia64 to this
>> > > > > > > > patch in 6.4.  Unfortunately this breaks userspace
>> > > > > > > > programs using io_uring,
>> > > > > > > > the easiest one to test is cmake with an io_uring
>> > > > > > > > enabled libuv (i.e., libuv
>> > > > > > > > >= 1.45.0) which will hang.
>> > > > > > > >
>> > > > > > > > I am aware that ia64 is in a vulnerable place right now
>> > > > > > > > which I why I am
>> > > > > > > > keeping this spread limited.  Since this clearly involves
>> > > > > > > > architecture-specific changes for parisc,
>> > > > > > >
>> > > > > > > it isn't so much architecture-specific... (just one ifdef)
>> > > > > > >
>> > > > > > > > is there any chance of looking at
>> > > > > > > > what is required to do the same for ia64?  I looked at
>> > > > > > > > 0ef36bd2b37815719e31a72d2beecc28ca8ecd26 ("parisc:
>> > > > > > > > change value of SHMLBA
>> > > > > > > > from 0x00400000 to PAGE_SIZE") and tried to replicate the SHMLBA ->
>> > > > > > > > SHM_COLOUR change, but it made no difference.
>> > > > > > > >
>> > > > > > > > If hardware is necessary for testing, I can provide it,
>> > > > > > > > including remote BMC
>> > > > > > > > access for restarts/kernel debugging.  Any takers?
>> > > > > > >
>> > > > > > > I won't have time to test myself, but maybe you could test?
>> > > > > > >
>> > > > > > > Basically we should try to find out why
>> > > > > > > io_uring_mmu_get_unmapped_area()
>> > > > > > > doesn't return valid addresses, while arch_get_unmapped_area()
>> > > > > > > [in arch/ia64/kernel/sys_ia64.c] does.
>> > > > > > >
>> > > > > > > You could apply this patch first:
>> > > > > > > It introduces a memory leak (as it requests memory twice),
>> > > > > > > but maybe we
>> > > > > > > get an idea?
>> > > > > > > The ia64 arch_get_unmapped_area() searches for memory from bottom
>> > > > > > > (flags=0), while io_uring function tries top-down first.
>> > > > > > > Maybe that's
>> > > > > > > the problem. And I don't understand the offset_in_page() check right
>> > > > > > > now.
>> > > > > > >
>> > > > > > > diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
>> > > > > > > index 3bca7a79efda..93b1964d2bbb 100644
>> > > > > > > --- a/io_uring/io_uring.c
>> > > > > > > +++ b/io_uring/io_uring.c
>> > > > > > > @@ -3431,13 +3431,17 @@ static unsigned long
>> > > > > > > io_uring_mmu_get_unmapped_area(struct file *filp,
>> > > > > > >       * can happen with large stack limits and large mmap()
>> > > > > > >       * allocations.
>> > > > > > >       */
>> > > > > > > +/* compare to arch_get_unmapped_area() in
>> > > > > > > arch/ia64/kernel/sys_ia64.c */
>> > > > > > >      addr = vm_unmapped_area(&info);
>> > > > > > > -    if (offset_in_page(addr)) {
>> > > > > > > +printk("io_uring_mmu_get_unmapped_area() address 1 is:
>> > > > > > > %px\n", addr);
>> > > > > > > +    addr = NULL;
>> > > > > > > +    if (!addr) {
>> > > > > > >          info.flags = 0;
>> > > > > > >          info.low_limit = TASK_UNMAPPED_BASE;
>> > > > > > >          info.high_limit = mmap_end;
>> > > > > > >          addr = vm_unmapped_area(&info);
>> > > > > > >      }
>> > > > > > > +printk("io_uring_mmu_get_unmapped_area() returns address
>> > > > > > > %px\n", addr);
>> > > > > > >
>> > > > > > >      return addr;
>> > > > > > >  }
>> > > > > > >
>> > > > > > >
>> > > > > > > Another option is to disable the call to
>> > > > > > > io_uring_nommu_get_unmapped_area())
>> > > > > > > with the next patch. Maybe you could add printks() to ia64's
>> > > > > > > arch_get_unmapped_area()
>> > > > > > > and check what it returns there?
>> > > > > > >
>> > > > > > > @@ -3654,6 +3658,8 @@ static const struct file_operations
>> > > > > > > io_uring_fops = {
>> > > > > > >  #ifndef CONFIG_MMU
>> > > > > > >      .get_unmapped_area = io_uring_nommu_get_unmapped_area,
>> > > > > > >      .mmap_capabilities = io_uring_nommu_mmap_capabilities,
>> > > > > > > +#elif 0    /* IS_ENABLED(CONFIG_IA64) */
>> > > > > > > +    .get_unmapped_area = NULL,
>> > > > > > >  #else
>> > > > > > >      .get_unmapped_area = io_uring_mmu_get_unmapped_area,
>> > > > > > >  #endif
>> > > > > > >
>> > > > > > > Helge
>> > > > > >
>> > > > > > Thanks Helge.  Sample output from that first patch:
>> > > > > >
>> > > > > > [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area()
>> > > > > > address 1 is: 1ffffffffff40000
>> > > > > > [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area()
>> > > > > > returns address 2000000001e40000
>> > > > > > [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area()
>> > > > > > address 1 is: 1ffffffffff20000
>> > > > > > [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area()
>> > > > > > returns address 2000000001f20000
>> > > > > > [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area()
>> > > > > > address 1 is: 1ffffffffff30000
>> > > > > > [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area()
>> > > > > > returns address 2000000001f30000
>> > > > > > [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area()
>> > > > > > address 1 is: 1ffffffffff90000
>> > > > > > [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area()
>> > > > > > returns address 2000000001f90000
>> > > > > >
>> > > > > > This pattern seems to be pretty stable, I tried instead just
>> > > > > > directly returning the result of a call to
>> > > > > > arch_get_unmapped_area() at the end of the function and it seems
>> > > > > > similar:
>> > > > > >
>> > > > > > [Wed Jul 12 13:27:07 2023] io_uring_mmu_get_unmapped_area()
>> > > > > > would return address 1ffffffffffd0000
>> > > > > > [Wed Jul 12 13:27:07 2023] but arch_get_unmapped_area() would
>> > > > > > return address 2000000001f00000
>> > > > > > [Wed Jul 12 13:27:07 2023] io_uring_mmu_get_unmapped_area()
>> > > > > > would return address 1ffffffffff00000
>> > > > > > [Wed Jul 12 13:27:07 2023] but arch_get_unmapped_area() would
>> > > > > > return address 1ffffffffff00000
>> > > > > > [Wed Jul 12 13:27:07 2023] io_uring_mmu_get_unmapped_area()
>> > > > > > would return address 1fffffffffe20000
>> > > > > > [Wed Jul 12 13:27:07 2023] but arch_get_unmapped_area() would
>> > > > > > return address 2000000002000000
>> > > > > > [Wed Jul 12 13:27:07 2023] io_uring_mmu_get_unmapped_area()
>> > > > > > would return address 1fffffffffe30000
>> > > > > > [Wed Jul 12 13:27:07 2023] but arch_get_unmapped_area() would
>> > > > > > return address 2000000002100000
>> > > > > >
>> > > > > > Is that enough of a clue to go on?
>> > > > >
>> > > > > SHMLBA on ia64 is 0x100000:
>> > > > > arch/ia64/include/asm/shmparam.h:#define        SHMLBA  (1024*1024)
>> > > > > but the values returned by io_uring_mmu_get_unmapped_area() does not
>> > > > > fullfill this.
>> > > > >
>> > > > > So, probably ia64's SHMLBA isn't pulled in correctly in
>> > > > > io_uring/io_uring.c.
>> > > > > Check value of this line:
>> > > > >      info.align_mask = PAGE_MASK & (SHMLBA - 1UL);
>> > > > >
>> > > > > You could also add
>> > > > > #define SHM_COLOUR  0x100000
>> > > > > in front of the
>> > > > >      #ifdef SHM_COLOUR
>> > > > > (define SHM_COLOUR in io_uring/kbuf.c too).
>> > > >
>> > > > What is the value of PAGE_SIZE and "ptr" on your machine?
>> > > > For 4k page size I get:
>> > > > SHMLBA -1   ->        FFFFF
>> > > > PAGE_MASK   -> FFFFFFFFF000
>> > > > so,
>> > > > info.align_mask = PAGE_MASK & (SHMLBA - 1UL) = 0xFF000;
>> > > > You could try to set nfo.align_mask = 0xfffff;
>> > > >
>> > > > Helge
>> > >
>> > > Using 64KiB (65536) PAGE_SIZE here.  64-bit pointers.
>> > >
>> > > Tried both #define SHM_COLOUR 0x100000 as well as info.align_mask =
>> > > 0xFFFFF, but both of them made the problem change from 100%
>> > > reproducible to intermittent.
>> > >
>> > > After inspecting the output I observed that it hangs only when the first
>> > > allocation returns an address below 0x2000000000000000, and the second
>> > > returns an address above it.  When both addresses are above it, it does
>> > > not hang.  Examples:
>> > >
>> > > When it works:
>> > > $ cmake --version
>> > > cmake version 3.26.4
>> > >
>> > > CMake suite maintained and supported by Kitware (kitware.com/cmake).
>> > > $ dmesg --color=always -T | tail -n 4
>> > > [Wed Jul 12 20:32:37 2023] io_uring_mmu_get_unmapped_area() would
>> > > return
>> > > address 1fffffffffe20000
>> > > [Wed Jul 12 20:32:37 2023] but arch_get_unmapped_area() would return
>> > > address
>> > > 2000000002000000
>> > > [Wed Jul 12 20:32:37 2023] io_uring_mmu_get_unmapped_area() would
>> > > return
>> > > address 1fffffffffe50000
>> > > [Wed Jul 12 20:32:37 2023] but arch_get_unmapped_area() would return
>> > > address
>> > > 2000000002100000
>> > >
>> > >
>> > > When it hangs:
>> > > $ cmake --version
>> > > cmake version 3.26.4
>> > >
>> > > CMake suite maintained and supported by Kitware (kitware.com/cmake).
>> > > ^C
>> > > $ dmesg --color=always -T | tail -n 4
>> > > [Wed Jul 12 20:33:12 2023] io_uring_mmu_get_unmapped_area() would
>> > > return
>> > > address 1ffffffffff00000
>> > > [Wed Jul 12 20:33:12 2023] but arch_get_unmapped_area() would return
>> > > address
>> > > 1ffffffffff00000
>> > > [Wed Jul 12 20:33:12 2023] io_uring_mmu_get_unmapped_area() would
>> > > return
>> > > address 1fffffffffe60000
>> > > [Wed Jul 12 20:33:12 2023] but arch_get_unmapped_area() would return
>> > > address
>> > > 2000000001f00000
>> > >
>> > > Is io_uring_mmu_get_unmapped_area supposed to always return addresses
>> > > above 0x2000000000000000?
>> >
>> > Yes, with the patch below.
>> >
>> > > Any reason why it is not doing so sometimes?
>> >
>> > It depends on the parameters for vm_unmapped_area(). Specifically
>> > info.flags=0.
>> >
>> > Try this patch:
>> >
>> > diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
>> > index 3bca7a79efda..b259794ab53b 100644
>> > --- a/io_uring/io_uring.c
>> > +++ b/io_uring/io_uring.c
>> > @@ -3429,10 +3429,13 @@ static unsigned long
>> > io_uring_mmu_get_unmapped_area(struct file *filp,
>> >  	 * A failed mmap() very likely causes application failure,
>> >  	 * so fall back to the bottom-up function here. This scenario
>> >  	 * can happen with large stack limits and large mmap()
>> > -	 * allocations.
>> > +	 * allocations. Use bottom-up on IA64 for correct aliasing.
>> >  	 */
>> > -	addr = vm_unmapped_area(&info);
>> > -	if (offset_in_page(addr)) {
>> > +	if (IS_ENABLED(CONFIG_IA64))
>> > +		addr = NULL;
>> > +	else
>> > +		addr = vm_unmapped_area(&info);
>> > +	if (!addr) {
>> >  		info.flags = 0;
>> >  		info.low_limit = TASK_UNMAPPED_BASE;
>> >  		info.high_limit = mmap_end;
>> >
>> > Helge
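For clarity, the two vm_unmapped_area() search modes being switched between
here differ only in these fields (condensed from the settings already shown in
io_uring_mmu_get_unmapped_area() above):

	/* top-down: scan downward from the arch mmap base */
	info.flags      = VM_UNMAPPED_AREA_TOPDOWN;
	info.low_limit  = max(PAGE_SIZE, mmap_min_addr);
	info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);

	/* bottom-up fallback: scan upward from TASK_UNMAPPED_BASE, which on
	 * ia64 lands above the 0x2000000000000000 boundary observed above */
	info.flags      = 0;
	info.low_limit  = TASK_UNMAPPED_BASE;
	info.high_limit = mmap_end;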
>> 
>> This patch does do the trick, but I am a little unsure if it's the 
>> right one
>> to go in:
>> 
>> * Adding an arch-specific conditional feels like a bad hack; why is it not
>> working with the other vm_unmapped_area_info settings?
> 
> because it tries to map below TASK_UNMAPPED_BASE, for which (I assume) 
> IA-64
> has different aliasing/caching rules. There are some comments in the 
> arch/ia64
> files, but I'm not an IA-64 expert...
> 
>> * What happened to the offset_in_page check for other arches?
> 
> I thought it's not necessary.
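(For reference: vm_unmapped_area() reports failure by returning a negative
errno cast to unsigned long, which is never page-aligned, so offset_in_page()
is the conventional failure test — a minimal sketch of that idiom:)

	addr = vm_unmapped_area(&info);
	if (offset_in_page(addr)) {	/* error value, e.g. -ENOMEM */
		/* fall back or propagate the error */
	}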
> 
> But below is another (and much better) approach, which you may test.
> I see quite a few errors with the liburing testcases on hppa, but I 
> think
> they are not related to this function.
> 
> Can you test and report back?
> 
> Helge
> 
> 
> From 457f2c2db984bc159119bfb4426d9dc6c2779ed6 Mon Sep 17 00:00:00 2001
> From: Helge Deller <deller@gmx.de>
> Date: Sun, 16 Jul 2023 08:45:17 +0200
> Subject: [PATCH] io_uring: Adjust mapping wrt architecture aliasing
>  requirements
> 
> When mapping memory to userspace, use the architecture-provided
> get_unmapped_area() function instead of io_uring's own copy, which fails on
> IA-64 since it doesn't allow mappings below TASK_UNMAPPED_BASE.
> 
> Additionally make sure to flag the requested memory as MAP_SHARED so
> that any architecture-specific aliasing rules will be applied.
> 
> Reported-by: matoro <matoro_mailinglist_kernel@matoro.tk>
> Signed-off-by: Helge Deller <deller@gmx.de>
> 
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index 3bca7a79efda..2e7dd93e45d0 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -3398,48 +3398,27 @@ static unsigned long 
> io_uring_mmu_get_unmapped_area(struct file *filp,
>  			unsigned long addr, unsigned long len,
>  			unsigned long pgoff, unsigned long flags)
>  {
> -	const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);
> -	struct vm_unmapped_area_info info;
>  	void *ptr;
> 
>  	/*
>  	 * Do not allow to map to user-provided address to avoid breaking the
> -	 * aliasing rules. Userspace is not able to guess the offset address 
> of
> -	 * kernel kmalloc()ed memory area.
> +	 * aliasing rules of various architectures. Userspace is not able to
> +	 * guess the offset address of kernel kmalloc()ed memory area.
>  	 */
> -	if (addr)
> +	if (addr | (flags & MAP_FIXED))
>  		return -EINVAL;
> 
> +	/*
> +	 * The requested memory region is required to be shared between 
> kernel
> +	 * and userspace application.
> +	 */
> +	flags |= MAP_SHARED;
> +
>  	ptr = io_uring_validate_mmap_request(filp, pgoff, len);
>  	if (IS_ERR(ptr))
>  		return -ENOMEM;
> 
> -	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
> -	info.length = len;
> -	info.low_limit = max(PAGE_SIZE, mmap_min_addr);
> -	info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
> -#ifdef SHM_COLOUR
> -	info.align_mask = PAGE_MASK & (SHM_COLOUR - 1UL);
> -#else
> -	info.align_mask = PAGE_MASK & (SHMLBA - 1UL);
> -#endif
> -	info.align_offset = (unsigned long) ptr;
> -
> -	/*
> -	 * A failed mmap() very likely causes application failure,
> -	 * so fall back to the bottom-up function here. This scenario
> -	 * can happen with large stack limits and large mmap()
> -	 * allocations.
> -	 */
> -	addr = vm_unmapped_area(&info);
> -	if (offset_in_page(addr)) {
> -		info.flags = 0;
> -		info.low_limit = TASK_UNMAPPED_BASE;
> -		info.high_limit = mmap_end;
> -		addr = vm_unmapped_area(&info);
> -	}
> -
> -	return addr;
> +	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
>  }
> 
>  #else /* !CONFIG_MMU */

This seems really close.  It worked for the trivial test case, so I ran 
the test suite from https://github.com/axboe/liburing to compare.  With 
kernel 6.3, I get a 100% pass; with this patch applied, I get one failure:
Running test read-write.t                                           
cqe->res=33, expected=32
test_rem_buf_single(BUFFERS + 1) failed
Not root, skipping test_write_efbig
Test read-write.t failed with ret 1

Trying this patch out on other arches to see if it also affects them or 
is ia64-specific.
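For context, the placement being discussed is chosen when userspace maps the
rings; a minimal sketch of that side using the raw syscall (no liburing, and
only minimal error handling):

	#include <linux/io_uring.h>
	#include <sys/mman.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	/* Set up a ring and map the SQ ring the way liburing does.  The
	 * address returned by mmap() here is what the ->get_unmapped_area()
	 * hook discussed above ends up choosing. */
	static void *map_sq_ring(unsigned entries)
	{
		struct io_uring_params p = { 0 };
		int fd = syscall(__NR_io_uring_setup, entries, &p);

		if (fd < 0)
			return MAP_FAILED;

		return mmap(NULL, p.sq_off.array + p.sq_entries * sizeof(unsigned),
			    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
			    fd, IORING_OFF_SQ_RING);
	}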
Helge Deller July 16, 2023, 8:54 p.m. UTC | #11
On 7/16/23 20:03, matoro wrote:
> On 2023-07-16 02:54, Helge Deller wrote:
>> * matoro <matoro_mailinglist_kernel@matoro.tk>:
>>> On 2023-07-13 03:27, Helge Deller wrote:
>>> > * matoro <matoro_mailinglist_kernel@matoro.tk>:
>>> > > On 2023-07-12 16:30, Helge Deller wrote:
>>> > > > On 7/12/23 21:05, Helge Deller wrote:
>>> > > > > On 7/12/23 19:28, matoro wrote:
>>> > > > > > On 2023-07-12 12:24, Helge Deller wrote:
>>> > > > > > > Hi Matoro,
>>> > > > > > >
>>> > > > > > > * matoro <matoro_mailinglist_kernel@matoro.tk>:
>>> > > > > > > > On 2023-03-14 13:16, Jens Axboe wrote:
>>> > > > > > > > > From: Helge Deller <deller@gmx.de>
>>> > > > > > > > >
>>> > > > > > > > > Some architectures have memory cache aliasing requirements (e.g. parisc)
>>> > > > > > > > > if memory is shared between userspace and kernel. This patch fixes the
>>> > > > > > > > > kernel to return an aliased address when asked by userspace via mmap().
>>> > > > > > > > >
>>> > > > > > > > > Signed-off-by: Helge Deller <deller@gmx.de>
>>> > > > > > > > > Signed-off-by: Jens Axboe <axboe@kernel.dk>
>>> > > > > > > > > ---
>>> > > > > > > > >  io_uring/io_uring.c | 51 +++++++++++++++++++++++++++++++++++++++++++++
>>> > > > > > > > >  1 file changed, 51 insertions(+)
>>> > > > > > > > >
>>> > > > > > > > > diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
>>> > > > > > > > > index 722624b6d0dc..3adecebbac71 100644
>>> > > > > > > > > --- a/io_uring/io_uring.c
>>> > > > > > > > > +++ b/io_uring/io_uring.c
>>> > > > > > > > > @@ -72,6 +72,7 @@
>>> > > > > > > > >  #include <linux/io_uring.h>
>>> > > > > > > > >  #include <linux/audit.h>
>>> > > > > > > > >  #include <linux/security.h>
>>> > > > > > > > > +#include <asm/shmparam.h>
>>> > > > > > > > >
>>> > > > > > > > >  #define CREATE_TRACE_POINTS
>>> > > > > > > > >  #include <trace/events/io_uring.h>
>>> > > > > > > > > @@ -3317,6 +3318,54 @@ static __cold int io_uring_mmap(struct file
>>> > > > > > > > > *file, struct vm_area_struct *vma)
>>> > > > > > > > >      return remap_pfn_range(vma, vma->vm_start, pfn, sz,
>>> > > > > > > > > vma->vm_page_prot);
>>> > > > > > > > >  }
>>> > > > > > > > >
>>> > > > > > > > > +static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp,
>>> > > > > > > > > +            unsigned long addr, unsigned long len,
>>> > > > > > > > > +            unsigned long pgoff, unsigned long flags)
>>> > > > > > > > > +{
>>> > > > > > > > > +    const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);
>>> > > > > > > > > +    struct vm_unmapped_area_info info;
>>> > > > > > > > > +    void *ptr;
>>> > > > > > > > > +
>>> > > > > > > > > +    /*
>>> > > > > > > > > +     * Do not allow to map to user-provided address to avoid breaking the
>>> > > > > > > > > +     * aliasing rules. Userspace is not able to guess the offset address
>>> > > > > > > > > of
>>> > > > > > > > > +     * kernel kmalloc()ed memory area.
>>> > > > > > > > > +     */
>>> > > > > > > > > +    if (addr)
>>> > > > > > > > > +        return -EINVAL;
>>> > > > > > > > > +
>>> > > > > > > > > +    ptr = io_uring_validate_mmap_request(filp, pgoff, len);
>>> > > > > > > > > +    if (IS_ERR(ptr))
>>> > > > > > > > > +        return -ENOMEM;
>>> > > > > > > > > +
>>> > > > > > > > > +    info.flags = VM_UNMAPPED_AREA_TOPDOWN;
>>> > > > > > > > > +    info.length = len;
>>> > > > > > > > > +    info.low_limit = max(PAGE_SIZE, mmap_min_addr);
>>> > > > > > > > > +    info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
>>> > > > > > > > > +#ifdef SHM_COLOUR
>>> > > > > > > > > +    info.align_mask = PAGE_MASK & (SHM_COLOUR - 1UL);
>>> > > > > > > > > +#else
>>> > > > > > > > > +    info.align_mask = PAGE_MASK & (SHMLBA - 1UL);
>>> > > > > > > > > +#endif
>>> > > > > > > > > +    info.align_offset = (unsigned long) ptr;
>>> > > > > > > > > +
>>> > > > > > > > > +    /*
>>> > > > > > > > > +     * A failed mmap() very likely causes application failure,
>>> > > > > > > > > +     * so fall back to the bottom-up function here. This scenario
>>> > > > > > > > > +     * can happen with large stack limits and large mmap()
>>> > > > > > > > > +     * allocations.
>>> > > > > > > > > +     */
>>> > > > > > > > > +    addr = vm_unmapped_area(&info);
>>> > > > > > > > > +    if (offset_in_page(addr)) {
>>> > > > > > > > > +        info.flags = 0;
>>> > > > > > > > > +        info.low_limit = TASK_UNMAPPED_BASE;
>>> > > > > > > > > +        info.high_limit = mmap_end;
>>> > > > > > > > > +        addr = vm_unmapped_area(&info);
>>> > > > > > > > > +    }
>>> > > > > > > > > +
>>> > > > > > > > > +    return addr;
>>> > > > > > > > > +}
>>> > > > > > > > > +
>>> > > > > > > > >  #else /* !CONFIG_MMU */
>>> > > > > > > > >
>>> > > > > > > > >  static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
>>> > > > > > > > > @@ -3529,6 +3578,8 @@ static const struct file_operations io_uring_fops
>>> > > > > > > > > = {
>>> > > > > > > > >  #ifndef CONFIG_MMU
>>> > > > > > > > >      .get_unmapped_area = io_uring_nommu_get_unmapped_area,
>>> > > > > > > > >      .mmap_capabilities = io_uring_nommu_mmap_capabilities,
>>> > > > > > > > > +#else
>>> > > > > > > > > +    .get_unmapped_area = io_uring_mmu_get_unmapped_area,
>>> > > > > > > > >  #endif
>>> > > > > > > > >      .poll        = io_uring_poll,
>>> > > > > > > > >  #ifdef CONFIG_PROC_FS
>>> > > > > > > >
>>> > > > > > > > Hi Jens, Helge - I've bisected a regression with
>>> > > > > > > > io_uring on ia64 to this
>>> > > > > > > > patch in 6.4.  Unfortunately this breaks userspace
>>> > > > > > > > programs using io_uring,
>>> > > > > > > > the easiest one to test is cmake with an io_uring
>>> > > > > > > > enabled libuv (i.e., libuv
>>> > > > > > > > >= 1.45.0) which will hang.
>>> > > > > > > >
>>> > > > > > > > I am aware that ia64 is in a vulnerable place right now
>>> > > > > > > which is why I am
>>> > > > > > > > keeping this spread limited.  Since this clearly involves
>>> > > > > > > > architecture-specific changes for parisc,
>>> > > > > > >
>>> > > > > > > it isn't so much architecture-specific... (just one ifdef)
>>> > > > > > >
>>> > > > > > > > is there any chance of looking at
>>> > > > > > > > what is required to do the same for ia64?  I looked at
>>> > > > > > > > 0ef36bd2b37815719e31a72d2beecc28ca8ecd26 ("parisc:
>>> > > > > > > > change value of SHMLBA
>>> > > > > > > > from 0x00400000 to PAGE_SIZE") and tried to replicate the SHMLBA ->
>>> > > > > > > > SHM_COLOUR change, but it made no difference.
>>> > > > > > > >
>>> > > > > > > > If hardware is necessary for testing, I can provide it,
>>> > > > > > > > including remote BMC
>>> > > > > > > > access for restarts/kernel debugging.  Any takers?
>>> > > > > > >
>>> > > > > > > I won't have time to test myself, but maybe you could test?
>>> > > > > > >
>>> > > > > > > Basically we should try to find out why
>>> > > > > > > io_uring_mmu_get_unmapped_area()
>>> > > > > > > doesn't return valid addresses, while arch_get_unmapped_area()
>>> > > > > > > [in arch/ia64/kernel/sys_ia64.c] does.
>>> > > > > > >
>>> > > > > > > You could apply this patch first:
>>> > > > > > > It introduces a memory leak (as it requests memory twice),
>>> > > > > > > but maybe we
>>> > > > > > > get an idea?
>>> > > > > > > The ia64 arch_get_unmapped_area() searches for memory from bottom
>>> > > > > > > (flags=0), while the io_uring function tries top-down first.
>>> > > > > > > Maybe that's
>>> > > > > > > the problem. And I don't understand the offset_in_page() check right
>>> > > > > > > now.
>>> > > > > > >
>>> > > > > > > diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
>>> > > > > > > index 3bca7a79efda..93b1964d2bbb 100644
>>> > > > > > > --- a/io_uring/io_uring.c
>>> > > > > > > +++ b/io_uring/io_uring.c
>>> > > > > > > @@ -3431,13 +3431,17 @@ static unsigned long
>>> > > > > > > io_uring_mmu_get_unmapped_area(struct file *filp,
>>> > > > > > >       * can happen with large stack limits and large mmap()
>>> > > > > > >       * allocations.
>>> > > > > > >       */
>>> > > > > > > +/* compare to arch_get_unmapped_area() in
>>> > > > > > > arch/ia64/kernel/sys_ia64.c */
>>> > > > > > >      addr = vm_unmapped_area(&info);
>>> > > > > > > -    if (offset_in_page(addr)) {
>>> > > > > > > +printk("io_uring_mmu_get_unmapped_area() address 1 is:
>>> > > > > > > %px\n", addr);
>>> > > > > > > +    addr = NULL;
>>> > > > > > > +    if (!addr) {
>>> > > > > > >          info.flags = 0;
>>> > > > > > >          info.low_limit = TASK_UNMAPPED_BASE;
>>> > > > > > >          info.high_limit = mmap_end;
>>> > > > > > >          addr = vm_unmapped_area(&info);
>>> > > > > > >      }
>>> > > > > > > +printk("io_uring_mmu_get_unmapped_area() returns address
>>> > > > > > > %px\n", addr);
>>> > > > > > >
>>> > > > > > >      return addr;
>>> > > > > > >  }
>>> > > > > > >
>>> > > > > > >
>>> > > > > > > Another option is to disable the call to
>>> > > > > > > io_uring_mmu_get_unmapped_area()
>>> > > > > > > with the next patch. Maybe you could add printks() to ia64's
>>> > > > > > > arch_get_unmapped_area()
>>> > > > > > > and check what it returns there?
>>> > > > > > >
>>> > > > > > > @@ -3654,6 +3658,8 @@ static const struct file_operations
>>> > > > > > > io_uring_fops = {
>>> > > > > > >  #ifndef CONFIG_MMU
>>> > > > > > >      .get_unmapped_area = io_uring_nommu_get_unmapped_area,
>>> > > > > > >      .mmap_capabilities = io_uring_nommu_mmap_capabilities,
>>> > > > > > > +#elif 0    /* IS_ENABLED(CONFIG_IA64) */
>>> > > > > > > +    .get_unmapped_area = NULL,
>>> > > > > > >  #else
>>> > > > > > >      .get_unmapped_area = io_uring_mmu_get_unmapped_area,
>>> > > > > > >  #endif
>>> > > > > > >
>>> > > > > > > Helge
>>> > > > > >
>>> > > > > > Thanks Helge.  Sample output from that first patch:
>>> > > > > >
>>> > > > > > [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area()
>>> > > > > > address 1 is: 1ffffffffff40000
>>> > > > > > [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area()
>>> > > > > > returns address 2000000001e40000
>>> > > > > > [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area()
>>> > > > > > address 1 is: 1ffffffffff20000
>>> > > > > > [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area()
>>> > > > > > returns address 2000000001f20000
>>> > > > > > [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area()
>>> > > > > > address 1 is: 1ffffffffff30000
>>> > > > > > [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area()
>>> > > > > > returns address 2000000001f30000
>>> > > > > > [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area()
>>> > > > > > address 1 is: 1ffffffffff90000
>>> > > > > > [Wed Jul 12 13:09:50 2023] io_uring_mmu_get_unmapped_area()
>>> > > > > > returns address 2000000001f90000
>>> > > > > >
>>> > > > > > This pattern seems to be pretty stable; I tried instead just
>>> > > > > > directly returning the result of a call to
>>> > > > > > arch_get_unmapped_area() at the end of the function and it seems
>>> > > > > > similar:
>>> > > > > >
>>> > > > > > [Wed Jul 12 13:27:07 2023] io_uring_mmu_get_unmapped_area()
>>> > > > > > would return address 1ffffffffffd0000
>>> > > > > > [Wed Jul 12 13:27:07 2023] but arch_get_unmapped_area() would
>>> > > > > > return address 2000000001f00000
>>> > > > > > [Wed Jul 12 13:27:07 2023] io_uring_mmu_get_unmapped_area()
>>> > > > > > would return address 1ffffffffff00000
>>> > > > > > [Wed Jul 12 13:27:07 2023] but arch_get_unmapped_area() would
>>> > > > > > return address 1ffffffffff00000
>>> > > > > > [Wed Jul 12 13:27:07 2023] io_uring_mmu_get_unmapped_area()
>>> > > > > > would return address 1fffffffffe20000
>>> > > > > > [Wed Jul 12 13:27:07 2023] but arch_get_unmapped_area() would
>>> > > > > > return address 2000000002000000
>>> > > > > > [Wed Jul 12 13:27:07 2023] io_uring_mmu_get_unmapped_area()
>>> > > > > > would return address 1fffffffffe30000
>>> > > > > > [Wed Jul 12 13:27:07 2023] but arch_get_unmapped_area() would
>>> > > > > > return address 2000000002100000
>>> > > > > >
>>> > > > > > Is that enough of a clue to go on?
>>> > > > >
>>> > > > > SHMLBA on ia64 is 0x100000:
>>> > > > > arch/ia64/include/asm/shmparam.h:#define        SHMLBA  (1024*1024)
>>> > > > > but the values returned by io_uring_mmu_get_unmapped_area() do not
>>> > > > > fulfill this.
>>> > > > >
>>> > > > > So, probably ia64's SHMLBA isn't pulled in correctly in
>>> > > > > io_uring/io_uring.c.
>>> > > > > Check value of this line:
>>> > > > >      info.align_mask = PAGE_MASK & (SHMLBA - 1UL);
>>> > > > >
>>> > > > > You could also add
>>> > > > > #define SHM_COLOUR  0x100000
>>> > > > > in front of the
>>> > > > >      #ifdef SHM_COLOUR
>>> > > > > (define SHM_COLOUR in io_uring/kbuf.c too).
>>> > > >
>>> > > > What is the value of PAGE_SIZE and "ptr" on your machine?
>>> > > > For 4k page size I get:
>>> > > > SHMLBA -1   ->        FFFFF
>>> > > > PAGE_MASK   -> FFFFFFFFF000
>>> > > > so,
>>> > > > info.align_mask = PAGE_MASK & (SHMLBA - 1UL) = 0xFF000;
>>> > > > You could try to set info.align_mask = 0xfffff;
>>> > > >
>>> > > > Helge
>>> > >
>>> > > Using 64KiB (65536) PAGE_SIZE here.  64-bit pointers.
>>> > >
>>> > > Tried both #define SHM_COLOUR 0x100000 as well as info.align_mask =
>>> > > 0xFFFFF, but both of them made the problem change from 100%
>>> > > reproducible to intermittent.
>>> > >
>>> > > After inspecting the output I observed that it hangs only when the first
>>> > > allocation returns an address below 0x2000000000000000, and the second
>>> > > returns an address above it.  When both addresses are above it, it does
>>> > > not hang.  Examples:
>>> > >
>>> > > When it works:
>>> > > $ cmake --version
>>> > > cmake version 3.26.4
>>> > >
>>> > > CMake suite maintained and supported by Kitware (kitware.com/cmake).
>>> > > $ dmesg --color=always -T | tail -n 4
>>> > > [Wed Jul 12 20:32:37 2023] io_uring_mmu_get_unmapped_area() would
>>> > > return
>>> > > address 1fffffffffe20000
>>> > > [Wed Jul 12 20:32:37 2023] but arch_get_unmapped_area() would return
>>> > > address
>>> > > 2000000002000000
>>> > > [Wed Jul 12 20:32:37 2023] io_uring_mmu_get_unmapped_area() would
>>> > > return
>>> > > address 1fffffffffe50000
>>> > > [Wed Jul 12 20:32:37 2023] but arch_get_unmapped_area() would return
>>> > > address
>>> > > 2000000002100000
>>> > >
>>> > >
>>> > > When it hangs:
>>> > > $ cmake --version
>>> > > cmake version 3.26.4
>>> > >
>>> > > CMake suite maintained and supported by Kitware (kitware.com/cmake).
>>> > > ^C
>>> > > $ dmesg --color=always -T | tail -n 4
>>> > > [Wed Jul 12 20:33:12 2023] io_uring_mmu_get_unmapped_area() would
>>> > > return
>>> > > address 1ffffffffff00000
>>> > > [Wed Jul 12 20:33:12 2023] but arch_get_unmapped_area() would return
>>> > > address
>>> > > 1ffffffffff00000
>>> > > [Wed Jul 12 20:33:12 2023] io_uring_mmu_get_unmapped_area() would
>>> > > return
>>> > > address 1fffffffffe60000
>>> > > [Wed Jul 12 20:33:12 2023] but arch_get_unmapped_area() would return
>>> > > address
>>> > > 2000000001f00000
>>> > >
>>> > > Is io_uring_mmu_get_unmapped_area supposed to always return addresses
>>> > > above 0x2000000000000000?
>>> >
>>> > Yes, with the patch below.
>>> >
>>> > > Any reason why it is not doing so sometimes?
>>> >
>>> > It depends on the parameters for vm_unmapped_area(). Specifically
>>> > info.flags=0.
>>> >
>>> > Try this patch:
>>> >
>>> > diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
>>> > index 3bca7a79efda..b259794ab53b 100644
>>> > --- a/io_uring/io_uring.c
>>> > +++ b/io_uring/io_uring.c
>>> > @@ -3429,10 +3429,13 @@ static unsigned long
>>> > io_uring_mmu_get_unmapped_area(struct file *filp,
>>> >       * A failed mmap() very likely causes application failure,
>>> >       * so fall back to the bottom-up function here. This scenario
>>> >       * can happen with large stack limits and large mmap()
>>> > -     * allocations.
>>> > +     * allocations. Use bottom-up on IA64 for correct aliasing.
>>> >       */
>>> > -    addr = vm_unmapped_area(&info);
>>> > -    if (offset_in_page(addr)) {
>>> > +    if (IS_ENABLED(CONFIG_IA64))
>>> > +        addr = NULL;
>>> > +    else
>>> > +        addr = vm_unmapped_area(&info);
>>> > +    if (!addr) {
>>> >          info.flags = 0;
>>> >          info.low_limit = TASK_UNMAPPED_BASE;
>>> >          info.high_limit = mmap_end;
>>> >
>>> > Helge
>>>
>>> This patch does do the trick, but I am a little unsure if it's the right one
>>> to go in:
>>>
>>> * Adding an arch-specific conditional feels like a bad hack; why is it not
>>> working with the other vm_unmapped_area_info settings?
>>
>> because it tries to map below TASK_UNMAPPED_BASE, for which (I assume) IA-64
>> has different aliasing/caching rules. There are some comments in the arch/ia64
>> files, but I'm not an IA-64 expert...
>>
>>> * What happened to the offset_in_page check for other arches?
>>
>> I thought it's not necessary.
>>
>> But below is another (and much better) approach, which you may test.
>> I see quite a few errors with the liburing testcases on hppa, but I think
>> they are not related to this function.
>>
>> Can you test and report back?
>>
>> Helge
>>
>>
>> From 457f2c2db984bc159119bfb4426d9dc6c2779ed6 Mon Sep 17 00:00:00 2001
>> From: Helge Deller <deller@gmx.de>
>> Date: Sun, 16 Jul 2023 08:45:17 +0200
>> Subject: [PATCH] io_uring: Adjust mapping wrt architecture aliasing
>>  requirements
>>
>> When mapping memory to userspace, use the architecture-provided
>> get_unmapped_area() function instead of io_uring's own copy, which fails on
>> IA-64 since it doesn't allow mappings below TASK_UNMAPPED_BASE.
>>
>> Additionally make sure to flag the requested memory as MAP_SHARED so
>> that any architecture-specific aliasing rules will be applied.
>>
>> Reported-by: matoro <matoro_mailinglist_kernel@matoro.tk>
>> Signed-off-by: Helge Deller <deller@gmx.de>
>>
>> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
>> index 3bca7a79efda..2e7dd93e45d0 100644
>> --- a/io_uring/io_uring.c
>> +++ b/io_uring/io_uring.c
>> @@ -3398,48 +3398,27 @@ static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp,
>>              unsigned long addr, unsigned long len,
>>              unsigned long pgoff, unsigned long flags)
>>  {
>> -    const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);
>> -    struct vm_unmapped_area_info info;
>>      void *ptr;
>>
>>      /*
>>       * Do not allow to map to user-provided address to avoid breaking the
>> -     * aliasing rules. Userspace is not able to guess the offset address of
>> -     * kernel kmalloc()ed memory area.
>> +     * aliasing rules of various architectures. Userspace is not able to
>> +     * guess the offset address of kernel kmalloc()ed memory area.
>>       */
>> -    if (addr)
>> +    if (addr | (flags & MAP_FIXED))
>>          return -EINVAL;
>>
>> +    /*
>> +     * The requested memory region is required to be shared between kernel
>> +     * and userspace application.
>> +     */
>> +    flags |= MAP_SHARED;
>> +
>>      ptr = io_uring_validate_mmap_request(filp, pgoff, len);
>>      if (IS_ERR(ptr))
>>          return -ENOMEM;
>>
>> -    info.flags = VM_UNMAPPED_AREA_TOPDOWN;
>> -    info.length = len;
>> -    info.low_limit = max(PAGE_SIZE, mmap_min_addr);
>> -    info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
>> -#ifdef SHM_COLOUR
>> -    info.align_mask = PAGE_MASK & (SHM_COLOUR - 1UL);
>> -#else
>> -    info.align_mask = PAGE_MASK & (SHMLBA - 1UL);
>> -#endif
>> -    info.align_offset = (unsigned long) ptr;
>> -
>> -    /*
>> -     * A failed mmap() very likely causes application failure,
>> -     * so fall back to the bottom-up function here. This scenario
>> -     * can happen with large stack limits and large mmap()
>> -     * allocations.
>> -     */
>> -    addr = vm_unmapped_area(&info);
>> -    if (offset_in_page(addr)) {
>> -        info.flags = 0;
>> -        info.low_limit = TASK_UNMAPPED_BASE;
>> -        info.high_limit = mmap_end;
>> -        addr = vm_unmapped_area(&info);
>> -    }
>> -
>> -    return addr;
>> +    return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
>>  }
>>
>>  #else /* !CONFIG_MMU */
>
> This seems really close.  It worked for the trivial test case, so I ran the test suite from https://github.com/axboe/liburing to compare.  With kernel 6.3, I get a 100% pass; with this patch applied, I get one failure:
> Running test read-write.t
> cqe->res=33, expected=32
> test_rem_buf_single(BUFFERS + 1) failed
> Not root, skipping test_write_efbig
> Test read-write.t failed with ret 1
>
> Trying this patch out on other arches to see if it also affects them or is ia64-specific.

I'm sorry, but this patch does break parisc heavily...

I'll need to think more...

Helge
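For background on why the patch above forces MAP_SHARED: colour-aware
arch_get_unmapped_area() implementations decide whether to apply the
SHMLBA/SHM_COLOUR alignment based on the request being file-backed or shared,
roughly along these lines (a paraphrase of the common pattern, e.g. on MIPS;
not an exact quote of the parisc or ia64 code):

	int do_color_align = 0;

	if (filp || (flags & MAP_SHARED))
		do_color_align = 1;

	if (do_color_align)
		info.align_mask = PAGE_MASK & (SHMLBA - 1UL);
	else
		info.align_mask = 0;
	info.align_offset = pgoff << PAGE_SHIFT;

Forcing MAP_SHARED therefore guarantees the aliasing path is taken regardless
of how the application called mmap().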
diff mbox series

Patch

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 722624b6d0dc..3adecebbac71 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -72,6 +72,7 @@ 
 #include <linux/io_uring.h>
 #include <linux/audit.h>
 #include <linux/security.h>
+#include <asm/shmparam.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/io_uring.h>
@@ -3317,6 +3318,54 @@  static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
 	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
 }
 
+static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp,
+			unsigned long addr, unsigned long len,
+			unsigned long pgoff, unsigned long flags)
+{
+	const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);
+	struct vm_unmapped_area_info info;
+	void *ptr;
+
+	/*
+	 * Do not allow to map to user-provided address to avoid breaking the
+	 * aliasing rules. Userspace is not able to guess the offset address of
+	 * kernel kmalloc()ed memory area.
+	 */
+	if (addr)
+		return -EINVAL;
+
+	ptr = io_uring_validate_mmap_request(filp, pgoff, len);
+	if (IS_ERR(ptr))
+		return -ENOMEM;
+
+	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+	info.length = len;
+	info.low_limit = max(PAGE_SIZE, mmap_min_addr);
+	info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
+#ifdef SHM_COLOUR
+	info.align_mask = PAGE_MASK & (SHM_COLOUR - 1UL);
+#else
+	info.align_mask = PAGE_MASK & (SHMLBA - 1UL);
+#endif
+	info.align_offset = (unsigned long) ptr;
+
+	/*
+	 * A failed mmap() very likely causes application failure,
+	 * so fall back to the bottom-up function here. This scenario
+	 * can happen with large stack limits and large mmap()
+	 * allocations.
+	 */
+	addr = vm_unmapped_area(&info);
+	if (offset_in_page(addr)) {
+		info.flags = 0;
+		info.low_limit = TASK_UNMAPPED_BASE;
+		info.high_limit = mmap_end;
+		addr = vm_unmapped_area(&info);
+	}
+
+	return addr;
+}
+
 #else /* !CONFIG_MMU */
 
 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
@@ -3529,6 +3578,8 @@  static const struct file_operations io_uring_fops = {
 #ifndef CONFIG_MMU
 	.get_unmapped_area = io_uring_nommu_get_unmapped_area,
 	.mmap_capabilities = io_uring_nommu_mmap_capabilities,
+#else
+	.get_unmapped_area = io_uring_mmu_get_unmapped_area,
 #endif
 	.poll		= io_uring_poll,
 #ifdef CONFIG_PROC_FS