[RFC,v1] mm/vmalloc: fix incorrect __vmap_pages_range_noflush() when vm_area_alloc_pages() falls back from high order to order 0

Message ID 20240724181916.31776-1-hailong.liu@oppo.com (mailing list archive)
State New

Commit Message

Hailong Liu July 24, 2024, 6:19 p.m. UTC
From: "Hailong.Liu" <hailong.liu@oppo.com>

The scenario where the issue occurs is as follows:
CONFIG: vmap_allow_huge = true && PMD_SIZE is 2M
kvmalloc(2M)
    __vmalloc_node_range(vm_flags=VM_ALLOW_HUGE_VMAP)
        vm_area_alloc_pages(order=9) ---> order-9 allocation fails and falls back
                                          to order-0, with phys_addr aligned to PMD_SIZE
            vmap_pages_range
                vmap_pages_range_noflush
                    __vmap_pages_range_noflush(page_shift = 21) ----> incorrectly vmaps *huge* here

Fix it by storing VM_AREA_ALLOC_PAGES_FALLBACK in page->private of the first
page when the allocation falls back to order 0.
Fixes: e9c3cda4d86e ("mm, vmalloc: fix high order __GFP_NOFAIL allocations")

CC: Barry Song <21cnbao@gmail.com>
Reported-by: Tangquan.Zheng <zhengtangquan@oppo.com>
Signed-off-by: Hailong.Liu <hailong.liu@oppo.com>
---
 mm/vmalloc.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)
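
The core assumption that breaks here: with page_shift = PMD_SHIFT,
__vmap_pages_range_noflush() maps each 512-page group through
page_to_phys(pages[i]) alone, so every group must be physically
contiguous -- which an order-0 fallback cannot guarantee. A minimal
user-space sketch of that assumption (illustrative addresses and names,
not kernel code):

	#include <stdbool.h>
	#include <stdio.h>

	#define PAGE_SHIFT	12
	#define PMD_SHIFT	21
	#define NR_PAGES	(1u << (PMD_SHIFT - PAGE_SHIFT))	/* 512 */

	/*
	 * A PMD mapping is built from phys[0] only; it is correct only if
	 * the whole group sits physically contiguous behind it.
	 */
	static bool group_is_contiguous(const unsigned long *phys, unsigned int n)
	{
		for (unsigned int i = 1; i < n; i++)
			if (phys[i] != phys[0] + ((unsigned long)i << PAGE_SHIFT))
				return false;
		return true;
	}

	int main(void)
	{
		unsigned long order9[NR_PAGES], order0[NR_PAGES];

		for (unsigned int i = 0; i < NR_PAGES; i++) {
			/* order-9 success: one contiguous 2M block */
			order9[i] = 0x40000000UL + ((unsigned long)i << PAGE_SHIFT);
			/* order-0 fallback: pages can land anywhere; model
			 * that with a scattered permutation */
			order0[i] = 0x40000000UL +
				((unsigned long)(i * 7 % NR_PAGES) << PAGE_SHIFT);
		}

		printf("order-9 alloc:    huge map safe? %d\n",
		       group_is_contiguous(order9, NR_PAGES));
		printf("order-0 fallback: huge map safe? %d\n",
		       group_is_contiguous(order0, NR_PAGES));
		return 0;
	}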

Comments

Hailong Liu July 24, 2024, 6:28 p.m. UTC | #1
On Thu, 25. Jul 02:19, hailong.liu@oppo.com wrote:
> From: "Hailong.Liu" <hailong.liu@oppo.com>
>
> The scenario where the issue occurs is as follows:
> CONFIG: vmap_allow_huge = true && 2M is for PMD_SIZE
> kvmalloc(2M)
>     __vmalloc_node_range(vm_flags=VM_ALLOW_HUGE_VMAP)
>         vm_area_alloc_pages(order=9) --->allocs order9 failed and fallback to order0
>                                         and phys_addr is aligned with PMD_SIZE
>             vmap_pages_range
>                 vmap_pages_range_noflush
>                     __vmap_pages_range_noflush(page_shift = 21) ----> incorrect vmap *huge* here
>
> Fix it by introducing VM_AREA_ALLOC_PAGES_FALLBACK in page->private if fallback to 0.
> Fixes: e9c3cda4d86e ("mm, vmalloc: fix high order __GFP_NOFAIL allocations")
>
> CC: Barry Song <21cnbao@gmail.com>
> Reported-by: Tangquan.Zheng <zhengtangquan@oppo.com>
> Signed-off-by: Hailong.Liu <hailong.liu@oppo.com>
> ---
>  mm/vmalloc.c | 14 ++++++++++++--
>  1 file changed, 12 insertions(+), 2 deletions(-)
>
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 03c78fae06f3..b35dfd3eeee3 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -75,6 +75,8 @@ early_param("nohugevmalloc", set_nohugevmalloc);
>  static const bool vmap_allow_huge = false;
>  #endif	/* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
>
> +#define VM_AREA_ALLOC_PAGES_FALLBACK 0x1
> +
>  bool is_vmalloc_addr(const void *x)
>  {
>  	unsigned long addr = (unsigned long)kasan_reset_tag(x);
> @@ -604,8 +606,13 @@ int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
>  	WARN_ON(page_shift < PAGE_SHIFT);
>
>  	if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
> -			page_shift == PAGE_SHIFT)
> -		return vmap_small_pages_range_noflush(addr, end, prot, pages);
> +			page_shift == PAGE_SHIFT ||
> +			page_private(pages[0]) == VM_AREA_ALLOC_PAGES_FALLBACK) {
> +		int ret = vmap_small_pages_range_noflush(addr, end, prot, pages);
> +
> +		set_page_private(pages[0], 0);
> +		return ret;
> +	}
>
>  	for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
>  		int err;
> @@ -3583,6 +3590,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
>
>  			/* fall back to the zero order allocations */
>  			alloc_gfp |= __GFP_NOFAIL;
> +			fallback = true;
Sorry for my mistake, I forgot to declare fallback here.
BTW, this is not the optimal solution. Does anyone have a better idea? Glad to
hear :)
>  			order = 0;
>  			continue;
>  		}
> @@ -3608,6 +3616,8 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
>  		cond_resched();
>  		nr_allocated += 1U << order;
>  	}
> +	if (nr_allocated && fallback)
> +		set_page_private(pages[0], VM_AREA_ALLOC_PAGES_FALLBACK);
>
>  	return nr_allocated;
>  }
> --
> 2.34.1
>

--
help you, help me,
Hailong.
Matthew Wilcox July 24, 2024, 8:02 p.m. UTC | #2
On Thu, Jul 25, 2024 at 02:28:27AM +0800, Hailong.Liu wrote:
> >  	if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
> > -			page_shift == PAGE_SHIFT)
> > -		return vmap_small_pages_range_noflush(addr, end, prot, pages);
> > +			page_shift == PAGE_SHIFT ||
> > +			page_private(pages[0]) == VM_AREA_ALLOC_PAGES_FALLBACK) {
> > +		int ret = vmap_small_pages_range_noflush(addr, end, prot, pages);
> > +
> > +		set_page_private(pages[0], 0);
> > +		return ret;
> > +	}
> >
> >  	for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
> >  		int err;
> > @@ -3583,6 +3590,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
> >
> >  			/* fall back to the zero order allocations */
> >  			alloc_gfp |= __GFP_NOFAIL;
> > +			fallback = true;
> Sry for my mistake, I forget define fallback here.
> BTW, This is not the optimal solution. Does anyone have a better idea? Glad to
> hear:)

Yeah, I really don't like this approach.  You could return a small
struct indicating both nr_allocated and whether you had to fall back.
Or you could pass a bool * parameter.  They're both pretty nasty.
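
For concreteness, a sketch of the first option -- every name here is
hypothetical, not an API anyone has posted:

	/* Hypothetical return type bundling the count with the fallback bit. */
	struct vm_pages_result {
		unsigned int nr_allocated;	/* pages placed in pages[] */
		bool fell_back;			/* some high-order request fell back to order 0 */
	};

	/* Caller side (sketch): downgrade the mapping granularity when any
	 * group may be non-contiguous. */
	struct vm_pages_result res = vm_area_alloc_pages(gfp, nid, order, nr, pages);
	unsigned int shift = res.fell_back ? PAGE_SHIFT : page_shift;

The bool * variant would carry the same single bit through an
out-parameter instead of the return value.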
Barry Song July 24, 2024, 10:11 p.m. UTC | #3
On Thu, Jul 25, 2024 at 8:02 AM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Thu, Jul 25, 2024 at 02:28:27AM +0800, Hailong.Liu wrote:
> > >     if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
> > > -                   page_shift == PAGE_SHIFT)
> > > -           return vmap_small_pages_range_noflush(addr, end, prot, pages);
> > > +                   page_shift == PAGE_SHIFT ||
> > > +                   page_private(pages[0]) == VM_AREA_ALLOC_PAGES_FALLBACK) {
> > > +           int ret = vmap_small_pages_range_noflush(addr, end, prot, pages);
> > > +
> > > +           set_page_private(pages[0], 0);
> > > +           return ret;
> > > +   }
> > >
> > >     for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
> > >             int err;
> > > @@ -3583,6 +3590,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
> > >
> > >                     /* fall back to the zero order allocations */
> > >                     alloc_gfp |= __GFP_NOFAIL;
> > > +                   fallback = true;
> > Sry for my mistake, I forget define fallback here.
> > BTW, This is not the optimal solution. Does anyone have a better idea? Glad to
> > hear:)
>
> Yeah, I really don't like this approach.  You could return a small
> struct indicating both nr_allocated and whether you had to fall back.
> Or you could pass a bool * parameter.  They're both pretty nasty.

Yes, I feel returning a bool won't work very well.

The result could be a mixture of PMD and PTE mappings if the allocated
pages cover more than one PMD.

For example, if we allocate 8MB, it might result in the first 4MB being
two PMDs and the remaining 4MB being order-0 PTE pages.

I am also curious what will happen if we allocate 3MB (1 PMD + some PTEs);
is the code below doing the correct mapping?

        do {
                ret = vmap_pages_range(addr, addr + size, prot, area->pages,
                        page_shift);
                if (nofail && (ret < 0))
                        schedule_timeout_uninterruptible(1);
        } while (nofail && (ret < 0));

Is it possible that we have only mapped the first 2MB if page_shift is PMD_SHIFT?

Thanks
Barry
Barry Song July 24, 2024, 10:23 p.m. UTC | #4
On Thu, Jul 25, 2024 at 6:19 AM <hailong.liu@oppo.com> wrote:
>
> From: "Hailong.Liu" <hailong.liu@oppo.com>
>
> The scenario where the issue occurs is as follows:
> CONFIG: vmap_allow_huge = true && 2M is for PMD_SIZE
> kvmalloc(2M)
>     __vmalloc_node_range(vm_flags=VM_ALLOW_HUGE_VMAP)
>         vm_area_alloc_pages(order=9) --->allocs order9 failed and fallback to order0
>                                         and phys_addr is aligned with PMD_SIZE
>             vmap_pages_range
>                 vmap_pages_range_noflush
>                     __vmap_pages_range_noflush(page_shift = 21) ----> incorrect vmap *huge* here
>
> Fix it by introducing VM_AREA_ALLOC_PAGES_FALLBACK in page->private if fallback to 0.
> Fixes: e9c3cda4d86e ("mm, vmalloc: fix high order __GFP_NOFAIL allocations")
>
> CC: Barry Song <21cnbao@gmail.com>
> Reported-by: Tangquan.Zheng <zhengtangquan@oppo.com>
> Signed-off-by: Hailong.Liu <hailong.liu@oppo.com>
> ---
>  mm/vmalloc.c | 14 ++++++++++++--
>  1 file changed, 12 insertions(+), 2 deletions(-)
>
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 03c78fae06f3..b35dfd3eeee3 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -75,6 +75,8 @@ early_param("nohugevmalloc", set_nohugevmalloc);
>  static const bool vmap_allow_huge = false;
>  #endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
>
> +#define VM_AREA_ALLOC_PAGES_FALLBACK 0x1
> +
>  bool is_vmalloc_addr(const void *x)
>  {
>         unsigned long addr = (unsigned long)kasan_reset_tag(x);
> @@ -604,8 +606,13 @@ int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
>         WARN_ON(page_shift < PAGE_SHIFT);
>
>         if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
> -                       page_shift == PAGE_SHIFT)
> -               return vmap_small_pages_range_noflush(addr, end, prot, pages);
> +                       page_shift == PAGE_SHIFT ||
> +                       page_private(pages[0]) == VM_AREA_ALLOC_PAGES_FALLBACK) {
> +               int ret = vmap_small_pages_range_noflush(addr, end, prot, pages);
> +
> +               set_page_private(pages[0], 0);
> +               return ret;
> +       }

We could have more than one *serious* bug here. Do we also need the change
below when ((end - start) % PMD_SIZE) != 0?

int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
                pgprot_t prot, struct page **pages, unsigned int page_shift)
{
        unsigned int i, nr = (end - addr) >> PAGE_SHIFT;

        WARN_ON(page_shift < PAGE_SHIFT);

        if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
                        page_shift == PAGE_SHIFT)
                return vmap_small_pages_range_noflush(addr, end, prot, pages);

        for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
                int err;

                err = vmap_range_noflush(addr, addr + (1UL << page_shift),
                                        page_to_phys(pages[i]), prot,
                                        page_shift);
                if (err)
                        return err;

                addr += 1UL << page_shift;
        }

+        if (addr < end)
+                return vmap_small_pages_range_noflush(addr, end, prot,
+                                                      pages + i);

        return 0;
}
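
With the 3MB example from #3 (assuming 4K pages, a 2M PMD, and a range not
already rounded up to PMD_SIZE by the caller): nr = 768, so the loop runs
i = 0 and i = 512, and the second vmap_range_noflush() call maps a full 2MB
at page_to_phys(pages[512]) even though only 256 pages remain, overrunning
both end and the allocation. Note that for the addr < end tail above to
trigger at all, the loop bound would also need to stop before a partial
group, e.g. i + (1U << (page_shift - PAGE_SHIFT)) <= nr; as written, the
loop always advances addr to at least end before exiting.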

>
>         for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
>                 int err;
> @@ -3583,6 +3590,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
>
>                         /* fall back to the zero order allocations */
>                         alloc_gfp |= __GFP_NOFAIL;
> +                       fallback = true;
>                         order = 0;
>                         continue;
>                 }
> @@ -3608,6 +3616,8 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
>                 cond_resched();
>                 nr_allocated += 1U << order;
>         }
> +       if (nr_allocated && fallback)
> +               set_page_private(pages[0], VM_AREA_ALLOC_PAGES_FALLBACK);
>
>         return nr_allocated;
>  }
> --
> 2.34.1
>
Hailong Liu July 25, 2024, 6:15 a.m. UTC | #5
On Wed, 24. Jul 21:02, Matthew Wilcox wrote:
> On Thu, Jul 25, 2024 at 02:28:27AM +0800, Hailong.Liu wrote:
> > >  	if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
> > > -			page_shift == PAGE_SHIFT)
> > > -		return vmap_small_pages_range_noflush(addr, end, prot, pages);
> > > +			page_shift == PAGE_SHIFT ||
> > > +			page_private(pages[0]) == VM_AREA_ALLOC_PAGES_FALLBACK) {
> > > +		int ret = vmap_small_pages_range_noflush(addr, end, prot, pages);
> > > +
> > > +		set_page_private(pages[0], 0);
> > > +		return ret;
> > > +	}
> > >
> > >  	for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
> > >  		int err;
> > > @@ -3583,6 +3590,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
> > >
> > >  			/* fall back to the zero order allocations */
> > >  			alloc_gfp |= __GFP_NOFAIL;
> > > +			fallback = true;
> > Sry for my mistake, I forget define fallback here.
> > BTW, This is not the optimal solution. Does anyone have a better idea? Glad to
> > hear:)
>
> Yeah, I really don't like this approach.  You could return a small
> struct indicating both nr_allocated and whether you had to fall back.
> Or you could pass a bool * parameter.  They're both pretty nasty.
Agreed. Thanks for pointing that out. I sent an RFC v2 patch with a different
solution; please help review:
https://lore.kernel.org/all/20240725035318.471-1-hailong.liu@oppo.com/T/#u

--
help you, help me,
Hailong.

Patch

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 03c78fae06f3..b35dfd3eeee3 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -75,6 +75,8 @@  early_param("nohugevmalloc", set_nohugevmalloc);
 static const bool vmap_allow_huge = false;
 #endif	/* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
 
+#define VM_AREA_ALLOC_PAGES_FALLBACK 0x1
+
 bool is_vmalloc_addr(const void *x)
 {
 	unsigned long addr = (unsigned long)kasan_reset_tag(x);
@@ -604,8 +606,13 @@  int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
 	WARN_ON(page_shift < PAGE_SHIFT);
 
 	if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
-			page_shift == PAGE_SHIFT)
-		return vmap_small_pages_range_noflush(addr, end, prot, pages);
+			page_shift == PAGE_SHIFT ||
+			page_private(pages[0]) == VM_AREA_ALLOC_PAGES_FALLBACK) {
+		int ret = vmap_small_pages_range_noflush(addr, end, prot, pages);
+
+		set_page_private(pages[0], 0);
+		return ret;
+	}
 
 	for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
 		int err;
@@ -3583,6 +3590,7 @@  vm_area_alloc_pages(gfp_t gfp, int nid,
 
 			/* fall back to the zero order allocations */
 			alloc_gfp |= __GFP_NOFAIL;
+			fallback = true;
 			order = 0;
 			continue;
 		}
@@ -3608,6 +3616,8 @@  vm_area_alloc_pages(gfp_t gfp, int nid,
 		cond_resched();
 		nr_allocated += 1U << order;
 	}
+	if (nr_allocated && fallback)
+		set_page_private(pages[0], VM_AREA_ALLOC_PAGES_FALLBACK);
 
 	return nr_allocated;
 }
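
Note on the diff above: as acknowledged in comment #1, the hunks set and
test fallback, but no hunk declares it, so the patch as posted does not
compile. A minimal sketch of the missing piece -- placement assumed, since
the surrounding context of vm_area_alloc_pages() is not part of this diff:

	static unsigned int
	vm_area_alloc_pages(gfp_t gfp, int nid, unsigned int order,
			    unsigned int nr_pages, struct page **pages)
	{
		unsigned int nr_allocated = 0;
		bool fallback = false;	/* assumed: set when falling back to order 0 */

		/* ... existing allocation loop with the hunks above applied ... */
	}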