Message ID | 20240529-fuse-uring-for-6-9-rfc2-out-v1-6-d149476b1d65@ddn.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | fuse: fuse-over-io-uring | expand |
On Wed, May 29, 2024 at 08:00:41PM +0200, Bernd Schubert wrote: > This is to have a numa aware vmalloc function for memory exposed to > userspace. Fuse uring will allocate queue memory using this > new function. > > Signed-off-by: Bernd Schubert <bschubert@ddn.com> > cc: Andrew Morton <akpm@linux-foundation.org> > cc: linux-mm@kvack.org > Acked-by: Andrew Morton <akpm@linux-foundation.org> > --- > include/linux/vmalloc.h | 1 + > mm/nommu.c | 6 ++++++ > mm/vmalloc.c | 41 +++++++++++++++++++++++++++++++++++++---- > 3 files changed, 44 insertions(+), 4 deletions(-) > > diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h > index 98ea90e90439..e7645702074e 100644 > --- a/include/linux/vmalloc.h > +++ b/include/linux/vmalloc.h > @@ -141,6 +141,7 @@ static inline unsigned long vmalloc_nr_pages(void) { return 0; } > extern void *vmalloc(unsigned long size) __alloc_size(1); > extern void *vzalloc(unsigned long size) __alloc_size(1); > extern void *vmalloc_user(unsigned long size) __alloc_size(1); > +extern void *vmalloc_node_user(unsigned long size, int node) __alloc_size(1); > extern void *vmalloc_node(unsigned long size, int node) __alloc_size(1); > extern void *vzalloc_node(unsigned long size, int node) __alloc_size(1); > extern void *vmalloc_32(unsigned long size) __alloc_size(1); > diff --git a/mm/nommu.c b/mm/nommu.c > index 5ec8f44e7ce9..207ddf639aa9 100644 > --- a/mm/nommu.c > +++ b/mm/nommu.c > @@ -185,6 +185,12 @@ void *vmalloc_user(unsigned long size) > } > EXPORT_SYMBOL(vmalloc_user); > > +void *vmalloc_node_user(unsigned long size, int node) > +{ > + return __vmalloc_user_flags(size, GFP_KERNEL | __GFP_ZERO); > +} > +EXPORT_SYMBOL(vmalloc_node_user); > + > struct page *vmalloc_to_page(const void *addr) > { > return virt_to_page(addr); > diff --git a/mm/vmalloc.c b/mm/vmalloc.c > index 68fa001648cc..0ac2f44b2b1f 100644 > --- a/mm/vmalloc.c > +++ b/mm/vmalloc.c > @@ -3958,6 +3958,25 @@ void *vzalloc(unsigned long size) > } > EXPORT_SYMBOL(vzalloc); > > 
+/** > + * _vmalloc_node_user - allocate zeroed virtually contiguous memory for userspace > + * on the given numa node > + * @size: allocation size > + * @node: numa node > + * > + * The resulting memory area is zeroed so it can be mapped to userspace > + * without leaking data. > + * > + * Return: pointer to the allocated memory or %NULL on error > + */ > +static void *_vmalloc_node_user(unsigned long size, int node) > +{ > + return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, > + GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, > + VM_USERMAP, node, > + __builtin_return_address(0)); > +} > + Looking at the rest of vmalloc it seems like adding an extra variant to do the special thing is overkill, I think it would be fine to just have void *vmalloc_nod_user(unsigned long size, int node) { return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, VM_USERMAP, node, __builtin_return_address(0)); } instead of creating a _vmalloc_node_user(). Also as an aside, this is definitely being used by this series, but I think it would be good to go ahead and send this by itself with just the explanation that it's going to be used by the fuse iouring stuff later, that way you can get this merged and continue working on the iouring part. This also goes for the other prep patches earlier this this series, but since those are fuse related it's probably fine to just keep shipping them with this series. Thanks, Josef
On 5/30/24 17:10, Josef Bacik wrote: > On Wed, May 29, 2024 at 08:00:41PM +0200, Bernd Schubert wrote: >> This is to have a numa aware vmalloc function for memory exposed to >> userspace. Fuse uring will allocate queue memory using this >> new function. >> >> Signed-off-by: Bernd Schubert <bschubert@ddn.com> >> cc: Andrew Morton <akpm@linux-foundation.org> >> cc: linux-mm@kvack.org >> Acked-by: Andrew Morton <akpm@linux-foundation.org> >> --- >> include/linux/vmalloc.h | 1 + >> mm/nommu.c | 6 ++++++ >> mm/vmalloc.c | 41 +++++++++++++++++++++++++++++++++++++---- >> 3 files changed, 44 insertions(+), 4 deletions(-) >> >> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h >> index 98ea90e90439..e7645702074e 100644 >> --- a/include/linux/vmalloc.h >> +++ b/include/linux/vmalloc.h >> @@ -141,6 +141,7 @@ static inline unsigned long vmalloc_nr_pages(void) { return 0; } >> extern void *vmalloc(unsigned long size) __alloc_size(1); >> extern void *vzalloc(unsigned long size) __alloc_size(1); >> extern void *vmalloc_user(unsigned long size) __alloc_size(1); >> +extern void *vmalloc_node_user(unsigned long size, int node) __alloc_size(1); >> extern void *vmalloc_node(unsigned long size, int node) __alloc_size(1); >> extern void *vzalloc_node(unsigned long size, int node) __alloc_size(1); >> extern void *vmalloc_32(unsigned long size) __alloc_size(1); >> diff --git a/mm/nommu.c b/mm/nommu.c >> index 5ec8f44e7ce9..207ddf639aa9 100644 >> --- a/mm/nommu.c >> +++ b/mm/nommu.c >> @@ -185,6 +185,12 @@ void *vmalloc_user(unsigned long size) >> } >> EXPORT_SYMBOL(vmalloc_user); >> >> +void *vmalloc_node_user(unsigned long size, int node) >> +{ >> + return __vmalloc_user_flags(size, GFP_KERNEL | __GFP_ZERO); >> +} >> +EXPORT_SYMBOL(vmalloc_node_user); >> + >> struct page *vmalloc_to_page(const void *addr) >> { >> return virt_to_page(addr); >> diff --git a/mm/vmalloc.c b/mm/vmalloc.c >> index 68fa001648cc..0ac2f44b2b1f 100644 >> --- a/mm/vmalloc.c >> +++ b/mm/vmalloc.c >> @@ 
-3958,6 +3958,25 @@ void *vzalloc(unsigned long size) >> } >> EXPORT_SYMBOL(vzalloc); >> >> +/** >> + * _vmalloc_node_user - allocate zeroed virtually contiguous memory for userspace >> + * on the given numa node >> + * @size: allocation size >> + * @node: numa node >> + * >> + * The resulting memory area is zeroed so it can be mapped to userspace >> + * without leaking data. >> + * >> + * Return: pointer to the allocated memory or %NULL on error >> + */ >> +static void *_vmalloc_node_user(unsigned long size, int node) >> +{ >> + return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, >> + GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, >> + VM_USERMAP, node, >> + __builtin_return_address(0)); >> +} >> + > > Looking at the rest of vmalloc it seems like adding an extra variant to do the > special thing is overkill, I think it would be fine to just have > > void *vmalloc_nod_user(unsigned long size, int node) > { > return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, > GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, > VM_USERMAP, node, > __builtin_return_address(0)); > } > > instead of creating a _vmalloc_node_user(). No issue with me either. I had done it like this as there are basically two caller wit the same flags - vmalloc_user(size, NUMA_NO_NODE) and the new vmalloc_node_user(size, node). > > Also as an aside, this is definitely being used by this series, but I think it > would be good to go ahead and send this by itself with just the explanation that > it's going to be used by the fuse iouring stuff later, that way you can get this > merged and continue working on the iouring part. Thanks for your advise, will submit it separately. If the for now used export is acceptable it would also help me, as we have back ports of these patches. > > This also goes for the other prep patches earlier this this series, but since > those are fuse related it's probably fine to just keep shipping them with this > series. 
Thanks, Thanks again for your help and reviews! Bernd
On Wed, May 29, 2024 at 08:00:41PM +0200, Bernd Schubert wrote: > This is to have a numa aware vmalloc function for memory exposed to > userspace. Fuse uring will allocate queue memory using this > new function. > > Signed-off-by: Bernd Schubert <bschubert@ddn.com> > cc: Andrew Morton <akpm@linux-foundation.org> > cc: linux-mm@kvack.org > Acked-by: Andrew Morton <akpm@linux-foundation.org> > --- > include/linux/vmalloc.h | 1 + > mm/nommu.c | 6 ++++++ > mm/vmalloc.c | 41 +++++++++++++++++++++++++++++++++++++---- > 3 files changed, 44 insertions(+), 4 deletions(-) > > diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h > index 98ea90e90439..e7645702074e 100644 > --- a/include/linux/vmalloc.h > +++ b/include/linux/vmalloc.h > @@ -141,6 +141,7 @@ static inline unsigned long vmalloc_nr_pages(void) { return 0; } > extern void *vmalloc(unsigned long size) __alloc_size(1); > extern void *vzalloc(unsigned long size) __alloc_size(1); > extern void *vmalloc_user(unsigned long size) __alloc_size(1); > +extern void *vmalloc_node_user(unsigned long size, int node) __alloc_size(1); > extern void *vmalloc_node(unsigned long size, int node) __alloc_size(1); > extern void *vzalloc_node(unsigned long size, int node) __alloc_size(1); > extern void *vmalloc_32(unsigned long size) __alloc_size(1); > diff --git a/mm/nommu.c b/mm/nommu.c > index 5ec8f44e7ce9..207ddf639aa9 100644 > --- a/mm/nommu.c > +++ b/mm/nommu.c > @@ -185,6 +185,12 @@ void *vmalloc_user(unsigned long size) > } > EXPORT_SYMBOL(vmalloc_user); > > +void *vmalloc_node_user(unsigned long size, int node) > +{ > + return __vmalloc_user_flags(size, GFP_KERNEL | __GFP_ZERO); > +} > +EXPORT_SYMBOL(vmalloc_node_user); > + > struct page *vmalloc_to_page(const void *addr) > { > return virt_to_page(addr); > diff --git a/mm/vmalloc.c b/mm/vmalloc.c > index 68fa001648cc..0ac2f44b2b1f 100644 > --- a/mm/vmalloc.c > +++ b/mm/vmalloc.c > @@ -3958,6 +3958,25 @@ void *vzalloc(unsigned long size) > } > EXPORT_SYMBOL(vzalloc); > > 
+/** > + * _vmalloc_node_user - allocate zeroed virtually contiguous memory for userspace Please avoid the overly long line. > + * on the given numa node > + * @size: allocation size > + * @node: numa node > + * > + * The resulting memory area is zeroed so it can be mapped to userspace > + * without leaking data. > + * > + * Return: pointer to the allocated memory or %NULL on error > + */ > +static void *_vmalloc_node_user(unsigned long size, int node) Although for static functions kerneldoc comments are pretty silly to start with. > void *vmalloc_user(unsigned long size) > { > - return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, > - GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, > - VM_USERMAP, NUMA_NO_NODE, > - __builtin_return_address(0)); > + return _vmalloc_node_user(size, NUMA_NO_NODE); But I suspect simply adding a gfp_t argument to vmalloc_node might be a much easier to use interface here, even if it would need a sanity check to only allow for actually useful to vmalloc flags.
On Fri, May 31, 2024 at 06:56:01AM -0700, Christoph Hellwig wrote: > > void *vmalloc_user(unsigned long size) > > { > > - return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, > > - GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, > > - VM_USERMAP, NUMA_NO_NODE, > > - __builtin_return_address(0)); > > + return _vmalloc_node_user(size, NUMA_NO_NODE); > > But I suspect simply adding a gfp_t argument to vmalloc_node might be > a much easier to use interface here, even if it would need a sanity > check to only allow for actually useful to vmalloc flags. vmalloc doesn't properly support gfp flags due to page table allocation
On 6/3/24 17:59, Kent Overstreet wrote: > On Fri, May 31, 2024 at 06:56:01AM -0700, Christoph Hellwig wrote: >>> void *vmalloc_user(unsigned long size) >>> { >>> - return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, >>> - GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, >>> - VM_USERMAP, NUMA_NO_NODE, >>> - __builtin_return_address(0)); >>> + return _vmalloc_node_user(size, NUMA_NO_NODE); >> >> But I suspect simply adding a gfp_t argument to vmalloc_node might be >> a much easier to use interface here, even if it would need a sanity >> check to only allow for actually useful to vmalloc flags. > > vmalloc doesn't properly support gfp flags due to page table allocation Thanks Kent, I had actually totally misunderstood what Christoph meant. I might miss something, but vmalloc_node looks quite different to vmalloc_user / vmalloc_node_user void *vmalloc_user(unsigned long size) { return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, VM_USERMAP, NUMA_NO_NODE, __builtin_return_address(0)); } vs void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, int node, const void *caller) { return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, gfp_mask, PAGE_KERNEL, 0, node, caller); } void *vmalloc_node(unsigned long size, int node) { return __vmalloc_node(size, 1, GFP_KERNEL, node, __builtin_return_address(0)); } If we wanted to avoid another export, shouldn't we better rename vmalloc_user to vmalloc_node_user, add the node argument and change all callers? Anyway, I will send the current patch separately to linux-mm and will ask if it can get merged before the fuse patches. Thanks, Bernd
On Mon, Jun 03, 2024 at 11:59:01AM -0400, Kent Overstreet wrote: > On Fri, May 31, 2024 at 06:56:01AM -0700, Christoph Hellwig wrote: > > > void *vmalloc_user(unsigned long size) > > > { > > > - return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, > > > - GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, > > > - VM_USERMAP, NUMA_NO_NODE, > > > - __builtin_return_address(0)); > > > + return _vmalloc_node_user(size, NUMA_NO_NODE); > > > > But I suspect simply adding a gfp_t argument to vmalloc_node might be > > a much easier to use interface here, even if it would need a sanity > > check to only allow for actually useful to vmalloc flags. > > vmalloc doesn't properly support gfp flags due to page table allocation Which I tried to cover by the above "to only allow for actually useful to vmalloc flags". I.e. the __GFP_ZERO used here is useful, as would be a GFP_USER which we'd probably actually want here as well.
On Mon, Jun 03, 2024 at 07:24:03PM +0000, Bernd Schubert wrote: > void *vmalloc_node(unsigned long size, int node) > { > return __vmalloc_node(size, 1, GFP_KERNEL, node, > __builtin_return_address(0)); > } > > > > > If we wanted to avoid another export, shouldn't we better rename > vmalloc_user to vmalloc_node_user, add the node argument and change > all callers? > > Anyway, I will send the current patch separately to linux-mm and will ask > if it can get merged before the fuse patches. Well, the GFP flags exist to avoid needing a gazillion of variants of everything build around the page allocator. For vmalloc we can't, as Kent rightly said, support GFP_NOFS and GFP_NOIO and need to use the scopes instead, and we should warn about that (which __vmalloc doesn't and could use some fixes for).
On Mon, Jun 03, 2024 at 09:20:10PM -0700, Christoph Hellwig wrote: > On Mon, Jun 03, 2024 at 07:24:03PM +0000, Bernd Schubert wrote: > > void *vmalloc_node(unsigned long size, int node) > > { > > return __vmalloc_node(size, 1, GFP_KERNEL, node, > > __builtin_return_address(0)); > > } > > > > > > > > > > If we wanted to avoid another export, shouldn't we better rename > > vmalloc_user to vmalloc_node_user, add the node argument and change > > all callers? > > > > Anyway, I will send the current patch separately to linux-mm and will ask > > if it can get merged before the fuse patches. > > Well, the GFP flags exist to avoid needing a gazillion of variants of > everything build around the page allocator. For vmalloc we can't, as > Kent rightly said, support GFP_NOFS and GFP_NOIO and need to use the > scopes instead, and we should warn about that (which __vmalloc doesn't > and could use some fixes for). Perhaps before going any further here, we should refresh our memories on what the vmalloc code actually does these days? 
__vmalloc_area_node() does this when mapping the pages: /* * page tables allocations ignore external gfp mask, enforce it * by the scope API */ if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) flags = memalloc_nofs_save(); else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) flags = memalloc_noio_save(); do { ret = vmap_pages_range(addr, addr + size, prot, area->pages, page_shift); if (nofail && (ret < 0)) schedule_timeout_uninterruptible(1); } while (nofail && (ret < 0)); if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) memalloc_nofs_restore(flags); else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) memalloc_noio_restore(flags); IOWs, vmalloc() has obeyed GFP_NOFS/GFP_NOIO constraints properly for since early 2022 and there isn't a need to wrap it with scopes just to do a single constrained allocation: commit 451769ebb7e792c3404db53b3c2a422990de654e Author: Michal Hocko <mhocko@suse.com> Date: Fri Jan 14 14:06:57 2022 -0800 mm/vmalloc: alloc GFP_NO{FS,IO} for vmalloc Patch series "extend vmalloc support for constrained allocations", v2. Based on a recent discussion with Dave and Neil [1] I have tried to implement NOFS, NOIO, NOFAIL support for the vmalloc to make life of kvmalloc users easier. ..... Add support for GFP_NOFS and GFP_NOIO to vmalloc directly. All internal allocations already comply with the given gfp_mask. The only current exception is vmap_pages_range which maps kernel page tables. Infer the proper scope API based on the given gfp mask. ..... -Dave.
On Fri, Jun 07, 2024 at 12:30:27PM +1000, Dave Chinner wrote: > IOWs, vmalloc() has obeyed GFP_NOFS/GFP_NOIO constraints properly > for since early 2022 and there isn't a need to wrap it with scopes > just to do a single constrained allocation: Perfect. Doesn't change that we still need some amount of filtering, e.g. GFP_COMP and vmalloc won't mix too well.
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 98ea90e90439..e7645702074e 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -141,6 +141,7 @@ static inline unsigned long vmalloc_nr_pages(void) { return 0; } extern void *vmalloc(unsigned long size) __alloc_size(1); extern void *vzalloc(unsigned long size) __alloc_size(1); extern void *vmalloc_user(unsigned long size) __alloc_size(1); +extern void *vmalloc_node_user(unsigned long size, int node) __alloc_size(1); extern void *vmalloc_node(unsigned long size, int node) __alloc_size(1); extern void *vzalloc_node(unsigned long size, int node) __alloc_size(1); extern void *vmalloc_32(unsigned long size) __alloc_size(1); diff --git a/mm/nommu.c b/mm/nommu.c index 5ec8f44e7ce9..207ddf639aa9 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -185,6 +185,12 @@ void *vmalloc_user(unsigned long size) } EXPORT_SYMBOL(vmalloc_user); +void *vmalloc_node_user(unsigned long size, int node) +{ + return __vmalloc_user_flags(size, GFP_KERNEL | __GFP_ZERO); +} +EXPORT_SYMBOL(vmalloc_node_user); + struct page *vmalloc_to_page(const void *addr) { return virt_to_page(addr); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 68fa001648cc..0ac2f44b2b1f 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3958,6 +3958,25 @@ void *vzalloc(unsigned long size) } EXPORT_SYMBOL(vzalloc); +/** + * _vmalloc_node_user - allocate zeroed virtually contiguous memory for userspace + * on the given numa node + * @size: allocation size + * @node: numa node + * + * The resulting memory area is zeroed so it can be mapped to userspace + * without leaking data. 
+ * + * Return: pointer to the allocated memory or %NULL on error + */ +static void *_vmalloc_node_user(unsigned long size, int node) +{ + return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, + GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, + VM_USERMAP, node, + __builtin_return_address(0)); +} + /** * vmalloc_user - allocate zeroed virtually contiguous memory for userspace * @size: allocation size @@ -3969,13 +3988,27 @@ EXPORT_SYMBOL(vzalloc); */ void *vmalloc_user(unsigned long size) { - return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, - GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, - VM_USERMAP, NUMA_NO_NODE, - __builtin_return_address(0)); + return _vmalloc_node_user(size, NUMA_NO_NODE); } EXPORT_SYMBOL(vmalloc_user); +/** + * vmalloc_user - allocate zeroed virtually contiguous memory for userspace on + * a numa node + * @size: allocation size + * @node: numa node + * + * The resulting memory area is zeroed so it can be mapped to userspace + * without leaking data. + * + * Return: pointer to the allocated memory or %NULL on error + */ +void *vmalloc_node_user(unsigned long size, int node) +{ + return _vmalloc_node_user(size, node); +} +EXPORT_SYMBOL(vmalloc_node_user); + /** * vmalloc_node - allocate memory on a specific node * @size: allocation size