[v4,bpf-next,2/2] mm: Introduce VM_SPARSE kind and vm_area_[un]map_pages().

Message ID 20240305030516.41519-3-alexei.starovoitov@gmail.com (mailing list archive)
State Accepted
Commit 6b66b3a4ed5e68dd95ce459bb2d96d4cd2633f99
Delegated to: BPF
Series mm: Enforce ioremap address space and introduce sparse vm_area

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17 and -O2 optimization
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 13956 this patch: 13956
netdev/build_tools success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers success CCed 6 of 6 maintainers
netdev/build_clang success Errors and warnings before: 2938 this patch: 2938
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 15061 this patch: 15061
netdev/checkpatch warning WARNING: line length of 96 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Alexei Starovoitov March 5, 2024, 3:05 a.m. UTC
From: Alexei Starovoitov <ast@kernel.org>

vmap/vmalloc APIs are used to map a set of pages into contiguous kernel
virtual space.

get_vm_area() with appropriate flag is used to request an area of kernel
address range. It's used for vmalloc, vmap, ioremap, xen use cases.
- vmalloc use case dominates the usage. Such vm areas have VM_ALLOC flag.
- the areas created by vmap() function should be tagged with VM_MAP.
- ioremap areas are tagged with VM_IOREMAP.

BPF would like to extend the vmap API to implement a lazily-populated,
sparse, yet contiguous kernel virtual space. Introduce the VM_SPARSE flag
and the vm_area_map_pages(area, start_addr, end_addr, pages) API to map a
set of pages within a given area.
It performs the same sanity checks as vmap() does.
It also checks that the area was created by get_vm_area() with the
VM_SPARSE flag, which identifies such areas in /proc/vmallocinfo and makes
reads of them through /proc/kcore return zero pages.

The next commits will introduce bpf_arena which is a sparsely populated
shared memory region between bpf program and user space process. It will
map privately-managed pages into a sparse vm area with the following steps:

  // request virtual memory region during bpf prog verification
  area = get_vm_area(area_size, VM_SPARSE);

  // on demand
  vm_area_map_pages(area, kaddr, kend, pages);
  vm_area_unmap_pages(area, kaddr, kend);

  // after bpf program is detached and unloaded
  free_vm_area(area);

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/vmalloc.h |  5 ++++
 mm/vmalloc.c            | 59 +++++++++++++++++++++++++++++++++++++++--
 2 files changed, 62 insertions(+), 2 deletions(-)
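For illustration, a minimal sketch of how a caller might drive the new API once this
patch is applied. Error handling is trimmed and the 1 MB size is arbitrary; only
get_vm_area(), vm_area_map_pages(), vm_area_unmap_pages() and free_vm_area() come
from the series itself, the rest of the naming is illustrative:

  /* Illustrative only: reserve a sparse 1 MB area and map one page at its start. */
  static int sparse_area_demo(void)
  {
  	struct vm_struct *area;
  	struct page *page;
  	unsigned long start;
  	int err;

  	area = get_vm_area(SZ_1M, VM_SPARSE);
  	if (!area)
  		return -ENOMEM;

  	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
  	if (!page) {
  		free_vm_area(area);
  		return -ENOMEM;
  	}

  	start = (unsigned long)area->addr;
  	/* Map a single PAGE_SIZE page; the rest of the area stays unpopulated. */
  	err = vm_area_map_pages(area, start, start + PAGE_SIZE, &page);
  	if (!err) {
  		/* ... use the mapping at 'start' ... */
  		vm_area_unmap_pages(area, start, start + PAGE_SIZE);
  	}

  	__free_page(page);
  	free_vm_area(area);
  	return err;
  }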

Comments

Christoph Hellwig March 6, 2024, 2:19 p.m. UTC | #1
I'd still prefer to hide the vm_area, but for now:

Reviewed-by: Christoph Hellwig <hch@lst.de>
Alexei Starovoitov March 6, 2024, 5:10 p.m. UTC | #2
On Wed, Mar 6, 2024 at 6:19 AM Christoph Hellwig <hch@infradead.org> wrote:
>
> I'd still prefer to hide the vm_area, but for now:
>
> Reviewed-by: Christoph Hellwig <hch@lst.de>

Thank you.
I will think of a way to move get_vm_area() to mm/internal.h and
propose a plan by lsf/mm/bpf in May.
Pasha Tatashin March 6, 2024, 9:03 p.m. UTC | #3
On Mon, Mar 4, 2024 at 10:05 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> From: Alexei Starovoitov <ast@kernel.org>
>
> vmap/vmalloc APIs are used to map a set of pages into contiguous kernel
> virtual space.
>
> get_vm_area() with appropriate flag is used to request an area of kernel
> address range. It's used for vmalloc, vmap, ioremap, xen use cases.
> - vmalloc use case dominates the usage. Such vm areas have VM_ALLOC flag.
> - the areas created by vmap() function should be tagged with VM_MAP.
> - ioremap areas are tagged with VM_IOREMAP.
>
> BPF would like to extend the vmap API to implement a lazily-populated
> sparse, yet contiguous kernel virtual space. Introduce VM_SPARSE flag
> and vm_area_map_pages(area, start_addr, count, pages) API to map a set
> of pages within a given area.
> It has the same sanity checks as vmap() does.
> It also checks that get_vm_area() was created with VM_SPARSE flag
> which identifies such areas in /proc/vmallocinfo
> and returns zero pages on read through /proc/kcore.
>
> The next commits will introduce bpf_arena which is a sparsely populated
> shared memory region between bpf program and user space process. It will
> map privately-managed pages into a sparse vm area with the following steps:
>
>   // request virtual memory region during bpf prog verification
>   area = get_vm_area(area_size, VM_SPARSE);
>
>   // on demand
>   vm_area_map_pages(area, kaddr, kend, pages);
>   vm_area_unmap_pages(area, kaddr, kend);
>
>   // after bpf program is detached and unloaded
>   free_vm_area(area);
>
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> ---
>  include/linux/vmalloc.h |  5 ++++
>  mm/vmalloc.c            | 59 +++++++++++++++++++++++++++++++++++++++--
>  2 files changed, 62 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
> index c720be70c8dd..0f72c85a377b 100644
> --- a/include/linux/vmalloc.h
> +++ b/include/linux/vmalloc.h
> @@ -35,6 +35,7 @@ struct iov_iter;              /* in uio.h */
>  #else
>  #define VM_DEFER_KMEMLEAK      0
>  #endif
> +#define VM_SPARSE              0x00001000      /* sparse vm_area. not all pages are present. */
>
>  /* bits [20..32] reserved for arch specific ioremap internals */
>
> @@ -232,6 +233,10 @@ static inline bool is_vm_area_hugepages(const void *addr)
>  }
>
>  #ifdef CONFIG_MMU
> +int vm_area_map_pages(struct vm_struct *area, unsigned long start,
> +                     unsigned long end, struct page **pages);
> +void vm_area_unmap_pages(struct vm_struct *area, unsigned long start,
> +                        unsigned long end);
>  void vunmap_range(unsigned long addr, unsigned long end);
>  static inline void set_vm_flush_reset_perms(void *addr)
>  {
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index f42f98a127d5..e5b8c70950bc 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -648,6 +648,58 @@ static int vmap_pages_range(unsigned long addr, unsigned long end,
>         return err;
>  }
>
> +static int check_sparse_vm_area(struct vm_struct *area, unsigned long start,
> +                               unsigned long end)
> +{
> +       might_sleep();

This interface and in general VM_SPARSE would be useful for
dynamically grown kernel stacks [1]. However, the might_sleep() here
would be a problem. We would need to be able to handle
vm_area_map_pages() from interrupt disabled context therefore no
sleeping. The caller would need to guarantee that the page tables are
pre-allocated before the mapping.

Pasha

[1] https://lore.kernel.org/all/CA+CK2bBYt9RAVqASB2eLyRQxYT5aiL0fGhUu3TumQCyJCNTWvw@mail.gmail.com
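For reference, one existing way a caller can guarantee pre-allocated page tables for
a kernel VA range is a one-time, sleepable walk with apply_to_page_range() and a
no-op callback, which allocates every missing level down to the PTEs. This is only a
sketch of that pre-allocation idea, not part of the series, and the helper names are
illustrative:

  static int prealloc_pte_cb(pte_t *pte, unsigned long addr, void *data)
  {
  	return 0;	/* nothing to install; walking the range is enough */
  }

  /* Sleepable. Afterwards, mapping pages in [start, start + size) needs no allocations. */
  static int prealloc_kernel_page_tables(unsigned long start, unsigned long size)
  {
  	return apply_to_page_range(&init_mm, start, size, prealloc_pte_cb, NULL);
  }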
Alexei Starovoitov March 6, 2024, 9:28 p.m. UTC | #4
On Wed, Mar 6, 2024 at 1:04 PM Pasha Tatashin <pasha.tatashin@soleen.com> wrote:
>
> On Mon, Mar 4, 2024 at 10:05 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > From: Alexei Starovoitov <ast@kernel.org>
> >
> > vmap/vmalloc APIs are used to map a set of pages into contiguous kernel
> > virtual space.
> >
> > get_vm_area() with appropriate flag is used to request an area of kernel
> > address range. It's used for vmalloc, vmap, ioremap, xen use cases.
> > - vmalloc use case dominates the usage. Such vm areas have VM_ALLOC flag.
> > - the areas created by vmap() function should be tagged with VM_MAP.
> > - ioremap areas are tagged with VM_IOREMAP.
> >
> > BPF would like to extend the vmap API to implement a lazily-populated
> > sparse, yet contiguous kernel virtual space. Introduce VM_SPARSE flag
> > and vm_area_map_pages(area, start_addr, count, pages) API to map a set
> > of pages within a given area.
> > It has the same sanity checks as vmap() does.
> > It also checks that get_vm_area() was created with VM_SPARSE flag
> > which identifies such areas in /proc/vmallocinfo
> > and returns zero pages on read through /proc/kcore.
> >
> > The next commits will introduce bpf_arena which is a sparsely populated
> > shared memory region between bpf program and user space process. It will
> > map privately-managed pages into a sparse vm area with the following steps:
> >
> >   // request virtual memory region during bpf prog verification
> >   area = get_vm_area(area_size, VM_SPARSE);
> >
> >   // on demand
> >   vm_area_map_pages(area, kaddr, kend, pages);
> >   vm_area_unmap_pages(area, kaddr, kend);
> >
> >   // after bpf program is detached and unloaded
> >   free_vm_area(area);
> >
> > Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> > ---
> >  include/linux/vmalloc.h |  5 ++++
> >  mm/vmalloc.c            | 59 +++++++++++++++++++++++++++++++++++++++--
> >  2 files changed, 62 insertions(+), 2 deletions(-)
> >
> > diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
> > index c720be70c8dd..0f72c85a377b 100644
> > --- a/include/linux/vmalloc.h
> > +++ b/include/linux/vmalloc.h
> > @@ -35,6 +35,7 @@ struct iov_iter;              /* in uio.h */
> >  #else
> >  #define VM_DEFER_KMEMLEAK      0
> >  #endif
> > +#define VM_SPARSE              0x00001000      /* sparse vm_area. not all pages are present. */
> >
> >  /* bits [20..32] reserved for arch specific ioremap internals */
> >
> > @@ -232,6 +233,10 @@ static inline bool is_vm_area_hugepages(const void *addr)
> >  }
> >
> >  #ifdef CONFIG_MMU
> > +int vm_area_map_pages(struct vm_struct *area, unsigned long start,
> > +                     unsigned long end, struct page **pages);
> > +void vm_area_unmap_pages(struct vm_struct *area, unsigned long start,
> > +                        unsigned long end);
> >  void vunmap_range(unsigned long addr, unsigned long end);
> >  static inline void set_vm_flush_reset_perms(void *addr)
> >  {
> > diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> > index f42f98a127d5..e5b8c70950bc 100644
> > --- a/mm/vmalloc.c
> > +++ b/mm/vmalloc.c
> > @@ -648,6 +648,58 @@ static int vmap_pages_range(unsigned long addr, unsigned long end,
> >         return err;
> >  }
> >
> > +static int check_sparse_vm_area(struct vm_struct *area, unsigned long start,
> > +                               unsigned long end)
> > +{
> > +       might_sleep();
>
> This interface and in general VM_SPARSE would be useful for
> dynamically grown kernel stacks [1]. However, the might_sleep() here
> would be a problem. We would need to be able to handle
> vm_area_map_pages() from interrupt disabled context therefore no
> sleeping. The caller would need to guarantee that the page tables are
> pre-allocated before the mapping.

Sounds like we'd need to differentiate two kinds of sparse regions.
One that is really sparse where page tables are not populated (bpf use case)
and another where only the pte level might be empty.
Only the latter one will be usable for such auto-grow stacks.

Months back I played with this idea:
https://git.kernel.org/pub/scm/linux/kernel/git/ast/bpf.git/commit/?&id=ce63949a879f2f26c1c1834303e6dfbfb79d1fbd
that
"Make vmap_pages_range() allocate page tables down to the last (PTE) level."
Essentially pass NULL instead of 'pages' into vmap_pages_range()
and it will populate all levels except the last.
Then the page fault handler can service a fault in auto-growing stack
area if it has a page stashed in some per-cpu free list.
I suspect this is something you might need for
"16k stack that is populated on fault",
plus a free list of 3 pages per-cpu,
and set_pte_at() in pf handler.
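To make the per-CPU free list concrete, a rough sketch of the stash side only (all
names are hypothetical and accounting is omitted; installing a stashed page into the
stack's PTE is what the vm_area_set_page_at() sketch later in this thread covers):

  #define KSTACK_SPARE_PAGES	3

  struct kstack_spares {
  	struct page *page[KSTACK_SPARE_PAGES];
  };
  static DEFINE_PER_CPU(struct kstack_spares, kstack_spares);

  /* Sleepable context: top up this CPU's stash. */
  static void kstack_refill_spares(void)
  {
  	int i;

  	for (i = 0; i < KSTACK_SPARE_PAGES; i++) {
  		struct page *page;

  		if (this_cpu_read(kstack_spares.page[i]))
  			continue;
  		page = alloc_page(GFP_KERNEL | __GFP_ZERO);
  		if (!page)
  			break;
  		/* The allocation may have migrated us; cmpxchg keeps the slot consistent. */
  		if (this_cpu_cmpxchg(kstack_spares.page[i], NULL, page))
  			__free_page(page);
  	}
  }

  /* Fault path, IRQs disabled: never allocates, only consumes the stash. */
  static struct page *kstack_pop_spare(void)
  {
  	int i;

  	for (i = 0; i < KSTACK_SPARE_PAGES; i++) {
  		struct page *page = this_cpu_read(kstack_spares.page[i]);

  		if (page) {
  			this_cpu_write(kstack_spares.page[i], NULL);
  			return page;
  		}
  	}
  	return NULL;
  }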
Pasha Tatashin March 6, 2024, 9:46 p.m. UTC | #5
> > This interface and in general VM_SPARSE would be useful for
> > dynamically grown kernel stacks [1]. However, the might_sleep() here
> > would be a problem. We would need to be able to handle
> > vm_area_map_pages() from interrupt disabled context therefore no
> > sleeping. The caller would need to guarantee that the page tables are
> > pre-allocated before the mapping.
>
> Sounds like we'd need to differentiate two kinds of sparse regions.
> One that is really sparse where page tables are not populated (bpf use case)
> and another where only the pte level might be empty.
> Only the latter one will be usable for such auto-grow stacks.
>
> Months back I played with this idea:
> https://git.kernel.org/pub/scm/linux/kernel/git/ast/bpf.git/commit/?&id=ce63949a879f2f26c1c1834303e6dfbfb79d1fbd
> that
> "Make vmap_pages_range() allocate page tables down to the last (PTE) level."
> Essentially pass NULL instead of 'pages' into vmap_pages_range()
> and it will populate all levels except the last.

Yes, this is what is needed, however, it can be a little simpler with
kernel stacks:
given that the first page in the vm_area is mapped when stack is first
allocated, and that the VA range is aligned to 16K, we actually are
guaranteed to have all page table levels down to pte pre-allocated
during that initial mapping. Therefore, we do not need to worry about
allocating them later during PFs.

> Then the page fault handler can service a fault in auto-growing stack
> area if it has a page stashed in some per-cpu free list.
> I suspect this is something you might need for
> "16k stack that is populated on fault",
> plus a free list of 3 pages per-cpu,
> and set_pte_at() in pf handler.

Yes, what you described is exactly what I am working on: using 3-pages
per-cpu to handle kstack page faults. The only thing that is missing
is that I would like to have the ability to call a non-sleeping
version of vm_area_map_pages().

Pasha
Alexei Starovoitov March 6, 2024, 10:12 p.m. UTC | #6
On Wed, Mar 6, 2024 at 1:46 PM Pasha Tatashin <pasha.tatashin@soleen.com> wrote:
>
> > > This interface and in general VM_SPARSE would be useful for
> > > dynamically grown kernel stacks [1]. However, the might_sleep() here
> > > would be a problem. We would need to be able to handle
> > > vm_area_map_pages() from interrupt disabled context therefore no
> > > sleeping. The caller would need to guarantee that the page tables are
> > > pre-allocated before the mapping.
> >
> > Sounds like we'd need to differentiate two kinds of sparse regions.
> > One that is really sparse where page tables are not populated (bpf use case)
> > and another where only the pte level might be empty.
> > Only the latter one will be usable for such auto-grow stacks.
> >
> > Months back I played with this idea:
> > https://git.kernel.org/pub/scm/linux/kernel/git/ast/bpf.git/commit/?&id=ce63949a879f2f26c1c1834303e6dfbfb79d1fbd
> > that
> > "Make vmap_pages_range() allocate page tables down to the last (PTE) level."
> > Essentially pass NULL instead of 'pages' into vmap_pages_range()
> > and it will populate all levels except the last.
>
> Yes, this is what is needed, however, it can be a little simpler with
> kernel stacks:
> given that the first page in the vm_area is mapped when stack is first
> allocated, and that the VA range is aligned to 16K, we actually are
> guaranteed to have all page table levels down to pte pre-allocated
> during that initial mapping. Therefore, we do not need to worry about
> allocating them later during PFs.

Ahh. Found:
stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN, ...

> > Then the page fault handler can service a fault in auto-growing stack
> > area if it has a page stashed in some per-cpu free list.
> > I suspect this is something you might need for
> > "16k stack that is populated on fault",
> > plus a free list of 3 pages per-cpu,
> > and set_pte_at() in pf handler.
>
> Yes, what you described is exactly what I am working on: using 3-pages
> per-cpu to handle kstack page faults. The only thing that is missing
> is that I would like to have the ability to call a non-sleeping
> version of vm_area_map_pages().

vm_area_map_pages() cannot be non-sleepable, since the [start, end)
range will dictate whether mid level allocs and locks are needed.

Instead in alloc_thread_stack_node() you'd need a flavor
of get_vm_area() that can align the range to THREAD_ALIGN.
Then immediately call _sleepable_ vm_area_map_pages() to populate
the first page and later set_pte_at() the other pages on demand
from the fault handler.
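A minimal sketch of that flow, assuming a hypothetical get_vm_area_aligned() that
behaves like get_vm_area() but honours an alignment argument (the rest of the naming
is illustrative too):

  static struct vm_struct *alloc_sparse_stack_area(struct page *first_page)
  {
  	struct vm_struct *area;
  	unsigned long start;

  	/* Hypothetical flavor of get_vm_area() that aligns to THREAD_ALIGN. */
  	area = get_vm_area_aligned(THREAD_SIZE, THREAD_ALIGN, VM_SPARSE);
  	if (!area)
  		return NULL;

  	start = (unsigned long)area->addr;
  	/*
  	 * Sleepable: maps the first stack page and, as a side effect, allocates
  	 * the page-table levels the non-sleeping fault path will rely on.
  	 */
  	if (vm_area_map_pages(area, start, start + PAGE_SIZE, &first_page)) {
  		free_vm_area(area);
  		return NULL;
  	}

  	/* The remaining stack pages are installed on demand via set_pte_at(). */
  	return area;
  }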
Pasha Tatashin March 6, 2024, 10:56 p.m. UTC | #7
On Wed, Mar 6, 2024 at 5:13 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Wed, Mar 6, 2024 at 1:46 PM Pasha Tatashin <pasha.tatashin@soleen.com> wrote:
> >
> > > > This interface and in general VM_SPARSE would be useful for
> > > > dynamically grown kernel stacks [1]. However, the might_sleep() here
> > > > would be a problem. We would need to be able to handle
> > > > vm_area_map_pages() from interrupt disabled context therefore no
> > > > sleeping. The caller would need to guarantee that the page tables are
> > > > pre-allocated before the mapping.
> > >
> > > Sounds like we'd need to differentiate two kinds of sparse regions.
> > > One that is really sparse where page tables are not populated (bpf use case)
> > > and another where only the pte level might be empty.
> > > Only the latter one will be usable for such auto-grow stacks.
> > >
> > > Months back I played with this idea:
> > > https://git.kernel.org/pub/scm/linux/kernel/git/ast/bpf.git/commit/?&id=ce63949a879f2f26c1c1834303e6dfbfb79d1fbd
> > > that
> > > "Make vmap_pages_range() allocate page tables down to the last (PTE) level."
> > > Essentially pass NULL instead of 'pages' into vmap_pages_range()
> > > and it will populate all levels except the last.
> >
> > Yes, this is what is needed, however, it can be a little simpler with
> > kernel stacks:
> > given that the first page in the vm_area is mapped when stack is first
> > allocated, and that the VA range is aligned to 16K, we actually are
> > guaranteed to have all page table levels down to pte pre-allocated
> > during that initial mapping. Therefore, we do not need to worry about
> > allocating them later during PFs.
>
> Ahh. Found:
> stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN, ...
>
> > > Then the page fault handler can service a fault in auto-growing stack
> > > area if it has a page stashed in some per-cpu free list.
> > > I suspect this is something you might need for
> > > "16k stack that is populated on fault",
> > > plus a free list of 3 pages per-cpu,
> > > and set_pte_at() in pf handler.
> >
> > Yes, what you described is exactly what I am working on: using 3-pages
> > per-cpu to handle kstack page faults. The only thing that is missing
> > is that I would like to have the ability to call a non-sleeping
> > version of vm_area_map_pages().
>
> vm_area_map_pages() cannot be non-sleepable, since the [start, end)
> range will dictate whether mid level allocs and locks are needed.
>
> Instead in alloc_thread_stack_node() you'd need a flavor
> of get_vm_area() that can align the range to THREAD_ALIGN.
> Then immediately call _sleepable_ vm_area_map_pages() to populate
> the first page and later set_pte_at() the other pages on demand
> from the fault handler.

We still need to get to PTE level to use set_pte_at(). So, either
store it in task_struct for faster PF handling, or add another
non-sleeping vmap function that will do something like this:

static void vm_area_set_page_at(unsigned long addr, struct page *page)
{
	pgd_t *pgd = pgd_offset_k(addr);
	p4d_t *p4d = p4d_offset(pgd, addr);
	pud_t *pud = pud_offset(p4d, addr);
	pmd_t *pmd = pmd_offset(pud, addr);
	pte_t *pte = pte_offset_kernel(pmd, addr);

	/* all page-table levels are assumed to be pre-allocated */
	set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL));
}

Pasha
Pasha Tatashin March 6, 2024, 10:57 p.m. UTC | #8
On Mon, Mar 4, 2024 at 10:05 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> From: Alexei Starovoitov <ast@kernel.org>
>
> vmap/vmalloc APIs are used to map a set of pages into contiguous kernel
> virtual space.
>
> get_vm_area() with appropriate flag is used to request an area of kernel
> address range. It's used for vmalloc, vmap, ioremap, xen use cases.
> - vmalloc use case dominates the usage. Such vm areas have VM_ALLOC flag.
> - the areas created by vmap() function should be tagged with VM_MAP.
> - ioremap areas are tagged with VM_IOREMAP.
>
> BPF would like to extend the vmap API to implement a lazily-populated
> sparse, yet contiguous kernel virtual space. Introduce VM_SPARSE flag
> and vm_area_map_pages(area, start_addr, count, pages) API to map a set
> of pages within a given area.
> It has the same sanity checks as vmap() does.
> It also checks that get_vm_area() was created with VM_SPARSE flag
> which identifies such areas in /proc/vmallocinfo
> and returns zero pages on read through /proc/kcore.
>
> The next commits will introduce bpf_arena which is a sparsely populated
> shared memory region between bpf program and user space process. It will
> map privately-managed pages into a sparse vm area with the following steps:
>
>   // request virtual memory region during bpf prog verification
>   area = get_vm_area(area_size, VM_SPARSE);
>
>   // on demand
>   vm_area_map_pages(area, kaddr, kend, pages);
>   vm_area_unmap_pages(area, kaddr, kend);
>
>   // after bpf program is detached and unloaded
>   free_vm_area(area);
>
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>

Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Alexei Starovoitov March 6, 2024, 11:11 p.m. UTC | #9
On Wed, Mar 6, 2024 at 2:57 PM Pasha Tatashin <pasha.tatashin@soleen.com> wrote:
>
> On Wed, Mar 6, 2024 at 5:13 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Wed, Mar 6, 2024 at 1:46 PM Pasha Tatashin <pasha.tatashin@soleen.com> wrote:
> > >
> > > > > This interface and in general VM_SPARSE would be useful for
> > > > > dynamically grown kernel stacks [1]. However, the might_sleep() here
> > > > > would be a problem. We would need to be able to handle
> > > > > vm_area_map_pages() from interrupt disabled context therefore no
> > > > > sleeping. The caller would need to guarantee that the page tables are
> > > > > pre-allocated before the mapping.
> > > >
> > > > Sounds like we'd need to differentiate two kinds of sparse regions.
> > > > One that is really sparse where page tables are not populated (bpf use case)
> > > > and another where only the pte level might be empty.
> > > > Only the latter one will be usable for such auto-grow stacks.
> > > >
> > > > Months back I played with this idea:
> > > > https://git.kernel.org/pub/scm/linux/kernel/git/ast/bpf.git/commit/?&id=ce63949a879f2f26c1c1834303e6dfbfb79d1fbd
> > > > that
> > > > "Make vmap_pages_range() allocate page tables down to the last (PTE) level."
> > > > Essentially pass NULL instead of 'pages' into vmap_pages_range()
> > > > and it will populate all levels except the last.
> > >
> > > Yes, this is what is needed, however, it can be a little simpler with
> > > kernel stacks:
> > > given that the first page in the vm_area is mapped when stack is first
> > > allocated, and that the VA range is aligned to 16K, we actually are
> > > guaranteed to have all page table levels down to pte pre-allocated
> > > during that initial mapping. Therefore, we do not need to worry about
> > > allocating them later during PFs.
> >
> > Ahh. Found:
> > stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN, ...
> >
> > > > Then the page fault handler can service a fault in auto-growing stack
> > > > area if it has a page stashed in some per-cpu free list.
> > > > I suspect this is something you might need for
> > > > "16k stack that is populated on fault",
> > > > plus a free list of 3 pages per-cpu,
> > > > and set_pte_at() in pf handler.
> > >
> > > Yes, what you described is exactly what I am working on: using 3-pages
> > > per-cpu to handle kstack page faults. The only thing that is missing
> > > is that I would like to have the ability to call a non-sleeping
> > > version of vm_area_map_pages().
> >
> > vm_area_map_pages() cannot be non-sleepable, since the [start, end)
> > range will dictate whether mid level allocs and locks are needed.
> >
> > Instead in alloc_thread_stack_node() you'd need a flavor
> > of get_vm_area() that can align the range to THREAD_ALIGN.
> > Then immediately call _sleepable_ vm_area_map_pages() to populate
> > the first page and later set_pte_at() the other pages on demand
> > from the fault handler.
>
> We still need to get to PTE level to use set_pte_at(). So, either
> store it in task_struct for faster PF handling, or add another
> non-sleeping vmap function that will do something like this:
>
> static void vm_area_set_page_at(unsigned long addr, struct page *page)
> {
>	pgd_t *pgd = pgd_offset_k(addr);
>	p4d_t *p4d = p4d_offset(pgd, addr);
>	pud_t *pud = pud_offset(p4d, addr);
>	pmd_t *pmd = pmd_offset(pud, addr);
>	pte_t *pte = pte_offset_kernel(pmd, addr);
>
>	/* all page-table levels are assumed to be pre-allocated */
>	set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL));
> }

Right. There are several flavors of this logic across the tree.
What you're proposing is pretty much vmalloc_to_page() that
returns pte even if !pte_present, instead of a page.
x86 is doing mostly the same in lookup_address() fwiw.
Good opportunity to clean all this up and share the code.

Patch

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index c720be70c8dd..0f72c85a377b 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -35,6 +35,7 @@  struct iov_iter;		/* in uio.h */
 #else
 #define VM_DEFER_KMEMLEAK	0
 #endif
+#define VM_SPARSE		0x00001000	/* sparse vm_area. not all pages are present. */
 
 /* bits [20..32] reserved for arch specific ioremap internals */
 
@@ -232,6 +233,10 @@  static inline bool is_vm_area_hugepages(const void *addr)
 }
 
 #ifdef CONFIG_MMU
+int vm_area_map_pages(struct vm_struct *area, unsigned long start,
+		      unsigned long end, struct page **pages);
+void vm_area_unmap_pages(struct vm_struct *area, unsigned long start,
+			 unsigned long end);
 void vunmap_range(unsigned long addr, unsigned long end);
 static inline void set_vm_flush_reset_perms(void *addr)
 {
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index f42f98a127d5..e5b8c70950bc 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -648,6 +648,58 @@  static int vmap_pages_range(unsigned long addr, unsigned long end,
 	return err;
 }
 
+static int check_sparse_vm_area(struct vm_struct *area, unsigned long start,
+				unsigned long end)
+{
+	might_sleep();
+	if (WARN_ON_ONCE(area->flags & VM_FLUSH_RESET_PERMS))
+		return -EINVAL;
+	if (WARN_ON_ONCE(area->flags & VM_NO_GUARD))
+		return -EINVAL;
+	if (WARN_ON_ONCE(!(area->flags & VM_SPARSE)))
+		return -EINVAL;
+	if ((end - start) >> PAGE_SHIFT > totalram_pages())
+		return -E2BIG;
+	if (start < (unsigned long)area->addr ||
+	    (void *)end > area->addr + get_vm_area_size(area))
+		return -ERANGE;
+	return 0;
+}
+
+/**
+ * vm_area_map_pages - map pages inside given sparse vm_area
+ * @area: vm_area
+ * @start: start address inside vm_area
+ * @end: end address inside vm_area
+ * @pages: pages to map (always PAGE_SIZE pages)
+ */
+int vm_area_map_pages(struct vm_struct *area, unsigned long start,
+		      unsigned long end, struct page **pages)
+{
+	int err;
+
+	err = check_sparse_vm_area(area, start, end);
+	if (err)
+		return err;
+
+	return vmap_pages_range(start, end, PAGE_KERNEL, pages, PAGE_SHIFT);
+}
+
+/**
+ * vm_area_unmap_pages - unmap pages inside given sparse vm_area
+ * @area: vm_area
+ * @start: start address inside vm_area
+ * @end: end address inside vm_area
+ */
+void vm_area_unmap_pages(struct vm_struct *area, unsigned long start,
+			 unsigned long end)
+{
+	if (check_sparse_vm_area(area, start, end))
+		return;
+
+	vunmap_range(start, end);
+}
+
 int is_vmalloc_or_module_addr(const void *x)
 {
 	/*
@@ -3822,9 +3874,9 @@  long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
 
 		if (flags & VMAP_RAM)
 			copied = vmap_ram_vread_iter(iter, addr, n, flags);
-		else if (!(vm && (vm->flags & VM_IOREMAP)))
+		else if (!(vm && (vm->flags & (VM_IOREMAP | VM_SPARSE))))
 			copied = aligned_vread_iter(iter, addr, n);
-		else /* IOREMAP area is treated as memory hole */
+		else /* IOREMAP | SPARSE area is treated as memory hole */
 			copied = zero_iter(iter, n);
 
 		addr += copied;
@@ -4415,6 +4467,9 @@  static int s_show(struct seq_file *m, void *p)
 	if (v->flags & VM_IOREMAP)
 		seq_puts(m, " ioremap");
 
+	if (v->flags & VM_SPARSE)
+		seq_puts(m, " sparse");
+
 	if (v->flags & VM_ALLOC)
 		seq_puts(m, " vmalloc");