diff mbox series

[bpf-next,v1,RESEND,5/5] x86: use register_text_tail_vm

Message ID 20221031222541.1773452-6-song@kernel.org (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series vmalloc_exec for modules and BPF programs | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 16192 this patch: 16190
netdev/cc_maintainers warning 8 maintainers not CCed: tglx@linutronix.de mingo@redhat.com luto@kernel.org david@redhat.com hpa@zytor.com dave.hansen@linux.intel.com bp@alien8.de glider@google.com
netdev/build_clang fail Errors and warnings before: 4364 this patch: 4366
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 18751 this patch: 18752
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 38 lines checked
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-1 pending Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-5 success Logs for llvm-toolchain
bpf/vmtest-bpf-next-VM_Test-6 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-4 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-18 pending Logs for test_progs_no_alu32_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-8 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-11 success Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 fail Logs for test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-17 success Logs for test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for test_progs_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-22 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-7 success Logs for test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-13 success Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for test_progs_no_alu32_parallel on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for test_progs_parallel on s390x with gcc

Commit Message

Song Liu Oct. 31, 2022, 10:25 p.m. UTC
Allocate 2MB pages up to round_up(_etext, 2MB), and register the memory
range [round_up(_etext, 4KB), round_up(_etext, 2MB)] with
register_text_tail_vm so that this part of memory can be used for dynamic
kernel text (BPF programs, etc.).

Here is an example:

[root@eth50-1 ~]# grep _etext /proc/kallsyms
ffffffff82202a08 T _etext

[root@eth50-1 ~]# grep bpf_prog_ /proc/kallsyms  | tail -n 3
ffffffff8220f920 t bpf_prog_cc61a5364ac11d93_handle__sched_wakeup       [bpf]
ffffffff8220fa28 t bpf_prog_cc61a5364ac11d93_handle__sched_wakeup_new   [bpf]
ffffffff8220fad4 t bpf_prog_3bf73fa16f5e3d92_handle__sched_switch       [bpf]

[root@eth50-1 ~]#  grep 0xffffffff82200000 /sys/kernel/debug/page_tables/kernel
0xffffffff82200000-0xffffffff82400000     2M     ro   PSE         x  pmd

ffffffff82200000-ffffffff82400000 is a 2MB page, serving kernel text, and
bpf programs.

Signed-off-by: Song Liu <song@kernel.org>
---
 arch/x86/include/asm/pgtable_64_types.h | 1 +
 arch/x86/mm/init_64.c                   | 4 +++-
 include/linux/vmalloc.h                 | 4 ++++
 3 files changed, 8 insertions(+), 1 deletion(-)

Comments

Edgecombe, Rick P Nov. 2, 2022, 10:24 p.m. UTC | #1
On Mon, 2022-10-31 at 15:25 -0700, Song Liu wrote:
> Allocate 2MB pages up to round_up(_etext, 2MB), and register memory
> [round_up(_etext, 4kb), round_up(_etext, 2MB)] with
> register_text_tail_vm
> so that we can use this part of memory for dynamic kernel text (BPF
> programs, etc.).
> 
> Here is an example:
> 
> [root@eth50-1 ~]# grep _etext /proc/kallsyms
> ffffffff82202a08 T _etext
> 
> [root@eth50-1 ~]# grep bpf_prog_ /proc/kallsyms  | tail -n 3
> ffffffff8220f920 t
> bpf_prog_cc61a5364ac11d93_handle__sched_wakeup       [bpf]
> ffffffff8220fa28 t
> bpf_prog_cc61a5364ac11d93_handle__sched_wakeup_new   [bpf]
> ffffffff8220fad4 t
> bpf_prog_3bf73fa16f5e3d92_handle__sched_switch       [bpf]
> 
> [root@eth50-1 ~]#  grep 0xffffffff82200000
> /sys/kernel/debug/page_tables/kernel
> 0xffffffff82200000-
> 0xffffffff82400000     2M     ro   PSE         x  pmd
> 
> ffffffff82200000-ffffffff82400000 is a 2MB page, serving kernel text,
> and
> bpf programs.
> 
> Signed-off-by: Song Liu <song@kernel.org>
> ---
>  arch/x86/include/asm/pgtable_64_types.h | 1 +
>  arch/x86/mm/init_64.c                   | 4 +++-
>  include/linux/vmalloc.h                 | 4 ++++
>  3 files changed, 8 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/include/asm/pgtable_64_types.h
> b/arch/x86/include/asm/pgtable_64_types.h
> index 04f36063ad54..c0f9cceb109a 100644
> --- a/arch/x86/include/asm/pgtable_64_types.h
> +++ b/arch/x86/include/asm/pgtable_64_types.h
> @@ -101,6 +101,7 @@ extern unsigned int ptrs_per_p4d;
>  #define PUD_MASK	(~(PUD_SIZE - 1))
>  #define PGDIR_SIZE	(_AC(1, UL) << PGDIR_SHIFT)
>  #define PGDIR_MASK	(~(PGDIR_SIZE - 1))
> +#define PMD_ALIGN(x)	(((unsigned long)(x) + (PMD_SIZE - 1)) &
> PMD_MASK)
>  
>  /*
>   * See Documentation/x86/x86_64/mm.rst for a description of the
> memory map.
> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
> index 3f040c6e5d13..5b42fc0c6099 100644
> --- a/arch/x86/mm/init_64.c
> +++ b/arch/x86/mm/init_64.c
> @@ -1373,7 +1373,7 @@ void mark_rodata_ro(void)
>  	unsigned long start = PFN_ALIGN(_text);
>  	unsigned long rodata_start = PFN_ALIGN(__start_rodata);
>  	unsigned long end = (unsigned long)__end_rodata_hpage_align;
> -	unsigned long text_end = PFN_ALIGN(_etext);
> +	unsigned long text_end = PMD_ALIGN(_etext);
>  	unsigned long rodata_end = PFN_ALIGN(__end_rodata);
>  	unsigned long all_end;

Check out is_errata93(). Right now it assumes all text is between text-
etext and MODULES_VADDR-MODULES_END. It's a quite old errata, but it
would be nice if we had a is_text_addr() helper or something. To help
keep track of the places where text might pop up.

Speaking of which, it might be nice to update
Documentation/x86/x86_64/mm.rst with some hints that this area exists.

>  
> @@ -1414,6 +1414,8 @@ void mark_rodata_ro(void)
>  				(void *)rodata_end, (void *)_sdata);
>  
>  	debug_checkwx();
> +	register_text_tail_vm(PFN_ALIGN((unsigned long)_etext),
> +			      PMD_ALIGN((unsigned long)_etext));
>  }
>  
>  int kern_addr_valid(unsigned long addr)
> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
> index 9b2042313c12..7365cf9c4e7f 100644
> --- a/include/linux/vmalloc.h
> +++ b/include/linux/vmalloc.h
> @@ -132,11 +132,15 @@ extern void vm_unmap_aliases(void);
>  #ifdef CONFIG_MMU
>  extern void __init vmalloc_init(void);
>  extern unsigned long vmalloc_nr_pages(void);
> +void register_text_tail_vm(unsigned long start, unsigned long end);
>  #else
>  static inline void vmalloc_init(void)
>  {
>  }
>  static inline unsigned long vmalloc_nr_pages(void) { return 0; }
> +void register_text_tail_vm(unsigned long start, unsigned long end)
> +{
> +}
>  #endif

This looks like it should be in the previous patch.

>  
>  extern void *vmalloc(unsigned long size) __alloc_size(1);
Song Liu Nov. 3, 2022, 9:04 p.m. UTC | #2
On Wed, Nov 2, 2022 at 3:24 PM Edgecombe, Rick P
<rick.p.edgecombe@intel.com> wrote:
>
> On Mon, 2022-10-31 at 15:25 -0700, Song Liu wrote:
> > Allocate 2MB pages up to round_up(_etext, 2MB), and register memory
> > [round_up(_etext, 4kb), round_up(_etext, 2MB)] with
> > register_text_tail_vm
> > so that we can use this part of memory for dynamic kernel text (BPF
> > programs, etc.).
> >
> > Here is an example:
> >
> > [root@eth50-1 ~]# grep _etext /proc/kallsyms
> > ffffffff82202a08 T _etext
> >
> > [root@eth50-1 ~]# grep bpf_prog_ /proc/kallsyms  | tail -n 3
> > ffffffff8220f920 t
> > bpf_prog_cc61a5364ac11d93_handle__sched_wakeup       [bpf]
> > ffffffff8220fa28 t
> > bpf_prog_cc61a5364ac11d93_handle__sched_wakeup_new   [bpf]
> > ffffffff8220fad4 t
> > bpf_prog_3bf73fa16f5e3d92_handle__sched_switch       [bpf]
> >
> > [root@eth50-1 ~]#  grep 0xffffffff82200000
> > /sys/kernel/debug/page_tables/kernel
> > 0xffffffff82200000-
> > 0xffffffff82400000     2M     ro   PSE         x  pmd
> >
> > ffffffff82200000-ffffffff82400000 is a 2MB page, serving kernel text,
> > and
> > bpf programs.
> >
> > Signed-off-by: Song Liu <song@kernel.org>
> > ---
> >  arch/x86/include/asm/pgtable_64_types.h | 1 +
> >  arch/x86/mm/init_64.c                   | 4 +++-
> >  include/linux/vmalloc.h                 | 4 ++++
> >  3 files changed, 8 insertions(+), 1 deletion(-)
> >
> > diff --git a/arch/x86/include/asm/pgtable_64_types.h
> > b/arch/x86/include/asm/pgtable_64_types.h
> > index 04f36063ad54..c0f9cceb109a 100644
> > --- a/arch/x86/include/asm/pgtable_64_types.h
> > +++ b/arch/x86/include/asm/pgtable_64_types.h
> > @@ -101,6 +101,7 @@ extern unsigned int ptrs_per_p4d;
> >  #define PUD_MASK     (~(PUD_SIZE - 1))
> >  #define PGDIR_SIZE   (_AC(1, UL) << PGDIR_SHIFT)
> >  #define PGDIR_MASK   (~(PGDIR_SIZE - 1))
> > +#define PMD_ALIGN(x) (((unsigned long)(x) + (PMD_SIZE - 1)) &
> > PMD_MASK)
> >
> >  /*
> >   * See Documentation/x86/x86_64/mm.rst for a description of the
> > memory map.
> > diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
> > index 3f040c6e5d13..5b42fc0c6099 100644
> > --- a/arch/x86/mm/init_64.c
> > +++ b/arch/x86/mm/init_64.c
> > @@ -1373,7 +1373,7 @@ void mark_rodata_ro(void)
> >       unsigned long start = PFN_ALIGN(_text);
> >       unsigned long rodata_start = PFN_ALIGN(__start_rodata);
> >       unsigned long end = (unsigned long)__end_rodata_hpage_align;
> > -     unsigned long text_end = PFN_ALIGN(_etext);
> > +     unsigned long text_end = PMD_ALIGN(_etext);
> >       unsigned long rodata_end = PFN_ALIGN(__end_rodata);
> >       unsigned long all_end;
>
> Check out is_errata93(). Right now it assumes all text is between text-
> etext and MODULES_VADDR-MODULES_END. It's a quite old errata, but it
> would be nice if we had a is_text_addr() helper or something. To help
> keep track of the places where text might pop up.
>
> Speaking of which, it might be nice to update
> Documentation/x86/x86_64/mm.rst with some hints that this area exists.
>
> >
> > @@ -1414,6 +1414,8 @@ void mark_rodata_ro(void)
> >                               (void *)rodata_end, (void *)_sdata);
> >
> >       debug_checkwx();
> > +     register_text_tail_vm(PFN_ALIGN((unsigned long)_etext),
> > +                           PMD_ALIGN((unsigned long)_etext));
> >  }
> >
> >  int kern_addr_valid(unsigned long addr)
> > diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
> > index 9b2042313c12..7365cf9c4e7f 100644
> > --- a/include/linux/vmalloc.h
> > +++ b/include/linux/vmalloc.h
> > @@ -132,11 +132,15 @@ extern void vm_unmap_aliases(void);
> >  #ifdef CONFIG_MMU
> >  extern void __init vmalloc_init(void);
> >  extern unsigned long vmalloc_nr_pages(void);
> > +void register_text_tail_vm(unsigned long start, unsigned long end);
> >  #else
> >  static inline void vmalloc_init(void)
> >  {
> >  }
> >  static inline unsigned long vmalloc_nr_pages(void) { return 0; }
> > +void register_text_tail_vm(unsigned long start, unsigned long end)
> > +{
> > +}
> >  #endif
>
> This looks like it should be in the previous patch.

Good catch! I will fix it in the next version.

Thanks,
Song

>
> >
> >  extern void *vmalloc(unsigned long size) __alloc_size(1);
diff mbox series

Patch

diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 04f36063ad54..c0f9cceb109a 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -101,6 +101,7 @@  extern unsigned int ptrs_per_p4d;
 #define PUD_MASK	(~(PUD_SIZE - 1))
 #define PGDIR_SIZE	(_AC(1, UL) << PGDIR_SHIFT)
 #define PGDIR_MASK	(~(PGDIR_SIZE - 1))
+#define PMD_ALIGN(x)	(((unsigned long)(x) + (PMD_SIZE - 1)) & PMD_MASK)
 
 /*
  * See Documentation/x86/x86_64/mm.rst for a description of the memory map.
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 3f040c6e5d13..5b42fc0c6099 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1373,7 +1373,7 @@  void mark_rodata_ro(void)
 	unsigned long start = PFN_ALIGN(_text);
 	unsigned long rodata_start = PFN_ALIGN(__start_rodata);
 	unsigned long end = (unsigned long)__end_rodata_hpage_align;
-	unsigned long text_end = PFN_ALIGN(_etext);
+	unsigned long text_end = PMD_ALIGN(_etext);
 	unsigned long rodata_end = PFN_ALIGN(__end_rodata);
 	unsigned long all_end;
 
@@ -1414,6 +1414,8 @@  void mark_rodata_ro(void)
 				(void *)rodata_end, (void *)_sdata);
 
 	debug_checkwx();
+	register_text_tail_vm(PFN_ALIGN((unsigned long)_etext),
+			      PMD_ALIGN((unsigned long)_etext));
 }
 
 int kern_addr_valid(unsigned long addr)
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 9b2042313c12..7365cf9c4e7f 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -132,11 +132,15 @@  extern void vm_unmap_aliases(void);
 #ifdef CONFIG_MMU
 extern void __init vmalloc_init(void);
 extern unsigned long vmalloc_nr_pages(void);
+void register_text_tail_vm(unsigned long start, unsigned long end);
 #else
 static inline void vmalloc_init(void)
 {
 }
 static inline unsigned long vmalloc_nr_pages(void) { return 0; }
+void register_text_tail_vm(unsigned long start, unsigned long end)
+{
+}
 #endif
 
 extern void *vmalloc(unsigned long size) __alloc_size(1);