Message ID | 20221007234315.2877365-5-song@kernel.org (mailing list archive)
---|---
State | New
Series | vmalloc_exec for modules and BPF programs
On Fri, 2022-10-07 at 16:43 -0700, Song Liu wrote:
> On x86 kernels, we allocate 2MB pages for kernel text up to
> round_down(_etext, 2MB). Therefore, some of the kernel text is still
> on 4kB pages. With vmalloc_exec, we can allocate 2MB pages up to
> round_up(_etext, 2MB), and use the rest of the page for modules and
> BPF programs.
>
> Here is an example:
>
> [root@eth50-1 ~]# grep _etext /proc/kallsyms
> ffffffff82202a08 T _etext
>
> [root@eth50-1 ~]# grep bpf_prog_ /proc/kallsyms | tail -n 3
> ffffffff8220f920 t bpf_prog_cc61a5364ac11d93_handle__sched_wakeup       [bpf]
> ffffffff8220fa28 t bpf_prog_cc61a5364ac11d93_handle__sched_wakeup_new   [bpf]
> ffffffff8220fad4 t bpf_prog_3bf73fa16f5e3d92_handle__sched_switch       [bpf]
>
> [root@eth50-1 ~]# grep 0xffffffff82200000 /sys/kernel/debug/page_tables/kernel
> 0xffffffff82200000-0xffffffff82400000           2M     ro   PSE   x   pmd
>
> [root@eth50-1 ~]# grep xfs_flush_inodes /proc/kallsyms
> ffffffff822ba910 t xfs_flush_inodes_worker     [xfs]
> ffffffff822bc580 t xfs_flush_inodes    [xfs]
>
> ffffffff82200000-ffffffff82400000 is a 2MB page, serving kernel text,
> xfs module, and bpf programs.

Can this memory range be freed as part of a vfree_exec() call then?
Does vmalloc actually try to unmap it? If so, it could get complicated
with PTI.

It probably should be a special case that never gets fully freed.

> Signed-off-by: Song Liu <song@kernel.org>
> ---
>  arch/x86/mm/init_64.c |  3 ++-
>  mm/vmalloc.c          | 24 ++++++++++++++++++++++++
>  2 files changed, 26 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
> index 0fe690ebc269..d94f196c541a 100644
> --- a/arch/x86/mm/init_64.c
> +++ b/arch/x86/mm/init_64.c
> @@ -1367,12 +1367,13 @@ int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask)
>
>  int kernel_set_to_readonly;
>
> +#define PMD_ALIGN(x)	(((unsigned long)(x) + (PMD_SIZE - 1)) & PMD_MASK)
>  void mark_rodata_ro(void)
>  {
>  	unsigned long start = PFN_ALIGN(_text);
>  	unsigned long rodata_start = PFN_ALIGN(__start_rodata);
>  	unsigned long end = (unsigned long)__end_rodata_hpage_align;
> -	unsigned long text_end = PFN_ALIGN(_etext);
> +	unsigned long text_end = PMD_ALIGN(_etext);

This should probably have more logic and adjustments. If etext is PMD
aligned, some of the stuff outside the diff won't do anything.

Also, if a kernel doesn't have modules or BPF JIT it would be a waste
of memory.

>  	unsigned long rodata_end = PFN_ALIGN(__end_rodata);
>  	unsigned long all_end;
>
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 9212ff96b871..41509bbec583 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -75,6 +75,9 @@ static const bool vmap_allow_huge = false;
>  #define PMD_ALIGN(addr) ALIGN(addr, PMD_SIZE)
>  #define PMD_ALIGN_DOWN(addr) ALIGN_DOWN(addr, PMD_SIZE)
>
> +static struct vm_struct text_tail_vm;
> +static struct vmap_area text_tail_va;
> +
>  bool is_vmalloc_addr(const void *x)
>  {
>  	unsigned long addr = (unsigned long)kasan_reset_tag(x);
> @@ -637,6 +640,8 @@ int is_vmalloc_or_module_addr(const void *x)
>  	unsigned long addr = (unsigned long)kasan_reset_tag(x);
>  	if (addr >= MODULES_VADDR && addr < MODULES_END)
>  		return 1;
> +	if (addr >= text_tail_va.va_start && addr < text_tail_va.va_end)
> +		return 1;
>  #endif
>  	return is_vmalloc_addr(x);
>  }
> @@ -2422,6 +2427,24 @@ static void vmap_init_free_space(void)
>  	}
>  }
>
> +static void register_text_tail_vm(void)
> +{
> +	unsigned long start = PFN_ALIGN((unsigned long)_etext);
> +	unsigned long end = PMD_ALIGN((unsigned long)_etext);
> +	struct vmap_area *va;
> +
> +	va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
> +	if (WARN_ON_ONCE(!va))
> +		return;
> +	text_tail_vm.addr = (void *)start;
> +	text_tail_vm.size = end - start;
> +	text_tail_va.va_start = start;
> +	text_tail_va.va_end = end;
> +	text_tail_va.vm = &text_tail_vm;
> +	memcpy(va, &text_tail_va, sizeof(*va));
> +	insert_vmap_area_augment(va, NULL, &free_text_area_root, &free_text_area_list);
> +}
> +
>  void __init vmalloc_init(void)
>  {
>  	struct vmap_area *va;
> @@ -2432,6 +2455,7 @@ void __init vmalloc_init(void)
>  	 * Create the cache for vmap_area objects.
>  	 */
>  	vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
> +	register_text_tail_vm();
>
>  	for_each_possible_cpu(i) {
>  		struct vmap_block_queue *vbq;
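To make the alignment arithmetic concrete, here is a minimal standalone
sketch of what the new PMD_ALIGN() computes for the _etext value in the
example above. The PMD_SIZE/PAGE_SIZE constants are assumed x86-64
values, and PFN_ALIGN() is re-derived here rather than taken from kernel
headers:

#include <stdio.h>

/* Assumed x86-64 values; in the kernel these come from pgtable headers. */
#define PMD_SIZE  (1UL << 21)            /* 2MB */
#define PMD_MASK  (~(PMD_SIZE - 1))
#define PAGE_SIZE (1UL << 12)            /* 4kB */

/* Round up to the next 2MB boundary, as in the patch. */
#define PMD_ALIGN(x) (((unsigned long)(x) + (PMD_SIZE - 1)) & PMD_MASK)
/* Round up to the next 4kB page, mirroring the kernel's PFN_ALIGN(). */
#define PFN_ALIGN(x) (((unsigned long)(x) + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long etext = 0xffffffff82202a08UL;   /* from the example boot */

	printf("PFN_ALIGN(_etext) = 0x%lx\n", PFN_ALIGN(etext)); /* 0xffffffff82203000 */
	printf("PMD_ALIGN(_etext) = 0x%lx\n", PMD_ALIGN(etext)); /* 0xffffffff82400000 */
	printf("tail for modules/BPF = %lu kB\n",
	       (PMD_ALIGN(etext) - PFN_ALIGN(etext)) / 1024);    /* ~2036 kB */
	return 0;
}

So for this boot, roughly 2MB of the RO+X huge page past _etext becomes
available to the new allocator.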
> On Oct 10, 2022, at 11:32 AM, Edgecombe, Rick P <rick.p.edgecombe@intel.com> wrote:
>
> On Fri, 2022-10-07 at 16:43 -0700, Song Liu wrote:
>> On x86 kernels, we allocate 2MB pages for kernel text up to
>> round_down(_etext, 2MB). Therefore, some of the kernel text is still
>> on 4kB pages. With vmalloc_exec, we can allocate 2MB pages up to
>> round_up(_etext, 2MB), and use the rest of the page for modules and
>> BPF programs.
>>
>> Here is an example:
>>
>> [root@eth50-1 ~]# grep _etext /proc/kallsyms
>> ffffffff82202a08 T _etext
>>
>> [root@eth50-1 ~]# grep bpf_prog_ /proc/kallsyms | tail -n 3
>> ffffffff8220f920 t bpf_prog_cc61a5364ac11d93_handle__sched_wakeup       [bpf]
>> ffffffff8220fa28 t bpf_prog_cc61a5364ac11d93_handle__sched_wakeup_new   [bpf]
>> ffffffff8220fad4 t bpf_prog_3bf73fa16f5e3d92_handle__sched_switch       [bpf]
>>
>> [root@eth50-1 ~]# grep 0xffffffff82200000 /sys/kernel/debug/page_tables/kernel
>> 0xffffffff82200000-0xffffffff82400000           2M     ro   PSE   x   pmd
>>
>> [root@eth50-1 ~]# grep xfs_flush_inodes /proc/kallsyms
>> ffffffff822ba910 t xfs_flush_inodes_worker     [xfs]
>> ffffffff822bc580 t xfs_flush_inodes    [xfs]
>>
>> ffffffff82200000-ffffffff82400000 is a 2MB page, serving kernel text,
>> xfs module, and bpf programs.
>
> Can this memory range be freed as part of a vfree_exec() call then?
> Does vmalloc actually try to unmap it? If so, it could get complicated
> with PTI.
>
> It probably should be a special case that never gets fully freed.

Right, this is never freed.

>
>> Signed-off-by: Song Liu <song@kernel.org>
>> ---
>>  arch/x86/mm/init_64.c |  3 ++-
>>  mm/vmalloc.c          | 24 ++++++++++++++++++++++++
>>  2 files changed, 26 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
>> index 0fe690ebc269..d94f196c541a 100644
>> --- a/arch/x86/mm/init_64.c
>> +++ b/arch/x86/mm/init_64.c
>> @@ -1367,12 +1367,13 @@ int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask)
>>
>>  int kernel_set_to_readonly;
>>
>> +#define PMD_ALIGN(x)	(((unsigned long)(x) + (PMD_SIZE - 1)) & PMD_MASK)
>>  void mark_rodata_ro(void)
>>  {
>>  	unsigned long start = PFN_ALIGN(_text);
>>  	unsigned long rodata_start = PFN_ALIGN(__start_rodata);
>>  	unsigned long end = (unsigned long)__end_rodata_hpage_align;
>> -	unsigned long text_end = PFN_ALIGN(_etext);
>> +	unsigned long text_end = PMD_ALIGN(_etext);
>
> This should probably have more logic and adjustments. If etext is PMD
> aligned, some of the stuff outside the diff won't do anything.

Hmm.. I don't quite follow this comment. If the etext is PMD aligned,
we can still use vmalloc_exec to allocate memory. So it shouldn't
matter, no?

>
> Also, if a kernel doesn't have modules or BPF JIT it would be a waste
> of memory.

I guess we can add a command line argument for these corner cases?

Thanks,
Song

>
>>  	unsigned long rodata_end = PFN_ALIGN(__end_rodata);
>>  	unsigned long all_end;
>>
>> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
>> index 9212ff96b871..41509bbec583 100644
>> --- a/mm/vmalloc.c
>> +++ b/mm/vmalloc.c
>> @@ -75,6 +75,9 @@ static const bool vmap_allow_huge = false;
>>  #define PMD_ALIGN(addr) ALIGN(addr, PMD_SIZE)
>>  #define PMD_ALIGN_DOWN(addr) ALIGN_DOWN(addr, PMD_SIZE)
>>
>> +static struct vm_struct text_tail_vm;
>> +static struct vmap_area text_tail_va;
>> +
>>  bool is_vmalloc_addr(const void *x)
>>  {
>>  	unsigned long addr = (unsigned long)kasan_reset_tag(x);
>> @@ -637,6 +640,8 @@ int is_vmalloc_or_module_addr(const void *x)
>>  	unsigned long addr = (unsigned long)kasan_reset_tag(x);
>>  	if (addr >= MODULES_VADDR && addr < MODULES_END)
>>  		return 1;
>> +	if (addr >= text_tail_va.va_start && addr < text_tail_va.va_end)
>> +		return 1;
>>  #endif
>>  	return is_vmalloc_addr(x);
>>  }
>> @@ -2422,6 +2427,24 @@ static void vmap_init_free_space(void)
>>  	}
>>  }
>>
>> +static void register_text_tail_vm(void)
>> +{
>> +	unsigned long start = PFN_ALIGN((unsigned long)_etext);
>> +	unsigned long end = PMD_ALIGN((unsigned long)_etext);
>> +	struct vmap_area *va;
>> +
>> +	va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
>> +	if (WARN_ON_ONCE(!va))
>> +		return;
>> +	text_tail_vm.addr = (void *)start;
>> +	text_tail_vm.size = end - start;
>> +	text_tail_va.va_start = start;
>> +	text_tail_va.va_end = end;
>> +	text_tail_va.vm = &text_tail_vm;
>> +	memcpy(va, &text_tail_va, sizeof(*va));
>> +	insert_vmap_area_augment(va, NULL, &free_text_area_root, &free_text_area_list);
>> +}
>> +
>>  void __init vmalloc_init(void)
>>  {
>>  	struct vmap_area *va;
>> @@ -2432,6 +2455,7 @@ void __init vmalloc_init(void)
>>  	 * Create the cache for vmap_area objects.
>>  	 */
>>  	vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
>> +	register_text_tail_vm();
>>
>>  	for_each_possible_cpu(i) {
>>  		struct vmap_block_queue *vbq;
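The opt-out Song suggests could look roughly like the following sketch.
This is purely hypothetical: the parameter name "text_tail", the parse
helper, and the wiring into register_text_tail_vm() are assumptions,
not part of the posted series:

#include <linux/init.h>
#include <linux/string.h>
#include <linux/types.h>

/* Hypothetical sketch of an opt-out knob; not part of the posted series. */
static bool text_tail_enabled = true;

static int __init parse_text_tail(char *arg)
{
	/* e.g. boot with "text_tail=off" to keep the old 4kB tail mapping */
	if (arg && !strcmp(arg, "off"))
		text_tail_enabled = false;
	return 0;
}
early_param("text_tail", parse_text_tail);

static void register_text_tail_vm(void)
{
	/* Skip when disabled, or when nothing could ever use the tail. */
	if (!text_tail_enabled ||
	    !(IS_ENABLED(CONFIG_MODULES) || IS_ENABLED(CONFIG_BPF_JIT)))
		return;
	/* ... register the tail range as in the patch ... */
}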
On Mon, 2022-10-10 at 19:08 +0000, Song Liu wrote:
> > On Oct 10, 2022, at 11:32 AM, Edgecombe, Rick P <
> > rick.p.edgecombe@intel.com> wrote:
> >
> > On Fri, 2022-10-07 at 16:43 -0700, Song Liu wrote:
> > > On x86 kernels, we allocate 2MB pages for kernel text up to
> > > round_down(_etext, 2MB). Therefore, some of the kernel text is
> > > still on 4kB pages. With vmalloc_exec, we can allocate 2MB pages
> > > up to round_up(_etext, 2MB), and use the rest of the page for
> > > modules and BPF programs.
> > >
> > > Here is an example:
> > >
> > > [root@eth50-1 ~]# grep _etext /proc/kallsyms
> > > ffffffff82202a08 T _etext
> > >
> > > [root@eth50-1 ~]# grep bpf_prog_ /proc/kallsyms | tail -n 3
> > > ffffffff8220f920 t bpf_prog_cc61a5364ac11d93_handle__sched_wakeup       [bpf]
> > > ffffffff8220fa28 t bpf_prog_cc61a5364ac11d93_handle__sched_wakeup_new   [bpf]
> > > ffffffff8220fad4 t bpf_prog_3bf73fa16f5e3d92_handle__sched_switch       [bpf]
> > >
> > > [root@eth50-1 ~]# grep 0xffffffff82200000 /sys/kernel/debug/page_tables/kernel
> > > 0xffffffff82200000-0xffffffff82400000           2M     ro   PSE   x   pmd
> > >
> > > [root@eth50-1 ~]# grep xfs_flush_inodes /proc/kallsyms
> > > ffffffff822ba910 t xfs_flush_inodes_worker     [xfs]
> > > ffffffff822bc580 t xfs_flush_inodes    [xfs]
> > >
> > > ffffffff82200000-ffffffff82400000 is a 2MB page, serving kernel
> > > text, xfs module, and bpf programs.
> >
> > Can this memory range be freed as part of a vfree_exec() call then?
> > Does vmalloc actually try to unmap it? If so, it could get
> > complicated with PTI.
> >
> > It probably should be a special case that never gets fully freed.
>
> Right, this is never freed.

Can we get a comment somewhere highlighting how this is avoided?

Maybe this is just me missing some vmalloc understanding, but this
pointer to an all zero vm_struct seems weird too. Are there other vmap
allocations like this? Which vmap APIs work with this and which don't?

>
> >
> > > Signed-off-by: Song Liu <song@kernel.org>
> > > ---
> > >  arch/x86/mm/init_64.c |  3 ++-
> > >  mm/vmalloc.c          | 24 ++++++++++++++++++++++++
> > >  2 files changed, 26 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
> > > index 0fe690ebc269..d94f196c541a 100644
> > > --- a/arch/x86/mm/init_64.c
> > > +++ b/arch/x86/mm/init_64.c
> > > @@ -1367,12 +1367,13 @@ int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask)
> > >
> > >  int kernel_set_to_readonly;
> > >
> > > +#define PMD_ALIGN(x)	(((unsigned long)(x) + (PMD_SIZE - 1)) & PMD_MASK)
> > >  void mark_rodata_ro(void)
> > >  {
> > >  	unsigned long start = PFN_ALIGN(_text);
> > >  	unsigned long rodata_start = PFN_ALIGN(__start_rodata);
> > >  	unsigned long end = (unsigned long)__end_rodata_hpage_align;
> > > -	unsigned long text_end = PFN_ALIGN(_etext);
> > > +	unsigned long text_end = PMD_ALIGN(_etext);
> >
> > This should probably have more logic and adjustments. If etext is
> > PMD aligned, some of the stuff outside the diff won't do anything.
>
> Hmm.. I don't quite follow this comment. If the etext is PMD aligned,
> we can still use vmalloc_exec to allocate memory. So it shouldn't
> matter, no?

Maybe this doesn't matter since PMD alignment must happen naturally
sometimes. I was just noticing the attempts to operate on this region
between etext and start_rodata (free_init_pages(), etc). If this was
always PMD aligned, they could be dropped. But if you are going to
adjust the behavior for !CONFIG_MODULES, etc, then it is still needed.
On Tue, 2022-10-11 at 16:25 +0000, Song Liu wrote:
> > Maybe this is just me missing some vmalloc understanding, but this
> > pointer to an all zero vm_struct seems weird too. Are there other
> > vmap allocations like this? Which vmap APIs work with this and
> > which don't?
>
> There are two vmap trees at the moment: the free_area_ tree and the
> vmap_area_ tree. The free_area_ tree uses vmap->subtree_max_size,
> while the vmap_area_ tree contains vmaps backed by vm_struct, and
> thus uses vmap->vm.
>
> This set adds a new tree, free_text_area_. This tree is different to
> the other two, as it uses subtree_max_size, and it is also backed
> by vm_struct. To handle this requirement without growing vmap_struct,
> we introduced all_text_vm to store the vm_struct for the
> free_text_area_ tree.
>
> The free_text_area_ tree is different to the vmap_area_ tree. Each
> vmap in the vmap_area_ tree has its own vm_struct (1 to 1 mapping),
> while multiple vmaps in the free_text_area_ tree map to a single
> vm_struct.
>
> Also, free_text_area_ handles granularity < PAGE_SIZE, while the
> other two trees only work with PAGE_SIZE aligned memory.
>
> Does this answer your questions?

I mean from the perspective of someone trying to use this without
diving into the entire implementation.

The function is called vmalloc_exec() and is freed with vfree_exec().
Makes sense. But with the other vmalloc_foo's (including previous
vmalloc_exec() implementations) you can call find_vm_area(), etc on
them. They show in "vmallocinfo" and generally behave similarly. That
isn't true for these new allocations, right?

Then you have code that operates on module text like:
	if (is_vmalloc_or_module_addr(addr))
		pfn = vmalloc_to_pfn(addr);

It looks like it would work (on x86 at least). Should it be expected
to?

Especially after this patch, where there is memory that isn't even
tracked by the original vmap_area trees, it is pretty much a separate
allocator. So I think it might be nice to spell out which other vmalloc
APIs work with these new functions since they are named "vmalloc".
Maybe just say none of them do.

Separate from that, I guess you are planning to make this limited to
certain architectures? It might be better to put logic with assumptions
about x86 boot time page table details inside arch/x86 somewhere.
> On Oct 11, 2022, at 1:40 PM, Edgecombe, Rick P <rick.p.edgecombe@intel.com> wrote:
>
> On Tue, 2022-10-11 at 16:25 +0000, Song Liu wrote:
>>> Maybe this is just me missing some vmalloc understanding, but this
>>> pointer to an all zero vm_struct seems weird too. Are there other
>>> vmap allocations like this? Which vmap APIs work with this and
>>> which don't?
>>
>> There are two vmap trees at the moment: the free_area_ tree and the
>> vmap_area_ tree. The free_area_ tree uses vmap->subtree_max_size,
>> while the vmap_area_ tree contains vmaps backed by vm_struct, and
>> thus uses vmap->vm.
>>
>> This set adds a new tree, free_text_area_. This tree is different to
>> the other two, as it uses subtree_max_size, and it is also backed
>> by vm_struct. To handle this requirement without growing vmap_struct,
>> we introduced all_text_vm to store the vm_struct for the
>> free_text_area_ tree.
>>
>> The free_text_area_ tree is different to the vmap_area_ tree. Each
>> vmap in the vmap_area_ tree has its own vm_struct (1 to 1 mapping),
>> while multiple vmaps in the free_text_area_ tree map to a single
>> vm_struct.
>>
>> Also, free_text_area_ handles granularity < PAGE_SIZE, while the
>> other two trees only work with PAGE_SIZE aligned memory.
>>
>> Does this answer your questions?
>
> I mean from the perspective of someone trying to use this without
> diving into the entire implementation.
>
> The function is called vmalloc_exec() and is freed with vfree_exec().
> Makes sense. But with the other vmalloc_foo's (including previous
> vmalloc_exec() implementations) you can call find_vm_area(), etc on
> them. They show in "vmallocinfo" and generally behave similarly. That
> isn't true for these new allocations, right?

That's right. These operations are not supported (at least for now).

>
> Then you have code that operates on module text like:
> 	if (is_vmalloc_or_module_addr(addr))
> 		pfn = vmalloc_to_pfn(addr);
>
> It looks like it would work (on x86 at least). Should it be expected
> to?
>
> Especially after this patch, where there is memory that isn't even
> tracked by the original vmap_area trees, it is pretty much a separate
> allocator. So I think it might be nice to spell out which other vmalloc
> APIs work with these new functions since they are named "vmalloc".
> Maybe just say none of them do.

I guess it is fair to call this a separate allocator. Maybe
vmalloc_exec is not the right name? I do think this is the best way to
build an allocator with the vmap tree logic.

>
> Separate from that, I guess you are planning to make this limited to
> certain architectures? It might be better to put logic with
> assumptions about x86 boot time page table details inside arch/x86
> somewhere.

Yes, the architecture needs some text_poke mechanism to use this. On
the BPF side, x86_64 calls this directly from arch code (the JIT
engine), so it is mostly covered. For modules, we need to handle this
better.

Thanks,
Song
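For readers trying to picture the layout Song describes, here is a
rough sketch of how one vmap_area serves both roles. The union matches
struct vmap_area in mm/vmalloc.c of this era; the comments about
free_text_area_ summarize the discussion above and are illustrative
rather than copied from the patches:

#include <linux/rbtree.h>
#include <linux/list.h>

struct vm_struct;

/* Illustration only; field names follow mm/vmalloc.c of this era. */
struct vmap_area {
	unsigned long va_start;
	unsigned long va_end;

	struct rb_node rb_node;		/* address-sorted rbtree */
	struct list_head list;		/* address-sorted list */

	union {
		unsigned long subtree_max_size;	/* free_area_ / free_text_area_ trees */
		struct vm_struct *vm;		/* vmap_area_ (busy) tree, 1:1 */
	};
};

/*
 * free_text_area_ needs both union members at once: it is a "free"
 * tree (so the union slot holds subtree_max_size), yet every range
 * still belongs to some vm_struct. Hence the vm_struct is kept on the
 * side, many vmap_areas in the tree can map to one vm_struct, and
 * ranges can be carved out at sub-PAGE_SIZE granularity.
 */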
On Wed, 2022-10-12 at 05:37 +0000, Song Liu wrote:
> > Then you have code that operates on module text like:
> > 	if (is_vmalloc_or_module_addr(addr))
> > 		pfn = vmalloc_to_pfn(addr);
> >
> > It looks like it would work (on x86 at least). Should it be
> > expected to?
> >
> > Especially after this patch, where there is memory that isn't even
> > tracked by the original vmap_area trees, it is pretty much a
> > separate allocator. So I think it might be nice to spell out which
> > other vmalloc APIs work with these new functions since they are
> > named "vmalloc". Maybe just say none of them do.
>
> I guess it is fair to call this a separate allocator. Maybe
> vmalloc_exec is not the right name? I do think this is the best way
> to build an allocator with the vmap tree logic.

Yea, I don't know about the name. I think someone else suggested it
specifically, right?

I had called mine perm_alloc() so it could also handle read-only and
other permissions. If you keep vmalloc_exec() it needs some big
comments about which APIs can work with it, and an audit of the
existing code that works on module and JIT text.

>
> > Separate from that, I guess you are planning to make this limited
> > to certain architectures? It might be better to put logic with
> > assumptions about x86 boot time page table details inside arch/x86
> > somewhere.
>
> Yes, the architecture needs some text_poke mechanism to use this.

It also depends on the space between _etext and the PMD aligned _etext
to be present and not get used by anything else. For other
architectures, there might be rodata there or other things.

> On the BPF side, x86_64 calls this directly from arch code (the JIT
> engine), so it is mostly covered. For modules, we need to handle this
> better.

That old RFC has some ideas around this. I kind of like your
incremental approach though. To me it seems to be moving in the right
direction.
> On Oct 12, 2022, at 11:38 AM, Edgecombe, Rick P <rick.p.edgecombe@intel.com> wrote:
>
> On Wed, 2022-10-12 at 05:37 +0000, Song Liu wrote:
>>> Then you have code that operates on module text like:
>>> 	if (is_vmalloc_or_module_addr(addr))
>>> 		pfn = vmalloc_to_pfn(addr);
>>>
>>> It looks like it would work (on x86 at least). Should it be
>>> expected to?
>>>
>>> Especially after this patch, where there is memory that isn't even
>>> tracked by the original vmap_area trees, it is pretty much a
>>> separate allocator. So I think it might be nice to spell out which
>>> other vmalloc APIs work with these new functions since they are
>>> named "vmalloc". Maybe just say none of them do.
>>
>> I guess it is fair to call this a separate allocator. Maybe
>> vmalloc_exec is not the right name? I do think this is the best way
>> to build an allocator with the vmap tree logic.
>
> Yea, I don't know about the name. I think someone else suggested it
> specifically, right?

I think Luis suggested renaming module_alloc to vmalloc_exec. But I
guess we still need module_alloc for module data allocations.

>
> I had called mine perm_alloc() so it could also handle read-only and
> other permissions.

What other permissions do we use? We can probably duplicate the
free_text_area_ tree logic for other cases.

> If you keep vmalloc_exec() it needs some big
> comments about which APIs can work with it, and an audit of the
> existing code that works on module and JIT text.
>
>>> Separate from that, I guess you are planning to make this limited
>>> to certain architectures? It might be better to put logic with
>>> assumptions about x86 boot time page table details inside arch/x86
>>> somewhere.
>>
>> Yes, the architecture needs some text_poke mechanism to use this.
>
> It also depends on the space between _etext and the PMD aligned
> _etext to be present and not get used by anything else. For other
> architectures, there might be rodata there or other things.

Good point! We need to make sure this part is not used by other things.

>
>> On the BPF side, x86_64 calls this directly from arch code (the JIT
>> engine), so it is mostly covered. For modules, we need to handle
>> this better.
>
> That old RFC has some ideas around this. I kind of like your
> incremental approach though. To me it seems to be moving in the right
> direction.

Thanks!
Song
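To make the text_poke dependency concrete, a JIT-style user of the
allocator under discussion would look roughly like this. It is a
sketch: the vmalloc_exec(size, align)/vfree_exec(addr) signatures are
assumed from this series, text_poke_copy() is the x86 write mechanism
mentioned above, and error handling is minimal:

#include <linux/types.h>
#include <linux/vmalloc.h>
#include <asm/text-patching.h>	/* text_poke_copy() on x86 */

/* Rough usage sketch; not verbatim from the series. */
static void *jit_publish(const u8 *insns, size_t len)
{
	void *dst;

	/* Carve a sub-page-granularity range out of a shared RO+X 2MB page. */
	dst = vmalloc_exec(len, 64);
	if (!dst)
		return NULL;

	/*
	 * The mapping is never writable, so the instructions must go in
	 * through the arch text_poke mechanism rather than a memcpy().
	 */
	text_poke_copy(dst, insns, len);

	return dst;	/* released later with vfree_exec(dst) */
}

This is also why the allocator only works on architectures with a
text_poke-like facility: there is never a writable alias to fill.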
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 0fe690ebc269..d94f196c541a 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1367,12 +1367,13 @@ int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask)
 
 int kernel_set_to_readonly;
 
+#define PMD_ALIGN(x)	(((unsigned long)(x) + (PMD_SIZE - 1)) & PMD_MASK)
 void mark_rodata_ro(void)
 {
 	unsigned long start = PFN_ALIGN(_text);
 	unsigned long rodata_start = PFN_ALIGN(__start_rodata);
 	unsigned long end = (unsigned long)__end_rodata_hpage_align;
-	unsigned long text_end = PFN_ALIGN(_etext);
+	unsigned long text_end = PMD_ALIGN(_etext);
 	unsigned long rodata_end = PFN_ALIGN(__end_rodata);
 	unsigned long all_end;
 
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 9212ff96b871..41509bbec583 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -75,6 +75,9 @@ static const bool vmap_allow_huge = false;
 #define PMD_ALIGN(addr) ALIGN(addr, PMD_SIZE)
 #define PMD_ALIGN_DOWN(addr) ALIGN_DOWN(addr, PMD_SIZE)
 
+static struct vm_struct text_tail_vm;
+static struct vmap_area text_tail_va;
+
 bool is_vmalloc_addr(const void *x)
 {
 	unsigned long addr = (unsigned long)kasan_reset_tag(x);
@@ -637,6 +640,8 @@ int is_vmalloc_or_module_addr(const void *x)
 	unsigned long addr = (unsigned long)kasan_reset_tag(x);
 	if (addr >= MODULES_VADDR && addr < MODULES_END)
 		return 1;
+	if (addr >= text_tail_va.va_start && addr < text_tail_va.va_end)
+		return 1;
 #endif
 	return is_vmalloc_addr(x);
 }
@@ -2422,6 +2427,24 @@ static void vmap_init_free_space(void)
 	}
 }
 
+static void register_text_tail_vm(void)
+{
+	unsigned long start = PFN_ALIGN((unsigned long)_etext);
+	unsigned long end = PMD_ALIGN((unsigned long)_etext);
+	struct vmap_area *va;
+
+	va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
+	if (WARN_ON_ONCE(!va))
+		return;
+	text_tail_vm.addr = (void *)start;
+	text_tail_vm.size = end - start;
+	text_tail_va.va_start = start;
+	text_tail_va.va_end = end;
+	text_tail_va.vm = &text_tail_vm;
+	memcpy(va, &text_tail_va, sizeof(*va));
+	insert_vmap_area_augment(va, NULL, &free_text_area_root, &free_text_area_list);
+}
+
 void __init vmalloc_init(void)
 {
 	struct vmap_area *va;
@@ -2432,6 +2455,7 @@ void __init vmalloc_init(void)
 	 * Create the cache for vmap_area objects.
 	 */
 	vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
+	register_text_tail_vm();
 
 	for_each_possible_cpu(i) {
 		struct vmap_block_queue *vbq;
On x86 kernels, we allocate 2MB pages for kernel text up to
round_down(_etext, 2MB). Therefore, some of the kernel text is still
on 4kB pages. With vmalloc_exec, we can allocate 2MB pages up to
round_up(_etext, 2MB), and use the rest of the page for modules and
BPF programs.

Here is an example:

[root@eth50-1 ~]# grep _etext /proc/kallsyms
ffffffff82202a08 T _etext

[root@eth50-1 ~]# grep bpf_prog_ /proc/kallsyms | tail -n 3
ffffffff8220f920 t bpf_prog_cc61a5364ac11d93_handle__sched_wakeup       [bpf]
ffffffff8220fa28 t bpf_prog_cc61a5364ac11d93_handle__sched_wakeup_new   [bpf]
ffffffff8220fad4 t bpf_prog_3bf73fa16f5e3d92_handle__sched_switch       [bpf]

[root@eth50-1 ~]# grep 0xffffffff82200000 /sys/kernel/debug/page_tables/kernel
0xffffffff82200000-0xffffffff82400000           2M     ro   PSE   x   pmd

[root@eth50-1 ~]# grep xfs_flush_inodes /proc/kallsyms
ffffffff822ba910 t xfs_flush_inodes_worker     [xfs]
ffffffff822bc580 t xfs_flush_inodes    [xfs]

ffffffff82200000-ffffffff82400000 is a 2MB page, serving kernel text,
xfs module, and bpf programs.

Signed-off-by: Song Liu <song@kernel.org>
---
 arch/x86/mm/init_64.c |  3 ++-
 mm/vmalloc.c          | 24 ++++++++++++++++++++++++
 2 files changed, 26 insertions(+), 1 deletion(-)
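Putting the example numbers together: with this boot's _etext, the tail
range registered by register_text_tail_vm() spans PFN_ALIGN(_etext) to
PMD_ALIGN(_etext), and the xfs and BPF symbols above land inside it.
The following sketch mirrors the range check the patch adds to
is_vmalloc_or_module_addr(), evaluated against addresses copied from
the kallsyms output (illustration only; the bounds are what the patch
computes for this particular boot):

/* Bounds for this particular boot (illustration only). */
static const unsigned long tail_start = 0xffffffff82203000UL; /* PFN_ALIGN(_etext) */
static const unsigned long tail_end   = 0xffffffff82400000UL; /* PMD_ALIGN(_etext) */

/* Mirrors the check added to is_vmalloc_or_module_addr(). */
static int in_text_tail(unsigned long addr)
{
	return addr >= tail_start && addr < tail_end;
}

/*
 * in_text_tail(0xffffffff822bc580) -> 1  (xfs_flush_inodes, in the tail)
 * in_text_tail(0xffffffff8220f920) -> 1  (BPF prog, in the tail)
 * in_text_tail(0xffffffff82202a08) -> 0  (_etext itself, core kernel text)
 */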