diff mbox series

[v5,7/8] execmem: add support for cache of large ROX pages

Message ID 20241009180816.83591-8-rppt@kernel.org (mailing list archive)
State Superseded
Headers show
Series x86/module: use large ROX pages for text allocations | expand

Checks

Context Check Description
netdev/tree_selection success Not a local patch, async
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-17 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-18 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc

Commit Message

Mike Rapoport Oct. 9, 2024, 6:08 p.m. UTC
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>

Using large pages to map text areas reduces iTLB pressure and improves
performance.

Extend execmem_alloc() with an ability to use huge pages with ROX
permissions as a cache for smaller allocations.

To populate the cache, a writable large page is allocated from vmalloc with
VM_ALLOW_HUGE_VMAP, filled with invalid instructions and then remapped as
ROX.

Portions of that large page are handed out to execmem_alloc() callers
without any changes to the permissions.

When the memory is freed with execmem_free() it is invalidated again so
that it won't contain stale instructions.

The cache is enabled when an architecture sets EXECMEM_ROX_CACHE flag in
definition of an execmem_range.

Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
---
 include/linux/execmem.h |   2 +
 mm/execmem.c            | 317 +++++++++++++++++++++++++++++++++++++++-
 mm/internal.h           |   1 +
 mm/vmalloc.c            |   5 +
 4 files changed, 320 insertions(+), 5 deletions(-)

Comments

Andrew Morton Oct. 9, 2024, 8:24 p.m. UTC | #1
On Wed,  9 Oct 2024 21:08:15 +0300 Mike Rapoport <rppt@kernel.org> wrote:

> Using large pages to map text areas reduces iTLB pressure and improves
> performance.

Are there any measurable performance improvements?

What are the effects of this series upon overall memory consumption?

The lack of acks is a bit surprising for a v5 patch, but I'll add all
this to mm.git for some testing, thanks.
Christoph Hellwig Oct. 10, 2024, 6:58 a.m. UTC | #2
On Wed, Oct 09, 2024 at 09:08:15PM +0300, Mike Rapoport wrote:
>  /**
>   * struct execmem_info - architecture parameters for code allocations
> + * @fill_trapping_insns: set memory to contain instructions that will trap
>   * @ranges: array of parameter sets defining architecture specific
>   * parameters for executable memory allocations. The ranges that are not
>   * explicitly initialized by an architecture use parameters defined for
>   * @EXECMEM_DEFAULT.
>   */
>  struct execmem_info {
> +	void (*fill_trapping_insns)(void *ptr, size_t size, bool writable);
>  	struct execmem_range	ranges[EXECMEM_TYPE_MAX];

Why is the filler an indirect function call and not an architecture
hook?
Mike Rapoport Oct. 10, 2024, 9:29 a.m. UTC | #3
On Wed, Oct 09, 2024 at 01:24:27PM -0700, Andrew Morton wrote:
> On Wed,  9 Oct 2024 21:08:15 +0300 Mike Rapoport <rppt@kernel.org> wrote:
> 
> > Using large pages to map text areas reduces iTLB pressure and improves
> > performance.
> 
> Are there any measurable performance improvements?

I don't have any numbers; I just followed the common sense of "fewer TLB
entries is better" and relied on Thomas' comments from previous discussions.
 
> What are the effects of this series upon overall memory consumption?
 
There will be some execmem cache fragmentation and an increase in memory
consumption. It depends on the actual modules loaded and how large the
fragmentation is.

For a set of pretty randomly chosen modules where most come from
net/netfilter I see an increase from 19M to 25M.

> The lack of acks is a bit surprising for a v5 patch, but I'll add all
> this to mm.git for some testing, thanks.
>
Mike Rapoport Oct. 10, 2024, 12:57 p.m. UTC | #4
On Wed, Oct 09, 2024 at 11:58:33PM -0700, Christoph Hellwig wrote:
> On Wed, Oct 09, 2024 at 09:08:15PM +0300, Mike Rapoport wrote:
> >  /**
> >   * struct execmem_info - architecture parameters for code allocations
> > + * @fill_trapping_insns: set memory to contain instructions that will trap
> >   * @ranges: array of parameter sets defining architecture specific
> >   * parameters for executable memory allocations. The ranges that are not
> >   * explicitly initialized by an architecture use parameters defined for
> >   * @EXECMEM_DEFAULT.
> >   */
> >  struct execmem_info {
> > +	void (*fill_trapping_insns)(void *ptr, size_t size, bool writable);
> >  	struct execmem_range	ranges[EXECMEM_TYPE_MAX];
> 
> Why is the filler an indirect function call and not an architecture
> hook?

The idea is to keep everything together and have execmem_info describe all
that architecture needs.
Kees Bakker Oct. 10, 2024, 6:35 p.m. UTC | #5
Op 09-10-2024 om 20:08 schreef Mike Rapoport:
> From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
>
> Using large pages to map text areas reduces iTLB pressure and improves
> performance.
>
> Extend execmem_alloc() with an ability to use huge pages with ROX
> permissions as a cache for smaller allocations.
>
> To populate the cache, a writable large page is allocated from vmalloc with
> VM_ALLOW_HUGE_VMAP, filled with invalid instructions and then remapped as
> ROX.
>
> Portions of that large page are handed out to execmem_alloc() callers
> without any changes to the permissions.
>
> When the memory is freed with execmem_free() it is invalidated again so
> that it won't contain stale instructions.
>
> The cache is enabled when an architecture sets EXECMEM_ROX_CACHE flag in
> definition of an execmem_range.
>
> Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> ---
>   include/linux/execmem.h |   2 +
>   mm/execmem.c            | 317 +++++++++++++++++++++++++++++++++++++++-
>   mm/internal.h           |   1 +
>   mm/vmalloc.c            |   5 +
>   4 files changed, 320 insertions(+), 5 deletions(-)
> [...]
> +static void execmem_cache_clean(struct work_struct *work)
> +{
> +	struct maple_tree *free_areas = &execmem_cache.free_areas;
> +	struct mutex *mutex = &execmem_cache.mutex;
> +	MA_STATE(mas, free_areas, 0, ULONG_MAX);
> +	void *area;
> +
> +	mutex_lock(mutex);
> +	mas_for_each(&mas, area, ULONG_MAX) {
> +		size_t size;
> +
No need to check for !area here, because it is already guaranteed by the
while loop condition (mas_for_each).
> +		if (!area)
> +			continue;
> +
> +		size = mas_range_len(&mas);
> +
> +		if (IS_ALIGNED(size, PMD_SIZE) &&
> +		    IS_ALIGNED(mas.index, PMD_SIZE)) {
> +			struct vm_struct *vm = find_vm_area(area);
> +
> +			execmem_set_direct_map_valid(vm, true);
> +			mas_store_gfp(&mas, NULL, GFP_KERNEL);
> +			vfree(area);
> +		}
> +	}
> +	mutex_unlock(mutex);
> +}
>
Christoph Hellwig Oct. 11, 2024, 7:46 a.m. UTC | #6
On Thu, Oct 10, 2024 at 03:57:33PM +0300, Mike Rapoport wrote:
> On Wed, Oct 09, 2024 at 11:58:33PM -0700, Christoph Hellwig wrote:
> > On Wed, Oct 09, 2024 at 09:08:15PM +0300, Mike Rapoport wrote:
> > >  /**
> > >   * struct execmem_info - architecture parameters for code allocations
> > > + * @fill_trapping_insns: set memory to contain instructions that will trap
> > >   * @ranges: array of parameter sets defining architecture specific
> > >   * parameters for executable memory allocations. The ranges that are not
> > >   * explicitly initialized by an architecture use parameters defined for
> > >   * @EXECMEM_DEFAULT.
> > >   */
> > >  struct execmem_info {
> > > +	void (*fill_trapping_insns)(void *ptr, size_t size, bool writable);
> > >  	struct execmem_range	ranges[EXECMEM_TYPE_MAX];
> > 
> > Why is the filler an indirect function call and not an architecture
> > hook?
> 
> The idea is to keep everything together and have execmem_info describe all
> that architecture needs. 

But why?  That's pretty different from our normal style of arch hooks,
and introduces an indirect call in a security sensitive area.
Mike Rapoport Oct. 13, 2024, 8:43 a.m. UTC | #7
On Fri, Oct 11, 2024 at 12:46:23AM -0700, Christoph Hellwig wrote:
> On Thu, Oct 10, 2024 at 03:57:33PM +0300, Mike Rapoport wrote:
> > On Wed, Oct 09, 2024 at 11:58:33PM -0700, Christoph Hellwig wrote:
> > > On Wed, Oct 09, 2024 at 09:08:15PM +0300, Mike Rapoport wrote:
> > > >  /**
> > > >   * struct execmem_info - architecture parameters for code allocations
> > > > + * @fill_trapping_insns: set memory to contain instructions that will trap
> > > >   * @ranges: array of parameter sets defining architecture specific
> > > >   * parameters for executable memory allocations. The ranges that are not
> > > >   * explicitly initialized by an architecture use parameters defined for
> > > >   * @EXECMEM_DEFAULT.
> > > >   */
> > > >  struct execmem_info {
> > > > +	void (*fill_trapping_insns)(void *ptr, size_t size, bool writable);
> > > >  	struct execmem_range	ranges[EXECMEM_TYPE_MAX];
> > > 
> > > Why is the filler an indirect function call and not an architecture
> > > hook?
> > 
> > The idea is to keep everything together and have execmem_info describe all
> > that architecture needs. 
> 
> But why?  That's pretty different from our normal style of arch hooks,
> and introduces an indirect call in a security sensitive area.

Will change to __weak hook.
Andrew Morton Oct. 14, 2024, 3:26 a.m. UTC | #8
On Sun, 13 Oct 2024 11:43:41 +0300 Mike Rapoport <rppt@kernel.org> wrote:

> > > The idea is to keep everything together and have execmem_info describe all
> > > that architecture needs. 
> > 
> > But why?  That's pretty different from our normal style of arch hooks,
> > and introduces an indirect call in a security sensitive area.
> 
> Will change to __weak hook. 
> 

Thanks, I'll drop the v1 series;

The todos which I collected are:

https://lkml.kernel.org/r/CAPhsuW66etfdU3Fvk0KsELXcgWD6_TkBFjJ-BTHQu5OejDsP2w@mail.gmail.com
https://lkml.kernel.org/r/Zwd6vH0rz0PVedLI@infradead.org
https://lkml.kernel.org/r/ZwjXz0dz-RldVNx0@infradead.org
https://lkml.kernel.org/r/202410111408.8fe6f604-lkp@intel.com
Christoph Hellwig Oct. 14, 2024, 5:55 a.m. UTC | #9
On Sun, Oct 13, 2024 at 11:43:41AM +0300, Mike Rapoport wrote:
> > But why?  That's pretty different from our normal style of arch hooks,
> > and introduces an indirect call in a security sensitive area.
> 
> Will change to __weak hook. 

Isn't the callback required when using the large ROX page?  I.e.
shouldn't it be an unconditional callback and not a weak override?
Mike Rapoport Oct. 14, 2024, 1:07 p.m. UTC | #10
On Sun, Oct 13, 2024 at 10:55:25PM -0700, Christoph Hellwig wrote:
> On Sun, Oct 13, 2024 at 11:43:41AM +0300, Mike Rapoport wrote:
> > > But why?  That's pretty different from our normal style of arch hooks,
> > > and introduces an indirect call in a security sensitive area.
> > 
> > Will change to __weak hook. 
> 
> Isn't the callback required when using the large ROX page?  I.e.
> shouldn't it be an unconditional callback and not a weak override?

I'll add a Kconfig option to ensure that an architecture that wants to use
large ROX pages has explicit callback for that.
Luis Chamberlain Oct. 14, 2024, 7:16 p.m. UTC | #11
On Sun, Oct 13, 2024 at 08:26:26PM -0700, Andrew Morton wrote:
> On Sun, 13 Oct 2024 11:43:41 +0300 Mike Rapoport <rppt@kernel.org> wrote:
> 
> > > > The idea is to keep everything together and have execmem_info describe all
> > > > that architecture needs. 
> > > 
> > > But why?  That's pretty different from our normal style of arch hooks,
> > > and introduces an indirect call in a security sensitive area.
> > 
> > Will change to __weak hook. 
> > 
> 
> Thanks, I'll drop the v1 series;
> 
> The todos which I collected are:
> 
> https://lkml.kernel.org/r/CAPhsuW66etfdU3Fvk0KsELXcgWD6_TkBFjJ-BTHQu5OejDsP2w@mail.gmail.com
> https://lkml.kernel.org/r/Zwd6vH0rz0PVedLI@infradead.org
> https://lkml.kernel.org/r/ZwjXz0dz-RldVNx0@infradead.org
> https://lkml.kernel.org/r/202410111408.8fe6f604-lkp@intel.com

BTW Andrew, I'd like to pick this up through the modules tree and, while
at it, beat it up with some more testing, as we're also expanding
with the modversions stuff for Rust modules.

  Luis
Luis Chamberlain Oct. 15, 2024, 4:09 a.m. UTC | #12
Mike, please run this with kmemleak enabled and running, and also try to get
tools/testing/selftests/kmod/kmod.sh to pass. I run into silly boot issues
with just a guest.

  Luis
Mike Rapoport Oct. 15, 2024, 5:54 a.m. UTC | #13
On Mon, Oct 14, 2024 at 09:09:49PM -0700, Luis Chamberlain wrote:
> Mike, please run this with kmemleak enabled and running, and also try to get
> tools/testing/selftests/kmod/kmod.sh to pass.

There was an issue with kmemleak, I fixed it here:

https://lore.kernel.org/linux-mm/20241009180816.83591-1-rppt@kernel.org/T/#m020884c1795218cc2be245e8091fead1cda3f3e4

> I run into silly boot issues with just a guest.

Was it kmemleak or something else?
 
>   Luis
Luis Chamberlain Oct. 15, 2024, 8:11 p.m. UTC | #14
On Tue, Oct 15, 2024 at 08:54:29AM +0300, Mike Rapoport wrote:
> On Mon, Oct 14, 2024 at 09:09:49PM -0700, Luis Chamberlain wrote:
> > Mike, please run this with kmemleak enabled and running, and also try to get
> > tools/testing/selftests/kmod/kmod.sh to pass.
> 
> There was an issue with kmemleak, I fixed it here:
> 
> https://lore.kernel.org/linux-mm/20241009180816.83591-1-rppt@kernel.org/T/#m020884c1795218cc2be245e8091fead1cda3f3e4

Ah, so this was a side fix, not part of this series, thanks.

> > I run into silly boot issues with just a guest.
> 
> Was it kmemleak or something else?

Both kmemleak and the kmod selftest failed, here is a run of the test
with this patch series:

https://github.com/linux-kdevops/linux-modules-kpd/actions/runs/11352286624/job/31574722735

We now have automated tests generated when people post patches to
linux-modules, but if you give me your github username you can push
onto the linux-kdevops/linux-modules-kpd [0] repo a random branch once you
have it ready, just cp -a the linux-ci-modules/.github [1] directory onto
your branch before a push and that'll trigger a test run (you need to
git add -f .github on your Linux branch) with our self-hosted runners.

[0] https://github.com/linux-kdevops/linux-modules-kpd
[1] https://github.com/linux-kdevops/kdevops-ci-modules

  Luis
Mike Rapoport Oct. 16, 2024, 10:40 a.m. UTC | #15
On Tue, Oct 15, 2024 at 01:11:54PM -0700, Luis Chamberlain wrote:
> On Tue, Oct 15, 2024 at 08:54:29AM +0300, Mike Rapoport wrote:
> > On Mon, Oct 14, 2024 at 09:09:49PM -0700, Luis Chamberlain wrote:
> > > Mike, please run this with kmemleak enabled and running, and also try to get
> > > tools/testing/selftests/kmod/kmod.sh to pass.
> > 
> > There was an issue with kmemleak, I fixed it here:
> > 
> > https://lore.kernel.org/linux-mm/20241009180816.83591-1-rppt@kernel.org/T/#m020884c1795218cc2be245e8091fead1cda3f3e4
> 
> Ah, so this was a side fix, not part of this series, thanks.
> 
> > > I run into silly boot issues with just a guest.
> > 
> > Was it kmemleak or something else?
> 
> Both kmemleak and the kmod selftest failed, here is a run of the test
> with this patch series:
> 
> https://github.com/linux-kdevops/linux-modules-kpd/actions/runs/11352286624/job/31574722735

Is there a kernel log to look at? Could not find it in the run report
 
>   Luis
Luis Chamberlain Oct. 16, 2024, 11:58 p.m. UTC | #16
On Wed, Oct 16, 2024 at 01:40:55PM +0300, Mike Rapoport wrote:
> On Tue, Oct 15, 2024 at 01:11:54PM -0700, Luis Chamberlain wrote:
> > On Tue, Oct 15, 2024 at 08:54:29AM +0300, Mike Rapoport wrote:
> > > On Mon, Oct 14, 2024 at 09:09:49PM -0700, Luis Chamberlain wrote:
> > > > Mike, please run this with kmemleak enabled and running, and also try to get
> > > > tools/testing/selftests/kmod/kmod.sh to pass.
> > > 
> > > There was an issue with kmemleak, I fixed it here:
> > > 
> > > https://lore.kernel.org/linux-mm/20241009180816.83591-1-rppt@kernel.org/T/#m020884c1795218cc2be245e8091fead1cda3f3e4
> > 
> > Ah, so this was a side fix, not part of this series, thanks.
> > 
> > > > I run into silly boot issues with just a guest.
> > > 
> > > Was it kmemleak or something else?
> > 
> > Both kmemleak and the kmod selftest failed, here is a run of the test
> > with this patch series:
> > 
> > https://github.com/linux-kdevops/linux-modules-kpd/actions/runs/11352286624/job/31574722735
> 
> Is there a kernel log to look at? Could not find it in the run report

No, I forgot to include the guestfs console on artifacts, I'll do that
in the next run.

  Luis
diff mbox series

Patch

diff --git a/include/linux/execmem.h b/include/linux/execmem.h
index dfdf19f8a5e8..7436aa547818 100644
--- a/include/linux/execmem.h
+++ b/include/linux/execmem.h
@@ -77,12 +77,14 @@  struct execmem_range {
 
 /**
  * struct execmem_info - architecture parameters for code allocations
+ * @fill_trapping_insns: set memory to contain instructions that will trap
  * @ranges: array of parameter sets defining architecture specific
  * parameters for executable memory allocations. The ranges that are not
  * explicitly initialized by an architecture use parameters defined for
  * @EXECMEM_DEFAULT.
  */
 struct execmem_info {
+	void (*fill_trapping_insns)(void *ptr, size_t size, bool writable);
 	struct execmem_range	ranges[EXECMEM_TYPE_MAX];
 };
 
diff --git a/mm/execmem.c b/mm/execmem.c
index 0f6691e9ffe6..9c6ff9687860 100644
--- a/mm/execmem.c
+++ b/mm/execmem.c
@@ -7,28 +7,109 @@ 
  */
 
 #include <linux/mm.h>
+#include <linux/mutex.h>
 #include <linux/vmalloc.h>
 #include <linux/execmem.h>
+#include <linux/maple_tree.h>
+#include <linux/set_memory.h>
 #include <linux/moduleloader.h>
 #include <linux/text-patching.h>
 
+#include <asm/tlbflush.h>
+
+#include "internal.h"
+
 static struct execmem_info *execmem_info __ro_after_init;
 static struct execmem_info default_execmem_info __ro_after_init;
 
-static void *__execmem_alloc(struct execmem_range *range, size_t size)
+#ifdef CONFIG_MMU
+struct execmem_cache {
+	struct mutex mutex;
+	struct maple_tree busy_areas;
+	struct maple_tree free_areas;
+};
+
+static struct execmem_cache execmem_cache = {
+	.mutex = __MUTEX_INITIALIZER(execmem_cache.mutex),
+	.busy_areas = MTREE_INIT_EXT(busy_areas, MT_FLAGS_LOCK_EXTERN,
+				     execmem_cache.mutex),
+	.free_areas = MTREE_INIT_EXT(free_areas, MT_FLAGS_LOCK_EXTERN,
+				     execmem_cache.mutex),
+};
+
+static inline unsigned long mas_range_len(struct ma_state *mas)
+{
+	return mas->last - mas->index + 1;
+}
+
+static int execmem_set_direct_map_valid(struct vm_struct *vm, bool valid)
+{
+	unsigned int nr = (1 << get_vm_area_page_order(vm));
+	unsigned int updated = 0;
+	int err = 0;
+
+	for (int i = 0; i < vm->nr_pages; i += nr) {
+		err = set_direct_map_valid_noflush(vm->pages[i], nr, valid);
+		if (err)
+			goto err_restore;
+		updated += nr;
+	}
+
+	return 0;
+
+err_restore:
+	for (int i = 0; i < updated; i += nr)
+		set_direct_map_valid_noflush(vm->pages[i], nr, !valid);
+
+	return err;
+}
+
+static void execmem_cache_clean(struct work_struct *work)
+{
+	struct maple_tree *free_areas = &execmem_cache.free_areas;
+	struct mutex *mutex = &execmem_cache.mutex;
+	MA_STATE(mas, free_areas, 0, ULONG_MAX);
+	void *area;
+
+	mutex_lock(mutex);
+	mas_for_each(&mas, area, ULONG_MAX) {
+		size_t size;
+
+		if (!area)
+			continue;
+
+		size = mas_range_len(&mas);
+
+		if (IS_ALIGNED(size, PMD_SIZE) &&
+		    IS_ALIGNED(mas.index, PMD_SIZE)) {
+			struct vm_struct *vm = find_vm_area(area);
+
+			execmem_set_direct_map_valid(vm, true);
+			mas_store_gfp(&mas, NULL, GFP_KERNEL);
+			vfree(area);
+		}
+	}
+	mutex_unlock(mutex);
+}
+
+static DECLARE_WORK(execmem_cache_clean_work, execmem_cache_clean);
+
+static void *execmem_vmalloc(struct execmem_range *range, size_t size,
+			     pgprot_t pgprot, unsigned long vm_flags)
 {
 	bool kasan = range->flags & EXECMEM_KASAN_SHADOW;
-	unsigned long vm_flags  = VM_FLUSH_RESET_PERMS;
 	gfp_t gfp_flags = GFP_KERNEL | __GFP_NOWARN;
+	unsigned int align = range->alignment;
 	unsigned long start = range->start;
 	unsigned long end = range->end;
-	unsigned int align = range->alignment;
-	pgprot_t pgprot = range->pgprot;
 	void *p;
 
 	if (kasan)
 		vm_flags |= VM_DEFER_KMEMLEAK;
 
+	if (vm_flags & VM_ALLOW_HUGE_VMAP)
+		align = PMD_SIZE;
+
 	p = __vmalloc_node_range(size, align, start, end, gfp_flags,
 				 pgprot, vm_flags, NUMA_NO_NODE,
 				 __builtin_return_address(0));
@@ -50,8 +131,224 @@  static void *__execmem_alloc(struct execmem_range *range, size_t size)
 		return NULL;
 	}
 
+	return p;
+}
+
+static int execmem_cache_add(void *ptr, size_t size)
+{
+	struct maple_tree *free_areas = &execmem_cache.free_areas;
+	struct mutex *mutex = &execmem_cache.mutex;
+	unsigned long addr = (unsigned long)ptr;
+	MA_STATE(mas, free_areas, addr - 1, addr + 1);
+	unsigned long lower, upper;
+	void *area = NULL;
+	int err;
+
+	lower = addr;
+	upper = addr + size - 1;
+
+	mutex_lock(mutex);
+	area = mas_walk(&mas);
+	if (area && mas.last == addr - 1)
+		lower = mas.index;
+
+	area = mas_next(&mas, ULONG_MAX);
+	if (area && mas.index == addr + size)
+		upper = mas.last;
+
+	mas_set_range(&mas, lower, upper);
+	err = mas_store_gfp(&mas, (void *)lower, GFP_KERNEL);
+	mutex_unlock(mutex);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+static bool within_range(struct execmem_range *range, struct ma_state *mas,
+			 size_t size)
+{
+	unsigned long addr = mas->index;
+
+	if (addr >= range->start && addr + size < range->end)
+		return true;
+
+	if (range->fallback_start &&
+	    addr >= range->fallback_start && addr + size < range->fallback_end)
+		return true;
+
+	return false;
+}
+
+static void *__execmem_cache_alloc(struct execmem_range *range, size_t size)
+{
+	struct maple_tree *free_areas = &execmem_cache.free_areas;
+	struct maple_tree *busy_areas = &execmem_cache.busy_areas;
+	MA_STATE(mas_free, free_areas, 0, ULONG_MAX);
+	MA_STATE(mas_busy, busy_areas, 0, ULONG_MAX);
+	struct mutex *mutex = &execmem_cache.mutex;
+	unsigned long addr, last, area_size = 0;
+	void *area, *ptr = NULL;
+	int err;
+
+	mutex_lock(mutex);
+	mas_for_each(&mas_free, area, ULONG_MAX) {
+		area_size = mas_range_len(&mas_free);
+
+		if (area_size >= size && within_range(range, &mas_free, size))
+			break;
+	}
+
+	if (area_size < size)
+		goto out_unlock;
+
+	addr = mas_free.index;
+	last = mas_free.last;
+
+	/* insert allocated size to busy_areas at range [addr, addr + size) */
+	mas_set_range(&mas_busy, addr, addr + size - 1);
+	err = mas_store_gfp(&mas_busy, (void *)addr, GFP_KERNEL);
+	if (err)
+		goto out_unlock;
+
+	mas_store_gfp(&mas_free, NULL, GFP_KERNEL);
+	if (area_size > size) {
+		void *ptr = (void *)(addr + size);
+
+		/*
+		 * re-insert remaining free size to free_areas at range
+		 * [addr + size, last]
+		 */
+		mas_set_range(&mas_free, addr + size, last);
+		err = mas_store_gfp(&mas_free, ptr, GFP_KERNEL);
+		if (err) {
+			mas_store_gfp(&mas_busy, NULL, GFP_KERNEL);
+			goto out_unlock;
+		}
+	}
+	ptr = (void *)addr;
+
+out_unlock:
+	mutex_unlock(mutex);
+	return ptr;
+}
+
+static int execmem_cache_populate(struct execmem_range *range, size_t size)
+{
+	unsigned long vm_flags = VM_ALLOW_HUGE_VMAP;
+	unsigned long start, end;
+	struct vm_struct *vm;
+	size_t alloc_size;
+	int err = -ENOMEM;
+	void *p;
+
+	alloc_size = round_up(size, PMD_SIZE);
+	p = execmem_vmalloc(range, alloc_size, PAGE_KERNEL, vm_flags);
+	if (!p)
+		return err;
+
+	vm = find_vm_area(p);
+	if (!vm)
+		goto err_free_mem;
+
+	/* fill memory with instructions that will trap */
+	execmem_info->fill_trapping_insns(p, alloc_size, /* writable = */ true);
+
+	start = (unsigned long)p;
+	end = start + alloc_size;
+
+	vunmap_range(start, end);
+
+	err = execmem_set_direct_map_valid(vm, false);
+	if (err)
+		goto err_free_mem;
+
+	err = vmap_pages_range_noflush(start, end, range->pgprot, vm->pages,
+				       PMD_SHIFT);
+	if (err)
+		goto err_free_mem;
+
+	err = execmem_cache_add(p, alloc_size);
+	if (err)
+		goto err_free_mem;
+
+	return 0;
+
+err_free_mem:
+	vfree(p);
+	return err;
+}
+
+static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
+{
+	void *p;
+	int err;
+
+	p = __execmem_cache_alloc(range, size);
+	if (p)
+		return p;
+
+	err = execmem_cache_populate(range, size);
+	if (err)
+		return NULL;
+
+	return __execmem_cache_alloc(range, size);
+}
+
+static bool execmem_cache_free(void *ptr)
+{
+	struct maple_tree *busy_areas = &execmem_cache.busy_areas;
+	struct mutex *mutex = &execmem_cache.mutex;
+	unsigned long addr = (unsigned long)ptr;
+	MA_STATE(mas, busy_areas, addr, addr);
+	size_t size;
+	void *area;
+
+	mutex_lock(mutex);
+	area = mas_walk(&mas);
+	if (!area) {
+		mutex_unlock(mutex);
+		return false;
+	}
+	size = mas_range_len(&mas);
+
+	mas_store_gfp(&mas, NULL, GFP_KERNEL);
+	mutex_unlock(mutex);
+
+	execmem_info->fill_trapping_insns(ptr, size, /* writable = */ false);
+
+	execmem_cache_add(ptr, size);
+
+	schedule_work(&execmem_cache_clean_work);
+
+	return true;
+}
+
+static void *__execmem_alloc(struct execmem_range *range, size_t size)
+{
+	bool use_cache = range->flags & EXECMEM_ROX_CACHE;
+	unsigned long vm_flags = VM_FLUSH_RESET_PERMS;
+	pgprot_t pgprot = range->pgprot;
+	void *p;
+
+	if (use_cache)
+		p = execmem_cache_alloc(range, size);
+	else
+		p = execmem_vmalloc(range, size, pgprot, vm_flags);
+
 	return kasan_reset_tag(p);
 }
+#else
+static void *__execmem_alloc(struct execmem_range *range, size_t size)
+{
+	return vmalloc(size);
+}
+
+static bool execmem_cache_free(void *ptr)
+{
+	return false;
+}
+#endif
 
 void *execmem_alloc(enum execmem_type type, size_t size)
 {
@@ -67,7 +364,9 @@  void execmem_free(void *ptr)
 	 * supported by vmalloc.
 	 */
 	WARN_ON(in_interrupt());
-	vfree(ptr);
+
+	if (!execmem_cache_free(ptr))
+		vfree(ptr);
 }
 
 void *execmem_update_copy(void *dst, const void *src, size_t size)
@@ -92,6 +391,11 @@  static bool execmem_validate(struct execmem_info *info)
 	return true;
 }
 
+static void default_fill_trapping_insns(void *ptr, size_t size, bool writable)
+{
+	memset(ptr, 0, size);
+}
+
 static void execmem_init_missing(struct execmem_info *info)
 {
 	struct execmem_range *default_range = &info->ranges[EXECMEM_DEFAULT];
@@ -112,6 +416,9 @@  static void execmem_init_missing(struct execmem_info *info)
 			r->fallback_end = default_range->fallback_end;
 		}
 	}
+
+	if (!info->fill_trapping_insns)
+		info->fill_trapping_insns = default_fill_trapping_insns;
 }
 
 struct execmem_info * __weak execmem_arch_setup(void)
diff --git a/mm/internal.h b/mm/internal.h
index 93083bbeeefa..95befbc19852 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1189,6 +1189,7 @@  size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
 void __init vmalloc_init(void);
 int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end,
                 pgprot_t prot, struct page **pages, unsigned int page_shift);
+unsigned int get_vm_area_page_order(struct vm_struct *vm);
 #else
 static inline void vmalloc_init(void)
 {
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 86b2344d7461..f340e38716c0 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3007,6 +3007,11 @@  static inline unsigned int vm_area_page_order(struct vm_struct *vm)
 #endif
 }
 
+unsigned int get_vm_area_page_order(struct vm_struct *vm)
+{
+	return vm_area_page_order(vm);
+}
+
 static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order)
 {
 #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC