[v3,7/8] execmem: add support for cache of large ROX pages

Message ID	20240909064730.3290724-8-rppt@kernel.org (mailing list archive)
State	Superseded
Headers	show Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 467011AD411; Mon, 9 Sep 2024 06:49:42 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1725864582; cv=none; b=LxCth35IgueY3mQV+H9Q4VUliQc1baz47LXOUOTpLHv35hQlhzU2oErLLWvSrT3xCShbOnLa2ZP+kWSlVeVJC3uk19rdLOpC0Fx4pvjOhEc4cZ5Usmb/jkfbZGVJ1TlUZMYNqRbCTHPm8KvSrhxrAPVv9D9mn0qJTc5OGQV968M= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1725864582; c=relaxed/simple; bh=XRujkUOcLuEkt+qMwD4jVwZR/PaUTu1Wi9+2to2sLvQ=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=NbaNNDN8Ppt3HnQaLlKJVsxQfo+A9u+UnbjzvifpgLJfc9BMWQetQ6jFbzcnhtyGPMu9udxXrbdXHX06g93dfuLo+nkmvd1Bc4ZpaIpPVKRGL+gXnOTHhuZj6mv4ZAeZGufT13YlFZaN8FAcD4kVQTbyp8CDPFqpWbaPJmIIIUg= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=hWPqduQd; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="hWPqduQd" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 2DA22C4CECE; Mon, 9 Sep 2024 06:49:26 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1725864581; bh=XRujkUOcLuEkt+qMwD4jVwZR/PaUTu1Wi9+2to2sLvQ=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=hWPqduQdyKT1EOMvkPXV2lxAgRVzzP6spdLpmtHVeIxgqF20WOMIYCQJrAdtrzn4j TfFhgAHMxLN2b1lVUjeYFV29UN5C414SWvIGCQ2GHGcTrJVmDQ3CxaFgtHZy6zxSzh JT54gUClBH6v7dTt+5Jk47Y+isSrDMNxgfaC67OCnxNjSaPIWQ5iG9wGXmUfV5pJqn ATRZpL/FxOEQxDFplXE/fpiZ1o/l891pcqj+vltX8IgnYzH1UjGZXYTJPMaJfMiyhB 
oRGlYLFbKsQi9O3f237WcQREFqWTPvaa8h6cKWgliGUUBa1RBD+X6G1sxzwX3ZsN4A xXGBanCElQpjg== From: Mike Rapoport <rppt@kernel.org> To: Andrew Morton <akpm@linux-foundation.org> Cc: Andreas Larsson <andreas@gaisler.com>, Andy Lutomirski <luto@kernel.org>, Arnd Bergmann <arnd@arndb.de>, Borislav Petkov <bp@alien8.de>, Brian Cain <bcain@quicinc.com>, Catalin Marinas <catalin.marinas@arm.com>, Christoph Hellwig <hch@infradead.org>, Christophe Leroy <christophe.leroy@csgroup.eu>, Dave Hansen <dave.hansen@linux.intel.com>, Dinh Nguyen <dinguyen@kernel.org>, Geert Uytterhoeven <geert@linux-m68k.org>, Guo Ren <guoren@kernel.org>, Helge Deller <deller@gmx.de>, Huacai Chen <chenhuacai@kernel.org>, Ingo Molnar <mingo@redhat.com>, Johannes Berg <johannes@sipsolutions.net>, John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>, Kent Overstreet <kent.overstreet@linux.dev>, "Liam R. Howlett" <Liam.Howlett@oracle.com>, Luis Chamberlain <mcgrof@kernel.org>, Mark Rutland <mark.rutland@arm.com>, Masami Hiramatsu <mhiramat@kernel.org>, Matt Turner <mattst88@gmail.com>, Max Filippov <jcmvbkbc@gmail.com>, Michael Ellerman <mpe@ellerman.id.au>, Michal Simek <monstr@monstr.eu>, Mike Rapoport <rppt@kernel.org>, Oleg Nesterov <oleg@redhat.com>, Palmer Dabbelt <palmer@dabbelt.com>, Peter Zijlstra <peterz@infradead.org>, Richard Weinberger <richard@nod.at>, Russell King <linux@armlinux.org.uk>, Song Liu <song@kernel.org>, Stafford Horne <shorne@gmail.com>, Steven Rostedt <rostedt@goodmis.org>, Thomas Bogendoerfer <tsbogend@alpha.franken.de>, Thomas Gleixner <tglx@linutronix.de>, Uladzislau Rezki <urezki@gmail.com>, Vineet Gupta <vgupta@kernel.org>, Will Deacon <will@kernel.org>, bpf@vger.kernel.org, linux-alpha@vger.kernel.org, linux-arch@vger.kernel.org, linux-arm-kernel@lists.infradead.org, linux-csky@vger.kernel.org, linux-hexagon@vger.kernel.org, linux-kernel@vger.kernel.org, linux-m68k@lists.linux-m68k.org, linux-mips@vger.kernel.org, linux-mm@kvack.org, linux-modules@vger.kernel.org, 
linux-openrisc@vger.kernel.org, linux-parisc@vger.kernel.org, linux-riscv@lists.infradead.org, linux-sh@vger.kernel.org, linux-snps-arc@lists.infradead.org, linux-trace-kernel@vger.kernel.org, linux-um@lists.infradead.org, linuxppc-dev@lists.ozlabs.org, loongarch@lists.linux.dev, sparclinux@vger.kernel.org, x86@kernel.org Subject: [PATCH v3 7/8] execmem: add support for cache of large ROX pages Date: Mon, 9 Sep 2024 09:47:29 +0300 Message-ID: <20240909064730.3290724-8-rppt@kernel.org> X-Mailer: git-send-email 2.43.0 In-Reply-To: <20240909064730.3290724-1-rppt@kernel.org> References: <20240909064730.3290724-1-rppt@kernel.org> Precedence: bulk X-Mailing-List: linux-trace-kernel@vger.kernel.org List-Id: <linux-trace-kernel.vger.kernel.org> List-Subscribe: <mailto:linux-trace-kernel+subscribe@vger.kernel.org> List-Unsubscribe: <mailto:linux-trace-kernel+unsubscribe@vger.kernel.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit
Series	x86/module: use large ROX pages for text allocations \| expand [v3,0/8] x86/module: use large ROX pages for text allocations [v3,1/8] mm: vmalloc: group declarations depending on CONFIG_MMU together [v3,2/8] mm: vmalloc: don't account for number of nodes for HUGE_VMAP allocations [v3,3/8] asm-generic: introduce text-patching.h [v3,4/8] module: prepare to handle ROX allocations for text [v3,5/8] ftrace: Add swap_func to ftrace_process_locs() [v3,6/8] x86/module: perpare module loading for ROX allocations of text [v3,7/8] execmem: add support for cache of large ROX pages [v3,8/8] x86/module: enable ROX caches for module text

Message ID

20240909064730.3290724-8-rppt@kernel.org (mailing list archive)

State

Superseded

Headers

From: Mike Rapoport <rppt@kernel.org>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: Andreas Larsson <andreas@gaisler.com>,
	Andy Lutomirski <luto@kernel.org>,
	Arnd Bergmann <arnd@arndb.de>,
	Borislav Petkov <bp@alien8.de>,
	Brian Cain <bcain@quicinc.com>,
	Catalin Marinas <catalin.marinas@arm.com>,
	Christoph Hellwig <hch@infradead.org>,
	Christophe Leroy <christophe.leroy@csgroup.eu>,
	Dave Hansen <dave.hansen@linux.intel.com>,
	Dinh Nguyen <dinguyen@kernel.org>,
	Geert Uytterhoeven <geert@linux-m68k.org>,
	Guo Ren <guoren@kernel.org>,
	Helge Deller <deller@gmx.de>,
	Huacai Chen <chenhuacai@kernel.org>,
	Ingo Molnar <mingo@redhat.com>,
	Johannes Berg <johannes@sipsolutions.net>,
	John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>,
	Kent Overstreet <kent.overstreet@linux.dev>,
	"Liam R. Howlett" <Liam.Howlett@oracle.com>,
	Luis Chamberlain <mcgrof@kernel.org>,
	Mark Rutland <mark.rutland@arm.com>,
	Masami Hiramatsu <mhiramat@kernel.org>,
	Matt Turner <mattst88@gmail.com>,
	Max Filippov <jcmvbkbc@gmail.com>,
	Michael Ellerman <mpe@ellerman.id.au>,
	Michal Simek <monstr@monstr.eu>,
	Mike Rapoport <rppt@kernel.org>,
	Oleg Nesterov <oleg@redhat.com>,
	Palmer Dabbelt <palmer@dabbelt.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Richard Weinberger <richard@nod.at>,
	Russell King <linux@armlinux.org.uk>,
	Song Liu <song@kernel.org>,
	Stafford Horne <shorne@gmail.com>,
	Steven Rostedt <rostedt@goodmis.org>,
	Thomas Bogendoerfer <tsbogend@alpha.franken.de>,
	Thomas Gleixner <tglx@linutronix.de>,
	Uladzislau Rezki <urezki@gmail.com>,
	Vineet Gupta <vgupta@kernel.org>,
	Will Deacon <will@kernel.org>,
	bpf@vger.kernel.org,
	linux-alpha@vger.kernel.org,
	linux-arch@vger.kernel.org,
	linux-arm-kernel@lists.infradead.org,
	linux-csky@vger.kernel.org,
	linux-hexagon@vger.kernel.org,
	linux-kernel@vger.kernel.org,
	linux-m68k@lists.linux-m68k.org,
	linux-mips@vger.kernel.org,
	linux-mm@kvack.org,
	linux-modules@vger.kernel.org,
	linux-openrisc@vger.kernel.org,
	linux-parisc@vger.kernel.org,
	linux-riscv@lists.infradead.org,
	linux-sh@vger.kernel.org,
	linux-snps-arc@lists.infradead.org,
	linux-trace-kernel@vger.kernel.org,
	linux-um@lists.infradead.org,
	linuxppc-dev@lists.ozlabs.org,
	loongarch@lists.linux.dev,
	sparclinux@vger.kernel.org,
	x86@kernel.org
Subject: [PATCH v3 7/8] execmem: add support for cache of large ROX pages
Date: Mon,  9 Sep 2024 09:47:29 +0300
Message-ID: <20240909064730.3290724-8-rppt@kernel.org>
In-Reply-To: <20240909064730.3290724-1-rppt@kernel.org>
References: <20240909064730.3290724-1-rppt@kernel.org>
Precedence: bulk
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit

Series

x86/module: use large ROX pages for text allocations | expand

Commit Message

Mike Rapoport Sept. 9, 2024, 6:47 a.m. UTC

From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>

Using large pages to map text areas reduces iTLB pressure and improves
performance.

Extend execmem_alloc() with an ability to use huge pages with ROX
permissions as a cache for smaller allocations.

To populate the cache, a writable large page is allocated from vmalloc with
VM_ALLOW_HUGE_VMAP, filled with invalid instructions and then remapped as
ROX.

Portions of that large page are handed out to execmem_alloc() callers
without any changes to the permissions.

When the memory is freed with execmem_free() it is invalidated again so
that it won't contain stale instructions.

The cache is enabled when an architecture sets EXECMEM_ROX_CACHE flag in
definition of an execmem_range.

Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
---
 include/linux/execmem.h |   2 +
 mm/execmem.c            | 289 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 286 insertions(+), 5 deletions(-)

Comments

Ard Biesheuvel Sept. 13, 2024, 3 p.m. UTC | #1

Hi Mike,

On Mon, 9 Sept 2024 at 08:51, Mike Rapoport <rppt@kernel.org> wrote:
>
> From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
>
> Using large pages to map text areas reduces iTLB pressure and improves
> performance.
>
> Extend execmem_alloc() with an ability to use huge pages with ROX
> permissions as a cache for smaller allocations.
>
> To populate the cache, a writable large page is allocated from vmalloc with
> VM_ALLOW_HUGE_VMAP, filled with invalid instructions and then remapped as
> ROX.
>
> Portions of that large page are handed out to execmem_alloc() callers
> without any changes to the permissions.
>
> When the memory is freed with execmem_free() it is invalidated again so
> that it won't contain stale instructions.
>
> The cache is enabled when an architecture sets EXECMEM_ROX_CACHE flag in
> definition of an execmem_range.
>
> Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> ---
>  include/linux/execmem.h |   2 +
>  mm/execmem.c            | 289 +++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 286 insertions(+), 5 deletions(-)
>
> diff --git a/include/linux/execmem.h b/include/linux/execmem.h
> index dfdf19f8a5e8..7436aa547818 100644
> --- a/include/linux/execmem.h
> +++ b/include/linux/execmem.h
> @@ -77,12 +77,14 @@ struct execmem_range {
>
>  /**
>   * struct execmem_info - architecture parameters for code allocations
> + * @fill_trapping_insns: set memory to contain instructions that will trap
>   * @ranges: array of parameter sets defining architecture specific
>   * parameters for executable memory allocations. The ranges that are not
>   * explicitly initialized by an architecture use parameters defined for
>   * @EXECMEM_DEFAULT.
>   */
>  struct execmem_info {
> +       void (*fill_trapping_insns)(void *ptr, size_t size, bool writable);
>         struct execmem_range    ranges[EXECMEM_TYPE_MAX];
>  };
>
> diff --git a/mm/execmem.c b/mm/execmem.c
> index 0f6691e9ffe6..f547c1f3c93d 100644
> --- a/mm/execmem.c
> +++ b/mm/execmem.c
> @@ -7,28 +7,88 @@
>   */
>
>  #include <linux/mm.h>
> +#include <linux/mutex.h>
>  #include <linux/vmalloc.h>
>  #include <linux/execmem.h>
> +#include <linux/maple_tree.h>
>  #include <linux/moduleloader.h>
>  #include <linux/text-patching.h>
>
> +#include <asm/tlbflush.h>
> +
> +#include "internal.h"
> +
>  static struct execmem_info *execmem_info __ro_after_init;
>  static struct execmem_info default_execmem_info __ro_after_init;
>
> -static void *__execmem_alloc(struct execmem_range *range, size_t size)
> +#ifdef CONFIG_MMU
> +struct execmem_cache {
> +       struct mutex mutex;
> +       struct maple_tree busy_areas;
> +       struct maple_tree free_areas;
> +};
> +
> +static struct execmem_cache execmem_cache = {
> +       .mutex = __MUTEX_INITIALIZER(execmem_cache.mutex),
> +       .busy_areas = MTREE_INIT_EXT(busy_areas, MT_FLAGS_LOCK_EXTERN,
> +                                    execmem_cache.mutex),
> +       .free_areas = MTREE_INIT_EXT(free_areas, MT_FLAGS_LOCK_EXTERN,
> +                                    execmem_cache.mutex),
> +};
> +
> +static void execmem_cache_clean(struct work_struct *work)
> +{
> +       struct maple_tree *free_areas = &execmem_cache.free_areas;
> +       struct mutex *mutex = &execmem_cache.mutex;
> +       MA_STATE(mas, free_areas, 0, ULONG_MAX);
> +       void *area;
> +
> +       mutex_lock(mutex);
> +       mas_for_each(&mas, area, ULONG_MAX) {
> +               size_t size;
> +
> +               if (!xa_is_value(area))
> +                       continue;
> +
> +               size = xa_to_value(area);
> +
> +               if (IS_ALIGNED(size, PMD_SIZE) &&
> +                   IS_ALIGNED(mas.index, PMD_SIZE)) {
> +                       void *ptr = (void *)mas.index;
> +
> +                       mas_erase(&mas);
> +                       vfree(ptr);
> +               }
> +       }
> +       mutex_unlock(mutex);
> +}
> +
> +static DECLARE_WORK(execmem_cache_clean_work, execmem_cache_clean);
> +
> +static void execmem_fill_trapping_insns(void *ptr, size_t size, bool writable)
> +{
> +       if (execmem_info->fill_trapping_insns)
> +               execmem_info->fill_trapping_insns(ptr, size, writable);
> +       else
> +               memset(ptr, 0, size);

Does this really have to be a function pointer with a runtime check?

This could just be a __weak definition, with the arch providing an
override if the memset() is not appropriate.

Mike Rapoport Sept. 15, 2024, 1:48 p.m. UTC | #2

Hi Ard,

On Fri, Sep 13, 2024 at 05:00:42PM +0200, Ard Biesheuvel wrote:
> Hi Mike,
> 
> On Mon, 9 Sept 2024 at 08:51, Mike Rapoport <rppt@kernel.org> wrote:

...

> > +static void execmem_fill_trapping_insns(void *ptr, size_t size, bool writable)
> > +{
> > +       if (execmem_info->fill_trapping_insns)
> > +               execmem_info->fill_trapping_insns(ptr, size, writable);
> > +       else
> > +               memset(ptr, 0, size);
> 
> Does this really have to be a function pointer with a runtime check?
> 
> This could just be a __weak definition, with the arch providing an
> override if the memset() is not appropriate.

I prefer to keep this a method in execmem_info rather than have a __weak
definition that architectures can override.

This is not on the hot path, so I don't think a runtime check here would
matter. Still, I can fill in a default with memset at init time.

Liam R. Howlett Sept. 19, 2024, 11:18 a.m. UTC | #3

* Mike Rapoport <rppt@kernel.org> [240909 02:49]:
> From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
> 
> Using large pages to map text areas reduces iTLB pressure and improves
> performance.
> 
> Extend execmem_alloc() with an ability to use huge pages with ROX
> permissions as a cache for smaller allocations.
> 
> To populate the cache, a writable large page is allocated from vmalloc with
> VM_ALLOW_HUGE_VMAP, filled with invalid instructions and then remapped as
> ROX.
> 
> Portions of that large page are handed out to execmem_alloc() callers
> without any changes to the permissions.
> 
> When the memory is freed with execmem_free() it is invalidated again so
> that it won't contain stale instructions.
> 
> The cache is enabled when an architecture sets EXECMEM_ROX_CACHE flag in
> definition of an execmem_range.

I am not sure you need to convert to xa entries.

> 
> Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> ---
>  include/linux/execmem.h |   2 +
>  mm/execmem.c            | 289 +++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 286 insertions(+), 5 deletions(-)
> 
> diff --git a/include/linux/execmem.h b/include/linux/execmem.h
> index dfdf19f8a5e8..7436aa547818 100644
> --- a/include/linux/execmem.h
> +++ b/include/linux/execmem.h
> @@ -77,12 +77,14 @@ struct execmem_range {
>  
>  /**
>   * struct execmem_info - architecture parameters for code allocations
> + * @fill_trapping_insns: set memory to contain instructions that will trap
>   * @ranges: array of parameter sets defining architecture specific
>   * parameters for executable memory allocations. The ranges that are not
>   * explicitly initialized by an architecture use parameters defined for
>   * @EXECMEM_DEFAULT.
>   */
>  struct execmem_info {
> +	void (*fill_trapping_insns)(void *ptr, size_t size, bool writable);
>  	struct execmem_range	ranges[EXECMEM_TYPE_MAX];
>  };
>  
> diff --git a/mm/execmem.c b/mm/execmem.c
> index 0f6691e9ffe6..f547c1f3c93d 100644
> --- a/mm/execmem.c
> +++ b/mm/execmem.c
> @@ -7,28 +7,88 @@
>   */
>  
>  #include <linux/mm.h>
> +#include <linux/mutex.h>
>  #include <linux/vmalloc.h>
>  #include <linux/execmem.h>
> +#include <linux/maple_tree.h>
>  #include <linux/moduleloader.h>
>  #include <linux/text-patching.h>
>  
> +#include <asm/tlbflush.h>
> +
> +#include "internal.h"
> +
>  static struct execmem_info *execmem_info __ro_after_init;
>  static struct execmem_info default_execmem_info __ro_after_init;
>  
> -static void *__execmem_alloc(struct execmem_range *range, size_t size)
> +#ifdef CONFIG_MMU
> +struct execmem_cache {
> +	struct mutex mutex;
> +	struct maple_tree busy_areas;
> +	struct maple_tree free_areas;
> +};
> +
> +static struct execmem_cache execmem_cache = {
> +	.mutex = __MUTEX_INITIALIZER(execmem_cache.mutex),
> +	.busy_areas = MTREE_INIT_EXT(busy_areas, MT_FLAGS_LOCK_EXTERN,
> +				     execmem_cache.mutex),
> +	.free_areas = MTREE_INIT_EXT(free_areas, MT_FLAGS_LOCK_EXTERN,
> +				     execmem_cache.mutex),
> +};
> +
> +static void execmem_cache_clean(struct work_struct *work)
> +{
> +	struct maple_tree *free_areas = &execmem_cache.free_areas;
> +	struct mutex *mutex = &execmem_cache.mutex;
> +	MA_STATE(mas, free_areas, 0, ULONG_MAX);
> +	void *area;
> +
> +	mutex_lock(mutex);
> +	mas_for_each(&mas, area, ULONG_MAX) {
> +		size_t size;
> +
> +		if (!xa_is_value(area))
> +			continue;
> +
> +		size = xa_to_value(area);
> +
> +		if (IS_ALIGNED(size, PMD_SIZE) &&
> +		    IS_ALIGNED(mas.index, PMD_SIZE)) {
> +			void *ptr = (void *)mas.index;

If you store this pointer then it would be much nicer.

> +
> +			mas_erase(&mas);

mas_store_gfp() would probably be better here to store a null.

> +			vfree(ptr);
> +		}
> +	}
> +	mutex_unlock(mutex);
> +}
> +
> +static DECLARE_WORK(execmem_cache_clean_work, execmem_cache_clean);
> +
> +static void execmem_fill_trapping_insns(void *ptr, size_t size, bool writable)
> +{
> +	if (execmem_info->fill_trapping_insns)
> +		execmem_info->fill_trapping_insns(ptr, size, writable);
> +	else
> +		memset(ptr, 0, size);
> +}
> +
> +static void *execmem_vmalloc(struct execmem_range *range, size_t size,
> +			     pgprot_t pgprot, unsigned long vm_flags)
>  {
>  	bool kasan = range->flags & EXECMEM_KASAN_SHADOW;
> -	unsigned long vm_flags  = VM_FLUSH_RESET_PERMS;
>  	gfp_t gfp_flags = GFP_KERNEL | __GFP_NOWARN;
> +	unsigned int align = range->alignment;
>  	unsigned long start = range->start;
>  	unsigned long end = range->end;
> -	unsigned int align = range->alignment;
> -	pgprot_t pgprot = range->pgprot;
>  	void *p;
>  
>  	if (kasan)
>  		vm_flags |= VM_DEFER_KMEMLEAK;
>  
> +	if (vm_flags & VM_ALLOW_HUGE_VMAP)
> +		align = PMD_SIZE;
> +
>  	p = __vmalloc_node_range(size, align, start, end, gfp_flags,
>  				 pgprot, vm_flags, NUMA_NO_NODE,
>  				 __builtin_return_address(0));
> @@ -50,8 +110,225 @@ static void *__execmem_alloc(struct execmem_range *range, size_t size)
>  		return NULL;
>  	}
>  
> +	return p;
> +}
> +
> +static int execmem_cache_add(void *ptr, size_t size)
> +{
> +	struct maple_tree *free_areas = &execmem_cache.free_areas;
> +	struct mutex *mutex = &execmem_cache.mutex;
> +	unsigned long addr = (unsigned long)ptr;
> +	MA_STATE(mas, free_areas, addr - 1, addr + 1);
> +	unsigned long lower, lower_size = 0;
> +	unsigned long upper, upper_size = 0;
> +	unsigned long area_size;
> +	void *area = NULL;
> +	int err;
> +
> +	lower = addr;
> +	upper = addr + size - 1;
> +
> +	mutex_lock(mutex);
> +	area = mas_walk(&mas);
> +	if (area && xa_is_value(area) && mas.last == addr - 1) {
> +		lower = mas.index;
> +		lower_size = xa_to_value(area);
> +	}
> +
> +	area = mas_next(&mas, ULONG_MAX);
> +	if (area && xa_is_value(area) && mas.index == addr + size) {
> +		upper = mas.last;
> +		upper_size = xa_to_value(area);
> +	}
> +
> +	mas_set_range(&mas, lower, upper);
> +	area_size = lower_size + upper_size + size;
> +	err = mas_store_gfp(&mas, xa_mk_value(area_size), GFP_KERNEL);
> +	mutex_unlock(mutex);
> +	if (err)
> +		return -ENOMEM;
> +
> +	return 0;
> +}
> +
> +static bool within_range(struct execmem_range *range, struct ma_state *mas,
> +			 size_t size)
> +{
> +	unsigned long addr = mas->index;
> +
> +	if (addr >= range->start && addr + size < range->end)
> +		return true;
> +
> +	if (range->fallback_start &&
> +	    addr >= range->fallback_start && addr + size < range->fallback_end)
> +		return true;
> +
> +	return false;
> +}
> +
> +static void *__execmem_cache_alloc(struct execmem_range *range, size_t size)
> +{
> +	struct maple_tree *free_areas = &execmem_cache.free_areas;
> +	struct maple_tree *busy_areas = &execmem_cache.busy_areas;
> +	MA_STATE(mas_free, free_areas, 0, ULONG_MAX);
> +	MA_STATE(mas_busy, busy_areas, 0, ULONG_MAX);
> +	struct mutex *mutex = &execmem_cache.mutex;
> +	unsigned long addr, last, area_size = 0;
> +	void *area, *ptr = NULL;
> +	int err;
> +
> +	mutex_lock(mutex);
> +	mas_for_each(&mas_free, area, ULONG_MAX) {
> +		area_size = xa_to_value(area);
> +
> +		if (area_size >= size && within_range(range, &mas_free, size))
> +			break;
> +	}
> +
> +	if (area_size < size)
> +		goto out_unlock;
> +
> +	addr = mas_free.index;
> +	last = mas_free.last;
> +
> +	/* insert allocated size to busy_areas at range [addr, addr + size) */
> +	mas_set_range(&mas_busy, addr, addr + size - 1);
> +	err = mas_store_gfp(&mas_busy, xa_mk_value(size), GFP_KERNEL);
> +	if (err)
> +		goto out_unlock;
> +
> +	mas_erase(&mas_free);
> +	if (area_size > size) {
> +		/*
> +		 * re-insert remaining free size to free_areas at range
> +		 * [addr + size, last]
> +		 */
> +		mas_set_range(&mas_free, addr + size, last);
> +		size = area_size - size;
> +		err = mas_store_gfp(&mas_free, xa_mk_value(size), GFP_KERNEL);
> +		if (err) {
> +			mas_erase(&mas_busy);
> +			goto out_unlock;
> +		}
> +	}

It would be more efficient to replace the entry and then erase the portion.

Something like
	if (area_size > size) {
		err = mas_store_gfp(&mas_free, xa_mk_value(size), GFP_KERNEL);
		if (err)
		...
		/* range mismatches stored size here */
	}
	mas_set_range(&mas_busy, addr, addr + size - 1);
	mas_store_gfp(&mas_free, NULL, GFP_KERNEL);


> +	ptr = (void *)addr;
> +
> +out_unlock:
> +	mutex_unlock(mutex);
> +	return ptr;
> +}
> +
> +static int execmem_cache_populate(struct execmem_range *range, size_t size)
> +{
> +	unsigned long vm_flags = VM_FLUSH_RESET_PERMS | VM_ALLOW_HUGE_VMAP;
> +	unsigned long start, end;
> +	struct vm_struct *vm;
> +	size_t alloc_size;
> +	int err = -ENOMEM;
> +	void *p;
> +
> +	alloc_size = round_up(size, PMD_SIZE);
> +	p = execmem_vmalloc(range, alloc_size, PAGE_KERNEL, vm_flags);
> +	if (!p)
> +		return err;
> +
> +	vm = find_vm_area(p);
> +	if (!vm)
> +		goto err_free_mem;
> +
> +	/* fill memory with instructions that will trap */
> +	execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true);
> +
> +	start = (unsigned long)p;
> +	end = start + alloc_size;
> +
> +	vunmap_range(start, end);
> +
> +	err = vmap_pages_range_noflush(start, end, range->pgprot, vm->pages,
> +				       PMD_SHIFT);
> +	if (err)
> +		goto err_free_mem;
> +
> +	err = execmem_cache_add(p, alloc_size);
> +	if (err)
> +		goto err_free_mem;
> +
> +	return 0;
> +
> +err_free_mem:
> +	vfree(p);
> +	return err;
> +}
> +
> +static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
> +{
> +	void *p;
> +	int err;
> +
> +	p = __execmem_cache_alloc(range, size);
> +	if (p)
> +		return p;
> +
> +	err = execmem_cache_populate(range, size);
> +	if (err)
> +		return NULL;
> +
> +	return __execmem_cache_alloc(range, size);
> +}
> +
> +static bool execmem_cache_free(void *ptr)
> +{
> +	struct maple_tree *busy_areas = &execmem_cache.busy_areas;
> +	struct mutex *mutex = &execmem_cache.mutex;
> +	unsigned long addr = (unsigned long)ptr;
> +	MA_STATE(mas, busy_areas, addr, addr);
> +	size_t size;
> +	void *area;
> +
> +	mutex_lock(mutex);
> +	area = mas_walk(&mas);
> +	if (!area) {
> +		mutex_unlock(mutex);
> +		return false;
> +	}
> +	size = xa_to_value(area);
> +	mas_erase(&mas);

Again, it is probably better to store NULL.  Erase is more for when you are
unsure where the index range ends, and since the maple state is
already set up to erase, it's best to just store NULL.

> +	mutex_unlock(mutex);
> +
> +	execmem_fill_trapping_insns(ptr, size, /* writable = */ false);
> +
> +	execmem_cache_add(ptr, size);
> +
> +	schedule_work(&execmem_cache_clean_work);
> +
> +	return true;
> +}
> +
> +static void *__execmem_alloc(struct execmem_range *range, size_t size)
> +{
> +	bool use_cache = range->flags & EXECMEM_ROX_CACHE;
> +	unsigned long vm_flags = VM_FLUSH_RESET_PERMS;
> +	pgprot_t pgprot = range->pgprot;
> +	void *p;
> +
> +	if (use_cache)
> +		p = execmem_cache_alloc(range, size);
> +	else
> +		p = execmem_vmalloc(range, size, pgprot, vm_flags);
> +
>  	return kasan_reset_tag(p);
>  }
> +#else
> +static void *__execmem_alloc(struct execmem_range *range, size_t size)
> +{
> +	return vmalloc(size);
> +}
> +
> +static bool execmem_cache_free(void *ptr)
> +{
> +	return false;
> +}
> +#endif
>  
>  void *execmem_alloc(enum execmem_type type, size_t size)
>  {
> @@ -67,7 +344,9 @@ void execmem_free(void *ptr)
>  	 * supported by vmalloc.
>  	 */
>  	WARN_ON(in_interrupt());
> -	vfree(ptr);
> +
> +	if (!execmem_cache_free(ptr))
> +		vfree(ptr);
>  }
>  
>  void *execmem_update_copy(void *dst, const void *src, size_t size)
> -- 
> 2.43.0
>

diff --git a/include/linux/execmem.h b/include/linux/execmem.h
index dfdf19f8a5e8..7436aa547818 100644
--- a/include/linux/execmem.h
+++ b/include/linux/execmem.h
@@ -77,12 +77,14 @@  struct execmem_range {
 
 /**
  * struct execmem_info - architecture parameters for code allocations
+ * @fill_trapping_insns: set memory to contain instructions that will trap
  * @ranges: array of parameter sets defining architecture specific
  * parameters for executable memory allocations. The ranges that are not
  * explicitly initialized by an architecture use parameters defined for
  * @EXECMEM_DEFAULT.
  */
 struct execmem_info {
+	void (*fill_trapping_insns)(void *ptr, size_t size, bool writable);
 	struct execmem_range	ranges[EXECMEM_TYPE_MAX];
 };
 
diff --git a/mm/execmem.c b/mm/execmem.c
index 0f6691e9ffe6..f547c1f3c93d 100644
--- a/mm/execmem.c
+++ b/mm/execmem.c
@@ -7,28 +7,88 @@ 
  */
 
 #include <linux/mm.h>
+#include <linux/mutex.h>
 #include <linux/vmalloc.h>
 #include <linux/execmem.h>
+#include <linux/maple_tree.h>
 #include <linux/moduleloader.h>
 #include <linux/text-patching.h>
 
+#include <asm/tlbflush.h>
+
+#include "internal.h"
+
 static struct execmem_info *execmem_info __ro_after_init;
 static struct execmem_info default_execmem_info __ro_after_init;
 
-static void *__execmem_alloc(struct execmem_range *range, size_t size)
+#ifdef CONFIG_MMU
+struct execmem_cache {
+	struct mutex mutex;
+	struct maple_tree busy_areas;
+	struct maple_tree free_areas;
+};
+
+static struct execmem_cache execmem_cache = {
+	.mutex = __MUTEX_INITIALIZER(execmem_cache.mutex),
+	.busy_areas = MTREE_INIT_EXT(busy_areas, MT_FLAGS_LOCK_EXTERN,
+				     execmem_cache.mutex),
+	.free_areas = MTREE_INIT_EXT(free_areas, MT_FLAGS_LOCK_EXTERN,
+				     execmem_cache.mutex),
+};
+
+static void execmem_cache_clean(struct work_struct *work)
+{
+	struct maple_tree *free_areas = &execmem_cache.free_areas;
+	struct mutex *mutex = &execmem_cache.mutex;
+	MA_STATE(mas, free_areas, 0, ULONG_MAX);
+	void *area;
+
+	mutex_lock(mutex);
+	mas_for_each(&mas, area, ULONG_MAX) {
+		size_t size;
+
+		if (!xa_is_value(area))
+			continue;
+
+		size = xa_to_value(area);
+
+		if (IS_ALIGNED(size, PMD_SIZE) &&
+		    IS_ALIGNED(mas.index, PMD_SIZE)) {
+			void *ptr = (void *)mas.index;
+
+			mas_erase(&mas);
+			vfree(ptr);
+		}
+	}
+	mutex_unlock(mutex);
+}
+
+static DECLARE_WORK(execmem_cache_clean_work, execmem_cache_clean);
+
+static void execmem_fill_trapping_insns(void *ptr, size_t size, bool writable)
+{
+	if (execmem_info->fill_trapping_insns)
+		execmem_info->fill_trapping_insns(ptr, size, writable);
+	else
+		memset(ptr, 0, size);
+}
+
+static void *execmem_vmalloc(struct execmem_range *range, size_t size,
+			     pgprot_t pgprot, unsigned long vm_flags)
 {
 	bool kasan = range->flags & EXECMEM_KASAN_SHADOW;
-	unsigned long vm_flags  = VM_FLUSH_RESET_PERMS;
 	gfp_t gfp_flags = GFP_KERNEL | __GFP_NOWARN;
+	unsigned int align = range->alignment;
 	unsigned long start = range->start;
 	unsigned long end = range->end;
-	unsigned int align = range->alignment;
-	pgprot_t pgprot = range->pgprot;
 	void *p;
 
 	if (kasan)
 		vm_flags |= VM_DEFER_KMEMLEAK;
 
+	if (vm_flags & VM_ALLOW_HUGE_VMAP)
+		align = PMD_SIZE;
+
 	p = __vmalloc_node_range(size, align, start, end, gfp_flags,
 				 pgprot, vm_flags, NUMA_NO_NODE,
 				 __builtin_return_address(0));
@@ -50,8 +110,225 @@  static void *__execmem_alloc(struct execmem_range *range, size_t size)
 		return NULL;
 	}
 
+	return p;
+}
+
+static int execmem_cache_add(void *ptr, size_t size)
+{
+	struct maple_tree *free_areas = &execmem_cache.free_areas;
+	struct mutex *mutex = &execmem_cache.mutex;
+	unsigned long addr = (unsigned long)ptr;
+	MA_STATE(mas, free_areas, addr - 1, addr + 1);
+	unsigned long lower, lower_size = 0;
+	unsigned long upper, upper_size = 0;
+	unsigned long area_size;
+	void *area = NULL;
+	int err;
+
+	lower = addr;
+	upper = addr + size - 1;
+
+	mutex_lock(mutex);
+	area = mas_walk(&mas);
+	if (area && xa_is_value(area) && mas.last == addr - 1) {
+		lower = mas.index;
+		lower_size = xa_to_value(area);
+	}
+
+	area = mas_next(&mas, ULONG_MAX);
+	if (area && xa_is_value(area) && mas.index == addr + size) {
+		upper = mas.last;
+		upper_size = xa_to_value(area);
+	}
+
+	mas_set_range(&mas, lower, upper);
+	area_size = lower_size + upper_size + size;
+	err = mas_store_gfp(&mas, xa_mk_value(area_size), GFP_KERNEL);
+	mutex_unlock(mutex);
+	if (err)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static bool within_range(struct execmem_range *range, struct ma_state *mas,
+			 size_t size)
+{
+	unsigned long addr = mas->index;
+
+	if (addr >= range->start && addr + size < range->end)
+		return true;
+
+	if (range->fallback_start &&
+	    addr >= range->fallback_start && addr + size < range->fallback_end)
+		return true;
+
+	return false;
+}
+
+static void *__execmem_cache_alloc(struct execmem_range *range, size_t size)
+{
+	struct maple_tree *free_areas = &execmem_cache.free_areas;
+	struct maple_tree *busy_areas = &execmem_cache.busy_areas;
+	MA_STATE(mas_free, free_areas, 0, ULONG_MAX);
+	MA_STATE(mas_busy, busy_areas, 0, ULONG_MAX);
+	struct mutex *mutex = &execmem_cache.mutex;
+	unsigned long addr, last, area_size = 0;
+	void *area, *ptr = NULL;
+	int err;
+
+	mutex_lock(mutex);
+	mas_for_each(&mas_free, area, ULONG_MAX) {
+		area_size = xa_to_value(area);
+
+		if (area_size >= size && within_range(range, &mas_free, size))
+			break;
+	}
+
+	if (area_size < size)
+		goto out_unlock;
+
+	addr = mas_free.index;
+	last = mas_free.last;
+
+	/* insert allocated size to busy_areas at range [addr, addr + size) */
+	mas_set_range(&mas_busy, addr, addr + size - 1);
+	err = mas_store_gfp(&mas_busy, xa_mk_value(size), GFP_KERNEL);
+	if (err)
+		goto out_unlock;
+
+	mas_erase(&mas_free);
+	if (area_size > size) {
+		/*
+		 * re-insert remaining free size to free_areas at range
+		 * [addr + size, last]
+		 */
+		mas_set_range(&mas_free, addr + size, last);
+		size = area_size - size;
+		err = mas_store_gfp(&mas_free, xa_mk_value(size), GFP_KERNEL);
+		if (err) {
+			mas_erase(&mas_busy);
+			goto out_unlock;
+		}
+	}
+	ptr = (void *)addr;
+
+out_unlock:
+	mutex_unlock(mutex);
+	return ptr;
+}
+
+static int execmem_cache_populate(struct execmem_range *range, size_t size)
+{
+	unsigned long vm_flags = VM_FLUSH_RESET_PERMS | VM_ALLOW_HUGE_VMAP;
+	unsigned long start, end;
+	struct vm_struct *vm;
+	size_t alloc_size;
+	int err = -ENOMEM;
+	void *p;
+
+	alloc_size = round_up(size, PMD_SIZE);
+	p = execmem_vmalloc(range, alloc_size, PAGE_KERNEL, vm_flags);
+	if (!p)
+		return err;
+
+	vm = find_vm_area(p);
+	if (!vm)
+		goto err_free_mem;
+
+	/* fill memory with instructions that will trap */
+	execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true);
+
+	start = (unsigned long)p;
+	end = start + alloc_size;
+
+	vunmap_range(start, end);
+
+	err = vmap_pages_range_noflush(start, end, range->pgprot, vm->pages,
+				       PMD_SHIFT);
+	if (err)
+		goto err_free_mem;
+
+	err = execmem_cache_add(p, alloc_size);
+	if (err)
+		goto err_free_mem;
+
+	return 0;
+
+err_free_mem:
+	vfree(p);
+	return err;
+}
+
+static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
+{
+	void *p;
+	int err;
+
+	p = __execmem_cache_alloc(range, size);
+	if (p)
+		return p;
+
+	err = execmem_cache_populate(range, size);
+	if (err)
+		return NULL;
+
+	return __execmem_cache_alloc(range, size);
+}
+
+static bool execmem_cache_free(void *ptr)
+{
+	struct maple_tree *busy_areas = &execmem_cache.busy_areas;
+	struct mutex *mutex = &execmem_cache.mutex;
+	unsigned long addr = (unsigned long)ptr;
+	MA_STATE(mas, busy_areas, addr, addr);
+	size_t size;
+	void *area;
+
+	mutex_lock(mutex);
+	area = mas_walk(&mas);
+	if (!area) {
+		mutex_unlock(mutex);
+		return false;
+	}
+	size = xa_to_value(area);
+	mas_erase(&mas);
+	mutex_unlock(mutex);
+
+	execmem_fill_trapping_insns(ptr, size, /* writable = */ false);
+
+	execmem_cache_add(ptr, size);
+
+	schedule_work(&execmem_cache_clean_work);
+
+	return true;
+}
+
+static void *__execmem_alloc(struct execmem_range *range, size_t size)
+{
+	bool use_cache = range->flags & EXECMEM_ROX_CACHE;
+	unsigned long vm_flags = VM_FLUSH_RESET_PERMS;
+	pgprot_t pgprot = range->pgprot;
+	void *p;
+
+	if (use_cache)
+		p = execmem_cache_alloc(range, size);
+	else
+		p = execmem_vmalloc(range, size, pgprot, vm_flags);
+
 	return kasan_reset_tag(p);
 }
+#else
+static void *__execmem_alloc(struct execmem_range *range, size_t size)
+{
+	return vmalloc(size);
+}
+
+static bool execmem_cache_free(void *ptr)
+{
+	return false;
+}
+#endif
 
 void *execmem_alloc(enum execmem_type type, size_t size)
 {
@@ -67,7 +344,9 @@  void execmem_free(void *ptr)
 	 * supported by vmalloc.
 	 */
 	WARN_ON(in_interrupt());
-	vfree(ptr);
+
+	if (!execmem_cache_free(ptr))
+		vfree(ptr);
 }
 
 void *execmem_update_copy(void *dst, const void *src, size_t size)

[v3,7/8] execmem: add support for cache of large ROX pages

Commit Message

Comments

Patch