
[07/32] mm: Bring back vmalloc_exec

Message ID: 20230509165657.1735798-8-kent.overstreet@linux.dev (mailing list archive)
State: New
Series: bcachefs - a new COW filesystem

Commit Message

Kent Overstreet May 9, 2023, 4:56 p.m. UTC
From: Kent Overstreet <kent.overstreet@gmail.com>

This is needed for bcachefs, which dynamically generates per-btree node
unpack functions.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Uladzislau Rezki <urezki@gmail.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: linux-mm@kvack.org
---
 include/linux/vmalloc.h |  1 +
 kernel/module/main.c    |  4 +---
 mm/nommu.c              | 18 ++++++++++++++++++
 mm/vmalloc.c            | 21 +++++++++++++++++++++
 4 files changed, 41 insertions(+), 3 deletions(-)

Comments

Lorenzo Stoakes May 9, 2023, 6:19 p.m. UTC | #1
On Tue, May 09, 2023 at 12:56:32PM -0400, Kent Overstreet wrote:
> From: Kent Overstreet <kent.overstreet@gmail.com>
>
> This is needed for bcachefs, which dynamically generates per-btree node
> unpack functions.

Small nits -

Would be good to refer to the original patch that removed it,
i.e. 7a0e27b2a0ce ("mm: remove vmalloc_exec") something like 'patch
... folded vmalloc_exec() into its one user, however bcachefs requires this
as well so revert'.

Would also be good to mention that you are now exporting the function which
the original didn't appear to do.

>
> Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Uladzislau Rezki <urezki@gmail.com>
> Cc: Christoph Hellwig <hch@infradead.org>
> Cc: linux-mm@kvack.org

Another nit: I'm a vmalloc reviewer so would be good to get cc'd too :)
(forgivable mistake as very recent change!)

> ---
>  include/linux/vmalloc.h |  1 +
>  kernel/module/main.c    |  4 +---
>  mm/nommu.c              | 18 ++++++++++++++++++
>  mm/vmalloc.c            | 21 +++++++++++++++++++++
>  4 files changed, 41 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
> index 69250efa03..ff147fe115 100644
> --- a/include/linux/vmalloc.h
> +++ b/include/linux/vmalloc.h
> @@ -145,6 +145,7 @@ extern void *vzalloc(unsigned long size) __alloc_size(1);
>  extern void *vmalloc_user(unsigned long size) __alloc_size(1);
>  extern void *vmalloc_node(unsigned long size, int node) __alloc_size(1);
>  extern void *vzalloc_node(unsigned long size, int node) __alloc_size(1);
> +extern void *vmalloc_exec(unsigned long size, gfp_t gfp_mask) __alloc_size(1);
>  extern void *vmalloc_32(unsigned long size) __alloc_size(1);
>  extern void *vmalloc_32_user(unsigned long size) __alloc_size(1);
>  extern void *__vmalloc(unsigned long size, gfp_t gfp_mask) __alloc_size(1);
> diff --git a/kernel/module/main.c b/kernel/module/main.c
> index d3be89de70..9eaa89e84c 100644
> --- a/kernel/module/main.c
> +++ b/kernel/module/main.c
> @@ -1607,9 +1607,7 @@ static void dynamic_debug_remove(struct module *mod, struct _ddebug_info *dyndbg
>
>  void * __weak module_alloc(unsigned long size)
>  {
> -	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
> -			GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
> -			NUMA_NO_NODE, __builtin_return_address(0));
> +	return vmalloc_exec(size, GFP_KERNEL);
>  }
>
>  bool __weak module_init_section(const char *name)
> diff --git a/mm/nommu.c b/mm/nommu.c
> index 57ba243c6a..8d9ab19e39 100644
> --- a/mm/nommu.c
> +++ b/mm/nommu.c
> @@ -280,6 +280,24 @@ void *vzalloc_node(unsigned long size, int node)
>  }
>  EXPORT_SYMBOL(vzalloc_node);
>
> +/**
> + *	vmalloc_exec  -  allocate virtually contiguous, executable memory
> + *	@size:		allocation size
> + *
> + *	Kernel-internal function to allocate enough pages to cover @size
> + *	the page level allocator and map them into contiguous and
> + *	executable kernel virtual space.
> + *
> + *	For tight control over page level allocator and protection flags
> + *	use __vmalloc() instead.
> + */
> +
> +void *vmalloc_exec(unsigned long size, gfp_t gfp_mask)
> +{
> +	return __vmalloc(size, gfp_mask);
> +}
> +EXPORT_SYMBOL_GPL(vmalloc_exec);
> +
>  /**
>   * vmalloc_32  -  allocate virtually contiguous memory (32bit addressable)
>   *	@size:		allocation size
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 31ff782d36..2ebb9ea7f0 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -3401,6 +3401,27 @@ void *vzalloc_node(unsigned long size, int node)
>  }
>  EXPORT_SYMBOL(vzalloc_node);
>
> +/**
> + * vmalloc_exec - allocate virtually contiguous, executable memory
> + * @size:	  allocation size
> + *
> + * Kernel-internal function to allocate enough pages to cover @size
> + * the page level allocator and map them into contiguous and
> + * executable kernel virtual space.
> + *
> + * For tight control over page level allocator and protection flags
> + * use __vmalloc() instead.
> + *
> + * Return: pointer to the allocated memory or %NULL on error
> + */
> +void *vmalloc_exec(unsigned long size, gfp_t gfp_mask)
> +{
> +	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
> +			gfp_mask, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
> +			NUMA_NO_NODE, __builtin_return_address(0));
> +}
> +EXPORT_SYMBOL_GPL(vmalloc_exec);
> +
>  #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
>  #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
>  #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
> --
> 2.40.1
>

Otherwise lgtm, feel free to add:

Acked-by: Lorenzo Stoakes <lstoakes@gmail.com>
Kent Overstreet May 9, 2023, 8:15 p.m. UTC | #2
On Tue, May 09, 2023 at 11:19:38AM -0700, Lorenzo Stoakes wrote:
> On Tue, May 09, 2023 at 12:56:32PM -0400, Kent Overstreet wrote:
> > From: Kent Overstreet <kent.overstreet@gmail.com>
> >
> > This is needed for bcachefs, which dynamically generates per-btree node
> > unpack functions.
> 
> Small nits -
> 
> Would be good to refer to the original patch that removed it,
> i.e. 7a0e27b2a0ce ("mm: remove vmalloc_exec") something like 'patch
> ... folded vmalloc_exec() into its one user, however bcachefs requires this
> as well so revert'.
> 
> Would also be good to mention that you are now exporting the function which
> the original didn't appear to do.
> 
> >
> > Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
> > Cc: Andrew Morton <akpm@linux-foundation.org>
> > Cc: Uladzislau Rezki <urezki@gmail.com>
> > Cc: Christoph Hellwig <hch@infradead.org>
> > Cc: linux-mm@kvack.org
> 
> Another nit: I'm a vmalloc reviewer so would be good to get cc'd too :)
> (forgivable mistake as very recent change!)

Thanks - folded your suggestions into the commit message, and added you
for the next posting :)
Christoph Hellwig May 9, 2023, 8:46 p.m. UTC | #3
On Tue, May 09, 2023 at 12:56:32PM -0400, Kent Overstreet wrote:
> From: Kent Overstreet <kent.overstreet@gmail.com>
> 
> This is needed for bcachefs, which dynamically generates per-btree node
> unpack functions.

No, we will never add back a way for random code allocating executable
memory in kernel space.
Lorenzo Stoakes May 9, 2023, 9:12 p.m. UTC | #4
On Tue, May 09, 2023 at 01:46:09PM -0700, Christoph Hellwig wrote:
> On Tue, May 09, 2023 at 12:56:32PM -0400, Kent Overstreet wrote:
> > From: Kent Overstreet <kent.overstreet@gmail.com>
> >
> > This is needed for bcachefs, which dynamically generates per-btree node
> > unpack functions.
>
> No, we will never add back a way for random code allocating executable
> memory in kernel space.

Yeah I think I glossed over this aspect a bit as it looks ostensibly like simply
reinstating a helper function because the code is now used in more than one
place (at lsf/mm so a little distracted :)

But it being exported is a problem. Perhaps there's another way of achieving the
same aim without having to do so?
Kent Overstreet May 9, 2023, 9:29 p.m. UTC | #5
On Tue, May 09, 2023 at 02:12:41PM -0700, Lorenzo Stoakes wrote:
> On Tue, May 09, 2023 at 01:46:09PM -0700, Christoph Hellwig wrote:
> > On Tue, May 09, 2023 at 12:56:32PM -0400, Kent Overstreet wrote:
> > > From: Kent Overstreet <kent.overstreet@gmail.com>
> > >
> > > This is needed for bcachefs, which dynamically generates per-btree node
> > > unpack functions.
> >
> > No, we will never add back a way for random code allocating executable
> > memory in kernel space.
> 
> Yeah I think I glossed over this aspect a bit as it looks ostensibly like simply
> reinstating a helper function because the code is now used in more than one
> place (at lsf/mm so a little distracted :)
> 
> But it being exported is a problem. Perhaps there's another way of achieving the
> same aim without having to do so?

None that I see.

The background is that bcachefs generates a per btree node unpack
function, based on the packed format for that btree node, for unpacking
keys within that node. The unpack function is only ~50 bytes, and for
locality we want it to be located with the btree node's other in-memory
lookup tables so they can be prefetched all at once.

Here's the codegen:

https://evilpiepirate.org/git/bcachefs.git/tree/fs/bcachefs/bkey.c#n727
Darrick J. Wong May 9, 2023, 9:43 p.m. UTC | #6
On Tue, May 09, 2023 at 02:12:41PM -0700, Lorenzo Stoakes wrote:
> On Tue, May 09, 2023 at 01:46:09PM -0700, Christoph Hellwig wrote:
> > On Tue, May 09, 2023 at 12:56:32PM -0400, Kent Overstreet wrote:
> > > From: Kent Overstreet <kent.overstreet@gmail.com>
> > >
> > > This is needed for bcachefs, which dynamically generates per-btree node
> > > unpack functions.
> >
> > No, we will never add back a way for random code allocating executable
> > memory in kernel space.
> 
> Yeah I think I glossed over this aspect a bit as it looks ostensibly like simply
> reinstating a helper function because the code is now used in more than one
> place (at lsf/mm so a little distracted :)
> 
> But it being exported is a problem. Perhaps there's another way of acheving the
> same aim without having to do so?

I already trolled Kent with this on IRC, but for the parts of bcachefs
that want better assembly code than whatever gcc generates from the C
source, could you compile code to BPF and then let the BPF JIT engines
turn that into machine code for you?

(also distracted by LSFMM)

--D
Kent Overstreet May 9, 2023, 9:54 p.m. UTC | #7
On Tue, May 09, 2023 at 02:43:19PM -0700, Darrick J. Wong wrote:
> On Tue, May 09, 2023 at 02:12:41PM -0700, Lorenzo Stoakes wrote:
> > On Tue, May 09, 2023 at 01:46:09PM -0700, Christoph Hellwig wrote:
> > > On Tue, May 09, 2023 at 12:56:32PM -0400, Kent Overstreet wrote:
> > > > From: Kent Overstreet <kent.overstreet@gmail.com>
> > > >
> > > > This is needed for bcachefs, which dynamically generates per-btree node
> > > > unpack functions.
> > >
> > > No, we will never add back a way for random code allocating executable
> > > memory in kernel space.
> > 
> > Yeah I think I glossed over this aspect a bit as it looks ostensibly like simply
> > reinstating a helper function because the code is now used in more than one
> > place (at lsf/mm so a little distracted :)
> > 
> > But it being exported is a problem. Perhaps there's another way of achieving the
> > same aim without having to do so?
> 
> I already trolled Kent with this on IRC, but for the parts of bcachefs
> that want better assembly code than whatever gcc generates from the C
> source, could you compile code to BPF and then let the BPF JIT engines
> turn that into machine code for you?

It's an intriguing idea, but it'd be a _lot_ of work and this is old
code that's never had a single bug - I'm not in a hurry to rewrite it.

And there would still be the issue that we've got lots of little
unpack functions that go with other tables; we can't just burn a full
page per unpack function - that would waste way too much memory - and
if we put them together then we're stuck writing a whole other
allocator. We'd also be mucking with the memory layout of the data
structures used in the very hottest paths in the filesystem, and I'm
very wary of introducing performance regressions there.

I think it'd be much more practical to find some way of making
vmalloc_exec() more palatable. What are the exact concerns?
Eric Biggers May 10, 2023, 6:48 a.m. UTC | #8
On Tue, May 09, 2023 at 05:29:10PM -0400, Kent Overstreet wrote:
> On Tue, May 09, 2023 at 02:12:41PM -0700, Lorenzo Stoakes wrote:
> > On Tue, May 09, 2023 at 01:46:09PM -0700, Christoph Hellwig wrote:
> > > On Tue, May 09, 2023 at 12:56:32PM -0400, Kent Overstreet wrote:
> > > > From: Kent Overstreet <kent.overstreet@gmail.com>
> > > >
> > > > This is needed for bcachefs, which dynamically generates per-btree node
> > > > unpack functions.
> > >
> > > No, we will never add back a way for random code allocating executable
> > > memory in kernel space.
> > 
> > Yeah I think I glossed over this aspect a bit as it looks ostensibly like simply
> > reinstating a helper function because the code is now used in more than one
> > place (at lsf/mm so a little distracted :)
> > 
> > But it being exported is a problem. Perhaps there's another way of achieving the
> > same aim without having to do so?
> 
> None that I see.
> 
> The background is that bcachefs generates a per btree node unpack
> function, based on the packed format for that btree node, for unpacking
> keys within that node. The unpack function is only ~50 bytes, and for
> locality we want it to be located with the btree node's other in-memory
> lookup tables so they can be prefetched all at once.
> 
> Here's the codegen:
> 
> https://evilpiepirate.org/git/bcachefs.git/tree/fs/bcachefs/bkey.c#n727

Well, it's a cool trick, but it's not clear that it actually belongs in
production kernel code.  What else in the kernel actually does dynamic codegen?
Just BPF, I think?

Among other issues, this is entirely architecture-specific, and it may cause
interoperability issues with various other features, including security
features.  Is it really safe to leave a W&X page around, for example?

What seems to be missing is any explanation for what we're actually getting from
this extremely unusual solution that cannot be gained any other way.  What is
unique about bcachefs that it really needs something like this?

- Eric
David Laight May 10, 2023, 11:56 a.m. UTC | #9
From: Kent Overstreet
> Sent: 09 May 2023 22:29
...
> The background is that bcachefs generates a per btree node unpack
> function, based on the packed format for that btree node, for unpacking
> keys within that node. The unpack function is only ~50 bytes, and for
> locality we want it to be located with the btree node's other in-memory
> lookup tables so they can be prefetched all at once.

Loading data into the d-cache isn't going to load code into
the i-cache.
Indeed you don't want to be mixing code and data in the same
cache line - because it just wastes space in the cache.

Looks to me like you could have a few different unpack
functions and pick the correct one based on the packed format.
Quite likely the code would be just as fast (if longer)
when you allow for parallel execution on modern CPUs.

	David

Christophe Leroy May 10, 2023, 2:18 p.m. UTC | #10
On 09/05/2023 at 18:56, Kent Overstreet wrote:
> From: Kent Overstreet <kent.overstreet@gmail.com>
> 
> This is needed for bcachefs, which dynamically generates per-btree node
> unpack functions.
> 
> Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Uladzislau Rezki <urezki@gmail.com>
> Cc: Christoph Hellwig <hch@infradead.org>
> Cc: linux-mm@kvack.org
> ---
>   include/linux/vmalloc.h |  1 +
>   kernel/module/main.c    |  4 +---
>   mm/nommu.c              | 18 ++++++++++++++++++
>   mm/vmalloc.c            | 21 +++++++++++++++++++++
>   4 files changed, 41 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
> index 69250efa03..ff147fe115 100644
> --- a/include/linux/vmalloc.h
> +++ b/include/linux/vmalloc.h
> @@ -145,6 +145,7 @@ extern void *vzalloc(unsigned long size) __alloc_size(1);
>   extern void *vmalloc_user(unsigned long size) __alloc_size(1);
>   extern void *vmalloc_node(unsigned long size, int node) __alloc_size(1);
>   extern void *vzalloc_node(unsigned long size, int node) __alloc_size(1);
> +extern void *vmalloc_exec(unsigned long size, gfp_t gfp_mask) __alloc_size(1);
>   extern void *vmalloc_32(unsigned long size) __alloc_size(1);
>   extern void *vmalloc_32_user(unsigned long size) __alloc_size(1);
>   extern void *__vmalloc(unsigned long size, gfp_t gfp_mask) __alloc_size(1);
> diff --git a/kernel/module/main.c b/kernel/module/main.c
> index d3be89de70..9eaa89e84c 100644
> --- a/kernel/module/main.c
> +++ b/kernel/module/main.c
> @@ -1607,9 +1607,7 @@ static void dynamic_debug_remove(struct module *mod, struct _ddebug_info *dyndbg
>   
>   void * __weak module_alloc(unsigned long size)
>   {
> -	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
> -			GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
> -			NUMA_NO_NODE, __builtin_return_address(0));
> +	return vmalloc_exec(size, GFP_KERNEL);
>   }
>   
>   bool __weak module_init_section(const char *name)
> diff --git a/mm/nommu.c b/mm/nommu.c
> index 57ba243c6a..8d9ab19e39 100644
> --- a/mm/nommu.c
> +++ b/mm/nommu.c
> @@ -280,6 +280,24 @@ void *vzalloc_node(unsigned long size, int node)
>   }
>   EXPORT_SYMBOL(vzalloc_node);
>   
> +/**
> + *	vmalloc_exec  -  allocate virtually contiguous, executable memory
> + *	@size:		allocation size
> + *
> + *	Kernel-internal function to allocate enough pages to cover @size
> + *	the page level allocator and map them into contiguous and
> + *	executable kernel virtual space.
> + *
> + *	For tight control over page level allocator and protection flags
> + *	use __vmalloc() instead.
> + */
> +
> +void *vmalloc_exec(unsigned long size, gfp_t gfp_mask)
> +{
> +	return __vmalloc(size, gfp_mask);
> +}
> +EXPORT_SYMBOL_GPL(vmalloc_exec);
> +
>   /**
>    * vmalloc_32  -  allocate virtually contiguous memory (32bit addressable)
>    *	@size:		allocation size
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 31ff782d36..2ebb9ea7f0 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -3401,6 +3401,27 @@ void *vzalloc_node(unsigned long size, int node)
>   }
>   EXPORT_SYMBOL(vzalloc_node);
>   
> +/**
> + * vmalloc_exec - allocate virtually contiguous, executable memory
> + * @size:	  allocation size
> + *
> + * Kernel-internal function to allocate enough pages to cover @size
> + * the page level allocator and map them into contiguous and
> + * executable kernel virtual space.
> + *
> + * For tight control over page level allocator and protection flags
> + * use __vmalloc() instead.
> + *
> + * Return: pointer to the allocated memory or %NULL on error
> + */
> +void *vmalloc_exec(unsigned long size, gfp_t gfp_mask)
> +{
> +	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
> +			gfp_mask, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
> +			NUMA_NO_NODE, __builtin_return_address(0));
> +}

That cannot work. The VMALLOC space is mapped non-exec on powerpc/32. 
You have to allocate between MODULES_VADDR and MODULES_END if you want 
something executable, so you must use module_alloc(); see
https://elixir.bootlin.com/linux/v6.4-rc1/source/arch/powerpc/kernel/module.c#L108

> +EXPORT_SYMBOL_GPL(vmalloc_exec);
> +
>   #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
>   #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
>   #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
Johannes Thumshirn May 10, 2023, 3:05 p.m. UTC | #11
On 09.05.23 18:56, Kent Overstreet wrote:
> +/**
> + * vmalloc_exec - allocate virtually contiguous, executable memory
> + * @size:	  allocation size
> + *
> + * Kernel-internal function to allocate enough pages to cover @size
> + * the page level allocator and map them into contiguous and
> + * executable kernel virtual space.
> + *
> + * For tight control over page level allocator and protection flags
> + * use __vmalloc() instead.
> + *
> + * Return: pointer to the allocated memory or %NULL on error
> + */
> +void *vmalloc_exec(unsigned long size, gfp_t gfp_mask)
> +{
> +	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
> +			gfp_mask, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
> +			NUMA_NO_NODE, __builtin_return_address(0));
> +}
> +EXPORT_SYMBOL_GPL(vmalloc_exec);

Uh, W+X memory regions.
The 90s called, they want their shellcode back.
Theodore Ts'o May 11, 2023, 5:33 a.m. UTC | #12
On Tue, May 09, 2023 at 05:54:26PM -0400, Kent Overstreet wrote:
> 
> I think it'd be much more practical to find some way of making
> vmalloc_exec() more palatable. What are the exact concerns?

Having a vmalloc_exec() function (whether it is not exported at all,
or exported as a GPL symbol) makes it *much* easier for an exploit
writer, since it's a super convenient gadget for use with
Returned-oriented-programming[1] to create a writeable, executable
space that could then be filled with arbitrary code of the exploit
author's arbitrary desire.

[1] https://en.wikipedia.org/wiki/Return-oriented_programming

The other thing I'll note from examining the code generator, is that
it appears that bcachefs *only* has support for x86_64.  This brings
me back to the days of my youth when all the world was a Vax[2].  :-)

   10.  Thou shalt foreswear, renounce, and abjure the vile heresy
        which claimeth that ``All the world's a VAX'', and have no commerce
	with the benighted heathens who cling to this barbarous belief, that
 	the days of thy program may be long even though the days of thy
	current machine be short.

	[ This particular heresy bids fair to be replaced by ``All the
	world's a Sun'' or ``All the world's a 386'' (this latter
	being a particularly revolting invention of Satan), but the
	words apply to all such without limitation. Beware, in
	particular, of the subtle and terrible ``All the world's a
	32-bit machine'', which is almost true today but shall cease
	to be so before thy resume grows too much longer.]

[2] The Ten Commandments for C Programmers (Annotated Edition)
    https://www.lysator.liu.se/c/ten-commandments.html

Seriously, does this mean that bcachefs won't work on Arm systems
(arm32 or arm64)?  Or RISC-V systems?  Or S/390s?  Or Power
architectures?  Or Itanium or PA-RISC systems?  (OK, I really don't
care all that much about those last two.  :-)


When people ask me why file systems are so hard to make enterprise
ready, I tell them to recall the general advice given to people to
write secure, robust systems: (a) avoid premature optimization, (b)
avoid fine-grained, multi-threaded programming, as much as possible,
because locking bugs are a b*tch, and (c) avoid unnecessary global
state as much as possible.

File systems tend to violate all of these precepts: (a) people chase
benchmark optimizations to the exclusion of all else, because people
have an unhealthy obsession with Phoronix benchmark articles, (b) file
systems tend to be inherently multi-threaded, with lots of locks, and
(c) file systems are all about managing global state in the form of
files, directories, etc.

However, hiding a miniature architecture-specific compiler inside a
file system seems to be a rather blatant example of "premature
optimization".

							- Ted
Kent Overstreet May 11, 2023, 5:44 a.m. UTC | #13
On Thu, May 11, 2023 at 01:33:12AM -0400, Theodore Ts'o wrote:
> Seriously, does this mean that bcachefs won't work on Arm systems
> (arm32 or arm64)?  Or RISC-V systems?  Or S/390s?  Or Power
> architectures?  Or Itanium or PA-RISC systems?  (OK, I really don't
> care all that much about those last two.  :-)

No :)

My CI servers are arm64 servers. There's a bch2_bkey_unpack_key()
written in C, that works on any architecture. But specializing for a
particular format is a not-insignificant performance improvement, so
writing an arm64 version has been on my todo list.

> When people ask me why file systems are so hard to make enterprise
> ready, I tell them to recall the general advice given to people to
> write secure, robust systems: (a) avoid premature optimization, (b)
> avoid fine-grained, multi-threaded programming, as much as possible,
> because locking bugs are a b*tch, and (c) avoid unnecessary global
> state as much as possible.
> 
> File systems tend to violate all of these precepts: (a) people chase
> benchmark optimizations to the exclusion of all else, because people
> have an unhealthy obsession with Phornix benchmark articles, (b) file
> systems tend to be inherently multi-threaded, with lots of locks, and
> (c) file systems are all about managing global state in the form of
> files, directories, etc.
> 
> However, hiding a miniature architecture-specific compiler inside a
> file system seems to be a rather blatant example of "premature
> optimization".

Ted, this project is _15_ years old.

I'm getting ready to write a full explanation of what this is for and
why it's important, I've just been busy with the conference - and I want
to write something good, that provides all the context.

I've also been mulling over fallback options, but I don't see any good
ones. The unspecialized C version of unpack has branches (the absolute
minimum, I took my time when I was writing that code too); the
specialized versions are branchless and _much_ smaller, and the only way
to do that specialization is with some form of dynamic codegen.

But I do owe you all a detailed walkthrough of what this is all about,
so you'll get it in the next day or so.

Cheers,
Kent
Kees Cook May 11, 2023, 10:28 p.m. UTC | #14
On Wed, May 10, 2023 at 03:05:48PM +0000, Johannes Thumshirn wrote:
> On 09.05.23 18:56, Kent Overstreet wrote:
> > +/**
> > + * vmalloc_exec - allocate virtually contiguous, executable memory
> > + * @size:	  allocation size
> > + *
> > + * Kernel-internal function to allocate enough pages to cover @size
> > + * the page level allocator and map them into contiguous and
> > + * executable kernel virtual space.
> > + *
> > + * For tight control over page level allocator and protection flags
> > + * use __vmalloc() instead.
> > + *
> > + * Return: pointer to the allocated memory or %NULL on error
> > + */
> > +void *vmalloc_exec(unsigned long size, gfp_t gfp_mask)
> > +{
> > +	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
> > +			gfp_mask, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
> > +			NUMA_NO_NODE, __builtin_return_address(0));
> > +}
> > +EXPORT_SYMBOL_GPL(vmalloc_exec);
> 
> Uh W+X memory reagions.
> The 90s called, they want their shellcode back.

Just to clarify: the kernel must never create W+X memory regions. So,
no, do not reintroduce vmalloc_exec().

Dynamic code areas need to be constructed in a non-executable memory,
then switched to read-only and verified to still be what was expected,
and only then made executable.
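
For illustration, a minimal sketch of the sequence described above, assuming an architecture where set_memory_ro()/set_memory_x() work on vmalloc'd memory (icache flushing, direct-map aliasing and a real verification step are glossed over; this is not a proposed API):

#include <linux/mm.h>
#include <linux/set_memory.h>
#include <linux/string.h>
#include <linux/vmalloc.h>

/* Sketch only: build in writable memory, seal read-only, verify, then make executable */
static void *install_generated_code(const void *image, size_t len)
{
	unsigned int npages = PAGE_ALIGN(len) >> PAGE_SHIFT;
	void *buf = vmalloc(PAGE_ALIGN(len));	/* writable, not executable */

	if (!buf)
		return NULL;

	memcpy(buf, image, len);
	set_memory_ro((unsigned long) buf, npages);

	if (memcmp(buf, image, len)) {		/* verify before enabling exec */
		set_memory_rw((unsigned long) buf, npages);
		vfree(buf);
		return NULL;
	}

	set_memory_x((unsigned long) buf, npages);
	return buf;
}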
Kent Overstreet May 12, 2023, 6:36 p.m. UTC | #15
On Tue, May 09, 2023 at 11:48:49PM -0700, Eric Biggers wrote:
> What seems to be missing is any explanation for what we're actually getting from
> this extremely unusual solution that cannot be gained any other way.  What is
> unique about bcachefs that it really needs something like this?

Ok, as promised:

Background: all metadata in bcachefs is structured as key/value pairs,
and there's a common key format for all keys.

struct bkey {
	/* 3 byte header */
	u8		u64s;		/* size of k/v in u64s */
	u8		format;		/* packed/unpacked, needs_whiteout */
	u8		type;		/* value type */
	u8		pad;

	/*
	 * Order of fields below is for little endian, they're in
	 * reverse order on big endian (and byte swabbed as necessary
	 * when reading foreign endian metadata)
	 * 
	 * Since field order matches byte order, the key can be treated
	 * as one large multi word integer for doing comparisons:
	 */
	u96		version;	/* nonces, send/recv support */
	u32		size;		/* size of extent keys */

	/* Below are the field used for ordering/comparison: */
	u32		snapshot;	
	u64		offset;
	u64		inode;

	/* Value is stored inline with key */
	struct bch_val	v;
};

sizeof(struct bkey) == 40.

An extent value that has one pointer and no checksum is 8 bytes, with
one pointer and one 32 bit checksum 16 bytes, for 56 bytes total (key
included).

But for a given btree node, most of the key fields will typically be
redundant. An extents leaf node might have extents all for one inode
number or a small range of inode numbers, snapshots may or may not be in
use, etc. - clearly some compression is desirable here.

The key observation is that key compression is possible if we have a
compression function that preserves order, and an order-preserving
compression function is possible if it's allowed to fail. That means we
do comparisons on packed keys, which lets us skip _most_ unpack
operations, for btree node resorts and for lookups within a node.

Packing works by defining a format with an offset and a bit width for
each field, so e.g. if all keys in a btree node have the same inode
number the packed format can specify that inode number and then a field
width of 0 bits for the inode field.

Continuing the arithmetic from before, a packed extent key will
typically be only 8 or 16 bytes, or 24-32 including the val, which means
bkey packing cuts our metadata size roughly in half.

(It also makes our key format somewhat self describing and gives us a
mechanism by which we could add or extend fields in the future).
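
As a purely illustrative sketch of the packing rule (the struct layout and names here are invented, not bcachefs's actual struct bkey_format): each field gets a bit width and an offset, and packing is allowed to fail when a value doesn't fit, in which case the key is stored unpacked:

#include <linux/types.h>

/* Hypothetical, simplified format descriptor */
struct demo_format {
	u8	bits_per_field[6];	/* 0 bits => field is a per-node constant */
	u64	field_offset[6];	/* subtracted from the value before packing */
};

/* Pack one field; returns false if this key can't be packed in this format */
static bool demo_pack_field(const struct demo_format *f, unsigned int i,
			    u64 v, u64 *packed)
{
	unsigned int bits = f->bits_per_field[i];

	if (v < f->field_offset[i])
		return false;
	v -= f->field_offset[i];
	if (bits < 64 && (v >> bits))
		return false;		/* doesn't fit: caller stores the key unpacked */
	*packed = v;
	return true;
}
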

-----------------------------------------------------

As mentioned before, since packed bkeys are still multi-word integers we
can do some important operations without unpacking, but to iterate over
keys, compare packed & unpacked keys in resort, etc. - we'll still need
to unpack, so we need this operation to be as fast as possible.

bkey.c __bch2_bkey_unpack_key() is the unspecialized version written in
C that works on any architecture. It loops over the fields in a
bkey_format, pulling them out of the input words and adding back the
field offsets. It's got the absolute minimum number of branches - one
per field, when deciding to advance to the next input word - but it
can't be branchless and it's a whole ton of shifts and bitops.

Dynamic codegen lets us produce unpack functions that are fully
branchless and _much_ smaller. For any given btree node we'll have a
format where multiple fields have a field width of 0 - i.e. those
fields are always constants. That code goes away, and also if the
format can be byte aligned we can eliminate shifts and bitops. Code size for the
dynamically compiled unpack functions is roughly 10% that of the
unspecialized C version.
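
To make the win concrete, here's a made-up example of what a specialized unpack is morally equivalent to for one node's format - inode, snapshot and version at 0 bits, offset byte-aligned - reusing the demo_format sketch above; the byte offset is invented:

#include <asm/unaligned.h>

/* Hypothetical specialization: no loops, no per-field branches - just a
 * load, an add, and constants folded in from the format */
static struct bpos demo_unpack_pos_specialized(const struct demo_format *f,
					       const void *packed)
{
	struct bpos out;

	out.inode    = f->field_offset[0];			/* 0-bit field */
	out.offset   = f->field_offset[1] +
		       get_unaligned((const u64 *) ((const u8 *) packed + 3));
	out.snapshot = f->field_offset[2];			/* 0-bit field */
	return out;
}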

I hope that addresses some of the "what is this even for" questions :)

Cheers,
Kent
Kent Overstreet May 12, 2023, 6:41 p.m. UTC | #16
On Thu, May 11, 2023 at 03:28:40PM -0700, Kees Cook wrote:
> On Wed, May 10, 2023 at 03:05:48PM +0000, Johannes Thumshirn wrote:
> > On 09.05.23 18:56, Kent Overstreet wrote:
> > > +/**
> > > + * vmalloc_exec - allocate virtually contiguous, executable memory
> > > + * @size:	  allocation size
> > > + *
> > > + * Kernel-internal function to allocate enough pages to cover @size
> > > + * the page level allocator and map them into contiguous and
> > > + * executable kernel virtual space.
> > > + *
> > > + * For tight control over page level allocator and protection flags
> > > + * use __vmalloc() instead.
> > > + *
> > > + * Return: pointer to the allocated memory or %NULL on error
> > > + */
> > > +void *vmalloc_exec(unsigned long size, gfp_t gfp_mask)
> > > +{
> > > +	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
> > > +			gfp_mask, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
> > > +			NUMA_NO_NODE, __builtin_return_address(0));
> > > +}
> > > +EXPORT_SYMBOL_GPL(vmalloc_exec);
> > 
> > Uh, W+X memory regions.
> > The 90s called, they want their shellcode back.
> 
> Just to clarify: the kernel must never create W+X memory regions. So,
> no, do not reintroduce vmalloc_exec().
> 
> Dynamic code areas need to be constructed in a non-executable memory,
> then switched to read-only and verified to still be what was expected,
> and only then made executable.

So if we're opening this up to the topic of what an acceptable API would
look like - how hard is this requirement?

The reason is that the functions we're constructing are only ~50 bytes,
so we don't want to be burning a full page per function (particularly
for the 64kb page architectures...)

If we were to build an allocator for sub-page dynamically constructed
code, we'd have to flip the whole page to W+X while copying in the new
function. But we could construct it in non-executable memory and then
hand it off to this new allocator to do the copy, which would also do
the page permission flipping.

It seems like this is something BPF might want eventually too, depending
on average BPF program size...
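
A rough sketch of the kind of interface being described - every name here is invented, nothing like this exists today - where callers only ever see executable, never writable, memory and the allocator does the copy and the sub-page permission flipping internally:

/* Hypothetical interface sketch, not an existing kernel API */
void *exec_mem_alloc_copy(const void *insns, size_t len);	/* returns X, never W */
void exec_mem_free(const void *p);

/* Hypothetical usage: codegen emits the ~50 byte function into a plain
 * buffer, then hands it off */
typedef u64 (*demo_unpack_fn)(const void *packed_key);

static demo_unpack_fn demo_compile_unpack(const struct bkey_format *f)
{
	u8 buf[64];
	size_t len = demo_gen_unpack_fn(f, buf);	/* hypothetical codegen helper */

	return (demo_unpack_fn) exec_mem_alloc_copy(buf, len);
}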
Eric Biggers May 13, 2023, 1:57 a.m. UTC | #17
Hi Kent,

On Fri, May 12, 2023 at 02:36:13PM -0400, Kent Overstreet wrote:
> On Tue, May 09, 2023 at 11:48:49PM -0700, Eric Biggers wrote:
> > What seems to be missing is any explanation for what we're actually getting from
> > this extremely unusual solution that cannot be gained any other way.  What is
> > unique about bcachefs that it really needs something like this?
> 
> Ok, as promised:
> 
> Background: all metadata in bcachefs is structured as key/value pairs,
> and there's a common key format for all keys.
> 
> struct bkey {
> 	/* 3 byte header */
> 	u8		u64s;		/* size of k/v in u64s */
> 	u8		format;		/* packed/unpacked, needs_whiteout */
> 	u8		type;		/* value type */
> 	u8		pad;
> 
> 	/*
> 	 * Order of fields below is for little endian, they're in
> 	 * reverse order on big endian (and byte swabbed as necessary
> 	 * when reading foreign endian metadata)
> 	 * 
> 	 * Since field order matches byte order, the key can be treated
> 	 * as one large multi word integer for doing comparisons:
> 	 */
> 	u96		version;	/* nonces, send/recv support */
> 	u32		size;		/* size of extent keys */
> 
> 	/* Below are the field used for ordering/comparison: */
> 	u32		snapshot;	
> 	u64		offset;
> 	u64		inode;
> 
> 	/* Value is stored inline with key */
> 	struct bch_val	v;
> };
> 
> sizeof(struct bkey) == 40.
> 
> An extent value that has one pointer and no checksum is 8 bytes, with
> one pointer and one 32 bit checksum 16 bytes, for 56 bytes total (key
> included).
> 
> But for a given btree node, most of the key fields will typically be
> redundant. An extents leaf node might have extents all for one inode
> number or a small range of inode numbers, snapshots may or may not be in
> use, etc. - clearly some compression is desirable here.
> 
> The key observation is that key compression is possible if we have a
> compression function that preserves order, and an order-preserving
> compression function is possible if it's allowed to fail. That means we
> do comparisons on packed keys, which lets us skip _most_ unpack
> operations, for btree node resorts and for lookups within a node.
> 
> Packing works by defining a format with an offset and a bit width for
> each field, so e.g. if all keys in a btree node have the same inode
> number the packed format can specify that inode number and then a field
> width of 0 bits for the inode field.
> 
> Continuing the arithmetic from before, a packed extent key will
> typically be only 8 or 16 bytes, or 24-32 including the val, which means
> bkey packing cuts our metadata size roughly in half.
> 
> (It also makes our key format somewhat self describing and gives us a
> mechanism by which we could add or extend fields in the future).
> 
> -----------------------------------------------------
> 
> As mentioned before, since packed bkeys are still multi-word integers we
> can do some important operations without unpacking, but to iterate over
> keys, compare packed & unpacked keys in resort, etc. - we'll still need
> to unpack, so we need this operation to be as fast as possible.
> 
> bkey.c __bch2_bkey_unpack_key() is the unspecialized version written in
> C that works on any architecture. It loops over the fields in a
> bkey_format, pulling them out of the input words and adding back the
> field offsets. It's got the absolute minimum number of branches - one
> per field, when deciding to advance to the next input word - but it
> can't be branchless and it's a whole ton of shifts and bitops.
> 
> Dynamic codegen lets us produce unpack functions that are fully
> branchless and _much_ smaller. For any given btree node we'll have a
> format where multiple fields have a field width of 0 - i.e. those
> fields are always constants. That code goes away, and also if the
> format can be byte aligned we can eliminate shifts and bitops. Code size for the
> dynamically compiled unpack functions is roughly 10% that of the
> unspecialized C version.
> 
> I hope that addresses some of the "what is this even for" questions :)
> 
> Cheers,
> Kent

I don't think this response addresses all the possibilities for optimizing the C
implementation, so I'd like to bring up a few and make sure that you've explored
them.

To summarize, you need to decode 6 fields that are each a variable number of
bits (not known at compile time), and add an offset (also not known at compile
time) to each field.

I don't think the offset is particularly interesting.  Adding an offset to each
field is very cheap and trivially parallelizable by the CPU.

It's really the bit width that's "interesting", as it must be the serialized
decoding of variable-length fields that slows things down a lot.

First, I wanted to mention that decoding of variable-length fields has been
extensively studied for decompression algorithms, e.g. for Huffman decoding.
And it turns out that it can be done branchlessly.  The basic idea is that you
have a branchless refill step that looks like the following:

#define REFILL_BITS_BRANCHLESS()                    \
        bitbuf |= get_unaligned_u64(p) << bitsleft; \
        p += 7 - ((bitsleft >> 3) & 0x7);           \
        bitsleft |= 56;

That branchlessly ensures that 'bitbuf' contains '56 <= bitsleft <= 63' bits.
Then, the needed number of bits can be removed and returned:

#define READ_BITS(n)                          \
        REFILL_BITS_BRANCHLESS();             \
        tmp = bitbuf & (((u64)1 << (n)) - 1); \
        bitbuf >>= (n);                       \
        bitsleft -= (n);                      \
        tmp

If you're interested, I can give you some references about the above method.
But, I really just wanted to mention it for completeness, since I think you'd
actually want to go in a slightly different direction, since (a) you have all
the field widths available from the beginning, as opposed to being interleaved
into the bitstream itself (as is the case in Huffman decoding for example), so
you're not limited to serialized decoding of each field, (b) your fields are up
to 96 bits, and (c) you've selected a bitstream convention that seems to make it
such that your stream *must* be read in aligned units of u64, so I don't think
something like REFILL_BITS_BRANCHLESS() could work for you anyway.

What I would suggest instead is preprocessing the list of 6 field lengths to
create some information that can be used to extract all 6 fields branchlessly
with no dependencies between different fields.  (And you clearly *can* add a
preprocessing step, as you already have one -- the dynamic code generator.)

So, something like the following:

    const struct field_info *info = &format->fields[0];

    field0 = (in->u64s[info->word_idx] >> info->shift1) & info->mask;
    field0 |= in->u64s[info->word_idx - 1] >> info->shift2;

... but with the code for all 6 fields interleaved.

On modern CPUs, I think that would be faster than your current C code.
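
Spelling that suggestion out a little (purely illustrative: struct and field names invented, fields limited to 64 bits, a little-endian word convention rather than bcachefs's actual one, and the input assumed padded so reading one u64 past the last field's word is safe):

#include <linux/types.h>

struct demo_field_info {
	u8	word_idx;	/* u64 word the field starts in */
	u8	shift;		/* bit offset of the field within that word */
	u64	mask;		/* (1 << width) - 1, or 0 for 0-width fields */
};

/* Preprocessing: run once per format, e.g. when a btree node is read in */
static void demo_precompute(struct demo_field_info *info,
			    unsigned int bit_offset, unsigned int width)
{
	info->word_idx	= bit_offset / 64;
	info->shift	= bit_offset % 64;
	info->mask	= width ? ~0ULL >> (64 - width) : 0;
}

/* Extraction: branchless, and independent across fields, so the CPU can
 * work on all six fields in parallel */
static u64 demo_extract(const struct demo_field_info *info, const u64 *words)
{
	u64 lo = words[info->word_idx];
	u64 hi = words[info->word_idx + 1];

	/* the double shift keeps both shift counts < 64 (no UB when shift == 0) */
	return ((lo >> info->shift) | (hi << 1 << (63 - info->shift))) &
		info->mask;
}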

You could do better by creating variants that are specialized for specific
common sets of parameters.  During "preprocessing", you would select a variant
and set an enum accordingly.  During decoding, you would switch on that enum and
call the appropriate variant.  (This could also be done with a function pointer,
of course, but indirect calls are slow these days...)

For example, you mentioned that 8-byte packed keys is a common case.  In that
case there is only a single u64 to decode from, so you could create a function
that just handles that case:

    field0 = (word >> info->shift) & info->mask;

You could also create other variants, e.g.:

- 16-byte packed keys (which you mentioned are common)
- Some specific set of fields have zero width so don't need to be extracted
  (which it sounds like is common, or is it different fields each time?)
- All fields having specific lengths (are there any particularly common cases?)

Have you considered any of these ideas?

- Eric
Lorenzo Stoakes May 13, 2023, 1:25 p.m. UTC | #18
On Tue, May 09, 2023 at 02:12:41PM -0700, Lorenzo Stoakes wrote:
> On Tue, May 09, 2023 at 01:46:09PM -0700, Christoph Hellwig wrote:
> > On Tue, May 09, 2023 at 12:56:32PM -0400, Kent Overstreet wrote:
> > > From: Kent Overstreet <kent.overstreet@gmail.com>
> > >
> > > This is needed for bcachefs, which dynamically generates per-btree node
> > > unpack functions.
> >
> > No, we will never add back a way for random code allocating executable
> > memory in kernel space.
>
> Yeah I think I glossed over this aspect a bit as it looks ostensibly like simply
> reinstating a helper function because the code is now used in more than one
> place (at lsf/mm so a little distracted :)
>
> But it being exported is a problem. Perhaps there's another way of achieving the
> same aim without having to do so?

Just to be abundantly clear, my original ack was a mistake (I overlooked
the _exporting_ of the function being as significant as it is and assumed
in an LSF/MM haze that it was simply a refactoring of _already available_
functionality rather than newly providing a means to allocate directly
executable kernel memory).

Exporting this is horrible for the numerous reasons expounded on in this
thread, we need a different solution.

Nacked-by: Lorenzo Stoakes <lstoakes@gmail.com>
Kent Overstreet May 13, 2023, 7:28 p.m. UTC | #19
On Fri, May 12, 2023 at 06:57:52PM -0700, Eric Biggers wrote:
> What I would suggest instead is preprocessing the list of 6 field lengths to
> create some information that can be used to extract all 6 fields branchlessly
> with no dependencies between different fields.  (And you clearly *can* add a
> preprocessing step, as you already have one -- the dynamic code generator.)
> 
> So, something like the following:
> 
>     const struct field_info *info = &format->fields[0];
> 
>     field0 = (in->u64s[info->word_idx] >> info->shift1) & info->mask;
>     field0 |= in->u64s[info->word_idx - 1] >> info->shift2;
> 
> ... but with the code for all 6 fields interleaved.
> 
> On modern CPUs, I think that would be faster than your current C code.
> 
> You could do better by creating variants that are specialized for specific
> common sets of parameters.  During "preprocessing", you would select a variant
> and set an enum accordingly.  During decoding, you would switch on that enum and
> call the appropriate variant.  (This could also be done with a function pointer,
> of course, but indirect calls are slow these days...)
> 
> For example, you mentioned that 8-byte packed keys is a common case.  In that
> case there is only a single u64 to decode from, so you could create a function
> that just handles that case:
> 
>     field0 = (word >> info->shift) & info->mask;
> 
> You could also create other variants, e.g.:
> 
> - 16-byte packed keys (which you mentioned are common)
> - Some specific set of fields have zero width so don't need to be extracted
>   (which it sounds like is common, or is it different fields each time?)
> - All fields having specific lengths (are there any particularly common cases?)
> 
> Have you considered any of these ideas?

I like that idea.

Gonna hack some code... :)
Kent Overstreet May 14, 2023, 5:45 a.m. UTC | #20
On Fri, May 12, 2023 at 06:57:52PM -0700, Eric Biggers wrote:
> First, I wanted to mention that decoding of variable-length fields has been
> extensively studied for decompression algorithms, e.g. for Huffman decoding.
> And it turns out that it can be done branchlessly.  The basic idea is that you
> have a branchless refill step that looks like the following:
> 
> #define REFILL_BITS_BRANCHLESS()                    \
>         bitbuf |= get_unaligned_u64(p) << bitsleft; \
>         p += 7 - ((bitsleft >> 3) & 0x7);           \
>         bitsleft |= 56;
> 
> That branchlessly ensures that 'bitbuf' contains '56 <= bitsleft <= 63' bits.
> Then, the needed number of bits can be removed and returned:
> 
> #define READ_BITS(n)                          \
>         REFILL_BITS_BRANCHLESS();             \
>         tmp = bitbuf & (((u64)1 << (n)) - 1); \
>         bitbuf >>= (n);                       \
>         bitsleft -= (n);                      \
>         tmp
> 
> If you're interested, I can give you some references about the above method.

I might be interested in those references, new bit tricks and integer
encodings are always fun :)

> But, I really just wanted to mention it for completeness, since I think you'd
> actually want to go in a slightly different direction, since (a) you have all
> the field widths available from the beginning, as opposed to being interleaved
> into the bitstream itself (as is the case in Huffman decoding for example), so
> you're not limited to serialized decoding of each field, (b) your fields are up
> to 96 bits, and (c) you've selected a bitstream convention that seems to make it
> such that your stream *must* be read in aligned units of u64, so I don't think
> something like REFILL_BITS_BRANCHLESS() could work for you anyway.
> 
> What I would suggest instead is preprocessing the list of 6 field lengths to
> create some information that can be used to extract all 6 fields branchlessly
> with no dependencies between different fields.  (And you clearly *can* add a
> preprocessing step, as you already have one -- the dynamic code generator.)
> 
> So, something like the following:
> 
>     const struct field_info *info = &format->fields[0];
> 
>     field0 = (in->u64s[info->word_idx] >> info->shift1) & info->mask;
>     field0 |= in->u64s[info->word_idx - 1] >> info->shift2;
> 
> ... but with the code for all 6 fields interleaved.
> 
> On modern CPUs, I think that would be faster than your current C code.
> 
> You could do better by creating variants that are specialized for specific
> common sets of parameters.  During "preprocessing", you would select a variant
> and set an enum accordingly.  During decoding, you would switch on that enum and
> call the appropriate variant.  (This could also be done with a function pointer,
> of course, but indirect calls are slow these days...)

testing random btree updates:

dynamically generated unpack:
rand_insert: 20.0 MiB with 1 threads in    33 sec,  1609 nsec per iter, 607 KiB per sec

old C unpack:
rand_insert: 20.0 MiB with 1 threads in    35 sec,  1672 nsec per iter, 584 KiB per sec

the Eric Biggers special:
rand_insert: 20.0 MiB with 1 threads in    35 sec,  1676 nsec per iter, 583 KiB per sec

Tested two versions of your approach, one without a shift value, one
where we use a shift value to try to avoid unaligned access - the second was
perhaps 1% faster

so it's not looking good. This benchmark doesn't even hit on
unpack_key() quite as much as I thought, so the difference is
significant.

diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c
index 6d3a1c096f..128d96766c 100644
--- a/fs/bcachefs/bkey.c
+++ b/fs/bcachefs/bkey.c
@@ -7,6 +7,8 @@
 #include "bset.h"
 #include "util.h"
 
+#include <asm/unaligned.h>
+
 #undef EBUG_ON
 
 #ifdef DEBUG_BKEYS
@@ -19,9 +21,35 @@ const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT;
 
 struct bkey_format_processed bch2_bkey_format_postprocess(const struct bkey_format f)
 {
-	return (struct bkey_format_processed) {
-		.f = f,
-	};
+	struct bkey_format_processed ret = { .f = f, .aligned = true };
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	unsigned offset = f.key_u64s * 64;
+#else
+	unsigned offset = KEY_PACKED_BITS_START;
+#endif
+
+	for (unsigned i = 0; i < BKEY_NR_FIELDS; i++) {
+		unsigned bits = f.bits_per_field[i];
+
+		if (bits & 7) {
+			ret.aligned = false;
+			break;
+		}
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+		offset -= bits;
+#endif
+
+		ret.shift[i]	= min(offset & 63, 64 - bits);
+		ret.offset[i]	= (offset - ret.shift[i]) / 8;
+		ret.mask[i]	= bits ? ~0ULL >> (64 - bits) : 0;
+
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+		offset += bits;
+#endif
+	}
+
+	return ret;
 }
 
 void bch2_bkey_packed_to_binary_text(struct printbuf *out,
@@ -191,6 +219,19 @@ static u64 get_inc_field(struct unpack_state *state, unsigned field)
 	return v + offset;
 }
 
+__always_inline
+static u64 get_aligned_field(const struct bkey_format_processed *f,
+			     const struct bkey_packed *in,
+			     unsigned field_idx)
+{
+	u64 v = get_unaligned((u64 *) (((u8 *) in->_data) + f->offset[field_idx]));
+
+	v >>= f->shift[field_idx];
+	v &= f->mask[field_idx];
+
+	return v + le64_to_cpu(f->f.field_offset[field_idx]);
+}
+
 __always_inline
 static bool set_inc_field(struct pack_state *state, unsigned field, u64 v)
 {
@@ -269,45 +310,57 @@ bool bch2_bkey_transform(const struct bkey_format *out_f,
 	return true;
 }
 
-struct bkey __bch2_bkey_unpack_key(const struct bkey_format_processed *format_p,
+struct bkey __bch2_bkey_unpack_key(const struct bkey_format_processed *format,
 				   const struct bkey_packed *in)
 {
-	const struct bkey_format *format = &format_p->f;
-	struct unpack_state state = unpack_state_init(format, in);
 	struct bkey out;
 
-	EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
-	EBUG_ON(in->u64s < format->key_u64s);
+	EBUG_ON(format->f.nr_fields != BKEY_NR_FIELDS);
+	EBUG_ON(in->u64s < format->f.key_u64s);
 	EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE);
-	EBUG_ON(in->u64s - format->key_u64s + BKEY_U64s > U8_MAX);
+	EBUG_ON(in->u64s - format->f.key_u64s + BKEY_U64s > U8_MAX);
 
-	out.u64s	= BKEY_U64s + in->u64s - format->key_u64s;
+	out.u64s	= BKEY_U64s + in->u64s - format->f.key_u64s;
 	out.format	= KEY_FORMAT_CURRENT;
 	out.needs_whiteout = in->needs_whiteout;
 	out.type	= in->type;
 	out.pad[0]	= 0;
 
+	if (likely(format->aligned)) {
+#define x(id, field)	out.field = get_aligned_field(format, in, id);
+		bkey_fields()
+#undef x
+	} else {
+		struct unpack_state state = unpack_state_init(&format->f, in);
+
 #define x(id, field)	out.field = get_inc_field(&state, id);
-	bkey_fields()
+		bkey_fields()
 #undef x
+	}
 
 	return out;
 }
 
-struct bpos __bkey_unpack_pos(const struct bkey_format_processed *format_p,
+struct bpos __bkey_unpack_pos(const struct bkey_format_processed *format,
 			      const struct bkey_packed *in)
 {
-	const struct bkey_format *format = &format_p->f;
-	struct unpack_state state = unpack_state_init(format, in);
 	struct bpos out;
 
-	EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
-	EBUG_ON(in->u64s < format->key_u64s);
+	EBUG_ON(format->f.nr_fields != BKEY_NR_FIELDS);
+	EBUG_ON(in->u64s < format->f.key_u64s);
 	EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE);
 
-	out.inode	= get_inc_field(&state, BKEY_FIELD_INODE);
-	out.offset	= get_inc_field(&state, BKEY_FIELD_OFFSET);
-	out.snapshot	= get_inc_field(&state, BKEY_FIELD_SNAPSHOT);
+	if (likely(format->aligned)) {
+		out.inode	= get_aligned_field(format, in, BKEY_FIELD_INODE);
+		out.offset	= get_aligned_field(format, in, BKEY_FIELD_OFFSET);
+		out.snapshot	= get_aligned_field(format, in, BKEY_FIELD_SNAPSHOT);
+	} else {
+		struct unpack_state state = unpack_state_init(&format->f, in);
+
+		out.inode	= get_inc_field(&state, BKEY_FIELD_INODE);
+		out.offset	= get_inc_field(&state, BKEY_FIELD_OFFSET);
+		out.snapshot	= get_inc_field(&state, BKEY_FIELD_SNAPSHOT);
+	}
 
 	return out;
 }
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index 58ce60c37e..38c3ec6852 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -70,6 +70,10 @@ struct btree_bkey_cached_common {
 
 struct bkey_format_processed {
 	struct bkey_format	f;
+	bool			aligned;
+	u8			offset[6];
+	u8			shift[6];
+	u64			mask[6];
 };
 
 struct btree {
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index dcfd7ceacc..72aedc1e34 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -181,7 +181,11 @@ static inline void btree_node_reset_sib_u64s(struct btree *b)
 
 static inline void *btree_data_end(struct bch_fs *c, struct btree *b)
 {
-	return (void *) b->data + btree_bytes(c);
+	/*
+	 * __bch2_bkey_unpack_key() may read up to 8 bytes past the end of the
+	 * input buffer:
+	 */
+	return (void *) b->data + btree_bytes(c) - 8;
 }
 
 static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c,
Christophe Leroy May 14, 2023, 6:39 p.m. UTC | #21
On 13/05/2023 at 15:25, Lorenzo Stoakes wrote:
> On Tue, May 09, 2023 at 02:12:41PM -0700, Lorenzo Stoakes wrote:
>> On Tue, May 09, 2023 at 01:46:09PM -0700, Christoph Hellwig wrote:
>>> On Tue, May 09, 2023 at 12:56:32PM -0400, Kent Overstreet wrote:
>>>> From: Kent Overstreet <kent.overstreet@gmail.com>
>>>>
>>>> This is needed for bcachefs, which dynamically generates per-btree node
>>>> unpack functions.
>>>
>>> No, we will never add back a way for random code allocating executable
>>> memory in kernel space.
>>
>> Yeah I think I glossed over this aspect a bit as it looks ostensibly like simply
>> reinstating a helper function because the code is now used in more than one
>> place (at lsf/mm so a little distracted :)
>>
>> But it being exported is a problem. Perhaps there's another way of acheving the
>> same aim without having to do so?
> 
> Just to be abundantly clear, my original ack was a mistake (I overlooked
> the _exporting_ of the function being as significant as it is and assumed
> in an LSF/MM haze that it was simply a refactoring of _already available_
> functionality rather than newly providing a means to allocate directly
> executable kernel memory).
> 
> Exporting this is horrible for the numerous reasons expounded on in this
> thread, we need a different solution.
> 
> Nacked-by: Lorenzo Stoakes <lstoakes@gmail.com>
> 

In addition to that, I still don't understand why you bring back 
vmalloc_exec() instead of using module_alloc().

As mentioned in a previous response, some architectures like powerpc/32s 
cannot allocate exec memory in vmalloc space. On powerpc this is because 
exec protection is performed on 256 Mbyte segments and vmalloc space is 
flagged non-exec. Some other architectures have a constraint on the distance 
between kernel core text and other text.

Today you have, for instance, kprobes in the kernel, which need dynamic exec 
memory and use module_alloc() to get it. On some architectures you also 
have ftrace, which gets some exec memory with module_alloc().

So, I still don't understand why you cannot use module_alloc() and need 
vmalloc_exec() instead.

Thanks
Christophe
Eric Biggers May 14, 2023, 6:43 p.m. UTC | #22
On Sun, May 14, 2023 at 01:45:29AM -0400, Kent Overstreet wrote:
> On Fri, May 12, 2023 at 06:57:52PM -0700, Eric Biggers wrote:
> > First, I wanted to mention that decoding of variable-length fields has been
> > extensively studied for decompression algorithms, e.g. for Huffman decoding.
> > And it turns out that it can be done branchlessly.  The basic idea is that you
> > have a branchless refill step that looks like the following:
> > 
> > #define REFILL_BITS_BRANCHLESS()                    \
> >         bitbuf |= get_unaligned_u64(p) << bitsleft; \
> >         p += 7 - ((bitsleft >> 3) & 0x7);           \
> >         bitsleft |= 56;
> > 
> > That branchlessly ensures that 'bitbuf' contains '56 <= bitsleft <= 63' bits.
> > Then, the needed number of bits can be removed and returned:
> > 
> > #define READ_BITS(n)                          \
> >         REFILL_BITS_BRANCHLESS();             \
> >         tmp = bitbuf & (((u64)1 << (n)) - 1); \
> >         bitbuf >>= (n);                       \
> >         bitsleft -= (n);                      \
> >         tmp
> > 
> > If you're interested, I can give you some references about the above method.
> 
> I might be interested in those references, new bit tricks and integer
> encodings are always fun :)

There are some good blog posts by Fabian Giesen:

* https://fgiesen.wordpress.com/2018/02/19/reading-bits-in-far-too-many-ways-part-1/
* https://fgiesen.wordpress.com/2018/02/20/reading-bits-in-far-too-many-ways-part-2/
* https://fgiesen.wordpress.com/2018/09/27/reading-bits-in-far-too-many-ways-part-3/

And the examples I gave above are basically what I use in libdeflate:
https://github.com/ebiggers/libdeflate/blob/master/lib/deflate_decompress.c
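
To make the shape of it concrete, here's a minimal self-contained sketch of the
refill/read scheme (an illustration only, not the libdeflate code: it assumes a
little-endian bitstream, does no end-of-buffer handling, and only supports
reads of up to 56 bits at a time):

#include <stdint.h>
#include <string.h>

struct bitreader {
	const uint8_t	*p;		/* next bytes to refill from */
	uint64_t	bitbuf;		/* unconsumed bits, LSB first */
	unsigned	bitsleft;	/* number of valid bits in bitbuf */
};

static inline uint64_t get_unaligned_u64(const uint8_t *p)
{
	uint64_t v;

	memcpy(&v, p, sizeof(v));	/* a single unaligned load on x86 */
	return v;
}

/* Branchless refill: afterwards 56 <= bitsleft <= 63. */
static inline void refill_bits(struct bitreader *br)
{
	br->bitbuf   |= get_unaligned_u64(br->p) << br->bitsleft;
	br->p        += 7 - ((br->bitsleft >> 3) & 0x7);
	br->bitsleft |= 56;
}

/* Read the next n bits (1 <= n <= 56) of the stream. */
static inline uint64_t read_bits(struct bitreader *br, unsigned n)
{
	uint64_t v;

	refill_bits(br);
	v = br->bitbuf & (((uint64_t)1 << n) - 1);
	br->bitbuf   >>= n;
	br->bitsleft  -= n;
	return v;
}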

> > But, I really just wanted to mention it for completeness, since I think you'd
> > actually want to go in a slightly different direction, since (a) you have all
> > the field widths available from the beginning, as opposed to being interleaved
> > into the bitstream itself (as is the case in Huffman decoding for example), so
> > you're not limited to serialized decoding of each field, (b) your fields are up
> > to 96 bits, and (c) you've selected a bitstream convention that seems to make it
> > such that your stream *must* be read in aligned units of u64, so I don't think
> > something like REFILL_BITS_BRANCHLESS() could work for you anyway.
> > 
> > What I would suggest instead is preprocessing the list of 6 field lengths to
> > create some information that can be used to extract all 6 fields branchlessly
> > with no dependencies between different fields.  (And you clearly *can* add a
> > preprocessing step, as you already have one -- the dynamic code generator.)
> > 
> > So, something like the following:
> > 
> >     const struct field_info *info = &format->fields[0];
> > 
> >     field0 = (in->u64s[info->word_idx] >> info->shift1) & info->mask;
> >     field0 |= in->u64s[info->word_idx - 1] >> info->shift2;
> > 
> > ... but with the code for all 6 fields interleaved.
> > 
> > On modern CPUs, I think that would be faster than your current C code.
> > 
> > You could do better by creating variants that are specialized for specific
> > common sets of parameters.  During "preprocessing", you would select a variant
> > and set an enum accordingly.  During decoding, you would switch on that enum and
> > call the appropriate variant.  (This could also be done with a function pointer,
> > of course, but indirect calls are slow these days...)
> 
> testing random btree updates:
> 
> dynamically generated unpack:
> rand_insert: 20.0 MiB with 1 threads in    33 sec,  1609 nsec per iter, 607 KiB per sec
> 
> old C unpack:
> rand_insert: 20.0 MiB with 1 threads in    35 sec,  1672 nsec per iter, 584 KiB per sec
> 
> the Eric Biggers special:
> rand_insert: 20.0 MiB with 1 threads in    35 sec,  1676 nsec per iter, 583 KiB per sec
> 
> Tested two versions of your approach, one without a shift value, one
> where we use a shift value to try to avoid unaligned access - second was
> perhaps 1% faster
> 
> so it's not looking good. This benchmark doesn't even hit on
> unpack_key() quite as much as I thought, so the difference is
> significant.
> 
> diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c

I don't know what this patch applies to, so I can't properly review it.

I suggest checking the assembly and making sure it is what is expected.

In general, for this type of thing it's also helpful to put together a userspace
micro-benchmark program so that it's very fast to evaluate different options.
Building and booting a kernel and doing some I/O benchmarking on a bcachefs
filesystem sounds much more time consuming and less precise.
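
Something on this order is all it takes (a throwaway sketch; unpack_one() below
is just a placeholder for whichever unpack variant is being measured):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Placeholder for the unpack variant under test. */
static uint64_t unpack_one(const uint64_t *in)
{
	return in[0] + in[1] + in[2] + in[3];
}

int main(void)
{
	enum { NR_KEYS = 1 << 20, KEY_U64S = 4 };
	uint64_t *keys = malloc(NR_KEYS * KEY_U64S * sizeof(*keys));
	uint64_t sink = 0;
	struct timespec t0, t1;

	if (!keys)
		return 1;

	for (size_t i = 0; i < NR_KEYS * KEY_U64S; i++)
		keys[i] = ((uint64_t)rand() << 32) | rand();

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (size_t i = 0; i < NR_KEYS; i++)
		sink += unpack_one(keys + i * KEY_U64S);
	clock_gettime(CLOCK_MONOTONIC, &t1);

	double elapsed_ns = (t1.tv_sec - t0.tv_sec) * 1e9 +
			    (t1.tv_nsec - t0.tv_nsec);
	printf("%.1f ns/key (sink %llx)\n", elapsed_ns / NR_KEYS,
	       (unsigned long long)sink);
	free(keys);
	return 0;
}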

> -struct bkey __bch2_bkey_unpack_key(const struct bkey_format_processed *format_p,
> +struct bkey __bch2_bkey_unpack_key(const struct bkey_format_processed *format,
>  				   const struct bkey_packed *in)
>  {
> -	const struct bkey_format *format = &format_p->f;
> -	struct unpack_state state = unpack_state_init(format, in);
>  	struct bkey out;
>  
> -	EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
> -	EBUG_ON(in->u64s < format->key_u64s);
> +	EBUG_ON(format->f.nr_fields != BKEY_NR_FIELDS);
> +	EBUG_ON(in->u64s < format->f.key_u64s);
>  	EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE);
> -	EBUG_ON(in->u64s - format->key_u64s + BKEY_U64s > U8_MAX);
> +	EBUG_ON(in->u64s - format->f.key_u64s + BKEY_U64s > U8_MAX);
>  
> -	out.u64s	= BKEY_U64s + in->u64s - format->key_u64s;
> +	out.u64s	= BKEY_U64s + in->u64s - format->f.key_u64s;
>  	out.format	= KEY_FORMAT_CURRENT;
>  	out.needs_whiteout = in->needs_whiteout;
>  	out.type	= in->type;
>  	out.pad[0]	= 0;
>  
> +	if (likely(format->aligned)) {
> +#define x(id, field)	out.field = get_aligned_field(format, in, id);
> +		bkey_fields()
> +#undef x
> +	} else {
> +		struct unpack_state state = unpack_state_init(&format->f, in);
> +
>  #define x(id, field)	out.field = get_inc_field(&state, id);
> -	bkey_fields()
> +		bkey_fields()
>  #undef x
> +	}

It looks like you didn't change the !aligned case.  How often is the 'aligned'
case taken?

I think it would also help if the generated assembly had the handling of the
fields interleaved.  To achieve that, it might be necessary to interleave the C
code.
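
Roughly along these lines (a sketch only, and a slight variant of my earlier
snippet: one shift per field instead of two, packing assumed little-endian
within the u64 array rather than matching the real bkey bit order, and fields
assumed to fit in 64 bits):

/* Filled in by the format preprocessing step, once per btree node format. */
struct field_info {
	u8	word;	/* index of the u64 the field starts in */
	u8	shift;	/* bit offset within that word, 0..63 */
	u64	mask;	/* (1ULL << field_bits) - 1, or ~0ULL for 64 bits */
};

/*
 * Branchless extraction of one field; the six per-key calls are independent
 * of each other, so an out-of-order core can overlap them.  The caller must
 * guarantee that reading word + 1 stays inside the (padded) key buffer.
 */
static __always_inline u64 get_field(const struct field_info *f, const u64 *in)
{
	u64 lo = in[f->word] >> f->shift;
	/* spillover from the next word; the double shift avoids an
	 * undefined 64-bit shift when shift == 0 */
	u64 hi = (in[f->word + 1] << 1) << (63 - f->shift);

	return (lo | hi) & f->mask;
}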

- Eric
Kent Overstreet May 14, 2023, 11:43 p.m. UTC | #23
On Sun, May 14, 2023 at 06:39:00PM +0000, Christophe Leroy wrote:
> In addition to that, I still don't understand why you bring back 
> vmalloc_exec() instead of using module_alloc().
> 
> As reminded in a previous response, some architectures like powerpc/32s 
> cannot allocate exec memory in vmalloc space. On powerpc this is because 
> exec protection is performed on 256Mbytes segments and vmalloc space is 
> flagged non-exec. Some other architectures have a constraint on distance 
> between kernel core text and other text.
> 
> Today you have for instance kprobes in the kernel that need dynamic exec 
> memory. It uses module_alloc() to get it. On some architectures you also 
> have ftrace that gets some exec memory with module_alloc().
> 
> So, I still don't understand why you cannot use module_alloc() and need 
> vmalloc_exec() instead.

Because I didn't know about it :)

Looks like that is indeed the appropriate interface (if a bit poorly
named), I'll switch to using that, thanks.

It'll still need to be exported, but it looks like the W|X attribute
discussion is not really germane here since it's what other in kernel
users are using, and there's nothing particularly special about how
bcachefs is using it compared to them.
Christophe Leroy May 15, 2023, 4:45 a.m. UTC | #24
On 15/05/2023 at 01:43, Kent Overstreet wrote:
> On Sun, May 14, 2023 at 06:39:00PM +0000, Christophe Leroy wrote:
>> In addition to that, I still don't understand why you bring back
>> vmalloc_exec() instead of using module_alloc().
>>
>> As reminded in a previous response, some architectures like powerpc/32s
>> cannot allocate exec memory in vmalloc space. On powerpc this is because
>> exec protection is performed on 256Mbytes segments and vmalloc space is
>> flagged non-exec. Some other architectures have a constraint on distance
>> between kernel core text and other text.
>>
>> Today you have for instance kprobes in the kernel that need dynamic exec
>> memory. It uses module_alloc() to get it. On some architectures you also
>> have ftrace that gets some exec memory with module_alloc().
>>
>> So, I still don't understand why you cannot use module_alloc() and need
>> vmalloc_exec() instead.
> 
> Because I didn't know about it :)
> 
> Looks like that is indeed the appropriate interface (if a bit poorly
> named), I'll switch to using that, thanks.
> 
> It'll still need to be exported, but it looks like the W|X attribute
> discussion is not really germane here since it's what other in kernel
> users are using, and there's nothing particularly special about how
> bcachefs is using it compared to them.

The W|X concern is applicable here.

If you look into powerpc's module_alloc(), you'll see that when 
CONFIG_STRICT_MODULE_RWX is selected, module_alloc() allocates 
PAGE_KERNEL memory. It is then up to the consumer to change it to RO-X.

See for instance in arch/powerpc/kernel/kprobes.c:

void *alloc_insn_page(void)
{
	void *page;

	page = module_alloc(PAGE_SIZE);
	if (!page)
		return NULL;

	if (strict_module_rwx_enabled())
		set_memory_rox((unsigned long)page, 1);

	return page;
}


Christophe
Kent Overstreet May 15, 2023, 5:02 a.m. UTC | #25
On Mon, May 15, 2023 at 04:45:42AM +0000, Christophe Leroy wrote:
> 
> 
> On 15/05/2023 at 01:43, Kent Overstreet wrote:
> > On Sun, May 14, 2023 at 06:39:00PM +0000, Christophe Leroy wrote:
> >> In addition to that, I still don't understand why you bring back
> >> vmalloc_exec() instead of using module_alloc().
> >>
> >> As reminded in a previous response, some architectures like powerpc/32s
> >> cannot allocate exec memory in vmalloc space. On powerpc this is because
> >> exec protection is performed on 256Mbytes segments and vmalloc space is
> >> flagged non-exec. Some other architectures have a constraint on distance
> >> between kernel core text and other text.
> >>
> >> Today you have for instance kprobes in the kernel that need dynamic exec
> >> memory. It uses module_alloc() to get it. On some architectures you also
> >> have ftrace that gets some exec memory with module_alloc().
> >>
> >> So, I still don't understand why you cannot use module_alloc() and need
> >> vmalloc_exec() instead.
> > 
> > Because I didn't know about it :)
> > 
> > Looks like that is indeed the appropriate interface (if a bit poorly
> > named), I'll switch to using that, thanks.
> > 
> > It'll still need to be exported, but it looks like the W|X attribute
> > discussion is not really germane here since it's what other in kernel
> > users are using, and there's nothing particularly special about how
> > bcachefs is using it compared to them.
> 
> The W|X subject is applicable.
> 
> If you look into powerpc's module_alloc(), you'll see that when 
> CONFIG_STRICT_MODULE_RWX is selected, module_alloc() allocates 
> PAGE_KERNEL memory. It is then up to the consumer to change it to RO-X.
> 
> See for instance in arch/powerpc/kernel/kprobes.c:
> 
> void *alloc_insn_page(void)
> {
> 	void *page;
> 
> 	page = module_alloc(PAGE_SIZE);
> 	if (!page)
> 		return NULL;
> 
> 	if (strict_module_rwx_enabled())
> 		set_memory_rox((unsigned long)page, 1);
> 
> 	return page;
> }

Yeah.

I'm looking at the bpf code now.

<RANT MODE, YOU ARE WARNED>

Can I just say, for the record - god damn this situation is starting to
piss me off? This really nicely encapsulates everything I hate about
kernel development processes and culture and the fscking messes that get
foisted upon people as a result.

All I'm trying to do is write a fucking filesystem here people, I've got
enough on my plate. Dealing with the fallout of a kernel interface going
away without a proper replacement was NOT WHAT I FUCKING HAD IN MIND?

5% performance regression without this. That's just not acceptable, I
can't produce a filesystem that people will in the end want to use by
leaving performance on the table, it's death of a thousand cuts if I
take that attitude. Every 1% needs to be accounted for, a 5% performance
regression is flat out not going to happen.

And the real icing on this motherfucking turd sandwich of a cake, is
that I'm not the first person to have to solve this particular technical
problem.

BPF has the code I need.

But, in true kernel fashion, did they recognize that this was a
subproblem they could write as a library, both making their code more
modular and easier to understand, as well as, oh I don't know, not
leaving a giant steaming turd for the next person to come along?

Nope.

I'd be embarrassed if I were responsible for this.
Kent Overstreet May 15, 2023, 5:38 a.m. UTC | #26
On Sun, May 14, 2023 at 11:43:25AM -0700, Eric Biggers wrote:
> I think it would also help if the generated assembly had the handling of the
> fields interleaved.  To achieve that, it might be necessary to interleave the C
> code.

No, that has negligible effect on performance - as expected, for an out
of order processor. < 1% improvement.

It doesn't look like this approach is going to work here. Sadly.
Eric Biggers May 15, 2023, 6:13 a.m. UTC | #27
On Mon, May 15, 2023 at 01:38:51AM -0400, Kent Overstreet wrote:
> On Sun, May 14, 2023 at 11:43:25AM -0700, Eric Biggers wrote:
> > I think it would also help if the generated assembly had the handling of the
> > fields interleaved.  To achieve that, it might be necessary to interleave the C
> > code.
> 
> No, that has negligible effect on performance - as expected, for an out
> of order processor. < 1% improvement.
> 
> It doesn't look like this approach is going to work here. Sadly.

I'd be glad to take a look at the code you actually tried.  It would be helpful
if you actually provided it, instead of just this "I tried it, I'm giving up
now" sort of thing.

I was also hoping you'd take the time to split this out into a userspace
micro-benchmark program that we could quickly try different approaches on.

BTW, even if people are okay with dynamic code generation (which seems
unlikely?), you'll still need a C version for architectures that you haven't
implemented the dynamic code generation for.

- Eric
Kent Overstreet May 15, 2023, 6:18 a.m. UTC | #28
On Sun, May 14, 2023 at 11:13:46PM -0700, Eric Biggers wrote:
> On Mon, May 15, 2023 at 01:38:51AM -0400, Kent Overstreet wrote:
> > On Sun, May 14, 2023 at 11:43:25AM -0700, Eric Biggers wrote:
> > > I think it would also help if the generated assembly had the handling of the
> > > fields interleaved.  To achieve that, it might be necessary to interleave the C
> > > code.
> > 
> > No, that has negligible effect on performance - as expected, for an out
> > of order processor. < 1% improvement.
> > 
> > It doesn't look like this approach is going to work here. Sadly.
> 
> I'd be glad to take a look at the code you actually tried.  It would be helpful
> if you actually provided it, instead of just this "I tried it, I'm giving up
> now" sort of thing.

https://evilpiepirate.org/git/bcachefs.git/log/?h=bkey_unpack

> I was also hoping you'd take the time to split this out into a userspace
> micro-benchmark program that we could quickly try different approaches on.

I don't need to, because I already have this:
https://evilpiepirate.org/git/ktest.git/tree/tests/bcachefs/perf.ktest

> BTW, even if people are okay with dynamic code generation (which seems
> unlikely?), you'll still need a C version for architectures that you haven't
> implemented the dynamic code generation for.

Excuse me? There already is a C version, and we've been discussing it.
Your approach wasn't any faster than the existing C version.
Eric Biggers May 15, 2023, 7:13 a.m. UTC | #29
On Mon, May 15, 2023 at 02:18:14AM -0400, Kent Overstreet wrote:
> On Sun, May 14, 2023 at 11:13:46PM -0700, Eric Biggers wrote:
> > On Mon, May 15, 2023 at 01:38:51AM -0400, Kent Overstreet wrote:
> > > On Sun, May 14, 2023 at 11:43:25AM -0700, Eric Biggers wrote:
> > > > I think it would also help if the generated assembly had the handling of the
> > > > fields interleaved.  To achieve that, it might be necessary to interleave the C
> > > > code.
> > > 
> > > No, that has negligible effect on performance - as expected, for an out
> > > of order processor. < 1% improvement.
> > > 
> > > It doesn't look like this approach is going to work here. Sadly.
> > 
> > I'd be glad to take a look at the code you actually tried.  It would be helpful
> > if you actually provided it, instead of just this "I tried it, I'm giving up
> > now" sort of thing.
> 
> https://evilpiepirate.org/git/bcachefs.git/log/?h=bkey_unpack
> 
> > I was also hoping you'd take the time to split this out into a userspace
> > micro-benchmark program that we could quickly try different approaches on.
> 
> I don't need to, because I already have this:
> https://evilpiepirate.org/git/ktest.git/tree/tests/bcachefs/perf.ktest

Sure, given that this is an optimization problem with a very small scope
(decoding 6 fields from a bitstream), I was hoping for something easier and
faster to iterate on than setting up a full kernel + bcachefs test environment
and reverse engineering 500 lines of shell script.  But sure, I can look into
that when I have a chance.

> Your approach wasn't any faster than the existing C version.

Well, it's your implementation of what you thought was "my approach".  It
doesn't quite match what I had suggested.  As I mentioned in my last email, it's
also unclear that your new code is ever actually executed, since you made it
conditional on all fields being byte-aligned...

- Eric
Kent Overstreet May 15, 2023, 7:26 a.m. UTC | #30
On Mon, May 15, 2023 at 12:13:43AM -0700, Eric Biggers wrote:
> Sure, given that this is an optimization problem with a very small scope
> (decoding 6 fields from a bitstream), I was hoping for something easier and
> faster to iterate on than setting up a full kernel + bcachefs test environment
> and reverse engineering 500 lines of shell script.  But sure, I can look into
> that when I have a chance.

If you were actually wanting to help, that repository is the tool I use
for kernel development and testing - it's got documentation.

It builds a kernel, boots a VM and runs a test in about 15 seconds, no
need for lifting that code out to userspace.

> > Your approach wasn't any faster than the existing C version.
> 
> Well, it's your implementation of what you thought was "my approach".  It
> doesn't quite match what I had suggested.  As I mentioned in my last email, it's
> also unclear that your new code is ever actually executed, since you made it
> conditional on all fields being byte-aligned...

Eric, I'm not an idiot, that was one of the first things I checked. No
unaligned bkey formats were generated in my tests.

The latest iteration of your approach that I looked at compiled to ~250
bytes of code, vs. ~50 bytes for the dynamically generated unpack
functions. I'm sure it's possible to shave a bit off with some more
work, but looking at the generated code it's clear it's not going to be
competitive.
David Laight May 15, 2023, 10:29 a.m. UTC | #31
From: Kent Overstreet
> Sent: 14 May 2023 06:45
...
> dynamically generated unpack:
> rand_insert: 20.0 MiB with 1 threads in    33 sec,  1609 nsec per iter, 607 KiB per sec
> 
> old C unpack:
> rand_insert: 20.0 MiB with 1 threads in    35 sec,  1672 nsec per iter, 584 KiB per sec
> 
> the Eric Biggers special:
> rand_insert: 20.0 MiB with 1 threads in    35 sec,  1676 nsec per iter, 583 KiB per sec
> 
> Tested two versions of your approach, one without a shift value, one
> where we use a shift value to try to avoid unaligned access - second was
> perhaps 1% faster

You won't notice any effect of avoiding unaligned accesses on x86.
I think they then get split into 64-bit accesses and again on 64-byte
boundaries (that is what I see for uncached accesses to PCIe).
The kernel won't be doing >64-bit accesses, and the 'out of order'
pipeline will tend to cover the others (especially since you
get 2 reads/clock).

> so it's not looking good. This benchmark doesn't even hit on
> unpack_key() quite as much as I thought, so the difference is
> significant.

Beware: unless you manage to lock the CPU frequency (which is ~impossible
on some CPUs), timings in nanoseconds are pretty useless.
You can use the performance counter to get accurate cycle times
(provided there isn't a cpu switch in the middle of a micro-benchmark).
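
(From userspace that's just perf_event_open() - a rough sketch, error
handling elided:)

#include <linux/perf_event.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Open a per-thread CPU-cycles counter, initially disabled. */
static int cycle_counter_open(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_HARDWARE;
	attr.size = sizeof(attr);
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.disabled = 1;
	attr.exclude_kernel = 1;
	attr.exclude_hv = 1;

	return syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
}

/*
 * Usage:
 *	int fd = cycle_counter_open();
 *	uint64_t cycles;
 *
 *	ioctl(fd, PERF_EVENT_IOC_RESET, 0);
 *	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
 *	... code under test ...
 *	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
 *	read(fd, &cycles, sizeof(cycles));
 */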

	David

Kees Cook May 16, 2023, 9:02 p.m. UTC | #32
On Fri, May 12, 2023 at 02:41:50PM -0400, Kent Overstreet wrote:
> On Thu, May 11, 2023 at 03:28:40PM -0700, Kees Cook wrote:
> > On Wed, May 10, 2023 at 03:05:48PM +0000, Johannes Thumshirn wrote:
> > > On 09.05.23 18:56, Kent Overstreet wrote:
> > > > +/**
> > > > + * vmalloc_exec - allocate virtually contiguous, executable memory
> > > > + * @size:	  allocation size
> > > > + *
> > > > + * Kernel-internal function to allocate enough pages to cover @size
> > > > + * the page level allocator and map them into contiguous and
> > > > + * executable kernel virtual space.
> > > > + *
> > > > + * For tight control over page level allocator and protection flags
> > > > + * use __vmalloc() instead.
> > > > + *
> > > > + * Return: pointer to the allocated memory or %NULL on error
> > > > + */
> > > > +void *vmalloc_exec(unsigned long size, gfp_t gfp_mask)
> > > > +{
> > > > +	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
> > > > +			gfp_mask, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
> > > > +			NUMA_NO_NODE, __builtin_return_address(0));
> > > > +}
> > > > +EXPORT_SYMBOL_GPL(vmalloc_exec);
> > > 
> > > Uh W+X memory regions.
> > > The 90s called, they want their shellcode back.
> > 
> > Just to clarify: the kernel must never create W+X memory regions. So,
> > no, do not reintroduce vmalloc_exec().
> > 
> > Dynamic code areas need to be constructed in a non-executable memory,
> > then switched to read-only and verified to still be what was expected,
> > and only then made executable.
> 
> So if we're opening this up to the topic if what an acceptible API would
> look like - how hard is this requirement?
> 
> The reason is that the functions we're constructing are only ~50 bytes,
> so we don't want to be burning a full page per function (particularly
> for the 64kb page architectures...)

For something that small, why not use the text_poke API?
Kent Overstreet May 16, 2023, 9:20 p.m. UTC | #33
On Tue, May 16, 2023 at 02:02:11PM -0700, Kees Cook wrote:
> For something that small, why not use the text_poke API?

This looks like it's meant for patching existing kernel text, which
isn't what I want - I'm generating new functions on the fly, one per
btree node.

I'm working up a new allocator - a (very simple) slab allocator where
you pass a buffer, and it gives you a copy of that buffer mapped
executable, but not writeable.

It looks like we'll be able to convert bpf, kprobes, and ftrace
trampolines to it; it'll consolidate a fair amount of code (particularly
in bpf), and they won't have to burn a full page per allocation anymore.

bpf has a neat trick where it maps the same page in two different
locations, one is the executable location and the other is the writeable
location - I'm stealing that.

external api will be:

void *jit_alloc(void *buf, size_t len, gfp_t gfp);
void jit_free(void *buf);
void jit_update(void *buf, void *new_code, size_t len); /* update an existing allocation */
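
e.g. a caller would end up looking something like this (hypothetical sketch -
the names below are made up for illustration, the allocator itself isn't
written yet):

/* Hypothetical caller of the API above: generate the per-node unpack
 * function into a temporary buffer, then publish it as an executable but
 * not writeable copy.  compile_unpack_fn(), struct my_btree_node and its
 * members are invented names. */
static int btree_node_set_unpack_fn(struct my_btree_node *b)
{
	u8 tmp[128];	/* generated functions are ~50 bytes */
	size_t len;

	len = compile_unpack_fn(tmp, sizeof(tmp), &b->format);

	b->unpack_fn = jit_alloc(tmp, len, GFP_NOFS);
	if (!b->unpack_fn)
		return -ENOMEM;
	return 0;
}

static void btree_node_drop_unpack_fn(struct my_btree_node *b)
{
	jit_free(b->unpack_fn);
	b->unpack_fn = NULL;
}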
Matthew Wilcox (Oracle) May 16, 2023, 9:47 p.m. UTC | #34
On Tue, May 16, 2023 at 05:20:33PM -0400, Kent Overstreet wrote:
> On Tue, May 16, 2023 at 02:02:11PM -0700, Kees Cook wrote:
> > For something that small, why not use the text_poke API?
> 
> This looks like it's meant for patching existing kernel text, which
> isn't what I want - I'm generating new functions on the fly, one per
> btree node.
> 
> I'm working up a new allocator - a (very simple) slab allocator where
> you pass a buffer, and it gives you a copy of that buffer mapped
> executable, but not writeable.
> 
> It looks like we'll be able to convert bpf, kprobes, and ftrace
> trampolines to it; it'll consolidate a fair amount of code (particularly
> in bpf), and they won't have to burn a full page per allocation anymore.
> 
> bpf has a neat trick where it maps the same page in two different
> locations, one is the executable location and the other is the writeable
> location - I'm stealing that.

How does that avoid the problem of being able to construct an arbitrary
gadget that somebody else will then execute?  IOW, what bpf has done
seems like it's working around & undoing the security improvements.

I suppose it's an improvement that only the executable address is
passed back to the caller, and not the writable address.

> external api will be:
> 
> void *jit_alloc(void *buf, size_t len, gfp_t gfp);
> void jit_free(void *buf);
> void jit_update(void *buf, void *new_code, size_t len); /* update an existing allocation */
>
Kent Overstreet May 16, 2023, 9:57 p.m. UTC | #35
On Tue, May 16, 2023 at 10:47:13PM +0100, Matthew Wilcox wrote:
> On Tue, May 16, 2023 at 05:20:33PM -0400, Kent Overstreet wrote:
> > On Tue, May 16, 2023 at 02:02:11PM -0700, Kees Cook wrote:
> > > For something that small, why not use the text_poke API?
> > 
> > This looks like it's meant for patching existing kernel text, which
> > isn't what I want - I'm generating new functions on the fly, one per
> > btree node.
> > 
> > I'm working up a new allocator - a (very simple) slab allocator where
> > you pass a buffer, and it gives you a copy of that buffer mapped
> > executable, but not writeable.
> > 
> > It looks like we'll be able to convert bpf, kprobes, and ftrace
> > trampolines to it; it'll consolidate a fair amount of code (particularly
> > in bpf), and they won't have to burn a full page per allocation anymore.
> > 
> > bpf has a neat trick where it maps the same page in two different
> > locations, one is the executable location and the other is the writeable
> > location - I'm stealing that.
> 
> How does that avoid the problem of being able to construct an arbitrary
> gadget that somebody else will then execute?  IOW, what bpf has done
> seems like it's working around & undoing the security improvements.
> 
> I suppose it's an improvement that only the executable address is
> passed back to the caller, and not the writable address.

That's my thinking; grepping around finds several uses of module_alloc()
that are all doing different variations on the page permissions dance.
Let's just do it once and do it right...
Kent Overstreet May 17, 2023, 5:28 a.m. UTC | #36
On Tue, May 16, 2023 at 10:47:13PM +0100, Matthew Wilcox wrote:
> On Tue, May 16, 2023 at 05:20:33PM -0400, Kent Overstreet wrote:
> > On Tue, May 16, 2023 at 02:02:11PM -0700, Kees Cook wrote:
> > > For something that small, why not use the text_poke API?
> > 
> > This looks like it's meant for patching existing kernel text, which
> > isn't what I want - I'm generating new functions on the fly, one per
> > btree node.
> > 
> > I'm working up a new allocator - a (very simple) slab allocator where
> > you pass a buffer, and it gives you a copy of that buffer mapped
> > executable, but not writeable.
> > 
> > It looks like we'll be able to convert bpf, kprobes, and ftrace
> > trampolines to it; it'll consolidate a fair amount of code (particularly
> > in bpf), and they won't have to burn a full page per allocation anymore.
> > 
> > bpf has a neat trick where it maps the same page in two different
> > locations, one is the executable location and the other is the writeable
> > location - I'm stealing that.
> 
> How does that avoid the problem of being able to construct an arbitrary
> gadget that somebody else will then execute?  IOW, what bpf has done
> seems like it's working around & undoing the security improvements.
> 
> I suppose it's an improvement that only the executable address is
> passed back to the caller, and not the writable address.

Ok, here's what I came up with. Have not tested all corner cases, still
need to write docs - but I think this gives us a nicer interface than
what bpf/kprobes/etc. have been doing, and it does the sub-page sized
allocations I need.

With an additional tweak to module_alloc() (not done in this patch yet)
we avoid ever mapping in pages both writeable and executable:

-->--

From 6eeb6b8ef4271ea1a8d9cac7fbaeeb7704951976 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Wed, 17 May 2023 01:22:06 -0400
Subject: [PATCH] mm: jit/text allocator

This provides a new, very simple slab allocator for jit/text, i.e. bpf,
ftrace trampolines, or bcachefs unpack functions.

With this API we can avoid ever mapping pages both writeable and
executable (not implemented in this patch: need to tweak
module_alloc()), and it also supports sub-page sized allocations.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>

diff --git a/include/linux/jitalloc.h b/include/linux/jitalloc.h
new file mode 100644
index 0000000000..f1549d60e8
--- /dev/null
+++ b/include/linux/jitalloc.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_JITALLOC_H
+#define _LINUX_JITALLOC_H
+
+void jit_update(void *buf, void *new_buf, size_t len);
+void jit_free(void *buf);
+void *jit_alloc(void *buf, size_t len);
+
+#endif /* _LINUX_JITALLOC_H */
diff --git a/mm/Kconfig b/mm/Kconfig
index 4751031f3f..ff26a4f0c9 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1202,6 +1202,9 @@ config LRU_GEN_STATS
 	  This option has a per-memcg and per-node memory overhead.
 # }
 
+config JITALLOC
+	bool
+
 source "mm/damon/Kconfig"
 
 endmenu
diff --git a/mm/Makefile b/mm/Makefile
index c03e1e5859..25e82db9e8 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -138,3 +138,4 @@ obj-$(CONFIG_IO_MAPPING) += io-mapping.o
 obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
 obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
 obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
+obj-$(CONFIG_JITALLOC) += jitalloc.o
diff --git a/mm/jitalloc.c b/mm/jitalloc.c
new file mode 100644
index 0000000000..7c4d621802
--- /dev/null
+++ b/mm/jitalloc.c
@@ -0,0 +1,187 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/gfp.h>
+#include <linux/highmem.h>
+#include <linux/jitalloc.h>
+#include <linux/mm.h>
+#include <linux/moduleloader.h>
+#include <linux/mutex.h>
+#include <linux/set_memory.h>
+#include <linux/vmalloc.h>
+
+#include <asm/text-patching.h>
+
+static DEFINE_MUTEX(jit_alloc_lock);
+
+struct jit_cache {
+	unsigned		obj_size_bits;
+	unsigned		objs_per_slab;
+	struct list_head	partial;
+};
+
+#define JITALLOC_MIN_SIZE	16
+#define NR_JIT_CACHES		ilog2(PAGE_SIZE / JITALLOC_MIN_SIZE)
+
+static struct jit_cache jit_caches[NR_JIT_CACHES];
+
+struct jit_slab {
+	unsigned long		__page_flags;
+
+	struct jit_cache	*cache;
+	void			*executably_mapped;
+	unsigned long		*objs_allocated; /* bitmap of allocated objects */
+	struct list_head	list;
+};
+
+#define folio_jit_slab(folio)		(_Generic((folio),			\
+	const struct folio *:		(const struct jit_slab *)(folio),	\
+	struct folio *:			(struct jit_slab *)(folio)))
+
+#define jit_slab_folio(s)		(_Generic((s),				\
+	const struct jit_slab *:	(const struct folio *)s,		\
+	struct jit_slab *:		(struct folio *)s))
+
+static struct jit_slab *jit_slab_alloc(struct jit_cache *cache)
+{
+	void *executably_mapped = module_alloc(PAGE_SIZE);
+	struct page *page;
+	struct folio *folio;
+	struct jit_slab *slab;
+	unsigned long *objs_allocated;
+
+	if (!executably_mapped)
+		return NULL;
+
+	objs_allocated = kcalloc(BITS_TO_LONGS(cache->objs_per_slab), sizeof(unsigned long), GFP_KERNEL);
+	if (!objs_allocated) {
+		vfree(executably_mapped);
+		return NULL;
+	}
+
+	set_vm_flush_reset_perms(executably_mapped);
+	set_memory_rox((unsigned long) executably_mapped, 1);
+
+	page = vmalloc_to_page(executably_mapped);
+	folio = page_folio(page);
+
+	__folio_set_slab(folio);
+	slab			= folio_jit_slab(folio);
+	slab->cache		= cache;
+	slab->executably_mapped	= executably_mapped;
+	slab->objs_allocated = objs_allocated;
+	INIT_LIST_HEAD(&slab->list);
+
+	return slab;
+}
+
+static void *jit_cache_alloc(void *buf, size_t len, struct jit_cache *cache)
+{
+	struct jit_slab *s =
+		list_first_entry_or_null(&cache->partial, struct jit_slab, list) ?:
+		jit_slab_alloc(cache);
+	unsigned obj_idx, nr_allocated;
+
+	if (!s)
+		return NULL;
+
+	obj_idx = find_first_zero_bit(s->objs_allocated, cache->objs_per_slab);
+
+	BUG_ON(obj_idx >= cache->objs_per_slab);
+	__set_bit(obj_idx, s->objs_allocated);
+
+	nr_allocated = bitmap_weight(s->objs_allocated, s->cache->objs_per_slab);
+
+	if (nr_allocated == s->cache->objs_per_slab) {
+		list_del_init(&s->list);
+	} else if (nr_allocated == 1) {
+		list_del(&s->list);
+		list_add(&s->list, &s->cache->partial);
+	}
+
+	return s->executably_mapped + (obj_idx << cache->obj_size_bits);
+}
+
+void jit_update(void *buf, void *new_buf, size_t len)
+{
+	text_poke_copy(buf, new_buf, len);
+}
+EXPORT_SYMBOL_GPL(jit_update);
+
+void jit_free(void *buf)
+{
+	struct page *page;
+	struct folio *folio;
+	struct jit_slab *s;
+	unsigned obj_idx, nr_allocated;
+	size_t offset;
+
+	if (!buf)
+		return;
+
+	page	= vmalloc_to_page(buf);
+	folio	= page_folio(page);
+	offset	= offset_in_folio(folio, buf);
+
+	if (!folio_test_slab(folio)) {
+		vfree(buf);
+		return;
+	}
+
+	s = folio_jit_slab(folio);
+
+	mutex_lock(&jit_alloc_lock);
+	obj_idx = offset >> s->cache->obj_size_bits;
+
+	__clear_bit(obj_idx, s->objs_allocated);
+
+	nr_allocated = bitmap_weight(s->objs_allocated, s->cache->objs_per_slab);
+
+	if (nr_allocated == 0) {
+		list_del(&s->list);
+		kfree(s->objs_allocated);
+		folio_put(folio);
+	} else if (nr_allocated + 1 == s->cache->objs_per_slab) {
+		list_del(&s->list);
+		list_add(&s->list, &s->cache->partial);
+	}
+
+	mutex_unlock(&jit_alloc_lock);
+}
+EXPORT_SYMBOL_GPL(jit_free);
+
+void *jit_alloc(void *buf, size_t len)
+{
+	unsigned jit_cache_idx = ilog2(roundup_pow_of_two(len) / 16);
+	void *p;
+
+	if (jit_cache_idx < NR_JIT_CACHES) {
+		mutex_lock(&jit_alloc_lock);
+		p = jit_cache_alloc(buf, len, &jit_caches[jit_cache_idx]);
+		mutex_unlock(&jit_alloc_lock);
+	} else {
+		p = module_alloc(len);
+		if (p) {
+			set_vm_flush_reset_perms(p);
+			set_memory_rox((unsigned long) p, DIV_ROUND_UP(len, PAGE_SIZE));
+		}
+	}
+
+	if (p && buf)
+		jit_update(p, buf, len);
+
+	return p;
+}
+EXPORT_SYMBOL_GPL(jit_alloc);
+
+static int __init jit_alloc_init(void)
+{
+	for (unsigned i = 0; i < ARRAY_SIZE(jit_caches); i++) {
+		jit_caches[i].obj_size_bits	= ilog2(JITALLOC_MIN_SIZE) + i;
+		jit_caches[i].objs_per_slab	= PAGE_SIZE >> jit_caches[i].obj_size_bits;
+
+		INIT_LIST_HEAD(&jit_caches[i].partial);
+	}
+
+	return 0;
+}
+core_initcall(jit_alloc_init);
Mike Rapoport May 17, 2023, 2:04 p.m. UTC | #37
On Wed, May 17, 2023 at 01:28:43AM -0400, Kent Overstreet wrote:
> On Tue, May 16, 2023 at 10:47:13PM +0100, Matthew Wilcox wrote:
> > On Tue, May 16, 2023 at 05:20:33PM -0400, Kent Overstreet wrote:
> > > On Tue, May 16, 2023 at 02:02:11PM -0700, Kees Cook wrote:
> > > > For something that small, why not use the text_poke API?
> > > 
> > > This looks like it's meant for patching existing kernel text, which
> > > isn't what I want - I'm generating new functions on the fly, one per
> > > btree node.
> > > 
> > > I'm working up a new allocator - a (very simple) slab allocator where
> > > you pass a buffer, and it gives you a copy of that buffer mapped
> > > executable, but not writeable.
> > > 
> > > It looks like we'll be able to convert bpf, kprobes, and ftrace
> > > trampolines to it; it'll consolidate a fair amount of code (particularly
> > > in bpf), and they won't have to burn a full page per allocation anymore.
> > > 
> > > bpf has a neat trick where it maps the same page in two different
> > > locations, one is the executable location and the other is the writeable
> > > location - I'm stealing that.
> > 
> > How does that avoid the problem of being able to construct an arbitrary
> > gadget that somebody else will then execute?  IOW, what bpf has done
> > seems like it's working around & undoing the security improvements.
> > 
> > I suppose it's an improvement that only the executable address is
> > passed back to the caller, and not the writable address.
> 
> Ok, here's what I came up with. Have not tested all corner cases, still
> need to write docs - but I think this gives us a nicer interface than
> what bpf/kprobes/etc. have been doing, and it does the sub-page sized
> allocations I need.
> 
> With an additional tweak to module_alloc() (not done in this patch yet)
> we avoid ever mapping in pages both writeable and executable:
> 
> -->--
> 
> From 6eeb6b8ef4271ea1a8d9cac7fbaeeb7704951976 Mon Sep 17 00:00:00 2001
> From: Kent Overstreet <kent.overstreet@linux.dev>
> Date: Wed, 17 May 2023 01:22:06 -0400
> Subject: [PATCH] mm: jit/text allocator
> 
> This provides a new, very simple slab allocator for jit/text, i.e. bpf,
> ftrace trampolines, or bcachefs unpack functions.
> 
> With this API we can avoid ever mapping pages both writeable and
> executable (not implemented in this patch: need to tweak
> module_alloc()), and it also supports sub-page sized allocations.

This looks like yet another workaround for the fact that module_alloc() was not
designed to handle permission changes. Rather than create more and more
wrappers for module_alloc(), we need to have a core API for code allocation,
apparently on top of vmalloc, and then use that API for modules, bpf,
tracing and whatnot.

There was quite lengthy discussion about how to handle code allocations
here:

https://lore.kernel.org/linux-mm/20221107223921.3451913-1-song@kernel.org/
 
and Song is already working on improvements for module_alloc(), e.g. see
commit ac3b43283923 ("module: replace module_layout with module_memory")

Another thing, the code below will not even compile on !x86.

> Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
> 
> diff --git a/include/linux/jitalloc.h b/include/linux/jitalloc.h
> new file mode 100644
> index 0000000000..f1549d60e8
> --- /dev/null
> +++ b/include/linux/jitalloc.h
> @@ -0,0 +1,9 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _LINUX_JITALLOC_H
> +#define _LINUX_JITALLOC_H
> +
> +void jit_update(void *buf, void *new_buf, size_t len);
> +void jit_free(void *buf);
> +void *jit_alloc(void *buf, size_t len);
> +
> +#endif /* _LINUX_JITALLOC_H */
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 4751031f3f..ff26a4f0c9 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -1202,6 +1202,9 @@ config LRU_GEN_STATS
>  	  This option has a per-memcg and per-node memory overhead.
>  # }
>  
> +config JITALLOC
> +	bool
> +
>  source "mm/damon/Kconfig"
>  
>  endmenu
> diff --git a/mm/Makefile b/mm/Makefile
> index c03e1e5859..25e82db9e8 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -138,3 +138,4 @@ obj-$(CONFIG_IO_MAPPING) += io-mapping.o
>  obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
>  obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
>  obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
> +obj-$(CONFIG_JITALLOC) += jitalloc.o
> diff --git a/mm/jitalloc.c b/mm/jitalloc.c
> new file mode 100644
> index 0000000000..7c4d621802
> --- /dev/null
> +++ b/mm/jitalloc.c
> @@ -0,0 +1,187 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +#include <linux/gfp.h>
> +#include <linux/highmem.h>
> +#include <linux/jitalloc.h>
> +#include <linux/mm.h>
> +#include <linux/moduleloader.h>
> +#include <linux/mutex.h>
> +#include <linux/set_memory.h>
> +#include <linux/vmalloc.h>
> +
> +#include <asm/text-patching.h>
> +
> +static DEFINE_MUTEX(jit_alloc_lock);
> +
> +struct jit_cache {
> +	unsigned		obj_size_bits;
> +	unsigned		objs_per_slab;
> +	struct list_head	partial;
> +};
> +
> +#define JITALLOC_MIN_SIZE	16
> +#define NR_JIT_CACHES		ilog2(PAGE_SIZE / JITALLOC_MIN_SIZE)
> +
> +static struct jit_cache jit_caches[NR_JIT_CACHES];
> +
> +struct jit_slab {
> +	unsigned long		__page_flags;
> +
> +	struct jit_cache	*cache;
> +	void			*executably_mapped;
> +	unsigned long		*objs_allocated; /* bitmap of allocated objects */
> +	struct list_head	list;
> +};
> +
> +#define folio_jit_slab(folio)		(_Generic((folio),			\
> +	const struct folio *:		(const struct jit_slab *)(folio),	\
> +	struct folio *:			(struct jit_slab *)(folio)))
> +
> +#define jit_slab_folio(s)		(_Generic((s),				\
> +	const struct jit_slab *:	(const struct folio *)s,		\
> +	struct jit_slab *:		(struct folio *)s))
> +
> +static struct jit_slab *jit_slab_alloc(struct jit_cache *cache)
> +{
> +	void *executably_mapped = module_alloc(PAGE_SIZE);
> +	struct page *page;
> +	struct folio *folio;
> +	struct jit_slab *slab;
> +	unsigned long *objs_allocated;
> +
> +	if (!executably_mapped)
> +		return NULL;
> +
> +	objs_allocated = kcalloc(BITS_TO_LONGS(cache->objs_per_slab), sizeof(unsigned long), GFP_KERNEL);
> +	if (!objs_allocated) {
> +		vfree(executably_mapped);
> +		return NULL;
> +	}
> +
> +	set_vm_flush_reset_perms(executably_mapped);
> +	set_memory_rox((unsigned long) executably_mapped, 1);
> +
> +	page = vmalloc_to_page(executably_mapped);
> +	folio = page_folio(page);
> +
> +	__folio_set_slab(folio);
> +	slab			= folio_jit_slab(folio);
> +	slab->cache		= cache;
> +	slab->executably_mapped	= executably_mapped;
> +	slab->objs_allocated = objs_allocated;
> +	INIT_LIST_HEAD(&slab->list);
> +
> +	return slab;
> +}
> +
> +static void *jit_cache_alloc(void *buf, size_t len, struct jit_cache *cache)
> +{
> +	struct jit_slab *s =
> +		list_first_entry_or_null(&cache->partial, struct jit_slab, list) ?:
> +		jit_slab_alloc(cache);
> +	unsigned obj_idx, nr_allocated;
> +
> +	if (!s)
> +		return NULL;
> +
> +	obj_idx = find_first_zero_bit(s->objs_allocated, cache->objs_per_slab);
> +
> +	BUG_ON(obj_idx >= cache->objs_per_slab);
> +	__set_bit(obj_idx, s->objs_allocated);
> +
> +	nr_allocated = bitmap_weight(s->objs_allocated, s->cache->objs_per_slab);
> +
> +	if (nr_allocated == s->cache->objs_per_slab) {
> +		list_del_init(&s->list);
> +	} else if (nr_allocated == 1) {
> +		list_del(&s->list);
> +		list_add(&s->list, &s->cache->partial);
> +	}
> +
> +	return s->executably_mapped + (obj_idx << cache->obj_size_bits);
> +}
> +
> +void jit_update(void *buf, void *new_buf, size_t len)
> +{
> +	text_poke_copy(buf, new_buf, len);
> +}
> +EXPORT_SYMBOL_GPL(jit_update);
> +
> +void jit_free(void *buf)
> +{
> +	struct page *page;
> +	struct folio *folio;
> +	struct jit_slab *s;
> +	unsigned obj_idx, nr_allocated;
> +	size_t offset;
> +
> +	if (!buf)
> +		return;
> +
> +	page	= vmalloc_to_page(buf);
> +	folio	= page_folio(page);
> +	offset	= offset_in_folio(folio, buf);
> +
> +	if (!folio_test_slab(folio)) {
> +		vfree(buf);
> +		return;
> +	}
> +
> +	s = folio_jit_slab(folio);
> +
> +	mutex_lock(&jit_alloc_lock);
> +	obj_idx = offset >> s->cache->obj_size_bits;
> +
> +	__clear_bit(obj_idx, s->objs_allocated);
> +
> +	nr_allocated = bitmap_weight(s->objs_allocated, s->cache->objs_per_slab);
> +
> +	if (nr_allocated == 0) {
> +		list_del(&s->list);
> +		kfree(s->objs_allocated);
> +		folio_put(folio);
> +	} else if (nr_allocated + 1 == s->cache->objs_per_slab) {
> +		list_del(&s->list);
> +		list_add(&s->list, &s->cache->partial);
> +	}
> +
> +	mutex_unlock(&jit_alloc_lock);
> +}
> +EXPORT_SYMBOL_GPL(jit_free);
> +
> +void *jit_alloc(void *buf, size_t len)
> +{
> +	unsigned jit_cache_idx = ilog2(roundup_pow_of_two(len) / 16);
> +	void *p;
> +
> +	if (jit_cache_idx < NR_JIT_CACHES) {
> +		mutex_lock(&jit_alloc_lock);
> +		p = jit_cache_alloc(buf, len, &jit_caches[jit_cache_idx]);
> +		mutex_unlock(&jit_alloc_lock);
> +	} else {
> +		p = module_alloc(len);
> +		if (p) {
> +			set_vm_flush_reset_perms(p);
> +			set_memory_rox((unsigned long) p, DIV_ROUND_UP(len, PAGE_SIZE));
> +		}
> +	}
> +
> +	if (p && buf)
> +		jit_update(p, buf, len);
> +
> +	return p;
> +}
> +EXPORT_SYMBOL_GPL(jit_alloc);
> +
> +static int __init jit_alloc_init(void)
> +{
> +	for (unsigned i = 0; i < ARRAY_SIZE(jit_caches); i++) {
> +		jit_caches[i].obj_size_bits	= ilog2(JITALLOC_MIN_SIZE) + i;
> +		jit_caches[i].objs_per_slab	= PAGE_SIZE >> jit_caches[i].obj_size_bits;
> +
> +		INIT_LIST_HEAD(&jit_caches[i].partial);
> +	}
> +
> +	return 0;
> +}
> +core_initcall(jit_alloc_init);
>
Kent Overstreet May 17, 2023, 2:18 p.m. UTC | #38
On Wed, May 17, 2023 at 05:04:27PM +0300, Mike Rapoport wrote:
> On Wed, May 17, 2023 at 01:28:43AM -0400, Kent Overstreet wrote:
> > On Tue, May 16, 2023 at 10:47:13PM +0100, Matthew Wilcox wrote:
> > > On Tue, May 16, 2023 at 05:20:33PM -0400, Kent Overstreet wrote:
> > > > On Tue, May 16, 2023 at 02:02:11PM -0700, Kees Cook wrote:
> > > > > For something that small, why not use the text_poke API?
> > > > 
> > > > This looks like it's meant for patching existing kernel text, which
> > > > isn't what I want - I'm generating new functions on the fly, one per
> > > > btree node.
> > > > 
> > > > I'm working up a new allocator - a (very simple) slab allocator where
> > > > you pass a buffer, and it gives you a copy of that buffer mapped
> > > > executable, but not writeable.
> > > > 
> > > > It looks like we'll be able to convert bpf, kprobes, and ftrace
> > > > trampolines to it; it'll consolidate a fair amount of code (particularly
> > > > in bpf), and they won't have to burn a full page per allocation anymore.
> > > > 
> > > > bpf has a neat trick where it maps the same page in two different
> > > > locations, one is the executable location and the other is the writeable
> > > > location - I'm stealing that.
> > > 
> > > How does that avoid the problem of being able to construct an arbitrary
> > > gadget that somebody else will then execute?  IOW, what bpf has done
> > > seems like it's working around & undoing the security improvements.
> > > 
> > > I suppose it's an improvement that only the executable address is
> > > passed back to the caller, and not the writable address.
> > 
> > Ok, here's what I came up with. Have not tested all corner cases, still
> > need to write docs - but I think this gives us a nicer interface than
> > what bpf/kprobes/etc. have been doing, and it does the sub-page sized
> > allocations I need.
> > 
> > With an additional tweak to module_alloc() (not done in this patch yet)
> > we avoid ever mapping in pages both writeable and executable:
> > 
> > -->--
> > 
> > From 6eeb6b8ef4271ea1a8d9cac7fbaeeb7704951976 Mon Sep 17 00:00:00 2001
> > From: Kent Overstreet <kent.overstreet@linux.dev>
> > Date: Wed, 17 May 2023 01:22:06 -0400
> > Subject: [PATCH] mm: jit/text allocator
> > 
> > This provides a new, very simple slab allocator for jit/text, i.e. bpf,
> > ftrace trampolines, or bcachefs unpack functions.
> > 
> > With this API we can avoid ever mapping pages both writeable and
> > executable (not implemented in this patch: need to tweak
> > module_alloc()), and it also supports sub-page sized allocations.
> 
> This looks like yet another workaround for the fact that module_alloc() was not
> designed to handle permission changes. Rather than create more and more
> wrappers for module_alloc(), we need to have a core API for code allocation,
> apparently on top of vmalloc, and then use that API for modules, bpf,
> tracing and whatnot.
> 
> There was quite lengthy discussion about how to handle code allocations
> here:
> 
> https://lore.kernel.org/linux-mm/20221107223921.3451913-1-song@kernel.org/

Thanks for the link!

Added Song to the CC.

Song, I'm looking at your code now - switching to hugepages is great,
but I wonder if we might be able to combine our two approaches - with
the slab allocator I did, do we have to bother with VMAs at all? And
then it gets us sub-page sized allocations.

> and Song is already working on improvements for module_alloc(), e.g. see
> commit ac3b43283923 ("module: replace module_layout with module_memory")
> 
> Another thing, the code below will not even compile on !x86.

Due to text_poke(), which I see is abstracted better in that patchset.

I'm very curious why text_poke() does tlb flushing at all; it seems like
flush_icache_range() is actually what's needed?

text_poke() only touching up to two pages, without that being
documented, is also a footgun...

And I'm really curious why text_poke() is needed at all. Seems like we
could just use kmap_local() to create a temporary writeable mapping,
except in my testing that got me a RO mapping. Odd.
Mike Rapoport May 17, 2023, 3:44 p.m. UTC | #39
On Wed, May 17, 2023 at 10:18:11AM -0400, Kent Overstreet wrote:
> On Wed, May 17, 2023 at 05:04:27PM +0300, Mike Rapoport wrote:
> 
> And I'm really curious why text_poke() is needed at all. Seems like we
> could just use kmap_local() to create a temporary writeable mapping,

On 64 bit kmap_local_page() is aliased to page_address() and does not map
anything. text_poke() is needed to actually create a temporary writable
mapping without touching page tables in vmalloc and/or direct map.
Kent Overstreet May 17, 2023, 3:59 p.m. UTC | #40
On Wed, May 17, 2023 at 06:44:12PM +0300, Mike Rapoport wrote:
> On Wed, May 17, 2023 at 10:18:11AM -0400, Kent Overstreet wrote:
> > On Wed, May 17, 2023 at 05:04:27PM +0300, Mike Rapoport wrote:
> > 
> > And I'm really curious why text_poke() is needed at all. Seems like we
> > could just use kmap_local() to create a temporary writeable mapping,
> 
> On 64 bit kmap_local_page() is aliased to page_address() and does not map
> anything. text_poke() is needed to actually create a temporary writable
> mapping without touching page tables in vmalloc and/or direct map.

Duh - thanks!
Eric Biggers May 21, 2023, 9:33 p.m. UTC | #41
On Mon, May 15, 2023 at 03:26:43AM -0400, Kent Overstreet wrote:
> On Mon, May 15, 2023 at 12:13:43AM -0700, Eric Biggers wrote:
> > Sure, given that this is an optimization problem with a very small scope
> > (decoding 6 fields from a bitstream), I was hoping for something easier and
> > faster to iterate on than setting up a full kernel + bcachefs test environment
> > and reverse engineering 500 lines of shell script.  But sure, I can look into
> > that when I have a chance.
> 
> If you were actually wanting to help, that repository is the tool I use
> for kernel development and testing - it's got documentation.
> 
> It builds a kernel, boots a VM and runs a test in about 15 seconds, no
> need for lifting that code out to userspace.
> 

FYI, I had a go with your test framework today, but I ran into too many problems
to really bother with it.  In case you want to improve it, these are the
problems I ran into (so far).  The command I was trying to run, after having run
'./root_image create' as root as the README says to do, was
'build-test-kernel run -I ~/src/ktest/tests/bcachefs/perf.ktest':

- Error due to ~/src/ktest/tests/bcachefs/bcachefs-tools not existing.  Worked
  around by cloning the bcachefs-tools repo to this location.  (Note, it's not a
  git submodule, so updating the git submodules didn't help.)

- Error due to "Root image not found".  Worked around by recursively chown'ing
  /var/lib/ktest from root to my user.  (Note, the README says to run
  'root_image create' as root, which results in root ownership.)

- Error due to "cannot set up guest memory 'pc.ram': Cannot allocate memory".
  Worked around by editing tests/bcachefs/perf.ktest to change config-mem from
  32G to 16G.  (I have 32G memory total on this computer.)

- Error due to "failed to open /dev/vfio/10: No such file or directory".
  Enabling CONFIG_VFIO and CONFIG_VFIO_PCI in my host kernel didn't help.  It
  seems the test is hardcoded to expect PCI passthrough to be set up with a
  specific device.  I'd have expected it to just set up a standard virtual disk.
Kent Overstreet May 21, 2023, 10:04 p.m. UTC | #42
On Sun, May 21, 2023 at 02:33:34PM -0700, Eric Biggers wrote:
> FYI, I had a go with your test framework today, but I ran into too many problems
> to really bother with it.  In case you want to improve it, these are the
> problems I ran into (so far).  The command I was trying to run, after having run
> './root_image create' as root as the README says to do, was
> 'build-test-kernel run -I ~/src/ktest/tests/bcachefs/perf.ktest':

Thanks for giving it a shot...

> - Error due to ~/src/ktest/tests/bcachefs/bcachefs-tools not existing.  Worked
>   around by cloning the bcachefs-tools repo to this location.  (Note, it's not a
>   git submodule, so updating the git submodules didn't help.)

a require-git line was missing, fixed that...

> - Error due to "Root image not found".  Worked around by recursively chown'ing
>   /var/lib/ktest from root to my user.  (Note, the README says to run
>   'root_image create' as root, which results in root ownership.)

Not sure about this one - root ownership is supposed to be fine because
qemu opens the root image read only; we use qemu's block device
in-memory snapshot mode. Was it just not readable by your user?

> - Error due to "cannot set up guest memory 'pc.ram': Cannot allocate memory".
>   Worked around by editing tests/bcachefs/perf.ktest to change config-mem from
>   32G to 16G.  (I have 32G memory total on this computer.)
 
I think 32G is excessive for the tests that actually need to be in this
file, dropping that back to 16G.

> - Error due to "failed to open /dev/vfio/10: No such file or directory".
>   Enabling CONFIG_VFIO and CONFIG_VFIO_PCI in my host kernel didn't help.  It
>   seems the test is hardcoded to expect PCI passthrough to be set up with a
>   specific device.  I'd have expected it to just set up a standard virtual disk.

Some of the tests in that file do need a fast device, but the tests
we're interested in do not - I'll split that up.

I just pushed fixes for everything except the root_image issue if you
want to give it another go.

Cheers,
Kent
Andy Lutomirski June 17, 2023, 4:13 a.m. UTC | #43
On 5/16/23 14:20, Kent Overstreet wrote:
> On Tue, May 16, 2023 at 02:02:11PM -0700, Kees Cook wrote:
>> For something that small, why not use the text_poke API?
> 
> This looks like it's meant for patching existing kernel text, which
> isn't what I want - I'm generating new functions on the fly, one per
> btree node.

Dynamically generating code is a giant can of worms.

Kees touched on a basic security thing: a linear address mapped W+X is a big
no-no.  And that's just scratching the surface -- ideally we would have a
strong protocol for generating code: the code is generated in some
extra-secure context, then it's made immutable and double-checked, then
it becomes live.  (And we would offer this to userspace, some day.)
Just having a different address for the W and X aliases is pretty weak.

(When x86 modifies itself at boot or for static keys, it changes out the
page tables temporarily.)

And even beyond security, we have correctness.  x86 is a fairly 
forgiving architecture.  If you go back in time about 20 years, modify
some code *at the same linear address at which you intend to execute 
it*, and jump to it, it works.  It may even work if you do it through
an alias (the manual is vague).  But it's not 20 years ago, and you have
multiple cores.  This does *not* work with multiple CPUs -- you need to 
serialize on the CPU executing the modified code.  On all but the 
very newest CPUs, you need to kludge up the serialization, and that's
sloooooooooooooow.  Very new CPUs have the SERIALIZE instruction, which
is merely sloooooow.

(The manual is terrible.  It's clear that a way to do this without 
serializing must exist, because that's what happens when code is paged 
in from a user program.)

And remember that x86 is the forgiving architecture.  Other 
architectures have their own rules that may involve all kinds of 
terrifying cache management.  IIRC ARM (32-bit) is really quite nasty in 
this regard.  I've seen some references suggesting that RISC-V has a 
broken design of its cache management and this is a real mess.

x86 low level stuff on Linux gets away with it because the 
implementation is conservative and very slow, but it's very rarely invoked.

eBPF gets away with it in ways that probably no one really likes, but 
also no one expects eBPF to load programs particularly quickly.

You are proposing doing this when a btree node is loaded.  You could 
spend 20 *thousand* cycles, on *each CPU*, the first time you access 
that node, not to mention the extra branch to decide whether you need to 
spend those 20k cycles.  Or you could use IPIs.

Or you could just not do this.  I think you should just remove all this 
dynamic codegen stuff, at least for now.

> 
> I'm working up a new allocator - a (very simple) slab allocator where
> you pass a buffer, and it gives you a copy of that buffer mapped
> executable, but not writeable.
> 
> It looks like we'll be able to convert bpf, kprobes, and ftrace
> trampolines to it; it'll consolidate a fair amount of code (particularly
> in bpf), and they won't have to burn a full page per allocation anymore.
> 
> bpf has a neat trick where it maps the same page in two different
> locations, one is the executable location and the other is the writeable
> location - I'm stealing that.
> 
> external api will be:
> 
> void *jit_alloc(void *buf, size_t len, gfp_t gfp);
> void jit_free(void *buf);
> void jit_update(void *buf, void *new_code, size_t len); /* update an existing allocation */

Based on the above, I regret to inform you that jit_update() will either 
need to sync all cores via IPI or all cores will need to check whether a 
sync is needed and do it themselves.

That IPI could be, I dunno, 500k cycles?  1M cycles?  Depends on what 
cores are asleep at the time.  (I have some old Sandy Bridge machines 
where, if you tick all the boxes wrong, you might spend tens of 
milliseconds doing this due to power savings gone wrong.)  Or are you 
planning to implement a fancy mostly-lockless thing to track which cores 
actually need the IPI so you can avoid waking up sleeping cores?

Sorry to be a party pooper.

--Andy

P.S. I have given some thought to how to make a JIT API that was 
actually (somewhat) performant.  It's nontrivial, and it would involve 
having at least phone calls and possibly actual meetings with people who 
understand the microarchitecture of various CPUs to get all the details 
hammered out and documented properly.

I don't think it would be efficient for teeny little functions like 
bcachefs wants, but maybe?  That would be even more complex and messy.
Kent Overstreet June 17, 2023, 3:34 p.m. UTC | #44
On Fri, Jun 16, 2023 at 09:13:22PM -0700, Andy Lutomirski wrote:
> On 5/16/23 14:20, Kent Overstreet wrote:
> > On Tue, May 16, 2023 at 02:02:11PM -0700, Kees Cook wrote:
> > > For something that small, why not use the text_poke API?
> > 
> > This looks like it's meant for patching existing kernel text, which
> > isn't what I want - I'm generating new functions on the fly, one per
> > btree node.
> 
> Dynamically generating code is a giant can of worms.
> 
> Kees touched on a basic security thing: a linear address mapped W+X is a big
> no-no.  And that's just scratching the surface -- ideally we would have a
> strong protocol for generating code: the code is generated in some
> extra-secure context, then it's made immutable and double-checked, then
> it becomes live.

"Double checking" arbitrary code is is fantasy. You can't "prove the
security" of arbitrary code post compilation.

Rice's theorem states that any nontrivial property of a program is
either a direct consequence of the syntax, or is undecidable. It's why
programs in statically typed languages are easier to reason about, and
it's also why the borrow checker in Rust is a syntactic construct.

You just have to be able to trust the code that generates the code. Just
like you have to be able to trust any other code that lives in kernel
space.

This is far safer and easier to reason about than what BPF is doing
because we're not compiling arbitrary code, the actual codegen part is
200 loc and the input is just a single table.

> 
> (When x86 modifies itself at boot or for static keys, it changes out the
> page tables temporarily.)
> 
> And even beyond security, we have correctness.  x86 is a fairly forgiving
> architecture.  If you go back in time about 20 years, modify
> some code *at the same linear address at which you intend to execute it*,
> and jump to it, it works.  It may even work if you do it through
> an alias (the manual is vague).  But it's not 20 years ago, and you have
> multiple cores.  This does *not* work with multiple CPUs -- you need to
> serialize on the CPU executing the modified code.  On all but the very
> newest CPUs, you need to kludge up the serialization, and that's
> sloooooooooooooow.  Very new CPUs have the SERIALIZE instruction, which
> is merely sloooooow.

If what you were saying was true, it would be an issue any time we
mapped in new executable code for userspace - minor page faults would be
stupidly slow.

This code has been running on thousands of machines for years, and the
only issues that have come up have been due to the recent introduction
of indirect branch tracking. x86 doesn't have such broken caches, and
architectures that do have utterly broken caches (because that's what
you're describing: you're describing caches that _are not coherent
across cores_) are not high on my list of things I care about.

Also, SERIALIZE is a spectre thing. Not relevant here.

> Based on the above, I regret to inform you that jit_update() will either
> need to sync all cores via IPI or all cores will need to check whether a
> sync is needed and do it themselves.

text_poke() doesn't even send IPIs.

I think you've been misled about some things :)
Andy Lutomirski June 17, 2023, 7:19 p.m. UTC | #45
On Sat, Jun 17, 2023, at 8:34 AM, Kent Overstreet wrote:
> On Fri, Jun 16, 2023 at 09:13:22PM -0700, Andy Lutomirski wrote:
>> On 5/16/23 14:20, Kent Overstreet wrote:
>> > On Tue, May 16, 2023 at 02:02:11PM -0700, Kees Cook wrote:
>> > > For something that small, why not use the text_poke API?
>> > 
>> > This looks like it's meant for patching existing kernel text, which
>> > isn't what I want - I'm generating new functions on the fly, one per
>> > btree node.
>> 
>> Dynamically generating code is a giant can of worms.
>> 
>> Kees touched on a basic security thing: a linear address mapped W+X is a big
>> no-no.  And that's just scratching the surface -- ideally we would have a
>> strong protocol for generating code: the code is generated in some
>> extra-secure context, then it's made immutable and double-checked, then
>> it becomes live.
>
> "Double checking" arbitrary code is is fantasy. You can't "prove the
> security" of arbitrary code post compilation.
>
> Rice's theorem states that any nontrivial property of a program is
> either a direct consequence of the syntax, or is undecidable. It's why
> programs in statically typed languages are easier to reason about, and
> it's also why the borrow checker in Rust is a syntactic construct.

If you want security in some theoretical sense, sure, you're probably right.  But that doesn't stop people from double-checking executable code to quite good effect.  For example:

https://www.bitdefender.com/blog/businessinsights/bitdefender-releases-landmark-open-source-software-project-hypervisor-based-memory-introspection/

(I have no personal experience with this, but I know people who do.  It's obviously not perfect, but I think it provides meaningful benefits.)

I'm not saying Linux should do this internally, but it might not be a terrible idea some day.

>
> You just have to be able to trust the code that generates the code. Just
> like you have to be able to trust any other code that lives in kernel
> space.
>
> This is far safer and easier to reason about than what BPF is doing
> because we're not compiling arbitrary code, the actual codegen part is
> 200 loc and the input is just a single table.

Great, then propose a model where the codegen operates in an extra-safe protected context.  Or pre-generate the most common variants, have them pull their constants from memory instead of immediates, and use that.

>
>> 
>> (When x86 modifies itself at boot or for static keys, it changes out the
>> page tables temporarily.)
>> 
>> And even beyond security, we have correctness.  x86 is a fairly forgiving
>> architecture.  If you go back in time about 20 years, modify
>> some code *at the same linear address at which you intend to execute it*,
>> and jump to it, it works.  It may even work if you do it through
>> an alias (the manual is vague).  But it's not 20 years ago, and you have
>> multiple cores.  This does *not* work with multiple CPUs -- you need to
>> serialize on the CPU executing the modified code.  On all but the very
>> newest CPUs, you need to kludge up the serialization, and that's
>> sloooooooooooooow.  Very new CPUs have the SERIALIZE instruction, which
>> is merely sloooooow.
>
> If what you were saying was true, it would be an issue any time we
> mapped in new executable code for userspace - minor page faults would be
> stupidly slow.

I literally mentioned this in the email.

I don't know _precisely_ what's going on, but I assume it's that it's impossible (assuming the kernel gets TLB invalidation right) for a CPU to have anything buffered for a linear address that is unmapped, so when it gets mapped, the CPU can't have anything stale in its buffers.  (By buffers, I mean any sort of instruction or decoded instruction cache.)

Having *this* conversation is what I was talking about in regard to possible fancy future optimization.

>
> This code has been running on thousands of machines for years, and the
> only issues that have come up have been due to the recent introduction
> of indirect branch tracking. x86 doesn't have such broken caches, and
> architectures that do have utterly broken caches (because that's what
> you're describing: you're describing caches that _are not coherent
> across cores_) are not high on my list of things I care about.

I care.  And so do a bunch of people who don't want to get their filesystem corrupted because of a missed serialization.

>
> Also, SERIALIZE is a spectre thing. Not relevant here.

Nope, try again.  SERIALIZE "serializes" in the rather vague sense in the Intel SDM.  I don't think it's terribly useful for Spectre.

(Yes, I know what I'm talking about.)

>
>> Based on the above, I regret to inform you that jit_update() will either
>> need to sync all cores via IPI or all cores will need to check whether a
>> sync is needed and do it themselves.
>
> text_poke() doesn't even send IPIs.

text_poke() and the associated machinery is unbelievably complicated.  

Also, arch/x86/kernel/alternative.c contains:

void text_poke_sync(void)
{
	on_each_cpu(do_sync_core, NULL, 1);
}

The magic in text_poke() was developed over the course of years, and Intel architects were involved.

(And I think some text_poke() stuff uses RCU, which is another way to sync without IPI.  I doubt the performance characteristics are appropriate for bcachefs, but I could be wrong.)

>
> I think you've been misled about some things :)

I wish.


I like bcachefs.  I really don't want to have to put on my maintainer hat here, and I do indeed generally stay in the background.  (And I haven't had nearly as much time for this kind of work in the last couple years as I'd like, sigh.) But I personally have a fairly strict opinion that, if someone (including myself!) wants to merge something that plays clever games that may cause x86 architecture code (especially mm code) to do things it shouldn't in corner cases, even if no one has directly observed that corner case or even knows how to get it to misbehave, then they had better have a very convincing argument that it's safe.  No one likes debugging bugs when something that should be coherent becomes incoherent.

So, if you really really want self-modifying code in bcachefs, its correctness needs to be very, very well argued, and it needs to be maintainable.  Otherwise I will NAK it.  Sorry.
Kent Overstreet June 17, 2023, 8:08 p.m. UTC | #46
On Sat, Jun 17, 2023 at 12:19:41PM -0700, Andy Lutomirski wrote:
> On Sat, Jun 17, 2023, at 8:34 AM, Kent Overstreet wrote:
> > On Fri, Jun 16, 2023 at 09:13:22PM -0700, Andy Lutomirski wrote:
> >> On 5/16/23 14:20, Kent Overstreet wrote:
> >> > On Tue, May 16, 2023 at 02:02:11PM -0700, Kees Cook wrote:
> >> > > For something that small, why not use the text_poke API?
> >> > 
> >> > This looks like it's meant for patching existing kernel text, which
> >> > isn't what I want - I'm generating new functions on the fly, one per
> >> > btree node.
> >> 
> >> Dynamically generating code is a giant can of worms.
> >> 
> >> Kees touched on a basic security thing: a linear address mapped W+X is a big
> >> no-no.  And that's just scratching the surface -- ideally we would have a
> >> strong protocol for generating code: the code is generated in some
> >> extra-secure context, then it's made immutable and double-checked, then
> >> it becomes live.
> >
> > "Double checking" arbitrary code is is fantasy. You can't "prove the
> > security" of arbitrary code post compilation.
> >
> > Rice's theorem states that any nontrivial property of a program is
> > either a direct consequence of the syntax, or is undecidable. It's why
> > programs in statically typed languages are easier to reason about, and
> > it's also why the borrow checker in Rust is a syntactic construct.
> 
> If you want security in some theoretical sense, sure, you're probably right.  But that doesn't stop people from double-checking executable code to quite good effect.  For example:
> 
> https://www.bitdefender.com/blog/businessinsights/bitdefender-releases-landmark-open-source-software-project-hypervisor-based-memory-introspection/
> 
> (I have no personal experience with this, but I know people who do.  It's obviously not perfect, but I think it provides meaningful benefits.)
> 
> I'm not saying Linux should do this internally, but it might not be a terrible idea some day.

So you want to pull a virus scanner into the kernel.

> > You just have to be able to trust the code that generates the code. Just
> > like you have to be able to trust any other code that lives in kernel
> > space.
> >
> > This is far safer and easier to reason about than what BPF is doing
> > because we're not compiling arbitrary code, the actual codegen part is
> > 200 loc and the input is just a single table.
> 
> Great, then propose a model where the codegen operates in an
> extra-safe protected context.  Or pre-generate the most common
> variants, have them pull their constants from memory instead of
> immediates, and use that.

I'll do no such nonsense.

> > If what you were saying was true, it would be an issue any time we
> > mapped in new executable code for userspace - minor page faults would be
> > stupidly slow.
> 
> I literally mentioned this in the email.

No, you didn't. Feel free to link or cite if you think otherwise.

> 
> I don't know _precisely_ what's going on, but I assume it's that it's impossible (assuming the kernel gets TLB invalidation right) for a CPU to have anything buffered for a linear address that is unmapped, so when it gets mapped, the CPU can't have anything stale in its buffers.  (By buffers, I mean any sort of instruction or decoded instruction cache.)
> 
> Having *this* conversation is what I was talking about in regard to possible fancy future optimization.
> 
> >
> > This code has been running on thousands of machines for years, and the
> > only issues that have come up have been due to the recent introduction
> > of indirect branch tracking. x86 doesn't have such broken caches, and
> > architectures that do have utterly broken caches (because that's what
> > you're describing: you're describing caches that _are not coherent
> > across cores_) are not high on my list of things I care about.
> 
> I care.  And a bunch of people who haven't gotten their filesystem corrupted because of a missed serialization.
> 
> >
> > Also, SERIALIZE is a spectre thing. Not relevant here.
> 
> Nope, try again.  SERIALIZE "serializes" in the rather vague sense in the Intel SDM.  I don't think it's terribly useful for Spectre.
> 
> (Yes, I know what I'm talking about.)
> 
> >
> >> Based on the above, I regret to inform you that jit_update() will either
> >> need to sync all cores via IPI or all cores will need to check whether a
> >> sync is needed and do it themselves.
> >
> > text_poke() doesn't even send IPIs.
> 
> text_poke() and the associated machinery is unbelievably complicated.  

It's not that bad.

The only reference to IPIs in text_poke() is the comment that indicates
that flush_tlb_mm_range() may sometimes do IPIs, but explicitly
indicates that it does _not_ do IPIs the way text_poke() is using it.

> Also, arch/x86/kernel/alternative.c contains:
> 
> void text_poke_sync(void)
> {
> 	on_each_cpu(do_sync_core, NULL, 1);
> }

...which is for modifying code that is currently being executed, not the
text_poke() or text_poke_copy() paths.

> 
> The magic in text_poke() was developed over the course of years, and
> Intel architects were involved.
> 
> (And I think some text_poke() stuff uses RCU, which is another way to
> sync without IPI.  I doubt the performance characteristics are
> appropriate for bcachefs, but I could be wrong.)

No, it doesn't use RCU.

> > I think you've been misled about some things :)
> 
> I wish.

Given your comments on text_poke(), I think you are. You're confusing
synchronization requirements for _self modifying_ code with the
synchronization requirements for writing new code to memory, and then
executing it.

And given that bcachefs is not doing anything new here - we're doing a
more limited form of what BPF is already doing - I don't think this is
even the appropriate place for this discussion. There is a new
executable memory allocator being developed and posted, which is
expected to wrap text_poke() in an arch-independent way so that
allocations can share pages, and so that we can remove the need to have
pages mapped both writeable and executable.

If you've got knowledge you wish to share on how to get cache coherency
right, I think that might be a more appropriate thread.
Andy Lutomirski June 17, 2023, 8:35 p.m. UTC | #47
On Sat, Jun 17, 2023, at 1:08 PM, Kent Overstreet wrote:
> On Sat, Jun 17, 2023 at 12:19:41PM -0700, Andy Lutomirski wrote:
>> On Sat, Jun 17, 2023, at 8:34 AM, Kent Overstreet wrote:

>> Great, then propose a model where the codegen operates in an
>> extra-safe protected context.  Or pre-generate the most common
>> variants, have them pull their constants from memory instead of
>> immediates, and use that.
>
> I'll do no such nonsense.

You can avoid generating code beyond what gcc generates at all, or you can pre-generate code but not on an ongoing basis at runtime, or you can generate code at runtime correctly.  I don't think there are many other options.

>
>> > If what you were saying was true, it would be an issue any time we
>> > mapped in new executable code for userspace - minor page faults would be
>> > stupidly slow.
>> 
>> I literally mentioned this in the email.
>
> No, you didn't. Feel free to link or cite if you think otherwise.

"It's clear that a way to do this without 
serializing must exist, because that's what happens when code is paged 
in from a user program."

>> > text_poke() doesn't even send IPIs.
>> 
>> text_poke() and the associated machinery is unbelievably complicated.  
>
> It's not that bad.

This is a useless discussion.

>
> The only reference to IPIs in text_poke() is the comment that indicates
> that flush_tlb_mm_range() may sometimes do IPIs, but explicitly
> indicates that it does _not_ do IPIs the way text_poke() is using it.
>
>> Also, arch/x86/kernel/alternative.c contains:
>> 
>> void text_poke_sync(void)
>> {
>> 	on_each_cpu(do_sync_core, NULL, 1);
>> }
>
> ...which is for modifying code that is currently being executed, not the
> text_poke() or text_poke_copy() paths.
>
>> 
>> The magic in text_poke() was developed over the course of years, and
>> Intel architects were involved.
>> 
>> (And I think some text_poke() stuff uses RCU, which is another way to
>> sync without IPI.  I doubt the performance characteristics are
>> appropriate for bcachefs, but I could be wrong.)
>
> No, it doesn't use RCU.

It literally says in alternative.c:

 * Not safe against concurrent execution; useful for JITs to dump
 * new code blocks into unused regions of RX memory. Can be used in
 * conjunction with synchronize_rcu_tasks() to wait for existing
 * execution to quiesce after having made sure no existing functions
 * pointers are live.

I don't know whether any callers actually do this.  I didn't look.
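
(For reference, the pattern that comment describes would look roughly like
this; the names are placeholders and there's no error handling, so treat it
as a sketch rather than an existing caller:)

static void __rcu *live_code;	/* callers rcu_dereference() this and jump to it */

static void swap_in_new_code(void *region, const void *new_code, size_t len)
{
	rcu_assign_pointer(live_code, NULL);	/* no new callers find the old code */
	synchronize_rcu_tasks();		/* wait for existing execution to quiesce */
	text_poke_copy(region, new_code, len);	/* safe: nothing can be running it now */
	rcu_assign_pointer(live_code, region);	/* publish the new code */
}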

>
>> > I think you've been misled about some things :)
>> 
>> I wish.
>
> Given your comments on text_poke(), I think you are. You're confusing
> synchronization requirements for _self modifying_ code with the
> synchronization requirements for writing new code to memory, and then
> executing it.

No, you are misunderstanding the difference.

Version A:

User mmap()s an executable file (DSO, whatever).  At first, there is either no PTE or a not-present PTE.  At some point, in response to a page fault or just the kernel prefetching, the kernel fills in the backing page and then creates the PTE.  From the CPU's perspective, the virtual address atomically transitions from having nothing there to having the final code there.  It works (despite the manual having nothing to say about this case).  It's also completely unavoidable.

Version B:

Kernel vmallocs some space *and populates the pagetables*.  There is backing storage, that is executable (or it's a non-NX system, although those are quite rare these days).

Because the CPU hates you, it speculatively executes that code.  (Maybe you're under attack.  Maybe you're just unlucky.  Doesn't matter.)  It populates the instruction cache, remembers the decoded instructions, etc.  It does all the things that make the manual say scary things about serialization.  It notices that the speculative execution was wrong and backs it out, but nothing is invalidated.

Now you write code into there.  Either you do this from a different CPU or you do it at a different linear address, so the magic hardware that invalidates for you does not trigger.

Now you jump into that code, and you tell yourself that it was new code because it was all zeros before and you never intentionally executed it.  But the CPU could not care less what you think, and you lose.

>
> And given that bcachefs is not doing anything new here - we're doing a
> more limited form of what BPF is already doing - I don't think this is
> even the appropriate place for this discussion. There is a new
> executable memory allocator being developed and posted, which is
> expected to wrap text_poke() in an arch-independent way so that
> allocations can share pages, and so that we can remove the need to have
> pages mapped both writeable and executable.

I don't really care what BPF is doing, and BPF may well have the same problem.

But if I understood what bcachefs is doing, it's creating code vastly more frequently than BPF, in response to entirely unprivileged operations from usermode.  It's a whole different amount of exposure.

>
> If you've got knowledge you wish to share on how to get cache coherency
> right, I think that might be a more appropriate thread.

I'll look.
Mark Rutland June 19, 2023, 9:19 a.m. UTC | #48
On Tue, May 09, 2023 at 12:56:32PM -0400, Kent Overstreet wrote:
> From: Kent Overstreet <kent.overstreet@gmail.com>
> 
> This is needed for bcachefs, which dynamically generates per-btree node
> unpack functions.

Much like Kees and Andy, I have concerns with adding new code generators to the
kernel. Even ignoring the actual code generation, there are a bunch of subtle
ordering/maintenance/synchronization concerns across architectures, and we
already have a fair amount of pain with the existing cases.

Can you share more detail on how you want to use this?

From a quick scan of your gitweb for the bcachefs-for-upstream branch I
couldn't spot the relevant patches.

Thanks,
Mark.

> 
> Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Uladzislau Rezki <urezki@gmail.com>
> Cc: Christoph Hellwig <hch@infradead.org>
> Cc: linux-mm@kvack.org
> ---
>  include/linux/vmalloc.h |  1 +
>  kernel/module/main.c    |  4 +---
>  mm/nommu.c              | 18 ++++++++++++++++++
>  mm/vmalloc.c            | 21 +++++++++++++++++++++
>  4 files changed, 41 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
> index 69250efa03..ff147fe115 100644
> --- a/include/linux/vmalloc.h
> +++ b/include/linux/vmalloc.h
> @@ -145,6 +145,7 @@ extern void *vzalloc(unsigned long size) __alloc_size(1);
>  extern void *vmalloc_user(unsigned long size) __alloc_size(1);
>  extern void *vmalloc_node(unsigned long size, int node) __alloc_size(1);
>  extern void *vzalloc_node(unsigned long size, int node) __alloc_size(1);
> +extern void *vmalloc_exec(unsigned long size, gfp_t gfp_mask) __alloc_size(1);
>  extern void *vmalloc_32(unsigned long size) __alloc_size(1);
>  extern void *vmalloc_32_user(unsigned long size) __alloc_size(1);
>  extern void *__vmalloc(unsigned long size, gfp_t gfp_mask) __alloc_size(1);
> diff --git a/kernel/module/main.c b/kernel/module/main.c
> index d3be89de70..9eaa89e84c 100644
> --- a/kernel/module/main.c
> +++ b/kernel/module/main.c
> @@ -1607,9 +1607,7 @@ static void dynamic_debug_remove(struct module *mod, struct _ddebug_info *dyndbg
>  
>  void * __weak module_alloc(unsigned long size)
>  {
> -	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
> -			GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
> -			NUMA_NO_NODE, __builtin_return_address(0));
> +	return vmalloc_exec(size, GFP_KERNEL);
>  }
>  
>  bool __weak module_init_section(const char *name)
> diff --git a/mm/nommu.c b/mm/nommu.c
> index 57ba243c6a..8d9ab19e39 100644
> --- a/mm/nommu.c
> +++ b/mm/nommu.c
> @@ -280,6 +280,24 @@ void *vzalloc_node(unsigned long size, int node)
>  }
>  EXPORT_SYMBOL(vzalloc_node);
>  
> +/**
> + *	vmalloc_exec  -  allocate virtually contiguous, executable memory
> + *	@size:		allocation size
> + *
> + *	Kernel-internal function to allocate enough pages to cover @size
> + *	the page level allocator and map them into contiguous and
> + *	executable kernel virtual space.
> + *
> + *	For tight control over page level allocator and protection flags
> + *	use __vmalloc() instead.
> + */
> +
> +void *vmalloc_exec(unsigned long size, gfp_t gfp_mask)
> +{
> +	return __vmalloc(size, gfp_mask);
> +}
> +EXPORT_SYMBOL_GPL(vmalloc_exec);
> +
>  /**
>   * vmalloc_32  -  allocate virtually contiguous memory (32bit addressable)
>   *	@size:		allocation size
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 31ff782d36..2ebb9ea7f0 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -3401,6 +3401,27 @@ void *vzalloc_node(unsigned long size, int node)
>  }
>  EXPORT_SYMBOL(vzalloc_node);
>  
> +/**
> + * vmalloc_exec - allocate virtually contiguous, executable memory
> + * @size:	  allocation size
> + *
> + * Kernel-internal function to allocate enough pages to cover @size
> + * the page level allocator and map them into contiguous and
> + * executable kernel virtual space.
> + *
> + * For tight control over page level allocator and protection flags
> + * use __vmalloc() instead.
> + *
> + * Return: pointer to the allocated memory or %NULL on error
> + */
> +void *vmalloc_exec(unsigned long size, gfp_t gfp_mask)
> +{
> +	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
> +			gfp_mask, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
> +			NUMA_NO_NODE, __builtin_return_address(0));
> +}
> +EXPORT_SYMBOL_GPL(vmalloc_exec);
> +
>  #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
>  #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
>  #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
> -- 
> 2.40.1
> 
>
Kent Overstreet June 19, 2023, 10:47 a.m. UTC | #49
On Mon, Jun 19, 2023 at 10:19:00AM +0100, Mark Rutland wrote:
> On Tue, May 09, 2023 at 12:56:32PM -0400, Kent Overstreet wrote:
> > From: Kent Overstreet <kent.overstreet@gmail.com>
> > 
> > This is needed for bcachefs, which dynamically generates per-btree node
> > unpack functions.
> 
> Much like Kees and Andy, I have concerns with adding new code generators to the
> kernel. Even ignoring the actual code generation, there are a bunch of subtle
> ordering/maintenance/synchronization concerns across architectures, and we
> already have a fair amount of pain with the existing cases.

Look, jits are just not that unusual. I'm not going to be responding to
vague concerns that don't have any actual engineering rationale.

> Can you share more detail on how you want to use this?
> 
> From a quick scan of your gitweb for the bcachefs-for-upstream branch I
> couldn't spot the relevant patches.

I've already written extensively in this thread.
Mark Rutland June 19, 2023, 12:47 p.m. UTC | #50
On Mon, Jun 19, 2023 at 06:47:17AM -0400, Kent Overstreet wrote:
> On Mon, Jun 19, 2023 at 10:19:00AM +0100, Mark Rutland wrote:
> > On Tue, May 09, 2023 at 12:56:32PM -0400, Kent Overstreet wrote:
> > > From: Kent Overstreet <kent.overstreet@gmail.com>
> > > 
> > > This is needed for bcachefs, which dynamically generates per-btree node
> > > unpack functions.
> > 
> > Much like Kees and Andy, I have concerns with adding new code generators to the
> > kernel. Even ignoring the actual code generation, there are a bunch of subtle
> > ordering/maintenance/synchronization concerns across architectures, and we
> > already have a fair amount of pain with the existing cases.
> 
> Look, jits are just not that unusual. I'm not going to be responding to
> vague concerns that don't have any actual engineering rationale.

Sorry, but I do have an engineering rationale here: I want to make sure that
this actually works, on architectures that I care about, and will be
maintainable long-term.

We've had a bunch of problems with other JITs ranging from JIT-local "we got
the encoding wrong" to major kernel infrastructure changes like tasks RCU rude
synchronization. I'm trying to figure out whether any of those are likely to
apply and/or whether we should be refactoring other infrastructure for use here
(e.g. factoring the actual instruction generation from arch code, or
perhaps reusing eBPF so this can be arch-neutral).

I appreciate that's not clear from my initial mail, but please don't jump
straight to assuming I'm adversarial here.

> > Can you share more detail on how you want to use this?
> > 
> > From a quick scan of your gitweb for the bcachefs-for-upstream branch I
> > couldn't spot the relevant patches.
> 
> I've already written extensively in this thread.

Sorry, I hadn't seen that.

For the benefit of others, the codegen is at:

  https://lore.kernel.org/lkml/ZFq7JhrhyrMTNfd%2F@moria.home.lan/
  https://evilpiepirate.org/git/bcachefs.git/tree/fs/bcachefs/bkey.c#n727

... and the rationale is at:

  https://lore.kernel.org/lkml/ZF6HHRDeUWLNtuL7@moria.home.lan/

One thing I note immediately is that HAVE_BCACHEFS_COMPILED_UNPACK seems to be
x86-only. If this is important, that'll need some rework to either be
arch-neutral or allow for arch-specific implementations.

Thanks,
Mark.
Kent Overstreet June 19, 2023, 7:17 p.m. UTC | #51
On Mon, Jun 19, 2023 at 01:47:18PM +0100, Mark Rutland wrote:
> Sorry, but I do have an engineering rationale here: I want to make sure that
> this actually works, on architectures that I care about, and will be
> maintainable long-term.
> 
> We've had a bunch of problems with other JITs ranging from JIT-local "we got
> the encoding wrong" to major kernel infrastructure changes like tasks RCU rude
> synchronization. I'm trying to figure out whether any of those are likely to
> apply and/or whether we should be refactoring other infrastructure for use here
> (e.g. factoring the actual instruction generation from arch code, or
> perhaps reusing eBPF so this can be arch-neutral).
> 
> I appreciate that's not clear from my initial mail, but please don't jump
> straight to assuming I'm adversarial here.

I know you're not trying to be adversarial, but vague negative feedback
_is_ hostile, because productive technical discussions can't happen
without specifics and you're putting all the onus on the other person to
make that happen.

When you're raising an issue, try to be specific - don't make people dig.
If you're unable to be specific, perhaps you're not the right person to
be raising the issue.

I'm of course happy to answer questions that haven't already been asked.

This code is pretty simple as JITs go. With the existing vmalloc_exec()
based code, there aren't any fancy secondary mappings going on, so no
crazy cache coherency games, and no crazy synchronization issues to worry
about: the jit functions are protected by the per-btree-node locks.

vmalloc_exec() isn't being upstreamed however, since people don't want
WX mappings.

The infrastructure changes we need (and not just for bcachefs) are
 - better executable memory allocation API, with support for sub-page
   allocations: this is already being worked on, the prototype slab
   allocator I posted is probably going to be the basis for part of this

 - an arch independent version of text_poke(): we don't want user code
   to be flipping page permissions to update text, text_poke() is the
   proper API but it's x86 only. No one has volunteered for this yet.

Re-using eBPF for bcachefs's unpack functions is not outside the realm
of possibility, but BPF is a heavy, complex dependency - it's not
something I'll be looking at unless the BPF people are volunteering to
refactor their stuff to provide a suitable API.

> One thing I note immediately is that HAVE_BCACHEFS_COMPILED_UNPACK seems to be
> x86-only. If this is important, that'll need some rework to either be
> arch-neutral or allow for arch-specific implementations.

Correct. Arm will happen at some point, but it's not an immediate
priority.
Kees Cook June 19, 2023, 7:45 p.m. UTC | #52
On Sat, Jun 17, 2023 at 11:34:31AM -0400, Kent Overstreet wrote:
> On Fri, Jun 16, 2023 at 09:13:22PM -0700, Andy Lutomirski wrote:
> > On 5/16/23 14:20, Kent Overstreet wrote:
> > > On Tue, May 16, 2023 at 02:02:11PM -0700, Kees Cook wrote:
> > > > For something that small, why not use the text_poke API?
> > > 
> > > This looks like it's meant for patching existing kernel text, which
> > > isn't what I want - I'm generating new functions on the fly, one per
> > > btree node.
> > 
> > Dynamically generating code is a giant can of worms.
> > 
> > Kees touched on a basic security thing: a linear address mapped W+X is a big
> > no-no.  And that's just scratching the surface -- ideally we would have a
> > strong protocol for generating code: the code is generated in some
> > extra-secure context, then it's made immutable and double-checked, then
> > it becomes live.
> 
> "Double checking" arbitrary code is is fantasy. You can't "prove the
> security" of arbitrary code post compilation.

I think there's a misunderstanding here about the threat model I'm
interested in protecting against for JITs. While making sure the VM of a
JIT is safe matters in itself, that's separate from what I'm concerned about.

The threat model is about flaws _elsewhere_ in the kernel that can
leverage the JIT machinery to convert a "write anything anywhere anytime"
exploit primitive into an "execute anything" primitive. Arguments can
be made to say "a write anything flaw means the total collapse of the
security model so there's no point defending against it", but both that
type of flaw and the slippery slope argument don't stand up well to
real-world situations.

The kinds of flaws we've seen are frequently limited in scope (write
1 byte, write only NULs, write only in a specific range, etc), but
when chained together, the weakest link is what ultimately compromises
the kernel. As such, "W^X" is a basic building block of the kernel's
self-defense methods, because it is such a potent target for
write->execute attack upgrades.

Since a JIT constructs something that will become executable, it needs
to defend itself against stray writes from other threads. Since Linux
doesn't (really) use per-CPU page tables, the workspace for a JIT can be
targeted by something that isn't the JIT. To deal with this, JITs need
to use 3 phases: a writing pass (into W memory), then switch it to RO
and perform a verification pass (construct it again, but compare results
to the RO version), and finally switch it executable. Or, it can use
writes to memory that only the local CPU can perform (i.e. text_poke(),
which uses a different set of page tables with different permissions).
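
(A minimal sketch of that three-phase flow, assuming the set_memory_*()
helpers can be used this way; gen_code() is a placeholder, and a real
version would also need something like VM_FLUSH_RESET_PERMS so permissions
get reset before the area is ever freed or reused:)

static void *jit_install(size_t len, void (*gen_code)(void *dst, size_t len))
{
	unsigned int npages = DIV_ROUND_UP(len, PAGE_SIZE);
	void *buf   = vmalloc(npages * PAGE_SIZE);	/* phase 1: plain writable memory */
	void *check = vmalloc(npages * PAGE_SIZE);

	if (!buf || !check)
		goto err;

	gen_code(buf, len);

	set_memory_ro((unsigned long) buf, npages);	/* phase 2: no longer writable */
	gen_code(check, len);				/* regenerate elsewhere... */
	if (memcmp(buf, check, len))			/* ...and compare against the RO copy */
		goto err_rw;

	set_memory_x((unsigned long) buf, npages);	/* phase 3: now executable */
	vfree(check);
	return buf;

err_rw:
	set_memory_rw((unsigned long) buf, npages);
err:
	vfree(buf);
	vfree(check);
	return NULL;
}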

Without basic W^X, it becomes extremely difficult to build further
defenses (e.g. protecting page tables themselves, etc) since WX will
remain the easiest target.

-Kees
Kent Overstreet June 20, 2023, 12:39 a.m. UTC | #53
On Mon, Jun 19, 2023 at 12:45:43PM -0700, Kees Cook wrote:
> I think there's a misunderstanding here about the threat model I'm
> interested in protecting against for JITs. While making sure the VM of a
> JIT is safe in itself, that's separate from what I'm concerned about.
> 
> The threat model is about flaws _elsewhere_ in the kernel that can
> leverage the JIT machinery to convert a "write anything anywhere anytime"
> exploit primitive into an "execute anything" primitive. Arguments can
> be made to say "a write anything flaw means the total collapse of the
> security model so there's no point defending against it", but both that
> type of flaw and the slippery slope argument don't stand up well to
> real-world situations.

Hey Kees, thanks for the explanation - I don't think this is a concern
for what bcachefs is doing, since we're not doing a full jit. The unpack
functions we generate only write to the 40 bytes pointed to by rsi; not
terribly useful as an execute anything primitive :)
Andy Lutomirski June 20, 2023, 5:42 p.m. UTC | #54
On Mon, Jun 19, 2023, at 12:17 PM, Kent Overstreet wrote:
> On Mon, Jun 19, 2023 at 01:47:18PM +0100, Mark Rutland wrote:
>> Sorry, but I do have an engineering rationale here: I want to make sure that
>> this actually works, on architectures that I care about, and will be
>> maintanable long-term.
>> 
>> We've had a bunch of problems with other JITs ranging from JIT-local "we got
>> the encoding wrong" to major kernel infrastructure changes like tasks RCU rude
>> synchronization. I'm trying to figure out whether any of those are likely to
>> apply and/or whether we should be refactoring other infrastructure for use here
>> (e.g. the factoring the acutal instruction generation from arch code, or
>> perhaps reusing eBPF so this can be arch-neutral).
>> 
>> I appreciate that's not clear from my initial mail, but please don't jump
>> straight to assuming I'm adversarial here.
>
> I know you're not trying to be adversarial, but vague negative feedback
> _is_ hostile, because productive technical discussions can't happen
> without specifics and you're putting all the onus on the other person to
> make that happen.

I'm sorry, but this isn't how correct code gets written, and this isn't how at least x86 maintenance operates.

Code is either correct, and comes with an explanation as to how it is correct, or it doesn't go in.  Saying that something is like BPF is not an explanation as to how it's correct.  Saying that someone has not come up with the chain of events that turns a mere violation of architecture rules into actual incorrect execution is not an explanation as to how something is correct.

So, without intending any particular hostility:

<puts on maintainer hat>

bcachefs's x86 JIT is:
Nacked-by: Andy Lutomirski <luto@kernel.org> # for x86

<takes off maintainer hat>

This makes me sad, because I like bcachefs.  But you can get it merged without worrying about my NAK by removing the x86 part.

>
> When you're raising an issue, try be specific - don't make people dig.
> If you're unable to be specific, perhaps you're not the right person to
> be raising the issue.
>
> I'm of course happy to answer questions that haven't already been asked.
>
> This code is pretty simple as JITs go. With the existing, vmalloc_exec()
> based code, there aren't any fancy secondary mappings going on, so no
> crazy cache coherency games, and no crazy syncronization issues to worry
> about: the jit functions are protected by the per-btree-node locks.
>
> vmalloc_exec() isn't being upstreamed however, since people don't want
> WX mappings.
>
> The infrastructure changes we need (and not just for bcachefs) are
>  - better executable memory allocation API, with support for sub-page
>    allocations: this is already being worked on, the prototype slab
>    allocator I posted is probably going to be the basis for part of this
>
>  - an arch indepenendent version of text_poke(): we don't want user code
>    to be flipping page permissions to update text, text_poke() is the
>    proper API but it's x86 only. No one has volunteered for this yet.
>

text_poke() by itself is *not* the proper API, as discussed.  It doesn't serialize adequately, even on x86.  We have text_poke_sync() for that.

--Andy
Kent Overstreet June 20, 2023, 6:08 p.m. UTC | #55
On Tue, Jun 20, 2023 at 10:42:02AM -0700, Andy Lutomirski wrote:
> Code is either correct, and comes with an explanation as to how it is
> correct, or it doesn't go in.  Saying that something is like BPF is
> not an explanation as to how it's correct.  Saying that someone has
> not come up with the chain of events that causes a mere violation of
> architecture rules to actual incorrect execution is not an explanation
> as to how something is correct.

No, I'm saying your concerns are baseless and too vague to address.

> text_poke() by itself is *not* the proper API, as discussed.  It
> doesn't serialize adequately, even on x86.  We have text_poke_sync()
> for that.

Andy, I replied explaining the difference between text_poke() and
text_poke_sync(). It's clear you have no idea what you're talking about,
so I'm not going to be wasting my time on further communications with
you.
Andy Lutomirski June 20, 2023, 6:15 p.m. UTC | #56
On Tue, Jun 20, 2023, at 11:08 AM, Kent Overstreet wrote:
> On Tue, Jun 20, 2023 at 10:42:02AM -0700, Andy Lutomirski wrote:
>> Code is either correct, and comes with an explanation as to how it is
>> correct, or it doesn't go in.  Saying that something is like BPF is
>> not an explanation as to how it's correct.  Saying that someone has
>> not come up with the chain of events that causes a mere violation of
>> architecture rules to actual incorrect execution is not an explanation
>> as to how something is correct.
>
> No, I'm saying your concerns are baseless and too vague to address.

If you don't address them, the NAK will stand forever, or at least until a different group of people take over x86 maintainership.  That's fine with me.

I'm generally pretty happy about working with people to get their Linux code right.  But no one is obligated to listen to me.

>
>> text_poke() by itself is *not* the proper API, as discussed.  It
>> doesn't serialize adequately, even on x86.  We have text_poke_sync()
>> for that.
>
> Andy, I replied explaining the difference between text_poke() and
> text_poke_sync(). It's clear you have no idea what you're talking about,
> so I'm not going to be wasting my time on further communications with
> you.

No problem.  Then your x86 code will not be merged upstream.

Best of luck with the actual filesystem parts!

--Andy
Dave Hansen June 20, 2023, 6:48 p.m. UTC | #57
>> No, I'm saying your concerns are baseless and too vague to
>> address.
> If you don't address them, the NAK will stand forever, or at least
> until a different group of people take over x86 maintainership.
> That's fine with me.

I've got a specific concern: I don't see vmalloc_exec() used in this
series anywhere.  I also don't see any of the actual assembly that's
being generated, or the glue code that's calling into the generated
assembly.

I grepped around a bit in your git trees, but I also couldn't find it in
there.  Any chance you could help a guy out and point us to some of the
specifics of this new, tiny JIT?

>> Andy, I replied explaining the difference between text_poke() and
>> text_poke_sync(). It's clear you have no idea what you're talking about,
>> so I'm not going to be wasting my time on further communications with
>> you.

One more specific concern: This comment made me very uncomfortable and
it read to me very much like a personal attack, something which is
contrary to our code of conduct.
Kent Overstreet June 20, 2023, 8:18 p.m. UTC | #58
On Tue, Jun 20, 2023 at 11:48:26AM -0700, Dave Hansen wrote:
> >> No, I'm saying your concerns are baseless and too vague to
> >> address.
> > If you don't address them, the NAK will stand forever, or at least
> > until a different group of people take over x86 maintainership.
> > That's fine with me.
> 
> I've got a specific concern: I don't see vmalloc_exec() used in this
> series anywhere.  I also don't see any of the actual assembly that's
> being generated, or the glue code that's calling into the generated
> assembly.
>
> I grepped around a bit in your git trees, but I also couldn't find it in
> there.  Any chance you could help a guy out and point us to some of the
> specifics of this new, tiny JIT?

vmalloc_exec() has already been dropped from the patchset - I'll switch
to the new jit allocator when that's available and doing sub-page
allocations.

I can however point you at the code that generates the unpack functions:

https://evilpiepirate.org/git/bcachefs.git/tree/fs/bcachefs/bkey.c#n727

> >> Andy, I replied explaining the difference between text_poke() and
> >> text_poke_sync(). It's clear you have no idea what you're talking about,
> >> so I'm not going to be wasting my time on further communications with
> >> you.
> 
> One more specific concern: This comment made me very uncomfortable and
> it read to me very much like a personal attack, something which is
> contrary to our code of conduct.

It's not; I prefer to be direct rather than passive-aggressive, and if I have
to bow out of a discussion that isn't going anywhere I feel I owe an
explanation of _why_. Too much conflict avoidance means things don't get
resolved.

And Andy and I are talking on IRC now, so things are proceeding in a
better direction.
Andy Lutomirski June 20, 2023, 8:42 p.m. UTC | #59
Hi all-

On Tue, Jun 20, 2023, at 11:48 AM, Dave Hansen wrote:
>>> No, I'm saying your concerns are baseless and too vague to
>>> address.
>> If you don't address them, the NAK will stand forever, or at least
>> until a different group of people take over x86 maintainership.
>> That's fine with me.
>
> I've got a specific concern: I don't see vmalloc_exec() used in this
> series anywhere.  I also don't see any of the actual assembly that's
> being generated, or the glue code that's calling into the generated
> assembly.
>
> I grepped around a bit in your git trees, but I also couldn't find it in
> there.  Any chance you could help a guy out and point us to some of the
> specifics of this new, tiny JIT?
>

So I had a nice discussion with Kent on IRC, and, for the benefit of everyone else reading along, I *think* the JITted code can be replaced by a table-driven approach like this:

typedef unsigned int u32;
typedef unsigned long u64;

struct uncompressed
{
    u32 a;
    u32 b;
    u64 c;
    u64 d;
    u64 e;
    u64 f;
};

struct bitblock
{
    u64 source;
    u64 target;
    u64 mask;
    int shift;
};

// out needs to be zeroed first
void unpack(struct uncompressed *out, const u64 *in, const struct bitblock *blocks, int nblocks)
{
    u64 *out_as_words = (u64*)out;
    for (int i = 0; i < nblocks; i++) {
        const struct bitblock *b;
        out_as_words[b->target] |= (in[b->source] & b->mask) << b->shift;
    }
}

void apply_offsets(struct uncompressed *out, const struct uncompressed *offsets)
{
    out->a += offsets->a;
    out->b += offsets->b;
    out->c += offsets->c;
    out->d += offsets->d;
    out->e += offsets->e;
    out->f += offsets->f;
}

Which generates nice code: https://godbolt.org/z/3fEq37hf5

It would need spectre protection in two places, I think, because it's almost certainly a great gadget if the attacker can speculatively control the 'blocks' table.  This could be mitigated (I think) by hardcoding nblocks as 12 and by masking b->target.

In contrast, the JIT approach needs a retpoline on each call, which could be more expensive than my entire function :)  I haven't benchmarked them lately.
Andy Lutomirski June 20, 2023, 10:32 p.m. UTC | #60
On Tue, Jun 20, 2023, at 1:42 PM, Andy Lutomirski wrote:
> Hi all-
>
> On Tue, Jun 20, 2023, at 11:48 AM, Dave Hansen wrote:
>>>> No, I'm saying your concerns are baseless and too vague to
>>>> address.
>>> If you don't address them, the NAK will stand forever, or at least
>>> until a different group of people take over x86 maintainership.
>>> That's fine with me.
>>
>> I've got a specific concern: I don't see vmalloc_exec() used in this
>> series anywhere.  I also don't see any of the actual assembly that's
>> being generated, or the glue code that's calling into the generated
>> assembly.
>>
>> I grepped around a bit in your git trees, but I also couldn't find it in
>> there.  Any chance you could help a guy out and point us to some of the
>> specifics of this new, tiny JIT?
>>
>
> So I had a nice discussion with Kent on IRC, and, for the benefit of 
> everyone else reading along, I *think* the JITted code can be replaced 
> by a table-driven approach like this:
>
> typedef unsigned int u32;
> typedef unsigned long u64;
>
> struct uncompressed
> {
>     u32 a;
>     u32 b;
>     u64 c;
>     u64 d;
>     u64 e;
>     u64 f;
> };
>
> struct bitblock
> {
>     u64 source;
>     u64 target;
>     u64 mask;
>     int shift;
> };
>
> // out needs to be zeroed first
> void unpack(struct uncompressed *out, const u64 *in, const struct 
> bitblock *blocks, int nblocks)
> {
>     u64 *out_as_words = (u64*)out;
>     for (int i = 0; i < nblocks; i++) {
>         const struct bitblock *b;
>         out_as_words[b->target] |= (in[b->source] & b->mask) << 
> b->shift;
>     }
> }
>
> void apply_offsets(struct uncompressed *out, const struct uncompressed *offsets)
> {
>     out->a += offsets->a;
>     out->b += offsets->b;
>     out->c += offsets->c;
>     out->d += offsets->d;
>     out->e += offsets->e;
>     out->f += offsets->f;
> }
>
> Which generates nice code: https://godbolt.org/z/3fEq37hf5

Thinking about this a bit more, I think the only real performance issue with my code is that it does 12 read-modify-write operations in memory, which all depend on each other in horrible ways.

If it's reversed so the stores are all in order, then this issue would go away.

typedef unsigned int u32;
typedef unsigned long u64;

struct uncompressed
{
    u32 a;
    u32 b;
    u64 c;
    u64 d;
    u64 e;
    u64 f;
};

struct field_piece {
    int source;
    int shift;
    u64 mask;
};

struct field_pieces {
    struct field_piece pieces[2];
    u64 offset;
};

u64 unpack_one(const u64 *in, const struct field_pieces *pieces)
{
    const struct field_piece *p = pieces->pieces;
    return (((in[p[0].source] & p[0].mask) << p[0].shift) |
        ((in[p[1].source] & p[1].mask) << p[1].shift)) +
        pieces->offset;
}

struct encoding {
    struct field_pieces a, b, c, d, e, f;
};

void unpack(struct uncompressed *out, const u64 *in, const struct encoding *encoding)
{
    out->a = unpack_one(in, &encoding->a);
    out->b = unpack_one(in, &encoding->b);
    out->c = unpack_one(in, &encoding->c);
    out->d = unpack_one(in, &encoding->d);
    out->e = unpack_one(in, &encoding->e);
    out->f = unpack_one(in, &encoding->f);
}

https://godbolt.org/z/srsfcGK4j

Could be faster.  Probably worth testing.
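
(Purely as an illustration of how a per-format table could be filled in for
the functions above; the field layouts and constants are invented, and the
shifts are kept non-negative since this sketch only left-shifts:)

/* invented layout: 'a' packed in the low 20 bits of word 0, 'c' in the
 * low 36 bits of word 1 plus a constant offset; the other fields decode
 * to their (zero) .offset */
static const struct encoding example_format = {
    .a = { .pieces = { { .source = 0, .shift = 0, .mask = 0xfffff } } },
    .c = { .pieces = { { .source = 1, .shift = 0, .mask = 0xfffffffffull } },
           .offset = 4096 },
};

void unpack_example(struct uncompressed *out, const u64 *packed_key)
{
    unpack(out, packed_key, &example_format);
}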
Nadav Amit June 20, 2023, 10:43 p.m. UTC | #61
> On Jun 20, 2023, at 3:32 PM, Andy Lutomirski <luto@kernel.org> wrote:
> 
>> // out needs to be zeroed first
>> void unpack(struct uncompressed *out, const u64 *in, const struct 
>> bitblock *blocks, int nblocks)
>> {
>>    u64 *out_as_words = (u64*)out;
>>    for (int i = 0; i < nblocks; i++) {
>>        const struct bitblock *b;
>>        out_as_words[b->target] |= (in[b->source] & b->mask) << 
>> b->shift;
>>    }
>> }
>> 
>> void apply_offsets(struct uncompressed *out, const struct uncompressed *offsets)
>> {
>>    out->a += offsets->a;
>>    out->b += offsets->b;
>>    out->c += offsets->c;
>>    out->d += offsets->d;
>>    out->e += offsets->e;
>>    out->f += offsets->f;
>> }
>> 
>> Which generates nice code: https://godbolt.org/z/3fEq37hf5
> 
> Thinking about this a bit more, I think the only real performance issue with my code is that it does 12 read-xor-write operations in memory, which all depend on each other in horrible ways.

If you compare the generated code, just notice that you forgot to initialize b in unpack() in this version.

I presume you wanted it to say "b = &blocks[i]".
Andy Lutomirski June 21, 2023, 1:27 a.m. UTC | #62
On Tue, Jun 20, 2023, at 3:43 PM, Nadav Amit wrote:
>> On Jun 20, 2023, at 3:32 PM, Andy Lutomirski <luto@kernel.org> wrote:
>> 
>>> // out needs to be zeroed first
>>> void unpack(struct uncompressed *out, const u64 *in, const struct 
>>> bitblock *blocks, int nblocks)
>>> {
>>>    u64 *out_as_words = (u64*)out;
>>>    for (int i = 0; i < nblocks; i++) {
>>>        const struct bitblock *b;
>>>        out_as_words[b->target] |= (in[b->source] & b->mask) << 
>>> b->shift;
>>>    }
>>> }
>>> 
>>> void apply_offsets(struct uncompressed *out, const struct uncompressed *offsets)
>>> {
>>>    out->a += offsets->a;
>>>    out->b += offsets->b;
>>>    out->c += offsets->c;
>>>    out->d += offsets->d;
>>>    out->e += offsets->e;
>>>    out->f += offsets->f;
>>> }
>>> 
>>> Which generates nice code: https://godbolt.org/z/3fEq37hf5
>> 
>> Thinking about this a bit more, I think the only real performance issue with my code is that it does 12 read-xor-write operations in memory, which all depend on each other in horrible ways.
>
> If you compare the generated code, just notice that you forgot to 
> initialize b in unpack() in this version.
>
> I presume you wanted it to say "b = &blocks[i]".

Indeed.  I also didn't notice that -Wall wasn't set.  Oops.
diff mbox series

Patch

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 69250efa03..ff147fe115 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -145,6 +145,7 @@  extern void *vzalloc(unsigned long size) __alloc_size(1);
 extern void *vmalloc_user(unsigned long size) __alloc_size(1);
 extern void *vmalloc_node(unsigned long size, int node) __alloc_size(1);
 extern void *vzalloc_node(unsigned long size, int node) __alloc_size(1);
+extern void *vmalloc_exec(unsigned long size, gfp_t gfp_mask) __alloc_size(1);
 extern void *vmalloc_32(unsigned long size) __alloc_size(1);
 extern void *vmalloc_32_user(unsigned long size) __alloc_size(1);
 extern void *__vmalloc(unsigned long size, gfp_t gfp_mask) __alloc_size(1);
diff --git a/kernel/module/main.c b/kernel/module/main.c
index d3be89de70..9eaa89e84c 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -1607,9 +1607,7 @@  static void dynamic_debug_remove(struct module *mod, struct _ddebug_info *dyndbg
 
 void * __weak module_alloc(unsigned long size)
 {
-	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
-			GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
-			NUMA_NO_NODE, __builtin_return_address(0));
+	return vmalloc_exec(size, GFP_KERNEL);
 }
 
 bool __weak module_init_section(const char *name)
diff --git a/mm/nommu.c b/mm/nommu.c
index 57ba243c6a..8d9ab19e39 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -280,6 +280,24 @@  void *vzalloc_node(unsigned long size, int node)
 }
 EXPORT_SYMBOL(vzalloc_node);
 
+/**
+ *	vmalloc_exec  -  allocate virtually contiguous, executable memory
+ *	@size:		allocation size
+ *
+ *	Kernel-internal function to allocate enough pages to cover @size
+ *	the page level allocator and map them into contiguous and
+ *	executable kernel virtual space.
+ *
+ *	For tight control over page level allocator and protection flags
+ *	use __vmalloc() instead.
+ */
+
+void *vmalloc_exec(unsigned long size, gfp_t gfp_mask)
+{
+	return __vmalloc(size, gfp_mask);
+}
+EXPORT_SYMBOL_GPL(vmalloc_exec);
+
 /**
  * vmalloc_32  -  allocate virtually contiguous memory (32bit addressable)
  *	@size:		allocation size
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 31ff782d36..2ebb9ea7f0 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3401,6 +3401,27 @@  void *vzalloc_node(unsigned long size, int node)
 }
 EXPORT_SYMBOL(vzalloc_node);
 
+/**
+ * vmalloc_exec - allocate virtually contiguous, executable memory
+ * @size:	  allocation size
+ *
+ * Kernel-internal function to allocate enough pages to cover @size
+ * the page level allocator and map them into contiguous and
+ * executable kernel virtual space.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
+ */
+void *vmalloc_exec(unsigned long size, gfp_t gfp_mask)
+{
+	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
+			gfp_mask, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
+			NUMA_NO_NODE, __builtin_return_address(0));
+}
+EXPORT_SYMBOL_GPL(vmalloc_exec);
+
 #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
 #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
 #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)