
uidgid: make sure we fit into one cacheline

Message ID 20240910-work-uid_gid_map-v1-1-e6bc761363ed@kernel.org (mailing list archive)
State New
Series uidgid: make sure we fit into one cacheline

Commit Message

Christian Brauner Sept. 10, 2024, 8:16 a.m. UTC
When I expanded uidgid mappings I intended for a struct uid_gid_map to
fit into a single cacheline on x86, as they tend to be pretty
performance sensitive (idmapped mounts etc). But a 4-byte hole was added
that brought it over 64 bytes. Fix that by moving the static extent
array and the extent counter into a substruct inside the union. C's type
punning rules for unions guarantee that we can access ->nr_extents even
if the member last written to wasn't within the same object. This is
also what we rely on in struct_group() and friends. This of course also
relies on non-strict aliasing; strict aliasing is something we don't do
anyway (the kernel builds with -fno-strict-aliasing).

99) If the member used to read the contents of a union object is not the
    same as the member last used to store a value in the object, the
    appropriate part of the object representation of the value is
    reinterpreted as an object representation in the new type as
    described in 6.2.6 (a process sometimes called "type punning").

Link: https://www.open-std.org/jtc1/sc22/wg14/www/docs/n2310.pdf
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
Before:

struct uid_gid_map {
        u32                        nr_extents;           /*     0     4 */

        /* XXX 4 bytes hole, try to pack */

        union {
                struct uid_gid_extent extent[5];         /*     8    60 */
                struct {
                        struct uid_gid_extent * forward; /*     8     8 */
                        struct uid_gid_extent * reverse; /*    16     8 */
                };                                       /*     8    16 */
        };                                               /*     8    64 */

        /* size: 72, cachelines: 2, members: 2 */
        /* sum members: 68, holes: 1, sum holes: 4 */
        /* last cacheline: 8 bytes */
};

After:

struct uid_gid_map {
        union {
                struct {
                        struct uid_gid_extent extent[5]; /*     0    60 */
                        u32        nr_extents;           /*    60     4 */
                };                                       /*     0    64 */
                struct {
                        struct uid_gid_extent * forward; /*     0     8 */
                        struct uid_gid_extent * reverse; /*     8     8 */
                };                                       /*     0    16 */
        };                                               /*     0    64 */

        /* size: 64, cachelines: 1, members: 1 */
};
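
For illustration, here is a small standalone userspace sketch (not kernel
code; the extent type below is a stand-in for struct uid_gid_extent, and the
72/64 results assume a 64-bit target such as x86-64) that reproduces the two
layouts shown above:

#include <stdio.h>
#include <stdint.h>

struct extent { uint32_t first, lower_first, count; }; /* 12 bytes, 4-byte aligned */

struct map_before {
        uint32_t nr_extents;                    /* 4 bytes, then a 4-byte hole:   */
        union {                                 /* the embedded pointers force    */
                struct extent extent[5];        /* 8-byte alignment of the union  */
                struct {
                        struct extent *forward;
                        struct extent *reverse;
                };
        };
};

struct map_after {
        union {
                struct {
                        struct extent extent[5];  /* 60 bytes ...                 */
                        uint32_t nr_extents;      /* ... counter fills the tail   */
                };
                struct {
                        struct extent *forward;
                        struct extent *reverse;
                };
        };
};

int main(void)
{
        printf("before: %zu\n", sizeof(struct map_before));    /* 72 */
        printf("after:  %zu\n", sizeof(struct map_after));     /* 64 */
        return 0;
}

Built with a default gcc or clang on x86-64 this should print 72 and 64,
matching the pahole output above.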
---
 include/linux/user_namespace.h | 6 ++++--
 kernel/user.c                  | 6 +++---
 2 files changed, 7 insertions(+), 5 deletions(-)


---
base-commit: 698e7d1680544ef114203b0cf656faa0c1216ebc
change-id: 20240910-work-uid_gid_map-cce46aee1b76

Comments

Mateusz Guzik Sept. 10, 2024, 10:48 a.m. UTC | #1
On Tue, Sep 10, 2024 at 10:16:39AM +0200, Christian Brauner wrote:
> When I expanded uidgid mappings I intended for a struct uid_gid_map to
> fit into a single cacheline on x86 as they tend to be pretty
> performance sensitive (idmapped mounts etc). But a 4 byte hole was added
> that brought it over 64 bytes. Fix that by adding the static extent
> array and the extent counter into a substruct. C's type punning for
> unions guarantees that we can access ->nr_extents even if the last
> written to member wasn't within the same object. This is also what we
> rely on in struct_group() and friends. This of course relies on
> non-strict aliasing which we don't do.
> 
> Before:
> 
> struct uid_gid_map {
>         u32                        nr_extents;           /*     0     4 */
> 
>         /* XXX 4 bytes hole, try to pack */
> 
>         union {
>                 struct uid_gid_extent extent[5];         /*     8    60 */
>                 struct {
>                         struct uid_gid_extent * forward; /*     8     8 */
>                         struct uid_gid_extent * reverse; /*    16     8 */
>                 };                                       /*     8    16 */
>         };                                               /*     8    64 */
> 
>         /* size: 72, cachelines: 2, members: 2 */
>         /* sum members: 68, holes: 1, sum holes: 4 */
>         /* last cacheline: 8 bytes */
> };
> 
> After:
> 
> struct uid_gid_map {
>         union {
>                 struct {
>                         struct uid_gid_extent extent[5]; /*     0    60 */
>                         u32        nr_extents;           /*    60     4 */
>                 };                                       /*     0    64 */
>                 struct {
>                         struct uid_gid_extent * forward; /*     0     8 */
>                         struct uid_gid_extent * reverse; /*     8     8 */
>                 };                                       /*     0    16 */
>         };                                               /*     0    64 */
> 
>         /* size: 64, cachelines: 1, members: 1 */
> };

May I suggest adding a compile-time assert on the size? While growth may
be unavoidable at some point, at least it won't happen unintentionally.

This is not the first time something has unintentionally passed a threshold
of this sort, and I would argue someone(tm) should do a sweep.
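
Something along these lines, presumably -- a minimal sketch only, not part of
the posted patch, using the kernel's static_assert() from <linux/build_bug.h>
and placed next to the struct definition:

static_assert(sizeof(struct uid_gid_map) <= 64,
              "struct uid_gid_map should fit into a single cacheline");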
Theodore Ts'o Sept. 10, 2024, 1 p.m. UTC | #2
On Tue, Sep 10, 2024 at 12:48:16PM +0200, Mateusz Guzik wrote:
> May I suggest adding a compile-time assert on the size? While it may be
> growing it will be unavoidable at some point, it at least wont happen
> unintentionally.

That should be fine for this structure since everything is defined in
terms of types that should be fixed across architectures, and they
aren't using any types that might change depending on the kernel
config, but as a general matter, we should be a bit careful when
rearranging structures to avoid holes and to keep things on the same
cache line.

I recently had a patch submission which was rearranging structure
order for an ext4 data structure, and what worked for the patch
submitter didn't work for me, because of differences between kernel
configs and/or architecture types.

So it's been on my todo list to do a sanity check of various ext4
structures, but doing so means checking a number of different architectures
and common production kernel configs (I don't really care if enabling
lockdep results in more holes, because performance is going to be
impacted way more for reasons other than cache lines :-).

Hmm, maybe fodder for a GSOC or intern project would be creating some
kind of automation to check for optimal structure layouts across
multiple configs/architectures?

	     	      	      	    	       - Ted
Mateusz Guzik Sept. 10, 2024, 1:44 p.m. UTC | #3
On Tue, Sep 10, 2024 at 09:00:43AM -0400, Theodore Ts'o wrote:
> On Tue, Sep 10, 2024 at 12:48:16PM +0200, Mateusz Guzik wrote:
> > May I suggest adding a compile-time assert on the size? While it may be
> > growing it will be unavoidable at some point, it at least wont happen
> > unintentionally.
> 
> That should be fine for this structure since everything is defined in
> terms of types that should be fixed across architectures, and they
> aren't using any types that might change depending on the kernel
> config, but as a general matter, we should be a bit careful when
> rearranging structrues to avoid holes and to keep things on the same
> cache line.
> 
> I recently had a patch submission which was rearranging structure
> order for an ext4 data structure, and what worked for the patch
> submitter didn't work for me, because of differences between kernel
> configs and/or architecture types.
> 
> So it's been on my todo list to do a sanity check of various ext4
> structuers, but to do so checking a number of different architectures
> and common production kernel configs (I don't really care if enabling
> lockdep results in more holes, because performance is going to be
> impacted way more for reasons other than cache lines :-).
> 

While I agree all bets are off for an arbitrarily b0rked kernel, a lot
can be done and for more than structs of constant sizes like the one at
hand.

The general note is that things are definitely oversized, with
semi-arbitrary field placement, and most notably they avoidably go past a
magic threshold like a multiple of 64. Cache misses aside, this also
results in memory waste in the allocator, which is my primary concern
here.

If people did sweeps over structs in code they maintain (and code which
is not maintained at all) that would be great(tm), but realistically it
won't happen for the vast majority of the kernel.

Even so, for heavily used stuff interested maintainers should be able to
assert that some baseline does not exceed a certain size -- there is a
huge overlap in *important* distro configs. Perhaps configs used by the
oe-lkp folk would be fine?

Bonus points if relative field placement is also checked for
false-sharing reduction.

So for example *some* stuff could be always statically asserted, like
the size of the struct above and many others.

Other stuff could be conditional on lockdep or whatever other debug
bloater not being enabled.

Stuff of importance which is too messy to be treated this way can have
the check made on demand -- interested maintainers would compile with
"make CHECK_STRUCT_SIZES=1" against a sensible(tm) config and get the
info, while random users messing with their configs remain unaffected.
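
A rough sketch of the idea (CONFIG_CHECK_STRUCT_SIZES and struct foo with its
hot_lock/hot_count fields are made up for illustration, nothing like this
exists in the tree today):

#ifdef CONFIG_CHECK_STRUCT_SIZES
/* size threshold: don't silently grow past one cacheline */
static_assert(sizeof(struct uid_gid_map) <= 64);

/* relative placement: keep two hot fields of the hypothetical struct foo on
 * the same cacheline to limit false sharing */
static_assert(offsetof(struct foo, hot_lock) / SMP_CACHE_BYTES ==
              offsetof(struct foo, hot_count) / SMP_CACHE_BYTES);
#endif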

If there is a random-ass distro with a config which suffers a size
problem for a given struct, they can find out thanks to optional size
tests and try to do something about it. As it is, nobody knows squat
unless they explicitly look at stuff one by one.

I did an exercise of the sort elsewhere and managed to shrink quite a
few 136-byters back to 128, etc.

I have some wip here and there, but I'm not signing up for any such
work. I would argue everyone maintaining a subsystem should be able to
sort this out over time, if they have interest.

> Hmm, maybe fodder for a GSOC or intern project would be creating some
> kind of automation to check for optimal structure layouts across
> multiple configs/architectures?
> 

I'm going to stop ranting here.

I do think what I outlined above is easily doable over time and is a
nice-to-have before anyone even attempts this one.
Jeff Layton Sept. 10, 2024, 1:51 p.m. UTC | #4
On Tue, 2024-09-10 at 10:16 +0200, Christian Brauner wrote:
> When I expanded uidgid mappings I intended for a struct uid_gid_map to
> fit into a single cacheline on x86 as they tend to be pretty
> performance sensitive (idmapped mounts etc). But a 4 byte hole was added
> that brought it over 64 bytes. Fix that by adding the static extent
> array and the extent counter into a substruct. C's type punning for
> unions guarantees that we can access ->nr_extents even if the last
> written to member wasn't within the same object. This is also what we
> rely on in struct_group() and friends. This of course relies on
> non-strict aliasing which we don't do.
> 
> 99) If the member used to read the contents of a union object is not the
>     same as the member last used to store a value in the object, the
>     appropriate part of the object representation of the value is
>     reinterpreted as an object representation in the new type as
>     described in 6.2.6 (a process sometimes called "type punning").
> 
> Link: https://www.open-std.org/jtc1/sc22/wg14/www/docs/n2310.pdf
> Signed-off-by: Christian Brauner <brauner@kernel.org>
> ---
> Before:
> 
> struct uid_gid_map {
>         u32                        nr_extents;           /*     0     4 */
> 
>         /* XXX 4 bytes hole, try to pack */
> 
>         union {
>                 struct uid_gid_extent extent[5];         /*     8    60 */
>                 struct {
>                         struct uid_gid_extent * forward; /*     8     8 */
>                         struct uid_gid_extent * reverse; /*    16     8 */
>                 };                                       /*     8    16 */
>         };                                               /*     8    64 */
> 
>         /* size: 72, cachelines: 2, members: 2 */
>         /* sum members: 68, holes: 1, sum holes: 4 */
>         /* last cacheline: 8 bytes */
> };
> 
> After:
> 
> struct uid_gid_map {
>         union {
>                 struct {
>                         struct uid_gid_extent extent[5]; /*     0    60 */
>                         u32        nr_extents;           /*    60     4 */
>                 };                                       /*     0    64 */
>                 struct {
>                         struct uid_gid_extent * forward; /*     0     8 */
>                         struct uid_gid_extent * reverse; /*     8     8 */
>                 };                                       /*     0    16 */
>         };                                               /*     0    64 */
> 
>         /* size: 64, cachelines: 1, members: 1 */
> };
> ---
>  include/linux/user_namespace.h | 6 ++++--
>  kernel/user.c                  | 6 +++---
>  2 files changed, 7 insertions(+), 5 deletions(-)
> 
> diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
> index 6030a8235617..3625096d5f85 100644
> --- a/include/linux/user_namespace.h
> +++ b/include/linux/user_namespace.h
> @@ -21,9 +21,11 @@ struct uid_gid_extent {
>  };
>  
>  struct uid_gid_map { /* 64 bytes -- 1 cache line */
> -	u32 nr_extents;
>  	union {
> -		struct uid_gid_extent extent[UID_GID_MAP_MAX_BASE_EXTENTS];
> +		struct {
> +			struct uid_gid_extent extent[UID_GID_MAP_MAX_BASE_EXTENTS];
> +			u32 nr_extents;
> +		};

Is this any different from just moving nr_extents to the end of
struct uid_gid_map? I don't quite get how moving it into the union
improves things.

>  		struct {
>  			struct uid_gid_extent *forward;
>  			struct uid_gid_extent *reverse;
> diff --git a/kernel/user.c b/kernel/user.c
> index aa1162deafe4..f46b1d41163b 100644
> --- a/kernel/user.c
> +++ b/kernel/user.c
> @@ -36,33 +36,33 @@ EXPORT_SYMBOL_GPL(init_binfmt_misc);
>   */
>  struct user_namespace init_user_ns = {
>  	.uid_map = {
> -		.nr_extents = 1,
>  		{
>  			.extent[0] = {
>  				.first = 0,
>  				.lower_first = 0,
>  				.count = 4294967295U,
>  			},
> +			.nr_extents = 1,
>  		},
>  	},
>  	.gid_map = {
> -		.nr_extents = 1,
>  		{
>  			.extent[0] = {
>  				.first = 0,
>  				.lower_first = 0,
>  				.count = 4294967295U,
>  			},
> +			.nr_extents = 1,
>  		},
>  	},
>  	.projid_map = {
> -		.nr_extents = 1,
>  		{
>  			.extent[0] = {
>  				.first = 0,
>  				.lower_first = 0,
>  				.count = 4294967295U,
>  			},
> +			.nr_extents = 1,
>  		},
>  	},
>  	.ns.count = REFCOUNT_INIT(3),
> 
> ---
> base-commit: 698e7d1680544ef114203b0cf656faa0c1216ebc
> change-id: 20240910-work-uid_gid_map-cce46aee1b76
> 


Jeff Layton <jlayton@kernel.org>
Matthew Wilcox Sept. 10, 2024, 5:16 p.m. UTC | #5
On Tue, Sep 10, 2024 at 09:51:37AM -0400, Jeff Layton wrote:
> > Before:
> > 
> > struct uid_gid_map {
> >         u32                        nr_extents;           /*     0     4 */
> > 
> >         /* XXX 4 bytes hole, try to pack */
> > 
> >         union {
> >                 struct uid_gid_extent extent[5];         /*     8    60 */
> >                 struct {
> >                         struct uid_gid_extent * forward; /*     8     8 */
> >                         struct uid_gid_extent * reverse; /*    16     8 */
> >                 };                                       /*     8    16 */
> >         };                                               /*     8    64 */
> > 
> >         /* size: 72, cachelines: 2, members: 2 */
> >         /* sum members: 68, holes: 1, sum holes: 4 */
> >         /* last cacheline: 8 bytes */
> > };
> > 
> > After:
> > 
> > struct uid_gid_map {
> >         union {
> >                 struct {
> >                         struct uid_gid_extent extent[5]; /*     0    60 */
> >                         u32        nr_extents;           /*    60     4 */
> >                 };                                       /*     0    64 */
> >                 struct {
> >                         struct uid_gid_extent * forward; /*     0     8 */
> >                         struct uid_gid_extent * reverse; /*     8     8 */
> >                 };                                       /*     0    16 */
> >         };                                               /*     0    64 */
> > 
> >         /* size: 64, cachelines: 1, members: 1 */
> > };
> 
> Is this any different from just moving nr_extents to the end of
> struct_uid_gid_map? I don't quite get how moving it into the union
> improves things.

It's an alignment question.  Look more carefully at the pahole output.
The array of uid_gid_extent is 4-byte aligned and 60 bytes in size,
but the two pointers must be 8-byte aligned.  That forces the
compiler to make the whole union 8-byte aligned.  If nr_extents is
within the union, then it can pack with the array of extents.  If not,
it has to leave a 4-byte gap.
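
A quick userspace way to see this, with stand-in types and assuming a 64-bit
target:

#include <stdalign.h>
#include <stdint.h>

struct ext { uint32_t first, lower_first, count; };  /* 12 bytes, 4-byte aligned */

union body {
        struct ext extent[5];           /* 60 bytes                      */
        struct {
                struct ext *forward;    /* the pointers force 8-byte     */
                struct ext *reverse;    /* alignment of the whole union  */
        };
};

_Static_assert(alignof(union body) == 8, "pointers dominate the alignment");
_Static_assert(sizeof(union body) == 64, "60 bytes padded up to a multiple of 8");

So a lone u32 placed in front of this union costs 4 bytes of padding, while a
u32 placed inside the anonymous struct fills that padding instead.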
Jeff Layton Sept. 10, 2024, 6:50 p.m. UTC | #6
On Tue, 2024-09-10 at 18:16 +0100, Matthew Wilcox wrote:
> On Tue, Sep 10, 2024 at 09:51:37AM -0400, Jeff Layton wrote:
> > > Before:
> > > 
> > > struct uid_gid_map {
> > >         u32                        nr_extents;           /*     0     4 */
> > > 
> > >         /* XXX 4 bytes hole, try to pack */
> > > 
> > >         union {
> > >                 struct uid_gid_extent extent[5];         /*     8    60 */
> > >                 struct {
> > >                         struct uid_gid_extent * forward; /*     8     8 */
> > >                         struct uid_gid_extent * reverse; /*    16     8 */
> > >                 };                                       /*     8    16 */
> > >         };                                               /*     8    64 */
> > > 
> > >         /* size: 72, cachelines: 2, members: 2 */
> > >         /* sum members: 68, holes: 1, sum holes: 4 */
> > >         /* last cacheline: 8 bytes */
> > > };
> > > 
> > > After:
> > > 
> > > struct uid_gid_map {
> > >         union {
> > >                 struct {
> > >                         struct uid_gid_extent extent[5]; /*     0    60 */
> > >                         u32        nr_extents;           /*    60     4 */
> > >                 };                                       /*     0    64 */
> > >                 struct {
> > >                         struct uid_gid_extent * forward; /*     0     8 */
> > >                         struct uid_gid_extent * reverse; /*     8     8 */
> > >                 };                                       /*     0    16 */
> > >         };                                               /*     0    64 */
> > > 
> > >         /* size: 64, cachelines: 1, members: 1 */
> > > };
> > 
> > Is this any different from just moving nr_extents to the end of
> > struct_uid_gid_map? I don't quite get how moving it into the union
> > improves things.
> 
> It's an alignment question.  Look more carefully at the pahole output.
> The array of uid_gid_extent is 4-byte aligned and 60 bytes in size,
> but the two pointers must be eight bytes aligned.  That forces the
> compiler to make the whole union 8-byte aligned.  If the nr_extents is
> within the union, then it can pack with the array of extents.  If not,
> it has to leave a 4-byte gap.

Thanks, that makes sense. With that clarified:

Reviewed-by: Jeff Layton <jlayton@kernel.org>

Patch

diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 6030a8235617..3625096d5f85 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -21,9 +21,11 @@  struct uid_gid_extent {
 };
 
 struct uid_gid_map { /* 64 bytes -- 1 cache line */
-	u32 nr_extents;
 	union {
-		struct uid_gid_extent extent[UID_GID_MAP_MAX_BASE_EXTENTS];
+		struct {
+			struct uid_gid_extent extent[UID_GID_MAP_MAX_BASE_EXTENTS];
+			u32 nr_extents;
+		};
 		struct {
 			struct uid_gid_extent *forward;
 			struct uid_gid_extent *reverse;
diff --git a/kernel/user.c b/kernel/user.c
index aa1162deafe4..f46b1d41163b 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -36,33 +36,33 @@  EXPORT_SYMBOL_GPL(init_binfmt_misc);
  */
 struct user_namespace init_user_ns = {
 	.uid_map = {
-		.nr_extents = 1,
 		{
 			.extent[0] = {
 				.first = 0,
 				.lower_first = 0,
 				.count = 4294967295U,
 			},
+			.nr_extents = 1,
 		},
 	},
 	.gid_map = {
-		.nr_extents = 1,
 		{
 			.extent[0] = {
 				.first = 0,
 				.lower_first = 0,
 				.count = 4294967295U,
 			},
+			.nr_extents = 1,
 		},
 	},
 	.projid_map = {
-		.nr_extents = 1,
 		{
 			.extent[0] = {
 				.first = 0,
 				.lower_first = 0,
 				.count = 4294967295U,
 			},
+			.nr_extents = 1,
 		},
 	},
 	.ns.count = REFCOUNT_INIT(3),