diff mbox series

[3/4] mnt_idmapping: decouple from namespaces

Message ID 20231122-vfs-mnt_idmap-v1-3-dae4abdde5bd@kernel.org (mailing list archive)
State New
Headers show
Series mnt_idmapping: decouple from namespaces | expand

Commit Message

Christian Brauner Nov. 22, 2023, 12:44 p.m. UTC
There's no reason we need to couple mnt idmapping to namespaces in the
way we currently do. Copy the idmapping when an idmapped mount is
created and don't take any reference on the namespace at all.

We also can't easily refcount struct uid_gid_map because it needs to
stay the size of a cacheline otherwise we risk performance regressions
(Ignoring for a second that right now struct uid_gid_map isn't actually
 64 byte but 72 but that's a fix for another patch series.).

Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/mnt_idmapping.c      | 106 +++++++++++++++++++++++++++++++++++++++++-------
 include/linux/uidgid.h  |  13 ++++++
 kernel/user_namespace.c |   4 +-
 3 files changed, 106 insertions(+), 17 deletions(-)

Comments

Josef Bacik Nov. 22, 2023, 2:26 p.m. UTC | #1
On Wed, Nov 22, 2023 at 01:44:39PM +0100, Christian Brauner wrote:
> There's no reason we need to couple mnt idmapping to namespaces in the
> way we currently do. Copy the idmapping when an idmapped mount is
> created and don't take any reference on the namespace at all.
> 
> We also can't easily refcount struct uid_gid_map because it needs to
> stay the size of a cacheline otherwise we risk performance regressions
> (Ignoring for a second that right now struct uid_gid_map isn't actually
>  64 byte but 72 but that's a fix for another patch series.).
> 
> Signed-off-by: Christian Brauner <brauner@kernel.org>
> ---
>  fs/mnt_idmapping.c      | 106 +++++++++++++++++++++++++++++++++++++++++-------
>  include/linux/uidgid.h  |  13 ++++++
>  kernel/user_namespace.c |   4 +-
>  3 files changed, 106 insertions(+), 17 deletions(-)
> 
> diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c
> index 35d78cb3c38a..64c5205e2b5e 100644
> --- a/fs/mnt_idmapping.c
> +++ b/fs/mnt_idmapping.c
> @@ -9,8 +9,16 @@
>  
>  #include "internal.h"
>  
> +/*
> + * Outside of this file vfs{g,u}id_t are always created from k{g,u}id_t,
> + * never from raw values. These are just internal helpers.
> + */
> +#define VFSUIDT_INIT_RAW(val) (vfsuid_t){ val }
> +#define VFSGIDT_INIT_RAW(val) (vfsgid_t){ val }
> +
>  struct mnt_idmap {
> -	struct user_namespace *owner;
> +	struct uid_gid_map uid_map;
> +	struct uid_gid_map gid_map;
>  	refcount_t count;
>  };
>  
> @@ -20,7 +28,6 @@ struct mnt_idmap {
>   * mapped to {g,u}id 1, [...], {g,u}id 1000 to {g,u}id 1000, [...].
>   */
>  struct mnt_idmap nop_mnt_idmap = {
> -	.owner	= &init_user_ns,
>  	.count	= REFCOUNT_INIT(1),
>  };
>  EXPORT_SYMBOL_GPL(nop_mnt_idmap);
> @@ -65,7 +72,6 @@ vfsuid_t make_vfsuid(struct mnt_idmap *idmap,
>  		     kuid_t kuid)
>  {
>  	uid_t uid;
> -	struct user_namespace *mnt_userns = idmap->owner;
>  
>  	if (idmap == &nop_mnt_idmap)
>  		return VFSUIDT_INIT(kuid);
> @@ -75,7 +81,7 @@ vfsuid_t make_vfsuid(struct mnt_idmap *idmap,
>  		uid = from_kuid(fs_userns, kuid);
>  	if (uid == (uid_t)-1)
>  		return INVALID_VFSUID;
> -	return VFSUIDT_INIT(make_kuid(mnt_userns, uid));
> +	return VFSUIDT_INIT_RAW(map_id_down(&idmap->uid_map, uid));
>  }
>  EXPORT_SYMBOL_GPL(make_vfsuid);
>  
> @@ -103,7 +109,6 @@ vfsgid_t make_vfsgid(struct mnt_idmap *idmap,
>  		     struct user_namespace *fs_userns, kgid_t kgid)
>  {
>  	gid_t gid;
> -	struct user_namespace *mnt_userns = idmap->owner;
>  
>  	if (idmap == &nop_mnt_idmap)
>  		return VFSGIDT_INIT(kgid);
> @@ -113,7 +118,7 @@ vfsgid_t make_vfsgid(struct mnt_idmap *idmap,
>  		gid = from_kgid(fs_userns, kgid);
>  	if (gid == (gid_t)-1)
>  		return INVALID_VFSGID;
> -	return VFSGIDT_INIT(make_kgid(mnt_userns, gid));
> +	return VFSGIDT_INIT_RAW(map_id_down(&idmap->gid_map, gid));
>  }
>  EXPORT_SYMBOL_GPL(make_vfsgid);
>  
> @@ -132,11 +137,10 @@ kuid_t from_vfsuid(struct mnt_idmap *idmap,
>  		   struct user_namespace *fs_userns, vfsuid_t vfsuid)
>  {
>  	uid_t uid;
> -	struct user_namespace *mnt_userns = idmap->owner;
>  
>  	if (idmap == &nop_mnt_idmap)
>  		return AS_KUIDT(vfsuid);
> -	uid = from_kuid(mnt_userns, AS_KUIDT(vfsuid));
> +	uid = map_id_up(&idmap->uid_map, __vfsuid_val(vfsuid));
>  	if (uid == (uid_t)-1)
>  		return INVALID_UID;
>  	if (initial_idmapping(fs_userns))
> @@ -160,11 +164,10 @@ kgid_t from_vfsgid(struct mnt_idmap *idmap,
>  		   struct user_namespace *fs_userns, vfsgid_t vfsgid)
>  {
>  	gid_t gid;
> -	struct user_namespace *mnt_userns = idmap->owner;
>  
>  	if (idmap == &nop_mnt_idmap)
>  		return AS_KGIDT(vfsgid);
> -	gid = from_kgid(mnt_userns, AS_KGIDT(vfsgid));
> +	gid = map_id_up(&idmap->gid_map, __vfsgid_val(vfsgid));
>  	if (gid == (gid_t)-1)
>  		return INVALID_GID;
>  	if (initial_idmapping(fs_userns))
> @@ -195,16 +198,91 @@ int vfsgid_in_group_p(vfsgid_t vfsgid)
>  #endif
>  EXPORT_SYMBOL_GPL(vfsgid_in_group_p);
>  
> +static int copy_mnt_idmap(struct uid_gid_map *map_from,
> +			  struct uid_gid_map *map_to)
> +{
> +	struct uid_gid_extent *forward, *reverse;
> +	u32 nr_extents = READ_ONCE(map_from->nr_extents);
> +	/* Pairs with smp_wmb() when writing the idmapping. */
> +	smp_rmb();
> +
> +	/*
> +	 * Don't blindly copy @map_to into @map_from if nr_extents is
> +	 * smaller or equal to UID_GID_MAP_MAX_BASE_EXTENTS. Since we
> +	 * read @nr_extents someone could have written an idmapping and
> +	 * then we might end up with inconsistent data. So just don't do
> +	 * anything at all.
> +	 */
> +	if (nr_extents == 0)
> +		return 0;
> +
> +	/*
> +	 * Here we know that nr_extents is greater than zero which means
> +	 * a map has been written. Since idmappings can't be changed
> +	 * once they have been written we know that we can safely copy
> +	 * from @map_to into @map_from.
> +	 */
> +
> +	if (nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) {
> +		*map_to = *map_from;
> +		return 0;
> +	}
> +
> +	forward = kmemdup(map_from->forward,
> +			  nr_extents * sizeof(struct uid_gid_extent),
> +			  GFP_KERNEL_ACCOUNT);
> +	if (!forward)
> +		return -ENOMEM;
> +
> +	reverse = kmemdup(map_from->reverse,
> +			  nr_extents * sizeof(struct uid_gid_extent),
> +			  GFP_KERNEL_ACCOUNT);
> +	if (!reverse) {
> +		kfree(forward);
> +		return -ENOMEM;
> +	}
> +
> +	/*
> +	 * The idmapping isn't exposed anywhere so we don't need to care
> +	 * about ordering between extent pointers and @nr_extents
> +	 * initialization.
> +	 */
> +	map_to->forward = forward;
> +	map_to->reverse = reverse;
> +	map_to->nr_extents = nr_extents;
> +	return 0;
> +}
> +
> +static void free_mnt_idmap(struct mnt_idmap *idmap)
> +{
> +	if (idmap->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
> +		kfree(idmap->uid_map.forward);
> +		kfree(idmap->uid_map.reverse);
> +	}
> +	if (idmap->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
> +		kfree(idmap->gid_map.forward);
> +		kfree(idmap->gid_map.reverse);
> +	}
> +	kfree(idmap);
> +}
> +
>  struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns)
>  {
>  	struct mnt_idmap *idmap;
> +	int ret;
>  
>  	idmap = kzalloc(sizeof(struct mnt_idmap), GFP_KERNEL_ACCOUNT);
>  	if (!idmap)
>  		return ERR_PTR(-ENOMEM);
>  
> -	idmap->owner = get_user_ns(mnt_userns);
>  	refcount_set(&idmap->count, 1);
> +	ret = copy_mnt_idmap(&mnt_userns->uid_map, &idmap->uid_map);
> +	if (!ret)
> +		ret = copy_mnt_idmap(&mnt_userns->gid_map, &idmap->gid_map);
> +	if (ret) {
> +		free_mnt_idmap(idmap);
> +		idmap = ERR_PTR(ret);
> +	}
>  	return idmap;
>  }
>  
> @@ -234,9 +312,7 @@ EXPORT_SYMBOL_GPL(mnt_idmap_get);
>   */
>  void mnt_idmap_put(struct mnt_idmap *idmap)
>  {
> -	if (idmap != &nop_mnt_idmap && refcount_dec_and_test(&idmap->count)) {
> -		put_user_ns(idmap->owner);
> -		kfree(idmap);
> -	}
> +	if (idmap != &nop_mnt_idmap && refcount_dec_and_test(&idmap->count))
> +		free_mnt_idmap(idmap);
>  }
>  EXPORT_SYMBOL_GPL(mnt_idmap_put);
> diff --git a/include/linux/uidgid.h b/include/linux/uidgid.h
> index b0542cd11aeb..7806e93b907d 100644
> --- a/include/linux/uidgid.h
> +++ b/include/linux/uidgid.h
> @@ -17,6 +17,7 @@
>  
>  struct user_namespace;
>  extern struct user_namespace init_user_ns;
> +struct uid_gid_map;
>  
>  typedef struct {
>  	uid_t val;
> @@ -138,6 +139,9 @@ static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid)
>  	return from_kgid(ns, gid) != (gid_t) -1;
>  }
>  
> +u32 map_id_down(struct uid_gid_map *map, u32 id);
> +u32 map_id_up(struct uid_gid_map *map, u32 id);
> +
>  #else
>  
>  static inline kuid_t make_kuid(struct user_namespace *from, uid_t uid)
> @@ -186,6 +190,15 @@ static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid)
>  	return gid_valid(gid);
>  }
>  
> +static inline u32 map_id_down(struct uid_gid_map *map, u32 id)
> +{
> +	return id;
> +}
> +
> +static inline u32 map_id_up(struct uid_gid_map *map, u32 id);

You accidentally put a ; here, and then fix it up in the next patch, it needs to
be fixed here.  Thanks,

Josef
Christian Brauner Nov. 22, 2023, 2:34 p.m. UTC | #2
> You accidentally put a ; here, and then fix it up in the next patch, it needs to
> be fixed here.  Thanks,

Bah, fixed this now. Thanks!
Josef Bacik Nov. 22, 2023, 3:14 p.m. UTC | #3
On Wed, Nov 22, 2023 at 03:34:39PM +0100, Christian Brauner wrote:
> > You accidentally put a ; here, and then fix it up in the next patch, it needs to
> > be fixed here.  Thanks,
> 
> Bah, fixed this now. Thanks!

You can add

Reviewed-by: Josef Bacik <josef@toxicpanda.com>

Thanks,

Josef
diff mbox series

Patch

diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c
index 35d78cb3c38a..64c5205e2b5e 100644
--- a/fs/mnt_idmapping.c
+++ b/fs/mnt_idmapping.c
@@ -9,8 +9,16 @@ 
 
 #include "internal.h"
 
+/*
+ * Outside of this file vfs{g,u}id_t are always created from k{g,u}id_t,
+ * never from raw values. These are just internal helpers.
+ */
+#define VFSUIDT_INIT_RAW(val) (vfsuid_t){ val }
+#define VFSGIDT_INIT_RAW(val) (vfsgid_t){ val }
+
 struct mnt_idmap {
-	struct user_namespace *owner;
+	struct uid_gid_map uid_map;
+	struct uid_gid_map gid_map;
 	refcount_t count;
 };
 
@@ -20,7 +28,6 @@  struct mnt_idmap {
  * mapped to {g,u}id 1, [...], {g,u}id 1000 to {g,u}id 1000, [...].
  */
 struct mnt_idmap nop_mnt_idmap = {
-	.owner	= &init_user_ns,
 	.count	= REFCOUNT_INIT(1),
 };
 EXPORT_SYMBOL_GPL(nop_mnt_idmap);
@@ -65,7 +72,6 @@  vfsuid_t make_vfsuid(struct mnt_idmap *idmap,
 		     kuid_t kuid)
 {
 	uid_t uid;
-	struct user_namespace *mnt_userns = idmap->owner;
 
 	if (idmap == &nop_mnt_idmap)
 		return VFSUIDT_INIT(kuid);
@@ -75,7 +81,7 @@  vfsuid_t make_vfsuid(struct mnt_idmap *idmap,
 		uid = from_kuid(fs_userns, kuid);
 	if (uid == (uid_t)-1)
 		return INVALID_VFSUID;
-	return VFSUIDT_INIT(make_kuid(mnt_userns, uid));
+	return VFSUIDT_INIT_RAW(map_id_down(&idmap->uid_map, uid));
 }
 EXPORT_SYMBOL_GPL(make_vfsuid);
 
@@ -103,7 +109,6 @@  vfsgid_t make_vfsgid(struct mnt_idmap *idmap,
 		     struct user_namespace *fs_userns, kgid_t kgid)
 {
 	gid_t gid;
-	struct user_namespace *mnt_userns = idmap->owner;
 
 	if (idmap == &nop_mnt_idmap)
 		return VFSGIDT_INIT(kgid);
@@ -113,7 +118,7 @@  vfsgid_t make_vfsgid(struct mnt_idmap *idmap,
 		gid = from_kgid(fs_userns, kgid);
 	if (gid == (gid_t)-1)
 		return INVALID_VFSGID;
-	return VFSGIDT_INIT(make_kgid(mnt_userns, gid));
+	return VFSGIDT_INIT_RAW(map_id_down(&idmap->gid_map, gid));
 }
 EXPORT_SYMBOL_GPL(make_vfsgid);
 
@@ -132,11 +137,10 @@  kuid_t from_vfsuid(struct mnt_idmap *idmap,
 		   struct user_namespace *fs_userns, vfsuid_t vfsuid)
 {
 	uid_t uid;
-	struct user_namespace *mnt_userns = idmap->owner;
 
 	if (idmap == &nop_mnt_idmap)
 		return AS_KUIDT(vfsuid);
-	uid = from_kuid(mnt_userns, AS_KUIDT(vfsuid));
+	uid = map_id_up(&idmap->uid_map, __vfsuid_val(vfsuid));
 	if (uid == (uid_t)-1)
 		return INVALID_UID;
 	if (initial_idmapping(fs_userns))
@@ -160,11 +164,10 @@  kgid_t from_vfsgid(struct mnt_idmap *idmap,
 		   struct user_namespace *fs_userns, vfsgid_t vfsgid)
 {
 	gid_t gid;
-	struct user_namespace *mnt_userns = idmap->owner;
 
 	if (idmap == &nop_mnt_idmap)
 		return AS_KGIDT(vfsgid);
-	gid = from_kgid(mnt_userns, AS_KGIDT(vfsgid));
+	gid = map_id_up(&idmap->gid_map, __vfsgid_val(vfsgid));
 	if (gid == (gid_t)-1)
 		return INVALID_GID;
 	if (initial_idmapping(fs_userns))
@@ -195,16 +198,91 @@  int vfsgid_in_group_p(vfsgid_t vfsgid)
 #endif
 EXPORT_SYMBOL_GPL(vfsgid_in_group_p);
 
+static int copy_mnt_idmap(struct uid_gid_map *map_from,
+			  struct uid_gid_map *map_to)
+{
+	struct uid_gid_extent *forward, *reverse;
+	u32 nr_extents = READ_ONCE(map_from->nr_extents);
+	/* Pairs with smp_wmb() when writing the idmapping. */
+	smp_rmb();
+
+	/*
+	 * Don't blindly copy @map_to into @map_from if nr_extents is
+	 * smaller or equal to UID_GID_MAP_MAX_BASE_EXTENTS. Since we
+	 * read @nr_extents someone could have written an idmapping and
+	 * then we might end up with inconsistent data. So just don't do
+	 * anything at all.
+	 */
+	if (nr_extents == 0)
+		return 0;
+
+	/*
+	 * Here we know that nr_extents is greater than zero which means
+	 * a map has been written. Since idmappings can't be changed
+	 * once they have been written we know that we can safely copy
+	 * from @map_to into @map_from.
+	 */
+
+	if (nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) {
+		*map_to = *map_from;
+		return 0;
+	}
+
+	forward = kmemdup(map_from->forward,
+			  nr_extents * sizeof(struct uid_gid_extent),
+			  GFP_KERNEL_ACCOUNT);
+	if (!forward)
+		return -ENOMEM;
+
+	reverse = kmemdup(map_from->reverse,
+			  nr_extents * sizeof(struct uid_gid_extent),
+			  GFP_KERNEL_ACCOUNT);
+	if (!reverse) {
+		kfree(forward);
+		return -ENOMEM;
+	}
+
+	/*
+	 * The idmapping isn't exposed anywhere so we don't need to care
+	 * about ordering between extent pointers and @nr_extents
+	 * initialization.
+	 */
+	map_to->forward = forward;
+	map_to->reverse = reverse;
+	map_to->nr_extents = nr_extents;
+	return 0;
+}
+
+static void free_mnt_idmap(struct mnt_idmap *idmap)
+{
+	if (idmap->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
+		kfree(idmap->uid_map.forward);
+		kfree(idmap->uid_map.reverse);
+	}
+	if (idmap->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
+		kfree(idmap->gid_map.forward);
+		kfree(idmap->gid_map.reverse);
+	}
+	kfree(idmap);
+}
+
 struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns)
 {
 	struct mnt_idmap *idmap;
+	int ret;
 
 	idmap = kzalloc(sizeof(struct mnt_idmap), GFP_KERNEL_ACCOUNT);
 	if (!idmap)
 		return ERR_PTR(-ENOMEM);
 
-	idmap->owner = get_user_ns(mnt_userns);
 	refcount_set(&idmap->count, 1);
+	ret = copy_mnt_idmap(&mnt_userns->uid_map, &idmap->uid_map);
+	if (!ret)
+		ret = copy_mnt_idmap(&mnt_userns->gid_map, &idmap->gid_map);
+	if (ret) {
+		free_mnt_idmap(idmap);
+		idmap = ERR_PTR(ret);
+	}
 	return idmap;
 }
 
@@ -234,9 +312,7 @@  EXPORT_SYMBOL_GPL(mnt_idmap_get);
  */
 void mnt_idmap_put(struct mnt_idmap *idmap)
 {
-	if (idmap != &nop_mnt_idmap && refcount_dec_and_test(&idmap->count)) {
-		put_user_ns(idmap->owner);
-		kfree(idmap);
-	}
+	if (idmap != &nop_mnt_idmap && refcount_dec_and_test(&idmap->count))
+		free_mnt_idmap(idmap);
 }
 EXPORT_SYMBOL_GPL(mnt_idmap_put);
diff --git a/include/linux/uidgid.h b/include/linux/uidgid.h
index b0542cd11aeb..7806e93b907d 100644
--- a/include/linux/uidgid.h
+++ b/include/linux/uidgid.h
@@ -17,6 +17,7 @@ 
 
 struct user_namespace;
 extern struct user_namespace init_user_ns;
+struct uid_gid_map;
 
 typedef struct {
 	uid_t val;
@@ -138,6 +139,9 @@  static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid)
 	return from_kgid(ns, gid) != (gid_t) -1;
 }
 
+u32 map_id_down(struct uid_gid_map *map, u32 id);
+u32 map_id_up(struct uid_gid_map *map, u32 id);
+
 #else
 
 static inline kuid_t make_kuid(struct user_namespace *from, uid_t uid)
@@ -186,6 +190,15 @@  static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid)
 	return gid_valid(gid);
 }
 
+static inline u32 map_id_down(struct uid_gid_map *map, u32 id)
+{
+	return id;
+}
+
+static inline u32 map_id_up(struct uid_gid_map *map, u32 id);
+{
+	return id;
+}
 #endif /* CONFIG_USER_NS */
 
 #endif /* _LINUX_UIDGID_H */
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index eabe8bcc7042..a649e58e3b6a 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -332,7 +332,7 @@  static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
 	return id;
 }
 
-static u32 map_id_down(struct uid_gid_map *map, u32 id)
+u32 map_id_down(struct uid_gid_map *map, u32 id)
 {
 	return map_id_range_down(map, id, 1);
 }
@@ -375,7 +375,7 @@  map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id)
 		       sizeof(struct uid_gid_extent), cmp_map_id);
 }
 
-static u32 map_id_up(struct uid_gid_map *map, u32 id)
+u32 map_id_up(struct uid_gid_map *map, u32 id)
 {
 	struct uid_gid_extent *extent;
 	unsigned extents = map->nr_extents;