diff mbox series

[RFC,iproute2-next,4/5] ip: Also create and persist mount namespace when creating netns

Message ID 20231009182753.851551-5-toke@redhat.com (mailing list archive)
State RFC
Delegated to: David Ahern
Headers show
Series Persisting of mount namespaces along with network namespaces | expand

Checks

Context Check Description
netdev/tree_selection success Not a local patch

Commit Message

Toke Høiland-Jørgensen Oct. 9, 2023, 6:27 p.m. UTC
When creating a new network namespace, persist not only the network namespace
reference itself, but also create and persist a new mount namespace that is
paired with the network namespace. This means that multiple subsequent
invocations of 'ip netns exec' will reuse the same mount namespace instead of
creating a new namespace on every entry, as was the behaviour before this patch.

The persistent mount namespace has the benefit that any new mounts created
inside the namespace will persist. Most notably, this is useful when using bpffs
instances along with 'ip netns', as these were previously transient to a single
'ip netns' invocation.

To preserve backwards compatibility, when changing namespaces we will fall back
to the old behaviour of creating a new mount namespace when switching netns, if
we can't find a persisted namespace to enter. This can happen if the netns
instance was created with a previous version of iproute2 that doesn't persist
the mount namespace.

One caveat of the mount namespace persistence is that we can't make the
containing directory mount shared, the way we do with the netns mounts. This
means that if 'ip netns del' is invoked *inside* a namespace created with 'ip
netns', the mount namespace reference will not be deleted and will stick around
in the original mount namespace where it was created. This is unavoidable
because it is not possible to create a bind-mounted reference to a mount
namespace inside that same mount namespace (as that would create a circular
reference).

In such a situation, we may end up with the network namespace reference being
removed but the mount namespace reference sticking around (the same thing can
happen if 'ip netns del' is executed with an older version of iproute2). In this
situation, a subsequent 'ip netns add' with the same namespace name will end up
reusing the old mount namespace reference.

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
---
 Makefile        |  2 ++
 ip/ipnetns.c    | 64 +++++++++++++++++++++++++++++++++++++++++++------
 lib/namespace.c |  8 ++++++-
 3 files changed, 66 insertions(+), 8 deletions(-)
diff mbox series

Patch

diff --git a/Makefile b/Makefile
index 5c559c8dc805..aeb1ddc53c6a 100644
--- a/Makefile
+++ b/Makefile
@@ -19,6 +19,7 @@  SBINDIR?=/sbin
 CONF_ETC_DIR?=/etc/iproute2
 CONF_USR_DIR?=$(LIBDIR)/iproute2
 NETNS_RUN_DIR?=/var/run/netns
+MNTNS_RUN_DIR?=/var/run/netns-mnt
 NETNS_ETC_DIR?=/etc/netns
 DATADIR?=$(PREFIX)/share
 HDRDIR?=$(PREFIX)/include/iproute2
@@ -41,6 +42,7 @@  endif
 DEFINES+=-DCONF_USR_DIR=\"$(CONF_USR_DIR)\" \
          -DCONF_ETC_DIR=\"$(CONF_ETC_DIR)\" \
          -DNETNS_RUN_DIR=\"$(NETNS_RUN_DIR)\" \
+         -DMNTNS_RUN_DIR=\"$(MNTNS_RUN_DIR)\" \
          -DNETNS_ETC_DIR=\"$(NETNS_ETC_DIR)\" \
          -DCONF_COLOR=$(CONF_COLOR)
 
diff --git a/ip/ipnetns.c b/ip/ipnetns.c
index 529790482683..551819577755 100644
--- a/ip/ipnetns.c
+++ b/ip/ipnetns.c
@@ -733,13 +733,24 @@  static int netns_identify(int argc, char **argv)
 
 static int on_netns_del(char *nsname, void *arg)
 {
-	char netns_path[PATH_MAX];
+	char ns_path[PATH_MAX];
+	struct stat st;
+
+	snprintf(ns_path, sizeof(ns_path), "%s/%s", MNTNS_RUN_DIR, nsname);
+	if (!stat(ns_path, &st)) { /* may not exist if created by old iproute2 */
+		umount2(ns_path, MNT_DETACH);
+		if (unlink(ns_path) < 0) {
+			fprintf(stderr, "Cannot remove namespace file \"%s\": %s\n",
+				ns_path, strerror(errno));
+			return -1;
+		}
+	}
 
-	snprintf(netns_path, sizeof(netns_path), "%s/%s", NETNS_RUN_DIR, nsname);
-	umount2(netns_path, MNT_DETACH);
-	if (unlink(netns_path) < 0) {
+	snprintf(ns_path, sizeof(ns_path), "%s/%s", NETNS_RUN_DIR, nsname);
+	umount2(ns_path, MNT_DETACH);
+	if (unlink(ns_path) < 0) {
 		fprintf(stderr, "Cannot remove namespace file \"%s\": %s\n",
-			netns_path, strerror(errno));
+			ns_path, strerror(errno));
 		return -1;
 	}
 	return 0;
@@ -885,17 +896,46 @@  static int bind_ns_file(const char *parent, const char *nsfile,
 	return 0;
 }
 
+static ino_t get_mnt_ino(pid_t pid)
+{
+	char path[PATH_MAX];
+	struct stat st;
+
+	snprintf(path, sizeof(path), "/proc/%u/ns/mnt", (unsigned) pid);
+
+	if (stat(path, &st) != 0) {
+		fprintf(stderr, "stat of %s failed: %s\n",
+			path, strerror(errno));
+		exit(EXIT_FAILURE);
+	}
+	return st.st_ino;
+}
+
 static pid_t bind_ns_files_from_child(const char *ns_name, pid_t target_pid,
 				      int *fd)
 {
+	ino_t mnt_ino;
 	pid_t child;
 
+	mnt_ino = get_mnt_ino(getpid());
+
 	child = fork_and_wait(fd);
 	if (child)
 		return child;
 
 	if (bind_ns_file(NETNS_RUN_DIR, "net", ns_name, target_pid))
 		exit(EXIT_FAILURE);
+
+	/* We can only bind the mount namespace reference if the target pid is
+	 * actually in a different mount namespace than ourselves. We ignore any
+	 * errors in creating the mount namespace reference because an old
+	 * namespace mount may be present if a network namespace with the same
+	 * name was previously removed by an older version of iproute2; in this
+	 * case that old reference will just be reused.
+	 */
+	if (mnt_ino != get_mnt_ino(target_pid))
+		bind_ns_file(MNTNS_RUN_DIR, "mnt", ns_name, target_pid);
+
 	exit(EXIT_SUCCESS);
 }
 
@@ -1003,8 +1043,13 @@  static int netns_add(int argc, char **argv, bool create)
 	 * unmounting a network namespace file in one namespace will unmount the
 	 * network namespace file in all namespaces allowing the network
 	 * namespace to be freed sooner.
+	 *
+	 * The mount namespace directory cannot be shared because it's not
+	 * possible to mount references to a mount namespace inside that
+	 * namespace itself.
 	 */
-	if (prepare_ns_mount_dir(NETNS_RUN_DIR, MS_SHARED))
+	if (prepare_ns_mount_dir(NETNS_RUN_DIR, MS_SHARED) ||
+	    prepare_ns_mount_dir(MNTNS_RUN_DIR, MS_SLAVE))
 		return -1;
 
 	child = bind_ns_files_from_child(name, pid, &event_fd);
@@ -1012,12 +1057,17 @@  static int netns_add(int argc, char **argv, bool create)
 		exit(EXIT_FAILURE);
 
 	if (create) {
-		if (unshare(CLONE_NEWNET) < 0) {
+		if (unshare(CLONE_NEWNET | CLONE_NEWNS) < 0) {
 			fprintf(stderr, "Failed to create a new network namespace \"%s\": %s\n",
 				name, strerror(errno));
 			close(event_fd);
 			exit(EXIT_FAILURE);
 		}
+
+		if (prepare_mountns(name, false)) {
+			close(event_fd);
+			exit(EXIT_FAILURE);
+		}
 	}
 
 	return sync_with_child(child, event_fd);
diff --git a/lib/namespace.c b/lib/namespace.c
index 5e310762f34b..5f2449fb0003 100644
--- a/lib/namespace.c
+++ b/lib/namespace.c
@@ -127,7 +127,13 @@  int netns_switch(char *name)
 	if (switch_ns(NETNS_RUN_DIR, name, CLONE_NEWNET))
 		return -1;
 
-	return prepare_mountns(name, true);
+	/* Try to enter an existing persisted mount namespace. If this fails,
+	 * preserve the old behaviour of creating a new namespace on entry.
+	 */
+	if (switch_ns(MNTNS_RUN_DIR, name, CLONE_NEWNS))
+		return prepare_mountns(name, true);
+
+	return 0;
 }
 
 int netns_get_fd(const char *name)