diff mbox series

[PATCH/RFC] mount.nfs: handle EADDRINUSE from mount(2)

Message ID 164816808982.6096.11435363819568479436@noble.neil.brown.name (mailing list archive)
State New, archived
Headers show
Series [PATCH/RFC] mount.nfs: handle EADDRINUSE from mount(2) | expand

Commit Message

NeilBrown March 25, 2022, 12:28 a.m. UTC
[[This is the followup to the kernel patch I recently posted.
  It changes the behaviour of incorrectly configured containers to
  get unique client identities - so lease stealing doesn't happen
  so data corruption is avoided - but does not provide stable
  identities, so reboot recovery is not ideal.
  What is best to do when configuration is wrong?  Provide best service
  possible despite it not being perfect, or provide no service so the
  config will not get fixed.  I could be swayed either way.
]]

When NFS filesystems are mounted in different network namespaces, each
network namespace must provide a different hostname (via accompanying
UTS namespace) or different identifier (via sysfs).

If the kernel finds that the identity that it constructs is already in
use in a different namespace it will fail the mount with EADDRINUSE.

This patch catches that error and, if the sysfs identifier is unset,
writes a random string and retries.  This allows the mount to complete
safely even when misconfigured.  The random string has 128 bits of
entropy and so is extremely likely to be globally unique.

A lock is taken on the identifier file, and it is only updated if no
identifier is set.  Thus two concurrent mount attempts will not generate
different identities.  The mount is retried in any case as a race may
have updated the identifier while waiting for the lock.

This is not an ideal solution as an unclean restart of the host cannot
be detected by the server except by a lease timeout.  If the identifier
is configured correctly and is stable across restarts, the server can
detect the restart immediately.  Consequently a warning message is
generated to encourage correct configuration.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 utils/mount/stropts.c | 54 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 53 insertions(+), 1 deletion(-)

Comments

Steve Dickson April 6, 2022, 3:57 p.m. UTC | #1
Hey!

My apologies for taking so long to get to this....

On 3/24/22 8:28 PM, NeilBrown wrote:
> 
> [[This is the followup to the kernel patch I recently posted.
>    It changes the behaviour of incorrectly configured containers to
>    get unique client identities - so lease stealing doesn't happen
>    so data corruption is avoided - but does not provide stable
>    identities, so reboot recovery is not ideal.
Which patch are you referring to and did it make it in?

>    What is best to do when configuration is wrong?  Provide best service
>    possible despite it not being perfect, or provide no service so the
>    config will not get fixed.  I could be swayed either way.
> ]]
Maybe a little both? :-) Flag the broken config and continue on
if possible... but flagging the broken config is more critical... IMHO.

> 
> When NFS filesystems are mounted in different network namespaces, each
> network namespace must provide a different hostname (via accompanying
> UTS namespace) or different identifier (via sysfs).
> 
> If the kernel finds that the identity that it constructs is already in
> use in a different namespace it will fail the mount with EADDRINUSE.
> 
> This patch catches that error and, if the sysfs identifier is unset,
> writes a random string and retries.  This allows the mount to complete
> safely even when misconfigured.  The random string has 128 bits of
> entropy and so is extremely likely to be globally unique.
> 
> A lock is taken on the identifier file, and it is only updated if no
> identifier is set.  Thus two concurrent mount attempts will not generate
> different identities.  The mount is retried in any case as a race may
> have updated the identifier while waiting for the lock.
> 
> This is not an ideal solution as an unclean restart of the host cannot
> be detected by the server except by a lease timeout.  If the identifier
> is configured correctly and is stable across restarts, the server can
> detect the restart immediately.  Consequently a warning message is
> generated to encourage correct configuration.
Just curious... How did you test this patch? I would like
to build an env to generate this type of error.

steved.

> 
> Signed-off-by: NeilBrown <neilb@suse.de>
> ---
>   utils/mount/stropts.c | 54 ++++++++++++++++++++++++++++++++++++++++++-
>   1 file changed, 53 insertions(+), 1 deletion(-)
> 
> diff --git a/utils/mount/stropts.c b/utils/mount/stropts.c
> index dbdd11e76b41..84266830b84a 100644
> --- a/utils/mount/stropts.c
> +++ b/utils/mount/stropts.c
> @@ -32,6 +32,7 @@
>   
>   #include <sys/socket.h>
>   #include <sys/mount.h>
> +#include <sys/file.h>
>   #include <netinet/in.h>
>   #include <arpa/inet.h>
>   
> @@ -749,6 +750,50 @@ out:
>   	return ret;
>   }
>   
> +#define ENTROPY_BITS 128
> +static void set_random_identifier(void)
> +{
> +	int fd = open("/sys/fs/nfs/net/nfs_client/identifier", O_RDWR);
> +	int rfd = -1;
> +	unsigned char rbuf[ENTROPY_BITS / 8];
> +	char buf[sizeof(rbuf)*2 + 2];
> +	int n, rn;
> +	int cnt = 1000;
> +
> +	if (fd < 0)
> +		goto out;
> +	/* wait at most one second */
> +	while (flock(fd, LOCK_EX | LOCK_NB) != 0) {
> +		cnt -= 20;
> +		if (cnt < 0)
> +			goto out;
> +		usleep(20 * 1000);
> +	}
> +	n = read(fd, buf, sizeof(buf)-1);
> +	if (n <= 0)
> +		goto out;
> +	buf[n] = 0;
> +	if (n != 7 || strcmp(buf, "(null)\n") != 0)
> +		/* already set */
> +		goto out;
> +	rfd = open("/dev/urandom", O_RDONLY);
> +	if (rfd < 0)
> +		goto out;
> +	rn = read(rfd, rbuf, sizeof(rbuf));
> +	if (rn < (int)sizeof(rbuf))
> +		goto out;
> +	for (n = 0; n < rn; n++)
> +		snprintf(&buf[n*2], 3, "%02x", rbuf[n]);
> +	strcpy(&buf[n*2], "\n");
> +	lseek(fd, SEEK_SET, 0);
> +	write(fd, buf, strlen(buf));
> +out:
> +	if (rfd >= 0)
> +		close(rfd);
> +	if (fd >= 0)
> +		close(fd);
> +}
> +
>   static int nfs_do_mount_v4(struct nfsmount_info *mi,
>   		struct sockaddr *sap, socklen_t salen)
>   {
> @@ -844,7 +889,14 @@ static int nfs_do_mount_v4(struct nfsmount_info *mi,
>   			progname, extra_opts);
>   
>   	result = nfs_sys_mount(mi, options);
> -
> +	if (!result && errno == EADDRINUSE) {
> +		/* client id is not unique, try to create unique id
> +		 * and try again
> +		 */
> +		set_random_identifier();
> +		xlog_warn("Retry mount with randomized identifier. Please configure a stable identifier.");
> +		result = nfs_sys_mount(mi, options);
> +	}
>   	/*
>   	 * If success, update option string to be recorded in /etc/mtab.
>   	 */
NeilBrown April 6, 2022, 10:51 p.m. UTC | #2
On Thu, 07 Apr 2022, Steve Dickson wrote:
> Hey!
> 
> My apologies for taking so long to get to this....

Better late than never !! :-)

> 
> On 3/24/22 8:28 PM, NeilBrown wrote:
> > 
> > [[This is the followup to the kernel patch I recently posted.
> >    It changes the behaviour of incorrectly configured containers to
> >    get unique client identities - so lease stealing doesn't happen
> >    so data corruption is avoided - but does not provide stable
> >    identities, so reboot recovery is not ideal.
> Which patch are you referring to and did it make it in?
> 

https://lore.kernel.org/all/164816787898.6096.12819715693501838662@noble.neil.brown.name/

It hasn't landed, and I didn't expect it to (yet).  It is a basis for
discussion and experimentation.


> >    What is best to do when configuration is wrong?  Provide best service
> >    possible despite it not being perfect, or provide no service so the
> >    config will not get fixed.  I could be swayed either way.
> > ]]
> Maybe a little both? :-) Flag the broken config and continue on
> if possible... but flagging the broken config is more critical... IMHO.

I tend to agree, but it isn't clear how best to flag something in a way
that the flag will actually be seen.

> 
> > 
> > When NFS filesystems are mounted in different network namespaces, each
> > network namespace must provide a different hostname (via accompanying
> > UTS namespace) or different identifier (via sysfs).
> > 
> > If the kernel finds that the identity that it constructs is already in
> > use in a different namespace it will fail the mount with EADDRINUSE.
> > 
> > This patch catches that error and, if the sysfs identifier is unset,
> > writes a random string and retries.  This allows the mount to complete
> > safely even when misconfigured.  The random string has 128 bits of
> > entropy and so is extremely likely to be globally unique.
> > 
> > A lock is taken on the identifier file, and it is only updated if no
> > identifier is set.  Thus two concurrent mount attempts will not generate
> > different identities.  The mount is retried in any case as a race may
> > have updated the identifier while waiting for the lock.
> > 
> > This is not an ideal solution as an unclean restart of the host cannot
> > be detected by the server except by a lease timeout.  If the identifier
> > is configured correctly and is stable across restarts, the server can
> > detect the restart immediately.  Consequently a warning message is
> > generated to encourage correct configuration.
> Just curious... How did you test this patch? I would like
> to build an env to generate this type of error.

ip netns add foo
ip netns exec foo  ip link set dev lo up
ip link add veth0 type veth peer name veth1
ip link set veth1 netns foo

ip netns exec foo ifconfig veth1 10.1.1.1/24 up
ifconfig veth0 10.1.1.2/24 up

ip netns exec foo ip route add default via 10.1.1.2
echo 1 > /proc/sys/net/ipv4/ip_forward
iptables -t nat -A POSTROUTING -j SNAT --to-source 10.0.2.15 -s 10.1.1.0/24

## At this point we have a separate network namespace which has network
## connectivity to anything the host can reach.
## The "--to-source" address is the IP address of the host - a qemu
## instance in this case.

# mount in the primary namespace
mount -o vers=4.1 server:/path /mnt

# attempt to mount in the alternate name space
ip netns exec foo mount -o vers=4.1 server:/path /mnt2

Note that "ip netns exec" creates a temporary mount namespace, so when
it exits, /mnt2 will no longer be mounted.  If you want to do more than
one thing you might need to
   ip netns exec foo /bin/sh my-shell-script
or
   ip netns exec foo /bin/bash -i
   # in a separate terminal window.

NeilBrown
diff mbox series

Patch

diff --git a/utils/mount/stropts.c b/utils/mount/stropts.c
index dbdd11e76b41..84266830b84a 100644
--- a/utils/mount/stropts.c
+++ b/utils/mount/stropts.c
@@ -32,6 +32,7 @@ 
 
 #include <sys/socket.h>
 #include <sys/mount.h>
+#include <sys/file.h>
 #include <netinet/in.h>
 #include <arpa/inet.h>
 
@@ -749,6 +750,50 @@  out:
 	return ret;
 }
 
+#define ENTROPY_BITS 128
+static void set_random_identifier(void)
+{
+	int fd = open("/sys/fs/nfs/net/nfs_client/identifier", O_RDWR);
+	int rfd = -1;
+	unsigned char rbuf[ENTROPY_BITS / 8];
+	char buf[sizeof(rbuf)*2 + 2];
+	int n, rn;
+	int cnt = 1000;
+
+	if (fd < 0)
+		goto out;
+	/* wait at most one second */
+	while (flock(fd, LOCK_EX | LOCK_NB) != 0) {
+		cnt -= 20;
+		if (cnt < 0)
+			goto out;
+		usleep(20 * 1000);
+	}
+	n = read(fd, buf, sizeof(buf)-1);
+	if (n <= 0)
+		goto out;
+	buf[n] = 0;
+	if (n != 7 || strcmp(buf, "(null)\n") != 0)
+		/* already set */
+		goto out;
+	rfd = open("/dev/urandom", O_RDONLY);
+	if (rfd < 0)
+		goto out;
+	rn = read(rfd, rbuf, sizeof(rbuf));
+	if (rn < (int)sizeof(rbuf))
+		goto out;
+	for (n = 0; n < rn; n++)
+		snprintf(&buf[n*2], 3, "%02x", rbuf[n]);
+	strcpy(&buf[n*2], "\n");
+	lseek(fd, SEEK_SET, 0);
+	write(fd, buf, strlen(buf));
+out:
+	if (rfd >= 0)
+		close(rfd);
+	if (fd >= 0)
+		close(fd);
+}
+
 static int nfs_do_mount_v4(struct nfsmount_info *mi,
 		struct sockaddr *sap, socklen_t salen)
 {
@@ -844,7 +889,14 @@  static int nfs_do_mount_v4(struct nfsmount_info *mi,
 			progname, extra_opts);
 
 	result = nfs_sys_mount(mi, options);
-
+	if (!result && errno == EADDRINUSE) {
+		/* client id is not unique, try to create unique id
+		 * and try again
+		 */
+		set_random_identifier();
+		xlog_warn("Retry mount with randomized identifier. Please configure a stable identifier.");
+		result = nfs_sys_mount(mi, options);
+	}
 	/*
 	 * If success, update option string to be recorded in /etc/mtab.
 	 */