diff mbox series

[6/6] cache: periodically retry requests that couldn't be answered.

Message ID 20231023021052.5258-7-neilb@suse.de (mailing list archive)
State New, archived
Headers show
Series fixes for error handling in nfsd_fh | expand

Commit Message

NeilBrown Oct. 23, 2023, 1:58 a.m. UTC
Requests from the kernel to map the fsid from a filehandle to a path
name sometimes cannot be answered because the filesystem isn't
available now but might be available later.

This happens if an export is marked "mountpoint" but the mountpoint
isn't currently mounted.  In this case it might get mounted in the
future.

It also happens if an NFS filesystem is being re-exported and the server
is unresponsive.  In that case (if it was mounted "softerr") we get
ETIMEDOUT from a stat() attempt and so cannot give either a positive or
negative response.

These cases are currently handled poorly.  No answer is returned to the
kernel so it will continue waiting for an answer - and never get one
even if the NFS server comes back or the mountpoint is mounted.

We cannot report a soft error to the kernel, so we must retry ourselves.

With this patch we record the request when the lookup fails with
dev_missing or similar and retry every 2 minutes.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 support/export/cache.c | 121 +++++++++++++++++++++++++++++++++++------
 1 file changed, 103 insertions(+), 18 deletions(-)
diff mbox series

Patch

diff --git a/support/export/cache.c b/support/export/cache.c
index a01eba4f6619..6c0a44a3a209 100644
--- a/support/export/cache.c
+++ b/support/export/cache.c
@@ -759,7 +759,15 @@  static struct addrinfo *lookup_client_addr(char *dom)
 	return ret;
 }
 
-static void nfsd_fh(int f)
+#define RETRY_SEC 120
+struct delayed {
+	char *message;
+	time_t last_attempt;
+	int f;
+	struct delayed *next;
+} *delayed;
+
+static int nfsd_handle_fh(int f, char *bp, int blen)
 {
 	/* request are:
 	 *  domain fsidtype fsid
@@ -777,21 +785,13 @@  static void nfsd_fh(int f)
 	nfs_export *exp;
 	int i;
 	int dev_missing = 0;
-	char buf[RPC_CHAN_BUF_SIZE], *bp;
-	int blen;
+	char buf[RPC_CHAN_BUF_SIZE];
 	int did_uncover = 0;
-
-	blen = cache_read(f, buf, sizeof(buf));
-	if (blen <= 0 || buf[blen-1] != '\n') return;
-	buf[blen-1] = 0;
-
-	xlog(D_CALL, "nfsd_fh: inbuf '%s'", buf);
-
-	bp = buf;
+	int ret = 0;
 
 	dom = malloc(blen);
 	if (dom == NULL)
-		return;
+		return ret;
 	if (qword_get(&bp, dom, blen) <= 0)
 		goto out;
 	if (qword_get_int(&bp, &fsidtype) != 0)
@@ -893,8 +893,10 @@  static void nfsd_fh(int f)
 		/* The missing dev could be what we want, so just be
 		 * quiet rather than returning stale yet
 		 */
-		if (dev_missing)
+		if (dev_missing) {
+			ret = 1;
 			goto out;
+		}
 	} else if (found->e_mountpoint &&
 	    !is_mountpoint(found->e_mountpoint[0]?
 			   found->e_mountpoint:
@@ -904,7 +906,7 @@  static void nfsd_fh(int f)
 		   xlog(L_WARNING, "%s not exported as %d not a mountpoint",
 		   found->e_path, found->e_mountpoint);
 		 */
-		/* FIXME we need to make sure we re-visit this later */
+		ret = 1;
 		goto out;
 	}
 
@@ -933,7 +935,68 @@  out:
 		free(found_path);
 	nfs_freeaddrinfo(ai);
 	free(dom);
-	xlog(D_CALL, "nfsd_fh: found %p path %s", found, found ? found->e_path : NULL);
+	if (!ret)
+		xlog(D_CALL, "nfsd_fh: found %p path %s",
+		     found, found ? found->e_path : NULL);
+	return ret;
+}
+
+static void nfsd_fh(int f)
+{
+	struct delayed *d, **dp;
+	char inbuf[RPC_CHAN_BUF_SIZE];
+	int blen;
+
+	blen = cache_read(f, inbuf, sizeof(inbuf));
+	if (blen <= 0 || inbuf[blen-1] != '\n') return;
+	inbuf[blen-1] = 0;
+
+	xlog(D_CALL, "nfsd_fh: inbuf '%s'", inbuf);
+
+	if (nfsd_handle_fh(f, inbuf, blen) == 0)
+		return;
+	/* We don't have a definitive answer to give the kernel.
+	 * This is because an export marked "mountpoint" isn't a
+	 * mountpoint, or because a stat of a mountpoint fails with
+	 * a strange error like ETIMEDOUT as is possible with an
+	 * NFS mount marked "softerr" which is being re-exported.
+	 *
+	 * We cannot tell the kernel to retry, so we have to
+	 * retry ourselves.
+	 */
+	d = malloc(sizeof(*d));
+
+	if (!d)
+		return;
+	d->message = strndup(inbuf, blen);
+	if (!d->message) {
+		free(d);
+		return;
+	}
+	d->f = f;
+	d->last_attempt = time(NULL);
+	d->next = NULL;
+	dp = &delayed;
+	while (*dp)
+		dp = &(*dp)->next;
+	*dp = d;
+}
+
+static void nfsd_retry_fh(struct delayed *d)
+{
+	struct delayed **dp;
+
+	if (nfsd_handle_fh(d->f, d->message, strlen(d->message)+1) == 0) {
+		free(d->message);
+		free(d);
+		return;
+	}
+	d->last_attempt = time(NULL);
+	d->next = NULL;
+	dp = &delayed;
+	while (*dp)
+		dp = &(*dp)->next;
+	*dp = d;
 }
 
 #ifdef HAVE_JUNCTION_SUPPORT
@@ -1512,7 +1575,7 @@  static void nfsd_export(int f)
 			 * This will cause it not to appear in the V4 Pseudo-root
 			 * and so a "mount" of this path will fail, just like with
 			 * V3.
-			 * And filehandle for this mountpoint from an earlier
+			 * Any filehandle for this mountpoint from an earlier
 			 * mount will block in nfsd.fh lookup.
 			 */
 			xlog(L_WARNING,
@@ -1610,6 +1673,7 @@  int cache_process(fd_set *readfds)
 {
 	fd_set fdset;
 	int	selret;
+	struct timeval tv = { 24*3600, 0 };
 
 	if (!readfds) {
 		FD_ZERO(&fdset);
@@ -1618,8 +1682,29 @@  int cache_process(fd_set *readfds)
 	cache_set_fds(readfds);
 	v4clients_set_fds(readfds);
 
-	selret = select(FD_SETSIZE, readfds,
-			(void *) 0, (void *) 0, (struct timeval *) 0);
+	if (delayed) {
+		time_t now = time(NULL);
+		time_t delay;
+		if (delayed->last_attempt > now)
+			/* Clock updated - retry immediately */
+			delayed->last_attempt = now - RETRY_SEC;
+		delay = delayed->last_attempt + RETRY_SEC - now;
+		if (delay < 0)
+			delay = 0;
+		tv.tv_sec = delay;
+	}
+	selret = select(FD_SETSIZE, readfds, NULL, NULL, &tv);
+
+	if (delayed) {
+		time_t now = time(NULL);
+		struct delayed *d = delayed;
+
+		if (d->last_attempt + RETRY_SEC <= now) {
+			delayed = d->next;
+			d->next = NULL;
+			nfsd_retry_fh(d);
+		}
+	}
 
 	switch (selret) {
 	case -1: