Concurrent `ls` takes out the thrash
diff mbox

Message ID 3A6424D7-8207-4392-88C5-6CD3E1F65800@redhat.com
State New
Headers show

Commit Message

Benjamin Coddington Dec. 8, 2016, 4:13 p.m. UTC
From: "Benjamin Coddington" <bcodding@redhat.com>

Ehh.. I think it's kinda ugly, but this is what I came up with.

It works well, though, and handles everything I throw at it.  Its not as
simple as Christoph's suggestion to just go back to .iterate, which would
solve this in a simpler way.

Ben


8<---------------------------------------------------------------------------------------

From c95f29761bb1f60480863601c0becdd6cba89998 Mon Sep 17 00:00:00 2001
Message-Id: <c95f29761bb1f60480863601c0becdd6cba89998.1481213418.git.bcodding@redhat.com>
From: Benjamin Coddington <bcodding@redhat.com>
Date: Wed, 7 Dec 2016 16:23:30 -0500
Subject: [PATCH] NFS: Handle concurrent users of nfs_readdir()

For a concurrent 'ls -l' workload, processes can stack up in nfs_readdir()
both waiting on the next page and taking turns filling the next page from
XDR, but only one of them will have desc->plus set because setting it
clears the flag on the directory.  If a page is filled by a process that
doesn't have desc->plus set then the next pass through lookup() will cause
it to dump the entire page cache with nfs_force_use_readdirplus().  Then
the next readdir starts all over filling the pagecache.  Forward progress
happens, but only after many steps back re-filling the pagecache.

Fix this by using an additional flag on the inode to extend the lifetime
of the readdir advisement to any processes that may be sending READDIR
calls for that directory in nfs_readdir().
---
 fs/nfs/dir.c           | 25 ++++++++++++++++++-------
 include/linux/nfs_fs.h |  1 +
 2 files changed, 19 insertions(+), 7 deletions(-)

Comments

Benjamin Coddington Dec. 8, 2016, 9:48 p.m. UTC | #1
> Ehh.. I think it's kinda ugly, but this is what I came up with.
>
> It works well, though, and handles everything I throw at it.  Its not as
> simple as Christoph's suggestion to just go back to .iterate, which would
> solve this in a simpler way.

After many testings, I can't find any way this is faster than using the old
.iterate serialized readdir.  Additionally, at around 1M entries, this way
frequently needs to oom-kill things.  Using .iterate, that doesn't happen,
and is much faster at that scale.

Maybe my feeble brain is unable to think of the right workload to make
.iterate_shared preferable for NFS.  Suggestions welcome.

Otherwise, I think serialized is the way to go.

Ben
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch
diff mbox

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 7483722162fa..d3e256d723d1 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -167,8 +167,10 @@  typedef struct {
 	unsigned long	gencount;
 	unsigned int	cache_entry_index;
 	unsigned int	plus:1;
+	unsigned int	doer:1;
 	unsigned int	eof:1;
 } nfs_readdir_descriptor_t;
+static bool nfs_use_readdirplus(struct inode *, nfs_readdir_descriptor_t *desc);
 
 /*
  * The caller is responsible for calling nfs_readdir_release_array(page)
@@ -385,6 +387,7 @@  int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc,
 	int		error;
 
  again:
+	desc->plus = nfs_use_readdirplus(inode, desc) ? 1 : 0;
 	timestamp = jiffies;
 	gencount = nfs_inc_attr_generation_counter();
 	error = NFS_PROTO(inode)->readdir(file_dentry(file), cred, entry->cookie, pages,
@@ -393,8 +396,6 @@  int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc,
 		/* We requested READDIRPLUS, but the server doesn't grok it */
 		if (error == -ENOTSUPP && desc->plus) {
 			NFS_SERVER(inode)->caps &= ~NFS_CAP_READDIRPLUS;
-			clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
-			desc->plus = 0;
 			goto again;
 		}
 		goto error;
@@ -443,14 +444,22 @@  int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry)
 }
 
 static
-bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx)
+bool nfs_use_readdirplus(struct inode *dir, nfs_readdir_descriptor_t *desc)
 {
+	struct nfs_inode *nfsi = NFS_I(dir);
+
 	if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS))
 		return false;
-	if (test_and_clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags))
+	if (test_bit(NFS_INO_DOING_RDPLUS, &nfsi->flags))
 		return true;
-	if (ctx->pos == 0)
+	if (desc->ctx->pos == 0)
 		return true;
+	if (test_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags) &&
+		!test_and_set_bit(NFS_INO_DOING_RDPLUS, &nfsi->flags)) {
+		clear_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags);
+		desc->doer = 1;
+		return true;
+	}
 	return false;
 }
 
@@ -921,7 +930,6 @@  static int nfs_readdir(struct file *file, struct dir_context *ctx)
 	desc->ctx = ctx;
 	desc->dir_cookie = &dir_ctx->dir_cookie;
 	desc->decode = NFS_PROTO(inode)->decode_dirent;
-	desc->plus = nfs_use_readdirplus(inode, ctx) ? 1 : 0;
 
 	if (ctx->pos == 0 || nfs_attribute_cache_expired(inode))
 		res = nfs_revalidate_mapping(inode, file->f_mapping);
@@ -943,7 +951,7 @@  static int nfs_readdir(struct file *file, struct dir_context *ctx)
 			break;
 		}
 		if (res == -ETOOSMALL && desc->plus) {
-			clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
+			clear_bit(NFS_INO_DOING_RDPLUS, &NFS_I(inode)->flags);
 			nfs_zap_caches(inode);
 			desc->page_index = 0;
 			desc->plus = 0;
@@ -957,6 +965,9 @@  static int nfs_readdir(struct file *file, struct dir_context *ctx)
 		if (res < 0)
 			break;
 	} while (!desc->eof);
+
+	if (desc->doer)
+		clear_bit(NFS_INO_DOING_RDPLUS, &NFS_I(inode)->flags);
 out:
 	if (res > 0)
 		res = 0;
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index cb631973839a..e4d31fbad407 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -207,6 +207,7 @@  struct nfs_inode {
 #define NFS_INO_LAYOUTCOMMITTING (10)		/* layoutcommit inflight */
 #define NFS_INO_LAYOUTSTATS	(11)		/* layoutstats inflight */
 #define NFS_INO_ODIRECT		(12)		/* I/O setting is O_DIRECT */
+#define NFS_INO_DOING_RDPLUS	(13)		/* doing readdirplus */
 
 static inline struct nfs_inode *NFS_I(const struct inode *inode)
 {