Concurrent `ls` takes out the thrash
diff mbox

Message ID 934FC075-4393-42AD-92A3-3BC3BE580699@redhat.com
State New
Headers show

Commit Message

Benjamin Coddington Dec. 7, 2016, 10:34 p.m. UTC
From: "Benjamin Coddington" <bcodding@redhat.com>

So, I've got the following patch, and it seems to work fine for the original
workload.  However, if I use ~20 procs started 2 seconds apart, I can still
sometimes get into the state where the machine takes a very long time (5 - 8
minutes).  I wonder if I am getting into a state were vmscan is reclaiming
pages while I'm trying to fill them up.  So.. I'll do a bit more debugging
and re-post this if I feel like it is still the right approach.

Adding an int to nfs_open_dir_context puts it at 60 bytes here.  Probably
could have added a unsigned long flags and used bits for the duped stuff as
well, but it was uglier that way, so I left it.  I like how duped carries
-1, 0, and >1 information without having flag defines cluttering everywhere.

Ben

8<------------------------------------------------------------------------

From 5b3e747a984ec966e8c742d8f4fe4b08e1c93acd Mon Sep 17 00:00:00 2001
Message-Id: <5b3e747a984ec966e8c742d8f4fe4b08e1c93acd.1481149380.git.bcodding@redhat.com>
From: Benjamin Coddington <bcodding@redhat.com>
Date: Wed, 7 Dec 2016 16:23:30 -0500
Subject: [PATCH] NFS: Move readdirplus flag to directory context

For a concurrent 'ls -l' workload, processes can stack up in nfs_readdir()
both waiting on the next page and taking turns filling the next page from
XDR, but only one of them will have desc->plus set because setting it
clears the flag on the directory.  If a page is filled by a process that
doesn't have desc->plus set then the next pass through lookup() will cause
it to dump the entire page cache with nfs_force_use_readdirplus().  Then
the next readdir starts all over filling the pagecache.  Forward progress
happens, but only after many steps back re-filling the pagecache.

Fix this by moving the flag to use readdirplus to each open directory
context.

Suggested-by: Trond Myklebust <trondmy@primarydata.com>
Signed-off-by: Benjamin Coddington <bcodding@redhat.com>
---
 fs/nfs/dir.c           | 39 ++++++++++++++++++++++++---------------
 include/linux/nfs_fs.h |  1 +
 2 files changed, 25 insertions(+), 15 deletions(-)

Comments

Trond Myklebust Dec. 7, 2016, 10:41 p.m. UTC | #1
> On Dec 7, 2016, at 17:34, Benjamin Coddington <bcodding@redhat.com> wrote:
> 
> From: "Benjamin Coddington" <bcodding@redhat.com>
> 
> So, I've got the following patch, and it seems to work fine for the original
> workload.  However, if I use ~20 procs started 2 seconds apart, I can still
> sometimes get into the state where the machine takes a very long time (5 - 8
> minutes).  I wonder if I am getting into a state were vmscan is reclaiming
> pages while I'm trying to fill them up.  So.. I'll do a bit more debugging
> and re-post this if I feel like it is still the right approach.
> 
> Adding an int to nfs_open_dir_context puts it at 60 bytes here.  Probably
> could have added a unsigned long flags and used bits for the duped stuff as
> well, but it was uglier that way, so I left it.  I like how duped carries
> -1, 0, and >1 information without having flag defines cluttering everywhere.
> 
> Ben
> 
> 8<------------------------------------------------------------------------
> 
> From 5b3e747a984ec966e8c742d8f4fe4b08e1c93acd Mon Sep 17 00:00:00 2001
> Message-Id: <5b3e747a984ec966e8c742d8f4fe4b08e1c93acd.1481149380.git.bcodding@redhat.com>
> From: Benjamin Coddington <bcodding@redhat.com>
> Date: Wed, 7 Dec 2016 16:23:30 -0500
> Subject: [PATCH] NFS: Move readdirplus flag to directory context
> 
> For a concurrent 'ls -l' workload, processes can stack up in nfs_readdir()
> both waiting on the next page and taking turns filling the next page from
> XDR, but only one of them will have desc->plus set because setting it
> clears the flag on the directory.  If a page is filled by a process that
> doesn't have desc->plus set then the next pass through lookup() will cause
> it to dump the entire page cache with nfs_force_use_readdirplus().  Then
> the next readdir starts all over filling the pagecache.  Forward progress
> happens, but only after many steps back re-filling the pagecache.
> 
> Fix this by moving the flag to use readdirplus to each open directory
> context.
> 
> Suggested-by: Trond Myklebust <trondmy@primarydata.com>
> Signed-off-by: Benjamin Coddington <bcodding@redhat.com>
> ---
> fs/nfs/dir.c           | 39 ++++++++++++++++++++++++---------------
> include/linux/nfs_fs.h |  1 +
> 2 files changed, 25 insertions(+), 15 deletions(-)
> 
> diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
> index 7483722162fa..818172606fed 100644
> --- a/fs/nfs/dir.c
> +++ b/fs/nfs/dir.c
> @@ -78,6 +78,7 @@ static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir
> 		ctx->dir_cookie = 0;
> 		ctx->dup_cookie = 0;
> 		ctx->cred = get_rpccred(cred);
> +		ctx->use_readdir_plus = 0;
> 		spin_lock(&dir->i_lock);
> 		list_add(&ctx->list, &nfsi->open_files);
> 		spin_unlock(&dir->i_lock);
> @@ -443,17 +444,35 @@ int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry)
> }
> 
> static
> -bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx)
> +bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx,
> +			struct nfs_open_dir_context *dir_ctx)
> {
> 	if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS))
> 		return false;
> -	if (test_and_clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags))
> +	if (xchg(&dir_ctx->use_readdir_plus, 0))
> 		return true;
> 	if (ctx->pos == 0)
> 		return true;
> 	return false;
> }
> 
> +static
> +void nfs_set_readdirplus(struct inode *dir, int force)
> +{
> +	struct nfs_inode *nfsi = NFS_I(dir);
> +	struct nfs_open_dir_context *ctx;
> +
> +	if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) &&
> +	    !list_empty(&nfsi->open_files)) {
> +		spin_lock(&dir->i_lock);
> +		list_for_each_entry(ctx, &nfsi->open_files, list)
> +			ctx->use_readdir_plus = 1;
> +		spin_unlock(&dir->i_lock);
> +		if (force)
> +			invalidate_mapping_pages(dir->i_mapping, 0, -1);
> +	}
> +}
> +
> /*
>  * This function is called by the lookup and getattr code to request the
>  * use of readdirplus to accelerate any future lookups in the same
> @@ -461,11 +480,7 @@ bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx)
>  */
> void nfs_advise_use_readdirplus(struct inode *dir)
> {
> -	struct nfs_inode *nfsi = NFS_I(dir);
> -
> -	if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) &&
> -	    !list_empty(&nfsi->open_files))
> -		set_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags);
> +	nfs_set_readdirplus(dir, 0);
> }
> 
> /*
> @@ -478,13 +493,7 @@ void nfs_advise_use_readdirplus(struct inode *dir)
>  */
> void nfs_force_use_readdirplus(struct inode *dir)
> {
> -	struct nfs_inode *nfsi = NFS_I(dir);
> -
> -	if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) &&
> -	    !list_empty(&nfsi->open_files)) {
> -		set_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags);
> -		invalidate_mapping_pages(dir->i_mapping, 0, -1);
> -	}
> +	nfs_set_readdirplus(dir, 1);
> }
> 
> static
> @@ -921,7 +930,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
> 	desc->ctx = ctx;
> 	desc->dir_cookie = &dir_ctx->dir_cookie;
> 	desc->decode = NFS_PROTO(inode)->decode_dirent;
> -	desc->plus = nfs_use_readdirplus(inode, ctx) ? 1 : 0;
> +	desc->plus = nfs_use_readdirplus(inode, ctx, dir_ctx) ? 1 : 0;

This fixes desc->plus at the beginning of the readdir() call. Perhaps we should instead check the value of ctx->use_readdir_plus in the call to nfs_readdir_xdr_filler(), and just resetting cts->use_readdir_plus at the very end of nfs_readdir()?

> 
> 	if (ctx->pos == 0 || nfs_attribute_cache_expired(inode))
> 		res = nfs_revalidate_mapping(inode, file->f_mapping);
> diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
> index cb631973839a..fe06c1dd1737 100644
> --- a/include/linux/nfs_fs.h
> +++ b/include/linux/nfs_fs.h
> @@ -89,6 +89,7 @@ struct nfs_open_dir_context {
> 	__u64 dir_cookie;
> 	__u64 dup_cookie;
> 	signed char duped;
> +	int use_readdir_plus;
> };
> 
> /*
> -- 
> 2.5.5
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Benjamin Coddington Dec. 7, 2016, 10:55 p.m. UTC | #2
On 7 Dec 2016, at 17:41, Trond Myklebust wrote:

>> On Dec 7, 2016, at 17:34, Benjamin Coddington <bcodding@redhat.com> 
>> wrote:
>> static
>> @@ -921,7 +930,7 @@ static int nfs_readdir(struct file *file, struct 
>> dir_context *ctx)
>> 	desc->ctx = ctx;
>> 	desc->dir_cookie = &dir_ctx->dir_cookie;
>> 	desc->decode = NFS_PROTO(inode)->decode_dirent;
>> -	desc->plus = nfs_use_readdirplus(inode, ctx) ? 1 : 0;
>> +	desc->plus = nfs_use_readdirplus(inode, ctx, dir_ctx) ? 1 : 0;
>
> This fixes desc->plus at the beginning of the readdir() call. Perhaps 
> we
> should instead check the value of ctx->use_readdir_plus in the call to
> nfs_readdir_xdr_filler(), and just resetting cts->use_readdir_plus at 
> the
> very end of nfs_readdir()?

I don't understand the functional difference.  Is this just a 
preference?

There must be something else happening.. dcache is getting under 
pressure
pruned maybe, that causes a miss and then the process starts over?

Ben
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Trond Myklebust Dec. 7, 2016, 10:59 p.m. UTC | #3
> On Dec 7, 2016, at 17:55, Benjamin Coddington <bcodding@redhat.com> wrote:

> 

> On 7 Dec 2016, at 17:41, Trond Myklebust wrote:

> 

>>> On Dec 7, 2016, at 17:34, Benjamin Coddington <bcodding@redhat.com> wrote:

>>> static

>>> @@ -921,7 +930,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)

>>> 	desc->ctx = ctx;

>>> 	desc->dir_cookie = &dir_ctx->dir_cookie;

>>> 	desc->decode = NFS_PROTO(inode)->decode_dirent;

>>> -	desc->plus = nfs_use_readdirplus(inode, ctx) ? 1 : 0;

>>> +	desc->plus = nfs_use_readdirplus(inode, ctx, dir_ctx) ? 1 : 0;

>> 

>> This fixes desc->plus at the beginning of the readdir() call. Perhaps we

>> should instead check the value of ctx->use_readdir_plus in the call to

>> nfs_readdir_xdr_filler(), and just resetting cts->use_readdir_plus at the

>> very end of nfs_readdir()?

> 

> I don't understand the functional difference.  Is this just a preference?


No. The functional difference is that we take into account the fact that a process may be in the readdir() code while a GETATTR or LOOKUP from a second process is triggering “use readdirplus” feedback.

> 

> There must be something else happening.. dcache is getting under pressure

> pruned maybe, that causes a miss and then the process starts over?

> 

> Ben

>
Benjamin Coddington Dec. 7, 2016, 11:10 p.m. UTC | #4
On 7 Dec 2016, at 17:59, Trond Myklebust wrote:

>> On Dec 7, 2016, at 17:55, Benjamin Coddington <bcodding@redhat.com> 
>> wrote:
>>
>> On 7 Dec 2016, at 17:41, Trond Myklebust wrote:
>>
>>>> On Dec 7, 2016, at 17:34, Benjamin Coddington <bcodding@redhat.com> 
>>>> wrote:
>>>> static
>>>> @@ -921,7 +930,7 @@ static int nfs_readdir(struct file *file, 
>>>> struct dir_context *ctx)
>>>> 	desc->ctx = ctx;
>>>> 	desc->dir_cookie = &dir_ctx->dir_cookie;
>>>> 	desc->decode = NFS_PROTO(inode)->decode_dirent;
>>>> -	desc->plus = nfs_use_readdirplus(inode, ctx) ? 1 : 0;
>>>> +	desc->plus = nfs_use_readdirplus(inode, ctx, dir_ctx) ? 1 : 0;
>>>
>>> This fixes desc->plus at the beginning of the readdir() call. 
>>> Perhaps we
>>> should instead check the value of ctx->use_readdir_plus in the call 
>>> to
>>> nfs_readdir_xdr_filler(), and just resetting cts->use_readdir_plus 
>>> at the
>>> very end of nfs_readdir()?
>>
>> I don't understand the functional difference.  Is this just a 
>> preference?
>
> No. The functional difference is that we take into account the fact 
> that a
> process may be in the readdir() code while a GETATTR or LOOKUP from a
> second process is triggering “use readdirplus” feedback.

This should only matter if the concurrent processes have different 
buffer
sizes or start at a different offsets -- which shouldn't happen with 
plain
old 'ls -l'.

.. or maybe I'm wrong if, hmm..  if acdirmin ran out (maybe?).. or if we 
mix
'ls -l' and 'ls'.. or if we have pages getting reclaimed.. OK.  I'll try 
it.

Thanks for your suggestions.
Ben
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Benjamin Coddington Dec. 8, 2016, 2:18 p.m. UTC | #5
On 7 Dec 2016, at 18:10, Benjamin Coddington wrote:

> On 7 Dec 2016, at 17:59, Trond Myklebust wrote:
>
>>> On Dec 7, 2016, at 17:55, Benjamin Coddington <bcodding@redhat.com> 
>>> wrote:
>>>
>>> On 7 Dec 2016, at 17:41, Trond Myklebust wrote:
>>>
>>>>> On Dec 7, 2016, at 17:34, Benjamin Coddington 
>>>>> <bcodding@redhat.com> wrote:
>>>>> static
>>>>> @@ -921,7 +930,7 @@ static int nfs_readdir(struct file *file, 
>>>>> struct dir_context *ctx)
>>>>> 	desc->ctx = ctx;
>>>>> 	desc->dir_cookie = &dir_ctx->dir_cookie;
>>>>> 	desc->decode = NFS_PROTO(inode)->decode_dirent;
>>>>> -	desc->plus = nfs_use_readdirplus(inode, ctx) ? 1 : 0;
>>>>> +	desc->plus = nfs_use_readdirplus(inode, ctx, dir_ctx) ? 1 : 0;
>>>>
>>>> This fixes desc->plus at the beginning of the readdir() call. 
>>>> Perhaps we
>>>> should instead check the value of ctx->use_readdir_plus in the call 
>>>> to
>>>> nfs_readdir_xdr_filler(), and just resetting cts->use_readdir_plus 
>>>> at the
>>>> very end of nfs_readdir()?
>>>
>>> I don't understand the functional difference.  Is this just a 
>>> preference?
>>
>> No. The functional difference is that we take into account the fact 
>> that a
>> process may be in the readdir() code while a GETATTR or LOOKUP from a
>> second process is triggering “use readdirplus” feedback.
>
> This should only matter if the concurrent processes have different 
> buffer
> sizes or start at a different offsets -- which shouldn't happen with 
> plain
> old 'ls -l'.
>
> .. or maybe I'm wrong if, hmm..  if acdirmin ran out (maybe?).. or if 
> we mix
> 'ls -l' and 'ls'.. or if we have pages getting reclaimed.. OK.  I'll 
> try it.

This doesn't help.

The issue is that anything more than a couple of processes cause 
contention
on the directory's i_lock.  The i_lock is taken x entries x processes. 
The
inode flags and serialization worked far better.

If we add another inode flag, then we can add a DOING_RDPLUS.  A process
entering nfs_readdir() that sees ADVISE and not DOING clears it and sets
DOING and remembers to clear DOING on exit of nfs_readdir().  Any 
process
that sees DOING uses plus.

Ben
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch
diff mbox

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 7483722162fa..818172606fed 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -78,6 +78,7 @@  static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir
 		ctx->dir_cookie = 0;
 		ctx->dup_cookie = 0;
 		ctx->cred = get_rpccred(cred);
+		ctx->use_readdir_plus = 0;
 		spin_lock(&dir->i_lock);
 		list_add(&ctx->list, &nfsi->open_files);
 		spin_unlock(&dir->i_lock);
@@ -443,17 +444,35 @@  int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry)
 }
 
 static
-bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx)
+bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx,
+			struct nfs_open_dir_context *dir_ctx)
 {
 	if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS))
 		return false;
-	if (test_and_clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags))
+	if (xchg(&dir_ctx->use_readdir_plus, 0))
 		return true;
 	if (ctx->pos == 0)
 		return true;
 	return false;
 }
 
+static
+void nfs_set_readdirplus(struct inode *dir, int force)
+{
+	struct nfs_inode *nfsi = NFS_I(dir);
+	struct nfs_open_dir_context *ctx;
+
+	if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) &&
+	    !list_empty(&nfsi->open_files)) {
+		spin_lock(&dir->i_lock);
+		list_for_each_entry(ctx, &nfsi->open_files, list)
+			ctx->use_readdir_plus = 1;
+		spin_unlock(&dir->i_lock);
+		if (force)
+			invalidate_mapping_pages(dir->i_mapping, 0, -1);
+	}
+}
+
 /*
  * This function is called by the lookup and getattr code to request the
  * use of readdirplus to accelerate any future lookups in the same
@@ -461,11 +480,7 @@  bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx)
  */
 void nfs_advise_use_readdirplus(struct inode *dir)
 {
-	struct nfs_inode *nfsi = NFS_I(dir);
-
-	if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) &&
-	    !list_empty(&nfsi->open_files))
-		set_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags);
+	nfs_set_readdirplus(dir, 0);
 }
 
 /*
@@ -478,13 +493,7 @@  void nfs_advise_use_readdirplus(struct inode *dir)
  */
 void nfs_force_use_readdirplus(struct inode *dir)
 {
-	struct nfs_inode *nfsi = NFS_I(dir);
-
-	if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) &&
-	    !list_empty(&nfsi->open_files)) {
-		set_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags);
-		invalidate_mapping_pages(dir->i_mapping, 0, -1);
-	}
+	nfs_set_readdirplus(dir, 1);
 }
 
 static
@@ -921,7 +930,7 @@  static int nfs_readdir(struct file *file, struct dir_context *ctx)
 	desc->ctx = ctx;
 	desc->dir_cookie = &dir_ctx->dir_cookie;
 	desc->decode = NFS_PROTO(inode)->decode_dirent;
-	desc->plus = nfs_use_readdirplus(inode, ctx) ? 1 : 0;
+	desc->plus = nfs_use_readdirplus(inode, ctx, dir_ctx) ? 1 : 0;
 
 	if (ctx->pos == 0 || nfs_attribute_cache_expired(inode))
 		res = nfs_revalidate_mapping(inode, file->f_mapping);
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index cb631973839a..fe06c1dd1737 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -89,6 +89,7 @@  struct nfs_open_dir_context {
 	__u64 dir_cookie;
 	__u64 dup_cookie;
 	signed char duped;
+	int use_readdir_plus;
 };
 
 /*