
[7/7] nfs: page cache invalidation for dio

Message ID 20131114165042.205194841@bombadil.infradead.org (mailing list archive)
State New, archived
Headers show

Commit Message

Christoph Hellwig Nov. 14, 2013, 4:50 p.m. UTC
Make sure to properly invalidate the pagecache before performing direct I/O,
so that no stale pages are left around.  This matches what the generic
direct I/O code does.  Also take i_mutex across the direct write submission
to avoid a livelock against truncate waiting for i_dio_count to decrease, and
to avoid having the pagecache easily repopulated while direct I/O is in
progress.  Again matching the generic direct I/O code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/nfs/direct.c |   29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)
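
For comparison, the pre-submission half of the pattern this mirrors in
generic_file_direct_write() (mm/filemap.c) looks roughly like the sketch
below.  This is a trimmed paraphrase of the 3.12-era generic code from
memory, not a verbatim excerpt; variable names and error handling are
abbreviated.

	written = filemap_write_and_wait_range(mapping, pos,
						pos + write_len - 1);
	if (written)
		goto out;

	/*
	 * Invalidate clean cached pages over the range about to be
	 * written, so buffered readers go back to disk for the new data.
	 * Done before the write so that -EIOCBQUEUED from ->direct_IO()
	 * is not clobbered by the invalidation result.
	 */
	if (mapping->nrpages) {
		written = invalidate_inode_pages2_range(mapping,
					pos >> PAGE_CACHE_SHIFT, end);
		if (written) {
			if (written == -EBUSY)
				return 0;	/* fall back to buffered */
			goto out;
		}
	}

	written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs);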

Comments

Jeff Layton Nov. 14, 2013, 6:35 p.m. UTC | #1
On Thu, 14 Nov 2013 08:50:34 -0800
Christoph Hellwig <hch@infradead.org> wrote:

> Make sure to properly invalidate the pagecache before performing direct I/O,
> so that no stale pages are left around.  This matches what the generic
> direct I/O code does.  Also take i_mutex across the direct write submission
> to avoid a livelock against truncate waiting for i_dio_count to decrease, and
> to avoid having the pagecache easily repopulated while direct I/O is in
> progress.  Again matching the generic direct I/O code.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/nfs/direct.c |   29 +++++++++++++++++++++++++++--
>  1 file changed, 27 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
> index 6cc7fe1..2b778fc 100644
> --- a/fs/nfs/direct.c
> +++ b/fs/nfs/direct.c
> @@ -939,9 +939,12 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
>  	struct inode *inode = mapping->host;
>  	struct nfs_direct_req *dreq;
>  	struct nfs_lock_context *l_ctx;
> +	loff_t end;
>  	size_t count;
>  
>  	count = iov_length(iov, nr_segs);
> +	end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
> +
>  	nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);
>  
>  	dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n",
> @@ -958,16 +961,25 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
>  	if (!count)
>  		goto out;
>  
> +	mutex_lock(&inode->i_mutex);
> +
>  	result = nfs_sync_mapping(mapping);
>  	if (result)
> -		goto out;
> +		goto out_unlock;
> +
> +	if (mapping->nrpages) {
> +		result = invalidate_inode_pages2_range(mapping,
> +					pos >> PAGE_CACHE_SHIFT, end);
> +		if (result)
> +			goto out_unlock;
> +	}
>  
>  	task_io_account_write(count);
>  
>  	result = -ENOMEM;
>  	dreq = nfs_direct_req_alloc();
>  	if (!dreq)
> -		goto out;
> +		goto out_unlock;
>  
>  	dreq->inode = inode;
>  	dreq->bytes_left = count;
> @@ -982,6 +994,14 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
>  		dreq->iocb = iocb;
>  
>  	result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio);
> +	
> +	if (mapping->nrpages) {
> +		invalidate_inode_pages2_range(mapping,
> +					      pos >> PAGE_CACHE_SHIFT, end);
> +	}
> +
> +	mutex_unlock(&inode->i_mutex);
> +
>  	if (!result) {
>  		result = nfs_direct_wait(dreq);
>  		if (result > 0) {
> @@ -994,8 +1014,13 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
>  			spin_unlock(&inode->i_lock);
>  		}
>  	}
> +	nfs_direct_req_release(dreq);
> +	return result;
> +
>  out_release:
>  	nfs_direct_req_release(dreq);
> +out_unlock:
> +	mutex_unlock(&inode->i_mutex);
>  out:
>  	return result;
>  }

Hrm... I started chasing down a bug reported by our QA group last week
that's showing up when you mix DIO writes and buffered reads
(basically, diotest3 in the LTP suite is failing). The bug is marked
private for dumb reasons but I'll see if I can make it public. I'll
also plan to give this series a spin to see if it helps fix that bug...

In any case, the DIO write code calls nfs_zap_mapping after it gets the
WRITE reply. That sets NFS_INO_INVALID_DATA and should prevent buffered
read() calls from getting data out of the cache after the write reply
comes in.

Why is that not sufficient here?
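
For context, nfs_zap_mapping() in that era's fs/nfs/inode.c amounted to
roughly the following (paraphrased sketch, not a verbatim excerpt):

void nfs_zap_mapping(struct inode *inode, struct address_space *mapping)
{
	if (mapping->nrpages != 0) {
		spin_lock(&inode->i_lock);
		/* mark the data cache suspect; acted on lazily later */
		NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
		spin_unlock(&inode->i_lock);
	}
}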
Christoph Hellwig Nov. 15, 2013, 2:28 p.m. UTC | #2
On Thu, Nov 14, 2013 at 01:35:51PM -0500, Jeff Layton wrote:
> Hrm... I started chasing down a bug reported by our QA group last week
> that's showing up when you mix DIO writes and buffered reads
> (basically, diotest3 in the LTP suite is failing). The bug is marked
> private for dumb reasons but I'll see if I can make it public. I'll
> also plan to give this series a spin to see if it helps fix that bug...
> 
> In any case, the DIO write code calls nfs_zap_mapping after it gets the
> WRITE reply. That sets NFS_INO_INVALID_DATA and should prevent buffered
> read() calls from getting data out of the cache after the write reply
> comes in.
> 
> Why is that not sufficient here?

Sounds like it should actually be fine, although I had similar testcases
fail.  I didn't even notice we were already doing the invalidation, just
delaying it.  I can't see how that helps once mmap comes into the game,
although that was always a best-effort, pray-that-it-works scenario.
Jeff Layton Nov. 15, 2013, 2:52 p.m. UTC | #3
On Fri, 15 Nov 2013 06:28:47 -0800
Christoph Hellwig <hch@infradead.org> wrote:

> On Thu, Nov 14, 2013 at 01:35:51PM -0500, Jeff Layton wrote:
> > Hrm... I started chasing down a bug reported by our QA group last week
> > that's showing up when you mix DIO writes and buffered reads
> > (basically, diotest3 in the LTP suite is failing). The bug is marked
> > private for dumb reasons but I'll see if I can make it public. I'll
> > also plan to give this series a spin to see if it helps fix that bug...
> > 
> > In any case, the DIO write code calls nfs_zap_mapping after it gets the
> > WRITE reply. That sets NFS_INO_INVALID_DATA and should prevent buffered
> > read() calls from getting data out of the cache after the write reply
> > comes in.
> > 
> > Why is that not sufficient here?
> 
> Sounds like it should actually be fine, although I had similar testcases
> fail.  I didn't even notice we were already doing the invalidation, just
> delaying it.  I can't see how that helps once mmap comes into the game,
> although that was always a best-effort, pray-that-it-works scenario.
>

Ok, cool. The bug that I've been looking at with Trond's help is here:

    https://bugzilla.redhat.com/show_bug.cgi?id=919382

Do you have these patches in a git tree someplace? If so, I wouldn't
mind running this reproducer against it to see if it helps. It's a bit
of a longshot, but what the heck...
Christoph Hellwig Nov. 15, 2013, 3:02 p.m. UTC | #4
On Fri, Nov 15, 2013 at 09:52:41AM -0500, Jeff Layton wrote:
> Do you have these patches in a git tree someplace? If so, I wouldn't
> mind running this reproducer against it to see if it helps. It's a bit
> of a longshot, but what the heck...

While I do have a local git tree I don't really have anywhere to push
it to.  But applying these patches shouldn't be all that hard.

Jeff Layton Nov. 15, 2013, 3:33 p.m. UTC | #5
On Fri, 15 Nov 2013 07:02:04 -0800
Christoph Hellwig <hch@infradead.org> wrote:

> On Fri, Nov 15, 2013 at 09:52:41AM -0500, Jeff Layton wrote:
> > Do you have these patches in a git tree someplace? If so, I wouldn't
> > mind running this reproducer against it to see if it helps. It's a bit
> > of a longshot, but what the heck...
> 
> While I do have a local git tree I don't really have anywhere to push
> it to.  But applying these patches shouldn't be all that hard.
> 

It's not -- I'm just lazy...

FWIW, I tried this set and it didn't make any difference on the bug, so
I'll just keep soldiering on to track it down...

Thanks,
Jeff Layton Jan. 21, 2014, 7:21 p.m. UTC | #6
On Fri, 15 Nov 2013 10:33:24 -0500
Jeff Layton <jlayton@redhat.com> wrote:

> On Fri, 15 Nov 2013 07:02:04 -0800
> Christoph Hellwig <hch@infradead.org> wrote:
> 
> > On Fri, Nov 15, 2013 at 09:52:41AM -0500, Jeff Layton wrote:
> > > Do you have these patches in a git tree someplace? If so, I wouldn't
> > > mind running this reproducer against it to see if it helps. It's a bit
> > > of a longshot, but what the heck...
> > 
> > While I do have a local git tree I don't really have anywhere to push
> > it to.  But applying these patches shouldn't be all that hard.
> > 
> 
> It's not -- I'm just lazy...
> 
> FWIW, I tried this set and it didn't make any difference on the bug, so
> I'll just keep soldiering on to track it down...
> 

I just tried this set again, and it *did* seem to help that bug.

I think the reason it didn't before was that I had applied this set on
top of a tree that held a different patch that introduced a race in the
nfs_revalidate_mapping() code.

In any case, this helps but it's a little odd. With this patch, you add
an invalidate_inode_pages2 call prior to doing the DIO. But, you've
also left in the call to nfs_zap_mapping in the completion codepath.

So now, we shoot down the mapping prior to doing a DIO write, and then
mark the mapping for invalidation again when the write completes. Was
that intentional?

It seems a little excessive and might hurt performance in some cases.
OTOH, if you mix buffered and DIO you're asking for trouble anyway and
this approach seems to give better cache coherency.
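
For reference, the deferred invalidation Jeff describes only takes effect
once a later cached access runs nfs_revalidate_mapping(), which at the time
looked roughly like this (paraphrased sketch of fs/nfs/inode.c, not a
verbatim excerpt):

int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
{
	struct nfs_inode *nfsi = NFS_I(inode);
	int ret = 0;

	/* revalidate attributes from the server first if they look stale */
	if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
			|| nfs_attribute_cache_expired(inode)
			|| NFS_STALE(inode)) {
		ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
		if (ret < 0)
			goto out;
	}
	/* only here does NFS_INO_INVALID_DATA become a real invalidation */
	if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
		ret = nfs_invalidate_mapping(inode, mapping);
out:
	return ret;
}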
Christoph Hellwig Jan. 22, 2014, 8:24 a.m. UTC | #7
On Tue, Jan 21, 2014 at 02:21:59PM -0500, Jeff Layton wrote:
> In any case, this helps but it's a little odd. With this patch, you add
> an invalidate_inode_pages2 call prior to doing the DIO. But, you've
> also left in the call to nfs_zap_mapping in the completion codepath.
> 
> So now, we shoot down the mapping prior to doing a DIO write, and then
> mark the mapping for invalidation again when the write completes. Was
> that intentional?
> 
> It seems a little excessive and might hurt performance in some cases.
> OTOH, if you mix buffered and DIO you're asking for trouble anyway and
> this approach seems to give better cache coherency.

This follows the model implemented and documented in
generic_file_direct_write().
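
The model being referred to finishes with a second invalidation after
->direct_IO() returns.  Paraphrasing the mm/filemap.c comment from memory
rather than quoting it: the second pass catches clean pages repopulated
during the write by readahead or by get_user_pages() faulting in an mmapped
region of the same file; if that invalidation fails, the write itself has
still succeeded.  In sketch form:

	written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs);

	/*
	 * Invalidate again: pages may have been brought back in by
	 * readahead or by get_user_pages() while the direct write was
	 * in flight.  A failure here is ignored -- the write worked.
	 */
	if (mapping->nrpages) {
		invalidate_inode_pages2_range(mapping,
					      pos >> PAGE_CACHE_SHIFT, end);
	}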

Jeff Layton Jan. 22, 2014, 12:04 p.m. UTC | #8
On Wed, 22 Jan 2014 00:24:14 -0800
Christoph Hellwig <hch@infradead.org> wrote:

> On Tue, Jan 21, 2014 at 02:21:59PM -0500, Jeff Layton wrote:
> > In any case, this helps but it's a little odd. With this patch, you add
> > an invalidate_inode_pages2 call prior to doing the DIO. But, you've
> > also left in the call to nfs_zap_mapping in the completion codepath.
> > 
> > So now, we shoot down the mapping prior to doing a DIO write, and then
> > mark the mapping for invalidation again when the write completes. Was
> > that intentional?
> > 
> > It seems a little excessive and might hurt performance in some cases.
> > OTOH, if you mix buffered and DIO you're asking for trouble anyway and
> > this approach seems to give better cache coherency.
> 
> This follows the model implemented and documented in
> generic_file_direct_write().
> 

Ok, thanks. That makes sense, and the problem described in those
comments is almost exactly the one I've seen in practice.

I'm still not 100% thrilled with the way that the NFS_INO_INVALID_DATA
flag is handled, but that really has nothing to do with this patchset.

You can add my Tested-by to the set if you like...

Cheers,

Patch

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 6cc7fe1..2b778fc 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -939,9 +939,12 @@  ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	struct inode *inode = mapping->host;
 	struct nfs_direct_req *dreq;
 	struct nfs_lock_context *l_ctx;
+	loff_t end;
 	size_t count;
 
 	count = iov_length(iov, nr_segs);
+	end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
+
 	nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);
 
 	dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n",
@@ -958,16 +961,25 @@  ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	if (!count)
 		goto out;
 
+	mutex_lock(&inode->i_mutex);
+
 	result = nfs_sync_mapping(mapping);
 	if (result)
-		goto out;
+		goto out_unlock;
+
+	if (mapping->nrpages) {
+		result = invalidate_inode_pages2_range(mapping,
+					pos >> PAGE_CACHE_SHIFT, end);
+		if (result)
+			goto out_unlock;
+	}
 
 	task_io_account_write(count);
 
 	result = -ENOMEM;
 	dreq = nfs_direct_req_alloc();
 	if (!dreq)
-		goto out;
+		goto out_unlock;
 
 	dreq->inode = inode;
 	dreq->bytes_left = count;
@@ -982,6 +994,14 @@  ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 		dreq->iocb = iocb;
 
 	result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio);
+	
+	if (mapping->nrpages) {
+		invalidate_inode_pages2_range(mapping,
+					      pos >> PAGE_CACHE_SHIFT, end);
+	}
+
+	mutex_unlock(&inode->i_mutex);
+
 	if (!result) {
 		result = nfs_direct_wait(dreq);
 		if (result > 0) {
@@ -994,8 +1014,13 @@  ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 			spin_unlock(&inode->i_lock);
 		}
 	}
+	nfs_direct_req_release(dreq);
+	return result;
+
 out_release:
 	nfs_direct_req_release(dreq);
+out_unlock:
+	mutex_unlock(&inode->i_mutex);
 out:
 	return result;
 }