diff mbox

[v2,11/12] NFS: Remove inode->i_dio_count from the NFS O_DIRECT code

Message ID 1466544893-12058-11-git-send-email-trond.myklebust@primarydata.com (mailing list archive)
State New, archived
Headers show

Commit Message

Trond Myklebust June 21, 2016, 9:34 p.m. UTC
Now that we can serialise O_DIRECT and write/truncate using the
inode->i_rwsem, we no longer need inode->i_dio_count.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/direct.c | 12 +++---------
 fs/nfs/file.c   |  2 --
 fs/nfs/inode.c  |  1 -
 3 files changed, 3 insertions(+), 12 deletions(-)

Comments

Christoph Hellwig June 22, 2016, 4:42 p.m. UTC | #1
On Tue, Jun 21, 2016 at 05:34:52PM -0400, Trond Myklebust wrote:
> Now that we can serialise O_DIRECT and write/truncate using the
> inode->i_rwsem, we no longer need inode->i_dio_count.

For the AIO case that's not true, we can't hold i_rwsem until
aio completes.  So for something that does block allocations we'll
need something like i_dio_count still.  This probably includes the
block / scsi layout code.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Trond Myklebust June 22, 2016, 4:58 p.m. UTC | #2
> On Jun 22, 2016, at 12:42, Christoph Hellwig <hch@infradead.org> wrote:
> 
> On Tue, Jun 21, 2016 at 05:34:52PM -0400, Trond Myklebust wrote:
>> Now that we can serialise O_DIRECT and write/truncate using the
>> inode->i_rwsem, we no longer need inode->i_dio_count.
> 
> For the AIO case that's not true, we can't hold i_rwsem until
> aio completes.  So for something that does block allocations we'll
> need something like i_dio_count still.  This probably includes the
> block / scsi layout code.

Why can’t we hold the i_rwsem until aio completes? It is only a read lock, so we’re not blocking the aio engine from adding more requests. As long as notify_change() holds the i_rwsem for write, then we have exclusion.

Cheers
  Trond

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Schumaker, Anna June 22, 2016, 5:58 p.m. UTC | #3
Hi Trond,

On 06/21/2016 05:34 PM, Trond Myklebust wrote:
> Now that we can serialise O_DIRECT and write/truncate using the
> inode->i_rwsem, we no longer need inode->i_dio_count.

I'm seeing cthon basic tests fail on all NFS versions after applying this patch:

./test5: read and write
        ./test5: (/nfs/basic) 'bigfile' has size 8192, should be 1048576
basic tests failed

Just thought you should know :)
Anna

> 
> Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
> ---
>  fs/nfs/direct.c | 12 +++---------
>  fs/nfs/file.c   |  2 --
>  fs/nfs/inode.c  |  1 -
>  3 files changed, 3 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
> index 2333e9973802..92267316290a 100644
> --- a/fs/nfs/direct.c
> +++ b/fs/nfs/direct.c
> @@ -385,11 +385,6 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write)
>  		spin_unlock(&inode->i_lock);
>  	}
>  
> -	if (write)
> -		nfs_zap_mapping(inode, inode->i_mapping);
> -
> -	inode_dio_end(inode);
> -
>  	if (dreq->iocb) {
>  		long res = (long) dreq->error;
>  		if (!res)
> @@ -488,7 +483,6 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
>  			     &nfs_direct_read_completion_ops);
>  	get_dreq(dreq);
>  	desc.pg_dreq = dreq;
> -	inode_dio_begin(inode);
>  
>  	while (iov_iter_count(iter)) {
>  		struct page **pagevec;
> @@ -540,7 +534,6 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
>  	 * generic layer handle the completion.
>  	 */
>  	if (requested_bytes == 0) {
> -		inode_dio_end(inode);
>  		nfs_direct_req_release(dreq);
>  		return result < 0 ? result : -EIO;
>  	}
> @@ -770,6 +763,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
>  static void nfs_direct_write_schedule_work(struct work_struct *work)
>  {
>  	struct nfs_direct_req *dreq = container_of(work, struct nfs_direct_req, work);
> +	struct inode *inode = dreq->inode;
>  	int flags = dreq->flags;
>  
>  	dreq->flags = 0;
> @@ -781,6 +775,8 @@ static void nfs_direct_write_schedule_work(struct work_struct *work)
>  			nfs_direct_write_reschedule(dreq);
>  			break;
>  		default:
> +			nfs_zap_mapping(inode, inode->i_mapping);
> +
>  			nfs_direct_complete(dreq, true);
>  	}
>  }
> @@ -903,7 +899,6 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
>  			      &nfs_direct_write_completion_ops);
>  	desc.pg_dreq = dreq;
>  	get_dreq(dreq);
> -	inode_dio_begin(inode);
>  
>  	NFS_I(inode)->write_io += iov_iter_count(iter);
>  	while (iov_iter_count(iter)) {
> @@ -964,7 +959,6 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
>  	 * generic layer handle the completion.
>  	 */
>  	if (requested_bytes == 0) {
> -		inode_dio_end(inode);
>  		nfs_direct_req_release(dreq);
>  		return result < 0 ? result : -EIO;
>  	}
> diff --git a/fs/nfs/file.c b/fs/nfs/file.c
> index b1dd7f2c64f4..fb7a1b0b6a20 100644
> --- a/fs/nfs/file.c
> +++ b/fs/nfs/file.c
> @@ -276,7 +276,6 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
>  
>  	trace_nfs_fsync_enter(inode);
>  
> -	inode_dio_wait(inode);
>  	do {
>  		ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
>  		if (ret != 0)
> @@ -364,7 +363,6 @@ start:
>  	/*
>  	 * Wait for O_DIRECT to complete
>  	 */
> -	inode_dio_wait(mapping->host);
>  
>  	page = grab_cache_page_write_begin(mapping, index, flags);
>  	if (!page)
> diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
> index bd0390da3344..6bc65ffc3c6c 100644
> --- a/fs/nfs/inode.c
> +++ b/fs/nfs/inode.c
> @@ -141,7 +141,6 @@ void nfs_evict_inode(struct inode *inode)
>  
>  int nfs_sync_inode(struct inode *inode)
>  {
> -	inode_dio_wait(inode);
>  	return nfs_wb_all(inode);
>  }
>  EXPORT_SYMBOL_GPL(nfs_sync_inode);
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Trond Myklebust June 22, 2016, 6:06 p.m. UTC | #4
> On Jun 22, 2016, at 13:58, Anna Schumaker <Anna.Schumaker@netapp.com> wrote:

> 

> Hi Trond,

> 

> On 06/21/2016 05:34 PM, Trond Myklebust wrote:

>> Now that we can serialise O_DIRECT and write/truncate using the

>> inode->i_rwsem, we no longer need inode->i_dio_count.

> 

> I'm seeing cthon basic tests fail on all NFS versions after applying this patch:

> 

> ./test5: read and write

>        ./test5: (/nfs/basic) 'bigfile' has size 8192, should be 1048576

> basic tests failed

> 


???? Connectathon doesn’t use O_DIRECT. Are you sure it is this patch?
Schumaker, Anna June 22, 2016, 6:08 p.m. UTC | #5
On 06/22/2016 02:06 PM, Trond Myklebust wrote:
> 
>> On Jun 22, 2016, at 13:58, Anna Schumaker <Anna.Schumaker@netapp.com> wrote:
>>
>> Hi Trond,
>>
>> On 06/21/2016 05:34 PM, Trond Myklebust wrote:
>>> Now that we can serialise O_DIRECT and write/truncate using the
>>> inode->i_rwsem, we no longer need inode->i_dio_count.
>>
>> I'm seeing cthon basic tests fail on all NFS versions after applying this patch:
>>
>> ./test5: read and write
>>        ./test5: (/nfs/basic) 'bigfile' has size 8192, should be 1048576
>> basic tests failed
>>
> 
> ???? Connectathon doesn’t use O_DIRECT. Are you sure it is this patch?
> 

Pretty sure.  Cthon worked for me at patch #10, but it failed after adding this one.

Anna
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Schumaker, Anna June 22, 2016, 6:51 p.m. UTC | #6
On 06/22/2016 02:08 PM, Anna Schumaker wrote:
> On 06/22/2016 02:06 PM, Trond Myklebust wrote:
>>
>>> On Jun 22, 2016, at 13:58, Anna Schumaker <Anna.Schumaker@netapp.com> wrote:
>>>
>>> Hi Trond,
>>>
>>> On 06/21/2016 05:34 PM, Trond Myklebust wrote:
>>>> Now that we can serialise O_DIRECT and write/truncate using the
>>>> inode->i_rwsem, we no longer need inode->i_dio_count.
>>>
>>> I'm seeing cthon basic tests fail on all NFS versions after applying this patch:
>>>
>>> ./test5: read and write
>>>        ./test5: (/nfs/basic) 'bigfile' has size 8192, should be 1048576
>>> basic tests failed
>>>
>>
>> ???? Connectathon doesn’t use O_DIRECT. Are you sure it is this patch?
>>
> 
> Pretty sure.  Cthon worked for me at patch #10, but it failed after adding this one.

I retried the tests, and it's actually patch #10 where I start having problems.  Sorry for the confusion!

Anna

> 
> Anna
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Trond Myklebust June 22, 2016, 7:42 p.m. UTC | #7
> On Jun 22, 2016, at 14:51, Anna Schumaker <Anna.Schumaker@netapp.com> wrote:

> 

> On 06/22/2016 02:08 PM, Anna Schumaker wrote:

>> On 06/22/2016 02:06 PM, Trond Myklebust wrote:

>>> 

>>>> On Jun 22, 2016, at 13:58, Anna Schumaker <Anna.Schumaker@netapp.com> wrote:

>>>> 

>>>> Hi Trond,

>>>> 

>>>> On 06/21/2016 05:34 PM, Trond Myklebust wrote:

>>>>> Now that we can serialise O_DIRECT and write/truncate using the

>>>>> inode->i_rwsem, we no longer need inode->i_dio_count.

>>>> 

>>>> I'm seeing cthon basic tests fail on all NFS versions after applying this patch:

>>>> 

>>>> ./test5: read and write

>>>>       ./test5: (/nfs/basic) 'bigfile' has size 8192, should be 1048576

>>>> basic tests failed

>>>> 

>>> 

>>> ???? Connectathon doesn’t use O_DIRECT. Are you sure it is this patch?

>>> 

>> 

>> Pretty sure.  Cthon worked for me at patch #10, but it failed after adding this one.

> 

> I retried the tests, and it's actually patch #10 where I start having problems.  Sorry for the confusion!

> 


That makes more sense, and I think I’ve found the issues. There were 2:

1) The update of iocb->ki_pos was broken.
2) generic_write_checks() needs to run after the file size revalidation

Cheers
  Trond
Christoph Hellwig June 23, 2016, 10:19 a.m. UTC | #8
On Wed, Jun 22, 2016 at 04:58:05PM +0000, Trond Myklebust wrote:
> > For the AIO case that's not true, we can't hold i_rwsem until
> > aio completes.  So for something that does block allocations we'll
> > need something like i_dio_count still.  This probably includes the
> > block / scsi layout code.
> 
> Why can?t we hold the i_rwsem until aio completes?

Because Linux rw_semaphores require have owner semantics, and require
the owner that acquired them to release them again.  If we have
an AIO request we're not going to complete the request in the calling
context, though.

The same is true for regular mutexes, that's why we drop the inode lock
at the end of nfs_file_direct_read/write even if I/O might still be
pending.
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 2333e9973802..92267316290a 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -385,11 +385,6 @@  static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write)
 		spin_unlock(&inode->i_lock);
 	}
 
-	if (write)
-		nfs_zap_mapping(inode, inode->i_mapping);
-
-	inode_dio_end(inode);
-
 	if (dreq->iocb) {
 		long res = (long) dreq->error;
 		if (!res)
@@ -488,7 +483,6 @@  static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
 			     &nfs_direct_read_completion_ops);
 	get_dreq(dreq);
 	desc.pg_dreq = dreq;
-	inode_dio_begin(inode);
 
 	while (iov_iter_count(iter)) {
 		struct page **pagevec;
@@ -540,7 +534,6 @@  static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
 	 * generic layer handle the completion.
 	 */
 	if (requested_bytes == 0) {
-		inode_dio_end(inode);
 		nfs_direct_req_release(dreq);
 		return result < 0 ? result : -EIO;
 	}
@@ -770,6 +763,7 @@  static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
 static void nfs_direct_write_schedule_work(struct work_struct *work)
 {
 	struct nfs_direct_req *dreq = container_of(work, struct nfs_direct_req, work);
+	struct inode *inode = dreq->inode;
 	int flags = dreq->flags;
 
 	dreq->flags = 0;
@@ -781,6 +775,8 @@  static void nfs_direct_write_schedule_work(struct work_struct *work)
 			nfs_direct_write_reschedule(dreq);
 			break;
 		default:
+			nfs_zap_mapping(inode, inode->i_mapping);
+
 			nfs_direct_complete(dreq, true);
 	}
 }
@@ -903,7 +899,6 @@  static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
 			      &nfs_direct_write_completion_ops);
 	desc.pg_dreq = dreq;
 	get_dreq(dreq);
-	inode_dio_begin(inode);
 
 	NFS_I(inode)->write_io += iov_iter_count(iter);
 	while (iov_iter_count(iter)) {
@@ -964,7 +959,6 @@  static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
 	 * generic layer handle the completion.
 	 */
 	if (requested_bytes == 0) {
-		inode_dio_end(inode);
 		nfs_direct_req_release(dreq);
 		return result < 0 ? result : -EIO;
 	}
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index b1dd7f2c64f4..fb7a1b0b6a20 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -276,7 +276,6 @@  nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 
 	trace_nfs_fsync_enter(inode);
 
-	inode_dio_wait(inode);
 	do {
 		ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
 		if (ret != 0)
@@ -364,7 +363,6 @@  start:
 	/*
 	 * Wait for O_DIRECT to complete
 	 */
-	inode_dio_wait(mapping->host);
 
 	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index bd0390da3344..6bc65ffc3c6c 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -141,7 +141,6 @@  void nfs_evict_inode(struct inode *inode)
 
 int nfs_sync_inode(struct inode *inode)
 {
-	inode_dio_wait(inode);
 	return nfs_wb_all(inode);
 }
 EXPORT_SYMBOL_GPL(nfs_sync_inode);