[7/7] nfs: don't reuse partially completed requests in nfs_lock_and_join_requests

Message ID: 20240701052707.1246254-8-hch@lst.de (mailing list archive)
State: New
Series: [1/7] nfs: remove dead code for the old swap over NFS implementation

Commit Message

Christoph Hellwig July 1, 2024, 5:26 a.m. UTC
When NFS requests are split into sub-requests, nfs_inode_remove_request
calls nfs_page_group_sync_on_bit to set PG_REMOVE on this sub-request and
only completes the head request once PG_REMOVE is set on all requests.
This means that when nfs_lock_and_join_requests sees a PG_REMOVE bit, I/O
on the request is in progress and has partially completed.  If such a
request is returned to nfs_try_to_update_request, it could be extended
with the newly dirtied region and I/O for the combined range would be
re-scheduled, leading to extra I/O.
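
For context, the reason a visible PG_REMOVE bit implies partially completed
I/O is the completion side: each finished sub-request marks only itself, and
the head request is only torn down once every request in the page group
carries the bit.  A simplified sketch of that path (condensed from
nfs_inode_remove_request() in fs/nfs/write.c; the actual teardown
bookkeeping is elided):

	static void nfs_inode_remove_request(struct nfs_page *req)
	{
		/*
		 * Mark this sub-request as removed.  The helper returns
		 * true only once PG_REMOVE is set on every request in
		 * the page group, i.e. when the last sub-request's I/O
		 * has completed.
		 */
		if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) {
			/* last completion: detach the head request from
			 * the folio and the inode */
			...
		}
		/*
		 * Until that last completion, a concurrent
		 * nfs_lock_and_join_requests() can observe PG_REMOVE on
		 * some but not all requests in the group.
		 */
	}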

Change the logic to instead restart the search for a request when any
PG_REMOVE bit is set, as the completion handler will remove the request
as soon as it can take the page group lock.  This not only avoids
extending the I/O but also does the right thing for the callers that
want to cancel or flush the request.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/nfs/write.c | 49 ++++++++++++++++++++-----------------------------
 1 file changed, 20 insertions(+), 29 deletions(-)

Comments

Sagi Grimberg July 2, 2024, 8:07 a.m. UTC | #1
On 01/07/2024 8:26, Christoph Hellwig wrote:
> When NFS requests are split into sub-requests, nfs_inode_remove_request
> calls nfs_page_group_sync_on_bit to set PG_REMOVE on this sub-request and
> only completes the head request once PG_REMOVE is set on all requests.
> This means that when nfs_lock_and_join_requests sees a PG_REMOVE bit, I/O
> on the request is in progress and has partially completed.  If such a
> request is returned to nfs_try_to_update_request, it could be extended
> with the newly dirtied region and I/O for the combined range would be
> re-scheduled, leading to extra I/O.

Probably worth noting in the change log that large folios make this
potentially much worse?

>
> Change the logic to instead restart the search for a request when any
> PG_REMOVE bit is set, as the completion handler will remove the request
> as soon as it can take the page group lock.  This not only avoids
> extending the I/O but also does the right thing for the callers that
> want to cancel or flush the request.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>   fs/nfs/write.c | 49 ++++++++++++++++++++-----------------------------
>   1 file changed, 20 insertions(+), 29 deletions(-)
>
> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
> index 2c089444303982..4dffdc5aadb2e2 100644
> --- a/fs/nfs/write.c
> +++ b/fs/nfs/write.c
> @@ -144,31 +144,6 @@ static void nfs_io_completion_put(struct nfs_io_completion *ioc)
>   		kref_put(&ioc->refcount, nfs_io_completion_release);
>   }
>   
> -static void
> -nfs_page_set_inode_ref(struct nfs_page *req, struct inode *inode)
> -{
> -	if (!test_and_set_bit(PG_INODE_REF, &req->wb_flags)) {
> -		kref_get(&req->wb_kref);
> -		atomic_long_inc(&NFS_I(inode)->nrequests);
> -	}
> -}
> -
> -static int
> -nfs_cancel_remove_inode(struct nfs_page *req, struct inode *inode)
> -{
> -	int ret;
> -
> -	if (!test_bit(PG_REMOVE, &req->wb_flags))
> -		return 0;
> -	ret = nfs_page_group_lock(req);
> -	if (ret)
> -		return ret;
> -	if (test_and_clear_bit(PG_REMOVE, &req->wb_flags))
> -		nfs_page_set_inode_ref(req, inode);
> -	nfs_page_group_unlock(req);
> -	return 0;
> -}
> -
>   /**
>    * nfs_folio_find_head_request - find head request associated with a folio
>    * @folio: pointer to folio
> @@ -564,6 +539,7 @@ static struct nfs_page *nfs_lock_and_join_requests(struct folio *folio)
>   	struct inode *inode = folio->mapping->host;
>   	struct nfs_page *head, *subreq;
>   	struct nfs_commit_info cinfo;
> +	bool removed;
>   	int ret;
>   
>   	/*
> @@ -588,18 +564,18 @@ static struct nfs_page *nfs_lock_and_join_requests(struct folio *folio)
>   		goto retry;
>   	}
>   
> -	ret = nfs_cancel_remove_inode(head, inode);
> -	if (ret < 0)
> -		goto out_unlock;
> -
>   	ret = nfs_page_group_lock(head);
>   	if (ret < 0)
>   		goto out_unlock;
>   
> +	removed = test_bit(PG_REMOVE, &head->wb_flags);
> +
>   	/* lock each request in the page group */
>   	for (subreq = head->wb_this_page;
>   	     subreq != head;
>   	     subreq = subreq->wb_this_page) {
> +		if (test_bit(PG_REMOVE, &subreq->wb_flags))
> +			removed = true;
>   		ret = nfs_page_group_lock_subreq(head, subreq);
>   		if (ret < 0)
>   			goto out_unlock;
> @@ -607,6 +583,21 @@ static struct nfs_page *nfs_lock_and_join_requests(struct folio *folio)
>   
>   	nfs_page_group_unlock(head);
>   
> +	/*
> +	 * If PG_REMOVE is set on any request, I/O on that request has
> +	 * completed, but some requests were still under I/O at the time
> +	 * we locked the head request.
> +	 *
> +	 * In that case the above wait for all requests means that all I/O
> +	 * has now finished, and we can restart from a clean slate.  Let the
> +	 * old requests go away and start from scratch instead.
> +	 */
> +	if (removed) {
> +		nfs_unroll_locks(head, head);
> +		nfs_unlock_and_release_request(head);
> +		goto retry;
> +	}

Don't you need a waitqueue or something to avoid excessive restarts
until the I/O completes?
Christoph Hellwig July 3, 2024, 4:25 a.m. UTC | #2
On Tue, Jul 02, 2024 at 11:07:13AM +0300, Sagi Grimberg wrote:
> On 01/07/2024 8:26, Christoph Hellwig wrote:
>> When NFS requests are split into sub-requests, nfs_inode_remove_request
>> calls nfs_page_group_sync_on_bit to set PG_REMOVE on this sub-request and
>> only completes the head request once PG_REMOVE is set on all requests.
>> This means that when nfs_lock_and_join_requests sees a PG_REMOVE bit, I/O
>> on the request is in progress and has partially completed.  If such a
>> request is returned to nfs_try_to_update_request, it could be extended
>> with the newly dirtied region and I/O for the combined range would be
>> re-scheduled, leading to extra I/O.
>
> Probably worth noting in the change log that large folios make this
> potentially much worse?

That assumes large folios actually create more subrequests.  One big
reason to create subrequests is flexfiles mirroring, which of course
doesn't change with large folios.  The other is when ->pg_test doesn't
allow the nfs_page to cover everything, which is roughly bounded by a
page array allocation and, for pNFS, by the layout segment size; the
chance for that to fail could very slightly increase.
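
For readers not familiar with the hook referenced above: ->pg_test is the
callback in struct nfs_pageio_ops that decides how many bytes of a new
nfs_page may be coalesced into the I/O currently being built, which is what
roughly bounds a request at the page array size or, for pNFS, at the layout
segment.  A rough sketch of its shape (the struct name below is a
placeholder; the real member lives in struct nfs_pageio_ops in
include/linux/nfs_page.h):

	struct nfs_pageio_ops_sketch {
		/*
		 * Return how many bytes of @req can be coalesced into
		 * the I/O being built up in @desc after @prev; roughly
		 * speaking, a return smaller than the request's length
		 * causes the request to be split into sub-requests.
		 */
		size_t (*pg_test)(struct nfs_pageio_descriptor *desc,
				  struct nfs_page *prev,
				  struct nfs_page *req);
		/* ... other callbacks elided ... */
	};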

Patch

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 2c089444303982..4dffdc5aadb2e2 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -144,31 +144,6 @@  static void nfs_io_completion_put(struct nfs_io_completion *ioc)
 		kref_put(&ioc->refcount, nfs_io_completion_release);
 }
 
-static void
-nfs_page_set_inode_ref(struct nfs_page *req, struct inode *inode)
-{
-	if (!test_and_set_bit(PG_INODE_REF, &req->wb_flags)) {
-		kref_get(&req->wb_kref);
-		atomic_long_inc(&NFS_I(inode)->nrequests);
-	}
-}
-
-static int
-nfs_cancel_remove_inode(struct nfs_page *req, struct inode *inode)
-{
-	int ret;
-
-	if (!test_bit(PG_REMOVE, &req->wb_flags))
-		return 0;
-	ret = nfs_page_group_lock(req);
-	if (ret)
-		return ret;
-	if (test_and_clear_bit(PG_REMOVE, &req->wb_flags))
-		nfs_page_set_inode_ref(req, inode);
-	nfs_page_group_unlock(req);
-	return 0;
-}
-
 /**
  * nfs_folio_find_head_request - find head request associated with a folio
  * @folio: pointer to folio
@@ -564,6 +539,7 @@  static struct nfs_page *nfs_lock_and_join_requests(struct folio *folio)
 	struct inode *inode = folio->mapping->host;
 	struct nfs_page *head, *subreq;
 	struct nfs_commit_info cinfo;
+	bool removed;
 	int ret;
 
 	/*
@@ -588,18 +564,18 @@  static struct nfs_page *nfs_lock_and_join_requests(struct folio *folio)
 		goto retry;
 	}
 
-	ret = nfs_cancel_remove_inode(head, inode);
-	if (ret < 0)
-		goto out_unlock;
-
 	ret = nfs_page_group_lock(head);
 	if (ret < 0)
 		goto out_unlock;
 
+	removed = test_bit(PG_REMOVE, &head->wb_flags);
+
 	/* lock each request in the page group */
 	for (subreq = head->wb_this_page;
 	     subreq != head;
 	     subreq = subreq->wb_this_page) {
+		if (test_bit(PG_REMOVE, &subreq->wb_flags))
+			removed = true;
 		ret = nfs_page_group_lock_subreq(head, subreq);
 		if (ret < 0)
 			goto out_unlock;
@@ -607,6 +583,21 @@  static struct nfs_page *nfs_lock_and_join_requests(struct folio *folio)
 
 	nfs_page_group_unlock(head);
 
+	/*
+	 * If PG_REMOVE is set on any request, I/O on that request has
+	 * completed, but some requests were still under I/O at the time
+	 * we locked the head request.
+	 *
+	 * In that case the above wait for all requests means that all I/O
+	 * has now finished, and we can restart from a clean slate.  Let the
+	 * old requests go away and start from scratch instead.
+	 */
+	if (removed) {
+		nfs_unroll_locks(head, head);
+		nfs_unlock_and_release_request(head);
+		goto retry;
+	}
+
 	nfs_init_cinfo_from_inode(&cinfo, inode);
 	nfs_join_page_group(head, &cinfo, inode);
 	return head;