[RFC,05/31] btrfs: writepage() lock extent before pages

Message ID: 412df27129a676f4bca6724960350569573a3791.1623567940.git.rgoldwyn@suse.com (mailing list archive)
State: New, archived
Series: btrfs buffered iomap support

Commit Message

Goldwyn Rodrigues June 13, 2021, 1:39 p.m. UTC
From: Goldwyn Rodrigues <rgoldwyn@suse.com>

Lock Order change: Extent locks before page locks

writepage() is called with the page locked. So make an attempt
to lock the extents and proceed only if successful.

The new function best_effort_lock_extent() attempts to lock the range
provided.

If the entire range could not be locked but the locked portion still covers
the locked page, work with the reduced range by adjusting delalloc_end.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 fs/btrfs/extent_io.c | 66 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 51 insertions(+), 15 deletions(-)

Comments

Nikolay Borisov June 15, 2021, 3:47 p.m. UTC | #1
On 13.06.21 16:39, Goldwyn Rodrigues wrote:
> From: Goldwyn Rodrigues <rgoldwyn@suse.com>
> 
> Lock Order change: Extent locks before page locks
> 
> writepage() is called with the page locked. So make an attempt
> to lock the extents and proceed only if successful.
> 
> The new function best_effort_lock_extent() attempts to lock the range
> provided.
> 
> If the entire range could not be locked but the locked portion still covers
> the locked page, work with the reduced range by adjusting delalloc_end.
> 
> Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
> ---
>  fs/btrfs/extent_io.c | 66 ++++++++++++++++++++++++++++++++++----------
>  1 file changed, 51 insertions(+), 15 deletions(-)
> 
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index 9e81d25dea70..75ad809e8c86 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -1488,6 +1488,47 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
>  	return 1;
>  }
>  
> +/*
> + * best_effort_lock_extent() - lock the extent range on a best-effort basis,
> + * making sure that at least the minimum range is locked and that the
> + * delalloc bits are set. If the full requested range cannot be locked,
> + * delalloc_end is shifted back to the end of the range that could be locked.
> + */
> +static bool best_effort_lock_extent(struct extent_io_tree *tree, u64 start,
> +		u64 *delalloc_end, u64 min_end, struct extent_state **cached_state)
> +{
> +	u64 failed_start;
> +	u64 end = *delalloc_end;
> +	int ret;
> +
> +	ret = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
> +			     &failed_start, cached_state, GFP_NOFS, NULL);
> +
> +	if (!ret)
> +		return false;

The locking sequence here seems broken. Based on the behavior of
lock_extent_bits:

1. ret == 0 means the range was successfully locked, yet here you return
false.

2. ret can be a number of errors, including EEXIST; the original code
basically waits for the extents to be unlocked and retries.
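
For context, the wait-and-retry behaviour referred to above is what
lock_extent_bits() does; a paraphrased sketch of that pattern (close to,
but not verbatim from, the tree at the time):

int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
		     struct extent_state **cached_state)
{
	u64 failed_start;
	int err;

	while (1) {
		err = set_extent_bit(tree, start, end, EXTENT_LOCKED,
				     EXTENT_LOCKED, &failed_start,
				     cached_state, GFP_NOFS, NULL);
		if (err != -EEXIST)
			break;
		/* Part of the range is already locked: wait for it and retry */
		wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
		start = failed_start;
	}
	return err;
}

In other words, err == 0 is the success case, and -EEXIST leads to a wait
rather than an immediate failure.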

> +
> +	if (failed_start < min_end) {
> +		/* Could not lock the whole range */
> +		if (failed_start > start)
> +			unlock_extent_cached(tree, start,
> +					failed_start - 1, cached_state);
> +		return false;
> +	} else if (end > failed_start) {
> +		/* Work with what could be locked */
> +		end = failed_start - 1;

nit: Instead of assigning *delalloc_end to end and doing the check here, I
think it's more straightforward to perform the check with failed_start and
*delalloc_end, and only then assign failed_start - 1 to end.
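
One way to read that suggestion (illustrative only, not a proposed hunk):

	} else if (failed_start < *delalloc_end) {
		/* Work with what could be locked */
		end = failed_start - 1;
	}

Since end still equals *delalloc_end at that point, the check is the same;
it just makes explicit which two values are being compared.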


> +	}
> +
> +	/* Check if delalloc bits are set */
> +	ret = test_range_bit(tree, start, end,
> +			     EXTENT_DELALLOC, 1, *cached_state);
> +	if (!ret) {
> +		unlock_extent_cached(tree, start, end - 1, cached_state);
> +		return false;
> +	}
> +	*delalloc_end = end;
> +	return true;
> +}
> +
>  void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
>  {
>  	unsigned long index = start >> PAGE_SHIFT;
> @@ -2018,7 +2059,16 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
>  	if (delalloc_end + 1 - delalloc_start > max_bytes)
>  		delalloc_end = delalloc_start + max_bytes - 1;
>  
> -	/* step two, lock all the pages after the page that has start */
> +	/* step two, lock the state bits for the range */
> +	found = best_effort_lock_extent(tree, delalloc_start, &delalloc_end,
> +			((page_index(locked_page) + 1) << PAGE_SHIFT),

page_index(locked_page) + 1 means you'd like to ensure you have locked at
least 1 page beyond the locked page, correct? This looks a bit arbitrary,
i.e. why 1 page and not 2, for example?
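
For reference, the min_end value being questioned works out to the byte
offset of the first byte after locked_page; a worked example, assuming
4 KiB pages:

	/*
	 * With PAGE_SHIFT == 12 and page_index(locked_page) == 3:
	 *   min_end = (3 + 1) << 12 = 0x4000
	 * i.e. the start of the page that follows locked_page, so the
	 * best-effort lock has to reach at least the end of locked_page.
	 */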

> +			&cached_state);
> +	if (!found) {
> +		free_extent_state(cached_state);
> +		goto out_failed;
> +	}
> +
> +	/* step three, lock all the pages after the page that has start */
>  	ret = lock_delalloc_pages(inode, locked_page,
>  				  delalloc_start, delalloc_end);

So now that this has become step three and the extents are locked, you
should implement relevant error handling that unlocks the extents in case
of an error.
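
A minimal sketch of the kind of error handling being asked for (an
assumption about how it could look, not a proposed hunk; the existing
-EAGAIN handling is left untouched):

	ret = lock_delalloc_pages(inode, locked_page,
				  delalloc_start, delalloc_end);
	if (ret == -EAGAIN) {
		/*
		 * Some pages were gone while we held the extent lock taken
		 * in step two: drop it before retrying with a smaller range
		 * or giving up.
		 */
		unlock_extent_cached(tree, delalloc_start, delalloc_end,
				     &cached_state);
		/* ... existing -EAGAIN handling continues here ... */
	}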

>  	ASSERT(!ret || ret == -EAGAIN);
> @@ -2038,20 +2088,6 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
>  		}
>  	}
>  
> -	/* step three, lock the state bits for the whole range */
> -	lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state);
> -
> -	/* then test to make sure it is all still delalloc */
> -	ret = test_range_bit(tree, delalloc_start, delalloc_end,
> -			     EXTENT_DELALLOC, 1, cached_state);
> -	if (!ret) {
> -		unlock_extent_cached(tree, delalloc_start, delalloc_end,
> -				     &cached_state);
> -		__unlock_for_delalloc(inode, locked_page,
> -			      delalloc_start, delalloc_end);
> -		cond_resched();
> -		goto again;
> -	}
>  	free_extent_state(cached_state);
>  	*start = delalloc_start;
>  	*end = delalloc_end;
>

Patch

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 9e81d25dea70..75ad809e8c86 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1488,6 +1488,47 @@  int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
 	return 1;
 }
 
+/*
+ * best_effort_lock_extent() - lock the extent range on a best-effort basis,
+ * making sure that at least the minimum range is locked and that the
+ * delalloc bits are set. If the full requested range cannot be locked,
+ * delalloc_end is shifted back to the end of the range that could be locked.
+ */
+static bool best_effort_lock_extent(struct extent_io_tree *tree, u64 start,
+		u64 *delalloc_end, u64 min_end, struct extent_state **cached_state)
+{
+	u64 failed_start;
+	u64 end = *delalloc_end;
+	int ret;
+
+	ret = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
+			     &failed_start, cached_state, GFP_NOFS, NULL);
+
+	if (!ret)
+		return false;
+
+	if (failed_start < min_end) {
+		/* Could not lock the whole range */
+		if (failed_start > start)
+			unlock_extent_cached(tree, start,
+					failed_start - 1, cached_state);
+		return false;
+	} else if (end > failed_start) {
+		/* Work with what could be locked */
+		end = failed_start - 1;
+	}
+
+	/* Check if delalloc bits are set */
+	ret = test_range_bit(tree, start, end,
+			     EXTENT_DELALLOC, 1, *cached_state);
+	if (!ret) {
+		unlock_extent_cached(tree, start, end - 1, cached_state);
+		return false;
+	}
+	*delalloc_end = end;
+	return true;
+}
+
 void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
 {
 	unsigned long index = start >> PAGE_SHIFT;
@@ -2018,7 +2059,16 @@  noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
 	if (delalloc_end + 1 - delalloc_start > max_bytes)
 		delalloc_end = delalloc_start + max_bytes - 1;
 
-	/* step two, lock all the pages after the page that has start */
+	/* step two, lock the state bits for the range */
+	found = best_effort_lock_extent(tree, delalloc_start, &delalloc_end,
+			((page_index(locked_page) + 1) << PAGE_SHIFT),
+			&cached_state);
+	if (!found) {
+		free_extent_state(cached_state);
+		goto out_failed;
+	}
+
+	/* step three, lock all the pages after the page that has start */
 	ret = lock_delalloc_pages(inode, locked_page,
 				  delalloc_start, delalloc_end);
 	ASSERT(!ret || ret == -EAGAIN);
@@ -2038,20 +2088,6 @@  noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
 		}
 	}
 
-	/* step three, lock the state bits for the whole range */
-	lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state);
-
-	/* then test to make sure it is all still delalloc */
-	ret = test_range_bit(tree, delalloc_start, delalloc_end,
-			     EXTENT_DELALLOC, 1, cached_state);
-	if (!ret) {
-		unlock_extent_cached(tree, delalloc_start, delalloc_end,
-				     &cached_state);
-		__unlock_for_delalloc(inode, locked_page,
-			      delalloc_start, delalloc_end);
-		cond_resched();
-		goto again;
-	}
 	free_extent_state(cached_state);
 	*start = delalloc_start;
 	*end = delalloc_end;