diff mbox series

[v3,1/3] btrfs: don't hold dev_replace rwsem over whole of btrfs_map_block

Message ID 20240712-b4-rst-updates-v3-1-5cf27dac98a7@kernel.org (mailing list archive)
State New, archived
Headers show
Series btrfs: more RAID stripe tree updates | expand

Commit Message

Johannes Thumshirn July 12, 2024, 7:48 a.m. UTC
From: Johannes Thumshirn <johannes.thumshirn@wdc.com>

Don't hold the dev_replace rwsem for the entirety of btrfs_map_block().

It is only needed to protect
a) calls to find_live_mirror() and
b) calling into handle_ops_on_dev_replace().

But there is no need to hold the rwsem for any kind of set_io_stripe()
calls.

So relax taking the dev_replace rwsem to only protect both cases and check
if the device replace status has changed in the meantime, for which we have
to re-do the find_live_mirror() calls.

This fixes a deadlock on raid-stripe-tree where device replace performs a
scrub operation, which in turn calls into btrfs_map_block() to find the
physical location of the block.

Cc: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/volumes.c | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

Comments

Filipe Manana July 15, 2024, 11:29 a.m. UTC | #1
On Fri, Jul 12, 2024 at 8:49 AM Johannes Thumshirn <jth@kernel.org> wrote:
>
> From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>
> Don't hold the dev_replace rwsem for the entirety of btrfs_map_block().
>
> It is only needed to protect
> a) calls to find_live_mirror() and
> b) calling into handle_ops_on_dev_replace().
>
> But there is no need to hold the rwsem for any kind of set_io_stripe()
> calls.
>
> So relax taking the dev_replace rwsem to only protect both cases and check
> if the device replace status has changed in the meantime, for which we have
> to re-do the find_live_mirror() calls.
>
> This fixes a deadlock on raid-stripe-tree where device replace performs a
> scrub operation, which in turn calls into btrfs_map_block() to find the
> physical location of the block.
>
> Cc: Filipe Manana <fdmanana@suse.com>
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> Reviewed-by: Josef Bacik <josef@toxicpanda.com>
> Reviewed-by: Qu Wenruo <wqu@suse.com>
> ---
>  fs/btrfs/volumes.c | 28 +++++++++++++++++-----------
>  1 file changed, 17 insertions(+), 11 deletions(-)
>
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index fcedc43ef291..4209419244a1 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -6650,14 +6650,9 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
>         max_len = btrfs_max_io_len(map, map_offset, &io_geom);
>         *length = min_t(u64, map->chunk_len - map_offset, max_len);
>
> +again:
>         down_read(&dev_replace->rwsem);
>         dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
> -       /*
> -        * Hold the semaphore for read during the whole operation, write is
> -        * requested at commit time but must wait.
> -        */
> -       if (!dev_replace_is_ongoing)
> -               up_read(&dev_replace->rwsem);
>
>         switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
>         case BTRFS_BLOCK_GROUP_RAID0:
> @@ -6695,6 +6690,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
>                            "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
>                            io_geom.stripe_index, map->num_stripes);
>                 ret = -EINVAL;
> +               up_read(&dev_replace->rwsem);
>                 goto out;
>         }
>
> @@ -6710,6 +6706,8 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
>                  */
>                 num_alloc_stripes += 2;
>
> +       up_read(&dev_replace->rwsem);
> +
>         /*
>          * If this I/O maps to a single device, try to return the device and
>          * physical block information on the stack instead of allocating an
> @@ -6782,6 +6780,18 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
>                 goto out;
>         }
>
> +       /*
> +        * Check if something changed the dev_replace state since
> +        * we've checked it for the last time and if redo the whole
> +        * mapping operation.
> +        */
> +       down_read(&dev_replace->rwsem);
> +       if (dev_replace_is_ongoing !=
> +           btrfs_dev_replace_is_ongoing(dev_replace)) {
> +               up_read(&dev_replace->rwsem);
> +               goto again;

We previously allocated bioc, so before the goto we have to free it
(call btrfs_put_bioc(bioc)), otherwise we'll leak it as after the goto
we end up allocating a new one.

Otherwise it looks fine, thanks.

> +       }
> +
>         if (op != BTRFS_MAP_READ)
>                 io_geom.max_errors = btrfs_chunk_max_errors(map);
>
> @@ -6789,6 +6799,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
>             op != BTRFS_MAP_READ) {
>                 handle_ops_on_dev_replace(bioc, dev_replace, logical, &io_geom);
>         }
> +       up_read(&dev_replace->rwsem);
>
>         *bioc_ret = bioc;
>         bioc->num_stripes = io_geom.num_stripes;
> @@ -6796,11 +6807,6 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
>         bioc->mirror_num = io_geom.mirror_num;
>
>  out:
> -       if (dev_replace_is_ongoing) {
> -               lockdep_assert_held(&dev_replace->rwsem);
> -               /* Unlock and let waiting writers proceed */
> -               up_read(&dev_replace->rwsem);
> -       }
>         btrfs_free_chunk_map(map);
>         return ret;
>  }
>
> --
> 2.43.0
>
>
Johannes Thumshirn July 15, 2024, 11:38 a.m. UTC | #2
On 15.07.24 13:29, Filipe Manana wrote:
> On Fri, Jul 12, 2024 at 8:49 AM Johannes Thumshirn <jth@kernel.org> wrote:
>>
>> From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>>
>> Don't hold the dev_replace rwsem for the entirety of btrfs_map_block().
>>
>> It is only needed to protect
>> a) calls to find_live_mirror() and
>> b) calling into handle_ops_on_dev_replace().
>>
>> But there is no need to hold the rwsem for any kind of set_io_stripe()
>> calls.
>>
>> So relax taking the dev_replace rwsem to only protect both cases and check
>> if the device replace status has changed in the meantime, for which we have
>> to re-do the find_live_mirror() calls.
>>
>> This fixes a deadlock on raid-stripe-tree where device replace performs a
>> scrub operation, which in turn calls into btrfs_map_block() to find the
>> physical location of the block.
>>
>> Cc: Filipe Manana <fdmanana@suse.com>
>> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>> Reviewed-by: Josef Bacik <josef@toxicpanda.com>
>> Reviewed-by: Qu Wenruo <wqu@suse.com>
>> ---
>>   fs/btrfs/volumes.c | 28 +++++++++++++++++-----------
>>   1 file changed, 17 insertions(+), 11 deletions(-)
>>
>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>> index fcedc43ef291..4209419244a1 100644
>> --- a/fs/btrfs/volumes.c
>> +++ b/fs/btrfs/volumes.c
>> @@ -6650,14 +6650,9 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
>>          max_len = btrfs_max_io_len(map, map_offset, &io_geom);
>>          *length = min_t(u64, map->chunk_len - map_offset, max_len);
>>
>> +again:
>>          down_read(&dev_replace->rwsem);
>>          dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
>> -       /*
>> -        * Hold the semaphore for read during the whole operation, write is
>> -        * requested at commit time but must wait.
>> -        */
>> -       if (!dev_replace_is_ongoing)
>> -               up_read(&dev_replace->rwsem);
>>
>>          switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
>>          case BTRFS_BLOCK_GROUP_RAID0:
>> @@ -6695,6 +6690,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
>>                             "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
>>                             io_geom.stripe_index, map->num_stripes);
>>                  ret = -EINVAL;
>> +               up_read(&dev_replace->rwsem);
>>                  goto out;
>>          }
>>
>> @@ -6710,6 +6706,8 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
>>                   */
>>                  num_alloc_stripes += 2;
>>
>> +       up_read(&dev_replace->rwsem);
>> +
>>          /*
>>           * If this I/O maps to a single device, try to return the device and
>>           * physical block information on the stack instead of allocating an
>> @@ -6782,6 +6780,18 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
>>                  goto out;
>>          }
>>
>> +       /*
>> +        * Check if something changed the dev_replace state since
>> +        * we've checked it for the last time and if redo the whole
>> +        * mapping operation.
>> +        */
>> +       down_read(&dev_replace->rwsem);
>> +       if (dev_replace_is_ongoing !=
>> +           btrfs_dev_replace_is_ongoing(dev_replace)) {
>> +               up_read(&dev_replace->rwsem);
>> +               goto again;
> 
> We previously allocated bioc, so before the goto we have to free it
> (call btrfs_put_bioc(bioc)), otherwise we'll leak it as after the goto
> we end up allocating a new one.
> 
> Otherwise it looks fine, thanks.
> 

Good catch, will update.
diff mbox series

Patch

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index fcedc43ef291..4209419244a1 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6650,14 +6650,9 @@  int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 	max_len = btrfs_max_io_len(map, map_offset, &io_geom);
 	*length = min_t(u64, map->chunk_len - map_offset, max_len);
 
+again:
 	down_read(&dev_replace->rwsem);
 	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
-	/*
-	 * Hold the semaphore for read during the whole operation, write is
-	 * requested at commit time but must wait.
-	 */
-	if (!dev_replace_is_ongoing)
-		up_read(&dev_replace->rwsem);
 
 	switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
 	case BTRFS_BLOCK_GROUP_RAID0:
@@ -6695,6 +6690,7 @@  int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 			   "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
 			   io_geom.stripe_index, map->num_stripes);
 		ret = -EINVAL;
+		up_read(&dev_replace->rwsem);
 		goto out;
 	}
 
@@ -6710,6 +6706,8 @@  int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 		 */
 		num_alloc_stripes += 2;
 
+	up_read(&dev_replace->rwsem);
+
 	/*
 	 * If this I/O maps to a single device, try to return the device and
 	 * physical block information on the stack instead of allocating an
@@ -6782,6 +6780,18 @@  int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 		goto out;
 	}
 
+	/*
+	 * Check if something changed the dev_replace state since
+	 * we've checked it for the last time and if redo the whole
+	 * mapping operation.
+	 */
+	down_read(&dev_replace->rwsem);
+	if (dev_replace_is_ongoing !=
+	    btrfs_dev_replace_is_ongoing(dev_replace)) {
+		up_read(&dev_replace->rwsem);
+		goto again;
+	}
+
 	if (op != BTRFS_MAP_READ)
 		io_geom.max_errors = btrfs_chunk_max_errors(map);
 
@@ -6789,6 +6799,7 @@  int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 	    op != BTRFS_MAP_READ) {
 		handle_ops_on_dev_replace(bioc, dev_replace, logical, &io_geom);
 	}
+	up_read(&dev_replace->rwsem);
 
 	*bioc_ret = bioc;
 	bioc->num_stripes = io_geom.num_stripes;
@@ -6796,11 +6807,6 @@  int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 	bioc->mirror_num = io_geom.mirror_num;
 
 out:
-	if (dev_replace_is_ongoing) {
-		lockdep_assert_held(&dev_replace->rwsem);
-		/* Unlock and let waiting writers proceed */
-		up_read(&dev_replace->rwsem);
-	}
 	btrfs_free_chunk_map(map);
 	return ret;
 }