Message ID | 20240712-b4-rst-updates-v3-1-5cf27dac98a7@kernel.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | btrfs: more RAID stripe tree updates | expand |
On Fri, Jul 12, 2024 at 8:49 AM Johannes Thumshirn <jth@kernel.org> wrote: > > From: Johannes Thumshirn <johannes.thumshirn@wdc.com> > > Don't hold the dev_replace rwsem for the entirety of btrfs_map_block(). > > It is only needed to protect > a) calls to find_live_mirror() and > b) calling into handle_ops_on_dev_replace(). > > But there is no need to hold the rwsem for any kind of set_io_stripe() > calls. > > So relax taking the dev_replace rwsem to only protect both cases and check > if the device replace status has changed in the meantime, for which we have > to re-do the find_live_mirror() calls. > > This fixes a deadlock on raid-stripe-tree where device replace performs a > scrub operation, which in turn calls into btrfs_map_block() to find the > physical location of the block. > > Cc: Filipe Manana <fdmanana@suse.com> > Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> > Reviewed-by: Josef Bacik <josef@toxicpanda.com> > Reviewed-by: Qu Wenruo <wqu@suse.com> > --- > fs/btrfs/volumes.c | 28 +++++++++++++++++----------- > 1 file changed, 17 insertions(+), 11 deletions(-) > > diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c > index fcedc43ef291..4209419244a1 100644 > --- a/fs/btrfs/volumes.c > +++ b/fs/btrfs/volumes.c > @@ -6650,14 +6650,9 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, > max_len = btrfs_max_io_len(map, map_offset, &io_geom); > *length = min_t(u64, map->chunk_len - map_offset, max_len); > > +again: > down_read(&dev_replace->rwsem); > dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); > - /* > - * Hold the semaphore for read during the whole operation, write is > - * requested at commit time but must wait. 
> - */ > - if (!dev_replace_is_ongoing) > - up_read(&dev_replace->rwsem); > > switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { > case BTRFS_BLOCK_GROUP_RAID0: > @@ -6695,6 +6690,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, > "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u", > io_geom.stripe_index, map->num_stripes); > ret = -EINVAL; > + up_read(&dev_replace->rwsem); > goto out; > } > > @@ -6710,6 +6706,8 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, > */ > num_alloc_stripes += 2; > > + up_read(&dev_replace->rwsem); > + > /* > * If this I/O maps to a single device, try to return the device and > * physical block information on the stack instead of allocating an > @@ -6782,6 +6780,18 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, > goto out; > } > > + /* > + * Check if something changed the dev_replace state since > + * we've checked it for the last time and if redo the whole > + * mapping operation. > + */ > + down_read(&dev_replace->rwsem); > + if (dev_replace_is_ongoing != > + btrfs_dev_replace_is_ongoing(dev_replace)) { > + up_read(&dev_replace->rwsem); > + goto again; We previously allocated bioc, so before the goto we have to free it (call btrfs_put_bioc(bioc)), otherwise we'll leak it as after the goto we end up allocating a new one. Otherwise it looks fine, thanks. 
> + } > + > if (op != BTRFS_MAP_READ) > io_geom.max_errors = btrfs_chunk_max_errors(map); > > @@ -6789,6 +6799,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, > op != BTRFS_MAP_READ) { > handle_ops_on_dev_replace(bioc, dev_replace, logical, &io_geom); > } > + up_read(&dev_replace->rwsem); > > *bioc_ret = bioc; > bioc->num_stripes = io_geom.num_stripes; > @@ -6796,11 +6807,6 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, > bioc->mirror_num = io_geom.mirror_num; > > out: > - if (dev_replace_is_ongoing) { > - lockdep_assert_held(&dev_replace->rwsem); > - /* Unlock and let waiting writers proceed */ > - up_read(&dev_replace->rwsem); > - } > btrfs_free_chunk_map(map); > return ret; > } > > -- > 2.43.0 > >
On 15.07.24 13:29, Filipe Manana wrote: > On Fri, Jul 12, 2024 at 8:49 AM Johannes Thumshirn <jth@kernel.org> wrote: >> >> From: Johannes Thumshirn <johannes.thumshirn@wdc.com> >> >> Don't hold the dev_replace rwsem for the entirety of btrfs_map_block(). >> >> It is only needed to protect >> a) calls to find_live_mirror() and >> b) calling into handle_ops_on_dev_replace(). >> >> But there is no need to hold the rwsem for any kind of set_io_stripe() >> calls. >> >> So relax taking the dev_replace rwsem to only protect both cases and check >> if the device replace status has changed in the meantime, for which we have >> to re-do the find_live_mirror() calls. >> >> This fixes a deadlock on raid-stripe-tree where device replace performs a >> scrub operation, which in turn calls into btrfs_map_block() to find the >> physical location of the block. >> >> Cc: Filipe Manana <fdmanana@suse.com> >> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> >> Reviewed-by: Josef Bacik <josef@toxicpanda.com> >> Reviewed-by: Qu Wenruo <wqu@suse.com> >> --- >> fs/btrfs/volumes.c | 28 +++++++++++++++++----------- >> 1 file changed, 17 insertions(+), 11 deletions(-) >> >> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c >> index fcedc43ef291..4209419244a1 100644 >> --- a/fs/btrfs/volumes.c >> +++ b/fs/btrfs/volumes.c >> @@ -6650,14 +6650,9 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, >> max_len = btrfs_max_io_len(map, map_offset, &io_geom); >> *length = min_t(u64, map->chunk_len - map_offset, max_len); >> >> +again: >> down_read(&dev_replace->rwsem); >> dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); >> - /* >> - * Hold the semaphore for read during the whole operation, write is >> - * requested at commit time but must wait. 
>> - */ >> - if (!dev_replace_is_ongoing) >> - up_read(&dev_replace->rwsem); >> >> switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { >> case BTRFS_BLOCK_GROUP_RAID0: >> @@ -6695,6 +6690,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, >> "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u", >> io_geom.stripe_index, map->num_stripes); >> ret = -EINVAL; >> + up_read(&dev_replace->rwsem); >> goto out; >> } >> >> @@ -6710,6 +6706,8 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, >> */ >> num_alloc_stripes += 2; >> >> + up_read(&dev_replace->rwsem); >> + >> /* >> * If this I/O maps to a single device, try to return the device and >> * physical block information on the stack instead of allocating an >> @@ -6782,6 +6780,18 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, >> goto out; >> } >> >> + /* >> + * Check if something changed the dev_replace state since >> + * we've checked it for the last time and if redo the whole >> + * mapping operation. >> + */ >> + down_read(&dev_replace->rwsem); >> + if (dev_replace_is_ongoing != >> + btrfs_dev_replace_is_ongoing(dev_replace)) { >> + up_read(&dev_replace->rwsem); >> + goto again; > > We previously allocated bioc, so before the goto we have to free it > (call btrfs_put_bioc(bioc)), otherwise we'll leak it as after the goto > we end up allocating a new one. > > Otherwise it looks fine, thanks. > Good catch, will update.
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index fcedc43ef291..4209419244a1 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6650,14 +6650,9 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, max_len = btrfs_max_io_len(map, map_offset, &io_geom); *length = min_t(u64, map->chunk_len - map_offset, max_len); +again: down_read(&dev_replace->rwsem); dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); - /* - * Hold the semaphore for read during the whole operation, write is - * requested at commit time but must wait. - */ - if (!dev_replace_is_ongoing) - up_read(&dev_replace->rwsem); switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { case BTRFS_BLOCK_GROUP_RAID0: @@ -6695,6 +6690,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u", io_geom.stripe_index, map->num_stripes); ret = -EINVAL; + up_read(&dev_replace->rwsem); goto out; } @@ -6710,6 +6706,8 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, */ num_alloc_stripes += 2; + up_read(&dev_replace->rwsem); + /* * If this I/O maps to a single device, try to return the device and * physical block information on the stack instead of allocating an @@ -6782,6 +6780,18 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, goto out; } + /* + * Check if something changed the dev_replace state since + * we last checked it and, if so, redo the whole + * mapping operation. 
+ */ + down_read(&dev_replace->rwsem); + if (dev_replace_is_ongoing != + btrfs_dev_replace_is_ongoing(dev_replace)) { + up_read(&dev_replace->rwsem); + goto again; + } + if (op != BTRFS_MAP_READ) io_geom.max_errors = btrfs_chunk_max_errors(map); @@ -6789,6 +6799,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, op != BTRFS_MAP_READ) { handle_ops_on_dev_replace(bioc, dev_replace, logical, &io_geom); } + up_read(&dev_replace->rwsem); *bioc_ret = bioc; bioc->num_stripes = io_geom.num_stripes; @@ -6796,11 +6807,6 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, bioc->mirror_num = io_geom.mirror_num; out: - if (dev_replace_is_ongoing) { - lockdep_assert_held(&dev_replace->rwsem); - /* Unlock and let waiting writers proceed */ - up_read(&dev_replace->rwsem); - } btrfs_free_chunk_map(map); return ret; }