diff mbox series

[RFC] btrfs: do not use the replace target device as an extra mirror

Message ID 2902738be4657ec16e5b5dd38eac1fb53aa5cc44.1675918006.git.wqu@suse.com (mailing list archive)
State New, archived
Headers show
Series [RFC] btrfs: do not use the replace target device as an extra mirror | expand

Commit Message

Qu Wenruo Feb. 9, 2023, 4:47 a.m. UTC
Since the ancient time of btrfs, if a dev-replace is running, we can use
that replace target as an extra mirror for read.

But there are some extra problems with that:

- No reliable checks on if that target device is even involved
  For profiles like RAID0/RAID10, it's very possible that the replace
  is happening on one device which is not involved in the stripe.

  E.g. a 4-disks RAID0, involving disk A~D, and disk A is being replaced.
  In that case, if our read lands at disk B, there is no extra mirror to
  start with.

- No indicator on whether the target device even contains correct data
  Since dev-replace is running, the target device is progressively
  filled with data from the source device.

  Thus if our read is beyond the currently replaced range, we may just
  read out some garbage.
  This can be extremely dangerous if the range doesn't have data
  checksum, then we may silently trust the garbage.

Thus this RFC patch would just remove this feature.

Yes, I know this would reduce the chance we recover some data, if we're
replacing a failing disk.
But in that case, if we're really relying on the failing disk, but not
some extra mirrors, I'd say we have a much bigger problem.

Signed-off-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/volumes.c | 124 +++------------------------------------------
 1 file changed, 7 insertions(+), 117 deletions(-)

Comments

Qu Wenruo Feb. 14, 2023, 6:47 a.m. UTC | #1
Any feedback on this?

Whether we get rid of the extra mirror of dev-replace would greatly 
impact how my incoming refactor on __btrfs_map_block() go.

Thanks,
Qu

On 2023/2/9 12:47, Qu Wenruo wrote:
> Since the ancient time of btrfs, if a dev-replace is running, we can use
> that replace target as an extra mirror for read.
> 
> But there are some extra problems with that:
> 
> - No reliable checks on if that target device is even involved
>    For profiles like RAID0/RAID10, it's very possible that the replace
>    is happening on one device which is not involved in the stripe.
> 
>    E.g. a 4-disks RAID0, involving disk A~D, and disk A is being replaced.
>    In that case, if our read lands at disk B, there is no extra mirror to
>    start with.
> 
> - No indicator on whether the target device even contains correct data
>    Since dev-replace is running, the target device is progressively
>    filled with data from the source device.
> 
>    Thus if our read is beyond the currently replaced range, we may just
>    read out some garbage.
>    This can be extremely dangerous if the range doesn't have data
>    checksum, then we may silently trust the garbage.
> 
> Thus this RFC patch would just remove this feature.
> 
> Yes, I know this would reduce the chance we recover some data, if we're
> replacing a failing disk.
> But in that case, if we're really relying on the failing disk, but not
> some extra mirrors, I'd say we have a much bigger problem.
> 
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
>   fs/btrfs/volumes.c | 124 +++------------------------------------------
>   1 file changed, 7 insertions(+), 117 deletions(-)
> 
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 3a2a256fa9cd..385fc89b6702 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -5754,12 +5754,6 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
>   		ret = map->num_stripes;
>   	free_extent_map(em);
>   
> -	down_read(&fs_info->dev_replace.rwsem);
> -	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
> -	    fs_info->dev_replace.tgtdev)
> -		ret++;
> -	up_read(&fs_info->dev_replace.rwsem);
> -
>   	return ret;
>   }
>   
> @@ -6046,83 +6040,6 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
>   	return ERR_PTR(ret);
>   }
>   
> -/*
> - * In dev-replace case, for repair case (that's the only case where the mirror
> - * is selected explicitly when calling btrfs_map_block), blocks left of the
> - * left cursor can also be read from the target drive.
> - *
> - * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the
> - * array of stripes.
> - * For READ, it also needs to be supported using the same mirror number.
> - *
> - * If the requested block is not left of the left cursor, EIO is returned. This
> - * can happen because btrfs_num_copies() returns one more in the dev-replace
> - * case.
> - */
> -static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
> -					 u64 logical, u64 length,
> -					 u64 srcdev_devid, int *mirror_num,
> -					 u64 *physical)
> -{
> -	struct btrfs_io_context *bioc = NULL;
> -	int num_stripes;
> -	int index_srcdev = 0;
> -	int found = 0;
> -	u64 physical_of_found = 0;
> -	int i;
> -	int ret = 0;
> -
> -	ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
> -				logical, &length, &bioc, NULL, NULL, 0);
> -	if (ret) {
> -		ASSERT(bioc == NULL);
> -		return ret;
> -	}
> -
> -	num_stripes = bioc->num_stripes;
> -	if (*mirror_num > num_stripes) {
> -		/*
> -		 * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
> -		 * that means that the requested area is not left of the left
> -		 * cursor
> -		 */
> -		btrfs_put_bioc(bioc);
> -		return -EIO;
> -	}
> -
> -	/*
> -	 * process the rest of the function using the mirror_num of the source
> -	 * drive. Therefore look it up first.  At the end, patch the device
> -	 * pointer to the one of the target drive.
> -	 */
> -	for (i = 0; i < num_stripes; i++) {
> -		if (bioc->stripes[i].dev->devid != srcdev_devid)
> -			continue;
> -
> -		/*
> -		 * In case of DUP, in order to keep it simple, only add the
> -		 * mirror with the lowest physical address
> -		 */
> -		if (found &&
> -		    physical_of_found <= bioc->stripes[i].physical)
> -			continue;
> -
> -		index_srcdev = i;
> -		found = 1;
> -		physical_of_found = bioc->stripes[i].physical;
> -	}
> -
> -	btrfs_put_bioc(bioc);
> -
> -	ASSERT(found);
> -	if (!found)
> -		return -EIO;
> -
> -	*mirror_num = index_srcdev + 1;
> -	*physical = physical_of_found;
> -	return ret;
> -}
> -
>   static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
>   {
>   	struct btrfs_block_group *cache;
> @@ -6335,14 +6252,13 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
>   	int i;
>   	int ret = 0;
>   	int mirror_num = (mirror_num_ret ? *mirror_num_ret : 0);
> +	int num_copies;
>   	int num_stripes;
>   	int max_errors = 0;
>   	struct btrfs_io_context *bioc = NULL;
>   	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
>   	int dev_replace_is_ongoing = 0;
> -	int patch_the_first_stripe_for_dev_replace = 0;
>   	u16 num_alloc_stripes;
> -	u64 physical_to_patch_in_first_stripe = 0;
>   	u64 raid56_full_stripe_start = (u64)-1;
>   	struct btrfs_io_geometry geom;
>   
> @@ -6365,6 +6281,8 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
>   	raid56_full_stripe_start = geom.raid56_stripe_offset;
>   	data_stripes = nr_data_stripes(map);
>   
> +	num_copies = btrfs_num_copies(fs_info, logical, geom.len);
> +
>   	down_read(&dev_replace->rwsem);
>   	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
>   	/*
> @@ -6374,19 +6292,8 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
>   	if (!dev_replace_is_ongoing)
>   		up_read(&dev_replace->rwsem);
>   
> -	if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
> -	    !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
> -		ret = get_extra_mirror_from_replace(fs_info, logical, *length,
> -						    dev_replace->srcdev->devid,
> -						    &mirror_num,
> -					    &physical_to_patch_in_first_stripe);
> -		if (ret)
> -			goto out;
> -		else
> -			patch_the_first_stripe_for_dev_replace = 1;
> -	} else if (mirror_num > map->num_stripes) {
> +	if (mirror_num > num_copies)
>   		mirror_num = 0;
> -	}
>   
>   	num_stripes = 1;
>   	stripe_index = 0;
> @@ -6511,15 +6418,9 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
>   	    !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) &&
>   	    (!need_full_stripe(op) || !dev_replace_is_ongoing ||
>   	     !dev_replace->tgtdev)) {
> -		if (patch_the_first_stripe_for_dev_replace) {
> -			smap->dev = dev_replace->tgtdev;
> -			smap->physical = physical_to_patch_in_first_stripe;
> -			*mirror_num_ret = map->num_stripes + 1;
> -		} else {
> -			set_io_stripe(smap, map, stripe_index, stripe_offset,
> -				      stripe_nr);
> -			*mirror_num_ret = mirror_num;
> -		}
> +		set_io_stripe(smap, map, stripe_index, stripe_offset,
> +			      stripe_nr);
> +		*mirror_num_ret = mirror_num;
>   		*bioc_ret = NULL;
>   		ret = 0;
>   		goto out;
> @@ -6584,17 +6485,6 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
>   	bioc->max_errors = max_errors;
>   	bioc->mirror_num = mirror_num;
>   
> -	/*
> -	 * this is the case that REQ_READ && dev_replace_is_ongoing &&
> -	 * mirror_num == num_stripes + 1 && dev_replace target drive is
> -	 * available as a mirror
> -	 */
> -	if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
> -		WARN_ON(num_stripes > 1);
> -		bioc->stripes[0].dev = dev_replace->tgtdev;
> -		bioc->stripes[0].physical = physical_to_patch_in_first_stripe;
> -		bioc->mirror_num = map->num_stripes + 1;
> -	}
>   out:
>   	if (dev_replace_is_ongoing) {
>   		lockdep_assert_held(&dev_replace->rwsem);
Christoph Hellwig Feb. 14, 2023, 7:06 a.m. UTC | #2
On Thu, Feb 09, 2023 at 12:47:01PM +0800, Qu Wenruo wrote:
> Since the ancient time of btrfs, if a dev-replace is running, we can use
> that replace target as an extra mirror for read.

More specifically this seems to go back to:

Author: Stefan Behrens <sbehrens@giantdisaster.de>
Date:   Tue Nov 6 15:06:47 2012 +0100

    Btrfs: allow repair code to include target disk when searching mirrors

    Make the target disk of a running device replace operation
    available for reading. This is only used as a last ressort for
    the defect repair procedure. And it is dependent on the location
    of the data block to read, because during an ongoing device
    replace operation, the target drive is only partially filled
    with the filesystem data.

and

commit 30d9861ff9520e2a112eae71029bc9f7e915a441
Author: Stefan Behrens <sbehrens@giantdisaster.de>
Date:   Tue Nov 6 14:52:18 2012 +0100

    Btrfs: optionally avoid reads from device replace source drive        

    It is desirable to be able to configure the device replace
    procedure to avoid reading the source drive (the one to be
    copied) whenever possible. This is useful when the number of
    read errors on this disk is high, because it would delay the
    copy procedure alot. Therefore there is an option to avoid
    reading from the source disk unless the repair procedure
    really needs to access it. The regular read req asks for
    mapping the block with mirror_num == 0, in this case the
    source disk is avoided whenever possible. The repair code
    selects the mirror_num explicitly (mirror_num != 0), this
    case is not changed by this commit.

from which I father that the idea is that when a drive is replaced
due to a high number of read errors, it might be a better idea to
redirect it to the target device.

The questions is how much does this matter?  NAND storage has a concept
of read disturb, but we really should not be hitting it in practice.

> But there are some extra problems with that:
> 
> - No reliable checks on if that target device is even involved
>   For profiles like RAID0/RAID10, it's very possible that the replace
>   is happening on one device which is not involved in the stripe.
> 
>   E.g. a 4-disks RAID0, involving disk A~D, and disk A is being replaced.
>   In that case, if our read lands at disk B, there is no extra mirror to
>   start with.
> 
> - No indicator on whether the target device even contains correct data
>   Since dev-replace is running, the target device is progressively
>   filled with data from the source device.
> 
>   Thus if our read is beyond the currently replaced range, we may just
>   read out some garbage.
>   This can be extremely dangerous if the range doesn't have data
>   checksum, then we may silently trust the garbage.

Yes, the way this is done currently seems pretty broken.

But there was a clear intent behind it, event exposed to userspace using
the BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID flag.

So at very least the target should not be used unless a replace with the
BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID flag is in
progress.  I'm not really here to judge how useful that flag is, but
if you want to remove reading from the target entirely we'dd need to
remove that flag as well and print a warning for it, as it clearly won't
work any more.
Qu Wenruo Feb. 14, 2023, 7:18 a.m. UTC | #3
On 2023/2/14 15:06, Christoph Hellwig wrote:
> On Thu, Feb 09, 2023 at 12:47:01PM +0800, Qu Wenruo wrote:
>> Since the ancient time of btrfs, if a dev-replace is running, we can use
>> that replace target as an extra mirror for read.
> 
> More specifically this seems to go back to:
> 
> Author: Stefan Behrens <sbehrens@giantdisaster.de>
> Date:   Tue Nov 6 15:06:47 2012 +0100
> 
>      Btrfs: allow repair code to include target disk when searching mirrors
> 
>      Make the target disk of a running device replace operation
>      available for reading. This is only used as a last ressort for
>      the defect repair procedure. And it is dependent on the location
>      of the data block to read, because during an ongoing device
>      replace operation, the target drive is only partially filled
>      with the filesystem data.
> 
> and
> 
> commit 30d9861ff9520e2a112eae71029bc9f7e915a441
> Author: Stefan Behrens <sbehrens@giantdisaster.de>
> Date:   Tue Nov 6 14:52:18 2012 +0100
> 
>      Btrfs: optionally avoid reads from device replace source drive
> 
>      It is desirable to be able to configure the device replace
>      procedure to avoid reading the source drive (the one to be
>      copied) whenever possible. This is useful when the number of
>      read errors on this disk is high, because it would delay the
>      copy procedure alot. Therefore there is an option to avoid
>      reading from the source disk unless the repair procedure
>      really needs to access it. The regular read req asks for
>      mapping the block with mirror_num == 0, in this case the
>      source disk is avoided whenever possible. The repair code
>      selects the mirror_num explicitly (mirror_num != 0), this
>      case is not changed by this commit.
> 
> from which I father that the idea is that when a drive is replaced
> due to a high number of read errors, it might be a better idea to
> redirect it to the target device.

To me, avoiding reading from source != read from target.

> 
> The questions is how much does this matter?  NAND storage has a concept
> of read disturb, but we really should not be hitting it in practice.
> 
>> But there are some extra problems with that:
>>
>> - No reliable checks on if that target device is even involved
>>    For profiles like RAID0/RAID10, it's very possible that the replace
>>    is happening on one device which is not involved in the stripe.
>>
>>    E.g. a 4-disks RAID0, involving disk A~D, and disk A is being replaced.
>>    In that case, if our read lands at disk B, there is no extra mirror to
>>    start with.
>>
>> - No indicator on whether the target device even contains correct data
>>    Since dev-replace is running, the target device is progressively
>>    filled with data from the source device.
>>
>>    Thus if our read is beyond the currently replaced range, we may just
>>    read out some garbage.
>>    This can be extremely dangerous if the range doesn't have data
>>    checksum, then we may silently trust the garbage.
> 
> Yes, the way this is done currently seems pretty broken.
> 
> But there was a clear intent behind it, event exposed to userspace using
> the BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID flag.
> 
> So at very least the target should not be used unless a replace with the
> BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID flag is in
> progress.  I'm not really here to judge how useful that flag is, but
> if you want to remove reading from the target entirely we'dd need to
> remove that flag as well and print a warning for it, as it clearly won't
> work any more.
> 

I'm not sure if I missed anything, but doesn't that flag really mean, 
try other mirrors instead?

Thus that flag can still make sense no matter if we count target as an 
extra mirror or not.

Thanks,
Qu
Christoph Hellwig Feb. 15, 2023, 6:39 a.m. UTC | #4
On Tue, Feb 14, 2023 at 03:18:11PM +0800, Qu Wenruo wrote:
> > from which I father that the idea is that when a drive is replaced
> > due to a high number of read errors, it might be a better idea to
> > redirect it to the target device.
> 
> To me, avoiding reading from source != read from target.

Well, it's not identical, but it does severly reduce the redundancy.

But maybe Stefan will find some time to chime in.
Qu Wenruo Feb. 15, 2023, 6:56 a.m. UTC | #5
On 2023/2/15 14:39, Christoph Hellwig wrote:
> On Tue, Feb 14, 2023 at 03:18:11PM +0800, Qu Wenruo wrote:
>>> from which I father that the idea is that when a drive is replaced
>>> due to a high number of read errors, it might be a better idea to
>>> redirect it to the target device.
>>
>> To me, avoiding reading from source != read from target.
> 
> Well, it's not identical, but it does severly reduce the redundancy.

The problem is, the target device is not a reliable mirror.

Its reliability is bound to the progress of the replace.

At the beginning, the reliability is 0, all read from target device is 
garbage.
While at the end of the replace, all read from that target device is 
reliable (the same of the source device).

If you really want to use a single number to describe the reliability, 
I'd say it's 0.5.

But in reality, we treat it more like 1, not 0, not 0.5.

And that unreliability can lead to data corruption cases, e.g., 
NODATASUM read failed on one mirror, and read-repair started using the 
target device.

Meanwhile replace hasn't yet reached that bytenr, thus we're reading 
garbage from target device.
And since NODATASUM, we trust the garbage, thus corrupting the data.

Thanks,
Qu

> 
> But maybe Stefan will find some time to chime in.
Christoph Hellwig Feb. 15, 2023, 6:58 a.m. UTC | #6
On Wed, Feb 15, 2023 at 02:56:39PM +0800, Qu Wenruo wrote:
> Meanwhile replace hasn't yet reached that bytenr, thus we're reading garbage
> from target device.
> And since NODATASUM, we trust the garbage, thus corrupting the data.

Yes, for a read from the target device to be useful, the progress
needs to be tracked, and the read only needs to happen on data
that actually was written.
Qu Wenruo Feb. 15, 2023, 7:01 a.m. UTC | #7
On 2023/2/15 14:58, Christoph Hellwig wrote:
> On Wed, Feb 15, 2023 at 02:56:39PM +0800, Qu Wenruo wrote:
>> Meanwhile replace hasn't yet reached that bytenr, thus we're reading garbage
>> from target device.
>> And since NODATASUM, we trust the garbage, thus corrupting the data.
> 
> Yes, for a read from the target device to be useful, the progress
> needs to be tracked, and the read only needs to happen on data
> that actually was written.

I can still see some comments on this, but can not find the exact 
commits removing the replace progress check...

And if we go that path without a progress check, then the only valid way 
is to distrust the target device completely.

Thanks,
Qu
Stefan Behrens Feb. 15, 2023, 1:58 p.m. UTC | #8
On 2/15/2023 7:58 AM, Christoph Hellwig wrote:
> On Wed, Feb 15, 2023 at 02:56:39PM +0800, Qu Wenruo wrote:
>> Meanwhile replace hasn't yet reached that bytenr, thus we're reading garbage
>> from target device.
>> And since NODATASUM, we trust the garbage, thus corrupting the data.
> 
> Yes, for a read from the target device to be useful, the progress
> needs to be tracked, and the read only needs to happen on data
> that actually was written.

The device replace code maintains (or used to maintain) a concept of a 
cursor.

There's a small area on the target device which is currently written to. 
Left of this area is valid and already written data, which can also be 
read in case it's needed to fix read errors or to avoid access to an 
almost damaged hard drive which tries to reread every bad block 2048 
times (which is 17 seconds at 7200rpm and something you want to avoid). 
Right of the area is data that must not be read because it is garbage 
and uninitialized contents of the new disk.

There are several comments about this concept in volumes.c. And scrub.c 
is the place that keeps the left and right cursor up to date which 
define the area that is already written, currently written to and not 
yet written.
Qu Wenruo Feb. 16, 2023, 12:29 a.m. UTC | #9
On 2023/2/15 21:58, Stefan Behrens wrote:
> On 2/15/2023 7:58 AM, Christoph Hellwig wrote:
>> On Wed, Feb 15, 2023 at 02:56:39PM +0800, Qu Wenruo wrote:
>>> Meanwhile replace hasn't yet reached that bytenr, thus we're reading 
>>> garbage
>>> from target device.
>>> And since NODATASUM, we trust the garbage, thus corrupting the data.
>>
>> Yes, for a read from the target device to be useful, the progress
>> needs to be tracked, and the read only needs to happen on data
>> that actually was written.
> 
> The device replace code maintains (or used to maintain) a concept of a 
> cursor.
> 
> There's a small area on the target device which is currently written to. 
> Left of this area is valid and already written data, which can also be 
> read in case it's needed to fix read errors or to avoid access to an 
> almost damaged hard drive which tries to reread every bad block 2048 
> times (which is 17 seconds at 7200rpm and something you want to avoid). 
> Right of the area is data that must not be read because it is garbage 
> and uninitialized contents of the new disk.

So the main reason for the concept is just to avoid read on the damaged 
device?

Is there any real world data on the behavior?

I don't know which commit removed the cursor, but considering the extra 
work needed to properly handle the cursor and only provide the extra 
mirror on the correct condition, I'm not that convinced it's a total win.

Thus I'm more towards dropping the behavior.

Thanks,
Qu

> 
> There are several comments about this concept in volumes.c. And scrub.c 
> is the place that keeps the left and right cursor up to date which 
> define the area that is already written, currently written to and not 
> yet written.
Qu Wenruo Feb. 17, 2023, 6:08 a.m. UTC | #10
On 2023/2/16 08:29, Qu Wenruo wrote:
> 
> 
> On 2023/2/15 21:58, Stefan Behrens wrote:
>> On 2/15/2023 7:58 AM, Christoph Hellwig wrote:
>>> On Wed, Feb 15, 2023 at 02:56:39PM +0800, Qu Wenruo wrote:
>>>> Meanwhile replace hasn't yet reached that bytenr, thus we're reading 
>>>> garbage
>>>> from target device.
>>>> And since NODATASUM, we trust the garbage, thus corrupting the data.
>>>
>>> Yes, for a read from the target device to be useful, the progress
>>> needs to be tracked, and the read only needs to happen on data
>>> that actually was written.
>>
>> The device replace code maintains (or used to maintain) a concept of a 
>> cursor.
>>
>> There's a small area on the target device which is currently written 
>> to. Left of this area is valid and already written data, which can 
>> also be read in case it's needed to fix read errors or to avoid access 
>> to an almost damaged hard drive which tries to reread every bad block 
>> 2048 times (which is 17 seconds at 7200rpm and something you want to 
>> avoid). Right of the area is data that must not be read because it is 
>> garbage and uninitialized contents of the new disk.
> 
> So the main reason for the concept is just to avoid read on the damaged 
> device?
> 
> Is there any real world data on the behavior?
> 
> I don't know which commit removed the cursor, but considering the extra 
> work needed to properly handle the cursor and only provide the extra 
> mirror on the correct condition, I'm not that convinced it's a total win.

BTW, the current code, find_live_mirror() itself is already following 
the DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID, to avoid the 
source device if needed, no matter if we have a cursor or not.

Thus even we get rid of the target-device-as-extra-mirror behavior, we 
can still avoid reads from the source device (at least for read-repair).

Although the scrub part doesn't seem to care about that at all...

Thanks,
Qu
> 
> Thus I'm more towards dropping the behavior.
> 
> Thanks,
> Qu
> 
>>
>> There are several comments about this concept in volumes.c. And 
>> scrub.c is the place that keeps the left and right cursor up to date 
>> which define the area that is already written, currently written to 
>> and not yet written.
diff mbox series

Patch

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3a2a256fa9cd..385fc89b6702 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5754,12 +5754,6 @@  int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 		ret = map->num_stripes;
 	free_extent_map(em);
 
-	down_read(&fs_info->dev_replace.rwsem);
-	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
-	    fs_info->dev_replace.tgtdev)
-		ret++;
-	up_read(&fs_info->dev_replace.rwsem);
-
 	return ret;
 }
 
@@ -6046,83 +6040,6 @@  struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
 	return ERR_PTR(ret);
 }
 
-/*
- * In dev-replace case, for repair case (that's the only case where the mirror
- * is selected explicitly when calling btrfs_map_block), blocks left of the
- * left cursor can also be read from the target drive.
- *
- * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the
- * array of stripes.
- * For READ, it also needs to be supported using the same mirror number.
- *
- * If the requested block is not left of the left cursor, EIO is returned. This
- * can happen because btrfs_num_copies() returns one more in the dev-replace
- * case.
- */
-static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
-					 u64 logical, u64 length,
-					 u64 srcdev_devid, int *mirror_num,
-					 u64 *physical)
-{
-	struct btrfs_io_context *bioc = NULL;
-	int num_stripes;
-	int index_srcdev = 0;
-	int found = 0;
-	u64 physical_of_found = 0;
-	int i;
-	int ret = 0;
-
-	ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
-				logical, &length, &bioc, NULL, NULL, 0);
-	if (ret) {
-		ASSERT(bioc == NULL);
-		return ret;
-	}
-
-	num_stripes = bioc->num_stripes;
-	if (*mirror_num > num_stripes) {
-		/*
-		 * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
-		 * that means that the requested area is not left of the left
-		 * cursor
-		 */
-		btrfs_put_bioc(bioc);
-		return -EIO;
-	}
-
-	/*
-	 * process the rest of the function using the mirror_num of the source
-	 * drive. Therefore look it up first.  At the end, patch the device
-	 * pointer to the one of the target drive.
-	 */
-	for (i = 0; i < num_stripes; i++) {
-		if (bioc->stripes[i].dev->devid != srcdev_devid)
-			continue;
-
-		/*
-		 * In case of DUP, in order to keep it simple, only add the
-		 * mirror with the lowest physical address
-		 */
-		if (found &&
-		    physical_of_found <= bioc->stripes[i].physical)
-			continue;
-
-		index_srcdev = i;
-		found = 1;
-		physical_of_found = bioc->stripes[i].physical;
-	}
-
-	btrfs_put_bioc(bioc);
-
-	ASSERT(found);
-	if (!found)
-		return -EIO;
-
-	*mirror_num = index_srcdev + 1;
-	*physical = physical_of_found;
-	return ret;
-}
-
 static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
 {
 	struct btrfs_block_group *cache;
@@ -6335,14 +6252,13 @@  int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 	int i;
 	int ret = 0;
 	int mirror_num = (mirror_num_ret ? *mirror_num_ret : 0);
+	int num_copies;
 	int num_stripes;
 	int max_errors = 0;
 	struct btrfs_io_context *bioc = NULL;
 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 	int dev_replace_is_ongoing = 0;
-	int patch_the_first_stripe_for_dev_replace = 0;
 	u16 num_alloc_stripes;
-	u64 physical_to_patch_in_first_stripe = 0;
 	u64 raid56_full_stripe_start = (u64)-1;
 	struct btrfs_io_geometry geom;
 
@@ -6365,6 +6281,8 @@  int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 	raid56_full_stripe_start = geom.raid56_stripe_offset;
 	data_stripes = nr_data_stripes(map);
 
+	num_copies = btrfs_num_copies(fs_info, logical, geom.len);
+
 	down_read(&dev_replace->rwsem);
 	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
 	/*
@@ -6374,19 +6292,8 @@  int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 	if (!dev_replace_is_ongoing)
 		up_read(&dev_replace->rwsem);
 
-	if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
-	    !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
-		ret = get_extra_mirror_from_replace(fs_info, logical, *length,
-						    dev_replace->srcdev->devid,
-						    &mirror_num,
-					    &physical_to_patch_in_first_stripe);
-		if (ret)
-			goto out;
-		else
-			patch_the_first_stripe_for_dev_replace = 1;
-	} else if (mirror_num > map->num_stripes) {
+	if (mirror_num > num_copies)
 		mirror_num = 0;
-	}
 
 	num_stripes = 1;
 	stripe_index = 0;
@@ -6511,15 +6418,9 @@  int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 	    !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) &&
 	    (!need_full_stripe(op) || !dev_replace_is_ongoing ||
 	     !dev_replace->tgtdev)) {
-		if (patch_the_first_stripe_for_dev_replace) {
-			smap->dev = dev_replace->tgtdev;
-			smap->physical = physical_to_patch_in_first_stripe;
-			*mirror_num_ret = map->num_stripes + 1;
-		} else {
-			set_io_stripe(smap, map, stripe_index, stripe_offset,
-				      stripe_nr);
-			*mirror_num_ret = mirror_num;
-		}
+		set_io_stripe(smap, map, stripe_index, stripe_offset,
+			      stripe_nr);
+		*mirror_num_ret = mirror_num;
 		*bioc_ret = NULL;
 		ret = 0;
 		goto out;
@@ -6584,17 +6485,6 @@  int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 	bioc->max_errors = max_errors;
 	bioc->mirror_num = mirror_num;
 
-	/*
-	 * this is the case that REQ_READ && dev_replace_is_ongoing &&
-	 * mirror_num == num_stripes + 1 && dev_replace target drive is
-	 * available as a mirror
-	 */
-	if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
-		WARN_ON(num_stripes > 1);
-		bioc->stripes[0].dev = dev_replace->tgtdev;
-		bioc->stripes[0].physical = physical_to_patch_in_first_stripe;
-		bioc->mirror_num = map->num_stripes + 1;
-	}
 out:
 	if (dev_replace_is_ongoing) {
 		lockdep_assert_held(&dev_replace->rwsem);