diff mbox series

[v3,3/3] md/raid5: check for overlapping bad blocks before starting reshape

Message ID 20250224090209.2077-3-dougvj@dougvj.net (mailing list archive)
State New
Headers show
Series [v3,1/3] md/raid5: freeze reshape when encountering a bad read | expand

Checks

Context Check Description
mdraidci/vmtest-md-6_14-PR success PR summary
mdraidci/vmtest-md-6_14-VM_Test-0 success Logs for per-patch-testing

Commit Message

Doug V Johnson Feb. 24, 2025, 9:02 a.m. UTC
In addition to halting a reshape in progress when we encounter bad
blocks, we want to make sure that we do not even attempt a reshape if we
know beforehand that there are too many overlapping bad blocks and we
would have to stall the reshape.

To do this, we add a new internal function array_has_badblock() which
first checks to see if there are enough drives with bad blocks for the
condition to occur and, if there are, proceeds to do a simple O(n^2) check
for overlapping bad blocks. If more overlaps are found than can be
corrected for, we return 1 for the presence of bad blocks, otherwise 0.

This function is invoked in raid5_start_reshape() and if there are bad
blocks present, returns -EIO which is reported to userspace.

It's possible for bad blocks to be discovered or put in the metadata
after a reshape has started, so we want to leave in place the
functionality to detect and halt a reshape.

Signed-off-by: Doug V Johnson <dougvj@dougvj.net>
---
 drivers/md/raid5.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 94 insertions(+)

Comments

Yu Kuai March 5, 2025, 6:36 a.m. UTC | #1
Hi,

在 2025/02/24 17:02, Doug V Johnson 写道:
> In addition to halting a reshape in progress when we encounter bad
> blocks, we want to make sure that we do not even attempt a reshape if we
> know before hand that there are too many overlapping bad blocks and we
> would have to stall the reshape.
> 
> To do this, we add a new internal function array_has_badblock() which
> first checks to see if there are enough drives with bad blocks for the
> condition to occur and if there are proceeds to do a simple O(n^2) check
> for overlapping bad blocks. If more overlaps are found than can be
> corrected for, we return 1 for the presence of bad blocks, otherwise 0
> 
> This function is invoked in raid5_start_reshape() and if there are bad
> blocks present, returns -EIO which is reported to userspace.
> 
> It's possible for bad blocks to be discovered or put in the metadata
> after a reshape has started, so we want to leave in place the
> functionality to detect and halt a reshape.
> 
> Signed-off-by: Doug V Johnson <dougvj@dougvj.net>
> ---
>   drivers/md/raid5.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 94 insertions(+)
> 
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index 8b23109d6f37..4b907a674dd1 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -8451,6 +8451,94 @@ static int check_reshape(struct mddev *mddev)
>   				     + mddev->delta_disks));
>   }
>   
> +static int array_has_badblock(struct r5conf *conf)
> +{
> +	/* Searches for overlapping bad blocks on devices that would result
> +	 * in an unreadable condition
> +	 */
> +	int i, j;
> +	/* First see if we even have bad blocks on enough drives to have a
> +	 * bad read condition
> +	 */
> +	int num_badblock_devs = 0;
> +
> +	for (i = 0; i < conf->raid_disks; i++) {
> +		if (rdev_has_badblock(conf->disks[i].rdev,
> +				      0, conf->disks[i].rdev->sectors))
		if (rdev->badblocks.count)

> +			num_badblock_devs++;
> +	}
> +	if (num_badblock_devs <= conf->max_degraded) {
> +		/* There are not enough devices with bad blocks to pose any
> +		 * read problem
> +		 */
> +		return 0;
> +	}
> +	pr_debug("%s: running overlapping bad block check",
> +		 mdname(conf->mddev));
> +	/* Do a more sophisticated check for overlapping regions */
> +	for (i = 0; i < conf->raid_disks; i++) {
> +		sector_t first_bad;
> +		int bad_sectors;
> +		sector_t next_check_s = 0;
> +		int next_check_sectors = conf->disks[i].rdev->sectors;
> +
> +		pr_debug("%s: badblock check: %i (s: %lu, sec: %i)",
> +			 mdname(conf->mddev), i,
> +			 (unsigned long)next_check_s, next_check_sectors);
> +		while (is_badblock(conf->disks[i].rdev,
> +				   next_check_s, next_check_sectors,
> +				   &first_bad,
> +				   &bad_sectors) != 0) {
> +			/* Align bad blocks to the size of our stripe */
> +			sector_t aligned_first_bad = first_bad &
> +				~((sector_t)RAID5_STRIPE_SECTORS(conf) - 1);
> +			int aligned_bad_sectors =
> +				max_t(int, RAID5_STRIPE_SECTORS(conf),
> +				      bad_sectors);
> +			int this_num_bad = 1;
For example, if first_bad is 0, bad_sectors is 512 in rdev0

> +
> +			pr_debug("%s: found blocks %i %lu -> %i",
> +				 mdname(conf->mddev), i,
> +				 (unsigned long)aligned_first_bad,
> +				 aligned_bad_sectors);
> +			for (j = 0; j < conf->raid_disks; j++) {
> +				sector_t this_first_bad;
> +				int this_bad_sectors;
> +
> +				if (j == i)
> +					continue;
> +				if (is_badblock(conf->disks[j].rdev,
> +						aligned_first_bad,
> +						aligned_bad_sectors,
> +						&this_first_bad,
> +						&this_bad_sectors)) {
And rdev1 has badblocks 0+256, rdev2 has badblocks 256+256.

If this array is a raid6 with max_degraded=2, then it's fine.

Perhaps a pseudocode loop like following?

  sector_t offset = 0;
  while (offset < dev_sectors) {
          len = dev_sectors - offset;
          bad_disks = 0;
          for (i = 0; i < conf->raid_disks; ++i) {
                  if (is_badblock(rdev, offset, len, &first_bad,
                                  &bad_sectors)) {
                          if (first_bad <= offset) {
                                  len = min(len, first_bad + bad_sectors -
                                            offset);
                                  bad_disks++;
                          } else {
                                  len = min(len, first_bad - offset);
                          }
                  }
          }

          if (bad_disks > max_degraded)
                  return false;

          offset += len;
  }

  return true;

Thanks,
Kuai

> +					this_num_bad++;
> +					pr_debug("md/raid:%s: bad block overlap dev %i: %lu %i",
> +						 mdname(conf->mddev), j,
> +						 (unsigned long)this_first_bad,
> +						 this_bad_sectors);
> +				}
> +			}
> +			if (this_num_bad > conf->max_degraded) {
> +				pr_debug("md/raid:%s: %i drives with unreadable sector(s) around %lu %i due to bad block list",
> +					 mdname(conf->mddev),
> +					 this_num_bad,
> +					 (unsigned long)first_bad,
> +					 bad_sectors);
> +				return 1;
> +			}
> +			next_check_s = first_bad + bad_sectors;
> +			next_check_sectors =
> +				next_check_sectors - (first_bad + bad_sectors);
> +			pr_debug("%s: badblock check: %i (s: %lu, sec: %i)",
> +				 mdname(conf->mddev), i,
> +				 (unsigned long)next_check_s,
> +				 next_check_sectors);
> +		}
> +	}
> +	return 0;
> +}
> +
>   static int raid5_start_reshape(struct mddev *mddev)
>   {
>   	struct r5conf *conf = mddev->private;
> @@ -8498,6 +8586,12 @@ static int raid5_start_reshape(struct mddev *mddev)
>   		return -EINVAL;
>   	}
>   
> +	if (array_has_badblock(conf)) {
> +		pr_warn("md/raid:%s: reshape not possible due to bad block list",
> +			mdname(mddev));
> +		return -EIO;
> +	}
> +
>   	atomic_set(&conf->reshape_stripes, 0);
>   	spin_lock_irq(&conf->device_lock);
>   	write_seqcount_begin(&conf->gen_lock);
>
diff mbox series

Patch

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 8b23109d6f37..4b907a674dd1 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -8451,6 +8451,94 @@  static int check_reshape(struct mddev *mddev)
 				     + mddev->delta_disks));
 }
 
+/* Count the members that cannot supply the stripe starting at @sector:
+ * a missing or Faulty member, or one with a bad block inside the stripe.
+ */
+static int stripe_unreadable_devs(struct r5conf *conf, sector_t sector)
+{
+	int i, count = 0;
+
+	for (i = 0; i < conf->raid_disks; i++) {
+		struct md_rdev *rdev = conf->disks[i].rdev;
+
+		if (!rdev || test_bit(Faulty, &rdev->flags) ||
+		    rdev_has_badblock(rdev, sector, RAID5_STRIPE_SECTORS(conf)))
+			count++;
+	}
+	return count;
+}
+
+/* Check the bad block lists for a stripe that the reshape could not read.
+ *
+ * A stripe is unreadable once more than conf->max_degraded members cannot
+ * supply it.  Bad blocks on different members only add up when they fall
+ * into the same stripe, so every stripe touched by a bad range is counted
+ * on its own rather than counting members with a bad block anywhere near
+ * the range.
+ *
+ * Returns 1 if such a stripe exists (the reshape would stall), else 0.
+ */
+static int array_has_badblock(struct r5conf *conf)
+{
+	const sector_t mask = (sector_t)RAID5_STRIPE_SECTORS(conf) - 1;
+	int affected = 0;
+	int i;
+
+	/* Cheap pre-check: with at most max_degraded affected members no
+	 * stripe can be lost, so the per-stripe scan can be skipped.
+	 */
+	for (i = 0; i < conf->raid_disks; i++) {
+		struct md_rdev *rdev = conf->disks[i].rdev;
+
+		if (!rdev || test_bit(Faulty, &rdev->flags) ||
+		    rdev->badblocks.count)
+			affected++;
+	}
+	if (affected <= conf->max_degraded)
+		return 0;
+
+	pr_debug("md/raid:%s: running overlapping bad block check\n",
+		 mdname(conf->mddev));
+	for (i = 0; i < conf->raid_disks; i++) {
+		struct md_rdev *rdev = conf->disks[i].rdev;
+		sector_t pos = 0;
+
+		/* Nothing to scan; such members are counted per stripe */
+		if (!rdev || test_bit(Faulty, &rdev->flags))
+			continue;
+
+		while (pos < rdev->sectors) {
+			/* is_badblock() takes an int length, so a large
+			 * device has to be scanned in chunks.
+			 */
+			int len = min_t(sector_t, rdev->sectors - pos, INT_MAX);
+			sector_t first_bad, s, end;
+			int bad_sectors;
+
+			if (!is_badblock(rdev, pos, len, &first_bad,
+					 &bad_sectors)) {
+				pos += len;
+				continue;
+			}
+			/* Widen the bad range to whole stripes */
+			end = (first_bad + bad_sectors + mask) & ~mask;
+			for (s = first_bad & ~mask; s < end; s += mask + 1) {
+				int bad = stripe_unreadable_devs(conf, s);
+
+				if (bad <= conf->max_degraded)
+					continue;
+				pr_debug("md/raid:%s: %i devices unreadable at sector %llu due to bad block list\n",
+					 mdname(conf->mddev), bad,
+					 (unsigned long long)s);
+				return 1;
+			}
+			/* Resume right behind the range just examined */
+			pos = first_bad + bad_sectors;
+		}
+	}
+	return 0;
+}
+
 static int raid5_start_reshape(struct mddev *mddev)
 {
 	struct r5conf *conf = mddev->private;
@@ -8498,6 +8586,12 @@  static int raid5_start_reshape(struct mddev *mddev)
 		return -EINVAL;
 	}
 
+	if (array_has_badblock(conf)) {
+		pr_warn("md/raid:%s: reshape not possible due to bad block list",
+			mdname(mddev));
+		return -EIO;
+	}
+
 	atomic_set(&conf->reshape_stripes, 0);
 	spin_lock_irq(&conf->device_lock);
 	write_seqcount_begin(&conf->gen_lock);