diff mbox series

[-next] md/raid1: fix data corruption for degraded array with slow disk

Message ID 20240803091137.3197008-1-yukuai1@huaweicloud.com (mailing list archive)
State Accepted
Headers show
Series [-next] md/raid1: fix data corruption for degraded array with slow disk | expand

Checks

Context Check Description
mdraidci/vmtest-md-6_11-PR success PR summary
mdraidci/vmtest-md-6_11-VM_Test-0 success Logs for build-kernel

Commit Message

Yu Kuai Aug. 3, 2024, 9:11 a.m. UTC
From: Yu Kuai <yukuai3@huawei.com>

read_balance() will avoid reading from slow disks as much as possible,
however, if valid data only lands in slow disks, and a new normal disk
is still in recovery, unrecovered data can be read:

raid1_read_request
 read_balance
  raid1_should_read_first
  -> return false
  choose_best_rdev
  -> normal disk is not recovered, return -1
  choose_bb_rdev
  -> missing the checking of recovery, return the normal disk
 -> read unrecovered data

Root cause is that the checking of recovery is missing in
choose_bb_rdev(). Hence add such checking to fix the problem.

Also fix similar problem in choose_slow_rdev().

Fixes: 9f3ced792203 ("md/raid1: factor out choose_bb_rdev() from read_balance()")
Fixes: dfa8ecd167c1 ("md/raid1: factor out choose_slow_rdev() from read_balance()")
Reported-and-tested-by: Mateusz Jończyk <mat.jonczyk@o2.pl>
Closes: https://lore.kernel.org/all/9952f532-2554-44bf-b906-4880b2e88e3a@o2.pl/
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 drivers/md/raid1.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

Comments

Song Liu Aug. 15, 2024, 8:41 p.m. UTC | #1
On Sat, Aug 3, 2024 at 2:15 AM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>
> From: Yu Kuai <yukuai3@huawei.com>
>
> read_balance() will avoid reading from slow disks as much as possible,
> however, if valid data only lands in slow disks, and a new normal disk
> is still in recovery, unrecovered data can be read:
>
> raid1_read_request
>  read_balance
>   raid1_should_read_first
>   -> return false
>   choose_best_rdev
>   -> normal disk is not recovered, return -1
>   choose_bb_rdev
>   -> missing the checking of recovery, return the normal disk
>  -> read unrecovered data
>
> Root cause is that the checking of recovery is missing in
> choose_bb_rdev(). Hence add such checking to fix the problem.
>
> Also fix similar problem in choose_slow_rdev().
>
> Fixes: 9f3ced792203 ("md/raid1: factor out choose_bb_rdev() from read_balance()")
> Fixes: dfa8ecd167c1 ("md/raid1: factor out choose_slow_rdev() from read_balance()")
> Reported-and-tested-by: Mateusz Jończyk <mat.jonczyk@o2.pl>
> Closes: https://lore.kernel.org/all/9952f532-2554-44bf-b906-4880b2e88e3a@o2.pl/
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>

Applied to md-6.11. Thanks for the fix!

Song
diff mbox series

Patch

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 531ddfc6efbd..1a45003ea7ee 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -617,6 +617,12 @@  static int choose_first_rdev(struct r1conf *conf, struct r1bio *r1_bio,
 	return -1;
 }
 
+static bool rdev_in_recovery(struct md_rdev *rdev, struct r1bio *r1_bio)
+{
+	return !test_bit(In_sync, &rdev->flags) &&
+	       rdev->recovery_offset < r1_bio->sector + r1_bio->sectors;
+}
+
 static int choose_bb_rdev(struct r1conf *conf, struct r1bio *r1_bio,
 			  int *max_sectors)
 {
@@ -635,6 +641,7 @@  static int choose_bb_rdev(struct r1conf *conf, struct r1bio *r1_bio,
 
 		rdev = conf->mirrors[disk].rdev;
 		if (!rdev || test_bit(Faulty, &rdev->flags) ||
+		    rdev_in_recovery(rdev, r1_bio) ||
 		    test_bit(WriteMostly, &rdev->flags))
 			continue;
 
@@ -673,7 +680,8 @@  static int choose_slow_rdev(struct r1conf *conf, struct r1bio *r1_bio,
 
 		rdev = conf->mirrors[disk].rdev;
 		if (!rdev || test_bit(Faulty, &rdev->flags) ||
-		    !test_bit(WriteMostly, &rdev->flags))
+		    !test_bit(WriteMostly, &rdev->flags) ||
+		    rdev_in_recovery(rdev, r1_bio))
 			continue;
 
 		/* there are no bad blocks, we can use this disk */
@@ -733,9 +741,7 @@  static bool rdev_readable(struct md_rdev *rdev, struct r1bio *r1_bio)
 	if (!rdev || test_bit(Faulty, &rdev->flags))
 		return false;
 
-	/* still in recovery */
-	if (!test_bit(In_sync, &rdev->flags) &&
-	    rdev->recovery_offset < r1_bio->sector + r1_bio->sectors)
+	if (rdev_in_recovery(rdev, r1_bio))
 		return false;
 
 	/* don't read from slow disk unless have to */