diff mbox series

[2/2] md/raid10: handle replacement devices in fix_recovery_read_error

Message ID 20230627034127.4000994-3-linan666@huaweicloud.com (mailing list archive)
State New, archived
Headers show
Series md/raid10: handle replacement if recovery read error | expand

Commit Message

Li Nan June 27, 2023, 3:41 a.m. UTC
From: Li Nan <linan122@huawei.com>

fix_recovery_read_error() does not handle replacement devices. Add that
handling: if the I/O error comes from the replacement, mark that device
as faulty directly; if it comes from another device, just set bad blocks
on the replacement.

Signed-off-by: Li Nan <linan122@huawei.com>
---
 drivers/md/raid10.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

Comments

Song Liu July 7, 2023, 8:33 a.m. UTC | #1
On Tue, Jun 27, 2023 at 11:42 AM <linan666@huaweicloud.com> wrote:
>
> From: Li Nan <linan122@huawei.com>
>
> In fix_recovery_read_error(), the handling of replacement devices is
> missing. Add it. If io error is from replacement, error this device
> directly. If io error is from other device, just set badblocks for
> replacement.
>
> Signed-off-by: Li Nan <linan122@huawei.com>
> ---
>  drivers/md/raid10.c | 10 ++++++++--
>  1 file changed, 8 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index 5105273f60e9..6d9025089455 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -2551,7 +2551,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
>
>         while (sectors) {
>                 int s = sectors;
> -               struct md_rdev *rdev;
> +               struct md_rdev *rdev, *repl;
>                 sector_t addr;
>                 int ok;
>
> @@ -2559,6 +2559,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
>                         s = PAGE_SIZE >> 9;
>
>                 rdev = conf->mirrors[dr].rdev;
> +               repl = conf->mirrors[dw].replacement;
>                 addr = r10_bio->devs[0].addr + sect,
>                 ok = sync_page_io(rdev,
>                                   addr,
> @@ -2580,6 +2581,9 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
>                                         set_bit(MD_RECOVERY_NEEDED,
>                                                 &rdev->mddev->recovery);
>                         }
> +                       if (repl && !sync_page_io(repl, addr, s << 9,
> +                           pages[idx], REQ_OP_WRITE, false))
> +                               md_error(mddev, repl);
>                 }
>                 if (!ok) {
>                         /* We don't worry if we cannot set a bad block -
> @@ -2592,7 +2596,9 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
>                                 /* need bad block on destination too */
>                                 rdev = conf->mirrors[dw].rdev;
>                                 addr = r10_bio->devs[1].addr + sect;
> -                               if (!rdev_set_badblocks(rdev, addr, s, 0)) {
> +                               if (!rdev_set_badblocks(rdev, addr, s, 0) ||
> +                                   (repl &&
> +                                   !rdev_set_badblocks(repl, addr, s, 0))) {

Do we really want this in the if () statement? Shall we always set
badblock on both rdev and repl?

Thanks,
Song

>                                         /* just abort the recovery */
>                                         pr_notice("md/raid10:%s: recovery aborted due to read error\n",
>                                                   mdname(mddev));
> --
> 2.39.2
>
Yu Kuai July 10, 2023, 1 p.m. UTC | #2
Hi,

在 2023/07/07 16:33, Song Liu 写道:
> On Tue, Jun 27, 2023 at 11:42 AM <linan666@huaweicloud.com> wrote:
>>
>> From: Li Nan <linan122@huawei.com>
>>
>> In fix_recovery_read_error(), the handling of replacement devices is
>> missing. Add it. If io error is from replacement, error this device
>> directly. If io error is from other device, just set badblocks for
>> replacement.
>>
>> Signed-off-by: Li Nan <linan122@huawei.com>
>> ---
>>   drivers/md/raid10.c | 10 ++++++++--
>>   1 file changed, 8 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
>> index 5105273f60e9..6d9025089455 100644
>> --- a/drivers/md/raid10.c
>> +++ b/drivers/md/raid10.c
>> @@ -2551,7 +2551,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
>>
>>          while (sectors) {
>>                  int s = sectors;
>> -               struct md_rdev *rdev;
>> +               struct md_rdev *rdev, *repl;
>>                  sector_t addr;
>>                  int ok;
>>
>> @@ -2559,6 +2559,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
>>                          s = PAGE_SIZE >> 9;
>>
>>                  rdev = conf->mirrors[dr].rdev;
>> +               repl = conf->mirrors[dw].replacement;
>>                  addr = r10_bio->devs[0].addr + sect,
>>                  ok = sync_page_io(rdev,
>>                                    addr,
>> @@ -2580,6 +2581,9 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
>>                                          set_bit(MD_RECOVERY_NEEDED,
>>                                                  &rdev->mddev->recovery);
>>                          }
>> +                       if (repl && !sync_page_io(repl, addr, s << 9,
>> +                           pages[idx], REQ_OP_WRITE, false))
>> +                               md_error(mddev, repl);
>>                  }
>>                  if (!ok) {
>>                          /* We don't worry if we cannot set a bad block -
>> @@ -2592,7 +2596,9 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
>>                                  /* need bad block on destination too */
>>                                  rdev = conf->mirrors[dw].rdev;
>>                                  addr = r10_bio->devs[1].addr + sect;
>> -                               if (!rdev_set_badblocks(rdev, addr, s, 0)) {
>> +                               if (!rdev_set_badblocks(rdev, addr, s, 0) ||
>> +                                   (repl &&
>> +                                   !rdev_set_badblocks(repl, addr, s, 0))) {
> 
> Do we really want this in the if () statement? Shall we always set
> badblock on both rdev and repl?

I think it is wrong to set repl badblocks inside this condition, because
if setting badblocks for rdev fails, the '||' short-circuits and repl is
never handled.

By the way, I think it's better to at least try to read from all
possible copies before setting badblocks for repl.

Thanks,
Kuai
> 
> Thanks,
> Song
> 
>>                                          /* just abort the recovery */
>>                                          pr_notice("md/raid10:%s: recovery aborted due to read error\n",
>>                                                    mdname(mddev));
>> --
>> 2.39.2
>>
> .
>
diff mbox series

Patch

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 5105273f60e9..6d9025089455 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2551,7 +2551,7 @@  static void fix_recovery_read_error(struct r10bio *r10_bio)
 
 	while (sectors) {
 		int s = sectors;
-		struct md_rdev *rdev;
+		struct md_rdev *rdev, *repl;
 		sector_t addr;
 		int ok;
 
@@ -2559,6 +2559,7 @@  static void fix_recovery_read_error(struct r10bio *r10_bio)
 			s = PAGE_SIZE >> 9;
 
 		rdev = conf->mirrors[dr].rdev;
+		repl = conf->mirrors[dw].replacement;
 		addr = r10_bio->devs[0].addr + sect,
 		ok = sync_page_io(rdev,
 				  addr,
@@ -2580,6 +2581,9 @@  static void fix_recovery_read_error(struct r10bio *r10_bio)
 					set_bit(MD_RECOVERY_NEEDED,
 						&rdev->mddev->recovery);
 			}
+			if (repl && !sync_page_io(repl, addr, s << 9,
+			    pages[idx], REQ_OP_WRITE, false))
+				md_error(mddev, repl);
 		}
 		if (!ok) {
 			/* We don't worry if we cannot set a bad block -
@@ -2592,7 +2596,9 @@  static void fix_recovery_read_error(struct r10bio *r10_bio)
 				/* need bad block on destination too */
 				rdev = conf->mirrors[dw].rdev;
 				addr = r10_bio->devs[1].addr + sect;
-				if (!rdev_set_badblocks(rdev, addr, s, 0)) {
+				if (!rdev_set_badblocks(rdev, addr, s, 0) ||
+				    (repl &&
+				    !rdev_set_badblocks(repl, addr, s, 0))) {
 					/* just abort the recovery */
 					pr_notice("md/raid10:%s: recovery aborted due to read error\n",
 						  mdname(mddev));