Message ID | 20240612021911.11043-2-heming.zhao@suse.com (mailing list archive) |
---|---|
State | Changes Requested |
Series | [1/2] md-cluster: fix hanging issue while a new disk adding |

Context | Check | Description |
---|---|---|
mdraidci/vmtest-md-6_11-PR | success | PR summary |
mdraidci/vmtest-md-6_11-VM_Test-0 | success | Logs for build-kernel |
Hi,

On 2024/06/12 10:19, Heming Zhao wrote:
> The commit db5e653d7c9f ("md: delay choosing sync action to
> md_start_sync()") delays the start of the sync action. In a
> clustered environment, this causes the other node to activate the
> spare disk first and skip recovery. As a result, no node performs
> recovery when a disk is added or re-added.
>
> Before db5e653d7c9f:
>
> ```
> node1                               node2
> ----------------------------------------------------------------
> md_check_recovery
>  + md_update_sb
>  |  sendmsg: METADATA_UPDATED
>  + md_choose_sync_action            process_metadata_update
>  |  remove_and_add_spares           //node1 has not finished adding
>  + call mddev->sync_work            //the spare disk: do nothing
>
> md_start_sync
>  starts md_do_sync
>
> md_do_sync
>  + grabbed resync_lockres:DLM_LOCK_EX
>  + do syncing job
>
> md_check_recovery
>  sendmsg: METADATA_UPDATED
>                                     process_metadata_update
>                                       //activate spare disk
>
> ... ...
>
>                                     md_do_sync
>                                       waiting to grab resync_lockres:EX
> ```
>
> After db5e653d7c9f:
>
> (note: if 'cmd:idle' sets MD_RECOVERY_INTR after md_check_recovery
> starts md_start_sync, the INTR action further delays node1's call
> to md_do_sync.)
>
> ```
> node1                               node2
> ----------------------------------------------------------------
> md_check_recovery
>  + md_update_sb
>  |  sendmsg: METADATA_UPDATED
>  + calls mddev->sync_work           process_metadata_update
>                                     //node1 has not finished adding
>                                     //the spare disk: do nothing
>
> md_start_sync
>  + md_choose_sync_action
>  |  remove_and_add_spares
>  + calls md_do_sync
>                                     md_check_recovery
>                                       md_update_sb
>                                       sendmsg: METADATA_UPDATED
>                                     process_metadata_update
>                                       //activate spare disk
>
> ... ...                             ... ...
>
>                                     md_do_sync
>                                      + grabbed resync_lockres:EX
>                                      + raid1_sync_request skips sync
>                                        under conf->fullsync:0
> md_do_sync
>  1. waiting to grab resync_lockres:EX
>  2. when node1 can grab the EX lock,
>     node1 skips resync under recovery_offset:MaxSector

I have a question here: since md_do_sync() and the second
md_check_recovery() are completely unrelated, isn't the problem
always there, with or without commit db5e653d7c9f?

Thanks,
Kuai

> ```
>
> How to trigger:
>
> ```(commands @node1)
> # to easily watch the recovery status
> echo 2000 > /proc/sys/dev/raid/speed_limit_max
> ssh root@node2 "echo 2000 > /proc/sys/dev/raid/speed_limit_max"
>
> mdadm -CR /dev/md0 -l1 -b clustered -n 2 /dev/sda /dev/sdb --assume-clean
> ssh root@node2 mdadm -A /dev/md0 /dev/sda /dev/sdb
> mdadm --manage /dev/md0 --fail /dev/sda --remove /dev/sda
> mdadm --manage /dev/md0 --add /dev/sdc
>
> === "cat /proc/mdstat" on both nodes: there is no recovery action. ===
> ```
>
> How to fix:
>
> Because it is hard to rework the md layer logic so that the local node
> starts its sync job earlier, add a new cluster message that makes the
> other node wait before activating the spare disk.
>
> Signed-off-by: Heming Zhao <heming.zhao@suse.com>
> Reviewed-by: Su Yue <glass.su@suse.com>
> ---
>  drivers/md/md-cluster.c | 27 +++++++++++++++++++++++++++
>  drivers/md/md-cluster.h |  2 ++
>  drivers/md/md.c         | 17 ++++++++++++++---
>  3 files changed, 43 insertions(+), 3 deletions(-)
On 6/27/24 21:00, Yu Kuai wrote:
> I have a question here: since md_do_sync() and the second
> md_check_recovery() are completely unrelated, isn't the problem
> always there, with or without commit db5e653d7c9f?
>
> Thanks,
> Kuai

I did a bisect. Commit f52f5c71f3d4 ("md: fix stopping sync thread")
is the first commit where the sync fails. However, f52f5c71f3d4 is not
the root cause; it only exacerbates the sync problem.

In a clustered environment, md_check_recovery() sends the
METADATA_UPDATED message several times while the metadata update
completes. METADATA_UPDATED tells the other node that the metadata has
been updated, and the current design relies on the local node setting
the correct metadata values for the other node. With db5e653d7c9f, the
other node handles the metadata first and starts the sync job before
the local node, which triggers the bug.

-Heming
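The ordering Heming describes can be reduced to a few lines of userspace C. The sketch below is illustrative only (the struct and function names are hypothetical and merely echo the kernel paths): node2's METADATA_UPDATED handling and node1's md_do_sync() are independent events, so node2 can mark the spare in-sync before node1 ever begins recovery.

```
/* Hypothetical userspace reduction of the race, not kernel code. */
#include <stdbool.h>
#include <stdio.h>

struct model {
	bool node1_sync_started;    /* node1 reached md_do_sync() */
	bool spare_marked_in_sync;  /* node2 ran remove_and_add_spares() */
};

/* node2's view on METADATA_UPDATED: the superblock already describes the
 * new disk, and nothing tells node2 whether node1 has started recovery. */
static void node2_handle_metadata_updated(struct model *m)
{
	m->spare_marked_in_sync = true;
}

/* node1's later decision: if the disk already looks in-sync, there is
 * nothing left to recover (the recovery_offset:MaxSector case above). */
static bool node1_will_recover(const struct model *m)
{
	return !m->spare_marked_in_sync;
}

int main(void)
{
	struct model m = { 0 };

	/* With db5e653d7c9f, METADATA_UPDATED goes out before the delayed
	 * md_start_sync() work on node1 has run. */
	node2_handle_metadata_updated(&m);
	m.node1_sync_started = true;

	printf("node1 sync started: %d, node1 performs recovery: %s\n",
	       m.node1_sync_started,
	       node1_will_recover(&m) ? "yes" : "no - both nodes skip it");
	return 0;
}
```

Nothing in this sequence forces node1's recovery to run before node2's activation, which matches the outcome of the reproduction steps: no recovery on either node.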
Hello Kuai,

ping...
Hi,

On 2024/07/09 8:34, Heming Zhao wrote:
> Hello Kuai,
>
> ping...

I don't have other suggestions for now; you can send a new version.

Thanks,
Kuai
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 27eaaf9fef94..ab1e3c490279 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -56,6 +56,7 @@ struct resync_info {
 #define MD_CLUSTER_ALREADY_IN_CLUSTER		6
 #define MD_CLUSTER_PENDING_RECV_EVENT		7
 #define MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD	8
+#define MD_CLUSTER_WAITING_FOR_SYNC		9
 
 struct md_cluster_info {
 	struct mddev *mddev; /* the md device which md_cluster_info belongs to */
@@ -91,6 +92,7 @@ struct md_cluster_info {
 	sector_t sync_hi;
 };
 
+/* For compatibility, add the new msg_type at the end. */
 enum msg_type {
 	METADATA_UPDATED = 0,
 	RESYNCING,
@@ -100,6 +102,7 @@ enum msg_type {
 	BITMAP_NEEDS_SYNC,
 	CHANGE_CAPACITY,
 	BITMAP_RESIZE,
+	RESYNCING_START,
 };
 
 struct cluster_msg {
@@ -460,6 +463,7 @@ static void process_suspend_info(struct mddev *mddev,
 		clear_bit(MD_RESYNCING_REMOTE, &mddev->recovery);
 		remove_suspend_info(mddev, slot);
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+		clear_bit(MD_CLUSTER_WAITING_FOR_SYNC, &cinfo->state);
 		md_wakeup_thread(mddev->thread);
 		return;
 	}
@@ -530,6 +534,7 @@ static int process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
 		res = -1;
 	}
 	clear_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state);
+	set_bit(MD_CLUSTER_WAITING_FOR_SYNC, &cinfo->state);
 	return res;
 }
 
@@ -598,6 +603,9 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
 	case CHANGE_CAPACITY:
 		set_capacity_and_notify(mddev->gendisk, mddev->array_sectors);
 		break;
+	case RESYNCING_START:
+		clear_bit(MD_CLUSTER_WAITING_FOR_SYNC, &mddev->cluster_info->state);
+		break;
 	case RESYNCING:
 		set_bit(MD_RESYNCING_REMOTE, &mddev->recovery);
 		process_suspend_info(mddev, le32_to_cpu(msg->slot),
@@ -1353,6 +1361,23 @@ static void resync_info_get(struct mddev *mddev, sector_t *lo, sector_t *hi)
 	spin_unlock_irq(&cinfo->suspend_lock);
 }
 
+static int resync_status_get(struct mddev *mddev)
+{
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+
+	return test_bit(MD_CLUSTER_WAITING_FOR_SYNC, &cinfo->state);
+}
+
+static int resync_start_notify(struct mddev *mddev)
+{
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+	struct cluster_msg cmsg = {0};
+
+	cmsg.type = cpu_to_le32(RESYNCING_START);
+
+	return sendmsg(cinfo, &cmsg, 0);
+}
+
 static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
 {
 	struct md_cluster_info *cinfo = mddev->cluster_info;
@@ -1587,6 +1612,8 @@ static struct md_cluster_operations cluster_ops = {
 	.resync_start = resync_start,
 	.resync_finish = resync_finish,
 	.resync_info_update = resync_info_update,
+	.resync_start_notify = resync_start_notify,
+	.resync_status_get = resync_status_get,
 	.resync_info_get = resync_info_get,
 	.metadata_update_start = metadata_update_start,
 	.metadata_update_finish = metadata_update_finish,
diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h
index a78e3021775d..470bf18ffde5 100644
--- a/drivers/md/md-cluster.h
+++ b/drivers/md/md-cluster.h
@@ -14,6 +14,8 @@ struct md_cluster_operations {
 	int (*leave)(struct mddev *mddev);
 	int (*slot_number)(struct mddev *mddev);
 	int (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi);
+	int (*resync_start_notify)(struct mddev *mddev);
+	int (*resync_status_get)(struct mddev *mddev);
 	void (*resync_info_get)(struct mddev *mddev, sector_t *lo, sector_t *hi);
 	int (*metadata_update_start)(struct mddev *mddev);
 	int (*metadata_update_finish)(struct mddev *mddev);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index e575e74aabf5..97d7e96d1d5e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8902,7 +8902,8 @@ void md_do_sync(struct md_thread *thread)
 	 * This will mean we have to start checking from the beginning again.
 	 *
 	 */
-
+	if (mddev_is_clustered(mddev))
+		md_cluster_ops->resync_start_notify(mddev);
 	do {
 		int mddev2_minor = -1;
 		mddev->curr_resync = MD_RESYNC_DELAYED;
@@ -9963,8 +9964,18 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
 			 */
 			if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE &&
 			    !(le32_to_cpu(sb->feature_map) &
-			      MD_FEATURE_RESHAPE_ACTIVE)) {
-				rdev2->saved_raid_disk = role;
+			      MD_FEATURE_RESHAPE_ACTIVE) &&
+			    !md_cluster_ops->resync_status_get(mddev)) {
+				/*
+				 * -1 to make raid1_add_disk() set conf->fullsync
+				 * to 1. This could avoid skipping sync when the
+				 * remote node is down during resyncing.
+				 */
+				if ((le32_to_cpu(sb->feature_map)
+				    & MD_FEATURE_RECOVERY_OFFSET))
+					rdev2->saved_raid_disk = -1;
+				else
+					rdev2->saved_raid_disk = role;
 				ret = remove_and_add_spares(mddev, rdev2);
 				pr_info("Activated spare: %pg\n",
 					rdev2->bdev);
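To see the resulting handshake in one place, here is a minimal userspace sketch of the flow the patch introduces. It is not kernel code; the struct, the handle_msg() helper and the main() walk-through are illustrative, and only the flag and message names (MD_CLUSTER_WAITING_FOR_SYNC, NEWDISK, METADATA_UPDATED, RESYNCING_START) follow the patch. The node that receives NEWDISK sets the waiting flag, refuses to activate the spare while it is set, and clears it when RESYNCING_START arrives from the node that added the disk.

```
/* Userspace model of the MD_CLUSTER_WAITING_FOR_SYNC gate; illustrative
 * only, not kernel code. */
#include <stdbool.h>
#include <stdio.h>

/* Message types modelled after enum msg_type in md-cluster.c. */
enum msg { NEWDISK, METADATA_UPDATED, RESYNCING_START };

/* State of the node that did NOT add the disk (node2 in the thread). */
struct peer {
	bool waiting_for_sync;  /* models MD_CLUSTER_WAITING_FOR_SYNC */
	bool spare_activated;
};

static void handle_msg(struct peer *p, enum msg m)
{
	switch (m) {
	case NEWDISK:
		/* process_add_new_disk(): the other node began adding a
		 * disk; hold off activating it until that node syncs. */
		p->waiting_for_sync = true;
		break;
	case METADATA_UPDATED:
		/* check_sb_changes(): activation is now gated on
		 * resync_status_get(), i.e. on the waiting flag. */
		if (!p->waiting_for_sync)
			p->spare_activated = true;
		break;
	case RESYNCING_START:
		/* Sent from md_do_sync() on the adding node before it
		 * starts syncing; clears the gate. */
		p->waiting_for_sync = false;
		break;
	}
}

int main(void)
{
	struct peer node2 = { 0 };

	handle_msg(&node2, NEWDISK);
	handle_msg(&node2, METADATA_UPDATED);   /* deferred: gate is set */
	printf("activated before node1 syncs: %d\n", node2.spare_activated);

	handle_msg(&node2, RESYNCING_START);    /* node1 entered md_do_sync() */
	handle_msg(&node2, METADATA_UPDATED);   /* now allowed */
	printf("activated after RESYNCING_START: %d\n", node2.spare_activated);
	return 0;
}
```

Built with a plain cc invocation, the first printf reports 0 and the second 1, matching the intended behaviour: the spare is only activated once the adding node has entered md_do_sync().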