Message ID | 20240709104120.22243-2-heming.zhao@suse.com (mailing list archive) |
---|---|
State | Accepted |
Headers | show |
Series | [v2,1/2] md-cluster: fix hanging issue while a new disk adding | expand |
Context | Check | Description |
---|---|---|
mdraidci/vmtest-md-6_11-VM_Test-1 | success | Logs for build-kernel |
mdraidci/vmtest-md-6_11-PR | success | PR summary |
mdraidci/vmtest-md-6_11-VM_Test-0 | success | Logs for build-kernel |
在 2024/07/09 18:41, Heming Zhao 写道: > The commit db5e653d7c9f ("md: delay choosing sync action to > md_start_sync()") delays the start of the sync action. In a > clustered environment, this will cause another node to first > activate the spare disk and skip recovery. As a result, no > nodes will perform recovery when a disk is added or re-added. > > Before db5e653d7c9f: > > ``` > node1 node2 > ---------------------------------------------------------------- > md_check_recovery > + md_update_sb > | sendmsg: METADATA_UPDATED > + md_choose_sync_action process_metadata_update > | remove_and_add_spares //node1 has not finished adding > + call mddev->sync_work //the spare disk:do nothing > > md_start_sync > starts md_do_sync > > md_do_sync > + grabbed resync_lockres:DLM_LOCK_EX > + do syncing job > > md_check_recovery > sendmsg: METADATA_UPDATED > process_metadata_update > //activate spare disk > > ... ... > > md_do_sync > waiting to grab resync_lockres:EX > ``` > > After db5e653d7c9f: > > (note: if 'cmd:idle' sets MD_RECOVERY_INTR after md_check_recovery > starts md_start_sync, setting the INTR action will exacerbate the > delay in node1 calling the md_do_sync function.) > > ``` > node1 node2 > ---------------------------------------------------------------- > md_check_recovery > + md_update_sb > | sendmsg: METADATA_UPDATED > + calls mddev->sync_work process_metadata_update > //node1 has not finished adding > //the spare disk:do nothing > > md_start_sync > + md_choose_sync_action > | remove_and_add_spares > + calls md_do_sync > > md_check_recovery > md_update_sb > sendmsg: METADATA_UPDATED > process_metadata_update > //activate spare disk > > ... ... ... ... > > md_do_sync > + grabbed resync_lockres:EX > + raid1_sync_request skip sync under > conf->fullsync:0 > md_do_sync > 1. waiting to grab resync_lockres:EX > 2. when node1 could grab EX lock, > node1 will skip resync under recovery_offset:MaxSector > ``` > > How to trigger: > > ```(commands @node1) > # to easily watch the recovery status > echo 2000 > /proc/sys/dev/raid/speed_limit_max > ssh root@node2 "echo 2000 > /proc/sys/dev/raid/speed_limit_max" > > mdadm -CR /dev/md0 -l1 -b clustered -n 2 /dev/sda /dev/sdb --assume-clean > ssh root@node2 mdadm -A /dev/md0 /dev/sda /dev/sdb > mdadm --manage /dev/md0 --fail /dev/sda --remove /dev/sda > mdadm --manage /dev/md0 --add /dev/sdc > > === "cat /proc/mdstat" on both node, there are no recovery action. === > ``` > > How to fix: > > because md layer code logic is hard to restore for speeding up sync job > on local node, we add new cluster msg to pending the another node to > active disk. Acked-by: Yu Kuai <yukuai3@huawei.com> > > Signed-off-by: Heming Zhao <heming.zhao@suse.com> > Reviewed-by: Su Yue <glass.su@suse.com> > --- > v1 -> v2: no change in this patch > --- > drivers/md/md-cluster.c | 27 +++++++++++++++++++++++++++ > drivers/md/md-cluster.h | 2 ++ > drivers/md/md.c | 17 ++++++++++++++--- > 3 files changed, 43 insertions(+), 3 deletions(-) > > diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c > index b5a802ae17bb..bf6a0dd8dac7 100644 > --- a/drivers/md/md-cluster.c > +++ b/drivers/md/md-cluster.c > @@ -57,6 +57,7 @@ struct resync_info { > #define MD_CLUSTER_ALREADY_IN_CLUSTER 6 > #define MD_CLUSTER_PENDING_RECV_EVENT 7 > #define MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD 8 > +#define MD_CLUSTER_WAITING_FOR_SYNC 9 > > struct md_cluster_info { > struct mddev *mddev; /* the md device which md_cluster_info belongs to */ > @@ -92,6 +93,7 @@ struct md_cluster_info { > sector_t sync_hi; > }; > > +/* For compatibility, add the new msg_type at the end. */ > enum msg_type { > METADATA_UPDATED = 0, > RESYNCING, > @@ -101,6 +103,7 @@ enum msg_type { > BITMAP_NEEDS_SYNC, > CHANGE_CAPACITY, > BITMAP_RESIZE, > + RESYNCING_START, > }; > > struct cluster_msg { > @@ -461,6 +464,7 @@ static void process_suspend_info(struct mddev *mddev, > clear_bit(MD_RESYNCING_REMOTE, &mddev->recovery); > remove_suspend_info(mddev, slot); > set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); > + clear_bit(MD_CLUSTER_WAITING_FOR_SYNC, &cinfo->state); > md_wakeup_thread(mddev->thread); > return; > } > @@ -531,6 +535,7 @@ static int process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) > res = -1; > } > clear_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state); > + set_bit(MD_CLUSTER_WAITING_FOR_SYNC, &cinfo->state); > return res; > } > > @@ -599,6 +604,9 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) > case CHANGE_CAPACITY: > set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); > break; > + case RESYNCING_START: > + clear_bit(MD_CLUSTER_WAITING_FOR_SYNC, &mddev->cluster_info->state); > + break; > case RESYNCING: > set_bit(MD_RESYNCING_REMOTE, &mddev->recovery); > process_suspend_info(mddev, le32_to_cpu(msg->slot), > @@ -1345,6 +1353,23 @@ static void resync_info_get(struct mddev *mddev, sector_t *lo, sector_t *hi) > spin_unlock_irq(&cinfo->suspend_lock); > } > > +static int resync_status_get(struct mddev *mddev) > +{ > + struct md_cluster_info *cinfo = mddev->cluster_info; > + > + return test_bit(MD_CLUSTER_WAITING_FOR_SYNC, &cinfo->state); > +} > + > +static int resync_start_notify(struct mddev *mddev) > +{ > + struct md_cluster_info *cinfo = mddev->cluster_info; > + struct cluster_msg cmsg = {0}; > + > + cmsg.type = cpu_to_le32(RESYNCING_START); > + > + return sendmsg(cinfo, &cmsg, 0); > +} > + > static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi) > { > struct md_cluster_info *cinfo = mddev->cluster_info; > @@ -1579,6 +1604,8 @@ static struct md_cluster_operations cluster_ops = { > .resync_start = resync_start, > .resync_finish = resync_finish, > .resync_info_update = resync_info_update, > + .resync_start_notify = resync_start_notify, > + .resync_status_get = resync_status_get, > .resync_info_get = resync_info_get, > .metadata_update_start = metadata_update_start, > .metadata_update_finish = metadata_update_finish, > diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h > index a78e3021775d..470bf18ffde5 100644 > --- a/drivers/md/md-cluster.h > +++ b/drivers/md/md-cluster.h > @@ -14,6 +14,8 @@ struct md_cluster_operations { > int (*leave)(struct mddev *mddev); > int (*slot_number)(struct mddev *mddev); > int (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi); > + int (*resync_start_notify)(struct mddev *mddev); > + int (*resync_status_get)(struct mddev *mddev); > void (*resync_info_get)(struct mddev *mddev, sector_t *lo, sector_t *hi); > int (*metadata_update_start)(struct mddev *mddev); > int (*metadata_update_finish)(struct mddev *mddev); > diff --git a/drivers/md/md.c b/drivers/md/md.c > index aff9118ff697..e393df55fc8b 100644 > --- a/drivers/md/md.c > +++ b/drivers/md/md.c > @@ -8907,7 +8907,8 @@ void md_do_sync(struct md_thread *thread) > * This will mean we have to start checking from the beginning again. > * > */ > - > + if (mddev_is_clustered(mddev)) > + md_cluster_ops->resync_start_notify(mddev); > do { > int mddev2_minor = -1; > mddev->curr_resync = MD_RESYNC_DELAYED; > @@ -9968,8 +9969,18 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) > */ > if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE && > !(le32_to_cpu(sb->feature_map) & > - MD_FEATURE_RESHAPE_ACTIVE)) { > - rdev2->saved_raid_disk = role; > + MD_FEATURE_RESHAPE_ACTIVE) && > + !md_cluster_ops->resync_status_get(mddev)) { > + /* > + * -1 to make raid1_add_disk() set conf->fullsync > + * to 1. This could avoid skipping sync when the > + * remote node is down during resyncing. > + */ > + if ((le32_to_cpu(sb->feature_map) > + & MD_FEATURE_RECOVERY_OFFSET)) > + rdev2->saved_raid_disk = -1; > + else > + rdev2->saved_raid_disk = role; > ret = remove_and_add_spares(mddev, rdev2); > pr_info("Activated spare: %pg\n", > rdev2->bdev); >
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index b5a802ae17bb..bf6a0dd8dac7 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -57,6 +57,7 @@ struct resync_info { #define MD_CLUSTER_ALREADY_IN_CLUSTER 6 #define MD_CLUSTER_PENDING_RECV_EVENT 7 #define MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD 8 +#define MD_CLUSTER_WAITING_FOR_SYNC 9 struct md_cluster_info { struct mddev *mddev; /* the md device which md_cluster_info belongs to */ @@ -92,6 +93,7 @@ struct md_cluster_info { sector_t sync_hi; }; +/* For compatibility, add the new msg_type at the end. */ enum msg_type { METADATA_UPDATED = 0, RESYNCING, @@ -101,6 +103,7 @@ enum msg_type { BITMAP_NEEDS_SYNC, CHANGE_CAPACITY, BITMAP_RESIZE, + RESYNCING_START, }; struct cluster_msg { @@ -461,6 +464,7 @@ static void process_suspend_info(struct mddev *mddev, clear_bit(MD_RESYNCING_REMOTE, &mddev->recovery); remove_suspend_info(mddev, slot); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + clear_bit(MD_CLUSTER_WAITING_FOR_SYNC, &cinfo->state); md_wakeup_thread(mddev->thread); return; } @@ -531,6 +535,7 @@ static int process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) res = -1; } clear_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state); + set_bit(MD_CLUSTER_WAITING_FOR_SYNC, &cinfo->state); return res; } @@ -599,6 +604,9 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) case CHANGE_CAPACITY: set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); break; + case RESYNCING_START: + clear_bit(MD_CLUSTER_WAITING_FOR_SYNC, &mddev->cluster_info->state); + break; case RESYNCING: set_bit(MD_RESYNCING_REMOTE, &mddev->recovery); process_suspend_info(mddev, le32_to_cpu(msg->slot), @@ -1345,6 +1353,23 @@ static void resync_info_get(struct mddev *mddev, sector_t *lo, sector_t *hi) spin_unlock_irq(&cinfo->suspend_lock); } +static int resync_status_get(struct mddev *mddev) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + + return test_bit(MD_CLUSTER_WAITING_FOR_SYNC, &cinfo->state); +} + +static int resync_start_notify(struct mddev *mddev) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + struct cluster_msg cmsg = {0}; + + cmsg.type = cpu_to_le32(RESYNCING_START); + + return sendmsg(cinfo, &cmsg, 0); +} + static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi) { struct md_cluster_info *cinfo = mddev->cluster_info; @@ -1579,6 +1604,8 @@ static struct md_cluster_operations cluster_ops = { .resync_start = resync_start, .resync_finish = resync_finish, .resync_info_update = resync_info_update, + .resync_start_notify = resync_start_notify, + .resync_status_get = resync_status_get, .resync_info_get = resync_info_get, .metadata_update_start = metadata_update_start, .metadata_update_finish = metadata_update_finish, diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h index a78e3021775d..470bf18ffde5 100644 --- a/drivers/md/md-cluster.h +++ b/drivers/md/md-cluster.h @@ -14,6 +14,8 @@ struct md_cluster_operations { int (*leave)(struct mddev *mddev); int (*slot_number)(struct mddev *mddev); int (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi); + int (*resync_start_notify)(struct mddev *mddev); + int (*resync_status_get)(struct mddev *mddev); void (*resync_info_get)(struct mddev *mddev, sector_t *lo, sector_t *hi); int (*metadata_update_start)(struct mddev *mddev); int (*metadata_update_finish)(struct mddev *mddev); diff --git a/drivers/md/md.c b/drivers/md/md.c index aff9118ff697..e393df55fc8b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8907,7 +8907,8 @@ void md_do_sync(struct md_thread *thread) * This will mean we have to start checking from the beginning again. * */ - + if (mddev_is_clustered(mddev)) + md_cluster_ops->resync_start_notify(mddev); do { int mddev2_minor = -1; mddev->curr_resync = MD_RESYNC_DELAYED; @@ -9968,8 +9969,18 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) */ if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE && !(le32_to_cpu(sb->feature_map) & - MD_FEATURE_RESHAPE_ACTIVE)) { - rdev2->saved_raid_disk = role; + MD_FEATURE_RESHAPE_ACTIVE) && + !md_cluster_ops->resync_status_get(mddev)) { + /* + * -1 to make raid1_add_disk() set conf->fullsync + * to 1. This could avoid skipping sync when the + * remote node is down during resyncing. + */ + if ((le32_to_cpu(sb->feature_map) + & MD_FEATURE_RECOVERY_OFFSET)) + rdev2->saved_raid_disk = -1; + else + rdev2->saved_raid_disk = role; ret = remove_and_add_spares(mddev, rdev2); pr_info("Activated spare: %pg\n", rdev2->bdev);