Message ID | 20230522133225.2983667-1-yukuai1@huaweicloud.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [v2] md: fix duplicate filename for rdev | expand |
Dear Yu, Thank you for your patch. Just a few nits, that can be ignored. Am 22.05.23 um 15:32 schrieb Yu Kuai: > From: Yu Kuai <yukuai3@huawei.com> > > commit 5792a2856a63 ("[PATCH] md: avoid a deadlock when removing a device I’d start with capital letter: Commit …. > from an md array via sysfs") delay the deleting of rdev, however, this 1. delay*s* 2. s/deleting/deletion/ > introduce a window that rdev can be added again while the deleting is 1. introduce*s* 2. s/deleting/deletion/ > not done yet, and sysfs will complain about duplicate filename. > > Follow up patches try to fix this problem by flush workqueue, however, flush*ing* the work queue > flush_rdev_wq() is just dead code, the progress in > md_kick_rdev_from_array(): … is: > 1) list_del_rcu(&rdev->same_set); > 2) synchronize_rcu(); > 3) queue_work(md_rdev_misc_wq, &rdev->del_work); > > So in flush_rdev_wq(), if rdev is found in the list, work_pending() can > never pass, in the meantime, if work is queued, then rdev can never be > found in the list. > > flush_rdev_wq() can be replaced by flush_workqueue() directly, however, > this approach is not good: > - the workqueue is global, this synchronization for all raid disks is > not necessary. The work queue … > - flush_workqueue can't be called under 'reconfig_mutex', there is still > a small window between flush_workqueue() and mddev_lock() that other > context can queue new work, hence the problem is not solved completely. context*s* > sysfs already have apis to support delete itself through writer, and 1. s/have/has/ 2. deleting > these apis, specifically sysfs_break/unbreak_active_protection(), is used s/is/are/ > to support deleting rdev synchronously. Therefore, the above commit can be > reverted, and sysfs duplicate filename can be avoided. > > A new mdadm regression test is proposed as well. It’s not included, right? Then I’d remove the sentence, or write: … is going to be proposed … Kind regards, Paul > Link: https://lore.kernel.org/linux-raid/20230428062845.1975462-1-yukuai1@huaweicloud.com/ > Fixes: 5792a2856a63 ("[PATCH] md: avoid a deadlock when removing a device from an md array via sysfs") > Signed-off-by: Yu Kuai <yukuai3@huawei.com> > --- > Changes in v2: > - rebase from the latest md-next branch > > drivers/md/md.c | 84 +++++++++++++++++++++++++------------------------ > drivers/md/md.h | 8 +++++ > 2 files changed, 51 insertions(+), 41 deletions(-) > > diff --git a/drivers/md/md.c b/drivers/md/md.c > index 7455bc9d8498..cafb457d614c 100644 > --- a/drivers/md/md.c > +++ b/drivers/md/md.c > @@ -92,6 +92,7 @@ static struct workqueue_struct *md_rdev_misc_wq; > static int remove_and_add_spares(struct mddev *mddev, > struct md_rdev *this); > static void mddev_detach(struct mddev *mddev); > +static void export_rdev(struct md_rdev *rdev); > > /* > * Default number of read corrections we'll attempt on an rdev > @@ -643,9 +644,11 @@ void mddev_init(struct mddev *mddev) > { > mutex_init(&mddev->open_mutex); > mutex_init(&mddev->reconfig_mutex); > + mutex_init(&mddev->delete_mutex); > mutex_init(&mddev->bitmap_info.mutex); > INIT_LIST_HEAD(&mddev->disks); > INIT_LIST_HEAD(&mddev->all_mddevs); > + INIT_LIST_HEAD(&mddev->deleting); > timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0); > atomic_set(&mddev->active, 1); > atomic_set(&mddev->openers, 0); > @@ -747,6 +750,23 @@ static void mddev_free(struct mddev *mddev) > > static const struct attribute_group md_redundancy_group; > > +static void md_free_rdev(struct mddev *mddev) > +{ > + struct md_rdev *rdev; > + struct md_rdev *tmp; > + > + if (list_empty_careful(&mddev->deleting)) > + return; > + > + mutex_lock(&mddev->delete_mutex); > + list_for_each_entry_safe(rdev, tmp, &mddev->deleting, same_set) { > + list_del_init(&rdev->same_set); > + kobject_del(&rdev->kobj); > + export_rdev(rdev); > + } > + mutex_unlock(&mddev->delete_mutex); > +} > + > void mddev_unlock(struct mddev *mddev) > { > if (mddev->to_remove) { > @@ -788,6 +808,8 @@ void mddev_unlock(struct mddev *mddev) > } else > mutex_unlock(&mddev->reconfig_mutex); > > + md_free_rdev(mddev); > + > /* As we've dropped the mutex we need a spinlock to > * make sure the thread doesn't disappear > */ > @@ -2428,13 +2450,6 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) > return err; > } > > -static void rdev_delayed_delete(struct work_struct *ws) > -{ > - struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work); > - kobject_del(&rdev->kobj); > - kobject_put(&rdev->kobj); > -} > - > void md_autodetect_dev(dev_t dev); > > static void export_rdev(struct md_rdev *rdev) > @@ -2452,6 +2467,8 @@ static void export_rdev(struct md_rdev *rdev) > > static void md_kick_rdev_from_array(struct md_rdev *rdev) > { > + struct mddev *mddev = rdev->mddev; > + > bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); > list_del_rcu(&rdev->same_set); > pr_debug("md: unbind<%pg>\n", rdev->bdev); > @@ -2465,15 +2482,17 @@ static void md_kick_rdev_from_array(struct md_rdev *rdev) > rdev->sysfs_unack_badblocks = NULL; > rdev->sysfs_badblocks = NULL; > rdev->badblocks.count = 0; > - /* We need to delay this, otherwise we can deadlock when > - * writing to 'remove' to "dev/state". We also need > - * to delay it due to rcu usage. > - */ > + > synchronize_rcu(); > - INIT_WORK(&rdev->del_work, rdev_delayed_delete); > - kobject_get(&rdev->kobj); > - queue_work(md_rdev_misc_wq, &rdev->del_work); > - export_rdev(rdev); > + > + /* > + * kobject_del() will wait for all in progress writers to be done, where > + * reconfig_mutex is held, hence it can't be called under > + * reconfig_mutex and it's delayed to mddev_unlock(). > + */ > + mutex_lock(&mddev->delete_mutex); > + list_add(&rdev->same_set, &mddev->deleting); > + mutex_unlock(&mddev->delete_mutex); > } > > static void export_array(struct mddev *mddev) > @@ -3541,6 +3560,7 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, > { > struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); > struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); > + struct kernfs_node *kn = NULL; > ssize_t rv; > struct mddev *mddev = rdev->mddev; > > @@ -3548,6 +3568,10 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, > return -EIO; > if (!capable(CAP_SYS_ADMIN)) > return -EACCES; > + > + if (entry->store == state_store && cmd_match(page, "remove")) > + kn = sysfs_break_active_protection(kobj, attr); > + > rv = mddev ? mddev_lock(mddev) : -ENODEV; > if (!rv) { > if (rdev->mddev == NULL) > @@ -3556,6 +3580,10 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, > rv = entry->store(rdev, page, length); > mddev_unlock(mddev); > } > + > + if (kn) > + sysfs_unbreak_active_protection(kn); > + > return rv; > } > > @@ -4479,20 +4507,6 @@ null_show(struct mddev *mddev, char *page) > return -EINVAL; > } > > -/* need to ensure rdev_delayed_delete() has completed */ > -static void flush_rdev_wq(struct mddev *mddev) > -{ > - struct md_rdev *rdev; > - > - rcu_read_lock(); > - rdev_for_each_rcu(rdev, mddev) > - if (work_pending(&rdev->del_work)) { > - flush_workqueue(md_rdev_misc_wq); > - break; > - } > - rcu_read_unlock(); > -} > - > static ssize_t > new_dev_store(struct mddev *mddev, const char *buf, size_t len) > { > @@ -4520,7 +4534,6 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len) > minor != MINOR(dev)) > return -EOVERFLOW; > > - flush_rdev_wq(mddev); > err = mddev_lock(mddev); > if (err) > return err; > @@ -5590,7 +5603,6 @@ struct mddev *md_alloc(dev_t dev, char *name) > * removed (mddev_delayed_delete). > */ > flush_workqueue(md_misc_wq); > - flush_workqueue(md_rdev_misc_wq); > > mutex_lock(&disks_mutex); > mddev = mddev_alloc(dev); > @@ -7553,9 +7565,6 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, > > } > > - if (cmd == ADD_NEW_DISK || cmd == HOT_ADD_DISK) > - flush_rdev_wq(mddev); > - > if (cmd == HOT_REMOVE_DISK) > /* need to ensure recovery thread has run */ > wait_event_interruptible_timeout(mddev->sb_wait, > @@ -9618,10 +9627,6 @@ static int __init md_init(void) > if (!md_misc_wq) > goto err_misc_wq; > > - md_rdev_misc_wq = alloc_workqueue("md_rdev_misc", 0, 0); > - if (!md_rdev_misc_wq) > - goto err_rdev_misc_wq; > - > ret = __register_blkdev(MD_MAJOR, "md", md_probe); > if (ret < 0) > goto err_md; > @@ -9640,8 +9645,6 @@ static int __init md_init(void) > err_mdp: > unregister_blkdev(MD_MAJOR, "md"); > err_md: > - destroy_workqueue(md_rdev_misc_wq); > -err_rdev_misc_wq: > destroy_workqueue(md_misc_wq); > err_misc_wq: > destroy_workqueue(md_wq); > @@ -9937,7 +9940,6 @@ static __exit void md_exit(void) > } > spin_unlock(&all_mddevs_lock); > > - destroy_workqueue(md_rdev_misc_wq); > destroy_workqueue(md_misc_wq); > destroy_workqueue(md_wq); > } > diff --git a/drivers/md/md.h b/drivers/md/md.h > index 1eec65cf783c..4d191db831da 100644 > --- a/drivers/md/md.h > +++ b/drivers/md/md.h > @@ -531,6 +531,14 @@ struct mddev { > unsigned int good_device_nr; /* good device num within cluster raid */ > unsigned int noio_flag; /* for memalloc scope API */ > > + /* > + * Temporarily store rdev that will be finally removed when > + * reconfig_mutex is unlocked. > + */ > + struct list_head deleting; > + /* Protect the deleting list */ > + struct mutex delete_mutex; > + > bool has_superblocks:1; > bool fail_last_dev:1; > bool serialize_policy:1;
On Mon, May 22, 2023 at 6:35 AM Yu Kuai <yukuai1@huaweicloud.com> wrote: > > From: Yu Kuai <yukuai3@huawei.com> > > commit 5792a2856a63 ("[PATCH] md: avoid a deadlock when removing a device > from an md array via sysfs") delay the deleting of rdev, however, this > introduce a window that rdev can be added again while the deleting is > not done yet, and sysfs will complain about duplicate filename. > > Follow up patches try to fix this problem by flush workqueue, however, > flush_rdev_wq() is just dead code, the progress in > md_kick_rdev_from_array(): > > 1) list_del_rcu(&rdev->same_set); > 2) synchronize_rcu(); > 3) queue_work(md_rdev_misc_wq, &rdev->del_work); > > So in flush_rdev_wq(), if rdev is found in the list, work_pending() can > never pass, in the meantime, if work is queued, then rdev can never be > found in the list. > > flush_rdev_wq() can be replaced by flush_workqueue() directly, however, > this approach is not good: > - the workqueue is global, this synchronization for all raid disks is > not necessary. > - flush_workqueue can't be called under 'reconfig_mutex', there is still > a small window between flush_workqueue() and mddev_lock() that other > context can queue new work, hence the problem is not solved completely. > > sysfs already have apis to support delete itself through writer, and > these apis, specifically sysfs_break/unbreak_active_protection(), is used > to support deleting rdev synchronously. Therefore, the above commit can be > reverted, and sysfs duplicate filename can be avoided. > > A new mdadm regression test is proposed as well. > > Link: https://lore.kernel.org/linux-raid/20230428062845.1975462-1-yukuai1@huaweicloud.com/ > Fixes: 5792a2856a63 ("[PATCH] md: avoid a deadlock when removing a device from an md array via sysfs") > Signed-off-by: Yu Kuai <yukuai3@huawei.com> We got build time warning with this patch: drivers/md/md.c:90:33: warning: ‘md_rdev_misc_wq’ defined but not used [-Wunused-variable] static struct workqueue_struct *md_rdev_misc_wq; ^~~~~~~~~~~~~~~ Thanks, Song > --- > Changes in v2: > - rebase from the latest md-next branch > > drivers/md/md.c | 84 +++++++++++++++++++++++++------------------------ > drivers/md/md.h | 8 +++++ > 2 files changed, 51 insertions(+), 41 deletions(-) > > diff --git a/drivers/md/md.c b/drivers/md/md.c > index 7455bc9d8498..cafb457d614c 100644 > --- a/drivers/md/md.c > +++ b/drivers/md/md.c > @@ -92,6 +92,7 @@ static struct workqueue_struct *md_rdev_misc_wq; > static int remove_and_add_spares(struct mddev *mddev, > struct md_rdev *this); > static void mddev_detach(struct mddev *mddev); > +static void export_rdev(struct md_rdev *rdev); > > /* > * Default number of read corrections we'll attempt on an rdev > @@ -643,9 +644,11 @@ void mddev_init(struct mddev *mddev) > { > mutex_init(&mddev->open_mutex); > mutex_init(&mddev->reconfig_mutex); > + mutex_init(&mddev->delete_mutex); > mutex_init(&mddev->bitmap_info.mutex); > INIT_LIST_HEAD(&mddev->disks); > INIT_LIST_HEAD(&mddev->all_mddevs); > + INIT_LIST_HEAD(&mddev->deleting); > timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0); > atomic_set(&mddev->active, 1); > atomic_set(&mddev->openers, 0); > @@ -747,6 +750,23 @@ static void mddev_free(struct mddev *mddev) > > static const struct attribute_group md_redundancy_group; > > +static void md_free_rdev(struct mddev *mddev) > +{ > + struct md_rdev *rdev; > + struct md_rdev *tmp; > + > + if (list_empty_careful(&mddev->deleting)) > + return; > + > + mutex_lock(&mddev->delete_mutex); > + list_for_each_entry_safe(rdev, tmp, &mddev->deleting, same_set) { > + list_del_init(&rdev->same_set); > + kobject_del(&rdev->kobj); > + export_rdev(rdev); > + } > + mutex_unlock(&mddev->delete_mutex); > +} > + > void mddev_unlock(struct mddev *mddev) > { > if (mddev->to_remove) { > @@ -788,6 +808,8 @@ void mddev_unlock(struct mddev *mddev) > } else > mutex_unlock(&mddev->reconfig_mutex); > > + md_free_rdev(mddev); > + > /* As we've dropped the mutex we need a spinlock to > * make sure the thread doesn't disappear > */ > @@ -2428,13 +2450,6 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) > return err; > } > > -static void rdev_delayed_delete(struct work_struct *ws) > -{ > - struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work); > - kobject_del(&rdev->kobj); > - kobject_put(&rdev->kobj); > -} > - > void md_autodetect_dev(dev_t dev); > > static void export_rdev(struct md_rdev *rdev) > @@ -2452,6 +2467,8 @@ static void export_rdev(struct md_rdev *rdev) > > static void md_kick_rdev_from_array(struct md_rdev *rdev) > { > + struct mddev *mddev = rdev->mddev; > + > bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); > list_del_rcu(&rdev->same_set); > pr_debug("md: unbind<%pg>\n", rdev->bdev); > @@ -2465,15 +2482,17 @@ static void md_kick_rdev_from_array(struct md_rdev *rdev) > rdev->sysfs_unack_badblocks = NULL; > rdev->sysfs_badblocks = NULL; > rdev->badblocks.count = 0; > - /* We need to delay this, otherwise we can deadlock when > - * writing to 'remove' to "dev/state". We also need > - * to delay it due to rcu usage. > - */ > + > synchronize_rcu(); > - INIT_WORK(&rdev->del_work, rdev_delayed_delete); > - kobject_get(&rdev->kobj); > - queue_work(md_rdev_misc_wq, &rdev->del_work); > - export_rdev(rdev); > + > + /* > + * kobject_del() will wait for all in progress writers to be done, where > + * reconfig_mutex is held, hence it can't be called under > + * reconfig_mutex and it's delayed to mddev_unlock(). > + */ > + mutex_lock(&mddev->delete_mutex); > + list_add(&rdev->same_set, &mddev->deleting); > + mutex_unlock(&mddev->delete_mutex); > } > > static void export_array(struct mddev *mddev) > @@ -3541,6 +3560,7 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, > { > struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); > struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); > + struct kernfs_node *kn = NULL; > ssize_t rv; > struct mddev *mddev = rdev->mddev; > > @@ -3548,6 +3568,10 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, > return -EIO; > if (!capable(CAP_SYS_ADMIN)) > return -EACCES; > + > + if (entry->store == state_store && cmd_match(page, "remove")) > + kn = sysfs_break_active_protection(kobj, attr); > + > rv = mddev ? mddev_lock(mddev) : -ENODEV; > if (!rv) { > if (rdev->mddev == NULL) > @@ -3556,6 +3580,10 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, > rv = entry->store(rdev, page, length); > mddev_unlock(mddev); > } > + > + if (kn) > + sysfs_unbreak_active_protection(kn); > + > return rv; > } > > @@ -4479,20 +4507,6 @@ null_show(struct mddev *mddev, char *page) > return -EINVAL; > } > > -/* need to ensure rdev_delayed_delete() has completed */ > -static void flush_rdev_wq(struct mddev *mddev) > -{ > - struct md_rdev *rdev; > - > - rcu_read_lock(); > - rdev_for_each_rcu(rdev, mddev) > - if (work_pending(&rdev->del_work)) { > - flush_workqueue(md_rdev_misc_wq); > - break; > - } > - rcu_read_unlock(); > -} > - > static ssize_t > new_dev_store(struct mddev *mddev, const char *buf, size_t len) > { > @@ -4520,7 +4534,6 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len) > minor != MINOR(dev)) > return -EOVERFLOW; > > - flush_rdev_wq(mddev); > err = mddev_lock(mddev); > if (err) > return err; > @@ -5590,7 +5603,6 @@ struct mddev *md_alloc(dev_t dev, char *name) > * removed (mddev_delayed_delete). > */ > flush_workqueue(md_misc_wq); > - flush_workqueue(md_rdev_misc_wq); > > mutex_lock(&disks_mutex); > mddev = mddev_alloc(dev); > @@ -7553,9 +7565,6 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, > > } > > - if (cmd == ADD_NEW_DISK || cmd == HOT_ADD_DISK) > - flush_rdev_wq(mddev); > - > if (cmd == HOT_REMOVE_DISK) > /* need to ensure recovery thread has run */ > wait_event_interruptible_timeout(mddev->sb_wait, > @@ -9618,10 +9627,6 @@ static int __init md_init(void) > if (!md_misc_wq) > goto err_misc_wq; > > - md_rdev_misc_wq = alloc_workqueue("md_rdev_misc", 0, 0); > - if (!md_rdev_misc_wq) > - goto err_rdev_misc_wq; > - > ret = __register_blkdev(MD_MAJOR, "md", md_probe); > if (ret < 0) > goto err_md; > @@ -9640,8 +9645,6 @@ static int __init md_init(void) > err_mdp: > unregister_blkdev(MD_MAJOR, "md"); > err_md: > - destroy_workqueue(md_rdev_misc_wq); > -err_rdev_misc_wq: > destroy_workqueue(md_misc_wq); > err_misc_wq: > destroy_workqueue(md_wq); > @@ -9937,7 +9940,6 @@ static __exit void md_exit(void) > } > spin_unlock(&all_mddevs_lock); > > - destroy_workqueue(md_rdev_misc_wq); > destroy_workqueue(md_misc_wq); > destroy_workqueue(md_wq); > } > diff --git a/drivers/md/md.h b/drivers/md/md.h > index 1eec65cf783c..4d191db831da 100644 > --- a/drivers/md/md.h > +++ b/drivers/md/md.h > @@ -531,6 +531,14 @@ struct mddev { > unsigned int good_device_nr; /* good device num within cluster raid */ > unsigned int noio_flag; /* for memalloc scope API */ > > + /* > + * Temporarily store rdev that will be finally removed when > + * reconfig_mutex is unlocked. > + */ > + struct list_head deleting; > + /* Protect the deleting list */ > + struct mutex delete_mutex; > + > bool has_superblocks:1; > bool fail_last_dev:1; > bool serialize_policy:1; > -- > 2.39.2 >
Hi, 在 2023/05/23 3:02, Song Liu 写道: > On Mon, May 22, 2023 at 6:35 AM Yu Kuai <yukuai1@huaweicloud.com> wrote: >> >> From: Yu Kuai <yukuai3@huawei.com> >> >> commit 5792a2856a63 ("[PATCH] md: avoid a deadlock when removing a device >> from an md array via sysfs") delay the deleting of rdev, however, this >> introduce a window that rdev can be added again while the deleting is >> not done yet, and sysfs will complain about duplicate filename. >> >> Follow up patches try to fix this problem by flush workqueue, however, >> flush_rdev_wq() is just dead code, the progress in >> md_kick_rdev_from_array(): >> >> 1) list_del_rcu(&rdev->same_set); >> 2) synchronize_rcu(); >> 3) queue_work(md_rdev_misc_wq, &rdev->del_work); >> >> So in flush_rdev_wq(), if rdev is found in the list, work_pending() can >> never pass, in the meantime, if work is queued, then rdev can never be >> found in the list. >> >> flush_rdev_wq() can be replaced by flush_workqueue() directly, however, >> this approach is not good: >> - the workqueue is global, this synchronization for all raid disks is >> not necessary. >> - flush_workqueue can't be called under 'reconfig_mutex', there is still >> a small window between flush_workqueue() and mddev_lock() that other >> context can queue new work, hence the problem is not solved completely. >> >> sysfs already have apis to support delete itself through writer, and >> these apis, specifically sysfs_break/unbreak_active_protection(), is used >> to support deleting rdev synchronously. Therefore, the above commit can be >> reverted, and sysfs duplicate filename can be avoided. >> >> A new mdadm regression test is proposed as well. >> >> Link: https://lore.kernel.org/linux-raid/20230428062845.1975462-1-yukuai1@huaweicloud.com/ >> Fixes: 5792a2856a63 ("[PATCH] md: avoid a deadlock when removing a device from an md array via sysfs") >> Signed-off-by: Yu Kuai <yukuai3@huawei.com> > > We got build time warning with this patch: > > drivers/md/md.c:90:33: warning: ‘md_rdev_misc_wq’ defined but not used > [-Wunused-variable] > static struct workqueue_struct *md_rdev_misc_wq; Will fix this in the next version, Thanks, Kuai > ^~~~~~~~~~~~~~~ > > Thanks, > Song > >> --- >> Changes in v2: >> - rebase from the latest md-next branch >> >> drivers/md/md.c | 84 +++++++++++++++++++++++++------------------------ >> drivers/md/md.h | 8 +++++ >> 2 files changed, 51 insertions(+), 41 deletions(-) >> >> diff --git a/drivers/md/md.c b/drivers/md/md.c >> index 7455bc9d8498..cafb457d614c 100644 >> --- a/drivers/md/md.c >> +++ b/drivers/md/md.c >> @@ -92,6 +92,7 @@ static struct workqueue_struct *md_rdev_misc_wq; >> static int remove_and_add_spares(struct mddev *mddev, >> struct md_rdev *this); >> static void mddev_detach(struct mddev *mddev); >> +static void export_rdev(struct md_rdev *rdev); >> >> /* >> * Default number of read corrections we'll attempt on an rdev >> @@ -643,9 +644,11 @@ void mddev_init(struct mddev *mddev) >> { >> mutex_init(&mddev->open_mutex); >> mutex_init(&mddev->reconfig_mutex); >> + mutex_init(&mddev->delete_mutex); >> mutex_init(&mddev->bitmap_info.mutex); >> INIT_LIST_HEAD(&mddev->disks); >> INIT_LIST_HEAD(&mddev->all_mddevs); >> + INIT_LIST_HEAD(&mddev->deleting); >> timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0); >> atomic_set(&mddev->active, 1); >> atomic_set(&mddev->openers, 0); >> @@ -747,6 +750,23 @@ static void mddev_free(struct mddev *mddev) >> >> static const struct attribute_group md_redundancy_group; >> >> +static void md_free_rdev(struct mddev *mddev) >> +{ >> + struct md_rdev *rdev; >> + struct md_rdev *tmp; >> + >> + if (list_empty_careful(&mddev->deleting)) >> + return; >> + >> + mutex_lock(&mddev->delete_mutex); >> + list_for_each_entry_safe(rdev, tmp, &mddev->deleting, same_set) { >> + list_del_init(&rdev->same_set); >> + kobject_del(&rdev->kobj); >> + export_rdev(rdev); >> + } >> + mutex_unlock(&mddev->delete_mutex); >> +} >> + >> void mddev_unlock(struct mddev *mddev) >> { >> if (mddev->to_remove) { >> @@ -788,6 +808,8 @@ void mddev_unlock(struct mddev *mddev) >> } else >> mutex_unlock(&mddev->reconfig_mutex); >> >> + md_free_rdev(mddev); >> + >> /* As we've dropped the mutex we need a spinlock to >> * make sure the thread doesn't disappear >> */ >> @@ -2428,13 +2450,6 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) >> return err; >> } >> >> -static void rdev_delayed_delete(struct work_struct *ws) >> -{ >> - struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work); >> - kobject_del(&rdev->kobj); >> - kobject_put(&rdev->kobj); >> -} >> - >> void md_autodetect_dev(dev_t dev); >> >> static void export_rdev(struct md_rdev *rdev) >> @@ -2452,6 +2467,8 @@ static void export_rdev(struct md_rdev *rdev) >> >> static void md_kick_rdev_from_array(struct md_rdev *rdev) >> { >> + struct mddev *mddev = rdev->mddev; >> + >> bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); >> list_del_rcu(&rdev->same_set); >> pr_debug("md: unbind<%pg>\n", rdev->bdev); >> @@ -2465,15 +2482,17 @@ static void md_kick_rdev_from_array(struct md_rdev *rdev) >> rdev->sysfs_unack_badblocks = NULL; >> rdev->sysfs_badblocks = NULL; >> rdev->badblocks.count = 0; >> - /* We need to delay this, otherwise we can deadlock when >> - * writing to 'remove' to "dev/state". We also need >> - * to delay it due to rcu usage. >> - */ >> + >> synchronize_rcu(); >> - INIT_WORK(&rdev->del_work, rdev_delayed_delete); >> - kobject_get(&rdev->kobj); >> - queue_work(md_rdev_misc_wq, &rdev->del_work); >> - export_rdev(rdev); >> + >> + /* >> + * kobject_del() will wait for all in progress writers to be done, where >> + * reconfig_mutex is held, hence it can't be called under >> + * reconfig_mutex and it's delayed to mddev_unlock(). >> + */ >> + mutex_lock(&mddev->delete_mutex); >> + list_add(&rdev->same_set, &mddev->deleting); >> + mutex_unlock(&mddev->delete_mutex); >> } >> >> static void export_array(struct mddev *mddev) >> @@ -3541,6 +3560,7 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, >> { >> struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); >> struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); >> + struct kernfs_node *kn = NULL; >> ssize_t rv; >> struct mddev *mddev = rdev->mddev; >> >> @@ -3548,6 +3568,10 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, >> return -EIO; >> if (!capable(CAP_SYS_ADMIN)) >> return -EACCES; >> + >> + if (entry->store == state_store && cmd_match(page, "remove")) >> + kn = sysfs_break_active_protection(kobj, attr); >> + >> rv = mddev ? mddev_lock(mddev) : -ENODEV; >> if (!rv) { >> if (rdev->mddev == NULL) >> @@ -3556,6 +3580,10 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, >> rv = entry->store(rdev, page, length); >> mddev_unlock(mddev); >> } >> + >> + if (kn) >> + sysfs_unbreak_active_protection(kn); >> + >> return rv; >> } >> >> @@ -4479,20 +4507,6 @@ null_show(struct mddev *mddev, char *page) >> return -EINVAL; >> } >> >> -/* need to ensure rdev_delayed_delete() has completed */ >> -static void flush_rdev_wq(struct mddev *mddev) >> -{ >> - struct md_rdev *rdev; >> - >> - rcu_read_lock(); >> - rdev_for_each_rcu(rdev, mddev) >> - if (work_pending(&rdev->del_work)) { >> - flush_workqueue(md_rdev_misc_wq); >> - break; >> - } >> - rcu_read_unlock(); >> -} >> - >> static ssize_t >> new_dev_store(struct mddev *mddev, const char *buf, size_t len) >> { >> @@ -4520,7 +4534,6 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len) >> minor != MINOR(dev)) >> return -EOVERFLOW; >> >> - flush_rdev_wq(mddev); >> err = mddev_lock(mddev); >> if (err) >> return err; >> @@ -5590,7 +5603,6 @@ struct mddev *md_alloc(dev_t dev, char *name) >> * removed (mddev_delayed_delete). >> */ >> flush_workqueue(md_misc_wq); >> - flush_workqueue(md_rdev_misc_wq); >> >> mutex_lock(&disks_mutex); >> mddev = mddev_alloc(dev); >> @@ -7553,9 +7565,6 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, >> >> } >> >> - if (cmd == ADD_NEW_DISK || cmd == HOT_ADD_DISK) >> - flush_rdev_wq(mddev); >> - >> if (cmd == HOT_REMOVE_DISK) >> /* need to ensure recovery thread has run */ >> wait_event_interruptible_timeout(mddev->sb_wait, >> @@ -9618,10 +9627,6 @@ static int __init md_init(void) >> if (!md_misc_wq) >> goto err_misc_wq; >> >> - md_rdev_misc_wq = alloc_workqueue("md_rdev_misc", 0, 0); >> - if (!md_rdev_misc_wq) >> - goto err_rdev_misc_wq; >> - >> ret = __register_blkdev(MD_MAJOR, "md", md_probe); >> if (ret < 0) >> goto err_md; >> @@ -9640,8 +9645,6 @@ static int __init md_init(void) >> err_mdp: >> unregister_blkdev(MD_MAJOR, "md"); >> err_md: >> - destroy_workqueue(md_rdev_misc_wq); >> -err_rdev_misc_wq: >> destroy_workqueue(md_misc_wq); >> err_misc_wq: >> destroy_workqueue(md_wq); >> @@ -9937,7 +9940,6 @@ static __exit void md_exit(void) >> } >> spin_unlock(&all_mddevs_lock); >> >> - destroy_workqueue(md_rdev_misc_wq); >> destroy_workqueue(md_misc_wq); >> destroy_workqueue(md_wq); >> } >> diff --git a/drivers/md/md.h b/drivers/md/md.h >> index 1eec65cf783c..4d191db831da 100644 >> --- a/drivers/md/md.h >> +++ b/drivers/md/md.h >> @@ -531,6 +531,14 @@ struct mddev { >> unsigned int good_device_nr; /* good device num within cluster raid */ >> unsigned int noio_flag; /* for memalloc scope API */ >> >> + /* >> + * Temporarily store rdev that will be finally removed when >> + * reconfig_mutex is unlocked. >> + */ >> + struct list_head deleting; >> + /* Protect the deleting list */ >> + struct mutex delete_mutex; >> + >> bool has_superblocks:1; >> bool fail_last_dev:1; >> bool serialize_policy:1; >> -- >> 2.39.2 >> > . >
Hi, 在 2023/05/22 22:03, Paul Menzel 写道: > Dear Yu, > > > Thank you for your patch. Just a few nits, that can be ignored. Thanks for these sugestions, I'll update them in v3. Kuai > > Am 22.05.23 um 15:32 schrieb Yu Kuai: >> From: Yu Kuai <yukuai3@huawei.com> >> >> commit 5792a2856a63 ("[PATCH] md: avoid a deadlock when removing a device > > I’d start with capital letter: Commit …. > >> from an md array via sysfs") delay the deleting of rdev, however, this > > 1. delay*s* > 2. s/deleting/deletion/ > >> introduce a window that rdev can be added again while the deleting is > > 1. introduce*s* > 2. s/deleting/deletion/ > >> not done yet, and sysfs will complain about duplicate filename. >> >> Follow up patches try to fix this problem by flush workqueue, however, > > flush*ing* the work queue > >> flush_rdev_wq() is just dead code, the progress in >> md_kick_rdev_from_array(): > > … is: > >> 1) list_del_rcu(&rdev->same_set); >> 2) synchronize_rcu(); >> 3) queue_work(md_rdev_misc_wq, &rdev->del_work); >> >> So in flush_rdev_wq(), if rdev is found in the list, work_pending() can >> never pass, in the meantime, if work is queued, then rdev can never be >> found in the list. >> >> flush_rdev_wq() can be replaced by flush_workqueue() directly, however, >> this approach is not good: >> - the workqueue is global, this synchronization for all raid disks is >> not necessary. > > The work queue … > >> - flush_workqueue can't be called under 'reconfig_mutex', there is still >> a small window between flush_workqueue() and mddev_lock() that other >> context can queue new work, hence the problem is not solved >> completely. > > context*s* > >> sysfs already have apis to support delete itself through writer, and > > 1. s/have/has/ > 2. deleting > >> these apis, specifically sysfs_break/unbreak_active_protection(), is used > > s/is/are/ > >> to support deleting rdev synchronously. Therefore, the above commit >> can be >> reverted, and sysfs duplicate filename can be avoided. >> >> A new mdadm regression test is proposed as well. > > It’s not included, right? Then I’d remove the sentence, or write: … is > going to be proposed … > > > Kind regards, > > Paul > > >> Link: >> https://lore.kernel.org/linux-raid/20230428062845.1975462-1-yukuai1@huaweicloud.com/ >> >> Fixes: 5792a2856a63 ("[PATCH] md: avoid a deadlock when removing a >> device from an md array via sysfs") >> Signed-off-by: Yu Kuai <yukuai3@huawei.com> >> --- >> Changes in v2: >> - rebase from the latest md-next branch >> >> drivers/md/md.c | 84 +++++++++++++++++++++++++------------------------ >> drivers/md/md.h | 8 +++++ >> 2 files changed, 51 insertions(+), 41 deletions(-) >> >> diff --git a/drivers/md/md.c b/drivers/md/md.c >> index 7455bc9d8498..cafb457d614c 100644 >> --- a/drivers/md/md.c >> +++ b/drivers/md/md.c >> @@ -92,6 +92,7 @@ static struct workqueue_struct *md_rdev_misc_wq; >> static int remove_and_add_spares(struct mddev *mddev, >> struct md_rdev *this); >> static void mddev_detach(struct mddev *mddev); >> +static void export_rdev(struct md_rdev *rdev); >> /* >> * Default number of read corrections we'll attempt on an rdev >> @@ -643,9 +644,11 @@ void mddev_init(struct mddev *mddev) >> { >> mutex_init(&mddev->open_mutex); >> mutex_init(&mddev->reconfig_mutex); >> + mutex_init(&mddev->delete_mutex); >> mutex_init(&mddev->bitmap_info.mutex); >> INIT_LIST_HEAD(&mddev->disks); >> INIT_LIST_HEAD(&mddev->all_mddevs); >> + INIT_LIST_HEAD(&mddev->deleting); >> timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0); >> atomic_set(&mddev->active, 1); >> atomic_set(&mddev->openers, 0); >> @@ -747,6 +750,23 @@ static void mddev_free(struct mddev *mddev) >> static const struct attribute_group md_redundancy_group; >> +static void md_free_rdev(struct mddev *mddev) >> +{ >> + struct md_rdev *rdev; >> + struct md_rdev *tmp; >> + >> + if (list_empty_careful(&mddev->deleting)) >> + return; >> + >> + mutex_lock(&mddev->delete_mutex); >> + list_for_each_entry_safe(rdev, tmp, &mddev->deleting, same_set) { >> + list_del_init(&rdev->same_set); >> + kobject_del(&rdev->kobj); >> + export_rdev(rdev); >> + } >> + mutex_unlock(&mddev->delete_mutex); >> +} >> + >> void mddev_unlock(struct mddev *mddev) >> { >> if (mddev->to_remove) { >> @@ -788,6 +808,8 @@ void mddev_unlock(struct mddev *mddev) >> } else >> mutex_unlock(&mddev->reconfig_mutex); >> + md_free_rdev(mddev); >> + >> /* As we've dropped the mutex we need a spinlock to >> * make sure the thread doesn't disappear >> */ >> @@ -2428,13 +2450,6 @@ static int bind_rdev_to_array(struct md_rdev >> *rdev, struct mddev *mddev) >> return err; >> } >> -static void rdev_delayed_delete(struct work_struct *ws) >> -{ >> - struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work); >> - kobject_del(&rdev->kobj); >> - kobject_put(&rdev->kobj); >> -} >> - >> void md_autodetect_dev(dev_t dev); >> static void export_rdev(struct md_rdev *rdev) >> @@ -2452,6 +2467,8 @@ static void export_rdev(struct md_rdev *rdev) >> static void md_kick_rdev_from_array(struct md_rdev *rdev) >> { >> + struct mddev *mddev = rdev->mddev; >> + >> bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); >> list_del_rcu(&rdev->same_set); >> pr_debug("md: unbind<%pg>\n", rdev->bdev); >> @@ -2465,15 +2482,17 @@ static void md_kick_rdev_from_array(struct >> md_rdev *rdev) >> rdev->sysfs_unack_badblocks = NULL; >> rdev->sysfs_badblocks = NULL; >> rdev->badblocks.count = 0; >> - /* We need to delay this, otherwise we can deadlock when >> - * writing to 'remove' to "dev/state". We also need >> - * to delay it due to rcu usage. >> - */ >> + >> synchronize_rcu(); >> - INIT_WORK(&rdev->del_work, rdev_delayed_delete); >> - kobject_get(&rdev->kobj); >> - queue_work(md_rdev_misc_wq, &rdev->del_work); >> - export_rdev(rdev); >> + >> + /* >> + * kobject_del() will wait for all in progress writers to be >> done, where >> + * reconfig_mutex is held, hence it can't be called under >> + * reconfig_mutex and it's delayed to mddev_unlock(). >> + */ >> + mutex_lock(&mddev->delete_mutex); >> + list_add(&rdev->same_set, &mddev->deleting); >> + mutex_unlock(&mddev->delete_mutex); >> } >> static void export_array(struct mddev *mddev) >> @@ -3541,6 +3560,7 @@ rdev_attr_store(struct kobject *kobj, struct >> attribute *attr, >> { >> struct rdev_sysfs_entry *entry = container_of(attr, struct >> rdev_sysfs_entry, attr); >> struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); >> + struct kernfs_node *kn = NULL; >> ssize_t rv; >> struct mddev *mddev = rdev->mddev; >> @@ -3548,6 +3568,10 @@ rdev_attr_store(struct kobject *kobj, struct >> attribute *attr, >> return -EIO; >> if (!capable(CAP_SYS_ADMIN)) >> return -EACCES; >> + >> + if (entry->store == state_store && cmd_match(page, "remove")) >> + kn = sysfs_break_active_protection(kobj, attr); >> + >> rv = mddev ? mddev_lock(mddev) : -ENODEV; >> if (!rv) { >> if (rdev->mddev == NULL) >> @@ -3556,6 +3580,10 @@ rdev_attr_store(struct kobject *kobj, struct >> attribute *attr, >> rv = entry->store(rdev, page, length); >> mddev_unlock(mddev); >> } >> + >> + if (kn) >> + sysfs_unbreak_active_protection(kn); >> + >> return rv; >> } >> @@ -4479,20 +4507,6 @@ null_show(struct mddev *mddev, char *page) >> return -EINVAL; >> } >> -/* need to ensure rdev_delayed_delete() has completed */ >> -static void flush_rdev_wq(struct mddev *mddev) >> -{ >> - struct md_rdev *rdev; >> - >> - rcu_read_lock(); >> - rdev_for_each_rcu(rdev, mddev) >> - if (work_pending(&rdev->del_work)) { >> - flush_workqueue(md_rdev_misc_wq); >> - break; >> - } >> - rcu_read_unlock(); >> -} >> - >> static ssize_t >> new_dev_store(struct mddev *mddev, const char *buf, size_t len) >> { >> @@ -4520,7 +4534,6 @@ new_dev_store(struct mddev *mddev, const char >> *buf, size_t len) >> minor != MINOR(dev)) >> return -EOVERFLOW; >> - flush_rdev_wq(mddev); >> err = mddev_lock(mddev); >> if (err) >> return err; >> @@ -5590,7 +5603,6 @@ struct mddev *md_alloc(dev_t dev, char *name) >> * removed (mddev_delayed_delete). >> */ >> flush_workqueue(md_misc_wq); >> - flush_workqueue(md_rdev_misc_wq); >> mutex_lock(&disks_mutex); >> mddev = mddev_alloc(dev); >> @@ -7553,9 +7565,6 @@ static int md_ioctl(struct block_device *bdev, >> fmode_t mode, >> } >> - if (cmd == ADD_NEW_DISK || cmd == HOT_ADD_DISK) >> - flush_rdev_wq(mddev); >> - >> if (cmd == HOT_REMOVE_DISK) >> /* need to ensure recovery thread has run */ >> wait_event_interruptible_timeout(mddev->sb_wait, >> @@ -9618,10 +9627,6 @@ static int __init md_init(void) >> if (!md_misc_wq) >> goto err_misc_wq; >> - md_rdev_misc_wq = alloc_workqueue("md_rdev_misc", 0, 0); >> - if (!md_rdev_misc_wq) >> - goto err_rdev_misc_wq; >> - >> ret = __register_blkdev(MD_MAJOR, "md", md_probe); >> if (ret < 0) >> goto err_md; >> @@ -9640,8 +9645,6 @@ static int __init md_init(void) >> err_mdp: >> unregister_blkdev(MD_MAJOR, "md"); >> err_md: >> - destroy_workqueue(md_rdev_misc_wq); >> -err_rdev_misc_wq: >> destroy_workqueue(md_misc_wq); >> err_misc_wq: >> destroy_workqueue(md_wq); >> @@ -9937,7 +9940,6 @@ static __exit void md_exit(void) >> } >> spin_unlock(&all_mddevs_lock); >> - destroy_workqueue(md_rdev_misc_wq); >> destroy_workqueue(md_misc_wq); >> destroy_workqueue(md_wq); >> } >> diff --git a/drivers/md/md.h b/drivers/md/md.h >> index 1eec65cf783c..4d191db831da 100644 >> --- a/drivers/md/md.h >> +++ b/drivers/md/md.h >> @@ -531,6 +531,14 @@ struct mddev { >> unsigned int good_device_nr; /* good device num >> within cluster raid */ >> unsigned int noio_flag; /* for memalloc scope API */ >> + /* >> + * Temporarily store rdev that will be finally removed when >> + * reconfig_mutex is unlocked. >> + */ >> + struct list_head deleting; >> + /* Protect the deleting list */ >> + struct mutex delete_mutex; >> + >> bool has_superblocks:1; >> bool fail_last_dev:1; >> bool serialize_policy:1; > > . >
Hi Yu,
kernel test robot noticed the following build warnings:
[auto build test WARNING on song-md/md-next]
[also build test WARNING on linus/master v6.4-rc3 next-20230522]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Yu-Kuai/md-fix-duplicate-filename-for-rdev/20230522-221940
base: git://git.kernel.org/pub/scm/linux/kernel/git/song/md.git md-next
patch link: https://lore.kernel.org/r/20230522133225.2983667-1-yukuai1%40huaweicloud.com
patch subject: [PATCH v2] md: fix duplicate filename for rdev
config: i386-defconfig (https://download.01.org/0day-ci/archive/20230523/202305230956.Zk6vAYvj-lkp@intel.com/config)
compiler: gcc-11 (Debian 11.3.0-12) 11.3.0
reproduce (this is a W=1 build):
# https://github.com/intel-lab-lkp/linux/commit/b6addfb36d27d4c98b397edec92ed6feeaf7afab
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Yu-Kuai/md-fix-duplicate-filename-for-rdev/20230522-221940
git checkout b6addfb36d27d4c98b397edec92ed6feeaf7afab
# save the config file
mkdir build_dir && cp config build_dir/.config
make W=1 O=build_dir ARCH=i386 olddefconfig
make W=1 O=build_dir ARCH=i386 SHELL=/bin/bash drivers/md/
If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202305230956.Zk6vAYvj-lkp@intel.com/
All warnings (new ones prefixed by >>):
>> drivers/md/md.c:90:33: warning: 'md_rdev_misc_wq' defined but not used [-Wunused-variable]
90 | static struct workqueue_struct *md_rdev_misc_wq;
| ^~~~~~~~~~~~~~~
vim +/md_rdev_misc_wq +90 drivers/md/md.c
edb39c9deda87d Goldwyn Rodrigues 2014-03-29 86
90b08710e41a07 Bernd Schubert 2008-05-23 87 static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
e804ac780e2f01 Tejun Heo 2010-10-15 88 static struct workqueue_struct *md_wq;
e804ac780e2f01 Tejun Heo 2010-10-15 89 static struct workqueue_struct *md_misc_wq;
cc1ffe61c026e2 Guoqing Jiang 2020-04-04 @90 static struct workqueue_struct *md_rdev_misc_wq;
90b08710e41a07 Bernd Schubert 2008-05-23 91
diff --git a/drivers/md/md.c b/drivers/md/md.c index 7455bc9d8498..cafb457d614c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -92,6 +92,7 @@ static struct workqueue_struct *md_rdev_misc_wq; static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *this); static void mddev_detach(struct mddev *mddev); +static void export_rdev(struct md_rdev *rdev); /* * Default number of read corrections we'll attempt on an rdev @@ -643,9 +644,11 @@ void mddev_init(struct mddev *mddev) { mutex_init(&mddev->open_mutex); mutex_init(&mddev->reconfig_mutex); + mutex_init(&mddev->delete_mutex); mutex_init(&mddev->bitmap_info.mutex); INIT_LIST_HEAD(&mddev->disks); INIT_LIST_HEAD(&mddev->all_mddevs); + INIT_LIST_HEAD(&mddev->deleting); timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0); atomic_set(&mddev->active, 1); atomic_set(&mddev->openers, 0); @@ -747,6 +750,23 @@ static void mddev_free(struct mddev *mddev) static const struct attribute_group md_redundancy_group; +static void md_free_rdev(struct mddev *mddev) +{ + struct md_rdev *rdev; + struct md_rdev *tmp; + + if (list_empty_careful(&mddev->deleting)) + return; + + mutex_lock(&mddev->delete_mutex); + list_for_each_entry_safe(rdev, tmp, &mddev->deleting, same_set) { + list_del_init(&rdev->same_set); + kobject_del(&rdev->kobj); + export_rdev(rdev); + } + mutex_unlock(&mddev->delete_mutex); +} + void mddev_unlock(struct mddev *mddev) { if (mddev->to_remove) { @@ -788,6 +808,8 @@ void mddev_unlock(struct mddev *mddev) } else mutex_unlock(&mddev->reconfig_mutex); + md_free_rdev(mddev); + /* As we've dropped the mutex we need a spinlock to * make sure the thread doesn't disappear */ @@ -2428,13 +2450,6 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) return err; } -static void rdev_delayed_delete(struct work_struct *ws) -{ - struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work); - kobject_del(&rdev->kobj); - kobject_put(&rdev->kobj); -} - void md_autodetect_dev(dev_t dev); static void export_rdev(struct md_rdev *rdev) @@ -2452,6 +2467,8 @@ static void export_rdev(struct md_rdev *rdev) static void md_kick_rdev_from_array(struct md_rdev *rdev) { + struct mddev *mddev = rdev->mddev; + bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); list_del_rcu(&rdev->same_set); pr_debug("md: unbind<%pg>\n", rdev->bdev); @@ -2465,15 +2482,17 @@ static void md_kick_rdev_from_array(struct md_rdev *rdev) rdev->sysfs_unack_badblocks = NULL; rdev->sysfs_badblocks = NULL; rdev->badblocks.count = 0; - /* We need to delay this, otherwise we can deadlock when - * writing to 'remove' to "dev/state". We also need - * to delay it due to rcu usage. - */ + synchronize_rcu(); - INIT_WORK(&rdev->del_work, rdev_delayed_delete); - kobject_get(&rdev->kobj); - queue_work(md_rdev_misc_wq, &rdev->del_work); - export_rdev(rdev); + + /* + * kobject_del() will wait for all in progress writers to be done, where + * reconfig_mutex is held, hence it can't be called under + * reconfig_mutex and it's delayed to mddev_unlock(). + */ + mutex_lock(&mddev->delete_mutex); + list_add(&rdev->same_set, &mddev->deleting); + mutex_unlock(&mddev->delete_mutex); } static void export_array(struct mddev *mddev) @@ -3541,6 +3560,7 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, { struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); + struct kernfs_node *kn = NULL; ssize_t rv; struct mddev *mddev = rdev->mddev; @@ -3548,6 +3568,10 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, return -EIO; if (!capable(CAP_SYS_ADMIN)) return -EACCES; + + if (entry->store == state_store && cmd_match(page, "remove")) + kn = sysfs_break_active_protection(kobj, attr); + rv = mddev ? mddev_lock(mddev) : -ENODEV; if (!rv) { if (rdev->mddev == NULL) @@ -3556,6 +3580,10 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, rv = entry->store(rdev, page, length); mddev_unlock(mddev); } + + if (kn) + sysfs_unbreak_active_protection(kn); + return rv; } @@ -4479,20 +4507,6 @@ null_show(struct mddev *mddev, char *page) return -EINVAL; } -/* need to ensure rdev_delayed_delete() has completed */ -static void flush_rdev_wq(struct mddev *mddev) -{ - struct md_rdev *rdev; - - rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev) - if (work_pending(&rdev->del_work)) { - flush_workqueue(md_rdev_misc_wq); - break; - } - rcu_read_unlock(); -} - static ssize_t new_dev_store(struct mddev *mddev, const char *buf, size_t len) { @@ -4520,7 +4534,6 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len) minor != MINOR(dev)) return -EOVERFLOW; - flush_rdev_wq(mddev); err = mddev_lock(mddev); if (err) return err; @@ -5590,7 +5603,6 @@ struct mddev *md_alloc(dev_t dev, char *name) * removed (mddev_delayed_delete). */ flush_workqueue(md_misc_wq); - flush_workqueue(md_rdev_misc_wq); mutex_lock(&disks_mutex); mddev = mddev_alloc(dev); @@ -7553,9 +7565,6 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, } - if (cmd == ADD_NEW_DISK || cmd == HOT_ADD_DISK) - flush_rdev_wq(mddev); - if (cmd == HOT_REMOVE_DISK) /* need to ensure recovery thread has run */ wait_event_interruptible_timeout(mddev->sb_wait, @@ -9618,10 +9627,6 @@ static int __init md_init(void) if (!md_misc_wq) goto err_misc_wq; - md_rdev_misc_wq = alloc_workqueue("md_rdev_misc", 0, 0); - if (!md_rdev_misc_wq) - goto err_rdev_misc_wq; - ret = __register_blkdev(MD_MAJOR, "md", md_probe); if (ret < 0) goto err_md; @@ -9640,8 +9645,6 @@ static int __init md_init(void) err_mdp: unregister_blkdev(MD_MAJOR, "md"); err_md: - destroy_workqueue(md_rdev_misc_wq); -err_rdev_misc_wq: destroy_workqueue(md_misc_wq); err_misc_wq: destroy_workqueue(md_wq); @@ -9937,7 +9940,6 @@ static __exit void md_exit(void) } spin_unlock(&all_mddevs_lock); - destroy_workqueue(md_rdev_misc_wq); destroy_workqueue(md_misc_wq); destroy_workqueue(md_wq); } diff --git a/drivers/md/md.h b/drivers/md/md.h index 1eec65cf783c..4d191db831da 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -531,6 +531,14 @@ struct mddev { unsigned int good_device_nr; /* good device num within cluster raid */ unsigned int noio_flag; /* for memalloc scope API */ + /* + * Temporarily store rdev that will be finally removed when + * reconfig_mutex is unlocked. + */ + struct list_head deleting; + /* Protect the deleting list */ + struct mutex delete_mutex; + bool has_superblocks:1; bool fail_last_dev:1; bool serialize_policy:1;