diff mbox series

[09/10] md: only delete entries from all_mddevs when the disk is freed

Message ID 20220718063410.338626-10-hch@lst.de (mailing list archive)
State New, archived
Headers show
Series [01/10] md: fix mddev->kobj lifetime | expand

Commit Message

Christoph Hellwig July 18, 2022, 6:34 a.m. UTC
This ensures device names don't get prematurely reused.  Instead add a
deleted flag to skip already deleted devices in mddev_get and other
places that only want to see live mddevs.

Reported-by: Logan Gunthorpe <logang@deltatee.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/md/md.c | 56 +++++++++++++++++++++++++++++++++----------------
 drivers/md/md.h |  1 +
 2 files changed, 39 insertions(+), 18 deletions(-)

Comments

Hannes Reinecke July 18, 2022, 7:17 a.m. UTC | #1
On 7/18/22 08:34, Christoph Hellwig wrote:
> This ensures device names don't get prematurely reused.  Instead add a
> deleted flag to skip already deleted devices in mddev_get and other
> places that only want to see live mddevs.
> 
> Reported-by; Logan Gunthorpe <logang@deltatee.com>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>   drivers/md/md.c | 56 +++++++++++++++++++++++++++++++++----------------
>   drivers/md/md.h |  1 +
>   2 files changed, 39 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 805f2b4ed9c0d..08cf21ad4c2d7 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -625,6 +625,10 @@ EXPORT_SYMBOL(md_flush_request);
>   
>   static inline struct mddev *mddev_get(struct mddev *mddev)
>   {
> +	lockdep_assert_held(&all_mddevs_lock);
> +
> +	if (mddev->deleted)
> +		return NULL;
>   	atomic_inc(&mddev->active);
>   	return mddev;
>   }

Why can't we use 'atomic_inc_unless_zero' here and do away with the 
additional 'deleted' flag?

> @@ -639,7 +643,7 @@ static void mddev_put(struct mddev *mddev)
>   	    mddev->ctime == 0 && !mddev->hold_active) {
>   		/* Array is not configured at all, and not held active,
>   		 * so destroy it */
> -		list_del_init(&mddev->all_mddevs);
> +		mddev->deleted = true;
>   
>   		/*
>   		 * Call queue_work inside the spinlock so that
> @@ -719,8 +723,8 @@ static struct mddev *mddev_find(dev_t unit)
>   
>   	spin_lock(&all_mddevs_lock);
>   	mddev = mddev_find_locked(unit);
> -	if (mddev)
> -		mddev_get(mddev);
> +	if (mddev && !mddev_get(mddev))
> +		mddev = NULL;
>   	spin_unlock(&all_mddevs_lock);
>   
>   	return mddev;
> @@ -3338,6 +3342,8 @@ static bool md_rdev_overlaps(struct md_rdev *rdev)
>   
>   	spin_lock(&all_mddevs_lock);
>   	list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
> +		if (mddev->deleted)
> +			continue;

Any particular reason why you can't use the 'mddev_get()' / 
'mddev_put()' sequence here?
After all, I don't think that 'mddev' should vanish while being in this 
loop, no?

>   		rdev_for_each(rdev2, mddev) {
>   			if (rdev != rdev2 && rdev->bdev == rdev2->bdev &&
>   			    md_rdevs_overlap(rdev, rdev2)) {
> @@ -5525,11 +5531,10 @@ md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
>   	if (!entry->show)
>   		return -EIO;
>   	spin_lock(&all_mddevs_lock);
> -	if (list_empty(&mddev->all_mddevs)) {
> +	if (!mddev_get(mddev)) {
>   		spin_unlock(&all_mddevs_lock);
>   		return -EBUSY;
>   	}
> -	mddev_get(mddev);
>   	spin_unlock(&all_mddevs_lock);
>   
>   	rv = entry->show(mddev, page);
> @@ -5550,11 +5555,10 @@ md_attr_store(struct kobject *kobj, struct attribute *attr,
>   	if (!capable(CAP_SYS_ADMIN))
>   		return -EACCES;
>   	spin_lock(&all_mddevs_lock);
> -	if (list_empty(&mddev->all_mddevs)) {
> +	if (!mddev_get(mddev)) {
>   		spin_unlock(&all_mddevs_lock);
>   		return -EBUSY;
>   	}
> -	mddev_get(mddev);
>   	spin_unlock(&all_mddevs_lock);
>   	rv = entry->store(mddev, page, length);
>   	mddev_put(mddev);
> @@ -7851,7 +7855,7 @@ static void md_free_disk(struct gendisk *disk)
>   	bioset_exit(&mddev->bio_set);
>   	bioset_exit(&mddev->sync_set);
>   
> -	kfree(mddev);
> +	mddev_free(mddev);
>   }
>   
>   const struct block_device_operations md_fops =
> @@ -8173,6 +8177,8 @@ static void *md_seq_start(struct seq_file *seq, loff_t *pos)
>   		if (!l--) {
>   			mddev = list_entry(tmp, struct mddev, all_mddevs);
> -			mddev_get(mddev);
> +			if (!mddev_get(mddev))
> +				continue;
>   			spin_unlock(&all_mddevs_lock);
>   			return mddev;
>   		}
> @@ -8186,25 +8192,35 @@ static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
>   {
>   	struct list_head *tmp;
>   	struct mddev *next_mddev, *mddev = v;
> +	struct mddev *to_put = NULL;
>   
>   	++*pos;
>   	if (v == (void*)2)
>   		return NULL;
>   
>   	spin_lock(&all_mddevs_lock);
> -	if (v == (void*)1)
> +	if (v == (void*)1) {
>   		tmp = all_mddevs.next;
> -	else
> +	} else {
> +		to_put = mddev;
>   		tmp = mddev->all_mddevs.next;
> -	if (tmp != &all_mddevs)
> -		next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs));
> -	else {
> -		next_mddev = (void*)2;
> -		*pos = 0x10000;
>   	}
> +
> +	for (;;) {
> +		if (tmp == &all_mddevs) {
> +			next_mddev = (void*)2;
> +			*pos = 0x10000;
> +			break;
> +		}
> +		next_mddev = list_entry(tmp, struct mddev, all_mddevs);
> +		if (mddev_get(next_mddev))
> +			break;
> +		mddev = next_mddev;
> +		tmp = mddev->all_mddevs.next;
> +	};
>   	spin_unlock(&all_mddevs_lock);
>   
> -	if (v != (void*)1)
> +	if (to_put)
>   		mddev_put(mddev);
>   	return next_mddev;
>   
> @@ -8768,6 +8784,8 @@ void md_do_sync(struct md_thread *thread)
>   			goto skip;
>   		spin_lock(&all_mddevs_lock);
>   		list_for_each_entry(mddev2, &all_mddevs, all_mddevs) {
> +			if (mddev2->deleted)
> +				continue;
>   			if (mddev2 == mddev)
>   				continue;
>   			if (!mddev->parallel_resync
> @@ -9570,7 +9588,8 @@ static int md_notify_reboot(struct notifier_block *this,
>   
>   	spin_lock(&all_mddevs_lock);
>   	list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) {
> -		mddev_get(mddev);
> +		if (!mddev_get(mddev))
> +			continue;
>   		spin_unlock(&all_mddevs_lock);
>   		if (mddev_trylock(mddev)) {
>   			if (mddev->pers)
> @@ -9925,7 +9944,8 @@ static __exit void md_exit(void)
>   
>   	spin_lock(&all_mddevs_lock);
>   	list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) {
> -		mddev_get(mddev);
> +		if (!mddev_get(mddev))
> +			continue;
>   		spin_unlock(&all_mddevs_lock);
>   		export_array(mddev);
>   		mddev->ctime = 0;
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index 1a85dbe78a71c..bc870e1f1e8c2 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -503,6 +503,7 @@ struct mddev {
>   
>   	atomic_t			max_corr_read_errors; /* max read retries */
>   	struct list_head		all_mddevs;
> +	bool				deleted;
>   
>   	const struct attribute_group	*to_remove;
>   

Cheers,

Hannes
Song Liu July 18, 2022, 6:20 p.m. UTC | #2
On Mon, Jul 18, 2022 at 12:17 AM Hannes Reinecke <hare@suse.de> wrote:
>
> On 7/18/22 08:34, Christoph Hellwig wrote:
> > This ensures device names don't get prematurely reused.  Instead add a
> > deleted flag to skip already deleted devices in mddev_get and other
> > places that only want to see live mddevs.
> >
> > Reported-by; Logan Gunthorpe <logang@deltatee.com>
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
> > ---
> >   drivers/md/md.c | 56 +++++++++++++++++++++++++++++++++----------------
> >   drivers/md/md.h |  1 +
> >   2 files changed, 39 insertions(+), 18 deletions(-)
> >
> > diff --git a/drivers/md/md.c b/drivers/md/md.c
> > index 805f2b4ed9c0d..08cf21ad4c2d7 100644
> > --- a/drivers/md/md.c
> > +++ b/drivers/md/md.c
> > @@ -625,6 +625,10 @@ EXPORT_SYMBOL(md_flush_request);
> >
> >   static inline struct mddev *mddev_get(struct mddev *mddev)
> >   {
> > +     lockdep_assert_held(&all_mddevs_lock);
> > +
> > +     if (mddev->deleted)
> > +             return NULL;
> >       atomic_inc(&mddev->active);
> >       return mddev;
> >   }
>
> Why can't we use 'atomic_inc_unless_zero' here and do away with the
> additional 'deleted' flag?

Thanks Christoph for the set. And thanks Hannes for the quick review!

I think atomic_inc_unless_zero() should work here. Alternatively, we can
also grab a big from mddev->flags as MD_DELETED.

Thanks,
Song

[...]
Christoph Hellwig July 19, 2022, 5:06 a.m. UTC | #3
On Mon, Jul 18, 2022 at 11:20:39AM -0700, Song Liu wrote:
> > >   static inline struct mddev *mddev_get(struct mddev *mddev)
> > >   {
> > > +     lockdep_assert_held(&all_mddevs_lock);
> > > +
> > > +     if (mddev->deleted)
> > > +             return NULL;
> > >       atomic_inc(&mddev->active);
> > >       return mddev;
> > >   }
> >
> > Why can't we use 'atomic_inc_unless_zero' here and do away with the
> > additional 'deleted' flag?

> I think atomic_inc_unless_zero() should work here. Alternatively, we can
> also grab a big from mddev->flags as MD_DELETED.

s/big/bit/ ?

Yes, this could use mddev->flags.  I don't think atomic_inc_unless_zero
would work, as an array can have a refcount of 0 but we still allow
to take new references to it, the deletion only happens if the other
conditions in mddev_put are met.

Do you want me to repin the series using a flag?
Song Liu July 19, 2022, 6:28 a.m. UTC | #4
On Mon, Jul 18, 2022 at 10:07 PM Christoph Hellwig <hch@lst.de> wrote:
>
> On Mon, Jul 18, 2022 at 11:20:39AM -0700, Song Liu wrote:
> > > >   static inline struct mddev *mddev_get(struct mddev *mddev)
> > > >   {
> > > > +     lockdep_assert_held(&all_mddevs_lock);
> > > > +
> > > > +     if (mddev->deleted)
> > > > +             return NULL;
> > > >       atomic_inc(&mddev->active);
> > > >       return mddev;
> > > >   }
> > >
> > > Why can't we use 'atomic_inc_unless_zero' here and do away with the
> > > additional 'deleted' flag?
>
> > I think atomic_inc_unless_zero() should work here. Alternatively, we can
> > also grab a big from mddev->flags as MD_DELETED.
>
> s/big/bit/ ?
yeah, bit...

>
> Yes, this could use mddev->flags.  I don't think atomic_inc_unless_zero
> would work, as an array can have a refcount of 0 but we still allow
> to take new references to it, the deletion only happens if the other
> conditions in mddev_put are met.
>
> Do you want me to repin the series using a flag?

Respin the series sounds good. Please also address Hannes' feedback
on 8/10.

Thanks,
Song
Hannes Reinecke July 19, 2022, 7 a.m. UTC | #5
On 7/19/22 07:06, Christoph Hellwig wrote:
> On Mon, Jul 18, 2022 at 11:20:39AM -0700, Song Liu wrote:
>>>>    static inline struct mddev *mddev_get(struct mddev *mddev)
>>>>    {
>>>> +     lockdep_assert_held(&all_mddevs_lock);
>>>> +
>>>> +     if (mddev->deleted)
>>>> +             return NULL;
>>>>        atomic_inc(&mddev->active);
>>>>        return mddev;
>>>>    }
>>>
>>> Why can't we use 'atomic_inc_unless_zero' here and do away with the
>>> additional 'deleted' flag?
> 
>> I think atomic_inc_unless_zero() should work here. Alternatively, we can
>> also grab a big from mddev->flags as MD_DELETED.
> 
> s/big/bit/ ?
> 
> Yes, this could use mddev->flags.  I don't think atomic_inc_unless_zero
> would work, as an array can have a refcount of 0 but we still allow
> to take new references to it, the deletion only happens if the other
> conditions in mddev_put are met.
> 
> Do you want me to repin the series using a flag?

I would vote for it.

Cheers,

Hannes
Christoph Hellwig July 19, 2022, 7:14 a.m. UTC | #6
On Mon, Jul 18, 2022 at 09:17:42AM +0200, Hannes Reinecke wrote:
> Why can't we use 'atomic_inc_unless_zero' here and do away with the 
> additional 'deleted' flag?

See my previous reply.

>> @@ -3338,6 +3342,8 @@ static bool md_rdev_overlaps(struct md_rdev *rdev)
>>     	spin_lock(&all_mddevs_lock);
>>   	list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
>> +		if (mddev->deleted)
>> +			continue;
>
> Any particular reason why you can't use the 'mddev_get()' / 'mddev_put()' 
> sequence here?

Mostly because it would be pretty mess.  mdev_put takes all_mddevs_lock,
so we'd have to add an unlock/relock cycle for each loop iteration.

> After all, I don't think that 'mddev' should vanish while being in this 
> loop, no?

It won't, at least without the call to mddev_put.  Once mddev_put is
in the game things aren't that easy, and I suspect the exising and
new code might have bugs in that area in the reboot notifier and
md_exit for extreme corner cases.
Hannes Reinecke July 19, 2022, 7:59 a.m. UTC | #7
On 7/19/22 09:14, Christoph Hellwig wrote:
> On Mon, Jul 18, 2022 at 09:17:42AM +0200, Hannes Reinecke wrote:
>> Why can't we use 'atomic_inc_unless_zero' here and do away with the
>> additional 'deleted' flag?
> 
> See my previous reply.
> 
>>> @@ -3338,6 +3342,8 @@ static bool md_rdev_overlaps(struct md_rdev *rdev)
>>>      	spin_lock(&all_mddevs_lock);
>>>    	list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
>>> +		if (mddev->deleted)
>>> +			continue;
>>
>> Any particular reason why you can't use the 'mddev_get()' / 'mddev_put()'
>> sequence here?
> 
> Mostly because it would be pretty mess.  mdev_put takes all_mddevs_lock,
> so we'd have to add an unlock/relock cycle for each loop iteration.
> 
>> After all, I don't think that 'mddev' should vanish while being in this
>> loop, no?
> 
> It won't, at least without the call to mddev_put.  Once mddev_put is
> in the game things aren't that easy, and I suspect the exising and
> new code might have bugs in that area in the reboot notifier and
> md_exit for extreme corner cases.

And again, MD mess.
But thanks for start clearing it up.

Cheers,

Hannes
diff mbox series

Patch

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 805f2b4ed9c0d..08cf21ad4c2d7 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -625,6 +625,10 @@  EXPORT_SYMBOL(md_flush_request);
 
 static inline struct mddev *mddev_get(struct mddev *mddev)
 {
+	lockdep_assert_held(&all_mddevs_lock);
+
+	if (mddev->deleted)
+		return NULL;
 	atomic_inc(&mddev->active);
 	return mddev;
 }
@@ -639,7 +643,7 @@  static void mddev_put(struct mddev *mddev)
 	    mddev->ctime == 0 && !mddev->hold_active) {
 		/* Array is not configured at all, and not held active,
 		 * so destroy it */
-		list_del_init(&mddev->all_mddevs);
+		mddev->deleted = true;
 
 		/*
 		 * Call queue_work inside the spinlock so that
@@ -719,8 +723,8 @@  static struct mddev *mddev_find(dev_t unit)
 
 	spin_lock(&all_mddevs_lock);
 	mddev = mddev_find_locked(unit);
-	if (mddev)
-		mddev_get(mddev);
+	if (mddev && !mddev_get(mddev))
+		mddev = NULL;
 	spin_unlock(&all_mddevs_lock);
 
 	return mddev;
@@ -3338,6 +3342,8 @@  static bool md_rdev_overlaps(struct md_rdev *rdev)
 
 	spin_lock(&all_mddevs_lock);
 	list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
+		if (mddev->deleted)
+			continue;
 		rdev_for_each(rdev2, mddev) {
 			if (rdev != rdev2 && rdev->bdev == rdev2->bdev &&
 			    md_rdevs_overlap(rdev, rdev2)) {
@@ -5525,11 +5531,10 @@  md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
 	if (!entry->show)
 		return -EIO;
 	spin_lock(&all_mddevs_lock);
-	if (list_empty(&mddev->all_mddevs)) {
+	if (!mddev_get(mddev)) {
 		spin_unlock(&all_mddevs_lock);
 		return -EBUSY;
 	}
-	mddev_get(mddev);
 	spin_unlock(&all_mddevs_lock);
 
 	rv = entry->show(mddev, page);
@@ -5550,11 +5555,10 @@  md_attr_store(struct kobject *kobj, struct attribute *attr,
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
 	spin_lock(&all_mddevs_lock);
-	if (list_empty(&mddev->all_mddevs)) {
+	if (!mddev_get(mddev)) {
 		spin_unlock(&all_mddevs_lock);
 		return -EBUSY;
 	}
-	mddev_get(mddev);
 	spin_unlock(&all_mddevs_lock);
 	rv = entry->store(mddev, page, length);
 	mddev_put(mddev);
@@ -7851,7 +7855,7 @@  static void md_free_disk(struct gendisk *disk)
 	bioset_exit(&mddev->bio_set);
 	bioset_exit(&mddev->sync_set);
 
-	kfree(mddev);
+	mddev_free(mddev);
 }
 
 const struct block_device_operations md_fops =
@@ -8173,6 +8177,8 @@  static void *md_seq_start(struct seq_file *seq, loff_t *pos)
 		if (!l--) {
 			mddev = list_entry(tmp, struct mddev, all_mddevs);
-			mddev_get(mddev);
+			if (!mddev_get(mddev))
+				continue;
 			spin_unlock(&all_mddevs_lock);
 			return mddev;
 		}
@@ -8186,25 +8192,35 @@  static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	struct list_head *tmp;
 	struct mddev *next_mddev, *mddev = v;
+	struct mddev *to_put = NULL;
 
 	++*pos;
 	if (v == (void*)2)
 		return NULL;
 
 	spin_lock(&all_mddevs_lock);
-	if (v == (void*)1)
+	if (v == (void*)1) {
 		tmp = all_mddevs.next;
-	else
+	} else {
+		to_put = mddev;
 		tmp = mddev->all_mddevs.next;
-	if (tmp != &all_mddevs)
-		next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs));
-	else {
-		next_mddev = (void*)2;
-		*pos = 0x10000;
 	}
+
+	for (;;) {
+		if (tmp == &all_mddevs) {
+			next_mddev = (void*)2;
+			*pos = 0x10000;
+			break;
+		}
+		next_mddev = list_entry(tmp, struct mddev, all_mddevs);
+		if (mddev_get(next_mddev))
+			break;
+		mddev = next_mddev;
+		tmp = mddev->all_mddevs.next;
+	};
 	spin_unlock(&all_mddevs_lock);
 
-	if (v != (void*)1)
+	if (to_put)
 		mddev_put(mddev);
 	return next_mddev;
 
@@ -8768,6 +8784,8 @@  void md_do_sync(struct md_thread *thread)
 			goto skip;
 		spin_lock(&all_mddevs_lock);
 		list_for_each_entry(mddev2, &all_mddevs, all_mddevs) {
+			if (mddev2->deleted)
+				continue;
 			if (mddev2 == mddev)
 				continue;
 			if (!mddev->parallel_resync
@@ -9570,7 +9588,8 @@  static int md_notify_reboot(struct notifier_block *this,
 
 	spin_lock(&all_mddevs_lock);
 	list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) {
-		mddev_get(mddev);
+		if (!mddev_get(mddev))
+			continue;
 		spin_unlock(&all_mddevs_lock);
 		if (mddev_trylock(mddev)) {
 			if (mddev->pers)
@@ -9925,7 +9944,8 @@  static __exit void md_exit(void)
 
 	spin_lock(&all_mddevs_lock);
 	list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) {
-		mddev_get(mddev);
+		if (!mddev_get(mddev))
+			continue;
 		spin_unlock(&all_mddevs_lock);
 		export_array(mddev);
 		mddev->ctime = 0;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 1a85dbe78a71c..bc870e1f1e8c2 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -503,6 +503,7 @@  struct mddev {
 
 	atomic_t			max_corr_read_errors; /* max read retries */
 	struct list_head		all_mddevs;
+	bool				deleted;
 
 	const struct attribute_group	*to_remove;