diff mbox series

[2/8] md: implement ->free_disk

Message ID 20220712070331.1390700-3-hch@lst.de (mailing list archive)
State Superseded, archived
Delegated to: Song Liu
Headers show
Series [1/8] md: fix kobject_add error handling | expand

Commit Message

Christoph Hellwig July 12, 2022, 7:03 a.m. UTC
Ensure that all private data is only freed once all accesses are done.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/md/md.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

Comments

Logan Gunthorpe July 12, 2022, 11:13 p.m. UTC | #1
On 2022-07-12 01:03, Christoph Hellwig wrote:
> Ensure that all private data is only freed once all accesses are done.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  drivers/md/md.c | 17 ++++++++++++-----
>  1 file changed, 12 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 861d6a9481b2e..ae076a7a87796 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -5581,11 +5581,6 @@ static void md_free(struct kobject *ko)
>  		del_gendisk(mddev->gendisk);
>  		put_disk(mddev->gendisk);
>  	}
> -	percpu_ref_exit(&mddev->writes_pending);
> -
> -	bioset_exit(&mddev->bio_set);
> -	bioset_exit(&mddev->sync_set);
> -	kfree(mddev);
>  }
>  
>  static const struct sysfs_ops md_sysfs_ops = {
> @@ -7844,6 +7839,17 @@ static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing)
>  	return ret;
>  }
>  
> +static void md_free_disk(struct gendisk *disk)
> +{
> +	struct mddev *mddev = disk->private_data;
> +
> +	percpu_ref_exit(&mddev->writes_pending);
> +	bioset_exit(&mddev->bio_set);
> +	bioset_exit(&mddev->sync_set);
> +
> +	kfree(mddev);
> +}

I still don't think this is entirely correct. There are error paths that
will put the kobject before the disk is created and if they get hit then
the kfree(mddev) will never be called and the memory will be leaked.

Instead of creating an ugly special path for that, I came up with a solution 
that I think  makes a bit more sense: the kobject is still freed in it's 
own free  function, but the disk holds a reference to the kobject and drops
it in its free function. The sysfs puts and del_gendisk are then moved
into mddev_delayed_delete() so they happen earlier.

Thoughts?

Logan

--

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 78e2588ed43e..330db78a5b38 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5585,15 +5585,6 @@ static void md_free(struct kobject *ko)
 {
        struct mddev *mddev = container_of(ko, struct mddev, kobj);
 
-       if (mddev->sysfs_state)
-               sysfs_put(mddev->sysfs_state);
-       if (mddev->sysfs_level)
-               sysfs_put(mddev->sysfs_level);
-
-       if (mddev->gendisk) {
-               del_gendisk(mddev->gendisk);
-               blk_cleanup_disk(mddev->gendisk);
-       }
        percpu_ref_exit(&mddev->writes_pending);
 
        bioset_exit(&mddev->bio_set);
@@ -5618,6 +5609,17 @@ static void mddev_delayed_delete(struct work_struct *ws)
        struct mddev *mddev = container_of(ws, struct mddev, del_work);
 
        kobject_del(&mddev->kobj);
+
+       if (mddev->sysfs_state)
+               sysfs_put(mddev->sysfs_state);
+       if (mddev->sysfs_level)
+               sysfs_put(mddev->sysfs_level);
+
+       if (mddev->gendisk) {
+               del_gendisk(mddev->gendisk);
+               blk_cleanup_disk(mddev->gendisk);
+       }
+
        kobject_put(&mddev->kobj);
 }
 
@@ -5708,6 +5710,7 @@ int md_alloc(dev_t dev, char *name)
        else
                sprintf(disk->disk_name, "md%d", unit);
        disk->fops = &md_fops;
+       kobject_get(&mddev->kobj);
        disk->private_data = mddev;
 
        mddev->queue = disk->queue;
@@ -7858,6 +7861,13 @@ static unsigned int md_check_events(struct gendisk *disk, unsign>
        return ret;
 }
 
+static void md_free_disk(struct gendisk *disk)
+{
+       struct mddev *mddev = disk->private_data;
+
+       kobject_put(&mddev->kobj);
+}
+
 const struct block_device_operations md_fops =
 {
        .owner          = THIS_MODULE,
@@ -7871,6 +7881,7 @@ const struct block_device_operations md_fops =
        .getgeo         = md_getgeo,
        .check_events   = md_check_events,
        .set_read_only  = md_set_read_only,
+       .free_disk      = md_free_disk,
 };
Christoph Hellwig July 13, 2022, 7:17 a.m. UTC | #2
On Tue, Jul 12, 2022 at 05:13:48PM -0600, Logan Gunthorpe wrote:
> > +
> > +	percpu_ref_exit(&mddev->writes_pending);
> > +	bioset_exit(&mddev->bio_set);
> > +	bioset_exit(&mddev->sync_set);
> > +
> > +	kfree(mddev);
> > +}
> 
> I still don't think this is entirely correct. There are error paths that
> will put the kobject before the disk is created and if they get hit then
> the kfree(mddev) will never be called and the memory will be leaked.

True.

> Instead of creating an ugly special path for that, I came up with a solution 
> that I think  makes a bit more sense: the kobject is still freed in it's 
> own free  function, but the disk holds a reference to the kobject and drops
> it in its free function. The sysfs puts and del_gendisk are then moved
> into mddev_delayed_delete() so they happen earlier.

I'm not sure this is a good idea.  The mddev kobject hangs off the
disk, so I don't think that it should in any way control the life
time of the disk, as that just creates potential problems down the
road.

Moreover we actually need the special kfree path anyway for the
add_disk failure, something that I had missed.  Something like
the untested patch below on top of my series.  I'll look into folding
and testing it.

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 53a92b306b1fc..62f40c49344e4 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5641,7 +5641,7 @@ static int md_alloc(dev_t dev, char *name)
 			    strcmp(mddev2->gendisk->disk_name, name) == 0) {
 				spin_unlock(&all_mddevs_lock);
 				error = -EEXIST;
-				goto out_unlock_disks_mutex;
+				goto out_free_mddev;
 			}
 		spin_unlock(&all_mddevs_lock);
 	}
@@ -5654,7 +5654,7 @@ static int md_alloc(dev_t dev, char *name)
 	error = -ENOMEM;
 	disk = blk_alloc_disk(NUMA_NO_NODE);
 	if (!disk)
-		goto out_unlock_disks_mutex;
+		goto out_free_mddev;
 
 	disk->major = MAJOR(mddev->unit);
 	disk->first_minor = unit << shift;
@@ -5674,26 +5674,35 @@ static int md_alloc(dev_t dev, char *name)
 	disk->events |= DISK_EVENT_MEDIA_CHANGE;
 	mddev->gendisk = disk;
 	error = add_disk(disk);
-	if (error) {
-		mddev->gendisk = NULL;
-		put_disk(disk);
-		goto out_unlock_disks_mutex;
-	}
+	if (error)
+		goto out_put_disk;
 
 	error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md");
-	if (error)
+	if (error) {
+		/*
+		 * The disk is already live at this point.  Clear the hold flag
+		 * and let mddev_put take care of the deletion, as it isn't any
+		 * different from a normal close on last release now.
+		 */
+		mddev->hold_active = 0;
 		goto out_unlock_disks_mutex;
+	}
 
 	kobject_uevent(&mddev->kobj, KOBJ_ADD);
 	mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
 	mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level");
 
 out_unlock_disks_mutex:
-	if (error)
-		mddev->hold_active = 0;
 	mutex_unlock(&disks_mutex);
 	mddev_put(mddev);
 	return error;
+
+out_put_disk:
+	put_disk(disk);
+out_free_mddev:
+	mutex_unlock(&disks_mutex);
+	kfree(mddev);
+	return error;
 }
 
 static void md_probe(dev_t dev)
Logan Gunthorpe July 13, 2022, 3:30 p.m. UTC | #3
On 2022-07-13 01:17, Christoph Hellwig wrote:
> On Tue, Jul 12, 2022 at 05:13:48PM -0600, Logan Gunthorpe wrote:
>>> +
>>> +	percpu_ref_exit(&mddev->writes_pending);
>>> +	bioset_exit(&mddev->bio_set);
>>> +	bioset_exit(&mddev->sync_set);
>>> +
>>> +	kfree(mddev);
>>> +}
>>
>> I still don't think this is entirely correct. There are error paths that
>> will put the kobject before the disk is created and if they get hit then
>> the kfree(mddev) will never be called and the memory will be leaked.
> 
> True.
> 
>> Instead of creating an ugly special path for that, I came up with a solution 
>> that I think  makes a bit more sense: the kobject is still freed in it's 
>> own free  function, but the disk holds a reference to the kobject and drops
>> it in its free function. The sysfs puts and del_gendisk are then moved
>> into mddev_delayed_delete() so they happen earlier.
> 
> I'm not sure this is a good idea.  The mddev kobject hangs off the
> disk, so I don't think that it should in any way control the life
> time of the disk, as that just creates potential problems down the
> road.

My interpretation was that kobject_del() (which is called in
mddev_delayed_delete() before the disk would be removed) removes the
mddev from the disk. At that point, it's just a structure hanging around
that will be freed when its last reference is dropped, which is the disk
itself cleaning up. I'm not sure how that would cause potential problems.

Logan
diff mbox series

Patch

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 861d6a9481b2e..ae076a7a87796 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5581,11 +5581,6 @@  static void md_free(struct kobject *ko)
 		del_gendisk(mddev->gendisk);
 		put_disk(mddev->gendisk);
 	}
-	percpu_ref_exit(&mddev->writes_pending);
-
-	bioset_exit(&mddev->bio_set);
-	bioset_exit(&mddev->sync_set);
-	kfree(mddev);
 }
 
 static const struct sysfs_ops md_sysfs_ops = {
@@ -7844,6 +7839,17 @@  static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing)
 	return ret;
 }
 
+static void md_free_disk(struct gendisk *disk)
+{
+	struct mddev *mddev = disk->private_data;
+
+	percpu_ref_exit(&mddev->writes_pending);
+	bioset_exit(&mddev->bio_set);
+	bioset_exit(&mddev->sync_set);
+
+	kfree(mddev);
+}
+
 const struct block_device_operations md_fops =
 {
 	.owner		= THIS_MODULE,
@@ -7857,6 +7863,7 @@  const struct block_device_operations md_fops =
 	.getgeo		= md_getgeo,
 	.check_events	= md_check_events,
 	.set_read_only	= md_set_read_only,
+	.free_disk	= md_free_disk,
 };
 
 static int md_thread(void *arg)