[5/7] dm: track per-add_disk holder relations in DM

Message ID 20221030153120.1045101-6-hch@lst.de
State New, archived
Series [1/7] block: clear ->slave_dir when dropping the main slave_dir reference

Commit Message

Christoph Hellwig Oct. 30, 2022, 3:31 p.m. UTC
dm is a bit special in that it opens the underlying devices.  Commit
89f871af1b26 ("dm: delay registering the gendisk") tried to accommodate
that by allowing the holder to be added to the list before add_disk and
then adding it to sysfs once add_disk is called.  But that leads to odd
lifetime and error handling problems, as we can't know the state of the
kobjects and don't unwind properly.  To fix this, switch to registering
all existing table_devices with the holder code right after add_disk,
and remove them before calling del_gendisk.

Fixes: 89f871af1b26 ("dm: delay registering the gendisk")
Reported-by: Yu Kuai <yukuai1@huaweicloud.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/md/dm.c | 45 +++++++++++++++++++++++++++++++++++++--------
 1 file changed, 37 insertions(+), 8 deletions(-)
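
The error unwinding added by this patch relies on the
list_for_each_entry_continue_reverse() idiom; here is a minimal standalone
illustration of it (hypothetical struct item and do_link()/do_unlink()
helpers, not code from the patch):

#include <linux/list.h>

/* Hypothetical payload and link/unlink operations, for illustration only. */
struct item {
	struct list_head list;
};

static int do_link(struct item *it) { return 0; }	/* may fail in real code */
static void do_unlink(struct item *it) { }

static int link_all(struct list_head *head)
{
	struct item *it;
	int r;

	list_for_each_entry(it, head, list) {
		r = do_link(it);
		if (r)
			goto undo;
	}
	return 0;

undo:
	/*
	 * list_for_each_entry_continue_reverse() starts at the predecessor
	 * of the entry where do_link() failed and walks back toward the
	 * list head, so only the entries that were successfully linked get
	 * unlinked again.  This is the same shape as the patch's
	 * out_undo_holders error path.
	 */
	list_for_each_entry_continue_reverse(it, head, list)
		do_unlink(it);
	return r;
}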

Comments

Yu Kuai Nov. 9, 2022, 2:08 a.m. UTC | #1
Hi,

On 2022/10/30 23:31, Christoph Hellwig wrote:
> dm is a bit special in that it opens the underlying devices.  Commit
> 89f871af1b26 ("dm: delay registering the gendisk") tried to accommodate
> that by allowing the holder to be added to the list before add_disk and
> then adding it to sysfs once add_disk is called.  But that leads to odd
> lifetime and error handling problems, as we can't know the state of the
> kobjects and don't unwind properly.  To fix this, switch to registering
> all existing table_devices with the holder code right after add_disk,
> and remove them before calling del_gendisk.
> 
> Fixes: 89f871af1b26 ("dm: delay registering the gendisk")
> Reported-by: Yu Kuai <yukuai1@huaweicloud.com>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>   drivers/md/dm.c | 45 +++++++++++++++++++++++++++++++++++++--------
>   1 file changed, 37 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> index 2917700b1e15c..7b0d6dc957549 100644
> --- a/drivers/md/dm.c
> +++ b/drivers/md/dm.c
> @@ -751,9 +751,16 @@ static struct table_device *open_table_device(struct mapped_device *md,
>   		goto out_free_td;
>   	}
>   
> -	r = bd_link_disk_holder(bdev, dm_disk(md));
> -	if (r)
> -		goto out_blkdev_put;
> +	/*
> +	 * We can be called before the dm disk is added.  In that case we can't
> +	 * register the holder relation here.  It will be done once add_disk was
> +	 * called.
> +	 */
> +	if (md->disk->slave_dir) {
If device_add_disk() or del_gendisk() can run concurrently with this, it
seems to me that using 'slave_dir' is not safe.

I'm not quite familiar with dm; can we guarantee that they can't run
concurrently?

Thanks,
Kuai
> +		r = bd_link_disk_holder(bdev, md->disk);
> +		if (r)
> +			goto out_blkdev_put;
> +	}
>   
>   	td->dm_dev.mode = mode;
>   	td->dm_dev.bdev = bdev;
> @@ -774,7 +781,8 @@ static struct table_device *open_table_device(struct mapped_device *md,
>    */
>   static void close_table_device(struct table_device *td, struct mapped_device *md)
>   {
> -	bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
> +	if (md->disk->slave_dir)
> +		bd_unlink_disk_holder(td->dm_dev.bdev, md->disk);
>   	blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
>   	put_dax(td->dm_dev.dax_dev);
>   	list_del(&td->list);
> @@ -1951,7 +1959,13 @@ static void cleanup_mapped_device(struct mapped_device *md)
>   		md->disk->private_data = NULL;
>   		spin_unlock(&_minor_lock);
>   		if (dm_get_md_type(md) != DM_TYPE_NONE) {
> +			struct table_device *td;
> +
>   			dm_sysfs_exit(md);
> +			list_for_each_entry(td, &md->table_devices, list) {
> +				bd_unlink_disk_holder(td->dm_dev.bdev,
> +						      md->disk);
> +			}
>   			del_gendisk(md->disk);
>   		}
>   		dm_queue_destroy_crypto_profile(md->queue);
> @@ -2284,6 +2298,7 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
>   {
>   	enum dm_queue_mode type = dm_table_get_type(t);
>   	struct queue_limits limits;
> +	struct table_device *td;
>   	int r;
>   
>   	switch (type) {
> @@ -2316,13 +2331,27 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
>   	if (r)
>   		return r;
>   
> -	r = dm_sysfs_init(md);
> -	if (r) {
> -		del_gendisk(md->disk);
> -		return r;
> +	/*
> +	 * Register the holder relationship for devices added before the disk
> +	 * was live.
> +	 */
> +	list_for_each_entry(td, &md->table_devices, list) {
> +		r = bd_link_disk_holder(td->dm_dev.bdev, md->disk);
> +		if (r)
> +			goto out_undo_holders;
>   	}
> +
> +	r = dm_sysfs_init(md);
> +	if (r)
> +		goto out_undo_holders;
>   	md->type = type;
>   	return 0;
> +
> +out_undo_holders:
> +	list_for_each_entry_continue_reverse(td, &md->table_devices, list)
> +		bd_unlink_disk_holder(td->dm_dev.bdev, md->disk);
> +	del_gendisk(md->disk);
> +	return r;
>   }
>   
>   struct mapped_device *dm_get_md(dev_t dev)
>
Christoph Hellwig Nov. 9, 2022, 8:26 a.m. UTC | #2
On Wed, Nov 09, 2022 at 10:08:14AM +0800, Yu Kuai wrote:
>> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
>> index 2917700b1e15c..7b0d6dc957549 100644
>> --- a/drivers/md/dm.c
>> +++ b/drivers/md/dm.c
>> @@ -751,9 +751,16 @@ static struct table_device *open_table_device(struct mapped_device *md,
>>   		goto out_free_td;
>>   	}
>>   -	r = bd_link_disk_holder(bdev, dm_disk(md));
>> -	if (r)
>> -		goto out_blkdev_put;
>> +	/*
>> +	 * We can be called before the dm disk is added.  In that case we can't
>> +	 * register the holder relation here.  It will be done once add_disk was
>> +	 * called.
>> +	 */
>> +	if (md->disk->slave_dir) {
> If device_add_disk() or del_gendisk() can run concurrently with this, it
> seems to me that using 'slave_dir' is not safe.
>
> I'm not quite familiar with dm; can we guarantee that they can't run
> concurrently?

I assumed dm would not get itself into territory where creating /
deleting the device could race with adding component devices, but
digging deeper I can't find anything that guarantees it.  This could
be done by holding table_devices_lock around add_disk/del_gendisk, but
I'm not that familiar with the dm code.

Mike, can you help out on this?
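
A rough sketch of what holding the lock could look like (a hypothetical
helper assumed to live in drivers/md/dm.c, not part of the patch; whether
md->table_devices_lock may safely be held across add_disk()/del_gendisk()
is exactly the open question here):

/*
 * Hypothetical sketch only: serialize add_disk() and the holder
 * registration against open_table_device() by holding
 * md->table_devices_lock around both.
 */
static int dm_add_disk_and_link_holders(struct mapped_device *md)
{
	struct table_device *td;
	int r;

	mutex_lock(&md->table_devices_lock);

	r = add_disk(md->disk);
	if (r)
		goto out_unlock;

	/* Link the devices that were opened before the disk went live. */
	list_for_each_entry(td, &md->table_devices, list) {
		r = bd_link_disk_holder(td->dm_dev.bdev, md->disk);
		if (r)
			goto out_undo_holders;
	}

	mutex_unlock(&md->table_devices_lock);
	return 0;

out_undo_holders:
	list_for_each_entry_continue_reverse(td, &md->table_devices, list)
		bd_unlink_disk_holder(td->dm_dev.bdev, md->disk);
	del_gendisk(md->disk);
out_unlock:
	mutex_unlock(&md->table_devices_lock);
	return r;
}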
Mike Snitzer Nov. 10, 2022, 6:09 p.m. UTC | #3
On Wed, Nov 09 2022 at  3:26P -0500,
Christoph Hellwig <hch@lst.de> wrote:

> On Wed, Nov 09, 2022 at 10:08:14AM +0800, Yu Kuai wrote:
> >> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> >> index 2917700b1e15c..7b0d6dc957549 100644
> >> --- a/drivers/md/dm.c
> >> +++ b/drivers/md/dm.c
> >> @@ -751,9 +751,16 @@ static struct table_device *open_table_device(struct mapped_device *md,
> >>   		goto out_free_td;
> >>   	}
> >>   -	r = bd_link_disk_holder(bdev, dm_disk(md));
> >> -	if (r)
> >> -		goto out_blkdev_put;
> >> +	/*
> >> +	 * We can be called before the dm disk is added.  In that case we can't
> >> +	 * register the holder relation here.  It will be done once add_disk was
> >> +	 * called.
> >> +	 */
> >> +	if (md->disk->slave_dir) {
> > If device_add_disk() or del_gendisk() can run concurrently with this, it
> > seems to me that using 'slave_dir' is not safe.
> >
> > I'm not quite familiar with dm; can we guarantee that they can't run
> > concurrently?
> 
> I assumed dm would not get itself into territory where creating /
> deleting the device could race with adding component devices, but
> digging deeper I can't find anything that guarantees it.  This could
> be done by holding table_devices_lock around add_disk/del_gendisk, but
> I'm not that familiar with the dm code.
> 
> Mike, can you help out on this?

Maybe :/

Underlying component devices can certainly come and go at any
time. And there is no DM code that can, or should, prevent that. All
we can do is cope with the unavailability of devices. But I'm pretty sure that
isn't the question.

I'm unclear about the specific race in question:
if open_table_device() doesn't see slave_dir, it is the first table
load. Otherwise, the DM device (and associated gendisk) shouldn't have
been torn down while a table is actively being loaded for it. But
_where_ the code lives, to ensure that, is also eluding me...

You could use a big lock (table_devices_lock) to disallow changes to
DM relations while loading the table. But I wouldn't think it's needed
as long as the gendisk's lifecycle is protected vs table loads (or
other concurrent actions like table load vs dm device removal). Again,
more code inspection needed to page all this back into my head.

The race concern aside:
I am concerned that your redundant bd_link_disk_holder() (first in
open_table_device and later in dm_setup_md_queue) will result in a
dangling refcount (e.g. an increase of 2 when it should only be 1) --
given bd_link_disk_holder will gladly just bump its holder->refcnt if
bd_find_holder_disk() returns an existing holder. This would occur if
a DM table is already loaded (and the DM device's gendisk exists) and a
new DM table is being loaded.

Mike
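
The pairing Mike is worried about, spelled out as a hedged illustration
(simplified; it assumes the holder refcount behavior described above, and
the concern is withdrawn in the follow-up below):

#include <linux/blkdev.h>

/*
 * Illustration of the double-accounting concern: linking the same
 * bdev/disk pair twice leaves the holder refcount at 2, so a single
 * unlink would not remove the sysfs holder link.
 */
static void holder_double_link_example(struct block_device *bdev,
				       struct gendisk *disk)
{
	bd_link_disk_holder(bdev, disk);	/* new holder, refcnt == 1 */
	bd_link_disk_holder(bdev, disk);	/* existing holder, refcnt == 2 */

	bd_unlink_disk_holder(bdev, disk);	/* refcnt == 1, link remains */
	bd_unlink_disk_holder(bdev, disk);	/* refcnt == 0, link removed */
}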
Mike Snitzer Nov. 10, 2022, 7:48 p.m. UTC | #4
On Thu, Nov 10 2022 at  1:09P -0500,
Mike Snitzer <snitzer@redhat.com> wrote:
 
> The race concern aside:
> I am concerned that your redundant bd_link_disk_holder() (first in
> open_table_device and later in dm_setup_md_queue) will result in a
> dangling refcount (e.g. an increase of 2 when it should only be 1) --
> given bd_link_disk_holder will gladly just bump its holder->refcnt if
> bd_find_holder_disk() returns an existing holder. This would occur if
> a DM table is already loaded (and the DM device's gendisk exists) and a
> new DM table is being loaded.

Never mind, dm_setup_md_queue should only ever be called once.
Yu Kuai Nov. 12, 2022, 6:23 a.m. UTC | #5
On 2022/11/11 2:09, Mike Snitzer wrote:
> On Wed, Nov 09 2022 at  3:26P -0500,
> Christoph Hellwig <hch@lst.de> wrote:
> 
>> On Wed, Nov 09, 2022 at 10:08:14AM +0800, Yu Kuai wrote:
>>>> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
>>>> index 2917700b1e15c..7b0d6dc957549 100644
>>>> --- a/drivers/md/dm.c
>>>> +++ b/drivers/md/dm.c
>>>> @@ -751,9 +751,16 @@ static struct table_device *open_table_device(struct mapped_device *md,
>>>>    		goto out_free_td;
>>>>    	}
>>>>    -	r = bd_link_disk_holder(bdev, dm_disk(md));
>>>> -	if (r)
>>>> -		goto out_blkdev_put;
>>>> +	/*
>>>> +	 * We can be called before the dm disk is added.  In that case we can't
>>>> +	 * register the holder relation here.  It will be done once add_disk was
>>>> +	 * called.
>>>> +	 */
>>>> +	if (md->disk->slave_dir) {
>>> If device_add_disk() or del_gendisk() can run concurrently with this, it
>>> seems to me that using 'slave_dir' is not safe.
>>>
>>> I'm not quite familiar with dm; can we guarantee that they can't run
>>> concurrently?
>>
>> I assumed dm would not get itself into territory where creating /
>> deleting the device could race with adding component devices, but
>> digging deeper I can't find anything that guarantees it.  This could
>> be done by holding table_devices_lock around add_disk/del_gendisk, but
>> I'm not that familiar with the dm code.
>>
>> Mike, can you help out on this?
> 
> Maybe :/
> 
> Underlying component devices can certainly come and go at any
> time. And there is no DM code that can, or should, prevent that. All
> we can do is cope with the unavailability of devices. But I'm pretty sure that
> isn't the question.
> 
> I'm unclear about the specific race in question:
> if open_table_device() doesn't see slave_dir, it is the first table
> load. Otherwise, the DM device (and associated gendisk) shouldn't have
> been torn down while a table is actively being loaded for it. But
> _where_ the code lives, to ensure that, is also eluding me...
> 
> You could use a big lock (table_devices_lock) to disallow changes to
> DM relations while loading the table. But I wouldn't think it's needed

How about using table_devices_lock to protect device addition and
removal, so that table loads are explicitly forbidden from racing with
device creation and deletion, as Christoph suggested?

Thanks,
Kuai

> as long as the gendisk's lifecycle is protected vs table loads (or
> other concurrent actions like table load vs dm device removal). Again,
> more code inspection needed to page all this back into my head.
> 
> The race concern aside:
> I am concerned that your redundant bd_link_disk_holder() (first in
> open_table_device and later in dm_setup_md_queue) will result in a
> dangling refcount (e.g. an increase of 2 when it should only be 1) --
> given bd_link_disk_holder will gladly just bump its holder->refcnt if
> bd_find_holder_disk() returns an existing holder. This would occur if
> a DM table is already loaded (and the DM device's gendisk exists) and a
> new DM table is being loaded.
> 
> Mike
> 

Patch

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 2917700b1e15c..7b0d6dc957549 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -751,9 +751,16 @@  static struct table_device *open_table_device(struct mapped_device *md,
 		goto out_free_td;
 	}
 
-	r = bd_link_disk_holder(bdev, dm_disk(md));
-	if (r)
-		goto out_blkdev_put;
+	/*
+	 * We can be called before the dm disk is added.  In that case we can't
+	 * register the holder relation here.  It will be done once add_disk was
+	 * called.
+	 */
+	if (md->disk->slave_dir) {
+		r = bd_link_disk_holder(bdev, md->disk);
+		if (r)
+			goto out_blkdev_put;
+	}
 
 	td->dm_dev.mode = mode;
 	td->dm_dev.bdev = bdev;
@@ -774,7 +781,8 @@  static struct table_device *open_table_device(struct mapped_device *md,
  */
 static void close_table_device(struct table_device *td, struct mapped_device *md)
 {
-	bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
+	if (md->disk->slave_dir)
+		bd_unlink_disk_holder(td->dm_dev.bdev, md->disk);
 	blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
 	put_dax(td->dm_dev.dax_dev);
 	list_del(&td->list);
@@ -1951,7 +1959,13 @@  static void cleanup_mapped_device(struct mapped_device *md)
 		md->disk->private_data = NULL;
 		spin_unlock(&_minor_lock);
 		if (dm_get_md_type(md) != DM_TYPE_NONE) {
+			struct table_device *td;
+
 			dm_sysfs_exit(md);
+			list_for_each_entry(td, &md->table_devices, list) {
+				bd_unlink_disk_holder(td->dm_dev.bdev,
+						      md->disk);
+			}
 			del_gendisk(md->disk);
 		}
 		dm_queue_destroy_crypto_profile(md->queue);
@@ -2284,6 +2298,7 @@  int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
 {
 	enum dm_queue_mode type = dm_table_get_type(t);
 	struct queue_limits limits;
+	struct table_device *td;
 	int r;
 
 	switch (type) {
@@ -2316,13 +2331,27 @@  int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
 	if (r)
 		return r;
 
-	r = dm_sysfs_init(md);
-	if (r) {
-		del_gendisk(md->disk);
-		return r;
+	/*
+	 * Register the holder relationship for devices added before the disk
+	 * was live.
+	 */
+	list_for_each_entry(td, &md->table_devices, list) {
+		r = bd_link_disk_holder(td->dm_dev.bdev, md->disk);
+		if (r)
+			goto out_undo_holders;
 	}
+
+	r = dm_sysfs_init(md);
+	if (r)
+		goto out_undo_holders;
 	md->type = type;
 	return 0;
+
+out_undo_holders:
+	list_for_each_entry_continue_reverse(td, &md->table_devices, list)
+		bd_unlink_disk_holder(td->dm_dev.bdev, md->disk);
+	del_gendisk(md->disk);
+	return r;
 }
 
 struct mapped_device *dm_get_md(dev_t dev)