diff mbox series

[v2,04/34] md: port block device access to file

Message ID 20240123-vfs-bdev-file-v2-4-adbd023e19cc@kernel.org (mailing list archive)
State New, archived
Headers show
Series Open block devices as files | expand

Commit Message

Christian Brauner Jan. 23, 2024, 1:26 p.m. UTC
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 drivers/md/dm.c               | 23 +++++++++++++----------
 drivers/md/md.c               | 12 ++++++------
 drivers/md/md.h               |  2 +-
 include/linux/device-mapper.h |  2 +-
 4 files changed, 21 insertions(+), 18 deletions(-)

Comments

Christoph Hellwig Jan. 29, 2024, 4:14 p.m. UTC | #1
Looks good:

Reviewed-by: Christoph Hellwig <hch@lst.de>
Jan Kara Jan. 31, 2024, 6:15 p.m. UTC | #2
On Tue 23-01-24 14:26:21, Christian Brauner wrote:
> Signed-off-by: Christian Brauner <brauner@kernel.org>

Looks good. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  drivers/md/dm.c               | 23 +++++++++++++----------
>  drivers/md/md.c               | 12 ++++++------
>  drivers/md/md.h               |  2 +-
>  include/linux/device-mapper.h |  2 +-
>  4 files changed, 21 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> index 8dcabf84d866..87de5b5682ad 100644
> --- a/drivers/md/dm.c
> +++ b/drivers/md/dm.c
> @@ -726,7 +726,8 @@ static struct table_device *open_table_device(struct mapped_device *md,
>  		dev_t dev, blk_mode_t mode)
>  {
>  	struct table_device *td;
> -	struct bdev_handle *bdev_handle;
> +	struct file *bdev_file;
> +	struct block_device *bdev;
>  	u64 part_off;
>  	int r;
>  
> @@ -735,34 +736,36 @@ static struct table_device *open_table_device(struct mapped_device *md,
>  		return ERR_PTR(-ENOMEM);
>  	refcount_set(&td->count, 1);
>  
> -	bdev_handle = bdev_open_by_dev(dev, mode, _dm_claim_ptr, NULL);
> -	if (IS_ERR(bdev_handle)) {
> -		r = PTR_ERR(bdev_handle);
> +	bdev_file = bdev_file_open_by_dev(dev, mode, _dm_claim_ptr, NULL);
> +	if (IS_ERR(bdev_file)) {
> +		r = PTR_ERR(bdev_file);
>  		goto out_free_td;
>  	}
>  
> +	bdev = file_bdev(bdev_file);
> +
>  	/*
>  	 * We can be called before the dm disk is added.  In that case we can't
>  	 * register the holder relation here.  It will be done once add_disk was
>  	 * called.
>  	 */
>  	if (md->disk->slave_dir) {
> -		r = bd_link_disk_holder(bdev_handle->bdev, md->disk);
> +		r = bd_link_disk_holder(bdev, md->disk);
>  		if (r)
>  			goto out_blkdev_put;
>  	}
>  
>  	td->dm_dev.mode = mode;
> -	td->dm_dev.bdev = bdev_handle->bdev;
> -	td->dm_dev.bdev_handle = bdev_handle;
> -	td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev_handle->bdev, &part_off,
> +	td->dm_dev.bdev = bdev;
> +	td->dm_dev.bdev_file = bdev_file;
> +	td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off,
>  						NULL, NULL);
>  	format_dev_t(td->dm_dev.name, dev);
>  	list_add(&td->list, &md->table_devices);
>  	return td;
>  
>  out_blkdev_put:
> -	bdev_release(bdev_handle);
> +	fput(bdev_file);
>  out_free_td:
>  	kfree(td);
>  	return ERR_PTR(r);
> @@ -775,7 +778,7 @@ static void close_table_device(struct table_device *td, struct mapped_device *md
>  {
>  	if (md->disk->slave_dir)
>  		bd_unlink_disk_holder(td->dm_dev.bdev, md->disk);
> -	bdev_release(td->dm_dev.bdev_handle);
> +	fput(td->dm_dev.bdev_file);
>  	put_dax(td->dm_dev.dax_dev);
>  	list_del(&td->list);
>  	kfree(td);
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 2266358d8074..0653584db63b 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -2578,7 +2578,7 @@ static void export_rdev(struct md_rdev *rdev, struct mddev *mddev)
>  	if (test_bit(AutoDetected, &rdev->flags))
>  		md_autodetect_dev(rdev->bdev->bd_dev);
>  #endif
> -	bdev_release(rdev->bdev_handle);
> +	fput(rdev->bdev_file);
>  	rdev->bdev = NULL;
>  	kobject_put(&rdev->kobj);
>  }
> @@ -3773,16 +3773,16 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe
>  	if (err)
>  		goto out_clear_rdev;
>  
> -	rdev->bdev_handle = bdev_open_by_dev(newdev,
> +	rdev->bdev_file = bdev_file_open_by_dev(newdev,
>  			BLK_OPEN_READ | BLK_OPEN_WRITE,
>  			super_format == -2 ? &claim_rdev : rdev, NULL);
> -	if (IS_ERR(rdev->bdev_handle)) {
> +	if (IS_ERR(rdev->bdev_file)) {
>  		pr_warn("md: could not open device unknown-block(%u,%u).\n",
>  			MAJOR(newdev), MINOR(newdev));
> -		err = PTR_ERR(rdev->bdev_handle);
> +		err = PTR_ERR(rdev->bdev_file);
>  		goto out_clear_rdev;
>  	}
> -	rdev->bdev = rdev->bdev_handle->bdev;
> +	rdev->bdev = file_bdev(rdev->bdev_file);
>  
>  	kobject_init(&rdev->kobj, &rdev_ktype);
>  
> @@ -3813,7 +3813,7 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe
>  	return rdev;
>  
>  out_blkdev_put:
> -	bdev_release(rdev->bdev_handle);
> +	fput(rdev->bdev_file);
>  out_clear_rdev:
>  	md_rdev_clear(rdev);
>  out_free_rdev:
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index 8d881cc59799..a079ee9b6190 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -59,7 +59,7 @@ struct md_rdev {
>  	 */
>  	struct block_device *meta_bdev;
>  	struct block_device *bdev;	/* block device handle */
> -	struct bdev_handle *bdev_handle;	/* Handle from open for bdev */
> +	struct file *bdev_file;		/* Handle from open for bdev */
>  
>  	struct page	*sb_page, *bb_page;
>  	int		sb_loaded;
> diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
> index 772ab4d74d94..82b2195efaca 100644
> --- a/include/linux/device-mapper.h
> +++ b/include/linux/device-mapper.h
> @@ -165,7 +165,7 @@ void dm_error(const char *message);
>  
>  struct dm_dev {
>  	struct block_device *bdev;
> -	struct bdev_handle *bdev_handle;
> +	struct file *bdev_file;
>  	struct dax_device *dax_dev;
>  	blk_mode_t mode;
>  	char name[16];
> 
> -- 
> 2.43.0
>
Ming Lei April 15, 2024, 9:26 a.m. UTC | #3
Hello,

On Tue, Jan 23, 2024 at 02:26:21PM +0100, Christian Brauner wrote:
> Signed-off-by: Christian Brauner <brauner@kernel.org>
> ---
>  drivers/md/dm.c               | 23 +++++++++++++----------
>  drivers/md/md.c               | 12 ++++++------
>  drivers/md/md.h               |  2 +-
>  include/linux/device-mapper.h |  2 +-
>  4 files changed, 21 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> index 8dcabf84d866..87de5b5682ad 100644
> --- a/drivers/md/dm.c
> +++ b/drivers/md/dm.c

...

> @@ -775,7 +778,7 @@ static void close_table_device(struct table_device *td, struct mapped_device *md
>  {
>  	if (md->disk->slave_dir)
>  		bd_unlink_disk_holder(td->dm_dev.bdev, md->disk);
> -	bdev_release(td->dm_dev.bdev_handle);
> +	fput(td->dm_dev.bdev_file);

The above change caused regression on 'dmsetup remove_all'.

blkdev_release() is delayed because of fput(), so dm_lock_for_deletion
returns -EBUSY, then this dm disk is skipped in remove_all().

Force to mark DMF_DEFERRED_REMOVE might solve it, but need our device
mapper guys to check if it is safe.

Or other better solution?

thanks,
Ming
Christian Brauner April 15, 2024, 12:35 p.m. UTC | #4
On Mon, Apr 15, 2024 at 05:26:19PM +0800, Ming Lei wrote:
> Hello,
> 
> On Tue, Jan 23, 2024 at 02:26:21PM +0100, Christian Brauner wrote:
> > Signed-off-by: Christian Brauner <brauner@kernel.org>
> > ---
> >  drivers/md/dm.c               | 23 +++++++++++++----------
> >  drivers/md/md.c               | 12 ++++++------
> >  drivers/md/md.h               |  2 +-
> >  include/linux/device-mapper.h |  2 +-
> >  4 files changed, 21 insertions(+), 18 deletions(-)
> > 
> > diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> > index 8dcabf84d866..87de5b5682ad 100644
> > --- a/drivers/md/dm.c
> > +++ b/drivers/md/dm.c
> 
> ...
> 
> > @@ -775,7 +778,7 @@ static void close_table_device(struct table_device *td, struct mapped_device *md
> >  {
> >  	if (md->disk->slave_dir)
> >  		bd_unlink_disk_holder(td->dm_dev.bdev, md->disk);
> > -	bdev_release(td->dm_dev.bdev_handle);
> > +	fput(td->dm_dev.bdev_file);
> 
> The above change caused regression on 'dmsetup remove_all'.
> 
> blkdev_release() is delayed because of fput(), so dm_lock_for_deletion
> returns -EBUSY, then this dm disk is skipped in remove_all().
> 
> Force to mark DMF_DEFERRED_REMOVE might solve it, but need our device
> mapper guys to check if it is safe.
> 
> Or other better solution?

Yeah, I think there is. You can just switch all fput() instances in
device mapper to bdev_fput() which is mainline now. This will yield the
device and make it able to be reclaimed. Should be as simple as the
patch below. Could you test this and send a patch based on this (I'm on
a prolonged vacation so I don't have time right now.):

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 56aa2a8b9d71..0f681a1e70af 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -765,7 +765,7 @@ static struct table_device *open_table_device(struct mapped_device *md,
        return td;

 out_blkdev_put:
-       fput(bdev_file);
+       bdev_fput(bdev_file);
 out_free_td:
        kfree(td);
        return ERR_PTR(r);
@@ -778,7 +778,7 @@ static void close_table_device(struct table_device *td, struct mapped_device *md
 {
        if (md->disk->slave_dir)
                bd_unlink_disk_holder(td->dm_dev.bdev, md->disk);
-       fput(td->dm_dev.bdev_file);
+       bdev_fput(td->dm_dev.bdev_file);
        put_dax(td->dm_dev.dax_dev);
        list_del(&td->list);
        kfree(td);
Mike Snitzer April 15, 2024, 1:56 p.m. UTC | #5
On Mon, Apr 15 2024 at  8:35P -0400,
Christian Brauner <brauner@kernel.org> wrote:

> On Mon, Apr 15, 2024 at 05:26:19PM +0800, Ming Lei wrote:
> > Hello,
> > 
> > On Tue, Jan 23, 2024 at 02:26:21PM +0100, Christian Brauner wrote:
> > > Signed-off-by: Christian Brauner <brauner@kernel.org>
> > > ---
> > >  drivers/md/dm.c               | 23 +++++++++++++----------
> > >  drivers/md/md.c               | 12 ++++++------
> > >  drivers/md/md.h               |  2 +-
> > >  include/linux/device-mapper.h |  2 +-
> > >  4 files changed, 21 insertions(+), 18 deletions(-)
> > > 
> > > diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> > > index 8dcabf84d866..87de5b5682ad 100644
> > > --- a/drivers/md/dm.c
> > > +++ b/drivers/md/dm.c
> > 
> > ...
> > 
> > > @@ -775,7 +778,7 @@ static void close_table_device(struct table_device *td, struct mapped_device *md
> > >  {
> > >  	if (md->disk->slave_dir)
> > >  		bd_unlink_disk_holder(td->dm_dev.bdev, md->disk);
> > > -	bdev_release(td->dm_dev.bdev_handle);
> > > +	fput(td->dm_dev.bdev_file);
> > 
> > The above change caused regression on 'dmsetup remove_all'.
> > 
> > blkdev_release() is delayed because of fput(), so dm_lock_for_deletion
> > returns -EBUSY, then this dm disk is skipped in remove_all().
> > 
> > Force to mark DMF_DEFERRED_REMOVE might solve it, but need our device
> > mapper guys to check if it is safe.
> > 
> > Or other better solution?
> 
> Yeah, I think there is. You can just switch all fput() instances in
> device mapper to bdev_fput() which is mainline now. This will yield the
> device and make it able to be reclaimed. Should be as simple as the
> patch below. Could you test this and send a patch based on this (I'm on
> a prolonged vacation so I don't have time right now.):
> 
> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> index 56aa2a8b9d71..0f681a1e70af 100644
> --- a/drivers/md/dm.c
> +++ b/drivers/md/dm.c
> @@ -765,7 +765,7 @@ static struct table_device *open_table_device(struct mapped_device *md,
>         return td;
> 
>  out_blkdev_put:
> -       fput(bdev_file);
> +       bdev_fput(bdev_file);
>  out_free_td:
>         kfree(td);
>         return ERR_PTR(r);
> @@ -778,7 +778,7 @@ static void close_table_device(struct table_device *td, struct mapped_device *md
>  {
>         if (md->disk->slave_dir)
>                 bd_unlink_disk_holder(td->dm_dev.bdev, md->disk);
> -       fput(td->dm_dev.bdev_file);
> +       bdev_fput(td->dm_dev.bdev_file);
>         put_dax(td->dm_dev.dax_dev);
>         list_del(&td->list);
>         kfree(td);
> 
> 

Thanks. I'll work with Ming and others to take care of it. Have a great vacation!
Ming Lei April 15, 2024, 2:35 p.m. UTC | #6
On Mon, Apr 15, 2024 at 02:35:17PM +0200, Christian Brauner wrote:
> On Mon, Apr 15, 2024 at 05:26:19PM +0800, Ming Lei wrote:
> > Hello,
> > 
> > On Tue, Jan 23, 2024 at 02:26:21PM +0100, Christian Brauner wrote:
> > > Signed-off-by: Christian Brauner <brauner@kernel.org>
> > > ---
> > >  drivers/md/dm.c               | 23 +++++++++++++----------
> > >  drivers/md/md.c               | 12 ++++++------
> > >  drivers/md/md.h               |  2 +-
> > >  include/linux/device-mapper.h |  2 +-
> > >  4 files changed, 21 insertions(+), 18 deletions(-)
> > > 
> > > diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> > > index 8dcabf84d866..87de5b5682ad 100644
> > > --- a/drivers/md/dm.c
> > > +++ b/drivers/md/dm.c
> > 
> > ...
> > 
> > > @@ -775,7 +778,7 @@ static void close_table_device(struct table_device *td, struct mapped_device *md
> > >  {
> > >  	if (md->disk->slave_dir)
> > >  		bd_unlink_disk_holder(td->dm_dev.bdev, md->disk);
> > > -	bdev_release(td->dm_dev.bdev_handle);
> > > +	fput(td->dm_dev.bdev_file);
> > 
> > The above change caused regression on 'dmsetup remove_all'.
> > 
> > blkdev_release() is delayed because of fput(), so dm_lock_for_deletion
> > returns -EBUSY, then this dm disk is skipped in remove_all().
> > 
> > Force to mark DMF_DEFERRED_REMOVE might solve it, but need our device
> > mapper guys to check if it is safe.
> > 
> > Or other better solution?
> 
> Yeah, I think there is. You can just switch all fput() instances in
> device mapper to bdev_fput() which is mainline now. This will yield the
> device and make it able to be reclaimed. Should be as simple as the
> patch below. Could you test this and send a patch based on this (I'm on
> a prolonged vacation so I don't have time right now.):

Unfortunately it doesn't work.

Here the problem is that blkdev_release() is delayed, which changes
'dmsetup remove_all' behavior, and causes that some of dm disks aren't
removed.

Please see dm_lock_for_deletion() and dm_blk_open()/dm_blk_close().

Thanks,
Ming
Christian Brauner April 15, 2024, 2:53 p.m. UTC | #7
On Mon, Apr 15, 2024 at 10:35:53PM +0800, Ming Lei wrote:
> On Mon, Apr 15, 2024 at 02:35:17PM +0200, Christian Brauner wrote:
> > On Mon, Apr 15, 2024 at 05:26:19PM +0800, Ming Lei wrote:
> > > Hello,
> > > 
> > > On Tue, Jan 23, 2024 at 02:26:21PM +0100, Christian Brauner wrote:
> > > > Signed-off-by: Christian Brauner <brauner@kernel.org>
> > > > ---
> > > >  drivers/md/dm.c               | 23 +++++++++++++----------
> > > >  drivers/md/md.c               | 12 ++++++------
> > > >  drivers/md/md.h               |  2 +-
> > > >  include/linux/device-mapper.h |  2 +-
> > > >  4 files changed, 21 insertions(+), 18 deletions(-)
> > > > 
> > > > diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> > > > index 8dcabf84d866..87de5b5682ad 100644
> > > > --- a/drivers/md/dm.c
> > > > +++ b/drivers/md/dm.c
> > > 
> > > ...
> > > 
> > > > @@ -775,7 +778,7 @@ static void close_table_device(struct table_device *td, struct mapped_device *md
> > > >  {
> > > >  	if (md->disk->slave_dir)
> > > >  		bd_unlink_disk_holder(td->dm_dev.bdev, md->disk);
> > > > -	bdev_release(td->dm_dev.bdev_handle);
> > > > +	fput(td->dm_dev.bdev_file);
> > > 
> > > The above change caused regression on 'dmsetup remove_all'.
> > > 
> > > blkdev_release() is delayed because of fput(), so dm_lock_for_deletion
> > > returns -EBUSY, then this dm disk is skipped in remove_all().
> > > 
> > > Force to mark DMF_DEFERRED_REMOVE might solve it, but need our device
> > > mapper guys to check if it is safe.
> > > 
> > > Or other better solution?
> > 
> > Yeah, I think there is. You can just switch all fput() instances in
> > device mapper to bdev_fput() which is mainline now. This will yield the
> > device and make it able to be reclaimed. Should be as simple as the
> > patch below. Could you test this and send a patch based on this (I'm on
> > a prolonged vacation so I don't have time right now.):
> 
> Unfortunately it doesn't work.
> 
> Here the problem is that blkdev_release() is delayed, which changes
> 'dmsetup remove_all' behavior, and causes that some of dm disks aren't
> removed.
> 
> Please see dm_lock_for_deletion() and dm_blk_open()/dm_blk_close().

So you really need blkdev_release() itself to be synchronous? Groan, in
that case use __fput_sync() instead of fput() which ensures that this
file is closed synchronously.
Ming Lei April 15, 2024, 3:11 p.m. UTC | #8
On Mon, Apr 15, 2024 at 04:53:42PM +0200, Christian Brauner wrote:
> On Mon, Apr 15, 2024 at 10:35:53PM +0800, Ming Lei wrote:
> > On Mon, Apr 15, 2024 at 02:35:17PM +0200, Christian Brauner wrote:
> > > On Mon, Apr 15, 2024 at 05:26:19PM +0800, Ming Lei wrote:
> > > > Hello,
> > > > 
> > > > On Tue, Jan 23, 2024 at 02:26:21PM +0100, Christian Brauner wrote:
> > > > > Signed-off-by: Christian Brauner <brauner@kernel.org>
> > > > > ---
> > > > >  drivers/md/dm.c               | 23 +++++++++++++----------
> > > > >  drivers/md/md.c               | 12 ++++++------
> > > > >  drivers/md/md.h               |  2 +-
> > > > >  include/linux/device-mapper.h |  2 +-
> > > > >  4 files changed, 21 insertions(+), 18 deletions(-)
> > > > > 
> > > > > diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> > > > > index 8dcabf84d866..87de5b5682ad 100644
> > > > > --- a/drivers/md/dm.c
> > > > > +++ b/drivers/md/dm.c
> > > > 
> > > > ...
> > > > 
> > > > > @@ -775,7 +778,7 @@ static void close_table_device(struct table_device *td, struct mapped_device *md
> > > > >  {
> > > > >  	if (md->disk->slave_dir)
> > > > >  		bd_unlink_disk_holder(td->dm_dev.bdev, md->disk);
> > > > > -	bdev_release(td->dm_dev.bdev_handle);
> > > > > +	fput(td->dm_dev.bdev_file);
> > > > 
> > > > The above change caused regression on 'dmsetup remove_all'.
> > > > 
> > > > blkdev_release() is delayed because of fput(), so dm_lock_for_deletion
> > > > returns -EBUSY, then this dm disk is skipped in remove_all().
> > > > 
> > > > Force to mark DMF_DEFERRED_REMOVE might solve it, but need our device
> > > > mapper guys to check if it is safe.
> > > > 
> > > > Or other better solution?
> > > 
> > > Yeah, I think there is. You can just switch all fput() instances in
> > > device mapper to bdev_fput() which is mainline now. This will yield the
> > > device and make it able to be reclaimed. Should be as simple as the
> > > patch below. Could you test this and send a patch based on this (I'm on
> > > a prolonged vacation so I don't have time right now.):
> > 
> > Unfortunately it doesn't work.
> > 
> > Here the problem is that blkdev_release() is delayed, which changes
> > 'dmsetup remove_all' behavior, and causes that some of dm disks aren't
> > removed.
> > 
> > Please see dm_lock_for_deletion() and dm_blk_open()/dm_blk_close().
> 
> So you really need blkdev_release() itself to be synchronous? Groan, in

At least the current dm implementation relies on this way sort of, and
it could be addressed by forcing to mark DMF_DEFERRED_REMOVE in
remove_all().

> that case use __fput_sync() instead of fput() which ensures that this
> file is closed synchronously.

I tried __fput_sync(), but the following panic is caused:

[  113.486522] ------------[ cut here ]------------
[  113.486524] kernel BUG at fs/file_table.c:453!
[  113.486531] invalid opcode: 0000 [#1] PREEMPT SMP NOPTI
[  113.488878] CPU: 6 PID: 1919 Comm: dmsetup Kdump: loaded Not tainted 5.14.0+ #23
[  113.490114] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-1.fc37 04/01/2014
[  113.491661] RIP: 0010:__fput_sync+0x25/0x30
[  113.492562] Code: 90 90 90 90 90 0f 1f 44 00 00 f0 48 ff 4f 38 75 14 65 48 8b 04 25 40 25 03 00 f6 40 36 20 74 0a e9 20 fd ff ff c3 cc cc cc cc <0f0
[  113.493926] RSP: 0018:ffffb76581003c20 EFLAGS: 00010246
[  113.494220] RAX: ffff92eca6ef8000 RBX: ffff92ed176c3c18 RCX: 000000008080007c
[  113.494632] RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffff92ec844cac00
[  113.495033] RBP: ffff92ed176c3c00 R08: 0000000000000001 R09: 0000000000000000
[  113.495378] R10: ffffb76581003b00 R11: ffffb76581003b68 R12: ffff92ec8fccec20
[  113.495723] R13: ffff92ec8431b400 R14: ffff92ec8431b508 R15: ffff92ec8fccec00
[  113.496108] FS:  00007f5be5638840(0000) GS:ffff92f0ebb80000(0000) knlGS:0000000000000000
[  113.496581] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  113.496907] CR2: 00007f5be54694b0 CR3: 0000000108e54003 CR4: 0000000000770ef0
[  113.497308] PKRU: 55555554
[  113.497469] Call Trace:
[  113.497613]  <TASK>
[  113.497741]  ? show_trace_log_lvl+0x1c4/0x2df
[  113.497997]  ? show_trace_log_lvl+0x1c4/0x2df
[  113.498251]  ? dm_put_table_device+0x64/0xd0 [dm_mod]
[  113.498553]  ? __die_body.cold+0x8/0xd
[  113.498768]  ? die+0x2b/0x50
[  113.498937]  ? do_trap+0xce/0x120
[  113.499129]  ? __fput_sync+0x25/0x30
[  113.499337]  ? do_error_trap+0x65/0x80
[  113.499577]  ? __fput_sync+0x25/0x30
[  113.499787]  ? exc_invalid_op+0x4e/0x70
[  113.500011]  ? __fput_sync+0x25/0x30
[  113.500239]  ? asm_exc_invalid_op+0x16/0x20
[  113.500842]  ? __fput_sync+0x25/0x30
[  113.501387]  dm_put_table_device+0x64/0xd0 [dm_mod]
[  113.502047]  dm_put_device+0x80/0x110 [dm_mod]
[  113.502650]  stripe_dtr+0x2f/0x50 [dm_mod]
[  113.503218]  dm_table_destroy+0x59/0x120 [dm_mod]
[  113.503842]  __dm_destroy+0x114/0x1e0 [dm_mod]
[  113.504402]  dm_hash_remove_all+0x63/0x160 [dm_mod]
[  113.505028]  remove_all+0x1e/0x30 [dm_mod]
[  113.505602]  ctl_ioctl+0x19f/0x290 [dm_mod]
[  113.506146]  dm_ctl_ioctl+0xa/0x20 [dm_mod]
[  113.506717]  __x64_sys_ioctl+0x87/0xc0
[  113.507230]  do_syscall_64+0x5c/0xf0
[  113.507755]  ? exc_page_fault+0x62/0x150
[  113.508309]  entry_SYSCALL_64_after_hwframe+0x6e/0x76
[  113.508945] RIP: 0033:0x7f5be543ec6b



Thanks. 
Ming
Mike Snitzer April 15, 2024, 3:53 p.m. UTC | #9
On Mon, Apr 15, 2024 at 11:11:50PM +0800, Ming Lei wrote:
> On Mon, Apr 15, 2024 at 04:53:42PM +0200, Christian Brauner wrote:
> > On Mon, Apr 15, 2024 at 10:35:53PM +0800, Ming Lei wrote:
> > > On Mon, Apr 15, 2024 at 02:35:17PM +0200, Christian Brauner wrote:
> > > > On Mon, Apr 15, 2024 at 05:26:19PM +0800, Ming Lei wrote:
> > > > > Hello,
> > > > > 
> > > > > On Tue, Jan 23, 2024 at 02:26:21PM +0100, Christian Brauner wrote:
> > > > > > Signed-off-by: Christian Brauner <brauner@kernel.org>
> > > > > > ---
> > > > > >  drivers/md/dm.c               | 23 +++++++++++++----------
> > > > > >  drivers/md/md.c               | 12 ++++++------
> > > > > >  drivers/md/md.h               |  2 +-
> > > > > >  include/linux/device-mapper.h |  2 +-
> > > > > >  4 files changed, 21 insertions(+), 18 deletions(-)
> > > > > > 
> > > > > > diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> > > > > > index 8dcabf84d866..87de5b5682ad 100644
> > > > > > --- a/drivers/md/dm.c
> > > > > > +++ b/drivers/md/dm.c
> > > > > 
> > > > > ...
> > > > > 
> > > > > > @@ -775,7 +778,7 @@ static void close_table_device(struct table_device *td, struct mapped_device *md
> > > > > >  {
> > > > > >  	if (md->disk->slave_dir)
> > > > > >  		bd_unlink_disk_holder(td->dm_dev.bdev, md->disk);
> > > > > > -	bdev_release(td->dm_dev.bdev_handle);
> > > > > > +	fput(td->dm_dev.bdev_file);
> > > > > 
> > > > > The above change caused regression on 'dmsetup remove_all'.
> > > > > 
> > > > > blkdev_release() is delayed because of fput(), so dm_lock_for_deletion
> > > > > returns -EBUSY, then this dm disk is skipped in remove_all().
> > > > > 
> > > > > Force to mark DMF_DEFERRED_REMOVE might solve it, but need our device
> > > > > mapper guys to check if it is safe.
> > > > > 
> > > > > Or other better solution?
> > > > 
> > > > Yeah, I think there is. You can just switch all fput() instances in
> > > > device mapper to bdev_fput() which is mainline now. This will yield the
> > > > device and make it able to be reclaimed. Should be as simple as the
> > > > patch below. Could you test this and send a patch based on this (I'm on
> > > > a prolonged vacation so I don't have time right now.):
> > > 
> > > Unfortunately it doesn't work.
> > > 
> > > Here the problem is that blkdev_release() is delayed, which changes
> > > 'dmsetup remove_all' behavior, and causes that some of dm disks aren't
> > > removed.
> > > 
> > > Please see dm_lock_for_deletion() and dm_blk_open()/dm_blk_close().
> > 
> > So you really need blkdev_release() itself to be synchronous? Groan, in
> 
> At least the current dm implementation relies on this way sort of, and
> it could be addressed by forcing to mark DMF_DEFERRED_REMOVE in
> remove_all().

You floated that earlier in this thread, etc: no, that would change
the interface.  DMF_DEFERRED_REMOVE gives people options to allow for
async device closes, etc.  But I don't want to impose it as some faux
equivalent to the sync model remove_all has always provided.

And what about simple 'dmsetup remove'? remove_all just loops doing
remove... so isn't 'dmsetup remove' also being forced to be async as
of commit a28d893eb3270 ("md: port block device access to file")?

dm.c:dm_put_device -> dm_put_table_device -> close_table_device

> > that case use __fput_sync() instead of fput() which ensures that this
> > file is closed synchronously.
> 
> I tried __fput_sync(), but the following panic is caused:

Ok, so more work needed.  But we need to preserve the existing sync
interface for DM device removal.

Mike
Jan Kara April 15, 2024, 4:22 p.m. UTC | #10
On Mon 15-04-24 23:11:50, Ming Lei wrote:
> On Mon, Apr 15, 2024 at 04:53:42PM +0200, Christian Brauner wrote:
> > On Mon, Apr 15, 2024 at 10:35:53PM +0800, Ming Lei wrote:
> > > On Mon, Apr 15, 2024 at 02:35:17PM +0200, Christian Brauner wrote:
> > > > On Mon, Apr 15, 2024 at 05:26:19PM +0800, Ming Lei wrote:
> > > > > Hello,
> > > > > 
> > > > > On Tue, Jan 23, 2024 at 02:26:21PM +0100, Christian Brauner wrote:
> > > > > > Signed-off-by: Christian Brauner <brauner@kernel.org>
> > > > > > ---
> > > > > >  drivers/md/dm.c               | 23 +++++++++++++----------
> > > > > >  drivers/md/md.c               | 12 ++++++------
> > > > > >  drivers/md/md.h               |  2 +-
> > > > > >  include/linux/device-mapper.h |  2 +-
> > > > > >  4 files changed, 21 insertions(+), 18 deletions(-)
> > > > > > 
> > > > > > diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> > > > > > index 8dcabf84d866..87de5b5682ad 100644
> > > > > > --- a/drivers/md/dm.c
> > > > > > +++ b/drivers/md/dm.c
> > > > > 
> > > > > ...
> > > > > 
> > > > > > @@ -775,7 +778,7 @@ static void close_table_device(struct table_device *td, struct mapped_device *md
> > > > > >  {
> > > > > >  	if (md->disk->slave_dir)
> > > > > >  		bd_unlink_disk_holder(td->dm_dev.bdev, md->disk);
> > > > > > -	bdev_release(td->dm_dev.bdev_handle);
> > > > > > +	fput(td->dm_dev.bdev_file);
> > > > > 
> > > > > The above change caused regression on 'dmsetup remove_all'.
> > > > > 
> > > > > blkdev_release() is delayed because of fput(), so dm_lock_for_deletion
> > > > > returns -EBUSY, then this dm disk is skipped in remove_all().
> > > > > 
> > > > > Force to mark DMF_DEFERRED_REMOVE might solve it, but need our device
> > > > > mapper guys to check if it is safe.
> > > > > 
> > > > > Or other better solution?
> > > > 
> > > > Yeah, I think there is. You can just switch all fput() instances in
> > > > device mapper to bdev_fput() which is mainline now. This will yield the
> > > > device and make it able to be reclaimed. Should be as simple as the
> > > > patch below. Could you test this and send a patch based on this (I'm on
> > > > a prolonged vacation so I don't have time right now.):
> > > 
> > > Unfortunately it doesn't work.
> > > 
> > > Here the problem is that blkdev_release() is delayed, which changes
> > > 'dmsetup remove_all' behavior, and causes that some of dm disks aren't
> > > removed.
> > > 
> > > Please see dm_lock_for_deletion() and dm_blk_open()/dm_blk_close().
> > 
> > So you really need blkdev_release() itself to be synchronous? Groan, in
> 
> At least the current dm implementation relies on this way sort of, and
> it could be addressed by forcing to mark DMF_DEFERRED_REMOVE in
> remove_all().
> 
> > that case use __fput_sync() instead of fput() which ensures that this
> > file is closed synchronously.
> 
> I tried __fput_sync(), but the following panic is caused:
> 
> [  113.486522] ------------[ cut here ]------------
> [  113.486524] kernel BUG at fs/file_table.c:453!
> [  113.486531] invalid opcode: 0000 [#1] PREEMPT SMP NOPTI
> [  113.488878] CPU: 6 PID: 1919 Comm: dmsetup Kdump: loaded Not tainted 5.14.0+ #23

Wait, how come this is 5.14 kernel? Apparently you're crashing on:

BUG_ON(!(task->flags & PF_KTHREAD));

but that is not present in current upstream (BUG_ON was removed in 6.6-rc1
by commit 021a160abf62c).

								Honza

> [  113.490114] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-1.fc37 04/01/2014
> [  113.491661] RIP: 0010:__fput_sync+0x25/0x30
> [  113.492562] Code: 90 90 90 90 90 0f 1f 44 00 00 f0 48 ff 4f 38 75 14 65 48 8b 04 25 40 25 03 00 f6 40 36 20 74 0a e9 20 fd ff ff c3 cc cc cc cc <0f0
> [  113.493926] RSP: 0018:ffffb76581003c20 EFLAGS: 00010246
> [  113.494220] RAX: ffff92eca6ef8000 RBX: ffff92ed176c3c18 RCX: 000000008080007c
> [  113.494632] RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffff92ec844cac00
> [  113.495033] RBP: ffff92ed176c3c00 R08: 0000000000000001 R09: 0000000000000000
> [  113.495378] R10: ffffb76581003b00 R11: ffffb76581003b68 R12: ffff92ec8fccec20
> [  113.495723] R13: ffff92ec8431b400 R14: ffff92ec8431b508 R15: ffff92ec8fccec00
> [  113.496108] FS:  00007f5be5638840(0000) GS:ffff92f0ebb80000(0000) knlGS:0000000000000000
> [  113.496581] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  113.496907] CR2: 00007f5be54694b0 CR3: 0000000108e54003 CR4: 0000000000770ef0
> [  113.497308] PKRU: 55555554
> [  113.497469] Call Trace:
> [  113.497613]  <TASK>
> [  113.497741]  ? show_trace_log_lvl+0x1c4/0x2df
> [  113.497997]  ? show_trace_log_lvl+0x1c4/0x2df
> [  113.498251]  ? dm_put_table_device+0x64/0xd0 [dm_mod]
> [  113.498553]  ? __die_body.cold+0x8/0xd
> [  113.498768]  ? die+0x2b/0x50
> [  113.498937]  ? do_trap+0xce/0x120
> [  113.499129]  ? __fput_sync+0x25/0x30
> [  113.499337]  ? do_error_trap+0x65/0x80
> [  113.499577]  ? __fput_sync+0x25/0x30
> [  113.499787]  ? exc_invalid_op+0x4e/0x70
> [  113.500011]  ? __fput_sync+0x25/0x30
> [  113.500239]  ? asm_exc_invalid_op+0x16/0x20
> [  113.500842]  ? __fput_sync+0x25/0x30
> [  113.501387]  dm_put_table_device+0x64/0xd0 [dm_mod]
> [  113.502047]  dm_put_device+0x80/0x110 [dm_mod]
> [  113.502650]  stripe_dtr+0x2f/0x50 [dm_mod]
> [  113.503218]  dm_table_destroy+0x59/0x120 [dm_mod]
> [  113.503842]  __dm_destroy+0x114/0x1e0 [dm_mod]
> [  113.504402]  dm_hash_remove_all+0x63/0x160 [dm_mod]
> [  113.505028]  remove_all+0x1e/0x30 [dm_mod]
> [  113.505602]  ctl_ioctl+0x19f/0x290 [dm_mod]
> [  113.506146]  dm_ctl_ioctl+0xa/0x20 [dm_mod]
> [  113.506717]  __x64_sys_ioctl+0x87/0xc0
> [  113.507230]  do_syscall_64+0x5c/0xf0
> [  113.507755]  ? exc_page_fault+0x62/0x150
> [  113.508309]  entry_SYSCALL_64_after_hwframe+0x6e/0x76
> [  113.508945] RIP: 0033:0x7f5be543ec6b
> 
> 
> 
> Thanks. 
> Ming
>
Ming Lei April 16, 2024, 12:27 a.m. UTC | #11
On Mon, Apr 15, 2024 at 06:22:10PM +0200, Jan Kara wrote:
> On Mon 15-04-24 23:11:50, Ming Lei wrote:
> > On Mon, Apr 15, 2024 at 04:53:42PM +0200, Christian Brauner wrote:
> > > On Mon, Apr 15, 2024 at 10:35:53PM +0800, Ming Lei wrote:
> > > > On Mon, Apr 15, 2024 at 02:35:17PM +0200, Christian Brauner wrote:
> > > > > On Mon, Apr 15, 2024 at 05:26:19PM +0800, Ming Lei wrote:
> > > > > > Hello,
> > > > > > 
> > > > > > On Tue, Jan 23, 2024 at 02:26:21PM +0100, Christian Brauner wrote:
> > > > > > > Signed-off-by: Christian Brauner <brauner@kernel.org>
> > > > > > > ---
> > > > > > >  drivers/md/dm.c               | 23 +++++++++++++----------
> > > > > > >  drivers/md/md.c               | 12 ++++++------
> > > > > > >  drivers/md/md.h               |  2 +-
> > > > > > >  include/linux/device-mapper.h |  2 +-
> > > > > > >  4 files changed, 21 insertions(+), 18 deletions(-)
> > > > > > > 
> > > > > > > diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> > > > > > > index 8dcabf84d866..87de5b5682ad 100644
> > > > > > > --- a/drivers/md/dm.c
> > > > > > > +++ b/drivers/md/dm.c
> > > > > > 
> > > > > > ...
> > > > > > 
> > > > > > > @@ -775,7 +778,7 @@ static void close_table_device(struct table_device *td, struct mapped_device *md
> > > > > > >  {
> > > > > > >  	if (md->disk->slave_dir)
> > > > > > >  		bd_unlink_disk_holder(td->dm_dev.bdev, md->disk);
> > > > > > > -	bdev_release(td->dm_dev.bdev_handle);
> > > > > > > +	fput(td->dm_dev.bdev_file);
> > > > > > 
> > > > > > The above change caused regression on 'dmsetup remove_all'.
> > > > > > 
> > > > > > blkdev_release() is delayed because of fput(), so dm_lock_for_deletion
> > > > > > returns -EBUSY, then this dm disk is skipped in remove_all().
> > > > > > 
> > > > > > Force to mark DMF_DEFERRED_REMOVE might solve it, but need our device
> > > > > > mapper guys to check if it is safe.
> > > > > > 
> > > > > > Or other better solution?
> > > > > 
> > > > > Yeah, I think there is. You can just switch all fput() instances in
> > > > > device mapper to bdev_fput() which is mainline now. This will yield the
> > > > > device and make it able to be reclaimed. Should be as simple as the
> > > > > patch below. Could you test this and send a patch based on this (I'm on
> > > > > a prolonged vacation so I don't have time right now.):
> > > > 
> > > > Unfortunately it doesn't work.
> > > > 
> > > > Here the problem is that blkdev_release() is delayed, which changes
> > > > 'dmsetup remove_all' behavior, and causes that some of dm disks aren't
> > > > removed.
> > > > 
> > > > Please see dm_lock_for_deletion() and dm_blk_open()/dm_blk_close().
> > > 
> > > So you really need blkdev_release() itself to be synchronous? Groan, in
> > 
> > At least the current dm implementation relies on this way sort of, and
> > it could be addressed by forcing to mark DMF_DEFERRED_REMOVE in
> > remove_all().
> > 
> > > that case use __fput_sync() instead of fput() which ensures that this
> > > file is closed synchronously.
> > 
> > I tried __fput_sync(), but the following panic is caused:
> > 
> > [  113.486522] ------------[ cut here ]------------
> > [  113.486524] kernel BUG at fs/file_table.c:453!
> > [  113.486531] invalid opcode: 0000 [#1] PREEMPT SMP NOPTI
> > [  113.488878] CPU: 6 PID: 1919 Comm: dmsetup Kdump: loaded Not tainted 5.14.0+ #23
> 
> Wait, how come this is 5.14 kernel? Apparently you're crashing on:
> 
> BUG_ON(!(task->flags & PF_KTHREAD));
> 
> but that is not present in current upstream (BUG_ON was removed in 6.6-rc1
> by commit 021a160abf62c).

Indeed, just tried the change on v6.9-rc3, looks it does work. 


Thanks,
Ming
diff mbox series

Patch

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 8dcabf84d866..87de5b5682ad 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -726,7 +726,8 @@  static struct table_device *open_table_device(struct mapped_device *md,
 		dev_t dev, blk_mode_t mode)
 {
 	struct table_device *td;
-	struct bdev_handle *bdev_handle;
+	struct file *bdev_file;
+	struct block_device *bdev;
 	u64 part_off;
 	int r;
 
@@ -735,34 +736,36 @@  static struct table_device *open_table_device(struct mapped_device *md,
 		return ERR_PTR(-ENOMEM);
 	refcount_set(&td->count, 1);
 
-	bdev_handle = bdev_open_by_dev(dev, mode, _dm_claim_ptr, NULL);
-	if (IS_ERR(bdev_handle)) {
-		r = PTR_ERR(bdev_handle);
+	bdev_file = bdev_file_open_by_dev(dev, mode, _dm_claim_ptr, NULL);
+	if (IS_ERR(bdev_file)) {
+		r = PTR_ERR(bdev_file);
 		goto out_free_td;
 	}
 
+	bdev = file_bdev(bdev_file);
+
 	/*
 	 * We can be called before the dm disk is added.  In that case we can't
 	 * register the holder relation here.  It will be done once add_disk was
 	 * called.
 	 */
 	if (md->disk->slave_dir) {
-		r = bd_link_disk_holder(bdev_handle->bdev, md->disk);
+		r = bd_link_disk_holder(bdev, md->disk);
 		if (r)
 			goto out_blkdev_put;
 	}
 
 	td->dm_dev.mode = mode;
-	td->dm_dev.bdev = bdev_handle->bdev;
-	td->dm_dev.bdev_handle = bdev_handle;
-	td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev_handle->bdev, &part_off,
+	td->dm_dev.bdev = bdev;
+	td->dm_dev.bdev_file = bdev_file;
+	td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off,
 						NULL, NULL);
 	format_dev_t(td->dm_dev.name, dev);
 	list_add(&td->list, &md->table_devices);
 	return td;
 
 out_blkdev_put:
-	bdev_release(bdev_handle);
+	fput(bdev_file);
 out_free_td:
 	kfree(td);
 	return ERR_PTR(r);
@@ -775,7 +778,7 @@  static void close_table_device(struct table_device *td, struct mapped_device *md
 {
 	if (md->disk->slave_dir)
 		bd_unlink_disk_holder(td->dm_dev.bdev, md->disk);
-	bdev_release(td->dm_dev.bdev_handle);
+	fput(td->dm_dev.bdev_file);
 	put_dax(td->dm_dev.dax_dev);
 	list_del(&td->list);
 	kfree(td);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 2266358d8074..0653584db63b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2578,7 +2578,7 @@  static void export_rdev(struct md_rdev *rdev, struct mddev *mddev)
 	if (test_bit(AutoDetected, &rdev->flags))
 		md_autodetect_dev(rdev->bdev->bd_dev);
 #endif
-	bdev_release(rdev->bdev_handle);
+	fput(rdev->bdev_file);
 	rdev->bdev = NULL;
 	kobject_put(&rdev->kobj);
 }
@@ -3773,16 +3773,16 @@  static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe
 	if (err)
 		goto out_clear_rdev;
 
-	rdev->bdev_handle = bdev_open_by_dev(newdev,
+	rdev->bdev_file = bdev_file_open_by_dev(newdev,
 			BLK_OPEN_READ | BLK_OPEN_WRITE,
 			super_format == -2 ? &claim_rdev : rdev, NULL);
-	if (IS_ERR(rdev->bdev_handle)) {
+	if (IS_ERR(rdev->bdev_file)) {
 		pr_warn("md: could not open device unknown-block(%u,%u).\n",
 			MAJOR(newdev), MINOR(newdev));
-		err = PTR_ERR(rdev->bdev_handle);
+		err = PTR_ERR(rdev->bdev_file);
 		goto out_clear_rdev;
 	}
-	rdev->bdev = rdev->bdev_handle->bdev;
+	rdev->bdev = file_bdev(rdev->bdev_file);
 
 	kobject_init(&rdev->kobj, &rdev_ktype);
 
@@ -3813,7 +3813,7 @@  static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe
 	return rdev;
 
 out_blkdev_put:
-	bdev_release(rdev->bdev_handle);
+	fput(rdev->bdev_file);
 out_clear_rdev:
 	md_rdev_clear(rdev);
 out_free_rdev:
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 8d881cc59799..a079ee9b6190 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -59,7 +59,7 @@  struct md_rdev {
 	 */
 	struct block_device *meta_bdev;
 	struct block_device *bdev;	/* block device handle */
-	struct bdev_handle *bdev_handle;	/* Handle from open for bdev */
+	struct file *bdev_file;		/* Handle from open for bdev */
 
 	struct page	*sb_page, *bb_page;
 	int		sb_loaded;
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 772ab4d74d94..82b2195efaca 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -165,7 +165,7 @@  void dm_error(const char *message);
 
 struct dm_dev {
 	struct block_device *bdev;
-	struct bdev_handle *bdev_handle;
+	struct file *bdev_file;
 	struct dax_device *dax_dev;
 	blk_mode_t mode;
 	char name[16];