Message ID | 20240123-vfs-bdev-file-v2-4-adbd023e19cc@kernel.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Open block devices as files | expand |
Looks good:
Reviewed-by: Christoph Hellwig <hch@lst.de>
On Tue 23-01-24 14:26:21, Christian Brauner wrote: > Signed-off-by: Christian Brauner <brauner@kernel.org> Looks good. Feel free to add: Reviewed-by: Jan Kara <jack@suse.cz> Honza > --- > drivers/md/dm.c | 23 +++++++++++++---------- > drivers/md/md.c | 12 ++++++------ > drivers/md/md.h | 2 +- > include/linux/device-mapper.h | 2 +- > 4 files changed, 21 insertions(+), 18 deletions(-) > > diff --git a/drivers/md/dm.c b/drivers/md/dm.c > index 8dcabf84d866..87de5b5682ad 100644 > --- a/drivers/md/dm.c > +++ b/drivers/md/dm.c > @@ -726,7 +726,8 @@ static struct table_device *open_table_device(struct mapped_device *md, > dev_t dev, blk_mode_t mode) > { > struct table_device *td; > - struct bdev_handle *bdev_handle; > + struct file *bdev_file; > + struct block_device *bdev; > u64 part_off; > int r; > > @@ -735,34 +736,36 @@ static struct table_device *open_table_device(struct mapped_device *md, > return ERR_PTR(-ENOMEM); > refcount_set(&td->count, 1); > > - bdev_handle = bdev_open_by_dev(dev, mode, _dm_claim_ptr, NULL); > - if (IS_ERR(bdev_handle)) { > - r = PTR_ERR(bdev_handle); > + bdev_file = bdev_file_open_by_dev(dev, mode, _dm_claim_ptr, NULL); > + if (IS_ERR(bdev_file)) { > + r = PTR_ERR(bdev_file); > goto out_free_td; > } > > + bdev = file_bdev(bdev_file); > + > /* > * We can be called before the dm disk is added. In that case we can't > * register the holder relation here. It will be done once add_disk was > * called. > */ > if (md->disk->slave_dir) { > - r = bd_link_disk_holder(bdev_handle->bdev, md->disk); > + r = bd_link_disk_holder(bdev, md->disk); > if (r) > goto out_blkdev_put; > } > > td->dm_dev.mode = mode; > - td->dm_dev.bdev = bdev_handle->bdev; > - td->dm_dev.bdev_handle = bdev_handle; > - td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev_handle->bdev, &part_off, > + td->dm_dev.bdev = bdev; > + td->dm_dev.bdev_file = bdev_file; > + td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off, > NULL, NULL); > format_dev_t(td->dm_dev.name, dev); > list_add(&td->list, &md->table_devices); > return td; > > out_blkdev_put: > - bdev_release(bdev_handle); > + fput(bdev_file); > out_free_td: > kfree(td); > return ERR_PTR(r); > @@ -775,7 +778,7 @@ static void close_table_device(struct table_device *td, struct mapped_device *md > { > if (md->disk->slave_dir) > bd_unlink_disk_holder(td->dm_dev.bdev, md->disk); > - bdev_release(td->dm_dev.bdev_handle); > + fput(td->dm_dev.bdev_file); > put_dax(td->dm_dev.dax_dev); > list_del(&td->list); > kfree(td); > diff --git a/drivers/md/md.c b/drivers/md/md.c > index 2266358d8074..0653584db63b 100644 > --- a/drivers/md/md.c > +++ b/drivers/md/md.c > @@ -2578,7 +2578,7 @@ static void export_rdev(struct md_rdev *rdev, struct mddev *mddev) > if (test_bit(AutoDetected, &rdev->flags)) > md_autodetect_dev(rdev->bdev->bd_dev); > #endif > - bdev_release(rdev->bdev_handle); > + fput(rdev->bdev_file); > rdev->bdev = NULL; > kobject_put(&rdev->kobj); > } > @@ -3773,16 +3773,16 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe > if (err) > goto out_clear_rdev; > > - rdev->bdev_handle = bdev_open_by_dev(newdev, > + rdev->bdev_file = bdev_file_open_by_dev(newdev, > BLK_OPEN_READ | BLK_OPEN_WRITE, > super_format == -2 ? &claim_rdev : rdev, NULL); > - if (IS_ERR(rdev->bdev_handle)) { > + if (IS_ERR(rdev->bdev_file)) { > pr_warn("md: could not open device unknown-block(%u,%u).\n", > MAJOR(newdev), MINOR(newdev)); > - err = PTR_ERR(rdev->bdev_handle); > + err = PTR_ERR(rdev->bdev_file); > goto out_clear_rdev; > } > - rdev->bdev = rdev->bdev_handle->bdev; > + rdev->bdev = file_bdev(rdev->bdev_file); > > kobject_init(&rdev->kobj, &rdev_ktype); > > @@ -3813,7 +3813,7 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe > return rdev; > > out_blkdev_put: > - bdev_release(rdev->bdev_handle); > + fput(rdev->bdev_file); > out_clear_rdev: > md_rdev_clear(rdev); > out_free_rdev: > diff --git a/drivers/md/md.h b/drivers/md/md.h > index 8d881cc59799..a079ee9b6190 100644 > --- a/drivers/md/md.h > +++ b/drivers/md/md.h > @@ -59,7 +59,7 @@ struct md_rdev { > */ > struct block_device *meta_bdev; > struct block_device *bdev; /* block device handle */ > - struct bdev_handle *bdev_handle; /* Handle from open for bdev */ > + struct file *bdev_file; /* Handle from open for bdev */ > > struct page *sb_page, *bb_page; > int sb_loaded; > diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h > index 772ab4d74d94..82b2195efaca 100644 > --- a/include/linux/device-mapper.h > +++ b/include/linux/device-mapper.h > @@ -165,7 +165,7 @@ void dm_error(const char *message); > > struct dm_dev { > struct block_device *bdev; > - struct bdev_handle *bdev_handle; > + struct file *bdev_file; > struct dax_device *dax_dev; > blk_mode_t mode; > char name[16]; > > -- > 2.43.0 >
Hello, On Tue, Jan 23, 2024 at 02:26:21PM +0100, Christian Brauner wrote: > Signed-off-by: Christian Brauner <brauner@kernel.org> > --- > drivers/md/dm.c | 23 +++++++++++++---------- > drivers/md/md.c | 12 ++++++------ > drivers/md/md.h | 2 +- > include/linux/device-mapper.h | 2 +- > 4 files changed, 21 insertions(+), 18 deletions(-) > > diff --git a/drivers/md/dm.c b/drivers/md/dm.c > index 8dcabf84d866..87de5b5682ad 100644 > --- a/drivers/md/dm.c > +++ b/drivers/md/dm.c ... > @@ -775,7 +778,7 @@ static void close_table_device(struct table_device *td, struct mapped_device *md > { > if (md->disk->slave_dir) > bd_unlink_disk_holder(td->dm_dev.bdev, md->disk); > - bdev_release(td->dm_dev.bdev_handle); > + fput(td->dm_dev.bdev_file); The above change caused regression on 'dmsetup remove_all'. blkdev_release() is delayed because of fput(), so dm_lock_for_deletion returns -EBUSY, then this dm disk is skipped in remove_all(). Force to mark DMF_DEFERRED_REMOVE might solve it, but need our device mapper guys to check if it is safe. Or other better solution? thanks, Ming
On Mon, Apr 15, 2024 at 05:26:19PM +0800, Ming Lei wrote: > Hello, > > On Tue, Jan 23, 2024 at 02:26:21PM +0100, Christian Brauner wrote: > > Signed-off-by: Christian Brauner <brauner@kernel.org> > > --- > > drivers/md/dm.c | 23 +++++++++++++---------- > > drivers/md/md.c | 12 ++++++------ > > drivers/md/md.h | 2 +- > > include/linux/device-mapper.h | 2 +- > > 4 files changed, 21 insertions(+), 18 deletions(-) > > > > diff --git a/drivers/md/dm.c b/drivers/md/dm.c > > index 8dcabf84d866..87de5b5682ad 100644 > > --- a/drivers/md/dm.c > > +++ b/drivers/md/dm.c > > ... > > > @@ -775,7 +778,7 @@ static void close_table_device(struct table_device *td, struct mapped_device *md > > { > > if (md->disk->slave_dir) > > bd_unlink_disk_holder(td->dm_dev.bdev, md->disk); > > - bdev_release(td->dm_dev.bdev_handle); > > + fput(td->dm_dev.bdev_file); > > The above change caused regression on 'dmsetup remove_all'. > > blkdev_release() is delayed because of fput(), so dm_lock_for_deletion > returns -EBUSY, then this dm disk is skipped in remove_all(). > > Force to mark DMF_DEFERRED_REMOVE might solve it, but need our device > mapper guys to check if it is safe. > > Or other better solution? Yeah, I think there is. You can just switch all fput() instances in device mapper to bdev_fput() which is mainline now. This will yield the device and make it able to be reclaimed. Should be as simple as the patch below. Could you test this and send a patch based on this (I'm on a prolonged vacation so I don't have time right now.): diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 56aa2a8b9d71..0f681a1e70af 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -765,7 +765,7 @@ static struct table_device *open_table_device(struct mapped_device *md, return td; out_blkdev_put: - fput(bdev_file); + bdev_fput(bdev_file); out_free_td: kfree(td); return ERR_PTR(r); @@ -778,7 +778,7 @@ static void close_table_device(struct table_device *td, struct mapped_device *md { if (md->disk->slave_dir) bd_unlink_disk_holder(td->dm_dev.bdev, md->disk); - fput(td->dm_dev.bdev_file); + bdev_fput(td->dm_dev.bdev_file); put_dax(td->dm_dev.dax_dev); list_del(&td->list); kfree(td);
On Mon, Apr 15 2024 at 8:35P -0400, Christian Brauner <brauner@kernel.org> wrote: > On Mon, Apr 15, 2024 at 05:26:19PM +0800, Ming Lei wrote: > > Hello, > > > > On Tue, Jan 23, 2024 at 02:26:21PM +0100, Christian Brauner wrote: > > > Signed-off-by: Christian Brauner <brauner@kernel.org> > > > --- > > > drivers/md/dm.c | 23 +++++++++++++---------- > > > drivers/md/md.c | 12 ++++++------ > > > drivers/md/md.h | 2 +- > > > include/linux/device-mapper.h | 2 +- > > > 4 files changed, 21 insertions(+), 18 deletions(-) > > > > > > diff --git a/drivers/md/dm.c b/drivers/md/dm.c > > > index 8dcabf84d866..87de5b5682ad 100644 > > > --- a/drivers/md/dm.c > > > +++ b/drivers/md/dm.c > > > > ... > > > > > @@ -775,7 +778,7 @@ static void close_table_device(struct table_device *td, struct mapped_device *md > > > { > > > if (md->disk->slave_dir) > > > bd_unlink_disk_holder(td->dm_dev.bdev, md->disk); > > > - bdev_release(td->dm_dev.bdev_handle); > > > + fput(td->dm_dev.bdev_file); > > > > The above change caused regression on 'dmsetup remove_all'. > > > > blkdev_release() is delayed because of fput(), so dm_lock_for_deletion > > returns -EBUSY, then this dm disk is skipped in remove_all(). > > > > Force to mark DMF_DEFERRED_REMOVE might solve it, but need our device > > mapper guys to check if it is safe. > > > > Or other better solution? > > Yeah, I think there is. You can just switch all fput() instances in > device mapper to bdev_fput() which is mainline now. This will yield the > device and make it able to be reclaimed. Should be as simple as the > patch below. Could you test this and send a patch based on this (I'm on > a prolonged vacation so I don't have time right now.): > > diff --git a/drivers/md/dm.c b/drivers/md/dm.c > index 56aa2a8b9d71..0f681a1e70af 100644 > --- a/drivers/md/dm.c > +++ b/drivers/md/dm.c > @@ -765,7 +765,7 @@ static struct table_device *open_table_device(struct mapped_device *md, > return td; > > out_blkdev_put: > - fput(bdev_file); > + bdev_fput(bdev_file); > out_free_td: > kfree(td); > return ERR_PTR(r); > @@ -778,7 +778,7 @@ static void close_table_device(struct table_device *td, struct mapped_device *md > { > if (md->disk->slave_dir) > bd_unlink_disk_holder(td->dm_dev.bdev, md->disk); > - fput(td->dm_dev.bdev_file); > + bdev_fput(td->dm_dev.bdev_file); > put_dax(td->dm_dev.dax_dev); > list_del(&td->list); > kfree(td); > > Thanks. I'll work with Ming and others to take care of it. Have a great vacation!
On Mon, Apr 15, 2024 at 02:35:17PM +0200, Christian Brauner wrote: > On Mon, Apr 15, 2024 at 05:26:19PM +0800, Ming Lei wrote: > > Hello, > > > > On Tue, Jan 23, 2024 at 02:26:21PM +0100, Christian Brauner wrote: > > > Signed-off-by: Christian Brauner <brauner@kernel.org> > > > --- > > > drivers/md/dm.c | 23 +++++++++++++---------- > > > drivers/md/md.c | 12 ++++++------ > > > drivers/md/md.h | 2 +- > > > include/linux/device-mapper.h | 2 +- > > > 4 files changed, 21 insertions(+), 18 deletions(-) > > > > > > diff --git a/drivers/md/dm.c b/drivers/md/dm.c > > > index 8dcabf84d866..87de5b5682ad 100644 > > > --- a/drivers/md/dm.c > > > +++ b/drivers/md/dm.c > > > > ... > > > > > @@ -775,7 +778,7 @@ static void close_table_device(struct table_device *td, struct mapped_device *md > > > { > > > if (md->disk->slave_dir) > > > bd_unlink_disk_holder(td->dm_dev.bdev, md->disk); > > > - bdev_release(td->dm_dev.bdev_handle); > > > + fput(td->dm_dev.bdev_file); > > > > The above change caused regression on 'dmsetup remove_all'. > > > > blkdev_release() is delayed because of fput(), so dm_lock_for_deletion > > returns -EBUSY, then this dm disk is skipped in remove_all(). > > > > Force to mark DMF_DEFERRED_REMOVE might solve it, but need our device > > mapper guys to check if it is safe. > > > > Or other better solution? > > Yeah, I think there is. You can just switch all fput() instances in > device mapper to bdev_fput() which is mainline now. This will yield the > device and make it able to be reclaimed. Should be as simple as the > patch below. Could you test this and send a patch based on this (I'm on > a prolonged vacation so I don't have time right now.): Unfortunately it doesn't work. Here the problem is that blkdev_release() is delayed, which changes 'dmsetup remove_all' behavior, and causes that some of dm disks aren't removed. Please see dm_lock_for_deletion() and dm_blk_open()/dm_blk_close(). Thanks, Ming
On Mon, Apr 15, 2024 at 10:35:53PM +0800, Ming Lei wrote: > On Mon, Apr 15, 2024 at 02:35:17PM +0200, Christian Brauner wrote: > > On Mon, Apr 15, 2024 at 05:26:19PM +0800, Ming Lei wrote: > > > Hello, > > > > > > On Tue, Jan 23, 2024 at 02:26:21PM +0100, Christian Brauner wrote: > > > > Signed-off-by: Christian Brauner <brauner@kernel.org> > > > > --- > > > > drivers/md/dm.c | 23 +++++++++++++---------- > > > > drivers/md/md.c | 12 ++++++------ > > > > drivers/md/md.h | 2 +- > > > > include/linux/device-mapper.h | 2 +- > > > > 4 files changed, 21 insertions(+), 18 deletions(-) > > > > > > > > diff --git a/drivers/md/dm.c b/drivers/md/dm.c > > > > index 8dcabf84d866..87de5b5682ad 100644 > > > > --- a/drivers/md/dm.c > > > > +++ b/drivers/md/dm.c > > > > > > ... > > > > > > > @@ -775,7 +778,7 @@ static void close_table_device(struct table_device *td, struct mapped_device *md > > > > { > > > > if (md->disk->slave_dir) > > > > bd_unlink_disk_holder(td->dm_dev.bdev, md->disk); > > > > - bdev_release(td->dm_dev.bdev_handle); > > > > + fput(td->dm_dev.bdev_file); > > > > > > The above change caused regression on 'dmsetup remove_all'. > > > > > > blkdev_release() is delayed because of fput(), so dm_lock_for_deletion > > > returns -EBUSY, then this dm disk is skipped in remove_all(). > > > > > > Force to mark DMF_DEFERRED_REMOVE might solve it, but need our device > > > mapper guys to check if it is safe. > > > > > > Or other better solution? > > > > Yeah, I think there is. You can just switch all fput() instances in > > device mapper to bdev_fput() which is mainline now. This will yield the > > device and make it able to be reclaimed. Should be as simple as the > > patch below. Could you test this and send a patch based on this (I'm on > > a prolonged vacation so I don't have time right now.): > > Unfortunately it doesn't work. > > Here the problem is that blkdev_release() is delayed, which changes > 'dmsetup remove_all' behavior, and causes that some of dm disks aren't > removed. > > Please see dm_lock_for_deletion() and dm_blk_open()/dm_blk_close(). So you really need blkdev_release() itself to be synchronous? Groan, in that case use __fput_sync() instead of fput() which ensures that this file is closed synchronously.
On Mon, Apr 15, 2024 at 04:53:42PM +0200, Christian Brauner wrote: > On Mon, Apr 15, 2024 at 10:35:53PM +0800, Ming Lei wrote: > > On Mon, Apr 15, 2024 at 02:35:17PM +0200, Christian Brauner wrote: > > > On Mon, Apr 15, 2024 at 05:26:19PM +0800, Ming Lei wrote: > > > > Hello, > > > > > > > > On Tue, Jan 23, 2024 at 02:26:21PM +0100, Christian Brauner wrote: > > > > > Signed-off-by: Christian Brauner <brauner@kernel.org> > > > > > --- > > > > > drivers/md/dm.c | 23 +++++++++++++---------- > > > > > drivers/md/md.c | 12 ++++++------ > > > > > drivers/md/md.h | 2 +- > > > > > include/linux/device-mapper.h | 2 +- > > > > > 4 files changed, 21 insertions(+), 18 deletions(-) > > > > > > > > > > diff --git a/drivers/md/dm.c b/drivers/md/dm.c > > > > > index 8dcabf84d866..87de5b5682ad 100644 > > > > > --- a/drivers/md/dm.c > > > > > +++ b/drivers/md/dm.c > > > > > > > > ... > > > > > > > > > @@ -775,7 +778,7 @@ static void close_table_device(struct table_device *td, struct mapped_device *md > > > > > { > > > > > if (md->disk->slave_dir) > > > > > bd_unlink_disk_holder(td->dm_dev.bdev, md->disk); > > > > > - bdev_release(td->dm_dev.bdev_handle); > > > > > + fput(td->dm_dev.bdev_file); > > > > > > > > The above change caused regression on 'dmsetup remove_all'. > > > > > > > > blkdev_release() is delayed because of fput(), so dm_lock_for_deletion > > > > returns -EBUSY, then this dm disk is skipped in remove_all(). > > > > > > > > Force to mark DMF_DEFERRED_REMOVE might solve it, but need our device > > > > mapper guys to check if it is safe. > > > > > > > > Or other better solution? > > > > > > Yeah, I think there is. You can just switch all fput() instances in > > > device mapper to bdev_fput() which is mainline now. This will yield the > > > device and make it able to be reclaimed. Should be as simple as the > > > patch below. Could you test this and send a patch based on this (I'm on > > > a prolonged vacation so I don't have time right now.): > > > > Unfortunately it doesn't work. > > > > Here the problem is that blkdev_release() is delayed, which changes > > 'dmsetup remove_all' behavior, and causes that some of dm disks aren't > > removed. > > > > Please see dm_lock_for_deletion() and dm_blk_open()/dm_blk_close(). > > So you really need blkdev_release() itself to be synchronous? Groan, in At least the current dm implementation relies on this way sort of, and it could be addressed by forcing to mark DMF_DEFERRED_REMOVE in remove_all(). > that case use __fput_sync() instead of fput() which ensures that this > file is closed synchronously. I tried __fput_sync(), but the following panic is caused: [ 113.486522] ------------[ cut here ]------------ [ 113.486524] kernel BUG at fs/file_table.c:453! [ 113.486531] invalid opcode: 0000 [#1] PREEMPT SMP NOPTI [ 113.488878] CPU: 6 PID: 1919 Comm: dmsetup Kdump: loaded Not tainted 5.14.0+ #23 [ 113.490114] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-1.fc37 04/01/2014 [ 113.491661] RIP: 0010:__fput_sync+0x25/0x30 [ 113.492562] Code: 90 90 90 90 90 0f 1f 44 00 00 f0 48 ff 4f 38 75 14 65 48 8b 04 25 40 25 03 00 f6 40 36 20 74 0a e9 20 fd ff ff c3 cc cc cc cc <0f0 [ 113.493926] RSP: 0018:ffffb76581003c20 EFLAGS: 00010246 [ 113.494220] RAX: ffff92eca6ef8000 RBX: ffff92ed176c3c18 RCX: 000000008080007c [ 113.494632] RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffff92ec844cac00 [ 113.495033] RBP: ffff92ed176c3c00 R08: 0000000000000001 R09: 0000000000000000 [ 113.495378] R10: ffffb76581003b00 R11: ffffb76581003b68 R12: ffff92ec8fccec20 [ 113.495723] R13: ffff92ec8431b400 R14: ffff92ec8431b508 R15: ffff92ec8fccec00 [ 113.496108] FS: 00007f5be5638840(0000) GS:ffff92f0ebb80000(0000) knlGS:0000000000000000 [ 113.496581] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 113.496907] CR2: 00007f5be54694b0 CR3: 0000000108e54003 CR4: 0000000000770ef0 [ 113.497308] PKRU: 55555554 [ 113.497469] Call Trace: [ 113.497613] <TASK> [ 113.497741] ? show_trace_log_lvl+0x1c4/0x2df [ 113.497997] ? show_trace_log_lvl+0x1c4/0x2df [ 113.498251] ? dm_put_table_device+0x64/0xd0 [dm_mod] [ 113.498553] ? __die_body.cold+0x8/0xd [ 113.498768] ? die+0x2b/0x50 [ 113.498937] ? do_trap+0xce/0x120 [ 113.499129] ? __fput_sync+0x25/0x30 [ 113.499337] ? do_error_trap+0x65/0x80 [ 113.499577] ? __fput_sync+0x25/0x30 [ 113.499787] ? exc_invalid_op+0x4e/0x70 [ 113.500011] ? __fput_sync+0x25/0x30 [ 113.500239] ? asm_exc_invalid_op+0x16/0x20 [ 113.500842] ? __fput_sync+0x25/0x30 [ 113.501387] dm_put_table_device+0x64/0xd0 [dm_mod] [ 113.502047] dm_put_device+0x80/0x110 [dm_mod] [ 113.502650] stripe_dtr+0x2f/0x50 [dm_mod] [ 113.503218] dm_table_destroy+0x59/0x120 [dm_mod] [ 113.503842] __dm_destroy+0x114/0x1e0 [dm_mod] [ 113.504402] dm_hash_remove_all+0x63/0x160 [dm_mod] [ 113.505028] remove_all+0x1e/0x30 [dm_mod] [ 113.505602] ctl_ioctl+0x19f/0x290 [dm_mod] [ 113.506146] dm_ctl_ioctl+0xa/0x20 [dm_mod] [ 113.506717] __x64_sys_ioctl+0x87/0xc0 [ 113.507230] do_syscall_64+0x5c/0xf0 [ 113.507755] ? exc_page_fault+0x62/0x150 [ 113.508309] entry_SYSCALL_64_after_hwframe+0x6e/0x76 [ 113.508945] RIP: 0033:0x7f5be543ec6b Thanks. Ming
On Mon, Apr 15, 2024 at 11:11:50PM +0800, Ming Lei wrote: > On Mon, Apr 15, 2024 at 04:53:42PM +0200, Christian Brauner wrote: > > On Mon, Apr 15, 2024 at 10:35:53PM +0800, Ming Lei wrote: > > > On Mon, Apr 15, 2024 at 02:35:17PM +0200, Christian Brauner wrote: > > > > On Mon, Apr 15, 2024 at 05:26:19PM +0800, Ming Lei wrote: > > > > > Hello, > > > > > > > > > > On Tue, Jan 23, 2024 at 02:26:21PM +0100, Christian Brauner wrote: > > > > > > Signed-off-by: Christian Brauner <brauner@kernel.org> > > > > > > --- > > > > > > drivers/md/dm.c | 23 +++++++++++++---------- > > > > > > drivers/md/md.c | 12 ++++++------ > > > > > > drivers/md/md.h | 2 +- > > > > > > include/linux/device-mapper.h | 2 +- > > > > > > 4 files changed, 21 insertions(+), 18 deletions(-) > > > > > > > > > > > > diff --git a/drivers/md/dm.c b/drivers/md/dm.c > > > > > > index 8dcabf84d866..87de5b5682ad 100644 > > > > > > --- a/drivers/md/dm.c > > > > > > +++ b/drivers/md/dm.c > > > > > > > > > > ... > > > > > > > > > > > @@ -775,7 +778,7 @@ static void close_table_device(struct table_device *td, struct mapped_device *md > > > > > > { > > > > > > if (md->disk->slave_dir) > > > > > > bd_unlink_disk_holder(td->dm_dev.bdev, md->disk); > > > > > > - bdev_release(td->dm_dev.bdev_handle); > > > > > > + fput(td->dm_dev.bdev_file); > > > > > > > > > > The above change caused regression on 'dmsetup remove_all'. > > > > > > > > > > blkdev_release() is delayed because of fput(), so dm_lock_for_deletion > > > > > returns -EBUSY, then this dm disk is skipped in remove_all(). > > > > > > > > > > Force to mark DMF_DEFERRED_REMOVE might solve it, but need our device > > > > > mapper guys to check if it is safe. > > > > > > > > > > Or other better solution? > > > > > > > > Yeah, I think there is. You can just switch all fput() instances in > > > > device mapper to bdev_fput() which is mainline now. This will yield the > > > > device and make it able to be reclaimed. Should be as simple as the > > > > patch below. Could you test this and send a patch based on this (I'm on > > > > a prolonged vacation so I don't have time right now.): > > > > > > Unfortunately it doesn't work. > > > > > > Here the problem is that blkdev_release() is delayed, which changes > > > 'dmsetup remove_all' behavior, and causes that some of dm disks aren't > > > removed. > > > > > > Please see dm_lock_for_deletion() and dm_blk_open()/dm_blk_close(). > > > > So you really need blkdev_release() itself to be synchronous? Groan, in > > At least the current dm implementation relies on this way sort of, and > it could be addressed by forcing to mark DMF_DEFERRED_REMOVE in > remove_all(). You floated that earlier in this thread, etc: no, that would change the interface. DMF_DEFERRED_REMOVE gives people options to allow for async device closes, etc. But I don't want to impose it as some faux equivalent to the sync model remove_all has always provided. And what about simple 'dmsetup remove'? remove_all just loops doing remove... so isn't 'dmsetup remove' also being forced to be async as of commit a28d893eb3270 ("md: port block device access to file")? dm.c:dm_put_device -> dm_put_table_device -> close_table_device > > that case use __fput_sync() instead of fput() which ensures that this > > file is closed synchronously. > > I tried __fput_sync(), but the following panic is caused: Ok, so more work needed. But we need to preserve the existing sync interface for DM device removal. Mike
On Mon 15-04-24 23:11:50, Ming Lei wrote: > On Mon, Apr 15, 2024 at 04:53:42PM +0200, Christian Brauner wrote: > > On Mon, Apr 15, 2024 at 10:35:53PM +0800, Ming Lei wrote: > > > On Mon, Apr 15, 2024 at 02:35:17PM +0200, Christian Brauner wrote: > > > > On Mon, Apr 15, 2024 at 05:26:19PM +0800, Ming Lei wrote: > > > > > Hello, > > > > > > > > > > On Tue, Jan 23, 2024 at 02:26:21PM +0100, Christian Brauner wrote: > > > > > > Signed-off-by: Christian Brauner <brauner@kernel.org> > > > > > > --- > > > > > > drivers/md/dm.c | 23 +++++++++++++---------- > > > > > > drivers/md/md.c | 12 ++++++------ > > > > > > drivers/md/md.h | 2 +- > > > > > > include/linux/device-mapper.h | 2 +- > > > > > > 4 files changed, 21 insertions(+), 18 deletions(-) > > > > > > > > > > > > diff --git a/drivers/md/dm.c b/drivers/md/dm.c > > > > > > index 8dcabf84d866..87de5b5682ad 100644 > > > > > > --- a/drivers/md/dm.c > > > > > > +++ b/drivers/md/dm.c > > > > > > > > > > ... > > > > > > > > > > > @@ -775,7 +778,7 @@ static void close_table_device(struct table_device *td, struct mapped_device *md > > > > > > { > > > > > > if (md->disk->slave_dir) > > > > > > bd_unlink_disk_holder(td->dm_dev.bdev, md->disk); > > > > > > - bdev_release(td->dm_dev.bdev_handle); > > > > > > + fput(td->dm_dev.bdev_file); > > > > > > > > > > The above change caused regression on 'dmsetup remove_all'. > > > > > > > > > > blkdev_release() is delayed because of fput(), so dm_lock_for_deletion > > > > > returns -EBUSY, then this dm disk is skipped in remove_all(). > > > > > > > > > > Force to mark DMF_DEFERRED_REMOVE might solve it, but need our device > > > > > mapper guys to check if it is safe. > > > > > > > > > > Or other better solution? > > > > > > > > Yeah, I think there is. You can just switch all fput() instances in > > > > device mapper to bdev_fput() which is mainline now. This will yield the > > > > device and make it able to be reclaimed. Should be as simple as the > > > > patch below. Could you test this and send a patch based on this (I'm on > > > > a prolonged vacation so I don't have time right now.): > > > > > > Unfortunately it doesn't work. > > > > > > Here the problem is that blkdev_release() is delayed, which changes > > > 'dmsetup remove_all' behavior, and causes that some of dm disks aren't > > > removed. > > > > > > Please see dm_lock_for_deletion() and dm_blk_open()/dm_blk_close(). > > > > So you really need blkdev_release() itself to be synchronous? Groan, in > > At least the current dm implementation relies on this way sort of, and > it could be addressed by forcing to mark DMF_DEFERRED_REMOVE in > remove_all(). > > > that case use __fput_sync() instead of fput() which ensures that this > > file is closed synchronously. > > I tried __fput_sync(), but the following panic is caused: > > [ 113.486522] ------------[ cut here ]------------ > [ 113.486524] kernel BUG at fs/file_table.c:453! > [ 113.486531] invalid opcode: 0000 [#1] PREEMPT SMP NOPTI > [ 113.488878] CPU: 6 PID: 1919 Comm: dmsetup Kdump: loaded Not tainted 5.14.0+ #23 Wait, how come this is 5.14 kernel? Apparently you're crashing on: BUG_ON(!(task->flags & PF_KTHREAD)); but that is not present in current upstream (BUG_ON was removed in 6.6-rc1 by commit 021a160abf62c). Honza > [ 113.490114] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-1.fc37 04/01/2014 > [ 113.491661] RIP: 0010:__fput_sync+0x25/0x30 > [ 113.492562] Code: 90 90 90 90 90 0f 1f 44 00 00 f0 48 ff 4f 38 75 14 65 48 8b 04 25 40 25 03 00 f6 40 36 20 74 0a e9 20 fd ff ff c3 cc cc cc cc <0f0 > [ 113.493926] RSP: 0018:ffffb76581003c20 EFLAGS: 00010246 > [ 113.494220] RAX: ffff92eca6ef8000 RBX: ffff92ed176c3c18 RCX: 000000008080007c > [ 113.494632] RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffff92ec844cac00 > [ 113.495033] RBP: ffff92ed176c3c00 R08: 0000000000000001 R09: 0000000000000000 > [ 113.495378] R10: ffffb76581003b00 R11: ffffb76581003b68 R12: ffff92ec8fccec20 > [ 113.495723] R13: ffff92ec8431b400 R14: ffff92ec8431b508 R15: ffff92ec8fccec00 > [ 113.496108] FS: 00007f5be5638840(0000) GS:ffff92f0ebb80000(0000) knlGS:0000000000000000 > [ 113.496581] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > [ 113.496907] CR2: 00007f5be54694b0 CR3: 0000000108e54003 CR4: 0000000000770ef0 > [ 113.497308] PKRU: 55555554 > [ 113.497469] Call Trace: > [ 113.497613] <TASK> > [ 113.497741] ? show_trace_log_lvl+0x1c4/0x2df > [ 113.497997] ? show_trace_log_lvl+0x1c4/0x2df > [ 113.498251] ? dm_put_table_device+0x64/0xd0 [dm_mod] > [ 113.498553] ? __die_body.cold+0x8/0xd > [ 113.498768] ? die+0x2b/0x50 > [ 113.498937] ? do_trap+0xce/0x120 > [ 113.499129] ? __fput_sync+0x25/0x30 > [ 113.499337] ? do_error_trap+0x65/0x80 > [ 113.499577] ? __fput_sync+0x25/0x30 > [ 113.499787] ? exc_invalid_op+0x4e/0x70 > [ 113.500011] ? __fput_sync+0x25/0x30 > [ 113.500239] ? asm_exc_invalid_op+0x16/0x20 > [ 113.500842] ? __fput_sync+0x25/0x30 > [ 113.501387] dm_put_table_device+0x64/0xd0 [dm_mod] > [ 113.502047] dm_put_device+0x80/0x110 [dm_mod] > [ 113.502650] stripe_dtr+0x2f/0x50 [dm_mod] > [ 113.503218] dm_table_destroy+0x59/0x120 [dm_mod] > [ 113.503842] __dm_destroy+0x114/0x1e0 [dm_mod] > [ 113.504402] dm_hash_remove_all+0x63/0x160 [dm_mod] > [ 113.505028] remove_all+0x1e/0x30 [dm_mod] > [ 113.505602] ctl_ioctl+0x19f/0x290 [dm_mod] > [ 113.506146] dm_ctl_ioctl+0xa/0x20 [dm_mod] > [ 113.506717] __x64_sys_ioctl+0x87/0xc0 > [ 113.507230] do_syscall_64+0x5c/0xf0 > [ 113.507755] ? exc_page_fault+0x62/0x150 > [ 113.508309] entry_SYSCALL_64_after_hwframe+0x6e/0x76 > [ 113.508945] RIP: 0033:0x7f5be543ec6b > > > > Thanks. > Ming >
On Mon, Apr 15, 2024 at 06:22:10PM +0200, Jan Kara wrote: > On Mon 15-04-24 23:11:50, Ming Lei wrote: > > On Mon, Apr 15, 2024 at 04:53:42PM +0200, Christian Brauner wrote: > > > On Mon, Apr 15, 2024 at 10:35:53PM +0800, Ming Lei wrote: > > > > On Mon, Apr 15, 2024 at 02:35:17PM +0200, Christian Brauner wrote: > > > > > On Mon, Apr 15, 2024 at 05:26:19PM +0800, Ming Lei wrote: > > > > > > Hello, > > > > > > > > > > > > On Tue, Jan 23, 2024 at 02:26:21PM +0100, Christian Brauner wrote: > > > > > > > Signed-off-by: Christian Brauner <brauner@kernel.org> > > > > > > > --- > > > > > > > drivers/md/dm.c | 23 +++++++++++++---------- > > > > > > > drivers/md/md.c | 12 ++++++------ > > > > > > > drivers/md/md.h | 2 +- > > > > > > > include/linux/device-mapper.h | 2 +- > > > > > > > 4 files changed, 21 insertions(+), 18 deletions(-) > > > > > > > > > > > > > > diff --git a/drivers/md/dm.c b/drivers/md/dm.c > > > > > > > index 8dcabf84d866..87de5b5682ad 100644 > > > > > > > --- a/drivers/md/dm.c > > > > > > > +++ b/drivers/md/dm.c > > > > > > > > > > > > ... > > > > > > > > > > > > > @@ -775,7 +778,7 @@ static void close_table_device(struct table_device *td, struct mapped_device *md > > > > > > > { > > > > > > > if (md->disk->slave_dir) > > > > > > > bd_unlink_disk_holder(td->dm_dev.bdev, md->disk); > > > > > > > - bdev_release(td->dm_dev.bdev_handle); > > > > > > > + fput(td->dm_dev.bdev_file); > > > > > > > > > > > > The above change caused regression on 'dmsetup remove_all'. > > > > > > > > > > > > blkdev_release() is delayed because of fput(), so dm_lock_for_deletion > > > > > > returns -EBUSY, then this dm disk is skipped in remove_all(). > > > > > > > > > > > > Force to mark DMF_DEFERRED_REMOVE might solve it, but need our device > > > > > > mapper guys to check if it is safe. > > > > > > > > > > > > Or other better solution? > > > > > > > > > > Yeah, I think there is. You can just switch all fput() instances in > > > > > device mapper to bdev_fput() which is mainline now. This will yield the > > > > > device and make it able to be reclaimed. Should be as simple as the > > > > > patch below. Could you test this and send a patch based on this (I'm on > > > > > a prolonged vacation so I don't have time right now.): > > > > > > > > Unfortunately it doesn't work. > > > > > > > > Here the problem is that blkdev_release() is delayed, which changes > > > > 'dmsetup remove_all' behavior, and causes that some of dm disks aren't > > > > removed. > > > > > > > > Please see dm_lock_for_deletion() and dm_blk_open()/dm_blk_close(). > > > > > > So you really need blkdev_release() itself to be synchronous? Groan, in > > > > At least the current dm implementation relies on this way sort of, and > > it could be addressed by forcing to mark DMF_DEFERRED_REMOVE in > > remove_all(). > > > > > that case use __fput_sync() instead of fput() which ensures that this > > > file is closed synchronously. > > > > I tried __fput_sync(), but the following panic is caused: > > > > [ 113.486522] ------------[ cut here ]------------ > > [ 113.486524] kernel BUG at fs/file_table.c:453! > > [ 113.486531] invalid opcode: 0000 [#1] PREEMPT SMP NOPTI > > [ 113.488878] CPU: 6 PID: 1919 Comm: dmsetup Kdump: loaded Not tainted 5.14.0+ #23 > > Wait, how come this is 5.14 kernel? Apparently you're crashing on: > > BUG_ON(!(task->flags & PF_KTHREAD)); > > but that is not present in current upstream (BUG_ON was removed in 6.6-rc1 > by commit 021a160abf62c). Indeed, just tried the change on v6.9-rc3, looks it does work. Thanks, Ming
diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 8dcabf84d866..87de5b5682ad 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -726,7 +726,8 @@ static struct table_device *open_table_device(struct mapped_device *md, dev_t dev, blk_mode_t mode) { struct table_device *td; - struct bdev_handle *bdev_handle; + struct file *bdev_file; + struct block_device *bdev; u64 part_off; int r; @@ -735,34 +736,36 @@ static struct table_device *open_table_device(struct mapped_device *md, return ERR_PTR(-ENOMEM); refcount_set(&td->count, 1); - bdev_handle = bdev_open_by_dev(dev, mode, _dm_claim_ptr, NULL); - if (IS_ERR(bdev_handle)) { - r = PTR_ERR(bdev_handle); + bdev_file = bdev_file_open_by_dev(dev, mode, _dm_claim_ptr, NULL); + if (IS_ERR(bdev_file)) { + r = PTR_ERR(bdev_file); goto out_free_td; } + bdev = file_bdev(bdev_file); + /* * We can be called before the dm disk is added. In that case we can't * register the holder relation here. It will be done once add_disk was * called. */ if (md->disk->slave_dir) { - r = bd_link_disk_holder(bdev_handle->bdev, md->disk); + r = bd_link_disk_holder(bdev, md->disk); if (r) goto out_blkdev_put; } td->dm_dev.mode = mode; - td->dm_dev.bdev = bdev_handle->bdev; - td->dm_dev.bdev_handle = bdev_handle; - td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev_handle->bdev, &part_off, + td->dm_dev.bdev = bdev; + td->dm_dev.bdev_file = bdev_file; + td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off, NULL, NULL); format_dev_t(td->dm_dev.name, dev); list_add(&td->list, &md->table_devices); return td; out_blkdev_put: - bdev_release(bdev_handle); + fput(bdev_file); out_free_td: kfree(td); return ERR_PTR(r); @@ -775,7 +778,7 @@ static void close_table_device(struct table_device *td, struct mapped_device *md { if (md->disk->slave_dir) bd_unlink_disk_holder(td->dm_dev.bdev, md->disk); - bdev_release(td->dm_dev.bdev_handle); + fput(td->dm_dev.bdev_file); put_dax(td->dm_dev.dax_dev); list_del(&td->list); kfree(td); diff --git a/drivers/md/md.c b/drivers/md/md.c index 2266358d8074..0653584db63b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2578,7 +2578,7 @@ static void export_rdev(struct md_rdev *rdev, struct mddev *mddev) if (test_bit(AutoDetected, &rdev->flags)) md_autodetect_dev(rdev->bdev->bd_dev); #endif - bdev_release(rdev->bdev_handle); + fput(rdev->bdev_file); rdev->bdev = NULL; kobject_put(&rdev->kobj); } @@ -3773,16 +3773,16 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe if (err) goto out_clear_rdev; - rdev->bdev_handle = bdev_open_by_dev(newdev, + rdev->bdev_file = bdev_file_open_by_dev(newdev, BLK_OPEN_READ | BLK_OPEN_WRITE, super_format == -2 ? &claim_rdev : rdev, NULL); - if (IS_ERR(rdev->bdev_handle)) { + if (IS_ERR(rdev->bdev_file)) { pr_warn("md: could not open device unknown-block(%u,%u).\n", MAJOR(newdev), MINOR(newdev)); - err = PTR_ERR(rdev->bdev_handle); + err = PTR_ERR(rdev->bdev_file); goto out_clear_rdev; } - rdev->bdev = rdev->bdev_handle->bdev; + rdev->bdev = file_bdev(rdev->bdev_file); kobject_init(&rdev->kobj, &rdev_ktype); @@ -3813,7 +3813,7 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe return rdev; out_blkdev_put: - bdev_release(rdev->bdev_handle); + fput(rdev->bdev_file); out_clear_rdev: md_rdev_clear(rdev); out_free_rdev: diff --git a/drivers/md/md.h b/drivers/md/md.h index 8d881cc59799..a079ee9b6190 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -59,7 +59,7 @@ struct md_rdev { */ struct block_device *meta_bdev; struct block_device *bdev; /* block device handle */ - struct bdev_handle *bdev_handle; /* Handle from open for bdev */ + struct file *bdev_file; /* Handle from open for bdev */ struct page *sb_page, *bb_page; int sb_loaded; diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 772ab4d74d94..82b2195efaca 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -165,7 +165,7 @@ void dm_error(const char *message); struct dm_dev { struct block_device *bdev; - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct dax_device *dax_dev; blk_mode_t mode; char name[16];
Signed-off-by: Christian Brauner <brauner@kernel.org> --- drivers/md/dm.c | 23 +++++++++++++---------- drivers/md/md.c | 12 ++++++------ drivers/md/md.h | 2 +- include/linux/device-mapper.h | 2 +- 4 files changed, 21 insertions(+), 18 deletions(-)