diff mbox series

fs,block: get holder during claim

Message ID 20240315-freibad-annehmbar-ca68c375af91@brauner (mailing list archive)
State New, archived
Headers show
Series fs,block: get holder during claim | expand

Commit Message

Christian Brauner March 15, 2024, 1:23 p.m. UTC
Now that we open block devices as files we need to deal with the
realities that closing is a deferred operation. An operation on the
block device such as e.g., freeze, thaw, or removal that runs
concurrently with umount, tries to acquire a stable reference on the
holder. The holder might already be gone though. Make that reliable by
grabbing a passive reference to the holder during bdev_open() and
releasing it during bdev_release().

Fixes: f3a608827d1f ("bdev: open block device as files") # mainline only
Reported-by: Christoph Hellwig <hch@infradead.org>
Link: https://lore.kernel.org/r/ZfEQQ9jZZVes0WCZ@infradead.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
Hey all,

I ran blktests with nbd enabled which contains a reliable repro for the
issue. Thanks to Christoph for pointing in that direction. The
underlying issue is not reproducible anymore with this patch applied.
xfstests and blktests pass.

Thanks!
Christian
---
 block/bdev.c           |  7 +++++++
 fs/super.c             | 18 ++++++++++++++++++
 include/linux/blkdev.h | 10 ++++++++++
 3 files changed, 35 insertions(+)

Comments

Jan Kara March 15, 2024, 2:28 p.m. UTC | #1
On Fri 15-03-24 14:23:07, Christian Brauner wrote:
> Now that we open block devices as files we need to deal with the
> realities that closing is a deferred operation. An operation on the
> block device such as e.g., freeze, thaw, or removal that runs
> concurrently with umount, tries to acquire a stable reference on the
> holder. The holder might already be gone though. Make that reliable by
> grabbing a passive reference to the holder during bdev_open() and
> releasing it during bdev_release().
> 
> Fixes: f3a608827d1f ("bdev: open block device as files") # mainline only
> Reported-by: Christoph Hellwig <hch@infradead.org>
> Link: https://lore.kernel.org/r/ZfEQQ9jZZVes0WCZ@infradead.org
> Signed-off-by: Christian Brauner <brauner@kernel.org>
> ---
> Hey all,
> 
> I ran blktests with nbd enabled which contains a reliable repro for the
> issue. Thanks to Christoph for pointing in that direction. The
> underlying issue is not reproducible anymore with this patch applied.
> xfstests and blktests pass.

Thanks for the fix! It looks good to me. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza


> diff --git a/block/bdev.c b/block/bdev.c
> index e7adaaf1c219..7a5f611c3d2e 100644
> --- a/block/bdev.c
> +++ b/block/bdev.c
> @@ -583,6 +583,9 @@ static void bd_finish_claiming(struct block_device *bdev, void *holder,
>  	mutex_unlock(&bdev->bd_holder_lock);
>  	bd_clear_claiming(whole, holder);
>  	mutex_unlock(&bdev_lock);
> +
> +	if (hops && hops->get_holder)
> +		hops->get_holder(holder);
>  }
>  
>  /**
> @@ -605,6 +608,7 @@ EXPORT_SYMBOL(bd_abort_claiming);
>  static void bd_end_claim(struct block_device *bdev, void *holder)
>  {
>  	struct block_device *whole = bdev_whole(bdev);
> +	const struct blk_holder_ops *hops = bdev->bd_holder_ops;
>  	bool unblock = false;
>  
>  	/*
> @@ -627,6 +631,9 @@ static void bd_end_claim(struct block_device *bdev, void *holder)
>  		whole->bd_holder = NULL;
>  	mutex_unlock(&bdev_lock);
>  
> +	if (hops && hops->put_holder)
> +		hops->put_holder(holder);
> +
>  	/*
>  	 * If this was the last claim, remove holder link and unblock evpoll if
>  	 * it was a write holder.
> diff --git a/fs/super.c b/fs/super.c
> index ee05ab6b37e7..71d9779c42b1 100644
> --- a/fs/super.c
> +++ b/fs/super.c
> @@ -1515,11 +1515,29 @@ static int fs_bdev_thaw(struct block_device *bdev)
>  	return error;
>  }
>  
> +static void fs_bdev_super_get(void *data)
> +{
> +	struct super_block *sb = data;
> +
> +	spin_lock(&sb_lock);
> +	sb->s_count++;
> +	spin_unlock(&sb_lock);
> +}
> +
> +static void fs_bdev_super_put(void *data)
> +{
> +	struct super_block *sb = data;
> +
> +	put_super(sb);
> +}
> +
>  const struct blk_holder_ops fs_holder_ops = {
>  	.mark_dead		= fs_bdev_mark_dead,
>  	.sync			= fs_bdev_sync,
>  	.freeze			= fs_bdev_freeze,
>  	.thaw			= fs_bdev_thaw,
> +	.get_holder		= fs_bdev_super_get,
> +	.put_holder		= fs_bdev_super_put,
>  };
>  EXPORT_SYMBOL_GPL(fs_holder_ops);
>  
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index f9b87c39cab0..c3e8f7cf96be 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -1505,6 +1505,16 @@ struct blk_holder_ops {
>  	 * Thaw the file system mounted on the block device.
>  	 */
>  	int (*thaw)(struct block_device *bdev);
> +
> +	/*
> +	 * If needed, get a reference to the holder.
> +	 */
> +	void (*get_holder)(void *holder);
> +
> +	/*
> +	 * Release the holder.
> +	 */
> +	void (*put_holder)(void *holder);
>  };
>  
>  /*
> -- 
> 2.43.0
>
Christoph Hellwig March 17, 2024, 8:53 p.m. UTC | #2
On Fri, Mar 15, 2024 at 02:23:07PM +0100, Christian Brauner wrote:
> Now that we open block devices as files we need to deal with the
> realities that closing is a deferred operation. An operation on the
> block device such as e.g., freeze, thaw, or removal that runs
> concurrently with umount, tries to acquire a stable reference on the
> holder. The holder might already be gone though. Make that reliable by
> grabbing a passive reference to the holder during bdev_open() and
> releasing it during bdev_release().

Looks good:

Reviewed-by: Christoph Hellwig <hch@lst.de>

does bcachefs also need a fix for it's holder ops?  Or does it get to
keep the pieces as it has it's own NULL holder_ops and obviously doens't
care about getting any of this right?
Christian Brauner March 18, 2024, 8:33 a.m. UTC | #3
On Sun, Mar 17, 2024 at 01:53:51PM -0700, Christoph Hellwig wrote:
> On Fri, Mar 15, 2024 at 02:23:07PM +0100, Christian Brauner wrote:
> > Now that we open block devices as files we need to deal with the
> > realities that closing is a deferred operation. An operation on the
> > block device such as e.g., freeze, thaw, or removal that runs
> > concurrently with umount, tries to acquire a stable reference on the
> > holder. The holder might already be gone though. Make that reliable by
> > grabbing a passive reference to the holder during bdev_open() and
> > releasing it during bdev_release().
> 
> Looks good:
> 
> Reviewed-by: Christoph Hellwig <hch@lst.de>
> 
> does bcachefs also need a fix for it's holder ops?  Or does it get to
> keep the pieces as it has it's own NULL holder_ops and obviously doens't
> care about getting any of this right?

It has empty holder ops and so is behaving equivalent too having NULL
holder ops. IOW, the block layer cannot access the holder.
Yi Zhang March 18, 2024, 9:10 a.m. UTC | #4
On Fri, Mar 15, 2024 at 9:23 PM Christian Brauner <brauner@kernel.org> wrote:
>
> Now that we open block devices as files we need to deal with the
> realities that closing is a deferred operation. An operation on the
> block device such as e.g., freeze, thaw, or removal that runs
> concurrently with umount, tries to acquire a stable reference on the
> holder. The holder might already be gone though. Make that reliable by
> grabbing a passive reference to the holder during bdev_open() and
> releasing it during bdev_release().
>
> Fixes: f3a608827d1f ("bdev: open block device as files") # mainline only
> Reported-by: Christoph Hellwig <hch@infradead.org>
> Link: https://lore.kernel.org/r/ZfEQQ9jZZVes0WCZ@infradead.org
> Signed-off-by: Christian Brauner <brauner@kernel.org>

Verified the blktests ndb/003 panic issue was fixed by this patch,
feel free to add:

Tested-by: Yi Zhang <yi.zhang@redhat.com>




> ---
> Hey all,
>
> I ran blktests with nbd enabled which contains a reliable repro for the
> issue. Thanks to Christoph for pointing in that direction. The
> underlying issue is not reproducible anymore with this patch applied.
> xfstests and blktests pass.
>
> Thanks!
> Christian
> ---
>  block/bdev.c           |  7 +++++++
>  fs/super.c             | 18 ++++++++++++++++++
>  include/linux/blkdev.h | 10 ++++++++++
>  3 files changed, 35 insertions(+)
>
> diff --git a/block/bdev.c b/block/bdev.c
> index e7adaaf1c219..7a5f611c3d2e 100644
> --- a/block/bdev.c
> +++ b/block/bdev.c
> @@ -583,6 +583,9 @@ static void bd_finish_claiming(struct block_device *bdev, void *holder,
>         mutex_unlock(&bdev->bd_holder_lock);
>         bd_clear_claiming(whole, holder);
>         mutex_unlock(&bdev_lock);
> +
> +       if (hops && hops->get_holder)
> +               hops->get_holder(holder);
>  }
>
>  /**
> @@ -605,6 +608,7 @@ EXPORT_SYMBOL(bd_abort_claiming);
>  static void bd_end_claim(struct block_device *bdev, void *holder)
>  {
>         struct block_device *whole = bdev_whole(bdev);
> +       const struct blk_holder_ops *hops = bdev->bd_holder_ops;
>         bool unblock = false;
>
>         /*
> @@ -627,6 +631,9 @@ static void bd_end_claim(struct block_device *bdev, void *holder)
>                 whole->bd_holder = NULL;
>         mutex_unlock(&bdev_lock);
>
> +       if (hops && hops->put_holder)
> +               hops->put_holder(holder);
> +
>         /*
>          * If this was the last claim, remove holder link and unblock evpoll if
>          * it was a write holder.
> diff --git a/fs/super.c b/fs/super.c
> index ee05ab6b37e7..71d9779c42b1 100644
> --- a/fs/super.c
> +++ b/fs/super.c
> @@ -1515,11 +1515,29 @@ static int fs_bdev_thaw(struct block_device *bdev)
>         return error;
>  }
>
> +static void fs_bdev_super_get(void *data)
> +{
> +       struct super_block *sb = data;
> +
> +       spin_lock(&sb_lock);
> +       sb->s_count++;
> +       spin_unlock(&sb_lock);
> +}
> +
> +static void fs_bdev_super_put(void *data)
> +{
> +       struct super_block *sb = data;
> +
> +       put_super(sb);
> +}
> +
>  const struct blk_holder_ops fs_holder_ops = {
>         .mark_dead              = fs_bdev_mark_dead,
>         .sync                   = fs_bdev_sync,
>         .freeze                 = fs_bdev_freeze,
>         .thaw                   = fs_bdev_thaw,
> +       .get_holder             = fs_bdev_super_get,
> +       .put_holder             = fs_bdev_super_put,
>  };
>  EXPORT_SYMBOL_GPL(fs_holder_ops);
>
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index f9b87c39cab0..c3e8f7cf96be 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -1505,6 +1505,16 @@ struct blk_holder_ops {
>          * Thaw the file system mounted on the block device.
>          */
>         int (*thaw)(struct block_device *bdev);
> +
> +       /*
> +        * If needed, get a reference to the holder.
> +        */
> +       void (*get_holder)(void *holder);
> +
> +       /*
> +        * Release the holder.
> +        */
> +       void (*put_holder)(void *holder);
>  };
>
>  /*
> --
> 2.43.0
>
>


--
Best Regards,
  Yi Zhang
Christian Brauner March 19, 2024, 4:24 p.m. UTC | #5
> > I ran blktests with nbd enabled which contains a reliable repro for the
> > issue. Thanks to Christoph for pointing in that direction. The
> > underlying issue is not reproducible anymore with this patch applied.
> > xfstests and blktests pass.
> 
> Thanks for the fix! It looks good to me. Feel free to add:

So Linus complained about the fact that we have holder ops when really
it currently isn't needed by anything apart from filesytems. And I think
that he's right and we should consider removing the holder ops and just
calling helpers directly. If there's users outside of filesystems we can
always reintroduce that. So I went ahead and drafted a patch series. I
think it ends up simplifying things and it ends up easier to follow and
we can handle lifetime of the superblock cleaner with relying on
callbacks. Appending the patch series here and pushed to
vfs.bdev.holder. Want to gauge your thoughts before sending it out.

https://gitlab.com/brauner/linux/-/commits/vfs.bdev.holder
Matthew Wilcox (Oracle) March 19, 2024, 5:03 p.m. UTC | #6
On Tue, Mar 19, 2024 at 05:24:44PM +0100, Christian Brauner wrote:
> @@ -631,8 +631,8 @@ static void bd_end_claim(struct block_device *bdev, void *holder)
>  		whole->bd_holder = NULL;
>  	mutex_unlock(&bdev_lock);
>  
> -	if (hops && hops->put_holder)
> -		hops->put_holder(holder);
> +	if (mounted)
> +		fs_bdev_super_put(holder);

I think you haven't gone quite far enough here.  Call super_put()
directly and make bd_end_claim() take a super_block pointer.
Christoph Hellwig March 19, 2024, 11:13 p.m. UTC | #7
On Tue, Mar 19, 2024 at 05:24:44PM +0100, Christian Brauner wrote:
> So Linus complained about the fact that we have holder ops when really
> it currently isn't needed by anything apart from filesytems.

Err, that's jut because we haven't gotten to it.  The whole point of the
ops is that we do want device removal (and in the future resize)
notification for others users as well.  In fact what drove me is to
have proper notifications for a removed devices in md parity raid, I've
just not gotten to that quite yet as I've been busy with other things.

Vut even if that wasn't the case, calling straight from block layer code
into file systems is just wrong.
diff mbox series

Patch

diff --git a/block/bdev.c b/block/bdev.c
index e7adaaf1c219..7a5f611c3d2e 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -583,6 +583,9 @@  static void bd_finish_claiming(struct block_device *bdev, void *holder,
 	mutex_unlock(&bdev->bd_holder_lock);
 	bd_clear_claiming(whole, holder);
 	mutex_unlock(&bdev_lock);
+
+	if (hops && hops->get_holder)
+		hops->get_holder(holder);
 }
 
 /**
@@ -605,6 +608,7 @@  EXPORT_SYMBOL(bd_abort_claiming);
 static void bd_end_claim(struct block_device *bdev, void *holder)
 {
 	struct block_device *whole = bdev_whole(bdev);
+	const struct blk_holder_ops *hops = bdev->bd_holder_ops;
 	bool unblock = false;
 
 	/*
@@ -627,6 +631,9 @@  static void bd_end_claim(struct block_device *bdev, void *holder)
 		whole->bd_holder = NULL;
 	mutex_unlock(&bdev_lock);
 
+	if (hops && hops->put_holder)
+		hops->put_holder(holder);
+
 	/*
 	 * If this was the last claim, remove holder link and unblock evpoll if
 	 * it was a write holder.
diff --git a/fs/super.c b/fs/super.c
index ee05ab6b37e7..71d9779c42b1 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1515,11 +1515,29 @@  static int fs_bdev_thaw(struct block_device *bdev)
 	return error;
 }
 
+static void fs_bdev_super_get(void *data)
+{
+	struct super_block *sb = data;
+
+	spin_lock(&sb_lock);
+	sb->s_count++;
+	spin_unlock(&sb_lock);
+}
+
+static void fs_bdev_super_put(void *data)
+{
+	struct super_block *sb = data;
+
+	put_super(sb);
+}
+
 const struct blk_holder_ops fs_holder_ops = {
 	.mark_dead		= fs_bdev_mark_dead,
 	.sync			= fs_bdev_sync,
 	.freeze			= fs_bdev_freeze,
 	.thaw			= fs_bdev_thaw,
+	.get_holder		= fs_bdev_super_get,
+	.put_holder		= fs_bdev_super_put,
 };
 EXPORT_SYMBOL_GPL(fs_holder_ops);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f9b87c39cab0..c3e8f7cf96be 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1505,6 +1505,16 @@  struct blk_holder_ops {
 	 * Thaw the file system mounted on the block device.
 	 */
 	int (*thaw)(struct block_device *bdev);
+
+	/*
+	 * If needed, get a reference to the holder.
+	 */
+	void (*get_holder)(void *holder);
+
+	/*
+	 * Release the holder.
+	 */
+	void (*put_holder)(void *holder);
 };
 
 /*