Message ID | 20230927-vfs-super-freeze-v1-3-ecc36d9ab4d9@kernel.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Implement freeze and thaw as holder operations | expand |
On Wed, Sep 27, 2023 at 03:21:16PM +0200, Christian Brauner wrote: > The old method of implementing block device freeze and thaw operations > required us to rely on get_active_super() to walk the list of all > superblocks on the system to find any superblock that might use the > block device. This is wasteful and not very pleasant overall. > > Now that we can finally go straight from block device to owning > superblock things become way simpler. In fact, we can even get rid of > bd_fsfreeze_mutex and rely on the holder lock for freezing. > > We let fs_bdev_thaw() grab its own active reference so we can hold > bd_holder_lock across freeze and thaw without risk of deadlocks. That in > turn allows us to get rid of bd_fsfreeze_mutex. This means > fs_bdev_thaw() might put the last reference to the superblock instead of > thaw_super(). This shouldn't be an issue. If it turns out to be one we > can reshuffle the code to simply hold s_umount when thaw_super() returns. > > Thanks to Jan and Christoph who pointed out that bd_fsfreeze_mutex is > unnecessary now. > > Signed-off-by: Christian Brauner <brauner@kernel.org> > --- > block/bdev.c | 74 +++++++++++++++++-------------------- > fs/super.c | 94 ++++++++++++++++++++++++++++++++++++++++++----- > include/linux/blk_types.h | 2 +- > 3 files changed, 119 insertions(+), 51 deletions(-) > > diff --git a/block/bdev.c b/block/bdev.c > index 0d27db3e69e7..3deccd0ffcf2 100644 > --- a/block/bdev.c > +++ b/block/bdev.c > @@ -222,32 +222,23 @@ EXPORT_SYMBOL(sync_blockdev_range); > */ > int bdev_freeze(struct block_device *bdev) > { > - struct super_block *sb; > int error = 0; > > - mutex_lock(&bdev->bd_fsfreeze_mutex); > - if (++bdev->bd_fsfreeze_count > 1) > - goto done; > + mutex_lock(&bdev->bd_holder_lock); > > - sb = get_active_super(bdev); > - if (!sb) > - goto sync; > - if (sb->s_op->freeze_super) > - error = sb->s_op->freeze_super(sb, FREEZE_HOLDER_USERSPACE); > - else > - error = freeze_super(sb, FREEZE_HOLDER_USERSPACE); > - deactivate_super(sb); > + if (atomic_inc_return(&bdev->bd_fsfreeze_count) > 1) { > + mutex_unlock(&bdev->bd_holder_lock); > + return 0; > + } > > - if (error) { > - bdev->bd_fsfreeze_count--; > - goto done; > + if (bdev->bd_holder_ops && bdev->bd_holder_ops->freeze) { > + error = bdev->bd_holder_ops->freeze(bdev); > + lockdep_assert_not_held(&bdev->bd_holder_lock); > + } else { > + sync_blockdev(bdev); Why ignore the return value from sync_blockdev? It calls filemap_write_and_wait, which clears AS_EIO/AS_ENOSPC from the bdev mapping, which means that we'll silently drop accumulated IO errors. > + mutex_unlock(&bdev->bd_holder_lock); Also not sure why this fallback case holds bd_holder_lock across the sync_blockdev but fs_bdev_freeze doesn't? > } > - bdev->bd_fsfreeze_sb = sb; > > -sync: > - sync_blockdev(bdev); > -done: > - mutex_unlock(&bdev->bd_fsfreeze_mutex); > return error; > } > EXPORT_SYMBOL(bdev_freeze); > @@ -262,31 +253,32 @@ EXPORT_SYMBOL(bdev_freeze); > */ > int bdev_thaw(struct block_device *bdev) > { > - struct super_block *sb; > - int error = -EINVAL; > + int error = 0, nr_freeze; > > - mutex_lock(&bdev->bd_fsfreeze_mutex); > - if (!bdev->bd_fsfreeze_count) > - goto out; > + mutex_lock(&bdev->bd_holder_lock); > > - error = 0; > - if (--bdev->bd_fsfreeze_count > 0) > - goto out; > + /* > + * If this returns < 0 it means that @bd_fsfreeze_count was > + * already 0 and no decrement was performed. > + */ > + nr_freeze = atomic_dec_if_positive(&bdev->bd_fsfreeze_count); > + if (nr_freeze < 0) { > + mutex_unlock(&bdev->bd_holder_lock); > + return -EINVAL; > + } > > - sb = bdev->bd_fsfreeze_sb; > - if (!sb) > - goto out; > + if (nr_freeze > 0) { > + mutex_unlock(&bdev->bd_holder_lock); > + return 0; > + } > + > + if (bdev->bd_holder_ops && bdev->bd_holder_ops->thaw) { > + error = bdev->bd_holder_ops->thaw(bdev); > + lockdep_assert_not_held(&bdev->bd_holder_lock); > + } else { > + mutex_unlock(&bdev->bd_holder_lock); > + } > > - if (sb->s_op->thaw_super) > - error = sb->s_op->thaw_super(sb, FREEZE_HOLDER_USERSPACE); > - else > - error = thaw_super(sb, FREEZE_HOLDER_USERSPACE); > - if (error) > - bdev->bd_fsfreeze_count++; > - else > - bdev->bd_fsfreeze_sb = NULL; > -out: > - mutex_unlock(&bdev->bd_fsfreeze_mutex); > return error; > } > EXPORT_SYMBOL(bdev_thaw); > diff --git a/fs/super.c b/fs/super.c > index e54866345dc7..672f1837fbef 100644 > --- a/fs/super.c > +++ b/fs/super.c > @@ -1469,9 +1469,91 @@ static void fs_bdev_sync(struct block_device *bdev) > super_unlock_shared(sb); > } > > +static struct super_block *get_bdev_super(const struct block_device *bdev) > +{ > + struct super_block *sb_bdev = bdev->bd_holder, *sb = NULL; > + > + if (!sb_bdev) > + return NULL; > + if (super_lock_excl(sb_bdev) && atomic_inc_not_zero(&sb_bdev->s_active)) > + sb = sb_bdev; > + super_unlock_excl(sb_bdev); > + return sb; > +} > + > +static int fs_bdev_freeze(struct block_device *bdev) > + __releases(&bdev->bd_holder_lock) > +{ > + struct super_block *sb; > + int error = 0; > + > + lockdep_assert_held(&bdev->bd_holder_lock); > + > + sb = get_bdev_super(bdev); > + if (sb) { > + if (sb->s_op->freeze_super) > + error = sb->s_op->freeze_super(sb, FREEZE_HOLDER_USERSPACE); > + else > + error = freeze_super(sb, FREEZE_HOLDER_USERSPACE); > + if (error) > + atomic_dec(&bdev->bd_fsfreeze_count); > + } > + > + /* > + * We have grabbed an active reference which means that the > + * superblock and the block device cannot go away. But we might > + * end up holding the last reference and so end up shutting the > + * superblock down. So drop @bdev->bd_holder_lock to avoid > + * deadlocks with blkdev_put(). > + */ > + mutex_unlock(&bdev->bd_holder_lock); > + > + if (sb) > + deactivate_super(sb); > + > + if (!error) > + sync_blockdev(bdev); Same question here about ignoring the return value. (Everything after this point looks ok to me.) --D > + > + return error; > +} > + > +static int fs_bdev_thaw(struct block_device *bdev) > + __releases(&bdev->bd_holder_lock) > +{ > + struct super_block *sb; > + int error; > + > + lockdep_assert_held(&bdev->bd_holder_lock); > + > + sb = get_bdev_super(bdev); > + if (WARN_ON_ONCE(!sb)) > + return -EINVAL; > + > + if (sb->s_op->thaw_super) > + error = sb->s_op->thaw_super(sb, FREEZE_HOLDER_USERSPACE); > + else > + error = thaw_super(sb, FREEZE_HOLDER_USERSPACE); > + if (error) > + atomic_inc(&bdev->bd_fsfreeze_count); > + > + /* > + * We have grabbed an active reference which means that the > + * superblock and the block device cannot go away. But we might > + * end up holding the last reference and so end up shutting the > + * superblock down. So drop @bdev->bd_holder_lock to avoid > + * deadlocks with blkdev_put(). > + */ > + mutex_unlock(&bdev->bd_holder_lock); > + deactivate_super(sb); > + > + return error; > +} > + > const struct blk_holder_ops fs_holder_ops = { > .mark_dead = fs_bdev_mark_dead, > .sync = fs_bdev_sync, > + .freeze = fs_bdev_freeze, > + .thaw = fs_bdev_thaw, > }; > EXPORT_SYMBOL_GPL(fs_holder_ops); > > @@ -1499,15 +1581,10 @@ int setup_bdev_super(struct super_block *sb, int sb_flags, > } > > /* > - * Until SB_BORN flag is set, there can be no active superblock > - * references and thus no filesystem freezing. get_active_super() will > - * just loop waiting for SB_BORN so even bdev_freeze() cannot proceed. > - * > - * It is enough to check bdev was not frozen before we set s_bdev. > + * It is enough to check bdev was not frozen before we set > + * s_bdev as freezing will wait until SB_BORN is set. > */ > - mutex_lock(&bdev->bd_fsfreeze_mutex); > - if (bdev->bd_fsfreeze_count > 0) { > - mutex_unlock(&bdev->bd_fsfreeze_mutex); > + if (atomic_read(&bdev->bd_fsfreeze_count) > 0) { > if (fc) > warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev); > blkdev_put(bdev, sb); > @@ -1519,7 +1596,6 @@ int setup_bdev_super(struct super_block *sb, int sb_flags, > if (bdev_stable_writes(bdev)) > sb->s_iflags |= SB_I_STABLE_WRITES; > spin_unlock(&sb_lock); > - mutex_unlock(&bdev->bd_fsfreeze_mutex); > > snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev); > shrinker_debugfs_rename(&sb->s_shrink, "sb-%s:%s", sb->s_type->name, > diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h > index d5c5e59ddbd2..88e1848b0869 100644 > --- a/include/linux/blk_types.h > +++ b/include/linux/blk_types.h > @@ -57,7 +57,7 @@ struct block_device { > const struct blk_holder_ops *bd_holder_ops; > struct mutex bd_holder_lock; > /* The counter of freeze processes */ > - int bd_fsfreeze_count; > + atomic_t bd_fsfreeze_count; > int bd_holders; > struct kobject *bd_holder_dir; > > > -- > 2.34.1 >
> > + sync_blockdev(bdev); > > Why ignore the return value from sync_blockdev? It calls > filemap_write_and_wait, which clears AS_EIO/AS_ENOSPC from the bdev > mapping, which means that we'll silently drop accumulated IO errors. Because freeze_bdev() has always ignored sync_blockdev() errors so far and I'm not sure what we'd do with that error. We can report it but we might confuse callers that think that the freeze failed when it hasn't. > > > + mutex_unlock(&bdev->bd_holder_lock); > > Also not sure why this fallback case holds bd_holder_lock across the > sync_blockdev but fs_bdev_freeze doesn't? I'll massage that to be consistent. Thanks!
On Wed, Sep 27, 2023 at 05:15:28PM +0200, Christian Brauner wrote: > > > + sync_blockdev(bdev); > > > > Why ignore the return value from sync_blockdev? It calls > > filemap_write_and_wait, which clears AS_EIO/AS_ENOSPC from the bdev > > mapping, which means that we'll silently drop accumulated IO errors. > > Because freeze_bdev() has always ignored sync_blockdev() errors so far > and I'm not sure what we'd do with that error. We can report it but we > might confuse callers that think that the freeze failed when it hasn't. Thinking about this some more... I got confused that fs_bdev_freeze drops the bd_fsfreeze_count if freeze_super fails. But I guess that has to get done before letting go of bd_holder_lock. For the bdev->bd_holder_ops == fs_holder_ops case, the freeze_super call will call sync_filesystem, which calls sync_blockdev. If that fails, the fsfreeze aborts, and the bdev freeze (at least with the old code) would also abort. For the !bdev->bd_holder_ops case, why not capture the sync_blockdev error code and decrement bd_fsfreeze_count if the sync failed? Then this function either returns 0 with the fs and bdev frozen; or an error code and nothing frozen. (Also, does this mean that the new sync_blockdev call at the bottom of fs_bdev_freeze isn't necessary? Filesystems that do IO in ->freeze_fs should be flushing the block device.) --D > > > > > + mutex_unlock(&bdev->bd_holder_lock); > > > > Also not sure why this fallback case holds bd_holder_lock across the > > sync_blockdev but fs_bdev_freeze doesn't? > > I'll massage that to be consistent. Thanks!
On Wed, Sep 27, 2023 at 09:01:42AM -0700, Darrick J. Wong wrote: > > For the bdev->bd_holder_ops == fs_holder_ops case, the freeze_super call > will call sync_filesystem, which calls sync_blockdev. If that fails, > the fsfreeze aborts, and the bdev freeze (at least with the old code) > would also abort. > > For the !bdev->bd_holder_ops case, why not capture the sync_blockdev > error code and decrement bd_fsfreeze_count if the sync failed? Then > this function either returns 0 with the fs and bdev frozen; or an error > code and nothing frozen. Yes, even if that is a behavior change it would be a lot more consistent. Maybe do the capturing of the error code as a prep patch so that it is clearly bisectable. > (Also, does this mean that the new sync_blockdev call at the bottom of > fs_bdev_freeze isn't necessary? Filesystems that do IO in ->freeze_fs > should be flushing the block device.) Various methods including freeze do the sync_blockdev unconditionally. I think this is a bad idea and should be moved into the file systems, but I don't think this is in scope for this series.
> > +static struct super_block *get_bdev_super(const struct block_device *bdev) > +{ > + struct super_block *sb_bdev = bdev->bd_holder, *sb = NULL; > + > + if (!sb_bdev) > + return NULL; > + if (super_lock_excl(sb_bdev) && atomic_inc_not_zero(&sb_bdev->s_active)) > + sb = sb_bdev; > + super_unlock_excl(sb_bdev); > + return sb; I find the flow here a bit confusing, because to me sb_bdev implies the super_block of the bdev fs, and because the super_lock_excl calling convention that always locks no matter of the return value is very confusing. Maybe at least rename sb_bdev to holder_bdev? > +static int fs_bdev_freeze(struct block_device *bdev) > + __releases(&bdev->bd_holder_lock) > +{ > + struct super_block *sb; > + int error = 0; > + > + lockdep_assert_held(&bdev->bd_holder_lock); > + > + sb = get_bdev_super(bdev); > + if (sb) { We always have a sb in bdev->bd_holder. So the only way get_bdev_super could return NULL is if we can't get an active reference or the fs is marked as SB_DYING. I think the best is to just give up and not even bother with the sync, which is going to cause more problems than it could help. I think we're better off just straight returning an error here, and don't bother with the sync_blockdev.
diff --git a/block/bdev.c b/block/bdev.c index 0d27db3e69e7..3deccd0ffcf2 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -222,32 +222,23 @@ EXPORT_SYMBOL(sync_blockdev_range); */ int bdev_freeze(struct block_device *bdev) { - struct super_block *sb; int error = 0; - mutex_lock(&bdev->bd_fsfreeze_mutex); - if (++bdev->bd_fsfreeze_count > 1) - goto done; + mutex_lock(&bdev->bd_holder_lock); - sb = get_active_super(bdev); - if (!sb) - goto sync; - if (sb->s_op->freeze_super) - error = sb->s_op->freeze_super(sb, FREEZE_HOLDER_USERSPACE); - else - error = freeze_super(sb, FREEZE_HOLDER_USERSPACE); - deactivate_super(sb); + if (atomic_inc_return(&bdev->bd_fsfreeze_count) > 1) { + mutex_unlock(&bdev->bd_holder_lock); + return 0; + } - if (error) { - bdev->bd_fsfreeze_count--; - goto done; + if (bdev->bd_holder_ops && bdev->bd_holder_ops->freeze) { + error = bdev->bd_holder_ops->freeze(bdev); + lockdep_assert_not_held(&bdev->bd_holder_lock); + } else { + sync_blockdev(bdev); + mutex_unlock(&bdev->bd_holder_lock); } - bdev->bd_fsfreeze_sb = sb; -sync: - sync_blockdev(bdev); -done: - mutex_unlock(&bdev->bd_fsfreeze_mutex); return error; } EXPORT_SYMBOL(bdev_freeze); @@ -262,31 +253,32 @@ EXPORT_SYMBOL(bdev_freeze); */ int bdev_thaw(struct block_device *bdev) { - struct super_block *sb; - int error = -EINVAL; + int error = 0, nr_freeze; - mutex_lock(&bdev->bd_fsfreeze_mutex); - if (!bdev->bd_fsfreeze_count) - goto out; + mutex_lock(&bdev->bd_holder_lock); - error = 0; - if (--bdev->bd_fsfreeze_count > 0) - goto out; + /* + * If this returns < 0 it means that @bd_fsfreeze_count was + * already 0 and no decrement was performed. + */ + nr_freeze = atomic_dec_if_positive(&bdev->bd_fsfreeze_count); + if (nr_freeze < 0) { + mutex_unlock(&bdev->bd_holder_lock); + return -EINVAL; + } - sb = bdev->bd_fsfreeze_sb; - if (!sb) - goto out; + if (nr_freeze > 0) { + mutex_unlock(&bdev->bd_holder_lock); + return 0; + } + + if (bdev->bd_holder_ops && bdev->bd_holder_ops->thaw) { + error = bdev->bd_holder_ops->thaw(bdev); + lockdep_assert_not_held(&bdev->bd_holder_lock); + } else { + mutex_unlock(&bdev->bd_holder_lock); + } - if (sb->s_op->thaw_super) - error = sb->s_op->thaw_super(sb, FREEZE_HOLDER_USERSPACE); - else - error = thaw_super(sb, FREEZE_HOLDER_USERSPACE); - if (error) - bdev->bd_fsfreeze_count++; - else - bdev->bd_fsfreeze_sb = NULL; -out: - mutex_unlock(&bdev->bd_fsfreeze_mutex); return error; } EXPORT_SYMBOL(bdev_thaw); diff --git a/fs/super.c b/fs/super.c index e54866345dc7..672f1837fbef 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1469,9 +1469,91 @@ static void fs_bdev_sync(struct block_device *bdev) super_unlock_shared(sb); } +static struct super_block *get_bdev_super(const struct block_device *bdev) +{ + struct super_block *sb_bdev = bdev->bd_holder, *sb = NULL; + + if (!sb_bdev) + return NULL; + if (super_lock_excl(sb_bdev) && atomic_inc_not_zero(&sb_bdev->s_active)) + sb = sb_bdev; + super_unlock_excl(sb_bdev); + return sb; +} + +static int fs_bdev_freeze(struct block_device *bdev) + __releases(&bdev->bd_holder_lock) +{ + struct super_block *sb; + int error = 0; + + lockdep_assert_held(&bdev->bd_holder_lock); + + sb = get_bdev_super(bdev); + if (sb) { + if (sb->s_op->freeze_super) + error = sb->s_op->freeze_super(sb, FREEZE_HOLDER_USERSPACE); + else + error = freeze_super(sb, FREEZE_HOLDER_USERSPACE); + if (error) + atomic_dec(&bdev->bd_fsfreeze_count); + } + + /* + * We have grabbed an active reference which means that the + * superblock and the block device cannot go away. But we might + * end up holding the last reference and so end up shutting the + * superblock down. So drop @bdev->bd_holder_lock to avoid + * deadlocks with blkdev_put(). + */ + mutex_unlock(&bdev->bd_holder_lock); + + if (sb) + deactivate_super(sb); + + if (!error) + sync_blockdev(bdev); + + return error; +} + +static int fs_bdev_thaw(struct block_device *bdev) + __releases(&bdev->bd_holder_lock) +{ + struct super_block *sb; + int error; + + lockdep_assert_held(&bdev->bd_holder_lock); + + sb = get_bdev_super(bdev); + if (WARN_ON_ONCE(!sb)) + return -EINVAL; + + if (sb->s_op->thaw_super) + error = sb->s_op->thaw_super(sb, FREEZE_HOLDER_USERSPACE); + else + error = thaw_super(sb, FREEZE_HOLDER_USERSPACE); + if (error) + atomic_inc(&bdev->bd_fsfreeze_count); + + /* + * We have grabbed an active reference which means that the + * superblock and the block device cannot go away. But we might + * end up holding the last reference and so end up shutting the + * superblock down. So drop @bdev->bd_holder_lock to avoid + * deadlocks with blkdev_put(). + */ + mutex_unlock(&bdev->bd_holder_lock); + deactivate_super(sb); + + return error; +} + const struct blk_holder_ops fs_holder_ops = { .mark_dead = fs_bdev_mark_dead, .sync = fs_bdev_sync, + .freeze = fs_bdev_freeze, + .thaw = fs_bdev_thaw, }; EXPORT_SYMBOL_GPL(fs_holder_ops); @@ -1499,15 +1581,10 @@ int setup_bdev_super(struct super_block *sb, int sb_flags, } /* - * Until SB_BORN flag is set, there can be no active superblock - * references and thus no filesystem freezing. get_active_super() will - * just loop waiting for SB_BORN so even bdev_freeze() cannot proceed. - * - * It is enough to check bdev was not frozen before we set s_bdev. + * It is enough to check bdev was not frozen before we set + * s_bdev as freezing will wait until SB_BORN is set. */ - mutex_lock(&bdev->bd_fsfreeze_mutex); - if (bdev->bd_fsfreeze_count > 0) { - mutex_unlock(&bdev->bd_fsfreeze_mutex); + if (atomic_read(&bdev->bd_fsfreeze_count) > 0) { if (fc) warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev); blkdev_put(bdev, sb); @@ -1519,7 +1596,6 @@ int setup_bdev_super(struct super_block *sb, int sb_flags, if (bdev_stable_writes(bdev)) sb->s_iflags |= SB_I_STABLE_WRITES; spin_unlock(&sb_lock); - mutex_unlock(&bdev->bd_fsfreeze_mutex); snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev); shrinker_debugfs_rename(&sb->s_shrink, "sb-%s:%s", sb->s_type->name, diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index d5c5e59ddbd2..88e1848b0869 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -57,7 +57,7 @@ struct block_device { const struct blk_holder_ops *bd_holder_ops; struct mutex bd_holder_lock; /* The counter of freeze processes */ - int bd_fsfreeze_count; + atomic_t bd_fsfreeze_count; int bd_holders; struct kobject *bd_holder_dir;
The old method of implementing block device freeze and thaw operations required us to rely on get_active_super() to walk the list of all superblocks on the system to find any superblock that might use the block device. This is wasteful and not very pleasant overall. Now that we can finally go straight from block device to owning superblock things become way simpler. In fact, we can even get rid of bd_fsfreeze_mutex and rely on the holder lock for freezing. We let fs_bdev_thaw() grab its own active reference so we can hold bd_holder_lock across freeze and thaw without risk of deadlocks. That in turn allows us to get rid of bd_fsfreeze_mutex. This means fs_bdev_thaw() might put the last reference to the superblock instead of thaw_super(). This shouldn't be an issue. If it turns out to be one we can reshuffle the code to simply hold s_umount when thaw_super() returns. Thanks to Jan and Christoph who pointed out that bd_fsfreeze_mutex is unnecessary now. Signed-off-by: Christian Brauner <brauner@kernel.org> --- block/bdev.c | 74 +++++++++++++++++-------------------- fs/super.c | 94 ++++++++++++++++++++++++++++++++++++++++++----- include/linux/blk_types.h | 2 +- 3 files changed, 119 insertions(+), 51 deletions(-)