diff mbox series

[6/6] fs: Make bind mounts work with bdev_allow_write_mounted=n

Message ID 20230704125702.23180-6-jack@suse.cz (mailing list archive)
State New, archived
Headers show
Series block: Add config option to not allow writing to mounted devices | expand

Commit Message

Jan Kara July 4, 2023, 12:56 p.m. UTC
When we don't allow opening of mounted block devices for writing, bind
mounting is broken because the bind mount tries to open the block device
before finding the superblock for it already exists. Reorganize the
mounting code to first look whether the superblock for a particular
device is already mounted and open the block device only if it is not.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/super.c | 188 +++++++++++++++++++++++++----------------------------
 1 file changed, 89 insertions(+), 99 deletions(-)

Comments

Christian Brauner July 4, 2023, 1:59 p.m. UTC | #1
On Tue, Jul 04, 2023 at 02:56:54PM +0200, Jan Kara wrote:
> When we don't allow opening of mounted block devices for writing, bind
> mounting is broken because the bind mount tries to open the block device

Sorry, I'm going to be annoying now...

Afaict, the analysis is misleading but I'm happy to be corrected ofc.
Finding an existing superblock is independent of mounts. get_tree_bdev()
and mount_bdev() are really only interested in finding a matching
superblock independent of whether or not a mount for it already exists.
IOW, if you had two filesystem contexts for the same block device with
different mount options:

T1								T2
fd_fs = fsopen("ext4");						fd_fs = fsopen("ext4");
fsconfig(fd_fs, FSCONFIG_SET_STRING, "source", "/dev/sda");	fsconfig(fd_fs, FSCONFIG_SET_STRING, "source", "/dev/sda");

// create superblock
fsconfig(fd_fs, FSCONFIG_CMD_CREATE, ...)
								// finds superblock of T1 if opts are compatible
								fsconfig(fd_fs, FSCONFIG_CMD_CREATE, ...)

you should have the issue that you're describing. But for neither of
them does a mount already exist as the first mount here would only be
created when:

T1								T2
fsmount(fd_fs);							fsmount(fd_fs);

is called at which point the whole superblock issue is already settled.
Afterwards, both mounts of both T1 and T2 refer to the same superblock -
as long as the fs and the mount options support this ofc.

---

Btw, I talked to Karel a while ago. I'd like to add an option like:

FSCONFIG_CMD_CREATE_EXCL

or similar to fsconfig() that fails in scenarios like the one I
described in T1 and T2. IOW, T1 succeeds but T2 would fail even if the
filesystem would consider the sb to be compatible. This way userspace
can be sure that they did indeed succeed in creating the superblock...

> before finding the superblock for it already exists. Reorganize the
> mounting code to first look whether the superblock for a particular
> device is already mounted and open the block device only if it is not.
> 
> Signed-off-by: Jan Kara <jack@suse.cz>
> ---
>  fs/super.c | 188 +++++++++++++++++++++++++----------------------------
>  1 file changed, 89 insertions(+), 99 deletions(-)
> 
> diff --git a/fs/super.c b/fs/super.c
> index ea135fece772..fdf1e286926e 100644
> --- a/fs/super.c
> +++ b/fs/super.c
> @@ -1228,13 +1228,7 @@ static const struct blk_holder_ops fs_holder_ops = {
>  
>  static int set_bdev_super(struct super_block *s, void *data)
>  {
> -	s->s_bdev_handle = data;
> -	s->s_bdev = s->s_bdev_handle->bdev;
> -	s->s_dev = s->s_bdev->bd_dev;
> -	s->s_bdi = bdi_get(s->s_bdev->bd_disk->bdi);
> -
> -	if (bdev_stable_writes(s->s_bdev))
> -		s->s_iflags |= SB_I_STABLE_WRITES;
> +	s->s_dev = *(dev_t *)data;
>  	return 0;
>  }
>  
> @@ -1246,7 +1240,53 @@ static int set_bdev_super_fc(struct super_block *s, struct fs_context *fc)
>  static int test_bdev_super_fc(struct super_block *s, struct fs_context *fc)
>  {
>  	return !(s->s_iflags & SB_I_RETIRED) &&
> -		s->s_bdev == ((struct bdev_handle *)fc->sget_key)->bdev;
> +	       s->s_dev == *(dev_t *)fc->sget_key;
> +}
> +
> +static int setup_bdev_super(struct super_block *s, int sb_flags,
> +			    struct fs_context *fc)
> +{
> +	struct bdev_handle *bdev_handle;
> +
> +	bdev_handle = blkdev_get_by_dev(s->s_dev, sb_open_mode(sb_flags),
> +					s->s_type, &fs_holder_ops);
> +	if (IS_ERR(bdev_handle)) {
> +		if (fc)
> +			errorf(fc, "%s: Can't open blockdev", fc->source);
> +		return PTR_ERR(bdev_handle);
> +	}
> +	spin_lock(&sb_lock);
> +	s->s_bdev_handle = bdev_handle;
> +	s->s_bdev = bdev_handle->bdev;
> +	s->s_bdi = bdi_get(s->s_bdev->bd_disk->bdi);
> +
> +	if (bdev_stable_writes(s->s_bdev))
> +		s->s_iflags |= SB_I_STABLE_WRITES;
> +	spin_unlock(&sb_lock);
> +
> +	/*
> +	 * Until SB_BORN flag is set, there can be no active superblock
> + 	 * references and thus no filesystem freezing. get_active_super()
> +	 * will just loop waiting for SB_BORN so even freeze_bdev() cannot
> +	 * proceed. It is enough to check bdev was not frozen before we set
> +	 * s_bdev.
> +	 */
> +	mutex_lock(&s->s_bdev->bd_fsfreeze_mutex);
> +	if (s->s_bdev->bd_fsfreeze_count > 0) {
> +		mutex_unlock(&s->s_bdev->bd_fsfreeze_mutex);
> +		if (fc)
> +			warnf(fc, "%pg: Can't mount, blockdev is frozen",
> +			      s->s_bdev);
> +		return -EBUSY;
> +	}
> +	mutex_unlock(&s->s_bdev->bd_fsfreeze_mutex);
> +
> +	snprintf(s->s_id, sizeof(s->s_id), "%pg", s->s_bdev);
> +	shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s",
> +				fc->fs_type->name, s->s_id);
> +	sb_set_blocksize(s, block_size(s->s_bdev));
> +
> +	return 0;
>  }
>  
>  /**
> @@ -1258,75 +1298,51 @@ int get_tree_bdev(struct fs_context *fc,
>  		int (*fill_super)(struct super_block *,
>  				  struct fs_context *))
>  {
> -	struct bdev_handle *bdev_handle;
> -	struct block_device *bdev;
> +	dev_t dev;
>  	struct super_block *s;
>  	int error = 0;
>  
>  	if (!fc->source)
>  		return invalf(fc, "No source specified");
>  
> -	bdev_handle = blkdev_get_by_path(fc->source,
> -					 sb_open_mode(fc->sb_flags),
> -					 fc->fs_type, &fs_holder_ops);
> -	if (IS_ERR(bdev_handle)) {
> -		errorf(fc, "%s: Can't open blockdev", fc->source);
> -		return PTR_ERR(bdev_handle);
> -	}
> -	bdev = bdev_handle->bdev;
> -
> -	/* Once the superblock is inserted into the list by sget_fc(), s_umount
> -	 * will protect the lockfs code from trying to start a snapshot while
> -	 * we are mounting
> -	 */
> -	mutex_lock(&bdev->bd_fsfreeze_mutex);
> -	if (bdev->bd_fsfreeze_count > 0) {
> -		mutex_unlock(&bdev->bd_fsfreeze_mutex);
> -		warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev);
> -		blkdev_put(bdev_handle);
> -		return -EBUSY;
> +	error = lookup_bdev(fc->source, &dev);
> +	if (error) {
> +		errorf(fc, "%s: Can't lookup blockdev", fc->source);
> +		return error;
>  	}
>  
>  	fc->sb_flags |= SB_NOSEC;
> -	fc->sget_key = bdev_handle;
> +	fc->sget_key = &dev;
>  	s = sget_fc(fc, test_bdev_super_fc, set_bdev_super_fc);
> -	mutex_unlock(&bdev->bd_fsfreeze_mutex);
> -	if (IS_ERR(s)) {
> -		blkdev_put(bdev_handle);
> +	if (IS_ERR(s))
>  		return PTR_ERR(s);
> -	}
>  
>  	if (s->s_root) {
>  		/* Don't summarily change the RO/RW state. */
>  		if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) {
> -			warnf(fc, "%pg: Can't mount, would change RO state", bdev);
> +			warnf(fc, "%pg: Can't mount, would change RO state", s->s_bdev);
>  			deactivate_locked_super(s);
> -			blkdev_put(bdev_handle);
>  			return -EBUSY;
>  		}
> -
> +	} else {
>  		/*
> -		 * s_umount nests inside open_mutex during
> -		 * __invalidate_device().  blkdev_put() acquires open_mutex and
> -		 * can't be called under s_umount.  Drop s_umount temporarily.
> -		 * This is safe as we're holding an active reference.
> +		 * We drop s_umount here because we need to lookup bdev and
> +		 * bdev->open_mutex ranks above s_umount (blkdev_put() ->
> +		 * invalidate_bdev()). It is safe because we have active sb
> +		 * reference and SB_BORN is not set yet.
>  		 */
>  		up_write(&s->s_umount);
> -		blkdev_put(bdev_handle);
> +		error = setup_bdev_super(s, fc->sb_flags, fc);
>  		down_write(&s->s_umount);
> -	} else {
> -		snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
> -		shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s",
> -					fc->fs_type->name, s->s_id);
> -		sb_set_blocksize(s, block_size(bdev));
> -		error = fill_super(s, fc);
> +		if (!error)
> +			error = fill_super(s, fc);
>  		if (error) {
>  			deactivate_locked_super(s);
>  			return error;
>  		}
>  
>  		s->s_flags |= SB_ACTIVE;
> -		bdev->bd_super = s;
> +		s->s_bdev->bd_super = s;
>  	}
>  
>  	BUG_ON(fc->root);
> @@ -1337,81 +1353,53 @@ EXPORT_SYMBOL(get_tree_bdev);
>  
>  static int test_bdev_super(struct super_block *s, void *data)
>  {
> -	return !(s->s_iflags & SB_I_RETIRED) &&
> -		s->s_bdev == ((struct bdev_handle *)data)->bdev;
> +	return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)data;
>  }
>  
>  struct dentry *mount_bdev(struct file_system_type *fs_type,
>  	int flags, const char *dev_name, void *data,
>  	int (*fill_super)(struct super_block *, void *, int))
>  {
> -	struct bdev_handle *bdev_handle;
> -	struct block_device *bdev;
>  	struct super_block *s;
>  	int error = 0;
> +	dev_t dev;
>  
> -	bdev_handle = blkdev_get_by_path(dev_name, sb_open_mode(flags),
> -					 fs_type, &fs_holder_ops);
> -	if (IS_ERR(bdev_handle))
> -		return ERR_CAST(bdev_handle);
> -	bdev = bdev_handle->bdev;
> +	error = lookup_bdev(dev_name, &dev);
> +	if (error)
> +		return ERR_PTR(error);
>  
> -	/*
> -	 * once the super is inserted into the list by sget, s_umount
> -	 * will protect the lockfs code from trying to start a snapshot
> -	 * while we are mounting
> -	 */
> -	mutex_lock(&bdev->bd_fsfreeze_mutex);
> -	if (bdev->bd_fsfreeze_count > 0) {
> -		mutex_unlock(&bdev->bd_fsfreeze_mutex);
> -		error = -EBUSY;
> -		goto error_bdev;
> -	}
> -	s = sget(fs_type, test_bdev_super, set_bdev_super, flags | SB_NOSEC,
> -		 bdev_handle);
> -	mutex_unlock(&bdev->bd_fsfreeze_mutex);
> +	flags |= SB_NOSEC;
> +	s = sget(fs_type, test_bdev_super, set_bdev_super, flags, &dev);
>  	if (IS_ERR(s))
> -		goto error_s;
> +		return ERR_CAST(s);
>  
>  	if (s->s_root) {
>  		if ((flags ^ s->s_flags) & SB_RDONLY) {
>  			deactivate_locked_super(s);
> -			error = -EBUSY;
> -			goto error_bdev;
> +			return ERR_PTR(-EBUSY);
>  		}
> -
> +	} else {
>  		/*
> -		 * s_umount nests inside open_mutex during
> -		 * __invalidate_device().  blkdev_put() acquires open_mutex and
> -		 * can't be called under s_umount.  Drop s_umount temporarily.
> -		 * This is safe as we're holding an active reference.
> +		 * We drop s_umount here because we need to lookup bdev and
> +		 * bdev->open_mutex ranks above s_umount (blkdev_put() ->
> +		 * invalidate_bdev()). It is safe because we have active sb
> +		 * reference and SB_BORN is not set yet.
>  		 */
>  		up_write(&s->s_umount);
> -		blkdev_put(bdev_handle);
> +		error = setup_bdev_super(s, flags, NULL);
>  		down_write(&s->s_umount);
> -	} else {
> -		snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
> -		shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s",
> -					fs_type->name, s->s_id);
> -		sb_set_blocksize(s, block_size(bdev));
> -		error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
> +		if (!error)
> +			error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
>  		if (error) {
>  			deactivate_locked_super(s);
> -			goto error;
> +			return ERR_PTR(error);
>  		}
>  
>  		s->s_flags |= SB_ACTIVE;
> -		bdev->bd_super = s;
> +		s->s_bdev->bd_super = s;
>  	}
>  
>  	return dget(s->s_root);
> -
> -error_s:
> -	error = PTR_ERR(s);
> -error_bdev:
> -	blkdev_put(bdev_handle);
> -error:
> -	return ERR_PTR(error);
>  }
>  EXPORT_SYMBOL(mount_bdev);
>  
> @@ -1419,10 +1407,12 @@ void kill_block_super(struct super_block *sb)
>  {
>  	struct block_device *bdev = sb->s_bdev;
>  
> -	bdev->bd_super = NULL;
>  	generic_shutdown_super(sb);
> -	sync_blockdev(bdev);
> -	blkdev_put(sb->s_bdev_handle);
> +	if (bdev) {
> +		bdev->bd_super = NULL;
> +		sync_blockdev(bdev);
> +		blkdev_put(sb->s_bdev_handle);
> +	}
>  }
>  
>  EXPORT_SYMBOL(kill_block_super);
> -- 
> 2.35.3
>
Jan Kara July 5, 2023, 1 p.m. UTC | #2
On Tue 04-07-23 15:59:41, Christian Brauner wrote:
> On Tue, Jul 04, 2023 at 02:56:54PM +0200, Jan Kara wrote:
> > When we don't allow opening of mounted block devices for writing, bind
> > mounting is broken because the bind mount tries to open the block device
> 
> Sorry, I'm going to be annoying now...
> 
> Afaict, the analysis is misleading but I'm happy to be corrected ofc.

I'm not sure what your objection exactly is. Probably I was imprecise in my
changelog description. What gets broken by not allowing RW open of a
mounted block device is:

mount -t ext4 /dev/sda1 /mnt1
mount -t ext4 /dev/sda1 /mnt2

The second mount should create another mount of the superblock created by
the first mount but before that is done, get_tree_bdev() tries to open the
block device and fails when only patches 1 & 2 are applied. This patch
fixes that.

> Finding an existing superblock is independent of mounts. get_tree_bdev()
> and mount_bdev() are really only interested in finding a matching
> superblock independent of whether or not a mount for it already exists.
> IOW, if you had two filesystem contexts for the same block device with
> different mount options:
> 
> T1								T2
> fd_fs = fsopen("ext4");						fd_fs = fsopen("ext4");
> fsconfig(fd_fs, FSCONFIG_SET_STRING, "source", "/dev/sda");	fsconfig(fd_fs, FSCONFIG_SET_STRING, "source", "/dev/sda");
> 
> // create superblock
> fsconfig(fd_fs, FSCONFIG_CMD_CREATE, ...)
> 								// finds superblock of T1 if opts are compatible
> 								fsconfig(fd_fs, FSCONFIG_CMD_CREATE, ...)
> 
> you should have the issue that you're describing.

Correct, this will get broken when not allowing RW open for mounted block
devices as well because the second fsconfig(fd_fs, FSCONFIG_CMD_CREATE,
...) will fail to open the block device in get_tree_bdev(). But again this
patch should fix that.

> But for neither of them does a mount already exist as the first mount
> here would only be created when:
> 
> T1								T2
> fsmount(fd_fs);							fsmount(fd_fs);
> 
> is called at which point the whole superblock issue is already settled.
> Afterwards, both mounts of both T1 and T2 refer to the same superblock -
> as long as the fs and the mount options support this ofc.

I guess the confusion comes from me calling "mount" an operation as
performed by the mount(8) command but which is in fact multiple operations
with the new mount API. Anyway, is the motivation of this patch clearer
now?

								Honza
Christian Brauner July 5, 2023, 1:46 p.m. UTC | #3
On Wed, Jul 05, 2023 at 03:00:33PM +0200, Jan Kara wrote:
> On Tue 04-07-23 15:59:41, Christian Brauner wrote:
> > On Tue, Jul 04, 2023 at 02:56:54PM +0200, Jan Kara wrote:
> > > When we don't allow opening of mounted block devices for writing, bind
> > > mounting is broken because the bind mount tries to open the block device
> > 
> > Sorry, I'm going to be annoying now...
> > 
> > Afaict, the analysis is misleading but I'm happy to be corrected ofc.
> 
> I'm not sure what your objection exactly is. Probably I was imprecise in my
> changelog description. What gets broken by not allowing RW open of a
> mounted block device is:
> 
> mount -t ext4 /dev/sda1 /mnt1
> mount -t ext4 /dev/sda1 /mnt2
> 
> The second mount should create another mount of the superblock created by
> the first mount but before that is done, get_tree_bdev() tries to open the
> block device and fails when only patches 1 & 2 are applied. This patch
> fixes that.

My objection is that this has nothing to do with mounts but with
superblocks. :) No mount need exist for this issue to appear. And I would
prefer if we keep superblock and mount separate as this leads to unclear
analysis and changelogs.

> 
> > Finding an existing superblock is independent of mounts. get_tree_bdev()
> > and mount_bdev() are really only interested in finding a matching
> > superblock independent of whether or not a mount for it already exists.
> > IOW, if you had two filesystem contexts for the same block device with
> > different mount options:
> > 
> > T1								T2
> > fd_fs = fsopen("ext4");						fd_fs = fsopen("ext4");
> > fsconfig(fd_fs, FSCONFIG_SET_STRING, "source", "/dev/sda");	fsconfig(fd_fs, FSCONFIG_SET_STRING, "source", "/dev/sda");
> > 
> > // create superblock
> > fsconfig(fd_fs, FSCONFIG_CMD_CREATE, ...)
> > 								// finds superblock of T1 if opts are compatible
> > 								fsconfig(fd_fs, FSCONFIG_CMD_CREATE, ...)
> > 
> > you should have the issue that you're describing.
> 
> Correct, this will get broken when not allowing RW open for mounted block
> devices as well because the second fsconfig(fd_fs, FSCONFIG_CMD_CREATE,
> ...) will fail to open the block device in get_tree_bdev(). But again this
> patch should fix that.
> 
> > But for neither of them does a mount already exist as the first mount
> > here would only be created when:
> > 
> > T1								T2
> > fsmount(fd_fs);							fsmount(fd_fs);
> > 
> > is called at which point the whole superblock issue is already settled.
> > Afterwards, both mounts of both T1 and T2 refer to the same superblock -
> > as long as the fs and the mount options support this ofc.
> 
> I guess the confusion comes from me calling "mount" an operation as
> performed by the mount(8) command but which is in fact multiple operations
> with the new mount API. Anyway, is the motivation of this patch clearer
> now?

I'm clear about what you're doing here. I would just like to not have
mounts brought into the changelog. Even before the new mount api what
you were describing was technically a superblock only issue. If someone
reads the changelog I want them to be able to clearly see that this is a
fix for superblocks, not mounts.

Especially, since the code you touch really only has to to with
superblocks.
Let me - non ironically - return the question: Is my own request clearer
now?
Jan Kara July 5, 2023, 4:14 p.m. UTC | #4
On Wed 05-07-23 15:46:19, Christian Brauner wrote:
> On Wed, Jul 05, 2023 at 03:00:33PM +0200, Jan Kara wrote:
> > On Tue 04-07-23 15:59:41, Christian Brauner wrote:
> > > On Tue, Jul 04, 2023 at 02:56:54PM +0200, Jan Kara wrote:
> > > > When we don't allow opening of mounted block devices for writing, bind
> > > > mounting is broken because the bind mount tries to open the block device
> > > 
> > > Sorry, I'm going to be annoying now...
> > > 
> > > Afaict, the analysis is misleading but I'm happy to be corrected ofc.
> > 
> > I'm not sure what your objection exactly is. Probably I was imprecise in my
> > changelog description. What gets broken by not allowing RW open of a
> > mounted block device is:
> > 
> > mount -t ext4 /dev/sda1 /mnt1
> > mount -t ext4 /dev/sda1 /mnt2
> > 
> > The second mount should create another mount of the superblock created by
> > the first mount but before that is done, get_tree_bdev() tries to open the
> > block device and fails when only patches 1 & 2 are applied. This patch
> > fixes that.
> 
> My objection is that this has nothing to do with mounts but with
> superblocks. :) No mount need exist for this issue to appear. And I would
> prefer if we keep superblock and mount separate as this leads to unclear
> analysis and changelogs.
> 
> > 
> > > Finding an existing superblock is independent of mounts. get_tree_bdev()
> > > and mount_bdev() are really only interested in finding a matching
> > > superblock independent of whether or not a mount for it already exists.
> > > IOW, if you had two filesystem contexts for the same block device with
> > > different mount options:
> > > 
> > > T1								T2
> > > fd_fs = fsopen("ext4");						fd_fs = fsopen("ext4");
> > > fsconfig(fd_fs, FSCONFIG_SET_STRING, "source", "/dev/sda");	fsconfig(fd_fs, FSCONFIG_SET_STRING, "source", "/dev/sda");
> > > 
> > > // create superblock
> > > fsconfig(fd_fs, FSCONFIG_CMD_CREATE, ...)
> > > 								// finds superblock of T1 if opts are compatible
> > > 								fsconfig(fd_fs, FSCONFIG_CMD_CREATE, ...)
> > > 
> > > you should have the issue that you're describing.
> > 
> > Correct, this will get broken when not allowing RW open for mounted block
> > devices as well because the second fsconfig(fd_fs, FSCONFIG_CMD_CREATE,
> > ...) will fail to open the block device in get_tree_bdev(). But again this
> > patch should fix that.
> > 
> > > But for neither of them does a mount already exist as the first mount
> > > here would only be created when:
> > > 
> > > T1								T2
> > > fsmount(fd_fs);							fsmount(fd_fs);
> > > 
> > > is called at which point the whole superblock issue is already settled.
> > > Afterwards, both mounts of both T1 and T2 refer to the same superblock -
> > > as long as the fs and the mount options support this ofc.
> > 
> > I guess the confusion comes from me calling "mount" an operation as
> > performed by the mount(8) command but which is in fact multiple operations
> > with the new mount API. Anyway, is the motivation of this patch clearer
> > now?
> 
> I'm clear about what you're doing here. I would just like to not have
> mounts brought into the changelog. Even before the new mount api what
> you were describing was technically a superblock only issue. If someone
> reads the changelog I want them to be able to clearly see that this is a
> fix for superblocks, not mounts.
> 
> Especially, since the code you touch really only has to to with
> superblocks.
> Let me - non ironically - return the question: Is my own request clearer
> now?

Yes, I understand now :). Thanks for explanation. I'll rephrase the
changelog to speak about superblocks.

								Honza
Christoph Hellwig July 6, 2023, 3:55 p.m. UTC | #5
On Tue, Jul 04, 2023 at 02:56:54PM +0200, Jan Kara wrote:
> When we don't allow opening of mounted block devices for writing, bind
> mounting is broken because the bind mount tries to open the block device
> before finding the superblock for it already exists. Reorganize the
> mounting code to first look whether the superblock for a particular
> device is already mounted and open the block device only if it is not.

Warning: this might be a rathole.

I really hate how mount_bdev / get_tree_bdev try to deal with multiple
mounts.

The idea to just open the device and work from there just feels very
bogus.

There is really no good reason to have the bdev to find a superblock,
the dev_t does just fine (and in fact I have a patch to remove
the bdev based get_super and just use the dev_t based one all the
time).  So I'd really like to actually turn this around and only
open when we need to allocate a new super block.  That probably
means tearning sget_fc apart a bit, so it will turn into a fair
amount of work, but I think it's the right thing to do.
Jan Kara July 6, 2023, 4:12 p.m. UTC | #6
On Thu 06-07-23 08:55:18, Christoph Hellwig wrote:
> On Tue, Jul 04, 2023 at 02:56:54PM +0200, Jan Kara wrote:
> > When we don't allow opening of mounted block devices for writing, bind
> > mounting is broken because the bind mount tries to open the block device
> > before finding the superblock for it already exists. Reorganize the
> > mounting code to first look whether the superblock for a particular
> > device is already mounted and open the block device only if it is not.
> 
> Warning: this might be a rathole.
> 
> I really hate how mount_bdev / get_tree_bdev try to deal with multiple
> mounts.
> 
> The idea to just open the device and work from there just feels very
> bogus.
> 
> There is really no good reason to have the bdev to find a superblock,
> the dev_t does just fine (and in fact I have a patch to remove
> the bdev based get_super and just use the dev_t based one all the
> time).  So I'd really like to actually turn this around and only
> open when we need to allocate a new super block.  That probably
> means tearning sget_fc apart a bit, so it will turn into a fair
> amount of work, but I think it's the right thing to do.

Well, this is exactly what this patch does - we use dev_t to lookup the
superblock in sget_fc() and we open the block device only if we cannot find
matching superblock and need to create a new one...

								Honza
Christian Brauner July 7, 2023, 7:39 a.m. UTC | #7
On Thu, Jul 06, 2023 at 06:12:55PM +0200, Jan Kara wrote:
> On Thu 06-07-23 08:55:18, Christoph Hellwig wrote:
> > On Tue, Jul 04, 2023 at 02:56:54PM +0200, Jan Kara wrote:
> > > When we don't allow opening of mounted block devices for writing, bind
> > > mounting is broken because the bind mount tries to open the block device
> > > before finding the superblock for it already exists. Reorganize the
> > > mounting code to first look whether the superblock for a particular
> > > device is already mounted and open the block device only if it is not.
> > 
> > Warning: this might be a rathole.
> > 
> > I really hate how mount_bdev / get_tree_bdev try to deal with multiple
> > mounts.
> > 
> > The idea to just open the device and work from there just feels very
> > bogus.
> > 
> > There is really no good reason to have the bdev to find a superblock,
> > the dev_t does just fine (and in fact I have a patch to remove
> > the bdev based get_super and just use the dev_t based one all the
> > time).  So I'd really like to actually turn this around and only
> > open when we need to allocate a new super block.  That probably
> > means tearning sget_fc apart a bit, so it will turn into a fair
> > amount of work, but I think it's the right thing to do.
> 
> Well, this is exactly what this patch does - we use dev_t to lookup the
> superblock in sget_fc() and we open the block device only if we cannot find
> matching superblock and need to create a new one...

Can you do this rework independent of the bdev_handle work that you're
doing so this series doesn't depend on the other work and we can get
the VFS bits merged for this?
Jan Kara July 7, 2023, 10:48 a.m. UTC | #8
On Fri 07-07-23 09:39:05, Christian Brauner wrote:
> On Thu, Jul 06, 2023 at 06:12:55PM +0200, Jan Kara wrote:
> > On Thu 06-07-23 08:55:18, Christoph Hellwig wrote:
> > > On Tue, Jul 04, 2023 at 02:56:54PM +0200, Jan Kara wrote:
> > > > When we don't allow opening of mounted block devices for writing, bind
> > > > mounting is broken because the bind mount tries to open the block device
> > > > before finding the superblock for it already exists. Reorganize the
> > > > mounting code to first look whether the superblock for a particular
> > > > device is already mounted and open the block device only if it is not.
> > > 
> > > Warning: this might be a rathole.
> > > 
> > > I really hate how mount_bdev / get_tree_bdev try to deal with multiple
> > > mounts.
> > > 
> > > The idea to just open the device and work from there just feels very
> > > bogus.
> > > 
> > > There is really no good reason to have the bdev to find a superblock,
> > > the dev_t does just fine (and in fact I have a patch to remove
> > > the bdev based get_super and just use the dev_t based one all the
> > > time).  So I'd really like to actually turn this around and only
> > > open when we need to allocate a new super block.  That probably
> > > means tearning sget_fc apart a bit, so it will turn into a fair
> > > amount of work, but I think it's the right thing to do.
> > 
> > Well, this is exactly what this patch does - we use dev_t to lookup the
> > superblock in sget_fc() and we open the block device only if we cannot find
> > matching superblock and need to create a new one...
> 
> Can you do this rework independent of the bdev_handle work that you're
> doing so this series doesn't depend on the other work and we can get
> the VFS bits merged for this?

Yeah, it should be doable. I'll have a look into it.

								Honza
Christoph Hellwig July 7, 2023, 11:30 a.m. UTC | #9
On Thu, Jul 06, 2023 at 06:12:55PM +0200, Jan Kara wrote:
> Well, this is exactly what this patch does - we use dev_t to lookup the
> superblock in sget_fc() and we open the block device only if we cannot find
> matching superblock and need to create a new one...

Hah, that's what you get for writing one last mail before getting on
an airplane without reading the patch.  This just sounded like a
workaround especially being at the end of the series.
Christoph Hellwig July 7, 2023, 11:31 a.m. UTC | #10
On Fri, Jul 07, 2023 at 09:39:05AM +0200, Christian Brauner wrote:
> Can you do this rework independent of the bdev_handle work that you're
> doing so this series doesn't depend on the other work and we can get
> the VFS bits merged for this?

It really should be before it.  I have a few other things related to
it, so if Jan doesn't mind I'd be happy to take it over and post a
series with a version of this and some sget and get_super rework.
Jan Kara July 7, 2023, 12:28 p.m. UTC | #11
On Fri 07-07-23 04:31:07, Christoph Hellwig wrote:
> On Fri, Jul 07, 2023 at 09:39:05AM +0200, Christian Brauner wrote:
> > Can you do this rework independent of the bdev_handle work that you're
> > doing so this series doesn't depend on the other work and we can get
> > the VFS bits merged for this?
> 
> It really should be before it.  I have a few other things related to
> it, so if Jan doesn't mind I'd be happy to take it over and post a
> series with a version of this and some sget and get_super rework.

OK, sure go ahead. I'll be on vacation with my family in the coming weeks
anyway so I won't have time to seriously dwelve into this. So I'll just see
what's the situation once I return.

								Honza
diff mbox series

Patch

diff --git a/fs/super.c b/fs/super.c
index ea135fece772..fdf1e286926e 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1228,13 +1228,7 @@  static const struct blk_holder_ops fs_holder_ops = {
 
 static int set_bdev_super(struct super_block *s, void *data)
 {
-	s->s_bdev_handle = data;
-	s->s_bdev = s->s_bdev_handle->bdev;
-	s->s_dev = s->s_bdev->bd_dev;
-	s->s_bdi = bdi_get(s->s_bdev->bd_disk->bdi);
-
-	if (bdev_stable_writes(s->s_bdev))
-		s->s_iflags |= SB_I_STABLE_WRITES;
+	s->s_dev = *(dev_t *)data;
 	return 0;
 }
 
@@ -1246,7 +1240,53 @@  static int set_bdev_super_fc(struct super_block *s, struct fs_context *fc)
 static int test_bdev_super_fc(struct super_block *s, struct fs_context *fc)
 {
 	return !(s->s_iflags & SB_I_RETIRED) &&
-		s->s_bdev == ((struct bdev_handle *)fc->sget_key)->bdev;
+	       s->s_dev == *(dev_t *)fc->sget_key;
+}
+
+static int setup_bdev_super(struct super_block *s, int sb_flags,
+			    struct fs_context *fc)
+{
+	struct bdev_handle *bdev_handle;
+
+	bdev_handle = blkdev_get_by_dev(s->s_dev, sb_open_mode(sb_flags),
+					s->s_type, &fs_holder_ops);
+	if (IS_ERR(bdev_handle)) {
+		if (fc)
+			errorf(fc, "%s: Can't open blockdev", fc->source);
+		return PTR_ERR(bdev_handle);
+	}
+	spin_lock(&sb_lock);
+	s->s_bdev_handle = bdev_handle;
+	s->s_bdev = bdev_handle->bdev;
+	s->s_bdi = bdi_get(s->s_bdev->bd_disk->bdi);
+
+	if (bdev_stable_writes(s->s_bdev))
+		s->s_iflags |= SB_I_STABLE_WRITES;
+	spin_unlock(&sb_lock);
+
+	/*
+	 * Until SB_BORN flag is set, there can be no active superblock
+ 	 * references and thus no filesystem freezing. get_active_super()
+	 * will just loop waiting for SB_BORN so even freeze_bdev() cannot
+	 * proceed. It is enough to check bdev was not frozen before we set
+	 * s_bdev.
+	 */
+	mutex_lock(&s->s_bdev->bd_fsfreeze_mutex);
+	if (s->s_bdev->bd_fsfreeze_count > 0) {
+		mutex_unlock(&s->s_bdev->bd_fsfreeze_mutex);
+		if (fc)
+			warnf(fc, "%pg: Can't mount, blockdev is frozen",
+			      s->s_bdev);
+		return -EBUSY;
+	}
+	mutex_unlock(&s->s_bdev->bd_fsfreeze_mutex);
+
+	snprintf(s->s_id, sizeof(s->s_id), "%pg", s->s_bdev);
+	shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s",
+				fc->fs_type->name, s->s_id);
+	sb_set_blocksize(s, block_size(s->s_bdev));
+
+	return 0;
 }
 
 /**
@@ -1258,75 +1298,51 @@  int get_tree_bdev(struct fs_context *fc,
 		int (*fill_super)(struct super_block *,
 				  struct fs_context *))
 {
-	struct bdev_handle *bdev_handle;
-	struct block_device *bdev;
+	dev_t dev;
 	struct super_block *s;
 	int error = 0;
 
 	if (!fc->source)
 		return invalf(fc, "No source specified");
 
-	bdev_handle = blkdev_get_by_path(fc->source,
-					 sb_open_mode(fc->sb_flags),
-					 fc->fs_type, &fs_holder_ops);
-	if (IS_ERR(bdev_handle)) {
-		errorf(fc, "%s: Can't open blockdev", fc->source);
-		return PTR_ERR(bdev_handle);
-	}
-	bdev = bdev_handle->bdev;
-
-	/* Once the superblock is inserted into the list by sget_fc(), s_umount
-	 * will protect the lockfs code from trying to start a snapshot while
-	 * we are mounting
-	 */
-	mutex_lock(&bdev->bd_fsfreeze_mutex);
-	if (bdev->bd_fsfreeze_count > 0) {
-		mutex_unlock(&bdev->bd_fsfreeze_mutex);
-		warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev);
-		blkdev_put(bdev_handle);
-		return -EBUSY;
+	error = lookup_bdev(fc->source, &dev);
+	if (error) {
+		errorf(fc, "%s: Can't lookup blockdev", fc->source);
+		return error;
 	}
 
 	fc->sb_flags |= SB_NOSEC;
-	fc->sget_key = bdev_handle;
+	fc->sget_key = &dev;
 	s = sget_fc(fc, test_bdev_super_fc, set_bdev_super_fc);
-	mutex_unlock(&bdev->bd_fsfreeze_mutex);
-	if (IS_ERR(s)) {
-		blkdev_put(bdev_handle);
+	if (IS_ERR(s))
 		return PTR_ERR(s);
-	}
 
 	if (s->s_root) {
 		/* Don't summarily change the RO/RW state. */
 		if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) {
-			warnf(fc, "%pg: Can't mount, would change RO state", bdev);
+			warnf(fc, "%pg: Can't mount, would change RO state", s->s_bdev);
 			deactivate_locked_super(s);
-			blkdev_put(bdev_handle);
 			return -EBUSY;
 		}
-
+	} else {
 		/*
-		 * s_umount nests inside open_mutex during
-		 * __invalidate_device().  blkdev_put() acquires open_mutex and
-		 * can't be called under s_umount.  Drop s_umount temporarily.
-		 * This is safe as we're holding an active reference.
+		 * We drop s_umount here because we need to lookup bdev and
+		 * bdev->open_mutex ranks above s_umount (blkdev_put() ->
+		 * invalidate_bdev()). It is safe because we have active sb
+		 * reference and SB_BORN is not set yet.
 		 */
 		up_write(&s->s_umount);
-		blkdev_put(bdev_handle);
+		error = setup_bdev_super(s, fc->sb_flags, fc);
 		down_write(&s->s_umount);
-	} else {
-		snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
-		shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s",
-					fc->fs_type->name, s->s_id);
-		sb_set_blocksize(s, block_size(bdev));
-		error = fill_super(s, fc);
+		if (!error)
+			error = fill_super(s, fc);
 		if (error) {
 			deactivate_locked_super(s);
 			return error;
 		}
 
 		s->s_flags |= SB_ACTIVE;
-		bdev->bd_super = s;
+		s->s_bdev->bd_super = s;
 	}
 
 	BUG_ON(fc->root);
@@ -1337,81 +1353,53 @@  EXPORT_SYMBOL(get_tree_bdev);
 
 static int test_bdev_super(struct super_block *s, void *data)
 {
-	return !(s->s_iflags & SB_I_RETIRED) &&
-		s->s_bdev == ((struct bdev_handle *)data)->bdev;
+	return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)data;
 }
 
 struct dentry *mount_bdev(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data,
 	int (*fill_super)(struct super_block *, void *, int))
 {
-	struct bdev_handle *bdev_handle;
-	struct block_device *bdev;
 	struct super_block *s;
 	int error = 0;
+	dev_t dev;
 
-	bdev_handle = blkdev_get_by_path(dev_name, sb_open_mode(flags),
-					 fs_type, &fs_holder_ops);
-	if (IS_ERR(bdev_handle))
-		return ERR_CAST(bdev_handle);
-	bdev = bdev_handle->bdev;
+	error = lookup_bdev(dev_name, &dev);
+	if (error)
+		return ERR_PTR(error);
 
-	/*
-	 * once the super is inserted into the list by sget, s_umount
-	 * will protect the lockfs code from trying to start a snapshot
-	 * while we are mounting
-	 */
-	mutex_lock(&bdev->bd_fsfreeze_mutex);
-	if (bdev->bd_fsfreeze_count > 0) {
-		mutex_unlock(&bdev->bd_fsfreeze_mutex);
-		error = -EBUSY;
-		goto error_bdev;
-	}
-	s = sget(fs_type, test_bdev_super, set_bdev_super, flags | SB_NOSEC,
-		 bdev_handle);
-	mutex_unlock(&bdev->bd_fsfreeze_mutex);
+	flags |= SB_NOSEC;
+	s = sget(fs_type, test_bdev_super, set_bdev_super, flags, &dev);
 	if (IS_ERR(s))
-		goto error_s;
+		return ERR_CAST(s);
 
 	if (s->s_root) {
 		if ((flags ^ s->s_flags) & SB_RDONLY) {
 			deactivate_locked_super(s);
-			error = -EBUSY;
-			goto error_bdev;
+			return ERR_PTR(-EBUSY);
 		}
-
+	} else {
 		/*
-		 * s_umount nests inside open_mutex during
-		 * __invalidate_device().  blkdev_put() acquires open_mutex and
-		 * can't be called under s_umount.  Drop s_umount temporarily.
-		 * This is safe as we're holding an active reference.
+		 * We drop s_umount here because we need to lookup bdev and
+		 * bdev->open_mutex ranks above s_umount (blkdev_put() ->
+		 * invalidate_bdev()). It is safe because we have active sb
+		 * reference and SB_BORN is not set yet.
 		 */
 		up_write(&s->s_umount);
-		blkdev_put(bdev_handle);
+		error = setup_bdev_super(s, flags, NULL);
 		down_write(&s->s_umount);
-	} else {
-		snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
-		shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s",
-					fs_type->name, s->s_id);
-		sb_set_blocksize(s, block_size(bdev));
-		error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
+		if (!error)
+			error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
 		if (error) {
 			deactivate_locked_super(s);
-			goto error;
+			return ERR_PTR(error);
 		}
 
 		s->s_flags |= SB_ACTIVE;
-		bdev->bd_super = s;
+		s->s_bdev->bd_super = s;
 	}
 
 	return dget(s->s_root);
-
-error_s:
-	error = PTR_ERR(s);
-error_bdev:
-	blkdev_put(bdev_handle);
-error:
-	return ERR_PTR(error);
 }
 EXPORT_SYMBOL(mount_bdev);
 
@@ -1419,10 +1407,12 @@  void kill_block_super(struct super_block *sb)
 {
 	struct block_device *bdev = sb->s_bdev;
 
-	bdev->bd_super = NULL;
 	generic_shutdown_super(sb);
-	sync_blockdev(bdev);
-	blkdev_put(sb->s_bdev_handle);
+	if (bdev) {
+		bdev->bd_super = NULL;
+		sync_blockdev(bdev);
+		blkdev_put(sb->s_bdev_handle);
+	}
 }
 
 EXPORT_SYMBOL(kill_block_super);