diff mbox series

[13/14] loop: remove lo_refcount and avoid lo_mutex in ->open / ->release

Message ID 20220325063929.1773899-14-hch@lst.de (mailing list archive)
State New, archived
Headers show
Series [01/14] nbd: use the correct block_device in nbd_bdev_reset | expand

Commit Message

Christoph Hellwig March 25, 2022, 6:39 a.m. UTC
lo_refcount counts how many openers a loop device has, but that count
is already provided by the block layer in the bd_openers field of the
whole-disk block_device.  Remove lo_refcount and allow opens to
succeed even on devices beeing deleted - now that ->free_disk is
implemented we can handle that race gracefull and all I/O on it will
just fail. Similarly there is a small race window now where
loop_control_remove does not synchronize the delete vs the remove
due do bd_openers not being under lo_mutex protection, but we can
handle that just as gracefully.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 drivers/block/loop.c | 37 +++++++------------------------------
 drivers/block/loop.h |  1 -
 2 files changed, 7 insertions(+), 31 deletions(-)

Comments

Tetsuo Handa March 26, 2022, 2:52 a.m. UTC | #1
Since __loop_clr_fd() is currently holding loop_validate_mutex and lo->lo_mutex,
"avoid lo_mutex in ->release" part is incomplete.

The intent of holding loop_validate_mutex (which causes disk->open_mutex =>
loop_validate_mutex => lo->lo_mutex => blk_mq_freeze_queue()/blk_mq_unfreeze_queue()
chain) is to make loop_validate_file() traversal safe.

Since ->release is called with disk->open_mutex held, and __loop_clr_fd() from
lo_release() is called via ->release when disk_openers() == 0, we are guaranteed
that "struct file" which will be passed to loop_validate_file() via fget() cannot
be the loop device __loop_clr_fd(lo, true) will clear. Thus, there is no need to
hold loop_validate_mutex and lo->lo_mutex from __loop_clr_fd() if release == true.

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 2506193a4fd1..d47f3d86dd55 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1136,25 +1136,26 @@ static void __loop_clr_fd(struct loop_device *lo, bool release)
 	gfp_t gfp = lo->old_gfp_mask;
 
 	/*
-	 * Flush loop_configure() and loop_change_fd(). It is acceptable for
-	 * loop_validate_file() to succeed, for actual clear operation has not
-	 * started yet.
-	 */
-	mutex_lock(&loop_validate_mutex);
-	mutex_unlock(&loop_validate_mutex);
-	/*
-	 * loop_validate_file() now fails because l->lo_state != Lo_bound
-	 * became visible.
+	 * Make sure that Lo_rundown state becomes visible to loop_configure()
+	 * and loop_change_fd(). When called from ->release, we are guaranteed
+	 * that the "struct file" which loop_configure()/loop_change_fd() found
+	 * via fget() is not this loop device.
 	 */
+	if (!release) {
+		mutex_lock(&loop_validate_mutex);
+		mutex_unlock(&loop_validate_mutex);
+	}
 
 	/*
-	 * Since this function is called upon "ioctl(LOOP_CLR_FD)" xor "close()
-	 * after ioctl(LOOP_CLR_FD)", it is a sign of something going wrong if
-	 * lo->lo_state has changed while waiting for lo->lo_mutex.
+	 * It is a sign of something going wrong if lo->lo_state has changed
+	 * while waiting for lo->lo_mutex. When called from ->release, we are
+	 * guaranteed that the nobody is using this loop device.
 	 */
-	mutex_lock(&lo->lo_mutex);
-	BUG_ON(lo->lo_state != Lo_rundown);
-	mutex_unlock(&lo->lo_mutex);
+	if (!release) {
+		mutex_lock(&lo->lo_mutex);
+		BUG_ON(lo->lo_state != Lo_rundown);
+		mutex_unlock(&lo->lo_mutex);
+	}
 
 	if (test_bit(QUEUE_FLAG_WC, &lo->lo_queue->queue_flags))
 		blk_queue_write_cache(lo->lo_queue, false, false);

But thinking again about release == false case, which I wrote "It is acceptable
for loop_validate_file() to succeed, for actual clear operation has not started
yet.", I came to feel why it is acceptable to succeed.

Even if loop_validate_file() was safely traversed due to serialization via
loop_validate_mutex, I/O requests after loop_configure()/loop_change_fd() completed
will fail. Is this behavior what we want?

If we don't want I/O requests after loop_configure()/loop_change_fd() completed
fail due to __loop_clr_fd(), it is not acceptable for loop_validate_file() to
succeed. We should hold loop_validate_mutex before setting Lo_rundown in order to
make sure that loop_validate_file() will see Lo_rundown state. That is, something
like below will be expected?

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 2506193a4fd1..a4ff94ca654f 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1135,27 +1135,6 @@ static void __loop_clr_fd(struct loop_device *lo, bool release)
 	struct file *filp;
 	gfp_t gfp = lo->old_gfp_mask;
 
-	/*
-	 * Flush loop_configure() and loop_change_fd(). It is acceptable for
-	 * loop_validate_file() to succeed, for actual clear operation has not
-	 * started yet.
-	 */
-	mutex_lock(&loop_validate_mutex);
-	mutex_unlock(&loop_validate_mutex);
-	/*
-	 * loop_validate_file() now fails because l->lo_state != Lo_bound
-	 * became visible.
-	 */
-
-	/*
-	 * Since this function is called upon "ioctl(LOOP_CLR_FD)" xor "close()
-	 * after ioctl(LOOP_CLR_FD)", it is a sign of something going wrong if
-	 * lo->lo_state has changed while waiting for lo->lo_mutex.
-	 */
-	mutex_lock(&lo->lo_mutex);
-	BUG_ON(lo->lo_state != Lo_rundown);
-	mutex_unlock(&lo->lo_mutex);
-
 	if (test_bit(QUEUE_FLAG_WC, &lo->lo_queue->queue_flags))
 		blk_queue_write_cache(lo->lo_queue, false, false);
 
@@ -1238,11 +1217,18 @@ static int loop_clr_fd(struct loop_device *lo)
 {
 	int err;
 
-	err = mutex_lock_killable(&lo->lo_mutex);
+	/*
+	 * Use global lock when setting Lo_rundown state in order to make sure
+	 * that loop_validate_file() will fail if the "struct file" which
+	 * loop_configure()/loop_change_fd() found via fget() was this loop
+	 * device. The disk_openers(lo->lo_disk) > 1 test below guarantees that
+	 * fget() did not return this loop device.
+	 */
+	err = loop_global_lock_killable(lo, true);
 	if (err)
 		return err;
 	if (lo->lo_state != Lo_bound) {
-		mutex_unlock(&lo->lo_mutex);
+		loop_global_unlock(lo, true);
 		return -ENXIO;
 	}
 	/*
@@ -1257,11 +1243,11 @@ static int loop_clr_fd(struct loop_device *lo)
 	 */
 	if (disk_openers(lo->lo_disk) > 1) {
 		lo->lo_flags |= LO_FLAGS_AUTOCLEAR;
-		mutex_unlock(&lo->lo_mutex);
+		loop_global_unlock(lo, true);
 		return 0;
 	}
 	lo->lo_state = Lo_rundown;
-	mutex_unlock(&lo->lo_mutex);
+	loop_global_unlock(lo, true);
 
 	__loop_clr_fd(lo, false);
 	return 0;
Christoph Hellwig March 29, 2022, 6:52 a.m. UTC | #2
Hi Tetsuo,

I'm a bit confused. Partially due to the two patches in one mail, but
also because I can't actually find a try that they cleanly apply to.

But let me add my thoughts here:

 - I think the change 

On Sat, Mar 26, 2022 at 11:52:36AM +0900, Tetsuo Handa wrote:
> Since __loop_clr_fd() is currently holding loop_validate_mutex and lo->lo_mutex,
> "avoid lo_mutex in ->release" part is incomplete.
> 
> The intent of holding loop_validate_mutex (which causes disk->open_mutex =>
> loop_validate_mutex => lo->lo_mutex => blk_mq_freeze_queue()/blk_mq_unfreeze_queue()
> chain) is to make loop_validate_file() traversal safe.
> 
> Since ->release is called with disk->open_mutex held, and __loop_clr_fd() from
> lo_release() is called via ->release when disk_openers() == 0, we are guaranteed
> that "struct file" which will be passed to loop_validate_file() via fget() cannot
> be the loop device __loop_clr_fd(lo, true) will clear. Thus, there is no need to
> hold loop_validate_mutex and lo->lo_mutex from __loop_clr_fd() if release == true.

This part looks reasonable.  Can you give a signoff and proper commit
log and I'll slot it in before the lo_refcount removal.
Christoph Hellwig March 29, 2022, 1:25 p.m. UTC | #3
Thinking a bit more, I really don't think the existing any refcount
check protects us against a different tread modififying the backing
file.  When a process has a file descriptor to a loop device open
and is multithreaded (or forks) we can still have multiple threads
manipulating the loop state.

That being said I do not think we really need that refcount check
at all - once loop_clr_fd set lo->lo_state to Lo_rundown under the
global lock we know that loop_validate_file will error out on it
due to the lo_state != Lo_bound check.
Tetsuo Handa March 29, 2022, 2:02 p.m. UTC | #4
On 2022/03/29 22:25, Christoph Hellwig wrote:
> Thinking a bit more, I really don't think the existing any refcount
> check protects us against a different tread modifying the backing
> file.  When a process has a file descriptor to a loop device open
> and is multithreaded (or forks) we can still have multiple threads
> manipulating the loop state.

Yes, I came to that answer. But

> 
> That being said I do not think we really need that refcount check
> at all - once loop_clr_fd set lo->lo_state to Lo_rundown under the
> global lock we know that loop_validate_file will error out on it
> due to the lo_state != Lo_bound check.

if I think about non "a file descriptor to a loop device" case, I
came to the opposite answer. Rather, shouldn't we check the refcount
strictly like below?

[PATCH] loop: avoid loop_validate_mutex/lo_mutex in ->release

Since ->release is called with disk->open_mutex held, and __loop_clr_fd()
 from lo_release() is called via ->release when disk_openers() == 0, we are
guaranteed that "struct file" which will be passed to loop_validate_file()
via fget() cannot be the loop device __loop_clr_fd(lo, true) will clear.
Thus, there is no need to hold loop_validate_mutex from __loop_clr_fd()
if release == true.

When I made commit 3ce6e1f662a91097 ("loop: reintroduce global lock for
safe loop_validate_file() traversal"), I wrote "It is acceptable for
loop_validate_file() to succeed, for actual clear operation has not started
yet.". But now I came to feel why it is acceptable to succeed.

It seems that the loop driver was added in Linux 1.3.68, and

  if (lo->lo_refcnt > 1)
    return -EBUSY;

check in loop_clr_fd() was there from the beginning. The intent of this
check was unclear. But now I think that current

  disk_openers(lo->lo_disk) > 1

form is there for three reasons.

(1) Avoid I/O errors when some process which opens and reads from this
    loop device in response to uevent notification (e.g. systemd-udevd),
    as described in commit a1ecac3b0656a682 ("loop: Make explicit loop
    device destruction lazy"). This opener is short-lived because it is
    likely that the file descriptor used by that process is closed soon.

(2) Avoid I/O errors caused by underlying layer of stacked loop devices
    (i.e. ioctl(some_loop_fd, LOOP_SET_FD, other_loop_fd)) being suddenly
    disappeared. This opener is long-lived because this reference is
    associated with not a file descriptor but lo->lo_backing_file.

(3) Avoid I/O errors caused by underlying layer of mounted loop device
    (i.e. mount(some_loop_device, some_mount_point)) being suddenly
    disappeared. This opener is long-lived because this reference is
    associated with not a file descriptor but mount.

While race in (1) might be acceptable, (2) and (3) should be checked
racelessly. That is, make sure that __loop_clr_fd() will not run if
loop_validate_file() succeeds, by doing refcount check with global lock
held when explicit loop device destruction is requested.

As a result of no longer waiting for lo->lo_mutex after setting Lo_rundown,
we can remove pointless BUG_ON(lo->lo_state != Lo_rundown) check.

Not-yet-signed-off. ;-)
---
 drivers/block/loop.c | 38 +++++++++++++-------------------------
 1 file changed, 13 insertions(+), 25 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 2506193a4fd1..6b813c592159 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1135,27 +1135,6 @@ static void __loop_clr_fd(struct loop_device *lo, bool release)
 	struct file *filp;
 	gfp_t gfp = lo->old_gfp_mask;
 
-	/*
-	 * Flush loop_configure() and loop_change_fd(). It is acceptable for
-	 * loop_validate_file() to succeed, for actual clear operation has not
-	 * started yet.
-	 */
-	mutex_lock(&loop_validate_mutex);
-	mutex_unlock(&loop_validate_mutex);
-	/*
-	 * loop_validate_file() now fails because l->lo_state != Lo_bound
-	 * became visible.
-	 */
-
-	/*
-	 * Since this function is called upon "ioctl(LOOP_CLR_FD)" xor "close()
-	 * after ioctl(LOOP_CLR_FD)", it is a sign of something going wrong if
-	 * lo->lo_state has changed while waiting for lo->lo_mutex.
-	 */
-	mutex_lock(&lo->lo_mutex);
-	BUG_ON(lo->lo_state != Lo_rundown);
-	mutex_unlock(&lo->lo_mutex);
-
 	if (test_bit(QUEUE_FLAG_WC, &lo->lo_queue->queue_flags))
 		blk_queue_write_cache(lo->lo_queue, false, false);
 
@@ -1238,11 +1217,20 @@ static int loop_clr_fd(struct loop_device *lo)
 {
 	int err;
 
-	err = mutex_lock_killable(&lo->lo_mutex);
+	/*
+	 * Since lo_ioctl() is called without locks held, it is possible that
+	 * loop_configure()/loop_change_fd() and loop_clr_fd() run in parallel.
+	 *
+	 * Therefore, use global lock when setting Lo_rundown state in order to
+	 * make sure that loop_validate_file() will fail if the "struct file"
+	 * which loop_configure()/loop_change_fd() found via fget() was this
+	 * loop device.
+	 */
+	err = loop_global_lock_killable(lo, true);
 	if (err)
 		return err;
 	if (lo->lo_state != Lo_bound) {
-		mutex_unlock(&lo->lo_mutex);
+		loop_global_unlock(lo, true);
 		return -ENXIO;
 	}
 	/*
@@ -1257,11 +1245,11 @@ static int loop_clr_fd(struct loop_device *lo)
 	 */
 	if (disk_openers(lo->lo_disk) > 1) {
 		lo->lo_flags |= LO_FLAGS_AUTOCLEAR;
-		mutex_unlock(&lo->lo_mutex);
+		loop_global_unlock(lo, true);
 		return 0;
 	}
 	lo->lo_state = Lo_rundown;
-	mutex_unlock(&lo->lo_mutex);
+	loop_global_unlock(lo, true);
 
 	__loop_clr_fd(lo, false);
 	return 0;
Christoph Hellwig March 29, 2022, 2:49 p.m. UTC | #5
On Tue, Mar 29, 2022 at 11:02:15PM +0900, Tetsuo Handa wrote:
> It seems that the loop driver was added in Linux 1.3.68, and
> 
>   if (lo->lo_refcnt > 1)
>     return -EBUSY;
> 
> check in loop_clr_fd() was there from the beginning. The intent of this
> check was unclear.

Yes.

> But now I think that current
> 
>   disk_openers(lo->lo_disk) > 1
> 
> form is there for three reasons.
> 
> (1) Avoid I/O errors when some process which opens and reads from this
>     loop device in response to uevent notification (e.g. systemd-udevd),
>     as described in commit a1ecac3b0656a682 ("loop: Make explicit loop
>     device destruction lazy"). This opener is short-lived because it is
>     likely that the file descriptor used by that process is closed soon.

Well.  With the the uevent supression in the current series there won't
be uevents until the capacity has been set to 0.  More importantly
anything that listens to theses kinds of uevents needs to be able to
deal with I/O errors like this.

> (2) Avoid I/O errors caused by underlying layer of stacked loop devices
>     (i.e. ioctl(some_loop_fd, LOOP_SET_FD, other_loop_fd)) being suddenly
>     disappeared. This opener is long-lived because this reference is
>     associated with not a file descriptor but lo->lo_backing_file.

Again, if you clear the FD expecting I/O errors is the logical consequence.
This is like saying we should work around seeing I/O errors when hot
removing a physical device.

> (3) Avoid I/O errors caused by underlying layer of mounted loop device
>     (i.e. mount(some_loop_device, some_mount_point)) being suddenly
>     disappeared. This opener is long-lived because this reference is
>     associated with not a file descriptor but mount.

Same I/O error story.  If you hot remove a nvme SSD you do expect
error in the file system.  This is a pretty clear action -> consequence
relation.

> While race in (1) might be acceptable, (2) and (3) should be checked
> racelessly. That is, make sure that __loop_clr_fd() will not run if
> loop_validate_file() succeeds, by doing refcount check with global lock
> held when explicit loop device destruction is requested.
> 
> As a result of no longer waiting for lo->lo_mutex after setting Lo_rundown,
> we can remove pointless BUG_ON(lo->lo_state != Lo_rundown) check.

I really do like this patch.  And I think based on your description that
we both agree that the disk_openers check is not needed for functional
correctness as a malicious userspace can do concurrent operations even
without the openers check.  You want a protection against "I/O errors"
when the FD is cleared on a live device, and with your patch we get
that with the disk_openers check.  I'm perfectly fine with that state
for this series as it keeps the status quo.  I just think this check
that goes all the way back is actually a really bad idea that just
provides some false security.  But that isn't something we need to
discuss here and now.
diff mbox series

Patch

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index bfd21af7aa38b..8ad8cfffdcbdc 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1261,7 +1261,7 @@  static int loop_clr_fd(struct loop_device *lo)
 	 * <dev>/do something like mkfs/losetup -d <dev> causing the losetup -d
 	 * command to fail with EBUSY.
 	 */
-	if (atomic_read(&lo->lo_refcnt) > 1) {
+	if (disk_openers(lo->lo_disk) > 1) {
 		lo->lo_flags |= LO_FLAGS_AUTOCLEAR;
 		mutex_unlock(&lo->lo_mutex);
 		return 0;
@@ -1741,33 +1741,15 @@  static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode,
 }
 #endif
 
-static int lo_open(struct block_device *bdev, fmode_t mode)
-{
-	struct loop_device *lo = bdev->bd_disk->private_data;
-	int err;
-
-	err = mutex_lock_killable(&lo->lo_mutex);
-	if (err)
-		return err;
-	if (lo->lo_state == Lo_deleting)
-		err = -ENXIO;
-	else
-		atomic_inc(&lo->lo_refcnt);
-	mutex_unlock(&lo->lo_mutex);
-	return err;
-}
-
 static void lo_release(struct gendisk *disk, fmode_t mode)
 {
 	struct loop_device *lo = disk->private_data;
 
-	mutex_lock(&lo->lo_mutex);
-	if (atomic_dec_return(&lo->lo_refcnt))
-		goto out_unlock;
+	if (disk_openers(disk) > 0)
+		return;
 
-	if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) {
-		if (lo->lo_state != Lo_bound)
-			goto out_unlock;
+	mutex_lock(&lo->lo_mutex);
+	if (lo->lo_state == Lo_bound && (lo->lo_flags & LO_FLAGS_AUTOCLEAR)) {
 		lo->lo_state = Lo_rundown;
 		mutex_unlock(&lo->lo_mutex);
 		/*
@@ -1777,8 +1759,6 @@  static void lo_release(struct gendisk *disk, fmode_t mode)
 		__loop_clr_fd(lo, true);
 		return;
 	}
-
-out_unlock:
 	mutex_unlock(&lo->lo_mutex);
 }
 
@@ -1792,7 +1772,6 @@  static void lo_free_disk(struct gendisk *disk)
 
 static const struct block_device_operations lo_fops = {
 	.owner =	THIS_MODULE,
-	.open =		lo_open,
 	.release =	lo_release,
 	.ioctl =	lo_ioctl,
 #ifdef CONFIG_COMPAT
@@ -2046,7 +2025,6 @@  static int loop_add(int i)
 	 */
 	if (!part_shift)
 		disk->flags |= GENHD_FL_NO_PART;
-	atomic_set(&lo->lo_refcnt, 0);
 	mutex_init(&lo->lo_mutex);
 	lo->lo_number		= i;
 	spin_lock_init(&lo->lo_lock);
@@ -2136,13 +2114,12 @@  static int loop_control_remove(int idx)
 	ret = mutex_lock_killable(&lo->lo_mutex);
 	if (ret)
 		goto mark_visible;
-	if (lo->lo_state != Lo_unbound ||
-	    atomic_read(&lo->lo_refcnt) > 0) {
+	if (lo->lo_state != Lo_unbound || disk_openers(lo->lo_disk) > 0) {
 		mutex_unlock(&lo->lo_mutex);
 		ret = -EBUSY;
 		goto mark_visible;
 	}
-	/* Mark this loop device no longer open()-able. */
+	/* Mark this loop device as no more bound, but not quite unbound yet */
 	lo->lo_state = Lo_deleting;
 	mutex_unlock(&lo->lo_mutex);
 
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index 082d4b6bfc6a6..449d562738c52 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -28,7 +28,6 @@  struct loop_func_table;
 
 struct loop_device {
 	int		lo_number;
-	atomic_t	lo_refcnt;
 	loff_t		lo_offset;
 	loff_t		lo_sizelimit;
 	int		lo_flags;