loop: avoid EAGAIN, if offset or block_size are changed

Message ID	20190518004751.18962-1-jaegeuk@kernel.org (mailing list archive)
State	New, archived
Headers	show Return-Path: <linux-block-owner@kernel.org> From: Jaegeuk Kim <jaegeuk@kernel.org> To: linux-kernel@vger.kernel.org, linux-f2fs-devel@lists.sourceforge.net Cc: Jaegeuk Kim <jaegeuk@kernel.org>, stable@vger.kernel.org, Jens Axboe <axboe@kernel.dk>, linux-block@vger.kernel.org, Bart Van Assche <bvanassche@acm.org> Subject: [PATCH] loop: avoid EAGAIN, if offset or block_size are changed Date: Fri, 17 May 2019 17:47:51 -0700 Message-Id: <20190518004751.18962-1-jaegeuk@kernel.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Sender: linux-block-owner@vger.kernel.org Precedence: bulk
Series	loop: avoid EAGAIN, if offset or block_size are changed \| expand loop: avoid EAGAIN, if offset or block_size are changed

Jaegeuk Kim May 18, 2019, 12:47 a.m. UTC

This patch tries to avoid EAGAIN due to nrpages!=0 that was originally trying
to drop stale pages resulting in wrong data access.

Report: https://bugs.chromium.org/p/chromium/issues/detail?id=938958#c38

Cc: <stable@vger.kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: linux-block@vger.kernel.org
Cc: Bart Van Assche <bvanassche@acm.org>
Fixes: 5db470e229e2 ("loop: drop caches if offset or block_size are changed")
Reported-by: Gwendal Grignou <gwendal@chromium.org>
Reported-by: grygorii tertychnyi <gtertych@cisco.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 drivers/block/loop.c | 44 +++++++++++++++++---------------------------
 1 file changed, 17 insertions(+), 27 deletions(-)

Jaegeuk Kim May 18, 2019, 12:53 a.m. UTC | #1

This patch tries to avoid EAGAIN due to nrpages!=0 that was originally trying
to drop stale pages resulting in wrong data access.

Report: https://bugs.chromium.org/p/chromium/issues/detail?id=938958#c38

Cc: <stable@vger.kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: linux-block@vger.kernel.org
Cc: Bart Van Assche <bvanassche@acm.org>
Fixes: 5db470e229e2 ("loop: drop caches if offset or block_size are changed")
Reported-by: Gwendal Grignou <gwendal@chromium.org>
Reported-by: grygorii tertychnyi <gtertych@cisco.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
v2 from v1:
 - remove obsolete jump

 drivers/block/loop.c | 45 +++++++++++++++++---------------------------
 1 file changed, 17 insertions(+), 28 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 102d79575895..42994de2dd12 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1212,6 +1212,7 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 	kuid_t uid = current_uid();
 	struct block_device *bdev;
 	bool partscan = false;
+	bool drop_caches = false;
 
 	err = mutex_lock_killable(&loop_ctl_mutex);
 	if (err)
@@ -1232,10 +1233,8 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 	}
 
 	if (lo->lo_offset != info->lo_offset ||
-	    lo->lo_sizelimit != info->lo_sizelimit) {
-		sync_blockdev(lo->lo_device);
-		kill_bdev(lo->lo_device);
-	}
+	    lo->lo_sizelimit != info->lo_sizelimit)
+		drop_caches = true;
 
 	/* I/O need to be drained during transfer transition */
 	blk_mq_freeze_queue(lo->lo_queue);
@@ -1265,14 +1264,6 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 
 	if (lo->lo_offset != info->lo_offset ||
 	    lo->lo_sizelimit != info->lo_sizelimit) {
-		/* kill_bdev should have truncated all the pages */
-		if (lo->lo_device->bd_inode->i_mapping->nrpages) {
-			err = -EAGAIN;
-			pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n",
-				__func__, lo->lo_number, lo->lo_file_name,
-				lo->lo_device->bd_inode->i_mapping->nrpages);
-			goto out_unfreeze;
-		}
 		if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit)) {
 			err = -EFBIG;
 			goto out_unfreeze;
@@ -1317,6 +1308,12 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 		bdev = lo->lo_device;
 		partscan = true;
 	}
+
+	/* truncate stale pages cached by previous operations */
+	if (!err && drop_caches) {
+		sync_blockdev(lo->lo_device);
+		kill_bdev(lo->lo_device);
+	}
 out_unlock:
 	mutex_unlock(&loop_ctl_mutex);
 	if (partscan)
@@ -1498,6 +1495,7 @@ static int loop_set_dio(struct loop_device *lo, unsigned long arg)
 
 static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
 {
+	bool drop_caches = false;
 	int err = 0;
 
 	if (lo->lo_state != Lo_bound)
@@ -1506,30 +1504,21 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
 	if (arg < 512 || arg > PAGE_SIZE || !is_power_of_2(arg))
 		return -EINVAL;
 
-	if (lo->lo_queue->limits.logical_block_size != arg) {
-		sync_blockdev(lo->lo_device);
-		kill_bdev(lo->lo_device);
-	}
+	if (lo->lo_queue->limits.logical_block_size != arg)
+		drop_caches = true;
 
 	blk_mq_freeze_queue(lo->lo_queue);
-
-	/* kill_bdev should have truncated all the pages */
-	if (lo->lo_queue->limits.logical_block_size != arg &&
-			lo->lo_device->bd_inode->i_mapping->nrpages) {
-		err = -EAGAIN;
-		pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n",
-			__func__, lo->lo_number, lo->lo_file_name,
-			lo->lo_device->bd_inode->i_mapping->nrpages);
-		goto out_unfreeze;
-	}
-
 	blk_queue_logical_block_size(lo->lo_queue, arg);
 	blk_queue_physical_block_size(lo->lo_queue, arg);
 	blk_queue_io_min(lo->lo_queue, arg);
 	loop_update_dio(lo);
-out_unfreeze:
 	blk_mq_unfreeze_queue(lo->lo_queue);
 
+	/* truncate stale pages cached by previous operations */
+	if (drop_caches) {
+		sync_blockdev(lo->lo_device);
+		kill_bdev(lo->lo_device);
+	}
 	return err;
 }

Jaegeuk Kim June 17, 2019, 9:08 p.m. UTC | #2

Jens,

Any chance to get a review for this?

(Added Tested-by:)

On 05/17, Jaegeuk Kim wrote:
> This patch tries to avoid EAGAIN due to nrpages!=0 that was originally trying
> to drop stale pages resulting in wrong data access.
> 
> Report: https://bugs.chromium.org/p/chromium/issues/detail?id=938958#c38
> 
> Cc: <stable@vger.kernel.org>
> Cc: Jens Axboe <axboe@kernel.dk>
> Cc: linux-block@vger.kernel.org
> Cc: Bart Van Assche <bvanassche@acm.org>
> Fixes: 5db470e229e2 ("loop: drop caches if offset or block_size are changed")
> Reported-by: Gwendal Grignou <gwendal@chromium.org>
> Reported-by: grygorii tertychnyi <gtertych@cisco.com>

Tested-by: Francesco Ruggeri <fruggeri@arista.com>

> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
> ---
> v2 from v1:
>  - remove obsolete jump
> 
>  drivers/block/loop.c | 45 +++++++++++++++++---------------------------
>  1 file changed, 17 insertions(+), 28 deletions(-)
> 
> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index 102d79575895..42994de2dd12 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -1212,6 +1212,7 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
>  	kuid_t uid = current_uid();
>  	struct block_device *bdev;
>  	bool partscan = false;
> +	bool drop_caches = false;
>  
>  	err = mutex_lock_killable(&loop_ctl_mutex);
>  	if (err)
> @@ -1232,10 +1233,8 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
>  	}
>  
>  	if (lo->lo_offset != info->lo_offset ||
> -	    lo->lo_sizelimit != info->lo_sizelimit) {
> -		sync_blockdev(lo->lo_device);
> -		kill_bdev(lo->lo_device);
> -	}
> +	    lo->lo_sizelimit != info->lo_sizelimit)
> +		drop_caches = true;
>  
>  	/* I/O need to be drained during transfer transition */
>  	blk_mq_freeze_queue(lo->lo_queue);
> @@ -1265,14 +1264,6 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
>  
>  	if (lo->lo_offset != info->lo_offset ||
>  	    lo->lo_sizelimit != info->lo_sizelimit) {
> -		/* kill_bdev should have truncated all the pages */
> -		if (lo->lo_device->bd_inode->i_mapping->nrpages) {
> -			err = -EAGAIN;
> -			pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n",
> -				__func__, lo->lo_number, lo->lo_file_name,
> -				lo->lo_device->bd_inode->i_mapping->nrpages);
> -			goto out_unfreeze;
> -		}
>  		if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit)) {
>  			err = -EFBIG;
>  			goto out_unfreeze;
> @@ -1317,6 +1308,12 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
>  		bdev = lo->lo_device;
>  		partscan = true;
>  	}
> +
> +	/* truncate stale pages cached by previous operations */
> +	if (!err && drop_caches) {
> +		sync_blockdev(lo->lo_device);
> +		kill_bdev(lo->lo_device);
> +	}
>  out_unlock:
>  	mutex_unlock(&loop_ctl_mutex);
>  	if (partscan)
> @@ -1498,6 +1495,7 @@ static int loop_set_dio(struct loop_device *lo, unsigned long arg)
>  
>  static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
>  {
> +	bool drop_caches = false;
>  	int err = 0;
>  
>  	if (lo->lo_state != Lo_bound)
> @@ -1506,30 +1504,21 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
>  	if (arg < 512 || arg > PAGE_SIZE || !is_power_of_2(arg))
>  		return -EINVAL;
>  
> -	if (lo->lo_queue->limits.logical_block_size != arg) {
> -		sync_blockdev(lo->lo_device);
> -		kill_bdev(lo->lo_device);
> -	}
> +	if (lo->lo_queue->limits.logical_block_size != arg)
> +		drop_caches = true;
>  
>  	blk_mq_freeze_queue(lo->lo_queue);
> -
> -	/* kill_bdev should have truncated all the pages */
> -	if (lo->lo_queue->limits.logical_block_size != arg &&
> -			lo->lo_device->bd_inode->i_mapping->nrpages) {
> -		err = -EAGAIN;
> -		pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n",
> -			__func__, lo->lo_number, lo->lo_file_name,
> -			lo->lo_device->bd_inode->i_mapping->nrpages);
> -		goto out_unfreeze;
> -	}
> -
>  	blk_queue_logical_block_size(lo->lo_queue, arg);
>  	blk_queue_physical_block_size(lo->lo_queue, arg);
>  	blk_queue_io_min(lo->lo_queue, arg);
>  	loop_update_dio(lo);
> -out_unfreeze:
>  	blk_mq_unfreeze_queue(lo->lo_queue);
>  
> +	/* truncate stale pages cached by previous operations */
> +	if (drop_caches) {
> +		sync_blockdev(lo->lo_device);
> +		kill_bdev(lo->lo_device);
> +	}
>  	return err;
>  }
>  
> -- 
> 2.19.0.605.g01d371f741-goog
> 
> 
> 
> _______________________________________________
> Linux-f2fs-devel mailing list
> Linux-f2fs-devel@lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

Andrew Norrie Nov. 18, 2019, 6:36 p.m. UTC | #3

Any chance to get a review on this patch?

We're running Singularity containers in an mpi environment where multiple
concurrent container image loop mounts occur and we hit this issue as reported
to Sylabs by us here:

https://github.com/sylabs/singularity/issues/3860

It is affecting our production.
This email and any accompanying attachments are confidential. If you received this email by mistake, please delete
it from your system. Any review, disclosure, copying, distribution, or use of the email by others is strictly prohibited.

Greg Kroah-Hartman Nov. 19, 2019, 4 a.m. UTC | #4

On Mon, Nov 18, 2019 at 11:36:16AM -0700, Andrew Norrie wrote:
> This email and any accompanying attachments are confidential. If you received this email by mistake, please delete
> it from your system. Any review, disclosure, copying, distribution, or use of the email by others is strictly prohibited.

Now deleted.  This is not compatible with Linux mailing lists, sorry.

Bart Van Assche Nov. 19, 2019, 11:40 p.m. UTC | #5

On 5/17/19 5:53 PM, Jaegeuk Kim wrote:
> This patch tries to avoid EAGAIN due to nrpages!=0 that was originally trying
> to drop stale pages resulting in wrong data access.
> 
> Report: https://bugs.chromium.org/p/chromium/issues/detail?id=938958#c38

Please provide a more detailed commit description. What is wrong with 
the current implementation and why is the new behavior considered the 
correct behavior?

This patch moves draining code from before the following comment to 
after that comment:

/* I/O need to be drained during transfer transition */

Is that comment still correct or should it perhaps be updated?

Thanks,

Bart.

Jaegeuk Kim Nov. 25, 2019, 5:59 p.m. UTC | #6

On 11/19, Bart Van Assche wrote:
> On 5/17/19 5:53 PM, Jaegeuk Kim wrote:
> > This patch tries to avoid EAGAIN due to nrpages!=0 that was originally trying
> > to drop stale pages resulting in wrong data access.
> > 
> > Report: https://bugs.chromium.org/p/chromium/issues/detail?id=938958#c38
> 
> Please provide a more detailed commit description. What is wrong with the
> current implementation and why is the new behavior considered the correct
> behavior?

Some history would be:

Original bug fix is:
commit 5db470e229e2 ("loop: drop caches if offset or block_size are changed"),
which returns EAGAIN so that user land like Chrome would require enhancing their
error handling routines.

So, this patch tries to avoid EAGAIN while addressing the original bug.

> 
> This patch moves draining code from before the following comment to after
> that comment:
> 
> /* I/O need to be drained during transfer transition */
> 
> Is that comment still correct or should it perhaps be updated?

IMHO, it's still valid.

> 
> Thanks,
> 

> Bart.

Bart Van Assche Nov. 25, 2019, 6:35 p.m. UTC | #7

On 11/25/19 9:59 AM, Jaegeuk Kim wrote:
> On 11/19, Bart Van Assche wrote:
>> On 5/17/19 5:53 PM, Jaegeuk Kim wrote:
>>> This patch tries to avoid EAGAIN due to nrpages!=0 that was originally trying
>>> to drop stale pages resulting in wrong data access.
>>>
>>> Report: https://bugs.chromium.org/p/chromium/issues/detail?id=938958#c38
>>
>> Please provide a more detailed commit description. What is wrong with the
>> current implementation and why is the new behavior considered the correct
>> behavior?
> 
> Some history would be:
> 
> Original bug fix is:
> commit 5db470e229e2 ("loop: drop caches if offset or block_size are changed"),
> which returns EAGAIN so that user land like Chrome would require enhancing their
> error handling routines.
> 
> So, this patch tries to avoid EAGAIN while addressing the original bug.
> 
>>
>> This patch moves draining code from before the following comment to after
>> that comment:
>>
>> /* I/O need to be drained during transfer transition */
>>
>> Is that comment still correct or should it perhaps be updated?
> 
> IMHO, it's still valid.

Hi Jaegeuk,

Thank you for the additional and very helpful clarification. Can you have a look at the (totally untested) patch below? I prefer that version because it prevents concurrent processing of requests and syncing/killing the bdev.

Thanks,

Bart.


Subject: [PATCH] loop: Avoid EAGAIN if offset or block_size are changed

After sync_blockdev() and kill_bdev() have been called, more requests
can be submitted to the loop device. These requests dirty additional
pages, causing loop_set_status() to return -EAGAIN. Not all user space
code that changes the offset and/or the block size handles -EAGAIN
correctly. Hence make sure that loop_set_status() does not return
-EAGAIN.

Fixes: 5db470e229e2 ("loop: drop caches if offset or block_size are changed")
Reported-by: Gwendal Grignou <gwendal@chromium.org>
Reported-by: grygorii tertychnyi <gtertych@cisco.com>
Reported-by: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
  drivers/block/loop.c | 35 +++++++----------------------------
  1 file changed, 7 insertions(+), 28 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 739b372a5112..48cfc8b9c247 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1264,15 +1264,15 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
  		goto out_unlock;
  	}

+	/* I/O need to be drained during transfer transition */
+	blk_mq_freeze_queue(lo->lo_queue);
+
  	if (lo->lo_offset != info->lo_offset ||
  	    lo->lo_sizelimit != info->lo_sizelimit) {
  		sync_blockdev(lo->lo_device);
  		kill_bdev(lo->lo_device);
  	}

-	/* I/O need to be drained during transfer transition */
-	blk_mq_freeze_queue(lo->lo_queue);
-
  	err = loop_release_xfer(lo);
  	if (err)
  		goto out_unfreeze;
@@ -1298,14 +1298,6 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)

  	if (lo->lo_offset != info->lo_offset ||
  	    lo->lo_sizelimit != info->lo_sizelimit) {
-		/* kill_bdev should have truncated all the pages */
-		if (lo->lo_device->bd_inode->i_mapping->nrpages) {
-			err = -EAGAIN;
-			pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n",
-				__func__, lo->lo_number, lo->lo_file_name,
-				lo->lo_device->bd_inode->i_mapping->nrpages);
-			goto out_unfreeze;
-		}
  		if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit)) {
  			err = -EFBIG;
  			goto out_unfreeze;
@@ -1531,39 +1523,26 @@ static int loop_set_dio(struct loop_device *lo, unsigned long arg)

  static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
  {
-	int err = 0;
-
  	if (lo->lo_state != Lo_bound)
  		return -ENXIO;

  	if (arg < 512 || arg > PAGE_SIZE || !is_power_of_2(arg))
  		return -EINVAL;

+	blk_mq_freeze_queue(lo->lo_queue);
+
  	if (lo->lo_queue->limits.logical_block_size != arg) {
  		sync_blockdev(lo->lo_device);
  		kill_bdev(lo->lo_device);
  	}
-
-	blk_mq_freeze_queue(lo->lo_queue);
-
-	/* kill_bdev should have truncated all the pages */
-	if (lo->lo_queue->limits.logical_block_size != arg &&
-			lo->lo_device->bd_inode->i_mapping->nrpages) {
-		err = -EAGAIN;
-		pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n",
-			__func__, lo->lo_number, lo->lo_file_name,
-			lo->lo_device->bd_inode->i_mapping->nrpages);
-		goto out_unfreeze;
-	}
-
  	blk_queue_logical_block_size(lo->lo_queue, arg);
  	blk_queue_physical_block_size(lo->lo_queue, arg);
  	blk_queue_io_min(lo->lo_queue, arg);
  	loop_update_dio(lo);
-out_unfreeze:
+
  	blk_mq_unfreeze_queue(lo->lo_queue);

-	return err;
+	return 0;
  }

  static int lo_simple_ioctl(struct loop_device *lo, unsigned int cmd,

Jaegeuk Kim Nov. 25, 2019, 7:22 p.m. UTC | #8

Hi Bart,

On 11/25, Bart Van Assche wrote:
> On 11/25/19 9:59 AM, Jaegeuk Kim wrote:
> > On 11/19, Bart Van Assche wrote:
> > > On 5/17/19 5:53 PM, Jaegeuk Kim wrote:
> > > > This patch tries to avoid EAGAIN due to nrpages!=0 that was originally trying
> > > > to drop stale pages resulting in wrong data access.
> > > > 
> > > > Report: https://bugs.chromium.org/p/chromium/issues/detail?id=938958#c38
> > > 
> > > Please provide a more detailed commit description. What is wrong with the
> > > current implementation and why is the new behavior considered the correct
> > > behavior?
> > 
> > Some history would be:
> > 
> > Original bug fix is:
> > commit 5db470e229e2 ("loop: drop caches if offset or block_size are changed"),
> > which returns EAGAIN so that user land like Chrome would require enhancing their
> > error handling routines.
> > 
> > So, this patch tries to avoid EAGAIN while addressing the original bug.
> > 
> > > 
> > > This patch moves draining code from before the following comment to after
> > > that comment:
> > > 
> > > /* I/O need to be drained during transfer transition */
> > > 
> > > Is that comment still correct or should it perhaps be updated?
> > 
> > IMHO, it's still valid.
> 
> Hi Jaegeuk,
> 
> Thank you for the additional and very helpful clarification. Can you have a look at the (totally untested) patch below? I prefer that version because it prevents concurrent processing of requests and syncing/killing the bdev.

Yeah, I thought this was much cleaner way, but wasn't sure it could be doable
to sync|kill block device after freezing the queue. Is it okay?

> 
> Thanks,
> 
> Bart.
> 
> 
> Subject: [PATCH] loop: Avoid EAGAIN if offset or block_size are changed
> 
> After sync_blockdev() and kill_bdev() have been called, more requests
> can be submitted to the loop device. These requests dirty additional
> pages, causing loop_set_status() to return -EAGAIN. Not all user space
> code that changes the offset and/or the block size handles -EAGAIN
> correctly. Hence make sure that loop_set_status() does not return
> -EAGAIN.
> 
> Fixes: 5db470e229e2 ("loop: drop caches if offset or block_size are changed")
> Reported-by: Gwendal Grignou <gwendal@chromium.org>
> Reported-by: grygorii tertychnyi <gtertych@cisco.com>
> Reported-by: Jaegeuk Kim <jaegeuk@kernel.org>
> Cc: <stable@vger.kernel.org>
> Signed-off-by: Bart Van Assche <bvanassche@acm.org>
> ---
>  drivers/block/loop.c | 35 +++++++----------------------------
>  1 file changed, 7 insertions(+), 28 deletions(-)
> 
> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index 739b372a5112..48cfc8b9c247 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -1264,15 +1264,15 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
>  		goto out_unlock;
>  	}
> 
> +	/* I/O need to be drained during transfer transition */
> +	blk_mq_freeze_queue(lo->lo_queue);
> +
>  	if (lo->lo_offset != info->lo_offset ||
>  	    lo->lo_sizelimit != info->lo_sizelimit) {
>  		sync_blockdev(lo->lo_device);
>  		kill_bdev(lo->lo_device);
>  	}
> 
> -	/* I/O need to be drained during transfer transition */
> -	blk_mq_freeze_queue(lo->lo_queue);
> -
>  	err = loop_release_xfer(lo);
>  	if (err)
>  		goto out_unfreeze;
> @@ -1298,14 +1298,6 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
> 
>  	if (lo->lo_offset != info->lo_offset ||
>  	    lo->lo_sizelimit != info->lo_sizelimit) {
> -		/* kill_bdev should have truncated all the pages */
> -		if (lo->lo_device->bd_inode->i_mapping->nrpages) {
> -			err = -EAGAIN;
> -			pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n",
> -				__func__, lo->lo_number, lo->lo_file_name,
> -				lo->lo_device->bd_inode->i_mapping->nrpages);
> -			goto out_unfreeze;
> -		}
>  		if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit)) {
>  			err = -EFBIG;
>  			goto out_unfreeze;
> @@ -1531,39 +1523,26 @@ static int loop_set_dio(struct loop_device *lo, unsigned long arg)
> 
>  static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
>  {
> -	int err = 0;
> -
>  	if (lo->lo_state != Lo_bound)
>  		return -ENXIO;
> 
>  	if (arg < 512 || arg > PAGE_SIZE || !is_power_of_2(arg))
>  		return -EINVAL;
> 
> +	blk_mq_freeze_queue(lo->lo_queue);
> +
>  	if (lo->lo_queue->limits.logical_block_size != arg) {
>  		sync_blockdev(lo->lo_device);
>  		kill_bdev(lo->lo_device);
>  	}
> -
> -	blk_mq_freeze_queue(lo->lo_queue);
> -
> -	/* kill_bdev should have truncated all the pages */
> -	if (lo->lo_queue->limits.logical_block_size != arg &&
> -			lo->lo_device->bd_inode->i_mapping->nrpages) {
> -		err = -EAGAIN;
> -		pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n",
> -			__func__, lo->lo_number, lo->lo_file_name,
> -			lo->lo_device->bd_inode->i_mapping->nrpages);
> -		goto out_unfreeze;
> -	}
> -
>  	blk_queue_logical_block_size(lo->lo_queue, arg);
>  	blk_queue_physical_block_size(lo->lo_queue, arg);
>  	blk_queue_io_min(lo->lo_queue, arg);
>  	loop_update_dio(lo);
> -out_unfreeze:
> +
>  	blk_mq_unfreeze_queue(lo->lo_queue);
> 
> -	return err;
> +	return 0;
>  }
> 
>  static int lo_simple_ioctl(struct loop_device *lo, unsigned int cmd,

Bart Van Assche Nov. 25, 2019, 7:41 p.m. UTC | #9

On 11/25/19 11:22 AM, Jaegeuk Kim wrote:
> On 11/25, Bart Van Assche wrote:
>> Thank you for the additional and very helpful clarification. Can you have a look at the (totally untested) patch below? I prefer that version because it prevents concurrent processing of requests and syncing/killing the bdev.
> 
> Yeah, I thought this was much cleaner way, but wasn't sure it could be doable
> to sync|kill block device after freezing the queue. Is it okay?

Hi Jaegeuk,

That patch was based on an incorrect interpretation of the meaning of 
lo_device. After having taken another loop at the block driver, I don't 
think that calling sync after freezing the queue is OK. How about using 
the following call sequence:
* sync_blockdev()
* blk_mq_freeze_queue()
* kill_bdev()

Thanks,

Bart.

Bart Van Assche Nov. 25, 2019, 10:27 p.m. UTC | #10

On 11/25/19 11:41 AM, Bart Van Assche wrote:
> On 11/25/19 11:22 AM, Jaegeuk Kim wrote:
>> On 11/25, Bart Van Assche wrote:
>>> Thank you for the additional and very helpful clarification. Can you 
>>> have a look at the (totally untested) patch below? I prefer that 
>>> version because it prevents concurrent processing of requests and 
>>> syncing/killing the bdev.
>>
>> Yeah, I thought this was much cleaner way, but wasn't sure it could be 
>> doable
>> to sync|kill block device after freezing the queue. Is it okay?
> 
> Hi Jaegeuk,
> 
> That patch was based on an incorrect interpretation of the meaning of 
> lo_device. After having taken another loop at the block driver, I don't 
> think that calling sync after freezing the queue is OK. How about using 
> the following call sequence:
> * sync_blockdev()
> * blk_mq_freeze_queue()
> * kill_bdev()

This is what I had in mind (still untested):


Subject: [PATCH] loop: Avoid EAGAIN if the offset or the block_size are changed

After sync_blockdev() has been called, more requests can be submitted
to the loop device. These requests dirty additional pages, causing
loop_set_status() to return -EAGAIN. Not all user space code that
changes the offset and/or the block size handles -EAGAIN correctly.
Hence make sure that loop_set_status() does not return -EAGAIN.

Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Gwendal Grignou <gwendal@chromium.org>
Cc: grygorii tertychnyi <gtertych@cisco.com>
Cc: Andrew Norrie <andrew.norrie@cgg.com>
Cc: <stable@vger.kernel.org>
Fixes: 5db470e229e2 ("loop: drop caches if offset or block_size are changed")
Reported-by: Gwendal Grignou <gwendal@chromium.org>
Reported-by: grygorii tertychnyi <gtertych@cisco.com>
Reported-by: Jaegeuk Kim <jaegeuk@kernel.org>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
  drivers/block/loop.c | 56 ++++++++++++++++++--------------------------
  1 file changed, 23 insertions(+), 33 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 739b372a5112..84bdb3a6f6d0 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1264,14 +1264,17 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
  		goto out_unlock;
  	}

-	if (lo->lo_offset != info->lo_offset ||
-	    lo->lo_sizelimit != info->lo_sizelimit) {
-		sync_blockdev(lo->lo_device);
-		kill_bdev(lo->lo_device);
-	}
+	/*
+	 * Drain the page cache and the request queue. Set the "dying" flag to
+	 * prevent that kill_bdev() locks up.
+	 */
+	sync_blockdev(lo->lo_device);

-	/* I/O need to be drained during transfer transition */
-	blk_mq_freeze_queue(lo->lo_queue);
+	blk_set_queue_dying(lo->lo_queue);
+	blk_mq_freeze_queue_wait(lo->lo_queue);
+
+	/* Kill buffers that got dirtied after the sync_blockdev() call. */
+	kill_bdev(lo->lo_device);

  	err = loop_release_xfer(lo);
  	if (err)
@@ -1298,14 +1301,6 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)

  	if (lo->lo_offset != info->lo_offset ||
  	    lo->lo_sizelimit != info->lo_sizelimit) {
-		/* kill_bdev should have truncated all the pages */
-		if (lo->lo_device->bd_inode->i_mapping->nrpages) {
-			err = -EAGAIN;
-			pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n",
-				__func__, lo->lo_number, lo->lo_file_name,
-				lo->lo_device->bd_inode->i_mapping->nrpages);
-			goto out_unfreeze;
-		}
  		if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit)) {
  			err = -EFBIG;
  			goto out_unfreeze;
@@ -1341,6 +1336,7 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
  	__loop_update_dio(lo, lo->use_dio);

  out_unfreeze:
+	blk_queue_flag_clear(QUEUE_FLAG_DYING, lo->lo_queue);
  	blk_mq_unfreeze_queue(lo->lo_queue);

  	if (!err && (info->lo_flags & LO_FLAGS_PARTSCAN) &&
@@ -1531,39 +1527,33 @@ static int loop_set_dio(struct loop_device *lo, unsigned long arg)

  static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
  {
-	int err = 0;
-
  	if (lo->lo_state != Lo_bound)
  		return -ENXIO;

  	if (arg < 512 || arg > PAGE_SIZE || !is_power_of_2(arg))
  		return -EINVAL;

-	if (lo->lo_queue->limits.logical_block_size != arg) {
-		sync_blockdev(lo->lo_device);
-		kill_bdev(lo->lo_device);
-	}
+	/*
+	 * Drain the page cache and the request queue. Set the "dying" flag to
+	 * prevent that kill_bdev() locks up.
+	 */
+	sync_blockdev(lo->lo_device);

-	blk_mq_freeze_queue(lo->lo_queue);
+	blk_set_queue_dying(lo->lo_queue);
+	blk_mq_freeze_queue_wait(lo->lo_queue);

-	/* kill_bdev should have truncated all the pages */
-	if (lo->lo_queue->limits.logical_block_size != arg &&
-			lo->lo_device->bd_inode->i_mapping->nrpages) {
-		err = -EAGAIN;
-		pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n",
-			__func__, lo->lo_number, lo->lo_file_name,
-			lo->lo_device->bd_inode->i_mapping->nrpages);
-		goto out_unfreeze;
-	}
+	/* Kill buffers that got dirtied after the sync_blockdev() call. */
+	kill_bdev(lo->lo_device);

  	blk_queue_logical_block_size(lo->lo_queue, arg);
  	blk_queue_physical_block_size(lo->lo_queue, arg);
  	blk_queue_io_min(lo->lo_queue, arg);
  	loop_update_dio(lo);
-out_unfreeze:
+
+	blk_queue_flag_clear(QUEUE_FLAG_DYING, lo->lo_queue);
  	blk_mq_unfreeze_queue(lo->lo_queue);

-	return err;
+	return 0;
  }

  static int lo_simple_ioctl(struct loop_device *lo, unsigned int cmd,

Jaegeuk Kim Nov. 26, 2019, 6:29 p.m. UTC | #11

On 11/25, Bart Van Assche wrote:
> On 11/25/19 11:41 AM, Bart Van Assche wrote:
> > On 11/25/19 11:22 AM, Jaegeuk Kim wrote:
> > > On 11/25, Bart Van Assche wrote:
> > > > Thank you for the additional and very helpful clarification. Can
> > > > you have a look at the (totally untested) patch below? I prefer
> > > > that version because it prevents concurrent processing of
> > > > requests and syncing/killing the bdev.
> > > 
> > > Yeah, I thought this was much cleaner way, but wasn't sure it could
> > > be doable
> > > to sync|kill block device after freezing the queue. Is it okay?
> > 
> > Hi Jaegeuk,
> > 
> > That patch was based on an incorrect interpretation of the meaning of
> > lo_device. After having taken another loop at the block driver, I don't
> > think that calling sync after freezing the queue is OK. How about using
> > the following call sequence:
> > * sync_blockdev()
> > * blk_mq_freeze_queue()
> > * kill_bdev()
> 
> This is what I had in mind (still untested):
> 
> 
> Subject: [PATCH] loop: Avoid EAGAIN if the offset or the block_size are changed
> 
> After sync_blockdev() has been called, more requests can be submitted
> to the loop device. These requests dirty additional pages, causing
> loop_set_status() to return -EAGAIN. Not all user space code that
> changes the offset and/or the block size handles -EAGAIN correctly.
> Hence make sure that loop_set_status() does not return -EAGAIN.
> 
> Cc: Jaegeuk Kim <jaegeuk@kernel.org>
> Cc: Gwendal Grignou <gwendal@chromium.org>
> Cc: grygorii tertychnyi <gtertych@cisco.com>
> Cc: Andrew Norrie <andrew.norrie@cgg.com>
> Cc: <stable@vger.kernel.org>
> Fixes: 5db470e229e2 ("loop: drop caches if offset or block_size are changed")
> Reported-by: Gwendal Grignou <gwendal@chromium.org>
> Reported-by: grygorii tertychnyi <gtertych@cisco.com>
> Reported-by: Jaegeuk Kim <jaegeuk@kernel.org>
> Signed-off-by: Bart Van Assche <bvanassche@acm.org>
> ---
>  drivers/block/loop.c | 56 ++++++++++++++++++--------------------------
>  1 file changed, 23 insertions(+), 33 deletions(-)
> 
> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index 739b372a5112..84bdb3a6f6d0 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -1264,14 +1264,17 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
>  		goto out_unlock;
>  	}
> 
> -	if (lo->lo_offset != info->lo_offset ||
> -	    lo->lo_sizelimit != info->lo_sizelimit) {
> -		sync_blockdev(lo->lo_device);
> -		kill_bdev(lo->lo_device);
> -	}
> +	/*
> +	 * Drain the page cache and the request queue. Set the "dying" flag to
> +	 * prevent that kill_bdev() locks up.
> +	 */
> +	sync_blockdev(lo->lo_device);
> 
> -	/* I/O need to be drained during transfer transition */
> -	blk_mq_freeze_queue(lo->lo_queue);
> +	blk_set_queue_dying(lo->lo_queue);
> +	blk_mq_freeze_queue_wait(lo->lo_queue);
> +
> +	/* Kill buffers that got dirtied after the sync_blockdev() call. */

Any race condition where we can truncate any dirty pages written between
sync_blockdev() and kill_bdev()?

Thanks,

> +	kill_bdev(lo->lo_device);
> 
>  	err = loop_release_xfer(lo);
>  	if (err)
> @@ -1298,14 +1301,6 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
> 
>  	if (lo->lo_offset != info->lo_offset ||
>  	    lo->lo_sizelimit != info->lo_sizelimit) {
> -		/* kill_bdev should have truncated all the pages */
> -		if (lo->lo_device->bd_inode->i_mapping->nrpages) {
> -			err = -EAGAIN;
> -			pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n",
> -				__func__, lo->lo_number, lo->lo_file_name,
> -				lo->lo_device->bd_inode->i_mapping->nrpages);
> -			goto out_unfreeze;
> -		}
>  		if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit)) {
>  			err = -EFBIG;
>  			goto out_unfreeze;
> @@ -1341,6 +1336,7 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
>  	__loop_update_dio(lo, lo->use_dio);
> 
>  out_unfreeze:
> +	blk_queue_flag_clear(QUEUE_FLAG_DYING, lo->lo_queue);
>  	blk_mq_unfreeze_queue(lo->lo_queue);
> 
>  	if (!err && (info->lo_flags & LO_FLAGS_PARTSCAN) &&
> @@ -1531,39 +1527,33 @@ static int loop_set_dio(struct loop_device *lo, unsigned long arg)
> 
>  static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
>  {
> -	int err = 0;
> -
>  	if (lo->lo_state != Lo_bound)
>  		return -ENXIO;
> 
>  	if (arg < 512 || arg > PAGE_SIZE || !is_power_of_2(arg))
>  		return -EINVAL;
> 
> -	if (lo->lo_queue->limits.logical_block_size != arg) {
> -		sync_blockdev(lo->lo_device);
> -		kill_bdev(lo->lo_device);
> -	}
> +	/*
> +	 * Drain the page cache and the request queue. Set the "dying" flag to
> +	 * prevent that kill_bdev() locks up.
> +	 */
> +	sync_blockdev(lo->lo_device);
> 
> -	blk_mq_freeze_queue(lo->lo_queue);
> +	blk_set_queue_dying(lo->lo_queue);
> +	blk_mq_freeze_queue_wait(lo->lo_queue);
> 
> -	/* kill_bdev should have truncated all the pages */
> -	if (lo->lo_queue->limits.logical_block_size != arg &&
> -			lo->lo_device->bd_inode->i_mapping->nrpages) {
> -		err = -EAGAIN;
> -		pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n",
> -			__func__, lo->lo_number, lo->lo_file_name,
> -			lo->lo_device->bd_inode->i_mapping->nrpages);
> -		goto out_unfreeze;
> -	}
> +	/* Kill buffers that got dirtied after the sync_blockdev() call. */
> +	kill_bdev(lo->lo_device);
> 
>  	blk_queue_logical_block_size(lo->lo_queue, arg);
>  	blk_queue_physical_block_size(lo->lo_queue, arg);
>  	blk_queue_io_min(lo->lo_queue, arg);
>  	loop_update_dio(lo);
> -out_unfreeze:
> +
> +	blk_queue_flag_clear(QUEUE_FLAG_DYING, lo->lo_queue);
>  	blk_mq_unfreeze_queue(lo->lo_queue);
> 
> -	return err;
> +	return 0;
>  }
> 
>  static int lo_simple_ioctl(struct loop_device *lo, unsigned int cmd,

Bart Van Assche Nov. 26, 2019, 6:59 p.m. UTC | #12

On 11/26/19 10:29 AM, Jaegeuk Kim wrote:
> On 11/25, Bart Van Assche wrote:
>> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
>> index 739b372a5112..84bdb3a6f6d0 100644
>> --- a/drivers/block/loop.c
>> +++ b/drivers/block/loop.c
>> @@ -1264,14 +1264,17 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
>>   		goto out_unlock;
>>   	}
>>
>> -	if (lo->lo_offset != info->lo_offset ||
>> -	    lo->lo_sizelimit != info->lo_sizelimit) {
>> -		sync_blockdev(lo->lo_device);
>> -		kill_bdev(lo->lo_device);
>> -	}
>> +	/*
>> +	 * Drain the page cache and the request queue. Set the "dying" flag to
>> +	 * prevent that kill_bdev() locks up.
>> +	 */
>> +	sync_blockdev(lo->lo_device);
>>
>> -	/* I/O need to be drained during transfer transition */
>> -	blk_mq_freeze_queue(lo->lo_queue);
>> +	blk_set_queue_dying(lo->lo_queue);
>> +	blk_mq_freeze_queue_wait(lo->lo_queue);
>> +
>> +	/* Kill buffers that got dirtied after the sync_blockdev() call. */
> 
> Any race condition where we can truncate any dirty pages written between
> sync_blockdev() and kill_bdev()?
> 
> Thanks,
> 
>> +	kill_bdev(lo->lo_device);

Hi Jaegeuk,

As you know sync_blockdev() triggers a call to filemap_write_and_wait(). 
That function in turn calls mapping->a_ops->writepages() with sync_mode 
== WB_SYNC_ALL. I think that causes sync_blockdev() to wait until all 
dirty pages have been written so we don't have to worry about pages 
dirtied before the sync_blockdev() call started.

Should we try to handle read and write requests that are submitted to a 
loop device while the loop device block size, offset and/or size are 
being modified or is it OK to fail such requests? The 
blk_set_queue_dying() and blk_mq_freeze_queue_wait() calls set the DYING 
queue flag and also wait for all ongoing block layer requests submitted 
to the loop device to finish. All later submit_bio(), 
generic_make_request(), direct_make_request() and blk_get_request() 
calls will fail with BLK_STS_IOERR or -ENODEV, including those triggered 
by kill_bdev(). In other words, the above patch causes I/O to fail that 
is submitted concurrently with kill_bdev(). Do you agree with failing 
I/O requests submitted to a loop device while the loop device block 
size, offset and/or size are being modified?

Thanks,

Bart.

Jaegeuk Kim Nov. 26, 2019, 10:32 p.m. UTC | #13

On 11/26, Bart Van Assche wrote:
> On 11/26/19 10:29 AM, Jaegeuk Kim wrote:
> > On 11/25, Bart Van Assche wrote:
> > > diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> > > index 739b372a5112..84bdb3a6f6d0 100644
> > > --- a/drivers/block/loop.c
> > > +++ b/drivers/block/loop.c
> > > @@ -1264,14 +1264,17 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
> > >   		goto out_unlock;
> > >   	}
> > > 
> > > -	if (lo->lo_offset != info->lo_offset ||
> > > -	    lo->lo_sizelimit != info->lo_sizelimit) {
> > > -		sync_blockdev(lo->lo_device);
> > > -		kill_bdev(lo->lo_device);
> > > -	}
> > > +	/*
> > > +	 * Drain the page cache and the request queue. Set the "dying" flag to
> > > +	 * prevent that kill_bdev() locks up.
> > > +	 */
> > > +	sync_blockdev(lo->lo_device);
> > > 
> > > -	/* I/O need to be drained during transfer transition */
> > > -	blk_mq_freeze_queue(lo->lo_queue);
> > > +	blk_set_queue_dying(lo->lo_queue);
> > > +	blk_mq_freeze_queue_wait(lo->lo_queue);
> > > +
> > > +	/* Kill buffers that got dirtied after the sync_blockdev() call. */
> > 
> > Any race condition where we can truncate any dirty pages written between
> > sync_blockdev() and kill_bdev()?
> > 
> > Thanks,
> > 
> > > +	kill_bdev(lo->lo_device);
> 
> Hi Jaegeuk,
> 
> As you know sync_blockdev() triggers a call to filemap_write_and_wait().
> That function in turn calls mapping->a_ops->writepages() with sync_mode ==
> WB_SYNC_ALL. I think that causes sync_blockdev() to wait until all dirty
> pages have been written so we don't have to worry about pages dirtied before
> the sync_blockdev() call started.

My concern is on some dirty pages after sync_blockdev(), since
truncate_inode_pages() in kill_bdev() will cancel dirty pages.

> 
> Should we try to handle read and write requests that are submitted to a loop
> device while the loop device block size, offset and/or size are being
> modified or is it OK to fail such requests? The blk_set_queue_dying() and
> blk_mq_freeze_queue_wait() calls set the DYING queue flag and also wait for
> all ongoing block layer requests submitted to the loop device to finish. All
> later submit_bio(), generic_make_request(), direct_make_request() and
> blk_get_request() calls will fail with BLK_STS_IOERR or -ENODEV, including
> those triggered by kill_bdev(). In other words, the above patch causes I/O
> to fail that is submitted concurrently with kill_bdev(). Do you agree with
> failing I/O requests submitted to a loop device while the loop device block
> size, offset and/or size are being modified?

Actually, the previous patch returned EAGAIN, if there was an interim IO which
produced some dirty pages betwen kill_bdev() and blk_mq_freeze_queue_wait().
IMHO, we can't do blk_set_queue_dying() which gives EIO to those IOs.

Looking at the flow again, how about this?
---
 drivers/block/loop.c | 67 ++++++++++++++++++++++----------------------
 fs/block_dev.c       | 12 ++++++++
 include/linux/fs.h   |  3 ++
 3 files changed, 48 insertions(+), 34 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index f6f77eaa7217..0f3eca774655 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1224,6 +1224,23 @@ static int loop_clr_fd(struct loop_device *lo)
 	return __loop_clr_fd(lo, false);
 }
 
+static void
+loop_mq_freeze_queue(struct loop_device *lo)
+{
+	for (;;) {
+		sync_blockdev(lo->lo_device);
+
+		/* I/O need to be drained during transfer transition */
+		blk_mq_freeze_queue(lo->lo_queue);
+
+		if (is_dirty_bdev(lo->lo_device))
+			blk_mq_unfreeze_queue(lo->lo_queue);
+		else
+			break;
+	}
+	kill_bdev(lo->lo_device);
+}
+
 static int
 loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 {
@@ -1232,6 +1249,7 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 	kuid_t uid = current_uid();
 	struct block_device *bdev;
 	bool partscan = false;
+	bool drop_cache;
 
 	err = mutex_lock_killable(&loop_ctl_mutex);
 	if (err)
@@ -1251,14 +1269,14 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 		goto out_unlock;
 	}
 
-	if (lo->lo_offset != info->lo_offset ||
-	    lo->lo_sizelimit != info->lo_sizelimit) {
-		sync_blockdev(lo->lo_device);
-		kill_bdev(lo->lo_device);
-	}
+	drop_cache = (lo->lo_offset != info->lo_offset ||
+	    lo->lo_sizelimit != info->lo_sizelimit);
 
 	/* I/O need to be drained during transfer transition */
-	blk_mq_freeze_queue(lo->lo_queue);
+	if (drop_cache)
+		loop_mq_freeze_queue(lo);
+	else
+		blk_mq_freeze_queue(lo->lo_queue);
 
 	err = loop_release_xfer(lo);
 	if (err)
@@ -1283,16 +1301,7 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 	if (err)
 		goto out_unfreeze;
 
-	if (lo->lo_offset != info->lo_offset ||
-	    lo->lo_sizelimit != info->lo_sizelimit) {
-		/* kill_bdev should have truncated all the pages */
-		if (lo->lo_device->bd_inode->i_mapping->nrpages) {
-			err = -EAGAIN;
-			pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n",
-				__func__, lo->lo_number, lo->lo_file_name,
-				lo->lo_device->bd_inode->i_mapping->nrpages);
-			goto out_unfreeze;
-		}
+	if (drop_cache) {
 		if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit)) {
 			err = -EFBIG;
 			goto out_unfreeze;
@@ -1518,7 +1527,7 @@ static int loop_set_dio(struct loop_device *lo, unsigned long arg)
 
 static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
 {
-	int err = 0;
+	bool drop_cache;
 
 	if (lo->lo_state != Lo_bound)
 		return -ENXIO;
@@ -1526,31 +1535,21 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
 	if (arg < 512 || arg > PAGE_SIZE || !is_power_of_2(arg))
 		return -EINVAL;
 
-	if (lo->lo_queue->limits.logical_block_size != arg) {
-		sync_blockdev(lo->lo_device);
-		kill_bdev(lo->lo_device);
-	}
-
-	blk_mq_freeze_queue(lo->lo_queue);
+	drop_cache = (lo->lo_queue->limits.logical_block_size != arg);
 
-	/* kill_bdev should have truncated all the pages */
-	if (lo->lo_queue->limits.logical_block_size != arg &&
-			lo->lo_device->bd_inode->i_mapping->nrpages) {
-		err = -EAGAIN;
-		pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n",
-			__func__, lo->lo_number, lo->lo_file_name,
-			lo->lo_device->bd_inode->i_mapping->nrpages);
-		goto out_unfreeze;
-	}
+	/* I/O need to be drained during transfer transition */
+	if (drop_cache)
+		loop_mq_freeze_queue(lo);
+	else
+		blk_mq_freeze_queue(lo->lo_queue);
 
 	blk_queue_logical_block_size(lo->lo_queue, arg);
 	blk_queue_physical_block_size(lo->lo_queue, arg);
 	blk_queue_io_min(lo->lo_queue, arg);
 	loop_update_dio(lo);
-out_unfreeze:
 	blk_mq_unfreeze_queue(lo->lo_queue);
 
-	return err;
+	return 0;
 }
 
 static int lo_simple_ioctl(struct loop_device *lo, unsigned int cmd,
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 9c073dbdc1b0..81fe3beef92b 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -74,6 +74,18 @@ static void bdev_write_inode(struct block_device *bdev)
 	spin_unlock(&inode->i_lock);
 }
 
+bool is_dirty_bdev(struct block_device *bdev)
+{
+	struct inode *inode = bdev->bd_inode;
+	bool dirty;
+
+	spin_lock(&inode->i_lock);
+	dirty = inode->i_state & I_DIRTY ? true: false;
+	spin_unlock(&inode->i_lock);
+	return dirty;
+}
+EXPORT_SYMBOL(is_dirty_bdev);
+
 /* Kill _all_ buffers and pagecache , dirty or not.. */
 void kill_bdev(struct block_device *bdev)
 {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e0d909d35763..02156ad96be2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2570,6 +2570,7 @@ extern void bdput(struct block_device *);
 extern void invalidate_bdev(struct block_device *);
 extern void iterate_bdevs(void (*)(struct block_device *, void *), void *);
 extern int sync_blockdev(struct block_device *bdev);
+extern bool is_dirty_bdev(struct block_device *bdev);
 extern void kill_bdev(struct block_device *);
 extern struct super_block *freeze_bdev(struct block_device *);
 extern void emergency_thaw_all(void);
@@ -2583,9 +2584,11 @@ static inline bool sb_is_blkdev_sb(struct super_block *sb)
 {
 	return sb == blockdev_superblock;
 }
+
 #else
 static inline void bd_forget(struct inode *inode) {}
 static inline int sync_blockdev(struct block_device *bdev) { return 0; }
+static inline bool is_dirty_bdev(struct block_device *bdev) { return false; }
 static inline void kill_bdev(struct block_device *bdev) {}
 static inline void invalidate_bdev(struct block_device *bdev) {}

Bart Van Assche Nov. 26, 2019, 10:54 p.m. UTC | #14

On 11/26/19 2:32 PM, Jaegeuk Kim wrote:
> +static void
> +loop_mq_freeze_queue(struct loop_device *lo)
> +{
> +	for (;;) {
> +		sync_blockdev(lo->lo_device);
> +
> +		/* I/O need to be drained during transfer transition */
> +		blk_mq_freeze_queue(lo->lo_queue);
> +
> +		if (is_dirty_bdev(lo->lo_device))
> +			blk_mq_unfreeze_queue(lo->lo_queue);
> +		else
> +			break;
> +	}
> +	kill_bdev(lo->lo_device);
> +}

Maybe I overlooked something, but how does the above code prevent that 
pages get dirtied after is_dirty_bdev() returns false? 
blk_mq_freeze_queue() freezes lo->lo_queue but not the filesystem that 
uses that request queue. Did you perhaps want to call a function that 
freezes filesystem activity from inside the loop (not sure if such a 
function already exists)?

[ ... ]
>   	/* I/O need to be drained during transfer transition */
> -	blk_mq_freeze_queue(lo->lo_queue);
> +	if (drop_cache)
> +		loop_mq_freeze_queue(lo);
> +	else
> +		blk_mq_freeze_queue(lo->lo_queue);

How about always calling loop_mq_freeze_queue(), independent of the 
value of 'drop_cache'? I think that will make the code both easier to 
read and easier to test.

> +	/* I/O need to be drained during transfer transition */
> +	if (drop_cache)
> +		loop_mq_freeze_queue(lo);
> +	else
> +		blk_mq_freeze_queue(lo->lo_queue);

Same comment here.

> diff --git a/fs/block_dev.c b/fs/block_dev.c
> index 9c073dbdc1b0..81fe3beef92b 100644
> --- a/fs/block_dev.c
> +++ b/fs/block_dev.c
> @@ -74,6 +74,18 @@ static void bdev_write_inode(struct block_device *bdev)
>   	spin_unlock(&inode->i_lock);
>   }
>   
> +bool is_dirty_bdev(struct block_device *bdev)
> +{
> +	struct inode *inode = bdev->bd_inode;
> +	bool dirty;
> +
> +	spin_lock(&inode->i_lock);
> +	dirty = inode->i_state & I_DIRTY ? true: false;
> +	spin_unlock(&inode->i_lock);
> +	return dirty;
> +}

Since the Linux kernel 'bool' datatype conforms to C99, I think that the 
" ? true : false" part can be left out. From include/linux/types.h:

typedef _Bool			bool;

 From http://www.open-std.org/jtc1/sc22/WG14/www/docs/n1256.pdf: "When 
any scalar value is converted to _Bool, the result is 0 if the value 
compares equal to 0; otherwise, the result is 1."

Thanks,

Bart.

Jaegeuk Kim Nov. 27, 2019, 12:04 a.m. UTC | #15

On 11/26, Bart Van Assche wrote:
> On 11/26/19 2:32 PM, Jaegeuk Kim wrote:
> > +static void
> > +loop_mq_freeze_queue(struct loop_device *lo)
> > +{
> > +	for (;;) {
> > +		sync_blockdev(lo->lo_device);
> > +
> > +		/* I/O need to be drained during transfer transition */
> > +		blk_mq_freeze_queue(lo->lo_queue);
> > +
> > +		if (is_dirty_bdev(lo->lo_device))
> > +			blk_mq_unfreeze_queue(lo->lo_queue);
> > +		else
> > +			break;
> > +	}
> > +	kill_bdev(lo->lo_device);
> > +}
> 
> Maybe I overlooked something, but how does the above code prevent that pages
> get dirtied after is_dirty_bdev() returns false? blk_mq_freeze_queue()
> freezes lo->lo_queue but not the filesystem that uses that request queue.
> Did you perhaps want to call a function that freezes filesystem activity
> from inside the loop (not sure if such a function already exists)?

Ah, no. Indeed, I realize that that's why I posted v2 patch at the first time.
We don't need to care about any stale cached pages during transition, but
after transition, it'd be enough to refresh the page cache.

The below should be v3.

From 7700e52a9e04c02256923222d8948ac6e1182b60 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Fri, 17 May 2019 16:37:50 -0700
Subject: [PATCH] loop: avoid EAGAIN, if offset or block_size are changed

This patch tries to avoid EAGAIN due to nrpages!=0 that was originally trying
to drop stale pages resulting in wrong data access.

Report: https://bugs.chromium.org/p/chromium/issues/detail?id=938958#c38

Cc: <stable@vger.kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: linux-block@vger.kernel.org
Cc: Bart Van Assche <bvanassche@acm.org>
Fixes: 5db470e229e2 ("loop: drop caches if offset or block_size are changed")
Reported-by: Gwendal Grignou <gwendal@chromium.org>
Reported-by: grygorii tertychnyi <gtertych@cisco.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 drivers/block/loop.c | 48 +++++++++++++++++---------------------------
 1 file changed, 18 insertions(+), 30 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index f6f77eaa7217..aeab5897ec8f 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1232,6 +1232,7 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 	kuid_t uid = current_uid();
 	struct block_device *bdev;
 	bool partscan = false;
+	bool drop_caches = false;
 
 	err = mutex_lock_killable(&loop_ctl_mutex);
 	if (err)
@@ -1252,10 +1253,8 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 	}
 
 	if (lo->lo_offset != info->lo_offset ||
-	    lo->lo_sizelimit != info->lo_sizelimit) {
-		sync_blockdev(lo->lo_device);
-		kill_bdev(lo->lo_device);
-	}
+	    lo->lo_sizelimit != info->lo_sizelimit)
+		drop_caches = true;
 
 	/* I/O need to be drained during transfer transition */
 	blk_mq_freeze_queue(lo->lo_queue);
@@ -1285,14 +1284,6 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 
 	if (lo->lo_offset != info->lo_offset ||
 	    lo->lo_sizelimit != info->lo_sizelimit) {
-		/* kill_bdev should have truncated all the pages */
-		if (lo->lo_device->bd_inode->i_mapping->nrpages) {
-			err = -EAGAIN;
-			pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n",
-				__func__, lo->lo_number, lo->lo_file_name,
-				lo->lo_device->bd_inode->i_mapping->nrpages);
-			goto out_unfreeze;
-		}
 		if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit)) {
 			err = -EFBIG;
 			goto out_unfreeze;
@@ -1337,6 +1328,12 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 		bdev = lo->lo_device;
 		partscan = true;
 	}
+
+	/* truncate stale pages cached by previous operations */
+	if (!err && drop_caches) {
+		sync_blockdev(lo->lo_device);
+		invalidate_bdev(lo->lo_device);
+	}
 out_unlock:
 	mutex_unlock(&loop_ctl_mutex);
 	if (partscan)
@@ -1518,7 +1515,7 @@ static int loop_set_dio(struct loop_device *lo, unsigned long arg)
 
 static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
 {
-	int err = 0;
+	bool drop_caches = false;
 
 	if (lo->lo_state != Lo_bound)
 		return -ENXIO;
@@ -1526,31 +1523,22 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
 	if (arg < 512 || arg > PAGE_SIZE || !is_power_of_2(arg))
 		return -EINVAL;
 
-	if (lo->lo_queue->limits.logical_block_size != arg) {
-		sync_blockdev(lo->lo_device);
-		kill_bdev(lo->lo_device);
-	}
+	if (lo->lo_queue->limits.logical_block_size != arg)
+		drop_caches = true;
 
 	blk_mq_freeze_queue(lo->lo_queue);
-
-	/* kill_bdev should have truncated all the pages */
-	if (lo->lo_queue->limits.logical_block_size != arg &&
-			lo->lo_device->bd_inode->i_mapping->nrpages) {
-		err = -EAGAIN;
-		pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n",
-			__func__, lo->lo_number, lo->lo_file_name,
-			lo->lo_device->bd_inode->i_mapping->nrpages);
-		goto out_unfreeze;
-	}
-
 	blk_queue_logical_block_size(lo->lo_queue, arg);
 	blk_queue_physical_block_size(lo->lo_queue, arg);
 	blk_queue_io_min(lo->lo_queue, arg);
 	loop_update_dio(lo);
-out_unfreeze:
 	blk_mq_unfreeze_queue(lo->lo_queue);
 
-	return err;
+	/* truncate stale pages cached by previous operations */
+	if (drop_caches) {
+		sync_blockdev(lo->lo_device);
+		invalidate_bdev(lo->lo_device);
+	}
+	return 0;
 }
 
 static int lo_simple_ioctl(struct loop_device *lo, unsigned int cmd,

Bart Van Assche Nov. 27, 2019, 12:26 a.m. UTC | #16

On 11/26/19 4:04 PM, Jaegeuk Kim wrote:
> Subject: [PATCH] loop: avoid EAGAIN, if offset or block_size are changed
> 
> This patch tries to avoid EAGAIN due to nrpages!=0 that was originally trying
> to drop stale pages resulting in wrong data access.

Does this patch remove all code that returns EAGAIN from the code paths 
used for changing the offset and block size? If so, please make the 
commit message more affirmative.

>   	if (lo->lo_offset != info->lo_offset ||
> -	    lo->lo_sizelimit != info->lo_sizelimit) {
> -		sync_blockdev(lo->lo_device);
> -		kill_bdev(lo->lo_device);
> -	}
> +	    lo->lo_sizelimit != info->lo_sizelimit)
> +		drop_caches = true;

If the offset is changed and dirty pages are only flushed after the loop 
device offset has been changed, can that cause data to be written at a 
wrong LBA? In other words, I'd like to keep a sync_blockdev() call here.

> +	/* truncate stale pages cached by previous operations */
> +	if (!err && drop_caches) {
> +		sync_blockdev(lo->lo_device);
> +		invalidate_bdev(lo->lo_device);
> +	}

Is the invalidate_bdev() call necessary here?

Thanks,

Bart.

Jaegeuk Kim Nov. 27, 2019, 1:09 a.m. UTC | #17

On 11/26, Bart Van Assche wrote:
> On 11/26/19 4:04 PM, Jaegeuk Kim wrote:
> > Subject: [PATCH] loop: avoid EAGAIN, if offset or block_size are changed
> > 
> > This patch tries to avoid EAGAIN due to nrpages!=0 that was originally trying
> > to drop stale pages resulting in wrong data access.
> 
> Does this patch remove all code that returns EAGAIN from the code paths used
> for changing the offset and block size? If so, please make the commit
> message more affirmative.
> 
> >   	if (lo->lo_offset != info->lo_offset ||
> > -	    lo->lo_sizelimit != info->lo_sizelimit) {
> > -		sync_blockdev(lo->lo_device);
> > -		kill_bdev(lo->lo_device);
> > -	}
> > +	    lo->lo_sizelimit != info->lo_sizelimit)
> > +		drop_caches = true;
> 
> If the offset is changed and dirty pages are only flushed after the loop
> device offset has been changed, can that cause data to be written at a wrong
> LBA? In other words, I'd like to keep a sync_blockdev() call here.
> 
> > +	/* truncate stale pages cached by previous operations */
> > +	if (!err && drop_caches) {
> > +		sync_blockdev(lo->lo_device);
> > +		invalidate_bdev(lo->lo_device);
> > +	}
> 
> Is the invalidate_bdev() call necessary here?

We need this to reload 4KB-sized buffer caches back.

How about this?

From ceef42dbf4ec74c34d58125a20cc11ef13e2e1c4 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Fri, 17 May 2019 16:37:50 -0700
Subject: [PATCH] loop: avoid EAGAIN, if offset or block_size are changed

Previously, there was a bug where user could see stale buffer cache (e.g, 512B)
attached in the 4KB-sized pager cache, when the block size was changed from
512B to 4KB. That was fixed by:
commit 5db470e229e2 ("loop: drop caches if offset or block_size are changed")

But, there were some regression reports saying the fix returns EAGAIN easily.
So, this patch removes previously added EAGAIN condition, nrpages != 0.

Instead, it changes the flow like this:
- sync_blockdev()
- blk_mq_freeze_queue()
 : change the loop configuration
- blk_mq_unfreeze_queue()
- sync_blockdev()
- invalidate_bdev()

After invalidating the buffer cache, we must see the full valid 4KB page.

Additional concern came from Bart in which we can lose some data when
changing the lo_offset. In that case, this patch adds:
- sync_blockdev()
- blk_set_queue_dying
- blk_mq_freeze_queue()
 : change the loop configuration
- blk_mq_unfreeze_queue()
- blk_queue_flag_clear(QUEUE_FLAG_DYING);
- sync_blockdev()
- invalidate_bdev()

Report: https://bugs.chromium.org/p/chromium/issues/detail?id=938958#c38

Cc: <stable@vger.kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: linux-block@vger.kernel.org
Cc: Bart Van Assche <bvanassche@acm.org>
Fixes: 5db470e229e2 ("loop: drop caches if offset or block_size are changed")
Reported-by: Gwendal Grignou <gwendal@chromium.org>
Reported-by: grygorii tertychnyi <gtertych@cisco.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 drivers/block/loop.c | 59 ++++++++++++++++++++++----------------------
 1 file changed, 29 insertions(+), 30 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index f6f77eaa7217..9c1985de85e0 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1232,6 +1232,8 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 	kuid_t uid = current_uid();
 	struct block_device *bdev;
 	bool partscan = false;
+	bool drop_request = false;
+	bool drop_cache = false;
 
 	err = mutex_lock_killable(&loop_ctl_mutex);
 	if (err)
@@ -1251,11 +1253,16 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 		goto out_unlock;
 	}
 
+	if (lo->lo_offset != info->lo_offset)
+		drop_request = true;
 	if (lo->lo_offset != info->lo_offset ||
-	    lo->lo_sizelimit != info->lo_sizelimit) {
-		sync_blockdev(lo->lo_device);
-		kill_bdev(lo->lo_device);
-	}
+	    lo->lo_sizelimit != info->lo_sizelimit)
+		drop_cache = true;
+
+	sync_blockdev(lo->lo_device);
+
+	if (drop_request)
+		blk_set_queue_dying(lo->lo_queue);
 
 	/* I/O need to be drained during transfer transition */
 	blk_mq_freeze_queue(lo->lo_queue);
@@ -1285,14 +1292,6 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 
 	if (lo->lo_offset != info->lo_offset ||
 	    lo->lo_sizelimit != info->lo_sizelimit) {
-		/* kill_bdev should have truncated all the pages */
-		if (lo->lo_device->bd_inode->i_mapping->nrpages) {
-			err = -EAGAIN;
-			pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n",
-				__func__, lo->lo_number, lo->lo_file_name,
-				lo->lo_device->bd_inode->i_mapping->nrpages);
-			goto out_unfreeze;
-		}
 		if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit)) {
 			err = -EFBIG;
 			goto out_unfreeze;
@@ -1329,6 +1328,8 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 
 out_unfreeze:
 	blk_mq_unfreeze_queue(lo->lo_queue);
+	if (drop_request)
+		blk_queue_flag_clear(QUEUE_FLAG_DYING, lo->lo_queue);
 
 	if (!err && (info->lo_flags & LO_FLAGS_PARTSCAN) &&
 	     !(lo->lo_flags & LO_FLAGS_PARTSCAN)) {
@@ -1337,6 +1338,12 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 		bdev = lo->lo_device;
 		partscan = true;
 	}
+
+	/* truncate stale pages cached by previous operations */
+	if (!err && drop_cache) {
+		sync_blockdev(lo->lo_device);
+		invalidate_bdev(lo->lo_device);
+	}
 out_unlock:
 	mutex_unlock(&loop_ctl_mutex);
 	if (partscan)
@@ -1518,7 +1525,7 @@ static int loop_set_dio(struct loop_device *lo, unsigned long arg)
 
 static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
 {
-	int err = 0;
+	bool drop_cache = false;
 
 	if (lo->lo_state != Lo_bound)
 		return -ENXIO;
@@ -1526,31 +1533,23 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
 	if (arg < 512 || arg > PAGE_SIZE || !is_power_of_2(arg))
 		return -EINVAL;
 
-	if (lo->lo_queue->limits.logical_block_size != arg) {
-		sync_blockdev(lo->lo_device);
-		kill_bdev(lo->lo_device);
-	}
+	if (lo->lo_queue->limits.logical_block_size != arg)
+		drop_cache = true;
 
+	sync_blockdev(lo->lo_device);
 	blk_mq_freeze_queue(lo->lo_queue);
-
-	/* kill_bdev should have truncated all the pages */
-	if (lo->lo_queue->limits.logical_block_size != arg &&
-			lo->lo_device->bd_inode->i_mapping->nrpages) {
-		err = -EAGAIN;
-		pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n",
-			__func__, lo->lo_number, lo->lo_file_name,
-			lo->lo_device->bd_inode->i_mapping->nrpages);
-		goto out_unfreeze;
-	}
-
 	blk_queue_logical_block_size(lo->lo_queue, arg);
 	blk_queue_physical_block_size(lo->lo_queue, arg);
 	blk_queue_io_min(lo->lo_queue, arg);
 	loop_update_dio(lo);
-out_unfreeze:
 	blk_mq_unfreeze_queue(lo->lo_queue);
 
-	return err;
+	/* truncate stale pages cached by previous operations */
+	if (drop_cache) {
+		sync_blockdev(lo->lo_device);
+		invalidate_bdev(lo->lo_device);
+	}
+	return 0;
 }
 
 static int lo_simple_ioctl(struct loop_device *lo, unsigned int cmd,

Bart Van Assche Nov. 27, 2019, 4:35 p.m. UTC | #18

On 11/26/19 5:09 PM, Jaegeuk Kim wrote:
> +	if (drop_request)
> +		blk_set_queue_dying(lo->lo_queue);
>   
>   	/* I/O need to be drained during transfer transition */
>   	blk_mq_freeze_queue(lo->lo_queue);

Since blk_set_queue_dying() calls blk_freeze_queue_start(), I think the 
above code will increase q->mq_freeze_depth by one or by two depending 
on which path is taken. How about changing the above code into the 
following:

	if (drop_request) {
		blk_set_queue_dying(lo->lo_queue);
		blk_mq_freeze_queue_wait(lo->lo_queue);
	} else {
		blk_mq_freeze_queue(lo->lo_queue);
	}

Otherwise this patch looks good to me.

Thanks,

Bart.

Jaegeuk Kim Nov. 27, 2019, 6:17 p.m. UTC | #19

On 11/27, Bart Van Assche wrote:
> On 11/26/19 5:09 PM, Jaegeuk Kim wrote:
> > +	if (drop_request)
> > +		blk_set_queue_dying(lo->lo_queue);
> >   	/* I/O need to be drained during transfer transition */
> >   	blk_mq_freeze_queue(lo->lo_queue);
> 
> Since blk_set_queue_dying() calls blk_freeze_queue_start(), I think the
> above code will increase q->mq_freeze_depth by one or by two depending on
> which path is taken. How about changing the above code into the following:
> 
> 	if (drop_request) {
> 		blk_set_queue_dying(lo->lo_queue);
> 		blk_mq_freeze_queue_wait(lo->lo_queue);
> 	} else {
> 		blk_mq_freeze_queue(lo->lo_queue);
> 	}

Done.

> 
> Otherwise this patch looks good to me.
> 
> Thanks,
> 
> Bart.

Jaegeuk Kim Nov. 27, 2019, 6:18 p.m. UTC | #20

Previously, there was a bug where user could see stale buffer cache (e.g, 512B)
attached in the 4KB-sized pager cache, when the block size was changed from
512B to 4KB. That was fixed by:
commit 5db470e229e2 ("loop: drop caches if offset or block_size are changed")

But, there were some regression reports saying the fix returns EAGAIN easily.
So, this patch removes previously added EAGAIN condition, nrpages != 0.

Instead, it changes the flow like this:
- sync_blockdev()
- blk_mq_freeze_queue()
 : change the loop configuration
- blk_mq_unfreeze_queue()
- sync_blockdev()
- invalidate_bdev()

After invalidating the buffer cache, we must see the full valid 4KB page.

Additional concern came from Bart in which we can lose some data when
changing the lo_offset. In that case, this patch adds:
- sync_blockdev()
- blk_set_queue_dying
- blk_mq_freeze_queue()
 : change the loop configuration
- blk_mq_unfreeze_queue()
- blk_queue_flag_clear(QUEUE_FLAG_DYING);
- sync_blockdev()
- invalidate_bdev()

Report: https://bugs.chromium.org/p/chromium/issues/detail?id=938958#c38

Cc: <stable@vger.kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: linux-block@vger.kernel.org
Cc: Bart Van Assche <bvanassche@acm.org>
Fixes: 5db470e229e2 ("loop: drop caches if offset or block_size are changed")
Reported-by: Gwendal Grignou <gwendal@chromium.org>
Reported-by: grygorii tertychnyi <gtertych@cisco.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 drivers/block/loop.c | 65 ++++++++++++++++++++++----------------------
 1 file changed, 33 insertions(+), 32 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index f6f77eaa7217..b583050d513a 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1232,6 +1232,8 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 	kuid_t uid = current_uid();
 	struct block_device *bdev;
 	bool partscan = false;
+	bool drop_request = false;
+	bool drop_cache = false;
 
 	err = mutex_lock_killable(&loop_ctl_mutex);
 	if (err)
@@ -1251,14 +1253,21 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 		goto out_unlock;
 	}
 
+	if (lo->lo_offset != info->lo_offset)
+		drop_request = true;
 	if (lo->lo_offset != info->lo_offset ||
-	    lo->lo_sizelimit != info->lo_sizelimit) {
-		sync_blockdev(lo->lo_device);
-		kill_bdev(lo->lo_device);
-	}
+	    lo->lo_sizelimit != info->lo_sizelimit)
+		drop_cache = true;
 
-	/* I/O need to be drained during transfer transition */
-	blk_mq_freeze_queue(lo->lo_queue);
+	sync_blockdev(lo->lo_device);
+
+	if (drop_request) {
+		blk_set_queue_dying(lo->lo_queue);
+		blk_mq_freeze_queue_wait(lo->lo_queue);
+	} else {
+		/* I/O need to be drained during transfer transition */
+		blk_mq_freeze_queue(lo->lo_queue);
+	}
 
 	err = loop_release_xfer(lo);
 	if (err)
@@ -1285,14 +1294,6 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 
 	if (lo->lo_offset != info->lo_offset ||
 	    lo->lo_sizelimit != info->lo_sizelimit) {
-		/* kill_bdev should have truncated all the pages */
-		if (lo->lo_device->bd_inode->i_mapping->nrpages) {
-			err = -EAGAIN;
-			pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n",
-				__func__, lo->lo_number, lo->lo_file_name,
-				lo->lo_device->bd_inode->i_mapping->nrpages);
-			goto out_unfreeze;
-		}
 		if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit)) {
 			err = -EFBIG;
 			goto out_unfreeze;
@@ -1329,6 +1330,8 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 
 out_unfreeze:
 	blk_mq_unfreeze_queue(lo->lo_queue);
+	if (drop_request)
+		blk_queue_flag_clear(QUEUE_FLAG_DYING, lo->lo_queue);
 
 	if (!err && (info->lo_flags & LO_FLAGS_PARTSCAN) &&
 	     !(lo->lo_flags & LO_FLAGS_PARTSCAN)) {
@@ -1337,6 +1340,12 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 		bdev = lo->lo_device;
 		partscan = true;
 	}
+
+	/* truncate stale pages cached by previous operations */
+	if (!err && drop_cache) {
+		sync_blockdev(lo->lo_device);
+		invalidate_bdev(lo->lo_device);
+	}
 out_unlock:
 	mutex_unlock(&loop_ctl_mutex);
 	if (partscan)
@@ -1518,7 +1527,7 @@ static int loop_set_dio(struct loop_device *lo, unsigned long arg)
 
 static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
 {
-	int err = 0;
+	bool drop_cache = false;
 
 	if (lo->lo_state != Lo_bound)
 		return -ENXIO;
@@ -1526,31 +1535,23 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
 	if (arg < 512 || arg > PAGE_SIZE || !is_power_of_2(arg))
 		return -EINVAL;
 
-	if (lo->lo_queue->limits.logical_block_size != arg) {
-		sync_blockdev(lo->lo_device);
-		kill_bdev(lo->lo_device);
-	}
+	if (lo->lo_queue->limits.logical_block_size != arg)
+		drop_cache = true;
 
+	sync_blockdev(lo->lo_device);
 	blk_mq_freeze_queue(lo->lo_queue);
-
-	/* kill_bdev should have truncated all the pages */
-	if (lo->lo_queue->limits.logical_block_size != arg &&
-			lo->lo_device->bd_inode->i_mapping->nrpages) {
-		err = -EAGAIN;
-		pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n",
-			__func__, lo->lo_number, lo->lo_file_name,
-			lo->lo_device->bd_inode->i_mapping->nrpages);
-		goto out_unfreeze;
-	}
-
 	blk_queue_logical_block_size(lo->lo_queue, arg);
 	blk_queue_physical_block_size(lo->lo_queue, arg);
 	blk_queue_io_min(lo->lo_queue, arg);
 	loop_update_dio(lo);
-out_unfreeze:
 	blk_mq_unfreeze_queue(lo->lo_queue);
 
-	return err;
+	/* truncate stale pages cached by previous operations */
+	if (drop_cache) {
+		sync_blockdev(lo->lo_device);
+		invalidate_bdev(lo->lo_device);
+	}
+	return 0;
 }
 
 static int lo_simple_ioctl(struct loop_device *lo, unsigned int cmd,

Bart Van Assche Nov. 27, 2019, 6:54 p.m. UTC | #21

On 11/27/19 10:18 AM, Jaegeuk Kim wrote:
> Previously, there was a bug where user could see stale buffer cache (e.g, 512B)
> attached in the 4KB-sized pager cache, when the block size was changed from
> 512B to 4KB. That was fixed by:
> commit 5db470e229e2 ("loop: drop caches if offset or block_size are changed")

[ ... ]

Reviewed-by: Bart Van Assche <bvanassche@acm.org>

Andrew Norrie Feb. 19, 2020, 7:58 p.m. UTC | #22

Hi,

Just checking again the status of this patch?
It doesn't look like it's made it into the kernel yet?

Jan Kara March 5, 2020, 9:04 p.m. UTC | #23

On Fri 17-05-19 17:47:51, Jaegeuk Kim wrote:
> This patch tries to avoid EAGAIN due to nrpages!=0 that was originally trying
> to drop stale pages resulting in wrong data access.
> 
> Report: https://bugs.chromium.org/p/chromium/issues/detail?id=938958#c38

...

> ---
>  drivers/block/loop.c | 44 +++++++++++++++++---------------------------
>  1 file changed, 17 insertions(+), 27 deletions(-)
> 
> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index 102d79575895..7c7d2d9c47d0 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -1212,6 +1212,7 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
>  	kuid_t uid = current_uid();
>  	struct block_device *bdev;
>  	bool partscan = false;
> +	bool drop_caches = false;
>  
>  	err = mutex_lock_killable(&loop_ctl_mutex);
>  	if (err)
> @@ -1232,10 +1233,8 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
>  	}
>  
>  	if (lo->lo_offset != info->lo_offset ||
> -	    lo->lo_sizelimit != info->lo_sizelimit) {
> -		sync_blockdev(lo->lo_device);
> -		kill_bdev(lo->lo_device);
> -	}
> +	    lo->lo_sizelimit != info->lo_sizelimit)
> +		drop_caches = true;

I don't think this solution of moving buffer cache invalidation after loop
device is updated is really correct.

If there's any dirty data in the buffer cache, god knows where it ends
up being written after the loop device is reconfigured. Think e.g. of a
file offset being changed. It may not be even possible to write it if say
block size increased and we have now improperly sized buffers in the buffer
cache...

Frankly, I have rather limited sympathy to someone trying to reconfigure a
loop device while it is in use. Is there any sane usecase? I'd be inclined
to just use a similar trick as we did with LOOP_SET_FD and allow these
changes only if the caller has the loop device open exclusively or we are
able to upgrade to exclusive open. As otherwise say mounted filesystem on
top of loop device being reconfigured is very likely to be in serious
trouble (e.g. it's impossible to fully invalidate buffer cache in that
case).

But that's probably somewhat tangential to the problem you have. For your
case I don't really see a race-free way to invalidate buffer cache and
update loop configuration - the best I can see is to flush & invalidate
the cache, freeze the bdev so that new data cannot be read into the
buffer cache, check the cache is still empty - if yes, go ahead. If not,
unfreeze and try again...

								Honza

>  	/* I/O need to be drained during transfer transition */
>  	blk_mq_freeze_queue(lo->lo_queue);
> @@ -1265,14 +1264,6 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
>  
>  	if (lo->lo_offset != info->lo_offset ||
>  	    lo->lo_sizelimit != info->lo_sizelimit) {
> -		/* kill_bdev should have truncated all the pages */
> -		if (lo->lo_device->bd_inode->i_mapping->nrpages) {
> -			err = -EAGAIN;
> -			pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n",
> -				__func__, lo->lo_number, lo->lo_file_name,
> -				lo->lo_device->bd_inode->i_mapping->nrpages);
> -			goto out_unfreeze;
> -		}
>  		if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit)) {
>  			err = -EFBIG;
>  			goto out_unfreeze;
> @@ -1317,6 +1308,12 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
>  		bdev = lo->lo_device;
>  		partscan = true;
>  	}
> +
> +	/* truncate stale pages cached by previous operations */
> +	if (!err && drop_caches) {
> +		sync_blockdev(lo->lo_device);
> +		kill_bdev(lo->lo_device);
> +	}
>  out_unlock:
>  	mutex_unlock(&loop_ctl_mutex);
>  	if (partscan)
> @@ -1498,6 +1495,7 @@ static int loop_set_dio(struct loop_device *lo, unsigned long arg)
>  
>  static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
>  {
> +	bool drop_caches = false;
>  	int err = 0;
>  
>  	if (lo->lo_state != Lo_bound)
> @@ -1506,23 +1504,10 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
>  	if (arg < 512 || arg > PAGE_SIZE || !is_power_of_2(arg))
>  		return -EINVAL;
>  
> -	if (lo->lo_queue->limits.logical_block_size != arg) {
> -		sync_blockdev(lo->lo_device);
> -		kill_bdev(lo->lo_device);
> -	}
> +	if (lo->lo_queue->limits.logical_block_size != arg)
> +		drop_caches = true;
>  
>  	blk_mq_freeze_queue(lo->lo_queue);
> -
> -	/* kill_bdev should have truncated all the pages */
> -	if (lo->lo_queue->limits.logical_block_size != arg &&
> -			lo->lo_device->bd_inode->i_mapping->nrpages) {
> -		err = -EAGAIN;
> -		pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n",
> -			__func__, lo->lo_number, lo->lo_file_name,
> -			lo->lo_device->bd_inode->i_mapping->nrpages);
> -		goto out_unfreeze;
> -	}
> -
>  	blk_queue_logical_block_size(lo->lo_queue, arg);
>  	blk_queue_physical_block_size(lo->lo_queue, arg);
>  	blk_queue_io_min(lo->lo_queue, arg);
> @@ -1530,6 +1515,11 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
>  out_unfreeze:
>  	blk_mq_unfreeze_queue(lo->lo_queue);
>  
> +	/* truncate stale pages cached by previous operations */
> +	if (drop_caches) {
> +		sync_blockdev(lo->lo_device);
> +		kill_bdev(lo->lo_device);
> +	}
>  	return err;
>  }
>

loop: avoid EAGAIN, if offset or block_size are changed

Commit Message

Comments

Patch