diff mbox series

[v2,1/2] block: Read max write zeroes once for __blkdev_issue_write_zeroes()

Message ID 20240815163228.216051-2-john.g.garry@oracle.com (mailing list archive)
State New, archived
Headers show
Series block: Fix __blkdev_issue_write_zeroes() limit handling | expand

Commit Message

John Garry Aug. 15, 2024, 4:32 p.m. UTC
As reported in [0], we may get a hang when formatting a XFS FS on a RAID0
drive.

Commit 73a768d5f955 ("block: factor out a blk_write_zeroes_limit helper")
changed __blkdev_issue_write_zeroes() to read the max write zeroes
value in the loop. This is not safe as max write zeroes may change in
value. Specifically for the case of [0], the value goes to 0, and we get
an infinite loop.

Lift the limit reading out of the loop.

[0] https://lore.kernel.org/linux-xfs/4d31268f-310b-4220-88a2-e191c3932a82@oracle.com/T/#t

Fixes: 73a768d5f955 ("block: factor out a blk_write_zeroes_limit helper")
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
 block/blk-lib.c | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

Comments

Martin K. Petersen Aug. 15, 2024, 5:43 p.m. UTC | #1
John,

> As reported in [0], we may get a hang when formatting a XFS FS on a RAID0
> drive.
>
> Commit 73a768d5f955 ("block: factor out a blk_write_zeroes_limit helper")
> changed __blkdev_issue_write_zeroes() to read the max write zeroes
> value in the loop. This is not safe as max write zeroes may change in
> value. Specifically for the case of [0], the value goes to 0, and we get
> an infinite loop.
>
> Lift the limit reading out of the loop.

Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Nitesh Shetty Aug. 16, 2024, 6:34 p.m. UTC | #2
On 15/08/24 04:32PM, John Garry wrote:
>As reported in [0], we may get a hang when formatting a XFS FS on a RAID0
>drive.
>
>Commit 73a768d5f955 ("block: factor out a blk_write_zeroes_limit helper")
>changed __blkdev_issue_write_zeroes() to read the max write zeroes
>value in the loop. This is not safe as max write zeroes may change in
>value. Specifically for the case of [0], the value goes to 0, and we get
>an infinite loop.
>
>Lift the limit reading out of the loop.
>
>[0] https://lore.kernel.org/linux-xfs/4d31268f-310b-4220-88a2-e191c3932a82@oracle.com/T/#t
>
>Fixes: 73a768d5f955 ("block: factor out a blk_write_zeroes_limit helper")
>Reviewed-by: Christoph Hellwig <hch@lst.de>
>Signed-off-by: John Garry <john.g.garry@oracle.com>
>---
> block/blk-lib.c | 25 ++++++++++++++++++-------
> 1 file changed, 18 insertions(+), 7 deletions(-)
>
>diff --git a/block/blk-lib.c b/block/blk-lib.c
>index 9f735efa6c94..83eb7761c2bf 100644
>--- a/block/blk-lib.c
>+++ b/block/blk-lib.c
>@@ -111,13 +111,20 @@ static sector_t bio_write_zeroes_limit(struct block_device *bdev)
> 		(UINT_MAX >> SECTOR_SHIFT) & ~bs_mask);
> }
>
>+/*
>+ * There is no reliable way for the SCSI subsystem to determine whether a
>+ * device supports a WRITE SAME operation without actually performing a write
>+ * to media. As a result, write_zeroes is enabled by default and will be
>+ * disabled if a zeroing operation subsequently fails. This means that this
>+ * queue limit is likely to change at runtime.
>+ */
> static void __blkdev_issue_write_zeroes(struct block_device *bdev,
> 		sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
>-		struct bio **biop, unsigned flags)
>+		struct bio **biop, unsigned flags, sector_t limit)
> {
>+
> 	while (nr_sects) {
>-		unsigned int len = min_t(sector_t, nr_sects,
>-				bio_write_zeroes_limit(bdev));
>+		unsigned int len = min(nr_sects, limit);

I feel changes something like below will simplify the whole patch.

--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -115,9 +115,13 @@ static void __blkdev_issue_write_zeroes(struct block_device *bdev,
  		sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
  		struct bio **biop, unsigned flags)
  {
+	sector_t limit = bio_write_zeroes_limit(bdev);
+
+	if (!limit)
+		return;
+
  	while (nr_sects) {
-		unsigned int len = min_t(sector_t, nr_sects,
-				bio_write_zeroes_limit(bdev));
+		unsigned int len = min_t(sector_t, nr_sects, limit);
  		struct bio *bio;
  
  		if ((flags & BLKDEV_ZERO_KILLABLE) &&


Otherwise, this series looks good to me.

-- Nitesh Shetty
John Garry Aug. 17, 2024, 3:33 p.m. UTC | #3
On 16/08/2024 19:34, Nitesh Shetty wrote:
> On 15/08/24 04:32PM, John Garry wrote:
>> As reported in [0], we may get a hang when formatting a XFS FS on a RAID0
>> drive.
>>
>> Commit 73a768d5f955 ("block: factor out a blk_write_zeroes_limit helper")
>> changed __blkdev_issue_write_zeroes() to read the max write zeroes
>> value in the loop. This is not safe as max write zeroes may change in
>> value. Specifically for the case of [0], the value goes to 0, and we get
>> an infinite loop.
>>
>> Lift the limit reading out of the loop.
>>
>> [0] https://urldefense.com/v3/__https://lore.kernel.org/linux- 
>> xfs/4d31268f-310b-4220-88a2-e191c3932a82@oracle.com/T/*t__;Iw!! 
>> ACWV5N9M2RV99hQ! 
>> KNrgu0c216k_Y_2RLxTasjxizyhbN8eKD61JwIwDxT5OJJDamER6hw1nvf5biNMqQLaLl9PqC2qRUDdHlrGF7g$
>> Fixes: 73a768d5f955 ("block: factor out a blk_write_zeroes_limit helper")
>> Reviewed-by: Christoph Hellwig <hch@lst.de>
>> Signed-off-by: John Garry <john.g.garry@oracle.com>
>> ---
>> block/blk-lib.c | 25 ++++++++++++++++++-------
>> 1 file changed, 18 insertions(+), 7 deletions(-)
>>
>> diff --git a/block/blk-lib.c b/block/blk-lib.c
>> index 9f735efa6c94..83eb7761c2bf 100644
>> --- a/block/blk-lib.c
>> +++ b/block/blk-lib.c
>> @@ -111,13 +111,20 @@ static sector_t bio_write_zeroes_limit(struct 
>> block_device *bdev)
>>         (UINT_MAX >> SECTOR_SHIFT) & ~bs_mask);
>> }
>>
>> +/*
>> + * There is no reliable way for the SCSI subsystem to determine 
>> whether a
>> + * device supports a WRITE SAME operation without actually performing 
>> a write
>> + * to media. As a result, write_zeroes is enabled by default and will be
>> + * disabled if a zeroing operation subsequently fails. This means 
>> that this
>> + * queue limit is likely to change at runtime.
>> + */
>> static void __blkdev_issue_write_zeroes(struct block_device *bdev,
>>         sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
>> -        struct bio **biop, unsigned flags)
>> +        struct bio **biop, unsigned flags, sector_t limit)
>> {
>> +
>>     while (nr_sects) {
>> -        unsigned int len = min_t(sector_t, nr_sects,
>> -                bio_write_zeroes_limit(bdev));
>> +        unsigned int len = min(nr_sects, limit);
> 
> I feel changes something like below will simplify the whole patch.
> 
> --- a/block/blk-lib.c
> +++ b/block/blk-lib.c
> @@ -115,9 +115,13 @@ static void __blkdev_issue_write_zeroes(struct 
> block_device *bdev,
>           sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
>           struct bio **biop, unsigned flags)
>   {
> +    sector_t limit = bio_write_zeroes_limit(bdev);
> +
> +    if (!limit)
> +        return;

Just this?

If yes, then __blkdev_issue_write_zeroes() does not return an error 
code, so can we tell whether the zeroes were actually issued?

Furthermore, I would rather limit points at which we call 
bio_write_zeroes_limit()

BTW, I am going on a short vacation now, so I can't quickly rework this 
(if that was actually required).

> +
>       while (nr_sects) {
> -        unsigned int len = min_t(sector_t, nr_sects,
> -                bio_write_zeroes_limit(bdev));
> +        unsigned int len = min_t(sector_t, nr_sects, limit);
>           struct bio *bio;
> 
>           if ((flags & BLKDEV_ZERO_KILLABLE) &&
> 
> 
> Otherwise, this series looks good to me.
> 
> -- Nitesh Shetty
>
Nitesh Shetty Aug. 19, 2024, 3:15 p.m. UTC | #4
On 17/08/24 04:33PM, John Garry wrote:
>On 16/08/2024 19:34, Nitesh Shetty wrote:
>>On 15/08/24 04:32PM, John Garry wrote:
>>>As reported in [0], we may get a hang when formatting a XFS FS on a RAID0
>>>drive.
>>>
>>>Commit 73a768d5f955 ("block: factor out a blk_write_zeroes_limit helper")
>>>changed __blkdev_issue_write_zeroes() to read the max write zeroes
>>>value in the loop. This is not safe as max write zeroes may change in
>>>value. Specifically for the case of [0], the value goes to 0, and we get
>>>an infinite loop.
>>>
>>>Lift the limit reading out of the loop.
>>>
>>>[0] https://urldefense.com/v3/__https://lore.kernel.org/linux- 
>>>xfs/4d31268f-310b-4220-88a2-e191c3932a82@oracle.com/T/*t__;Iw!! 
>>>ACWV5N9M2RV99hQ! KNrgu0c216k_Y_2RLxTasjxizyhbN8eKD61JwIwDxT5OJJDamER6hw1nvf5biNMqQLaLl9PqC2qRUDdHlrGF7g$
>>>Fixes: 73a768d5f955 ("block: factor out a blk_write_zeroes_limit helper")
>>>Reviewed-by: Christoph Hellwig <hch@lst.de>
>>>Signed-off-by: John Garry <john.g.garry@oracle.com>
>>>---
>>>block/blk-lib.c | 25 ++++++++++++++++++-------
>>>1 file changed, 18 insertions(+), 7 deletions(-)
>>>
>>>diff --git a/block/blk-lib.c b/block/blk-lib.c
>>>index 9f735efa6c94..83eb7761c2bf 100644
>>>--- a/block/blk-lib.c
>>>+++ b/block/blk-lib.c
>>>@@ -111,13 +111,20 @@ static sector_t 
>>>bio_write_zeroes_limit(struct block_device *bdev)
>>>        (UINT_MAX >> SECTOR_SHIFT) & ~bs_mask);
>>>}
>>>
>>>+/*
>>>+ * There is no reliable way for the SCSI subsystem to determine 
>>>whether a
>>>+ * device supports a WRITE SAME operation without actually 
>>>performing a write
>>>+ * to media. As a result, write_zeroes is enabled by default and will be
>>>+ * disabled if a zeroing operation subsequently fails. This means 
>>>that this
>>>+ * queue limit is likely to change at runtime.
>>>+ */
>>>static void __blkdev_issue_write_zeroes(struct block_device *bdev,
>>>        sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
>>>-        struct bio **biop, unsigned flags)
>>>+        struct bio **biop, unsigned flags, sector_t limit)
>>>{
>>>+
>>>    while (nr_sects) {
>>>-        unsigned int len = min_t(sector_t, nr_sects,
>>>-                bio_write_zeroes_limit(bdev));
>>>+        unsigned int len = min(nr_sects, limit);
>>
>>I feel changes something like below will simplify the whole patch.
>>
>>--- a/block/blk-lib.c
>>+++ b/block/blk-lib.c
>>@@ -115,9 +115,13 @@ static void __blkdev_issue_write_zeroes(struct 
>>block_device *bdev,
>>          sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
>>          struct bio **biop, unsigned flags)
>>  {
>>+    sector_t limit = bio_write_zeroes_limit(bdev);
>>+
>>+    if (!limit)
>>+        return;
>
>Just this?
>
>If yes, then __blkdev_issue_write_zeroes() does not return an error 
>code, so can we tell whether the zeroes were actually issued?
>
I failed to see this.

>Furthermore, I would rather limit points at which we call 
>bio_write_zeroes_limit()
>
>BTW, I am going on a short vacation now, so I can't quickly rework 
>this (if that was actually required).
>

Reviewed-by: Nitesh Shetty <nj.shetty@samsung.com>
Shinichiro Kawasaki Aug. 28, 2024, 7:15 a.m. UTC | #5
On Aug 15, 2024 / 16:32, John Garry wrote:
> As reported in [0], we may get a hang when formatting a XFS FS on a RAID0
> drive.
> 
> Commit 73a768d5f955 ("block: factor out a blk_write_zeroes_limit helper")
> changed __blkdev_issue_write_zeroes() to read the max write zeroes
> value in the loop. This is not safe as max write zeroes may change in
> value. Specifically for the case of [0], the value goes to 0, and we get
> an infinite loop.
> 
> Lift the limit reading out of the loop.
> 
> [0] https://lore.kernel.org/linux-xfs/4d31268f-310b-4220-88a2-e191c3932a82@oracle.com/T/#t
> 
> Fixes: 73a768d5f955 ("block: factor out a blk_write_zeroes_limit helper")
> Reviewed-by: Christoph Hellwig <hch@lst.de>
> Signed-off-by: John Garry <john.g.garry@oracle.com>

Hello John,

I ran may test set for the kernel v6.11-rc5 and found that this commit triggers
an error. When I set up xfs with a dm-zoned device on a TCMU zoned device and
filled the filesystem, the kernel reported this error message:

  device-mapper: zoned reclaim: (sdg): Align zone 19 wp 0 to 32736 (wp+32736) blocks failed -121

When dm-zoned calls blkdev_issue_zeorout(), EREMOTEIO 121 was returned, then
the error was reported. I think a change in this commit is the cause. Please
find my comment below.

> ---
>  block/blk-lib.c | 25 ++++++++++++++++++-------
>  1 file changed, 18 insertions(+), 7 deletions(-)
> 
> diff --git a/block/blk-lib.c b/block/blk-lib.c
> index 9f735efa6c94..83eb7761c2bf 100644
> --- a/block/blk-lib.c
> +++ b/block/blk-lib.c
> @@ -111,13 +111,20 @@ static sector_t bio_write_zeroes_limit(struct block_device *bdev)
>  		(UINT_MAX >> SECTOR_SHIFT) & ~bs_mask);
>  }
>  
> +/*
> + * There is no reliable way for the SCSI subsystem to determine whether a
> + * device supports a WRITE SAME operation without actually performing a write
> + * to media. As a result, write_zeroes is enabled by default and will be
> + * disabled if a zeroing operation subsequently fails. This means that this
> + * queue limit is likely to change at runtime.
> + */
>  static void __blkdev_issue_write_zeroes(struct block_device *bdev,
>  		sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
> -		struct bio **biop, unsigned flags)
> +		struct bio **biop, unsigned flags, sector_t limit)
>  {
> +
>  	while (nr_sects) {
> -		unsigned int len = min_t(sector_t, nr_sects,
> -				bio_write_zeroes_limit(bdev));
> +		unsigned int len = min(nr_sects, limit);
>  		struct bio *bio;
>  
>  		if ((flags & BLKDEV_ZERO_KILLABLE) &&
> @@ -141,12 +148,14 @@ static void __blkdev_issue_write_zeroes(struct block_device *bdev,
>  static int blkdev_issue_write_zeroes(struct block_device *bdev, sector_t sector,
>  		sector_t nr_sects, gfp_t gfp, unsigned flags)
>  {
> +	sector_t limit = bio_write_zeroes_limit(bdev);
>  	struct bio *bio = NULL;
>  	struct blk_plug plug;
>  	int ret = 0;
>  
>  	blk_start_plug(&plug);
> -	__blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp, &bio, flags);
> +	__blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp, &bio,
> +			flags, limit);
>  	if (bio) {
>  		if ((flags & BLKDEV_ZERO_KILLABLE) &&
>  		    fatal_signal_pending(current)) {
> @@ -165,7 +174,7 @@ static int blkdev_issue_write_zeroes(struct block_device *bdev, sector_t sector,
>  	 * on an I/O error, in which case we'll turn any error into
>  	 * "not supported" here.
>  	 */
> -	if (ret && !bdev_write_zeroes_sectors(bdev))
> +	if (ret && !limit)
>  		return -EOPNOTSUPP;
>  	return ret;
>  }

The hunk above changed the timing to check WRITE ZEROES capability. Before the
change, bdev_write_zeroes_sectors(bdev) was called after WRITE ZEROES bio
completion. Then it was able to detect the q->limits.max_write_zeroes_sector
value change by drivers at WRITE ZEROES failure (scsi sd driver does it).
However, after applying the hunk, the check is done for the local variable
'limit' which is cached before the WRITE ZEROES bio issue. So it can not detect
the q->limits.max_write_zeroes_sector value change at the WRITE ZEROES failure.
Hence, it does not replace the ERMOTEIO with EOPNOTSUPP.

I think the hunk above should be reverted. I created a patch below for it, and
confirmed it avoids the error. Now I'm running my test set to confirm no
regression. If you have comments on my findings and the patch, please share.

diff --git a/block/blk-lib.c b/block/blk-lib.c
index 83eb7761c2bf..b336d57673d3 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -148,14 +148,13 @@ static void __blkdev_issue_write_zeroes(struct block_device *bdev,
 static int blkdev_issue_write_zeroes(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp, unsigned flags)
 {
-	sector_t limit = bio_write_zeroes_limit(bdev);
 	struct bio *bio = NULL;
 	struct blk_plug plug;
 	int ret = 0;
 
 	blk_start_plug(&plug);
 	__blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp, &bio,
-			flags, limit);
+				    flags, bio_write_zeroes_limit(bdev));
 	if (bio) {
 		if ((flags & BLKDEV_ZERO_KILLABLE) &&
 		    fatal_signal_pending(current)) {
@@ -174,7 +173,7 @@ static int blkdev_issue_write_zeroes(struct block_device *bdev, sector_t sector,
 	 * on an I/O error, in which case we'll turn any error into
 	 * "not supported" here.
 	 */
-	if (ret && !limit)
+	if (ret && !bdev_write_zeroes_sectors(bdev))
 		return -EOPNOTSUPP;
 	return ret;
 }
Christoph Hellwig Aug. 28, 2024, 7:17 a.m. UTC | #6
On Wed, Aug 28, 2024 at 07:15:55AM +0000, Shinichiro Kawasaki wrote:
> I ran may test set for the kernel v6.11-rc5 and found that this commit triggers
> an error. When I set up xfs with a dm-zoned device on a TCMU zoned device and
> filled the filesystem, the kernel reported this error message:
> 
>   device-mapper: zoned reclaim: (sdg): Align zone 19 wp 0 to 32736 (wp+32736) blocks failed -121
> 
> When dm-zoned calls blkdev_issue_zeorout(), EREMOTEIO 121 was returned, then
> the error was reported. I think a change in this commit is the cause. Please
> find my comment below.

This patch should fix it:

https://lore.kernel.org/linux-block/20240827175340.GB1977952@frogsfrogsfrogs/
Shinichiro Kawasaki Aug. 28, 2024, 7:25 a.m. UTC | #7
On Aug 28, 2024 / 09:17, hch wrote:
> On Wed, Aug 28, 2024 at 07:15:55AM +0000, Shinichiro Kawasaki wrote:
> > I ran may test set for the kernel v6.11-rc5 and found that this commit triggers
> > an error. When I set up xfs with a dm-zoned device on a TCMU zoned device and
> > filled the filesystem, the kernel reported this error message:
> > 
> >   device-mapper: zoned reclaim: (sdg): Align zone 19 wp 0 to 32736 (wp+32736) blocks failed -121
> > 
> > When dm-zoned calls blkdev_issue_zeorout(), EREMOTEIO 121 was returned, then
> > the error was reported. I think a change in this commit is the cause. Please
> > find my comment below.
> 
> This patch should fix it:
> 
> https://lore.kernel.org/linux-block/20240827175340.GB1977952@frogsfrogsfrogs/

Oh I should have checked the latest linux-block posts. Anyway, good to have the
fix patch. Thanks!
diff mbox series

Patch

diff --git a/block/blk-lib.c b/block/blk-lib.c
index 9f735efa6c94..83eb7761c2bf 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -111,13 +111,20 @@  static sector_t bio_write_zeroes_limit(struct block_device *bdev)
 		(UINT_MAX >> SECTOR_SHIFT) & ~bs_mask);
 }
 
+/*
+ * There is no reliable way for the SCSI subsystem to determine whether a
+ * device supports a WRITE SAME operation without actually performing a write
+ * to media. As a result, write_zeroes is enabled by default and will be
+ * disabled if a zeroing operation subsequently fails. This means that this
+ * queue limit is likely to change at runtime.
+ */
 static void __blkdev_issue_write_zeroes(struct block_device *bdev,
 		sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
-		struct bio **biop, unsigned flags)
+		struct bio **biop, unsigned flags, sector_t limit)
 {
+
 	while (nr_sects) {
-		unsigned int len = min_t(sector_t, nr_sects,
-				bio_write_zeroes_limit(bdev));
+		unsigned int len = min(nr_sects, limit);
 		struct bio *bio;
 
 		if ((flags & BLKDEV_ZERO_KILLABLE) &&
@@ -141,12 +148,14 @@  static void __blkdev_issue_write_zeroes(struct block_device *bdev,
 static int blkdev_issue_write_zeroes(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp, unsigned flags)
 {
+	sector_t limit = bio_write_zeroes_limit(bdev);
 	struct bio *bio = NULL;
 	struct blk_plug plug;
 	int ret = 0;
 
 	blk_start_plug(&plug);
-	__blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp, &bio, flags);
+	__blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp, &bio,
+			flags, limit);
 	if (bio) {
 		if ((flags & BLKDEV_ZERO_KILLABLE) &&
 		    fatal_signal_pending(current)) {
@@ -165,7 +174,7 @@  static int blkdev_issue_write_zeroes(struct block_device *bdev, sector_t sector,
 	 * on an I/O error, in which case we'll turn any error into
 	 * "not supported" here.
 	 */
-	if (ret && !bdev_write_zeroes_sectors(bdev))
+	if (ret && !limit)
 		return -EOPNOTSUPP;
 	return ret;
 }
@@ -265,12 +274,14 @@  int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, struct bio **biop,
 		unsigned flags)
 {
+	sector_t limit = bio_write_zeroes_limit(bdev);
+
 	if (bdev_read_only(bdev))
 		return -EPERM;
 
-	if (bdev_write_zeroes_sectors(bdev)) {
+	if (limit) {
 		__blkdev_issue_write_zeroes(bdev, sector, nr_sects,
-				gfp_mask, biop, flags);
+				gfp_mask, biop, flags, limit);
 	} else {
 		if (flags & BLKDEV_ZERO_NOFALLBACK)
 			return -EOPNOTSUPP;