block: introduce BLKDEV_DISCARD_ZERO to fix zeroout
diff mbox

Message ID 1466418523-22552-1-git-send-email-hch@lst.de
State New
Headers show

Commit Message

Christoph Hellwig June 20, 2016, 10:28 a.m. UTC
Currently blkdev_issue_zeroout cascades down from discards (if the driver
gurantees that discards zero data), to WRITE SAME and then to a loop
writing zeroes.  Unfortunately we ignore tun-time EOPNOTSUPP errors in the
block layer blkdev_issue_discard helper to work around DM volumes that
may have mixed discard support underneath.

This path intoroduces a new BLKDEV_DISCARD_ZERO flag to
blkdev_issue_discard that indicates we are called for zeroing operation.
This allows both to ignore the EOPNOTSUPP hack and actually consolidating
the discard_zeroes_data check into the function.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-lib.c        | 17 +++++++++++------
 include/linux/blkdev.h |  4 +++-
 2 files changed, 14 insertions(+), 7 deletions(-)

Comments

Jeff Moyer June 20, 2016, 6:29 p.m. UTC | #1
Christoph Hellwig <hch@lst.de> writes:

> Currently blkdev_issue_zeroout cascades down from discards (if the driver
> gurantees that discards zero data), to WRITE SAME and then to a loop
> writing zeroes.  Unfortunately we ignore tun-time EOPNOTSUPP errors in the
> block layer blkdev_issue_discard helper to work around DM volumes that
> may have mixed discard support underneath.
>
> This path intoroduces a new BLKDEV_DISCARD_ZERO flag to
> blkdev_issue_discard that indicates we are called for zeroing operation.
> This allows both to ignore the EOPNOTSUPP hack and actually consolidating
> the discard_zeroes_data check into the function.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  block/blk-lib.c        | 17 +++++++++++------
>  include/linux/blkdev.h |  4 +++-
>  2 files changed, 14 insertions(+), 7 deletions(-)
>
> diff --git a/block/blk-lib.c b/block/blk-lib.c
> index 78626c2..45b35b1 100644
> --- a/block/blk-lib.c
> +++ b/block/blk-lib.c
> @@ -36,12 +36,17 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
>  		return -ENXIO;
>  
>  	if (flags & BLKDEV_DISCARD_SECURE) {
> +		if (flags & BLKDEV_DISCARD_ZERO)
> +			return -EOPNOTSUPP;

Should this be -EINVAL?

-Jeff

>  		if (!blk_queue_secure_erase(q))
>  			return -EOPNOTSUPP;
>  		op = REQ_OP_SECURE_ERASE;
>  	} else {
>  		if (!blk_queue_discard(q))
>  			return -EOPNOTSUPP;
> +		if ((flags & BLKDEV_DISCARD_ZERO) &&
> +		    !q->limits.discard_zeroes_data)
> +			return -EOPNOTSUPP;
>  		op = REQ_OP_DISCARD;
>  	}
>  
> @@ -116,7 +121,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
>  			&bio);
>  	if (!ret && bio) {
>  		ret = submit_bio_wait(bio);
> -		if (ret == -EOPNOTSUPP)
> +		if (ret == -EOPNOTSUPP && !(flags & BLKDEV_DISCARD_ZERO))
>  			ret = 0;
>  	}
>  	blk_finish_plug(&plug);
> @@ -241,11 +246,11 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
>  int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
>  			 sector_t nr_sects, gfp_t gfp_mask, bool discard)
>  {
> -	struct request_queue *q = bdev_get_queue(bdev);
> -
> -	if (discard && blk_queue_discard(q) && q->limits.discard_zeroes_data &&
> -	    blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, 0) == 0)
> -		return 0;
> +	if (discard) {
> +		if (!blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask,
> +				BLKDEV_DISCARD_ZERO))
> +			return 0;
> +	}
>  
>  	if (bdev_write_same(bdev) &&
>  	    blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask,
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 9d1e0a4..b65ca66 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -1136,7 +1136,9 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
>  	return bqt->tag_index[tag];
>  }
>  
> -#define BLKDEV_DISCARD_SECURE  0x01    /* secure discard */
> +
> +#define BLKDEV_DISCARD_SECURE	(1 << 0)	/* issue a secure erase */
> +#define BLKDEV_DISCARD_ZERO	(1 << 1)	/* must reliably zero data */
>  
>  extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *);
>  extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Martin K. Petersen June 21, 2016, 12:44 a.m. UTC | #2
>>>>> "Christoph" == Christoph Hellwig <hch@lst.de> writes:

Christoph> Currently blkdev_issue_zeroout cascades down from discards
Christoph> (if the driver gurantees that discards zero data), to WRITE

guarantees

Christoph> SAME and then to a loop writing zeroes.  Unfortunately we
Christoph> ignore tun-time EOPNOTSUPP errors in the block layer

run-time

Christoph> blkdev_issue_discard helper to work around DM volumes that
Christoph> may have mixed discard support underneath.

Christoph> This path intoroduces a new BLKDEV_DISCARD_ZERO flag to

patch introduces

Christoph> blkdev_issue_discard that indicates we are called for zeroing
Christoph> operation.  This allows both to ignore the EOPNOTSUPP hack
Christoph> and actually consolidating the discard_zeroes_data check into
Christoph> the function.

I like this better than the I/O error tracking approach. The
blkdev_issue_write_same() -EOPNOTSUPP regression is still present,
though.
Christoph Hellwig June 21, 2016, 12:39 p.m. UTC | #3
> >  	if (flags & BLKDEV_DISCARD_SECURE) {
> > +		if (flags & BLKDEV_DISCARD_ZERO)
> > +			return -EOPNOTSUPP;
> 
> Should this be -EINVAL?

BLKDEV_DISCARD_SECURE | BLKDEV_DISCARD_ZERO seems like a theoretically
possible case, but I don't really care..
--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig June 21, 2016, 12:40 p.m. UTC | #4
On Mon, Jun 20, 2016 at 08:44:02PM -0400, Martin K. Petersen wrote:
> Christoph> operation.  This allows both to ignore the EOPNOTSUPP hack
> Christoph> and actually consolidating the discard_zeroes_data check into
> Christoph> the function.
> 
> I like this better than the I/O error tracking approach. The
> blkdev_issue_write_same() -EOPNOTSUPP regression is still present,
> though.

Which one?  Can't think of how that has anything to do with
blkdev_issue_discard.
--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jeff Moyer June 21, 2016, 2:02 p.m. UTC | #5
Christoph Hellwig <hch@lst.de> writes:

>> >  	if (flags & BLKDEV_DISCARD_SECURE) {
>> > +		if (flags & BLKDEV_DISCARD_ZERO)
>> > +			return -EOPNOTSUPP;
>> 
>> Should this be -EINVAL?
>
> BLKDEV_DISCARD_SECURE | BLKDEV_DISCARD_ZERO seems like a theoretically
> possible case, but I don't really care..

I agree, that is a possible combination.  EOPNOTSUPP is fine.

-Jeff
--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Martin K. Petersen June 22, 2016, 2:02 a.m. UTC | #6
>>>>> "Christoph" == Christoph Hellwig <hch@lst.de> writes:

Christoph,

>> I like this better than the I/O error tracking approach. The
>> blkdev_issue_write_same() -EOPNOTSUPP regression is still present,
>> though.

Christoph> Which one?  Can't think of how that has anything to do with
Christoph> blkdev_issue_discard.

In 9082e87bfbf8 ("block: remove struct bio_batch") you changed the error
return for blkdev_issue_write_same() to look like the one for discard:

-	if (bb.error)
-		return bb.error;
-	return ret;
+	if (bio)
+		ret = submit_bio_wait(REQ_WRITE | REQ_WRITE_SAME, bio);
+	return ret != -EOPNOTSUPP ? ret : 0;

It's not a problem for SCSI since I'd be returning -EREMOTEIO. But if a
stacking driver returns -EOPNOTSUPP in response to a WRITE SAME request
we'll get data corruption.
Christoph Hellwig June 22, 2016, 7:52 a.m. UTC | #7
On Tue, Jun 21, 2016 at 10:02:10PM -0400, Martin K. Petersen wrote:
> In 9082e87bfbf8 ("block: remove struct bio_batch") you changed the error
> return for blkdev_issue_write_same() to look like the one for discard:
> 
> -	if (bb.error)
> -		return bb.error;
> -	return ret;
> +	if (bio)
> +		ret = submit_bio_wait(REQ_WRITE | REQ_WRITE_SAME, bio);
> +	return ret != -EOPNOTSUPP ? ret : 0;
> 
> It's not a problem for SCSI since I'd be returning -EREMOTEIO. But if a
> stacking driver returns -EOPNOTSUPP in response to a WRITE SAME request
> we'll get data corruption.

Oh, I see what you mean, but I disagree with the analysis.  Unlike
discard outside the zeroout path, write same is a data integrity operation.
Just like in the zero out case turning an EOPNOTSUPP into 0 will get you
data corruption, as the caller will see a successful return for an
operation that did not actually write data to disk.  Same is true for
writing the actual zeroes in __blkdev_issue_zeroout.

Re stacking drivers and discard / write same:  why does
blk_set_stacking_limits set discard_zeroes_data to 1 and
max_write_same_sectors to UINT_MAX?  These seem like inherently dangerous
defaults.
--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Martin K. Petersen June 22, 2016, 1:53 p.m. UTC | #8
>>>>> "Christoph" == Christoph Hellwig <hch@lst.de> writes:

Christoph,

Christoph> Unlike discard outside the zeroout path, write same is a data
Christoph> integrity operation.  Just like in the zero out case turning
Christoph> an EOPNOTSUPP into 0 will get you data corruption, as the
Christoph> caller will see a successful return for an operation that did
Christoph> not actually write data to disk.

Exactly. So why add the dreaded -EOPNOTSUPP special casing to
blkdev_issue_write_same()?

Christoph> Re stacking drivers and discard / write same: why does
Christoph> blk_set_stacking_limits set discard_zeroes_data to 1 and
Christoph> max_write_same_sectors to UINT_MAX?  These seem like
Christoph> inherently dangerous defaults.

It's just shorthand for "stacking driver does not impose any limits, use
whatever the low-level device sets".

If the stacking driver has an actual constraint it is free to set a
different limit prior to calling the stacking function.
Christoph Hellwig June 22, 2016, 2:04 p.m. UTC | #9
On Wed, Jun 22, 2016 at 09:53:04AM -0400, Martin K. Petersen wrote:
> >>>>> "Christoph" == Christoph Hellwig <hch@lst.de> writes:
> 
> Christoph,
> 
> Christoph> Unlike discard outside the zeroout path, write same is a data
> Christoph> integrity operation.  Just like in the zero out case turning
> Christoph> an EOPNOTSUPP into 0 will get you data corruption, as the
> Christoph> caller will see a successful return for an operation that did
> Christoph> not actually write data to disk.
> 
> Exactly. So why add the dreaded -EOPNOTSUPP special casing to
> blkdev_issue_write_same()?

This is a leftover from bio_batch_end_io and not new behavior.  But
I agree that we should kill it.

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch
diff mbox

diff --git a/block/blk-lib.c b/block/blk-lib.c
index 78626c2..45b35b1 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -36,12 +36,17 @@  int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		return -ENXIO;
 
 	if (flags & BLKDEV_DISCARD_SECURE) {
+		if (flags & BLKDEV_DISCARD_ZERO)
+			return -EOPNOTSUPP;
 		if (!blk_queue_secure_erase(q))
 			return -EOPNOTSUPP;
 		op = REQ_OP_SECURE_ERASE;
 	} else {
 		if (!blk_queue_discard(q))
 			return -EOPNOTSUPP;
+		if ((flags & BLKDEV_DISCARD_ZERO) &&
+		    !q->limits.discard_zeroes_data)
+			return -EOPNOTSUPP;
 		op = REQ_OP_DISCARD;
 	}
 
@@ -116,7 +121,7 @@  int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 			&bio);
 	if (!ret && bio) {
 		ret = submit_bio_wait(bio);
-		if (ret == -EOPNOTSUPP)
+		if (ret == -EOPNOTSUPP && !(flags & BLKDEV_DISCARD_ZERO))
 			ret = 0;
 	}
 	blk_finish_plug(&plug);
@@ -241,11 +246,11 @@  static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 			 sector_t nr_sects, gfp_t gfp_mask, bool discard)
 {
-	struct request_queue *q = bdev_get_queue(bdev);
-
-	if (discard && blk_queue_discard(q) && q->limits.discard_zeroes_data &&
-	    blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, 0) == 0)
-		return 0;
+	if (discard) {
+		if (!blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask,
+				BLKDEV_DISCARD_ZERO))
+			return 0;
+	}
 
 	if (bdev_write_same(bdev) &&
 	    blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 9d1e0a4..b65ca66 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1136,7 +1136,9 @@  static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
 	return bqt->tag_index[tag];
 }
 
-#define BLKDEV_DISCARD_SECURE  0x01    /* secure discard */
+
+#define BLKDEV_DISCARD_SECURE	(1 << 0)	/* issue a secure erase */
+#define BLKDEV_DISCARD_ZERO	(1 << 1)	/* must reliably zero data */
 
 extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *);
 extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,