diff mbox

[03/23] sd: implement REQ_OP_WRITE_ZEROES

Message ID 20170323143341.31549-4-hch@lst.de (mailing list archive)
State New, archived
Headers show

Commit Message

Christoph Hellwig March 23, 2017, 2:33 p.m. UTC
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/scsi/sd.c     | 45 ++++++++++++++++++++++++++++++++++++++++-----
 drivers/scsi/sd_zbc.c |  1 +
 2 files changed, 41 insertions(+), 5 deletions(-)

Comments

Bart Van Assche March 28, 2017, 6:50 p.m. UTC | #1
On Thu, 2017-03-23 at 10:33 -0400, Christoph Hellwig wrote:
> diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
> index af632e350ab4..b6f70a09a301 100644
> --- a/drivers/scsi/sd.c
> +++ b/drivers/scsi/sd.c
> @@ -748,7 +748,7 @@ static int sd_setup_unmap_cmnd(struct scsi_cmnd *cmd)
>  	return scsi_init_io(cmd);
>  }
>  
> -static int sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd)
> +static int sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, bool unmap)
>  {
>  	struct scsi_device *sdp = cmd->device;
>  	struct request *rq = cmd->request;
> @@ -765,13 +765,14 @@ static int sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd)
>  
>  	cmd->cmd_len = 16;
>  	cmd->cmnd[0] = WRITE_SAME_16;
> -	cmd->cmnd[1] = 0x8; /* UNMAP */
> +	if (unmap)
> +		cmd->cmnd[1] = 0x8; /* UNMAP */
>  	put_unaligned_be64(sector, &cmd->cmnd[2]);
>  	put_unaligned_be32(nr_sectors, &cmd->cmnd[10]);

Hello Christoph,

A quote from SBC: "An OPTIMAL UNMAP GRANULARITY field set to a non-zero value
indicates the optimal granularity in logical blocks for unmap requests (e.g.,
an UNMAP command or a WRITE SAME (16) command with the UNMAP bit set to one).
An unmap request with a number of logical blocks that is not a multiple of
this value may result in unmap operations on fewer LBAs than requested."

This means that just like the start and end of a discard must be aligned on a
discard_granularity boundary, WRITE SAME commands with the UNMAP bit set must
also respect that granularity. I think this means that either
__blkdev_issue_zeroout() has to be modified such that it rejects unaligned
REQ_OP_WRITE_ZEROES operations or that blk_bio_write_same_split() has to be
modified such that it generates REQ_OP_WRITEs for the unaligned start and tail.

Bart.
Mike Snitzer March 28, 2017, 7:33 p.m. UTC | #2
On Tue, Mar 28 2017 at  2:50pm -0400,
Bart Van Assche <Bart.VanAssche@sandisk.com> wrote:

> On Thu, 2017-03-23 at 10:33 -0400, Christoph Hellwig wrote:
> > diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
> > index af632e350ab4..b6f70a09a301 100644
> > --- a/drivers/scsi/sd.c
> > +++ b/drivers/scsi/sd.c
> > @@ -748,7 +748,7 @@ static int sd_setup_unmap_cmnd(struct scsi_cmnd *cmd)
> >  	return scsi_init_io(cmd);
> >  }
> >  
> > -static int sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd)
> > +static int sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, bool unmap)
> >  {
> >  	struct scsi_device *sdp = cmd->device;
> >  	struct request *rq = cmd->request;
> > @@ -765,13 +765,14 @@ static int sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd)
> >  
> >  	cmd->cmd_len = 16;
> >  	cmd->cmnd[0] = WRITE_SAME_16;
> > -	cmd->cmnd[1] = 0x8; /* UNMAP */
> > +	if (unmap)
> > +		cmd->cmnd[1] = 0x8; /* UNMAP */
> >  	put_unaligned_be64(sector, &cmd->cmnd[2]);
> >  	put_unaligned_be32(nr_sectors, &cmd->cmnd[10]);
> 
> Hello Christoph,
> 
> A quote from SBC: "An OPTIMAL UNMAP GRANULARITY field set to a non-zero value
> indicates the optimal granularity in logical blocks for unmap requests (e.g.,
> an UNMAP command or a WRITE SAME (16) command with the UNMAP bit set to one).
> An unmap request with a number of logical blocks that is not a multiple of
> this value may result in unmap operations on fewer LBAs than requested."
> 
> This means that just like the start and end of a discard must be aligned on a
> discard_granularity boundary, WRITE SAME commands with the UNMAP bit set must
> also respect that granularity. I think this means that either
> __blkdev_issue_zeroout() has to be modified such that it rejects unaligned
> REQ_OP_WRITE_ZEROES operations or that blk_bio_write_same_split() has to be
> modified such that it generates REQ_OP_WRITEs for the unaligned start and tail.

That'd get DM thinp off the hook from having to zero the unaligned start
and tail...
Paolo Bonzini March 29, 2017, 2:51 p.m. UTC | #3
On 28/03/2017 20:50, Bart Van Assche wrote:
> 
> This means that just like the start and end of a discard must be aligned on a
> discard_granularity boundary, WRITE SAME commands with the UNMAP bit set must
> also respect that granularity. I think this means that either
> __blkdev_issue_zeroout() has to be modified such that it rejects unaligned
> REQ_OP_WRITE_ZEROES operations or that blk_bio_write_same_split() has to be
> modified such that it generates REQ_OP_WRITEs for the unaligned start and tail.

I don't think this is the case.

Rather, Linux should try to align the WRITE SAME commands to the optimal
unmap granularity if the zeroed area requires more than one WRITE SAME
command (i.e. it exceeds the maximum WRITE SAME length or is too large to
fit in the CDB).  However, even in that case it can use WRITE SAME with
UNMAP for the unaligned start and tail.  Unlike with the UNMAP command, the
SCSI standard guarantees for WRITE SAME that zeroes are written in the
unaligned parts.

Paolo
Bart Van Assche March 29, 2017, 4:28 p.m. UTC | #4
On Wed, 2017-03-29 at 16:51 +0200, Paolo Bonzini wrote:
> On 28/03/2017 20:50, Bart Van Assche wrote:
> > This means that just like the start and end of a discard must be aligned on a
> > discard_granularity boundary, WRITE SAME commands with the UNMAP bit set must
> > also respect that granularity. I think this means that either
> > __blkdev_issue_zeroout() has to be modified such that it rejects unaligned
> > REQ_OP_WRITE_ZEROES operations or that blk_bio_write_same_split() has to be
> > modified such that it generates REQ_OP_WRITEs for the unaligned start and tail.
> 
> I don't think this is the case.

Hello Paolo,

Can you cite the section(s) from the SCSI specs that support your view? I
reread the "5.49 WRITE SAME (10) command" and "4.7.3.4.4 WRITE SAME command
and unmap operations" sections but I have not found any explicit statement
that specifies the behavior for unaligned WRITE SAME commands with the UNMAP
bit set. It seems to me like the OPTIMAL UNMAP GRANULARITY parameter was
overlooked when both sections were written. Should we ask the T10 committee
for a clarification?

Another question is, if the specification of WRITE SAME + UNMAP would be
made unambiguous in the SBC document, whether or not we should take the risk
to trigger behavior that is not what we expect by sending unaligned WRITE
SAME + UNMAP commands to SCSI devices?

Thanks,

Bart.
Paolo Bonzini March 29, 2017, 4:53 p.m. UTC | #5
On 29/03/2017 18:28, Bart Van Assche wrote:
> On Wed, 2017-03-29 at 16:51 +0200, Paolo Bonzini wrote:
>> On 28/03/2017 20:50, Bart Van Assche wrote:
>>> This means that just like the start and end of a discard must be aligned on a
>>> discard_granularity boundary, WRITE SAME commands with the UNMAP bit set must
>>> also respect that granularity. I think this means that either
>>> __blkdev_issue_zeroout() has to be modified such that it rejects unaligned
>>> REQ_OP_WRITE_ZEROES operations or that blk_bio_write_same_split() has to be
>>> modified such that it generates REQ_OP_WRITEs for the unaligned start and tail.
>>
>> I don't think this is the case.
> 
> Hello Paolo,
> 
> Can you cite the section(s) from the SCSI specs that support your view? I
> reread the "5.49 WRITE SAME (10) command" and "4.7.3.4.4 WRITE SAME command
> and unmap operations" sections but I have not found any explicit statement
> that specifies the behavior for unaligned WRITE SAME commands with the UNMAP
> bit set. It seems to me like the OPTIMAL UNMAP GRANULARITY parameter was
> overlooked when both sections were written. Should we ask the T10 committee
> for a clarification?

From 4.7.3.4.4:

------
If unmap operations are requested in a WRITE SAME command,
then for each specified LBA:

if the Data-Out Buffer of the WRITE SAME command is the same as the
logical block data returned by a read operation from that LBA while in
the unmapped state (see 4.7.4.5), then:

1) the device server performs the actions described in table 6; and

2) if an unmap operation is not performed in step 1), then the device
server shall perform the specified write operation to that LBA;
------

and from the description of WRITE SAME (10): "subsequent read operations
behave as if the device server wrote the single block of user data
received from the Data-Out Buffer to each logical block without
modification" (I have a slightly older copy, though; it is section 5.45 here).

It's pretty unambiguous that if the device cannot unmap (including the
case where the request is misaligned with respect to the granularity) it
does a write.

> Another question is, if the specification of WRITE SAME + UNMAP would be
> made unambiguous in the SBC document, whether or not we should take the risk
> to trigger behavior that is not what we expect by sending unaligned WRITE
> SAME + UNMAP commands to SCSI devices?

Yes, I think we should.

Paolo
Martin K. Petersen March 30, 2017, 2:25 a.m. UTC | #6
Bart Van Assche <Bart.VanAssche@sandisk.com> writes:

Hi Bart,

> A quote from SBC: "An OPTIMAL UNMAP GRANULARITY field set to a
> non-zero value indicates the optimal granularity in logical blocks for
> unmap requests (e.g., an UNMAP command or a WRITE SAME (16) command
> with the UNMAP bit set to one).  An unmap request with a number of
> logical blocks that is not a multiple of this value may result in
> unmap operations on fewer LBAs than requested."

Indeed. Fewer LBAs than requested may be *unmapped*. That does not imply
that they are not *written*.

> This means that just like the start and end of a discard must be
> aligned on a discard_granularity boundary, WRITE SAME commands with
> the UNMAP bit set must also respect that granularity. I think this
> means that either __blkdev_issue_zeroout() has to be modified such
> that it rejects unaligned REQ_OP_WRITE_ZEROES operations or that
> blk_bio_write_same_split() has to be modified such that it generates
> REQ_OP_WRITEs for the unaligned start and tail.

No, that's not correct. SBC states:

"a) if the Data-Out Buffer of the WRITE SAME command is the same as the
   logical block data returned by a read operation from that LBA while
   in the unmapped state, then:

   1) the device server performs the actions described in table 6
      [unmap]; and

   2) if an unmap operation is not performed in step 1), then the device
      server shall perform the specified write operation to that LBA;"

I.e., with WRITE SAME it is the responsibility of the device server to
write any LBAs described by the command that were not successfully
unmapped.
diff mbox

Patch

diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index af632e350ab4..b6f70a09a301 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -748,7 +748,7 @@  static int sd_setup_unmap_cmnd(struct scsi_cmnd *cmd)
 	return scsi_init_io(cmd);
 }
 
-static int sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd)
+static int sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, bool unmap)
 {
 	struct scsi_device *sdp = cmd->device;
 	struct request *rq = cmd->request;
@@ -765,13 +765,14 @@  static int sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd)
 
 	cmd->cmd_len = 16;
 	cmd->cmnd[0] = WRITE_SAME_16;
-	cmd->cmnd[1] = 0x8; /* UNMAP */
+	if (unmap)
+		cmd->cmnd[1] = 0x8; /* UNMAP */
 	put_unaligned_be64(sector, &cmd->cmnd[2]);
 	put_unaligned_be32(nr_sectors, &cmd->cmnd[10]);
 
 	cmd->allowed = SD_MAX_RETRIES;
 	cmd->transfersize = data_len;
-	rq->timeout = SD_TIMEOUT;
+	rq->timeout = unmap ? SD_TIMEOUT : SD_WRITE_SAME_TIMEOUT;
 	scsi_req(rq)->resid_len = data_len;
 
 	return scsi_init_io(cmd);
@@ -801,7 +802,7 @@  static int sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd, bool unmap)
 
 	cmd->allowed = SD_MAX_RETRIES;
 	cmd->transfersize = data_len;
-	rq->timeout = SD_TIMEOUT;
+	rq->timeout = unmap ? SD_TIMEOUT : SD_WRITE_SAME_TIMEOUT;
 	scsi_req(rq)->resid_len = data_len;
 
 	return scsi_init_io(cmd);
@@ -857,11 +858,39 @@  static int sd_setup_ata_trim_cmnd(struct scsi_cmnd *cmd)
 	return scsi_init_io(cmd);
 }
 
+static int sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd)
+{
+	struct request *rq = cmd->request;
+	struct scsi_device *sdp = cmd->device;
+	struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
+	u64 sector = blk_rq_pos(rq) >> (ilog2(sdp->sector_size) - 9);
+	u32 nr_sectors = blk_rq_sectors(rq) >> (ilog2(sdp->sector_size) - 9);
+
+	if (sdp->ata_trim) {
+		if (!sdp->ata_trim_zeroes_data)
+			return BLKPREP_INVALID;
+		return sd_setup_ata_trim_cmnd(cmd);
+	}
+	if (sdp->no_write_same)
+		return BLKPREP_INVALID;
+	if (sdkp->ws16 || sector > 0xffffffff || nr_sectors > 0xffff)
+		return sd_setup_write_same16_cmnd(cmd, false);
+	return sd_setup_write_same10_cmnd(cmd, false);
+}
+
 static void sd_config_write_same(struct scsi_disk *sdkp)
 {
 	struct request_queue *q = sdkp->disk->queue;
 	unsigned int logical_block_size = sdkp->device->sector_size;
 
+	if (sdkp->device->ata_trim) {
+		if (sdkp->device->ata_trim_zeroes_data)
+			sdkp->max_ws_blocks = 65535 * (512 / sizeof(__le64));
+		else
+			sdkp->max_ws_blocks = 0;
+		goto config_write_zeroes;
+	}
+
 	if (sdkp->device->no_write_same) {
 		sdkp->max_ws_blocks = 0;
 		goto out;
@@ -886,6 +915,9 @@  static void sd_config_write_same(struct scsi_disk *sdkp)
 out:
 	blk_queue_max_write_same_sectors(q, sdkp->max_ws_blocks *
 					 (logical_block_size >> 9));
+config_write_zeroes:
+	blk_queue_max_write_zeroes_sectors(q, sdkp->max_ws_blocks *
+					 (logical_block_size >> 9));
 }
 
 /**
@@ -1226,7 +1258,7 @@  static int sd_init_command(struct scsi_cmnd *cmd)
 		case SD_LBP_UNMAP:
 			return sd_setup_unmap_cmnd(cmd);
 		case SD_LBP_WS16:
-			return sd_setup_write_same16_cmnd(cmd);
+			return sd_setup_write_same16_cmnd(cmd, true);
 		case SD_LBP_WS10:
 			return sd_setup_write_same10_cmnd(cmd, true);
 		case SD_LBP_ZERO:
@@ -1236,6 +1268,8 @@  static int sd_init_command(struct scsi_cmnd *cmd)
 		default:
 			return BLKPREP_INVALID;
 		}
+	case REQ_OP_WRITE_ZEROES:
+		return sd_setup_write_zeroes_cmnd(cmd);
 	case REQ_OP_WRITE_SAME:
 		return sd_setup_write_same_cmnd(cmd);
 	case REQ_OP_FLUSH:
@@ -1876,6 +1910,7 @@  static int sd_done(struct scsi_cmnd *SCpnt)
 
 	switch (req_op(req)) {
 	case REQ_OP_DISCARD:
+	case REQ_OP_WRITE_ZEROES:
 	case REQ_OP_WRITE_SAME:
 	case REQ_OP_ZONE_RESET:
 		if (!result) {
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 92620c8ea8ad..1994f7799fce 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -329,6 +329,7 @@  void sd_zbc_complete(struct scsi_cmnd *cmd,
 
 	switch (req_op(rq)) {
 	case REQ_OP_WRITE:
+	case REQ_OP_WRITE_ZEROES:
 	case REQ_OP_WRITE_SAME:
 	case REQ_OP_ZONE_RESET: