diff mbox

block/sd: Return -EREMOTEIO when WRITE SAME and DISCARD are disabled

Message ID 1454568483-11298-1-git-send-email-martin.petersen@oracle.com (mailing list archive)
State New, archived
Headers show

Commit Message

Martin K. Petersen Feb. 4, 2016, 6:48 a.m. UTC
When a storage device rejects a WRITE SAME command we will disable write
same functionality for the device and return -EREMOTEIO to the block
layer. -EREMOTEIO will in turn prevent DM from retrying the I/O and/or
failing the path.

Yiwen Jiang discovered a small race where WRITE SAME requests issued
simultaneously would cause -EIO to be returned. This happened because
any requests being prepared after WRITE SAME had been disabled for the
device caused us to return BLKPREP_KILL. The latter caused the block
layer to return -EIO upon completion.

To overcome this we introduce BLKPREP_INVALID which indicates that this
is an invalid request for the device. blk_peek_request() is modified to
return -EREMOTEIO in that case.

Reported-by: Yiwen Jiang <jiangyiwen@huawei.com>
Suggested-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>

---

I contemplated making blk_peek_request() use rq->errors to decide what
to return up the stack. However, I cringed at mixing errnos and SCSI
midlayer status information in the same location.
---
 block/blk-core.c       | 6 ++++--
 drivers/scsi/sd.c      | 4 ++--
 include/linux/blkdev.h | 9 ++++++---
 3 files changed, 12 insertions(+), 7 deletions(-)

Comments

Jiangyiwen Feb. 4, 2016, 9:16 a.m. UTC | #1
On 2016/2/4 14:48, Martin K. Petersen wrote:
> When a storage device rejects a WRITE SAME command we will disable write
> same functionality for the device and return -EREMOTEIO to the block
> layer. -EREMOTEIO will in turn prevent DM from retrying the I/O and/or
> failing the path.
> 
> Yiwen Jiang discovered a small race where WRITE SAME requests issued
> simultaneously would cause -EIO to be returned. This happened because
> any requests being prepared after WRITE SAME had been disabled for the
> device caused us to return BLKPREP_KILL. The latter caused the block
> layer to return -EIO upon completion.
> 
> To overcome this we introduce BLKPREP_INVALID which indicates that this
> is an invalid request for the device. blk_peek_request() is modified to
> return -EREMOTEIO in that case.
> 
> Reported-by: Yiwen Jiang <jiangyiwen@huawei.com>
> Suggested-by: Mike Snitzer <snitzer@redhat.com>
> Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
> 
> ---
> 
> I contemplated making blk_peek_request() use rq->errors to decide what
> to return up the stack. However, I cringed at mixing errnos and SCSI
> midlayer status information in the same location.
> ---
>  block/blk-core.c       | 6 ++++--
>  drivers/scsi/sd.c      | 4 ++--
>  include/linux/blkdev.h | 9 ++++++---
>  3 files changed, 12 insertions(+), 7 deletions(-)
> 
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 33e2f62d5062..ee833af2f892 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -2446,14 +2446,16 @@ struct request *blk_peek_request(struct request_queue *q)
>  
>  			rq = NULL;
>  			break;
> -		} else if (ret == BLKPREP_KILL) {
> +		} else if (ret == BLKPREP_KILL || ret == BLKPREP_INVALID) {
> +			int err = ret == BLKPREP_INVALID ? -EREMOTEIO : -EIO;
> +
>  			rq->cmd_flags |= REQ_QUIET;
>  			/*
>  			 * Mark this request as started so we don't trigger
>  			 * any debug logic in the end I/O path.
>  			 */
>  			blk_start_request(rq);
> -			__blk_end_request_all(rq, -EIO);
> +			__blk_end_request_all(rq, err);
>  		} else {
>  			printk(KERN_ERR "%s: bad return=%d\n", __func__, ret);
>  			break;
> diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
> index ec163d08f6c3..6e841c6da632 100644
> --- a/drivers/scsi/sd.c
> +++ b/drivers/scsi/sd.c
> @@ -761,7 +761,7 @@ static int sd_setup_discard_cmnd(struct scsi_cmnd *cmd)
>  		break;
>  
>  	default:
> -		ret = BLKPREP_KILL;
> +		ret = BLKPREP_INVALID;
>  		goto out;
>  	}
>  
> @@ -839,7 +839,7 @@ static int sd_setup_write_same_cmnd(struct scsi_cmnd *cmd)
>  	int ret;
>  
>  	if (sdkp->device->no_write_same)
> -		return BLKPREP_KILL;
> +		return BLKPREP_INVALID;
>  
>  	BUG_ON(bio_offset(bio) || bio_iovec(bio).bv_len != sdp->sector_size);
>  
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index c70e3588a48c..e990d181625a 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -680,9 +680,12 @@ static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b)
>  /*
>   * q->prep_rq_fn return values
>   */
> -#define BLKPREP_OK		0	/* serve it */
> -#define BLKPREP_KILL		1	/* fatal error, kill */
> -#define BLKPREP_DEFER		2	/* leave on queue */
> +enum {
> +	BLKPREP_OK,		/* serve it */
> +	BLKPREP_KILL,		/* fatal error, kill, return -EIO */
> +	BLKPREP_INVALID,	/* invalid command, kill, return -EREMOTEIO */
> +	BLKPREP_DEFER,		/* leave on queue */
> +};
>  
>  extern unsigned long blk_max_low_pfn, blk_max_pfn;
>  
> 
Hi Martin,
It is very good, I totally agree with this patch. But I have
three questions:

First, I don't understand why blk_peek_request() return EREMOTEIO,
as I know, in this situation we only prepare scsi command
without sending to device, and I think EREMOTEIO should
be returned only when IO has already sent to device, maybe
I don't understand definition of EREMOTEIO.
So, Why don't return the errno with EOPNOTSUPP?

In addition, I still worried with whether there has other
situations which will return EIO or other error. In this
way, MD/DM still can happen this type of problem, so I think
may be in multipath we still needs a protection to avoid it.

At last, I have a additional problem, I remember that you
previously send a series of patches about XCOPY, why don't
have any news latter later? I very much expect that I can
see these patches which are merged into kernel.

Thanks,
Yiwen Jiang.

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Hannes Reinecke Feb. 4, 2016, 2:14 p.m. UTC | #2
On 02/04/2016 07:48 AM, Martin K. Petersen wrote:
> When a storage device rejects a WRITE SAME command we will disable write
> same functionality for the device and return -EREMOTEIO to the block
> layer. -EREMOTEIO will in turn prevent DM from retrying the I/O and/or
> failing the path.
> 
> Yiwen Jiang discovered a small race where WRITE SAME requests issued
> simultaneously would cause -EIO to be returned. This happened because
> any requests being prepared after WRITE SAME had been disabled for the
> device caused us to return BLKPREP_KILL. The latter caused the block
> layer to return -EIO upon completion.
> 
> To overcome this we introduce BLKPREP_INVALID which indicates that this
> is an invalid request for the device. blk_peek_request() is modified to
> return -EREMOTEIO in that case.
> 
> Reported-by: Yiwen Jiang <jiangyiwen@huawei.com>
> Suggested-by: Mike Snitzer <snitzer@redhat.com>
> Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
> 
> ---
> 
> I contemplated making blk_peek_request() use rq->errors to decide what
> to return up the stack. However, I cringed at mixing errnos and SCSI
> midlayer status information in the same location.

Well, rq->errors has a mixed heritage anyway; occasionally it's an
errno, sometimes a SCSI midlayer status, you name it.
One should clean it up eventually ...

> ---
>  block/blk-core.c       | 6 ++++--
>  drivers/scsi/sd.c      | 4 ++--
>  include/linux/blkdev.h | 9 ++++++---
>  3 files changed, 12 insertions(+), 7 deletions(-)
> 
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 33e2f62d5062..ee833af2f892 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -2446,14 +2446,16 @@ struct request *blk_peek_request(struct request_queue *q)
>  
>  			rq = NULL;
>  			break;
> -		} else if (ret == BLKPREP_KILL) {
> +		} else if (ret == BLKPREP_KILL || ret == BLKPREP_INVALID) {
> +			int err = ret == BLKPREP_INVALID ? -EREMOTEIO : -EIO;
> +
>  			rq->cmd_flags |= REQ_QUIET;
>  			/*
>  			 * Mark this request as started so we don't trigger
>  			 * any debug logic in the end I/O path.
>  			 */
>  			blk_start_request(rq);
> -			__blk_end_request_all(rq, -EIO);
> +			__blk_end_request_all(rq, err);
>  		} else {
>  			printk(KERN_ERR "%s: bad return=%d\n", __func__, ret);
>  			break;
> diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
> index ec163d08f6c3..6e841c6da632 100644
> --- a/drivers/scsi/sd.c
> +++ b/drivers/scsi/sd.c
> @@ -761,7 +761,7 @@ static int sd_setup_discard_cmnd(struct scsi_cmnd *cmd)
>  		break;
>  
>  	default:
> -		ret = BLKPREP_KILL;
> +		ret = BLKPREP_INVALID;
>  		goto out;
>  	}
>  
> @@ -839,7 +839,7 @@ static int sd_setup_write_same_cmnd(struct scsi_cmnd *cmd)
>  	int ret;
>  
>  	if (sdkp->device->no_write_same)
> -		return BLKPREP_KILL;
> +		return BLKPREP_INVALID;
>  
>  	BUG_ON(bio_offset(bio) || bio_iovec(bio).bv_len != sdp->sector_size);
>  
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index c70e3588a48c..e990d181625a 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -680,9 +680,12 @@ static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b)
>  /*
>   * q->prep_rq_fn return values
>   */
> -#define BLKPREP_OK		0	/* serve it */
> -#define BLKPREP_KILL		1	/* fatal error, kill */
> -#define BLKPREP_DEFER		2	/* leave on queue */
> +enum {
> +	BLKPREP_OK,		/* serve it */
> +	BLKPREP_KILL,		/* fatal error, kill, return -EIO */
> +	BLKPREP_INVALID,	/* invalid command, kill, return -EREMOTEIO */
> +	BLKPREP_DEFER,		/* leave on queue */
> +};
>  
>  extern unsigned long blk_max_low_pfn, blk_max_pfn;
>  
> 
Reviewed-by: Hannes Reinecke <hare@suse.de>

Cheers,

Hannes
Ewan Milne Feb. 4, 2016, 4:36 p.m. UTC | #3
See below.

On Thu, 2016-02-04 at 01:48 -0500, Martin K. Petersen wrote:
> When a storage device rejects a WRITE SAME command we will disable write
> same functionality for the device and return -EREMOTEIO to the block
> layer. -EREMOTEIO will in turn prevent DM from retrying the I/O and/or
> failing the path.
> 
> Yiwen Jiang discovered a small race where WRITE SAME requests issued
> simultaneously would cause -EIO to be returned. This happened because
> any requests being prepared after WRITE SAME had been disabled for the
> device caused us to return BLKPREP_KILL. The latter caused the block
> layer to return -EIO upon completion.
> 
> To overcome this we introduce BLKPREP_INVALID which indicates that this
> is an invalid request for the device. blk_peek_request() is modified to
> return -EREMOTEIO in that case.
> 
> Reported-by: Yiwen Jiang <jiangyiwen@huawei.com>
> Suggested-by: Mike Snitzer <snitzer@redhat.com>
> Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
> 
> ---
> 
> I contemplated making blk_peek_request() use rq->errors to decide what
> to return up the stack. However, I cringed at mixing errnos and SCSI
> midlayer status information in the same location.
> ---
>  block/blk-core.c       | 6 ++++--
>  drivers/scsi/sd.c      | 4 ++--
>  include/linux/blkdev.h | 9 ++++++---
>  3 files changed, 12 insertions(+), 7 deletions(-)
> 
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 33e2f62d5062..ee833af2f892 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -2446,14 +2446,16 @@ struct request *blk_peek_request(struct request_queue *q)
>  
>  			rq = NULL;
>  			break;
> -		} else if (ret == BLKPREP_KILL) {
> +		} else if (ret == BLKPREP_KILL || ret == BLKPREP_INVALID) {
> +			int err = ret == BLKPREP_INVALID ? -EREMOTEIO : -EIO;

Maybe it's unnecessary, but I would put parenthesis around (ret == ... -EIO); for clarity.

> +
>  			rq->cmd_flags |= REQ_QUIET;
>  			/*
>  			 * Mark this request as started so we don't trigger
>  			 * any debug logic in the end I/O path.
>  			 */
>  			blk_start_request(rq);
> -			__blk_end_request_all(rq, -EIO);
> +			__blk_end_request_all(rq, err);
>  		} else {
>  			printk(KERN_ERR "%s: bad return=%d\n", __func__, ret);
>  			break;
> diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
> index ec163d08f6c3..6e841c6da632 100644
> --- a/drivers/scsi/sd.c
> +++ b/drivers/scsi/sd.c
> @@ -761,7 +761,7 @@ static int sd_setup_discard_cmnd(struct scsi_cmnd *cmd)
>  		break;
>  
>  	default:
> -		ret = BLKPREP_KILL;
> +		ret = BLKPREP_INVALID;
>  		goto out;
>  	}
>  
> @@ -839,7 +839,7 @@ static int sd_setup_write_same_cmnd(struct scsi_cmnd *cmd)
>  	int ret;
>  
>  	if (sdkp->device->no_write_same)
> -		return BLKPREP_KILL;
> +		return BLKPREP_INVALID;
>  
>  	BUG_ON(bio_offset(bio) || bio_iovec(bio).bv_len != sdp->sector_size);
>  
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index c70e3588a48c..e990d181625a 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -680,9 +680,12 @@ static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b)
>  /*
>   * q->prep_rq_fn return values
>   */
> -#define BLKPREP_OK		0	/* serve it */
> -#define BLKPREP_KILL		1	/* fatal error, kill */
> -#define BLKPREP_DEFER		2	/* leave on queue */
> +enum {
> +	BLKPREP_OK,		/* serve it */
> +	BLKPREP_KILL,		/* fatal error, kill, return -EIO */
> +	BLKPREP_INVALID,	/* invalid command, kill, return -EREMOTEIO */
> +	BLKPREP_DEFER,		/* leave on queue */
> +};

I would prefer that additional enum/constant values be added to the end
of the series, here you are changing the ordinal value of BLKPREP_DEFER
from 2 to 3.  Could you swap these?

>  extern unsigned long blk_max_low_pfn, blk_max_pfn;
>  

Reviewed-by: Ewan D. Milne <emilne@redhat.com>


--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Martin K. Petersen Feb. 5, 2016, 3:13 a.m. UTC | #4
>>>>> "Yiwen" == jiangyiwen  <jiangyiwen@huawei.com> writes:

Yiwen,

Yiwen> First, I don't understand why blk_peek_request() return
Yiwen> EREMOTEIO, as I know, in this situation we only prepare scsi
Yiwen> command without sending to device, and I think EREMOTEIO should
Yiwen> be returned only when IO has already sent to device, maybe I
Yiwen> don't understand definition of EREMOTEIO.  So, Why don't return
Yiwen> the errno with EOPNOTSUPP?

DM currently has special handling for EREMOTEIO failures (because that's
what we'd return when a device responds with ILLEGAL REQUEST).

I am not opposed to returning EOPNOTSUPP but it would require more
changes and since this is a bugfix for stable I want to keep it as small
as possible.

Yiwen> In addition, I still worried with whether there has other
Yiwen> situations which will return EIO or other error. In this way,
Yiwen> MD/DM still can happen this type of problem, so I think may be in
Yiwen> multipath we still needs a protection to avoid it.

There are various error scenarios where we can end up bailing with a
BLKPREP_KILL. But the general rule of thumb is that these conditions all
demand a retry. The optional commands like WRITE SAME and UNMAP are
special in that they are irrecoverable.

Yiwen> At last, I have a additional problem, I remember that you
Yiwen> previously send a series of patches about XCOPY, why don't have
Yiwen> any news latter later? I very much expect that I can see these
Yiwen> patches which are merged into kernel.

I am working on a refresh of the series that includes token-based copy
offload support in addition to EXTENDED COPY. The patches depend on Mike
Christie's request flag patch series which has yet to be merged.
Martin K. Petersen Feb. 5, 2016, 3:16 a.m. UTC | #5
>>>>> "Ewan" == Ewan Milne <emilne@redhat.com> writes:

Ewan> Maybe it's unnecessary, but I would put parenthesis around (ret ==
Ewan> ... -EIO); for clarity.

Sure.

Ewan> I would prefer that additional enum/constant values be added to
Ewan> the end of the series, here you are changing the ordinal value of
Ewan> BLKPREP_DEFER from 2 to 3.  Could you swap these?

I felt that KILL and INVALID were closely related. That's why I kept
them together. But it doesn't matter to me.
Jiangyiwen Feb. 5, 2016, 3:39 a.m. UTC | #6
On 2016/2/5 11:13, Martin K. Petersen wrote:
>>>>>> "Yiwen" == jiangyiwen  <jiangyiwen@huawei.com> writes:
> 
> Yiwen,
> 
> Yiwen> First, I don't understand why blk_peek_request() return
> Yiwen> EREMOTEIO, as I know, in this situation we only prepare scsi
> Yiwen> command without sending to device, and I think EREMOTEIO should
> Yiwen> be returned only when IO has already sent to device, maybe I
> Yiwen> don't understand definition of EREMOTEIO.  So, Why don't return
> Yiwen> the errno with EOPNOTSUPP?
> 
> DM currently has special handling for EREMOTEIO failures (because that's
> what we'd return when a device responds with ILLEGAL REQUEST).
> 
> I am not opposed to returning EOPNOTSUPP but it would require more
> changes and since this is a bugfix for stable I want to keep it as small
> as possible.
> 
> Yiwen> In addition, I still worried with whether there has other
> Yiwen> situations which will return EIO or other error. In this way,
> Yiwen> MD/DM still can happen this type of problem, so I think may be in
> Yiwen> multipath we still needs a protection to avoid it.
> 
> There are various error scenarios where we can end up bailing with a
> BLKPREP_KILL. But the general rule of thumb is that these conditions all
> demand a retry. The optional commands like WRITE SAME and UNMAP are
> special in that they are irrecoverable.
> 
> Yiwen> At last, I have a additional problem, I remember that you
> Yiwen> previously send a series of patches about XCOPY, why don't have
> Yiwen> any news latter later? I very much expect that I can see these
> Yiwen> patches which are merged into kernel.
> 
> I am working on a refresh of the series that includes token-based copy
> offload support in addition to EXTENDED COPY. The patches depend on Mike
> Christie's request flag patch series which has yet to be merged.
> 
Reviewed-by: Yiwen Jiang <jiangyiwen@huawei.com>

Thanks,
Yiwen Jiang.

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/block/blk-core.c b/block/blk-core.c
index 33e2f62d5062..ee833af2f892 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2446,14 +2446,16 @@  struct request *blk_peek_request(struct request_queue *q)
 
 			rq = NULL;
 			break;
-		} else if (ret == BLKPREP_KILL) {
+		} else if (ret == BLKPREP_KILL || ret == BLKPREP_INVALID) {
+			int err = ret == BLKPREP_INVALID ? -EREMOTEIO : -EIO;
+
 			rq->cmd_flags |= REQ_QUIET;
 			/*
 			 * Mark this request as started so we don't trigger
 			 * any debug logic in the end I/O path.
 			 */
 			blk_start_request(rq);
-			__blk_end_request_all(rq, -EIO);
+			__blk_end_request_all(rq, err);
 		} else {
 			printk(KERN_ERR "%s: bad return=%d\n", __func__, ret);
 			break;
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index ec163d08f6c3..6e841c6da632 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -761,7 +761,7 @@  static int sd_setup_discard_cmnd(struct scsi_cmnd *cmd)
 		break;
 
 	default:
-		ret = BLKPREP_KILL;
+		ret = BLKPREP_INVALID;
 		goto out;
 	}
 
@@ -839,7 +839,7 @@  static int sd_setup_write_same_cmnd(struct scsi_cmnd *cmd)
 	int ret;
 
 	if (sdkp->device->no_write_same)
-		return BLKPREP_KILL;
+		return BLKPREP_INVALID;
 
 	BUG_ON(bio_offset(bio) || bio_iovec(bio).bv_len != sdp->sector_size);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c70e3588a48c..e990d181625a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -680,9 +680,12 @@  static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b)
 /*
  * q->prep_rq_fn return values
  */
-#define BLKPREP_OK		0	/* serve it */
-#define BLKPREP_KILL		1	/* fatal error, kill */
-#define BLKPREP_DEFER		2	/* leave on queue */
+enum {
+	BLKPREP_OK,		/* serve it */
+	BLKPREP_KILL,		/* fatal error, kill, return -EIO */
+	BLKPREP_INVALID,	/* invalid command, kill, return -EREMOTEIO */
+	BLKPREP_DEFER,		/* leave on queue */
+};
 
 extern unsigned long blk_max_low_pfn, blk_max_pfn;