Message ID | 1464162903-14735-5-git-send-email-mchristi@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Adding Hannes to the Cc list as he's been looking into EH improvements in this area. On Wed, May 25, 2016 at 02:55:02AM -0500, mchristi@redhat.com wrote: > From: Mike Christie <mchristi@redhat.com> > > Currently, if the SCSI eh runs then before we do a LUN_RESET > we stop the host. This patch and the block layer one before it > begin to add infrastructure to be able to do a LUN_RESET and > eventually do a transport level recovery without having to stop the > host. > > For LUn-reset, this patch adds a new callout, eh_async_device_reset_handler, > which works similar to how LLDs handle SG_SCSI_RESET_DEVICE where the > LLD manages the commands that are affected. > > eh_async_device_reset_handler: > > The LLD should perform a LUN RESET that affects all commands > that have been accepted by its queuecommand callout for the > device passed in to the callout. While the reset handler is running, > queuecommand will not be running or called for the device. > > Unlike eh_device_reset_handler, queuecommand may still be > called for other devices, and the LLD must call scsi_done for the > commands that have been affected by the reset. > > If SUCCESS or FAST_IO_FAIL is returned, the scsi_cmnds cleaned up > must be failed with DID_ABORT. 
> > Signed-off-by: Mike Christie <mchristi@redhat.com> > --- > drivers/scsi/scsi_error.c | 31 ++++++++++++++++++++++++++++--- > drivers/scsi/scsi_lib.c | 6 ++++++ > drivers/scsi/scsi_priv.h | 1 + > include/scsi/scsi_host.h | 17 +++++++++++++++++ > 4 files changed, 52 insertions(+), 3 deletions(-) > > diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c > index 984ddcb..cec2dfb 100644 > --- a/drivers/scsi/scsi_error.c > +++ b/drivers/scsi/scsi_error.c > @@ -853,16 +853,41 @@ static int scsi_try_bus_device_reset(struct scsi_cmnd *scmd) > { > int rtn; > struct scsi_host_template *hostt = scmd->device->host->hostt; > + struct scsi_device *sdev = scmd->device; > > - if (!hostt->eh_device_reset_handler) > + if (!hostt->eh_device_reset_handler && > + !hostt->eh_async_device_reset_handler) > return FAILED; > > - rtn = hostt->eh_device_reset_handler(scmd); > + if (hostt->eh_device_reset_handler) { > + rtn = hostt->eh_device_reset_handler(scmd); > + } else { > + if (!blk_reset_queue(sdev->request_queue)) > + rtn = SUCCESS; > + else > + rtn = FAILED; > + } > if (rtn == SUCCESS) > - __scsi_report_device_reset(scmd->device, NULL); > + __scsi_report_device_reset(sdev, NULL); > return rtn; > } > > +enum blk_eh_timer_return scsi_reset_queue(struct request_queue *q) > +{ > + struct scsi_device *sdev = q->queuedata; > + struct scsi_host_template *hostt = sdev->host->hostt; > + int rtn; > + > + if (!hostt->eh_async_device_reset_handler) > + return -EOPNOTSUPP; > + > + rtn = hostt->eh_async_device_reset_handler(sdev); > + if (rtn == SUCCESS || rtn == FAST_IO_FAIL) > + return BLK_EH_HANDLED; > + > + return BLK_EH_NOT_HANDLED; > +} > + > /** > * scsi_try_to_abort_cmd - Ask host to abort a SCSI command > * @hostt: SCSI driver host template > diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c > index 8106515..11374dd 100644 > --- a/drivers/scsi/scsi_lib.c > +++ b/drivers/scsi/scsi_lib.c > @@ -779,6 +779,10 @@ static int __scsi_error_from_host_byte(struct 
scsi_cmnd *cmd, int result) > set_host_byte(cmd, DID_OK); > error = -ENODATA; > break; > + case DID_ABORT: > + set_host_byte(cmd, DID_OK); > + error = -EINTR; > + break; > default: > error = -EIO; > break; > @@ -2159,6 +2163,7 @@ struct request_queue *scsi_alloc_queue(struct scsi_device *sdev) > blk_queue_softirq_done(q, scsi_softirq_done); > blk_queue_rq_timed_out(q, scsi_times_out); > blk_queue_lld_busy(q, scsi_lld_busy); > + blk_queue_reset(q, scsi_reset_queue); > return q; > } > > @@ -2167,6 +2172,7 @@ static struct blk_mq_ops scsi_mq_ops = { > .queue_rq = scsi_queue_rq, > .complete = scsi_softirq_done, > .timeout = scsi_timeout, > + .reset = scsi_reset_queue, > .init_request = scsi_init_request, > .exit_request = scsi_exit_request, > }; > diff --git a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h > index 27b4d0a..2e03168 100644 > --- a/drivers/scsi/scsi_priv.h > +++ b/drivers/scsi/scsi_priv.h > @@ -67,6 +67,7 @@ extern void scsi_exit_devinfo(void); > > /* scsi_error.c */ > extern void scmd_eh_abort_handler(struct work_struct *work); > +extern enum blk_eh_timer_return scsi_reset_queue(struct request_queue *q); > extern enum blk_eh_timer_return scsi_times_out(struct request *req); > extern int scsi_error_handler(void *host); > extern int scsi_decide_disposition(struct scsi_cmnd *cmd); > diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h > index fcfa3d7..532deb5 100644 > --- a/include/scsi/scsi_host.h > +++ b/include/scsi/scsi_host.h > @@ -146,6 +146,23 @@ struct scsi_host_template { > */ > int (* eh_abort_handler)(struct scsi_cmnd *); > int (* eh_device_reset_handler)(struct scsi_cmnd *); > + /* > + * eh_async_device_reset_handler - Perform LUN RESET > + * @scsi_device: scsi device to reset > + * > + * The LLD should perform a LUN RESET that affects all commands > + * that have been accepted by its queuecommand callout for the > + * device passed in. While the reset handler is running, queuecommand > + * will not be called for the device. 
> + * > + * Unlike eh_device_reset_handler, queuecommand may still be called > + * for other devices, and the LLD must call scsi_done for the commands > + * that have been affected by the reset. > + * > + * If SUCCESS or FAST_IO_FAIL is returned, the scsi_cmnds for > + * scsi_device must be failed with DID_ABORT. > + */ > + int (* eh_async_device_reset_handler)(struct scsi_device *); > int (* eh_target_reset_handler)(struct scsi_cmnd *); > int (* eh_bus_reset_handler)(struct scsi_cmnd *); > int (* eh_host_reset_handler)(struct scsi_cmnd *); > -- > 2.7.2 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-scsi" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html ---end quoted text--- -- To unsubscribe from this list: send the line "unsubscribe linux-block" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 05/27/2016 10:23 AM, Christoph Hellwig wrote: > Adding Hannes to the Cc list as he's been looking into EH improvements > in this area. > > On Wed, May 25, 2016 at 02:55:02AM -0500, mchristi@redhat.com wrote: >> From: Mike Christie <mchristi@redhat.com> >> >> Currently, if the SCSI eh runs then before we do a LUN_RESET >> we stop the host. This patch and the block layer one before it >> begin to add infrastructure to be able to do a LUN_RESET and >> eventually do a transport level recovery without having to stop the >> host. >> >> For LUn-reset, this patch adds a new callout, eh_async_device_reset_handler, >> which works similar to how LLDs handle SG_SCSI_RESET_DEVICE where the >> LLD manages the commands that are affected. >> >> eh_async_device_reset_handler: >> >> The LLD should perform a LUN RESET that affects all commands >> that have been accepted by its queuecommand callout for the >> device passed in to the callout. While the reset handler is running, >> queuecommand will not be running or called for the device. >> >> Unlike eh_device_reset_handler, queuecommand may still be >> called for other devices, and the LLD must call scsi_done for the >> commands that have been affected by the reset. >> >> If SUCCESS or FAST_IO_FAIL is returned, the scsi_cmnds cleaned up >> must be failed with DID_ABORT. >> >> Signed-off-by: Mike Christie <mchristi@redhat.com> In general I like the approach. I'll be looking into it more closely next week. Cheers, Hannes
On 05/25/2016 09:55 AM, mchristi@redhat.com wrote: > From: Mike Christie <mchristi@redhat.com> > > Currently, if the SCSI eh runs then before we do a LUN_RESET > we stop the host. This patch and the block layer one before it > begin to add infrastructure to be able to do a LUN_RESET and > eventually do a transport level recovery without having to stop the > host. > > For LUn-reset, this patch adds a new callout, eh_async_device_reset_handler, > which works similar to how LLDs handle SG_SCSI_RESET_DEVICE where the > LLD manages the commands that are affected. > > eh_async_device_reset_handler: > > The LLD should perform a LUN RESET that affects all commands > that have been accepted by its queuecommand callout for the > device passed in to the callout. While the reset handler is running, > queuecommand will not be running or called for the device. > > Unlike eh_device_reset_handler, queuecommand may still be > called for other devices, and the LLD must call scsi_done for the > commands that have been affected by the reset. > > If SUCCESS or FAST_IO_FAIL is returned, the scsi_cmnds cleaned up > must be failed with DID_ABORT. > Hmm. With this patch you essentially just replaced the existing eh_device_reset_handler() with eh_async_device_request_handler(). So how does this differ from the original behaviour? By the time we're calling it the SCSI host is already in EH, ie all commands have been completed or failed, so why again do we need to wait for the queue to be empty? And how exactly can queuecommand be called for other devices, as the host is already in EH? Cheers, Hannes
On 05/30/2016 01:27 AM, Hannes Reinecke wrote: > On 05/25/2016 09:55 AM, mchristi@redhat.com wrote: >> From: Mike Christie <mchristi@redhat.com> >> >> Currently, if the SCSI eh runs then before we do a LUN_RESET >> we stop the host. This patch and the block layer one before it >> begin to add infrastructure to be able to do a LUN_RESET and >> eventually do a transport level recovery without having to stop the >> host. >> >> For LUn-reset, this patch adds a new callout, eh_async_device_reset_handler, >> which works similar to how LLDs handle SG_SCSI_RESET_DEVICE where the >> LLD manages the commands that are affected. >> >> eh_async_device_reset_handler: >> >> The LLD should perform a LUN RESET that affects all commands >> that have been accepted by its queuecommand callout for the >> device passed in to the callout. While the reset handler is running, >> queuecommand will not be running or called for the device. >> >> Unlike eh_device_reset_handler, queuecommand may still be >> called for other devices, and the LLD must call scsi_done for the >> commands that have been affected by the reset. >> >> If SUCCESS or FAST_IO_FAIL is returned, the scsi_cmnds cleaned up >> must be failed with DID_ABORT. >> > Hmm. With this patch you essentially just replaced the existing > eh_device_reset_handler() with eh_async_device_request_handler(). > So how does this differ from the original behaviour? 1. LLD must call scsi_done and set host byte on each command affected by the reset. This is what they have to do for the SG ioctl reset, but for the scsi eh reset, LLDs do not have to because scsi-ml manages the commands for them. When doing a SG ioctl based reset or if the reset is called from the target like in the last patch it is not possible to have scsi-ml track the outstanding commands like we do today based on timeouts. 2. 
LLDs have to support commands to other luns during device resets, so they cannot have any sort of host wide device resource lock/resource that they can rely on. It has to be per device. 3. We can now support being able to do a lun reset without having to stop the entire host. > By the time we're calling it the SCSI host is already in EH, ie all > commands have been completed or failed, so why again do we need to > wait for the queue to be empty? I am not sure what you mean here. The patches in this set never go into host reset or even target level reset handling. For this set, we only want drivers to be able to do a device/lun reset whenever they are asked to do so. > > And how exactly can queuecommand be called for other devices, as the > host is already in EH? > Where in this patchset do we stop the host? -- To unsubscribe from this list: send the line "unsubscribe linux-block" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 05/31/2016 02:38 PM, Mike Christie wrote: > On 05/30/2016 01:27 AM, Hannes Reinecke wrote: >> On 05/25/2016 09:55 AM, mchristi@redhat.com wrote: >>> From: Mike Christie <mchristi@redhat.com> >>> >>> Currently, if the SCSI eh runs then before we do a LUN_RESET >>> we stop the host. This patch and the block layer one before it >>> begin to add infrastructure to be able to do a LUN_RESET and >>> eventually do a transport level recovery without having to stop the >>> host. >>> >>> For LUn-reset, this patch adds a new callout, eh_async_device_reset_handler, >>> which works similar to how LLDs handle SG_SCSI_RESET_DEVICE where the >>> LLD manages the commands that are affected. >>> >>> eh_async_device_reset_handler: >>> >>> The LLD should perform a LUN RESET that affects all commands >>> that have been accepted by its queuecommand callout for the >>> device passed in to the callout. While the reset handler is running, >>> queuecommand will not be running or called for the device. >>> >>> Unlike eh_device_reset_handler, queuecommand may still be >>> called for other devices, and the LLD must call scsi_done for the >>> commands that have been affected by the reset. >>> >>> If SUCCESS or FAST_IO_FAIL is returned, the scsi_cmnds cleaned up >>> must be failed with DID_ABORT. >>> >> Hmm. With this patch you essentially just replaced the existing >> eh_device_reset_handler() with eh_async_device_request_handler(). >> So how does this differ from the original behaviour? > > > 1. LLD must call scsi_done and set host byte on each command affected by > the reset. This is what they have to do for the SG ioctl reset, but for > the scsi eh reset, LLDs do not have to because scsi-ml manages the > commands for them. > > When doing a SG ioctl based reset or if the reset is called from the > target like in the last patch it is not possible to have scsi-ml track > the outstanding commands like we do today based on timeouts. > > 2. 
LLDs have to support commands to other luns during device resets, so > they cannot have any sort of host wide device resource lock/resource > that they can rely on. It has to be per device. > > 3. We can now support being able to do a lun reset without having to > stop then entire host. > >> By the time we're calling it the SCSI host is already in EH, ie all >> commands have been completed or failed, so why again do we need to >> wait for the queue to be empty? > > I am not sure what you mean here. The patches in this set never go into > host reset or even target level reset handling. For this set, we only > want to drivers to be able to do a device/lun reset whenever they are > asked to do so. > >> >> And how exactly can queuecommand be called for other devices, as the >> host is already in EH? >> > > Where in this patchset do we stop the host? > Oh yeah, I mean the new entry points I am adding in this set. In the 3rd patch, we never start the scsi eh and we never stop the host. The target layer just calls the new callout through the block layer helper added in the 2nd patch. -- To unsubscribe from this list: send the line "unsubscribe linux-block" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 05/31/2016 02:38 PM, Mike Christie wrote: > On 05/30/2016 01:27 AM, Hannes Reinecke wrote: >> On 05/25/2016 09:55 AM, mchristi@redhat.com wrote: >>> From: Mike Christie <mchristi@redhat.com> >>> >>> Currently, if the SCSI eh runs then before we do a LUN_RESET >>> we stop the host. This patch and the block layer one before it >>> begin to add infrastructure to be able to do a LUN_RESET and >>> eventually do a transport level recovery without having to stop the >>> host. >>> >>> For LUn-reset, this patch adds a new callout, eh_async_device_reset_handler, >>> which works similar to how LLDs handle SG_SCSI_RESET_DEVICE where the >>> LLD manages the commands that are affected. >>> >>> eh_async_device_reset_handler: >>> >>> The LLD should perform a LUN RESET that affects all commands >>> that have been accepted by its queuecommand callout for the >>> device passed in to the callout. While the reset handler is running, >>> queuecommand will not be running or called for the device. >>> >>> Unlike eh_device_reset_handler, queuecommand may still be >>> called for other devices, and the LLD must call scsi_done for the >>> commands that have been affected by the reset. >>> >>> If SUCCESS or FAST_IO_FAIL is returned, the scsi_cmnds cleaned up >>> must be failed with DID_ABORT. >>> >> Hmm. With this patch you essentially just replaced the existing >> eh_device_reset_handler() with eh_async_device_request_handler(). >> So how does this differ from the original behaviour? > > > 1. LLD must call scsi_done and set host byte on each command affected by > the reset. This is what they have to do for the SG ioctl reset, but for > the scsi eh reset, LLDs do not have to because scsi-ml manages the > commands for them. > > When doing a SG ioctl based reset or if the reset is called from the > target like in the last patch it is not possible to have scsi-ml track > the outstanding commands like we do today based on timeouts. > > 2. 
LLDs have to support commands to other luns during device resets, so > they cannot have any sort of host wide device resource lock/resource > that they can rely on. It has to be per device. > > 3. We can now support being able to do a lun reset without having to > stop then entire host. > One other difference is that for the SG ioctl reset case, it is a pain to handle the race where queuecommand and eh_device_reset_handler are running while also trying to remove locks and atomics from the LLD and move to mq. Probably when the SG reset code was made we had the host_lock taken in queuecommand, so it was simple to handle. If we stop the queue like in the second patch then the LLD at least does not have to worry about this. -- To unsubscribe from this list: send the line "unsubscribe linux-block" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index 984ddcb..cec2dfb 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -853,16 +853,41 @@ static int scsi_try_bus_device_reset(struct scsi_cmnd *scmd) { int rtn; struct scsi_host_template *hostt = scmd->device->host->hostt; + struct scsi_device *sdev = scmd->device; - if (!hostt->eh_device_reset_handler) + if (!hostt->eh_device_reset_handler && + !hostt->eh_async_device_reset_handler) return FAILED; - rtn = hostt->eh_device_reset_handler(scmd); + if (hostt->eh_device_reset_handler) { + rtn = hostt->eh_device_reset_handler(scmd); + } else { + if (!blk_reset_queue(sdev->request_queue)) + rtn = SUCCESS; + else + rtn = FAILED; + } if (rtn == SUCCESS) - __scsi_report_device_reset(scmd->device, NULL); + __scsi_report_device_reset(sdev, NULL); return rtn; } +enum blk_eh_timer_return scsi_reset_queue(struct request_queue *q) +{ + struct scsi_device *sdev = q->queuedata; + struct scsi_host_template *hostt = sdev->host->hostt; + int rtn; + + if (!hostt->eh_async_device_reset_handler) + return -EOPNOTSUPP; + + rtn = hostt->eh_async_device_reset_handler(sdev); + if (rtn == SUCCESS || rtn == FAST_IO_FAIL) + return BLK_EH_HANDLED; + + return BLK_EH_NOT_HANDLED; +} + /** * scsi_try_to_abort_cmd - Ask host to abort a SCSI command * @hostt: SCSI driver host template diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 8106515..11374dd 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -779,6 +779,10 @@ static int __scsi_error_from_host_byte(struct scsi_cmnd *cmd, int result) set_host_byte(cmd, DID_OK); error = -ENODATA; break; + case DID_ABORT: + set_host_byte(cmd, DID_OK); + error = -EINTR; + break; default: error = -EIO; break; @@ -2159,6 +2163,7 @@ struct request_queue *scsi_alloc_queue(struct scsi_device *sdev) blk_queue_softirq_done(q, scsi_softirq_done); blk_queue_rq_timed_out(q, scsi_times_out); blk_queue_lld_busy(q, scsi_lld_busy); + 
blk_queue_reset(q, scsi_reset_queue); return q; } @@ -2167,6 +2172,7 @@ static struct blk_mq_ops scsi_mq_ops = { .queue_rq = scsi_queue_rq, .complete = scsi_softirq_done, .timeout = scsi_timeout, + .reset = scsi_reset_queue, .init_request = scsi_init_request, .exit_request = scsi_exit_request, }; diff --git a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h index 27b4d0a..2e03168 100644 --- a/drivers/scsi/scsi_priv.h +++ b/drivers/scsi/scsi_priv.h @@ -67,6 +67,7 @@ extern void scsi_exit_devinfo(void); /* scsi_error.c */ extern void scmd_eh_abort_handler(struct work_struct *work); +extern enum blk_eh_timer_return scsi_reset_queue(struct request_queue *q); extern enum blk_eh_timer_return scsi_times_out(struct request *req); extern int scsi_error_handler(void *host); extern int scsi_decide_disposition(struct scsi_cmnd *cmd); diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h index fcfa3d7..532deb5 100644 --- a/include/scsi/scsi_host.h +++ b/include/scsi/scsi_host.h @@ -146,6 +146,23 @@ struct scsi_host_template { */ int (* eh_abort_handler)(struct scsi_cmnd *); int (* eh_device_reset_handler)(struct scsi_cmnd *); + /* + * eh_async_device_reset_handler - Perform LUN RESET + * @scsi_device: scsi device to reset + * + * The LLD should perform a LUN RESET that affects all commands + * that have been accepted by its queuecommand callout for the + * device passed in. While the reset handler is running, queuecommand + * will not be called for the device. + * + * Unlike eh_device_reset_handler, queuecommand may still be called + * for other devices, and the LLD must call scsi_done for the commands + * that have been affected by the reset. + * + * If SUCCESS or FAST_IO_FAIL is returned, the scsi_cmnds for + * scsi_device must be failed with DID_ABORT. 
+ */ + int (* eh_async_device_reset_handler)(struct scsi_device *); int (* eh_target_reset_handler)(struct scsi_cmnd *); int (* eh_bus_reset_handler)(struct scsi_cmnd *); int (* eh_host_reset_handler)(struct scsi_cmnd *);