Message ID | 20191224220248.30138-7-bvanassche@acm.org (mailing list archive) |
---|---|
State | Accepted |
Headers | show |
Series | Six UFS patches | expand |
On 2019-12-25 06:02, Bart Van Assche wrote: > The UFS SCSI timeout handler was needed to compensate that > ufshcd_queuecommand() could return SCSI_MLQUEUE_HOST_BUSY for a long > time. Commit a276c19e3e98 ("scsi: ufs: Avoid busy-waiting by > eliminating > tag conflicts") fixed this so the timeout handler is no longer > necessary. > > See also commit f550c65b543b ("scsi: ufs: implement scsi host timeout > handler"). > > Cc: Bean Huo <beanhuo@micron.com> > Cc: Can Guo <cang@codeaurora.org> > Cc: Avri Altman <avri.altman@wdc.com> > Cc: Stanley Chu <stanley.chu@mediatek.com> > Cc: Tomas Winkler <tomas.winkler@intel.com> > Signed-off-by: Bart Van Assche <bvanassche@acm.org> > --- > drivers/scsi/ufs/ufshcd.c | 36 ------------------------------------ > 1 file changed, 36 deletions(-) > > diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c > index edcc137c436b..763e41286a4b 100644 > --- a/drivers/scsi/ufs/ufshcd.c > +++ b/drivers/scsi/ufs/ufshcd.c > @@ -7092,41 +7092,6 @@ static void ufshcd_async_scan(void *data, > async_cookie_t cookie) > ufshcd_probe_hba(hba); > } > > -static enum blk_eh_timer_return ufshcd_eh_timed_out(struct scsi_cmnd > *scmd) > -{ > - unsigned long flags; > - struct Scsi_Host *host; > - struct ufs_hba *hba; > - int index; > - bool found = false; > - > - if (!scmd || !scmd->device || !scmd->device->host) > - return BLK_EH_DONE; > - > - host = scmd->device->host; > - hba = shost_priv(host); > - if (!hba) > - return BLK_EH_DONE; > - > - spin_lock_irqsave(host->host_lock, flags); > - > - for_each_set_bit(index, &hba->outstanding_reqs, hba->nutrs) { > - if (hba->lrb[index].cmd == scmd) { > - found = true; > - break; > - } > - } > - > - spin_unlock_irqrestore(host->host_lock, flags); > - > - /* > - * Bypass SCSI error handling and reset the block layer timer if this > - * SCSI command was not actually dispatched to UFS driver, otherwise > - * let SCSI layer handle the error as usual. > - */ > - return found ? BLK_EH_DONE : BLK_EH_RESET_TIMER; > -} > - > static const struct attribute_group *ufshcd_driver_groups[] = { > &ufs_sysfs_unit_descriptor_group, > &ufs_sysfs_lun_attributes_group, > @@ -7145,7 +7110,6 @@ static struct scsi_host_template > ufshcd_driver_template = { > .eh_abort_handler = ufshcd_abort, > .eh_device_reset_handler = ufshcd_eh_device_reset_handler, > .eh_host_reset_handler = ufshcd_eh_host_reset_handler, > - .eh_timed_out = ufshcd_eh_timed_out, > .this_id = -1, > .sg_tablesize = SG_ALL, > .cmd_per_lun = UFSHCD_CMD_PER_LUN, Reviewed-by: Can Guo <cang@codeaurora.org>
On Wed, Dec 25, 2019 at 3:35 AM Bart Van Assche <bvanassche@acm.org> wrote: > > The UFS SCSI timeout handler was needed to compensate that > ufshcd_queuecommand() could return SCSI_MLQUEUE_HOST_BUSY for a long > time. Commit a276c19e3e98 ("scsi: ufs: Avoid busy-waiting by eliminating > tag conflicts") fixed this so the timeout handler is no longer necessary. > > See also commit f550c65b543b ("scsi: ufs: implement scsi host timeout handler"). > > Cc: Bean Huo <beanhuo@micron.com> > Cc: Can Guo <cang@codeaurora.org> > Cc: Avri Altman <avri.altman@wdc.com> > Cc: Stanley Chu <stanley.chu@mediatek.com> > Cc: Tomas Winkler <tomas.winkler@intel.com> > Signed-off-by: Bart Van Assche <bvanassche@acm.org> > --- Reviewed-by: Alim Akhtar <alim.akhtar@samsung.com> > drivers/scsi/ufs/ufshcd.c | 36 ------------------------------------ > 1 file changed, 36 deletions(-) > > diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c > index edcc137c436b..763e41286a4b 100644 > --- a/drivers/scsi/ufs/ufshcd.c > +++ b/drivers/scsi/ufs/ufshcd.c > @@ -7092,41 +7092,6 @@ static void ufshcd_async_scan(void *data, async_cookie_t cookie) > ufshcd_probe_hba(hba); > } > > -static enum blk_eh_timer_return ufshcd_eh_timed_out(struct scsi_cmnd *scmd) > -{ > - unsigned long flags; > - struct Scsi_Host *host; > - struct ufs_hba *hba; > - int index; > - bool found = false; > - > - if (!scmd || !scmd->device || !scmd->device->host) > - return BLK_EH_DONE; > - > - host = scmd->device->host; > - hba = shost_priv(host); > - if (!hba) > - return BLK_EH_DONE; > - > - spin_lock_irqsave(host->host_lock, flags); > - > - for_each_set_bit(index, &hba->outstanding_reqs, hba->nutrs) { > - if (hba->lrb[index].cmd == scmd) { > - found = true; > - break; > - } > - } > - > - spin_unlock_irqrestore(host->host_lock, flags); > - > - /* > - * Bypass SCSI error handling and reset the block layer timer if this > - * SCSI command was not actually dispatched to UFS driver, otherwise > - * let SCSI layer handle the error as usual. > - */ > - return found ? BLK_EH_DONE : BLK_EH_RESET_TIMER; > -} > - > static const struct attribute_group *ufshcd_driver_groups[] = { > &ufs_sysfs_unit_descriptor_group, > &ufs_sysfs_lun_attributes_group, > @@ -7145,7 +7110,6 @@ static struct scsi_host_template ufshcd_driver_template = { > .eh_abort_handler = ufshcd_abort, > .eh_device_reset_handler = ufshcd_eh_device_reset_handler, > .eh_host_reset_handler = ufshcd_eh_host_reset_handler, > - .eh_timed_out = ufshcd_eh_timed_out, > .this_id = -1, > .sg_tablesize = SG_ALL, > .cmd_per_lun = UFSHCD_CMD_PER_LUN,
Hi Bart, On 2019-12-25 06:02, Bart Van Assche wrote: > The UFS SCSI timeout handler was needed to compensate that > ufshcd_queuecommand() could return SCSI_MLQUEUE_HOST_BUSY for a long > time. Commit a276c19e3e98 ("scsi: ufs: Avoid busy-waiting by > eliminating > tag conflicts") fixed this so the timeout handler is no longer > necessary. > > See also commit f550c65b543b ("scsi: ufs: implement scsi host timeout > handler"). > Sorry for bugging you on this old change. I am afraid we may need to add this timeout handler back. Because there is till chances that a request gets stuck somewhere in ufshcd_queuecommand() path before ufshcd_send_command() gets called. e.g. ufshcd_queuecommand() ->ufshcd_map_sg() -->scsi_dma_map() --->dma_map_sg() ---->dev->ops->map_sg() map_sg() ops may get stuck. map_sg() method can vary on different platforms based on actual IOMMU engines. We cannot gaurantee map_sg() ops must return immediately as we don't know what is actually inside map_sg() ops. And if it gets stuck there for a long time till the request times out, without the UFS timeout handler, scsi layer will try to abort this request from UFS driver by calling ufshcd_abort() eventually. ufshcd_abort() will think this request has been completed due to its tag is not in hba->outstanding_reqs or UFS host's door bell reg. However, actually, this request is still in ufshcd_queuecommand() path. I don't need to continue on the subsequent impact to UFS driver if ufshcd_abort() happens in this case. This is a corner case, but it is still possible (I did see map_sg() ops hangs on real devices). Having the UFS timeout handler back will prevent this situation as UFS timeout handler checks if the tag is in hba->outstanding_reqs (for our case, it is not in there), if no, it returns BLK_EH_RESET_TIMER so that scsi/block layer will keep waiting. What do you think? Please let me know your ideas on this, thanks! <--code snippet--> static int ufshcd_abort(struct scsi_cmnd *cmd) { ... reg = ufshcd_readl(hba, REG_UTP_TRANSFER_REQ_DOOR_BELL); /* If command is already aborted/completed, return SUCCESS */ if (!(test_bit(tag, &hba->outstanding_reqs))) { dev_err(hba->dev, "%s: cmd at tag %d already completed, outstanding=0x%lx, doorbell=0x%x\n", __func__, tag, hba->outstanding_reqs, reg); goto out; } ... } <--code snippet--> Thanks, Can Guo. > Cc: Bean Huo <beanhuo@micron.com> > Cc: Can Guo <cang@codeaurora.org> > Cc: Avri Altman <avri.altman@wdc.com> > Cc: Stanley Chu <stanley.chu@mediatek.com> > Cc: Tomas Winkler <tomas.winkler@intel.com> > Signed-off-by: Bart Van Assche <bvanassche@acm.org> > --- > drivers/scsi/ufs/ufshcd.c | 36 ------------------------------------ > 1 file changed, 36 deletions(-) > > diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c > index edcc137c436b..763e41286a4b 100644 > --- a/drivers/scsi/ufs/ufshcd.c > +++ b/drivers/scsi/ufs/ufshcd.c > @@ -7092,41 +7092,6 @@ static void ufshcd_async_scan(void *data, > async_cookie_t cookie) > ufshcd_probe_hba(hba); > } > > -static enum blk_eh_timer_return ufshcd_eh_timed_out(struct scsi_cmnd > *scmd) > -{ > - unsigned long flags; > - struct Scsi_Host *host; > - struct ufs_hba *hba; > - int index; > - bool found = false; > - > - if (!scmd || !scmd->device || !scmd->device->host) > - return BLK_EH_DONE; > - > - host = scmd->device->host; > - hba = shost_priv(host); > - if (!hba) > - return BLK_EH_DONE; > - > - spin_lock_irqsave(host->host_lock, flags); > - > - for_each_set_bit(index, &hba->outstanding_reqs, hba->nutrs) { > - if (hba->lrb[index].cmd == scmd) { > - found = true; > - break; > - } > - } > - > - spin_unlock_irqrestore(host->host_lock, flags); > - > - /* > - * Bypass SCSI error handling and reset the block layer timer if this > - * SCSI command was not actually dispatched to UFS driver, otherwise > - * let SCSI layer handle the error as usual. > - */ > - return found ? BLK_EH_DONE : BLK_EH_RESET_TIMER; > -} > - > static const struct attribute_group *ufshcd_driver_groups[] = { > &ufs_sysfs_unit_descriptor_group, > &ufs_sysfs_lun_attributes_group, > @@ -7145,7 +7110,6 @@ static struct scsi_host_template > ufshcd_driver_template = { > .eh_abort_handler = ufshcd_abort, > .eh_device_reset_handler = ufshcd_eh_device_reset_handler, > .eh_host_reset_handler = ufshcd_eh_host_reset_handler, > - .eh_timed_out = ufshcd_eh_timed_out, > .this_id = -1, > .sg_tablesize = SG_ALL, > .cmd_per_lun = UFSHCD_CMD_PER_LUN,
On 2020-05-28 02:47, Can Guo wrote: > Hi Bart, > > On 2019-12-25 06:02, Bart Van Assche wrote: >> The UFS SCSI timeout handler was needed to compensate that >> ufshcd_queuecommand() could return SCSI_MLQUEUE_HOST_BUSY for a long >> time. Commit a276c19e3e98 ("scsi: ufs: Avoid busy-waiting by eliminating >> tag conflicts") fixed this so the timeout handler is no longer necessary. >> >> See also commit f550c65b543b ("scsi: ufs: implement scsi host timeout >> handler"). >> > > Sorry for bugging you on this old change. I am afraid we may need to add > this timeout handler back. Because there is till chances that a request > gets stuck somewhere in ufshcd_queuecommand() path before > ufshcd_send_command() gets called. e.g. > > ufshcd_queuecommand() > ->ufshcd_map_sg() > -->scsi_dma_map() > --->dma_map_sg() > ---->dev->ops->map_sg() > > map_sg() ops may get stuck. map_sg() method can vary on different platforms > based on actual IOMMU engines. We cannot gaurantee map_sg() ops must return > immediately as we don't know what is actually inside map_sg() ops. > > And if it gets stuck there for a long time till the request times out, > without > the UFS timeout handler, scsi layer will try to abort this request from UFS > driver by calling ufshcd_abort() eventually. ufshcd_abort() will think this > request has been completed due to its tag is not in hba->outstanding_reqs > or UFS host's door bell reg. However, actually, this request is still in > ufshcd_queuecommand() path. I don't need to continue on the subsequent > impact > to UFS driver if ufshcd_abort() happens in this case. This is a corner > case, > but it is still possible (I did see map_sg() ops hangs on real devices). > > Having the UFS timeout handler back will prevent this situation as UFS > timeout > handler checks if the tag is in hba->outstanding_reqs (for our case, it > is not > in there), if no, it returns BLK_EH_RESET_TIMER so that scsi/block layer > will > keep waiting. > > What do you think? Please let me know your ideas on this, thanks! Hi Can, I see the following issues with the above proposal: - Although I haven't been able to find explicit documentation of this, I think that dma_map_sg() must not sleep. If it would sleep that would break most block and SCSI drivers because many of these drivers call dma_map_sg() from their .queue_rq() or .queuecommand() implementation and if BLK_MQ_F_BLOCKING has not been set these functions must not sleep. - A timeout handler must not be invoked while .queuecommand() is still in progress. The SCSI core calls blk_mq_start_request() before it calls ufshcd_queuecommand(). The blk_mq_start_request() activates the block layer timeout mechanism. ufshcd_queuecommand() must have finished before the block layer timeout handler is activated. Please fix the root cause, namely the map_sg implementation that may get stuck. Thanks, Bart.
On 2020-05-29 00:12, Bart Van Assche wrote: > On 2020-05-28 02:47, Can Guo wrote: >> Hi Bart, >> >> On 2019-12-25 06:02, Bart Van Assche wrote: >>> The UFS SCSI timeout handler was needed to compensate that >>> ufshcd_queuecommand() could return SCSI_MLQUEUE_HOST_BUSY for a long >>> time. Commit a276c19e3e98 ("scsi: ufs: Avoid busy-waiting by >>> eliminating >>> tag conflicts") fixed this so the timeout handler is no longer >>> necessary. >>> >>> See also commit f550c65b543b ("scsi: ufs: implement scsi host timeout >>> handler"). >>> >> >> Sorry for bugging you on this old change. I am afraid we may need to >> add >> this timeout handler back. Because there is till chances that a >> request >> gets stuck somewhere in ufshcd_queuecommand() path before >> ufshcd_send_command() gets called. e.g. >> >> ufshcd_queuecommand() >> ->ufshcd_map_sg() >> -->scsi_dma_map() >> --->dma_map_sg() >> ---->dev->ops->map_sg() >> >> map_sg() ops may get stuck. map_sg() method can vary on different >> platforms >> based on actual IOMMU engines. We cannot gaurantee map_sg() ops must >> return >> immediately as we don't know what is actually inside map_sg() ops. >> >> And if it gets stuck there for a long time till the request times out, >> without >> the UFS timeout handler, scsi layer will try to abort this request >> from UFS >> driver by calling ufshcd_abort() eventually. ufshcd_abort() will think >> this >> request has been completed due to its tag is not in >> hba->outstanding_reqs >> or UFS host's door bell reg. However, actually, this request is still >> in >> ufshcd_queuecommand() path. I don't need to continue on the subsequent >> impact >> to UFS driver if ufshcd_abort() happens in this case. This is a corner >> case, >> but it is still possible (I did see map_sg() ops hangs on real >> devices). >> >> Having the UFS timeout handler back will prevent this situation as UFS >> timeout >> handler checks if the tag is in hba->outstanding_reqs (for our case, >> it >> is not >> in there), if no, it returns BLK_EH_RESET_TIMER so that scsi/block >> layer >> will >> keep waiting. >> >> What do you think? Please let me know your ideas on this, thanks! Hi Bart, > > Hi Can, > > I see the following issues with the above proposal: > - Although I haven't been able to find explicit documentation of this, > I > think that dma_map_sg() must not sleep. If it would sleep that would > break most block and SCSI drivers because many of these drivers call > dma_map_sg() from their .queue_rq() or .queuecommand() implementation > and if BLK_MQ_F_BLOCKING has not been set these functions must not > sleep. > - A timeout handler must not be invoked while .queuecommand() is still > in progress. The SCSI core calls blk_mq_start_request() before it > calls ufshcd_queuecommand(). The blk_mq_start_request() activates the > block layer timeout mechanism. ufshcd_queuecommand() must have > finished before the block layer timeout handler is activated. > > Please fix the root cause, namely the map_sg implementation that may > get > stuck. > > Thanks, > > Bart. queuecommand path should not sleep - that is right, due to queuecommand can be invoked from contexts where preempt is disabled, e.g. softirq. I don't know why map_sg() ops can take that long, but apparently it does not sleep, otherwise we should have seen schedule while atomic error long time ago. > ufshcd_queuecommand() must have > finished before the block layer timeout handler is activated. This is the ideal/expected situation, but we are seeing the corner case. Fixing the root cause of that is one thing, but having the timeout handler back can prevent UFS driver from messing up the subsequent requests further in such case, causing possible data corruption. Is there any drawbacks if we have it back? Thanks, Can Guo.
On 2020-05-28 18:39, Can Guo wrote: > On 2020-05-29 00:12, Bart Van Assche wrote: >> ufshcd_queuecommand() must have >> finished before the block layer timeout handler is activated. > > This is the ideal/expected situation, but we are seeing the corner case. > > Fixing the root cause of that is one thing, but having the timeout handler > back can prevent UFS driver from messing up the subsequent requests further > in such case, causing possible data corruption. Is there any drawbacks if > we have it back? Hi Can, My conclusion from your emails is that ufshcd_queuecommand() can spend more time than the SCSI timeout (30 seconds) in dma_map_sg(). A dma_map_sg() call that keeps the CPU busy during more than 30 seconds is not only weird but it is also a disaster from the point of view of energy consumption. Please fix the root cause. Thanks, Bart.
diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index edcc137c436b..763e41286a4b 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -7092,41 +7092,6 @@ static void ufshcd_async_scan(void *data, async_cookie_t cookie) ufshcd_probe_hba(hba); } -static enum blk_eh_timer_return ufshcd_eh_timed_out(struct scsi_cmnd *scmd) -{ - unsigned long flags; - struct Scsi_Host *host; - struct ufs_hba *hba; - int index; - bool found = false; - - if (!scmd || !scmd->device || !scmd->device->host) - return BLK_EH_DONE; - - host = scmd->device->host; - hba = shost_priv(host); - if (!hba) - return BLK_EH_DONE; - - spin_lock_irqsave(host->host_lock, flags); - - for_each_set_bit(index, &hba->outstanding_reqs, hba->nutrs) { - if (hba->lrb[index].cmd == scmd) { - found = true; - break; - } - } - - spin_unlock_irqrestore(host->host_lock, flags); - - /* - * Bypass SCSI error handling and reset the block layer timer if this - * SCSI command was not actually dispatched to UFS driver, otherwise - * let SCSI layer handle the error as usual. - */ - return found ? BLK_EH_DONE : BLK_EH_RESET_TIMER; -} - static const struct attribute_group *ufshcd_driver_groups[] = { &ufs_sysfs_unit_descriptor_group, &ufs_sysfs_lun_attributes_group, @@ -7145,7 +7110,6 @@ static struct scsi_host_template ufshcd_driver_template = { .eh_abort_handler = ufshcd_abort, .eh_device_reset_handler = ufshcd_eh_device_reset_handler, .eh_host_reset_handler = ufshcd_eh_host_reset_handler, - .eh_timed_out = ufshcd_eh_timed_out, .this_id = -1, .sg_tablesize = SG_ALL, .cmd_per_lun = UFSHCD_CMD_PER_LUN,
The UFS SCSI timeout handler was needed to compensate that ufshcd_queuecommand() could return SCSI_MLQUEUE_HOST_BUSY for a long time. Commit a276c19e3e98 ("scsi: ufs: Avoid busy-waiting by eliminating tag conflicts") fixed this so the timeout handler is no longer necessary. See also commit f550c65b543b ("scsi: ufs: implement scsi host timeout handler"). Cc: Bean Huo <beanhuo@micron.com> Cc: Can Guo <cang@codeaurora.org> Cc: Avri Altman <avri.altman@wdc.com> Cc: Stanley Chu <stanley.chu@mediatek.com> Cc: Tomas Winkler <tomas.winkler@intel.com> Signed-off-by: Bart Van Assche <bvanassche@acm.org> --- drivers/scsi/ufs/ufshcd.c | 36 ------------------------------------ 1 file changed, 36 deletions(-)