diff mbox series

[v10,2/2] ufs: core: requeue aborted request

Message ID 20241001091917.6917-3-peter.wang@mediatek.com (mailing list archive)
State Changes Requested
Headers show
Series fix abort defect | expand

Commit Message

Peter Wang (王信友) Oct. 1, 2024, 9:19 a.m. UTC
From: Peter Wang <peter.wang@mediatek.com>

After the SQ cleanup fix, the CQ will receive a response with
the corresponding tag marked as OCS: ABORTED. To align with
the behavior of Legacy SDB mode, the handling of OCS: ABORTED
has been changed to match that of OCS_INVALID_COMMAND_STATUS
(SDB), with both returning a SCSI result of DID_REQUEUE.

Furthermore, the workaround implemented before the SQ cleanup
fix can be removed.

Fixes: ab248643d3d6 ("scsi: ufs: core: Add error handling for MCQ mode")
Cc: stable@vger.kernel.org
Signed-off-by: Peter Wang <peter.wang@mediatek.com>
---
 drivers/ufs/core/ufshcd.c | 20 ++++----------------
 1 file changed, 4 insertions(+), 16 deletions(-)

Comments

Bart Van Assche Oct. 1, 2024, 5:13 p.m. UTC | #1
On 10/1/24 2:19 AM, peter.wang@mediatek.com wrote:
> From: Peter Wang <peter.wang@mediatek.com>
> 
> After the SQ cleanup fix, the CQ will receive a response with
> the corresponding tag marked as OCS: ABORTED. To align with
> the behavior of Legacy SDB mode, the handling of OCS: ABORTED
> has been changed to match that of OCS_INVALID_COMMAND_STATUS
> (SDB), with both returning a SCSI result of DID_REQUEUE.
> 
> Furthermore, the workaround implemented before the SQ cleanup
> fix can be removed.
> 
> Fixes: ab248643d3d6 ("scsi: ufs: core: Add error handling for MCQ mode")
> Cc: stable@vger.kernel.org
> Signed-off-by: Peter Wang <peter.wang@mediatek.com>
> ---
>   drivers/ufs/core/ufshcd.c | 20 ++++----------------
>   1 file changed, 4 insertions(+), 16 deletions(-)
> 
> diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
> index 24a32e2fd75e..8e2a7889a565 100644
> --- a/drivers/ufs/core/ufshcd.c
> +++ b/drivers/ufs/core/ufshcd.c
> @@ -5417,10 +5417,12 @@ ufshcd_transfer_rsp_status(struct ufs_hba *hba, struct ufshcd_lrb *lrbp,
>   		}
>   		break;
>   	case OCS_ABORTED:
> -		result |= DID_ABORT << 16;
> -		break;
>   	case OCS_INVALID_COMMAND_STATUS:
>   		result |= DID_REQUEUE << 16;
> +		dev_warn(hba->dev,
> +				"OCS %s from controller for tag %d\n",
> +				(ocs == OCS_ABORTED? "aborted" : "invalid"),
> +				lrbp->task_tag);
>   		break;
>   	case OCS_INVALID_CMD_TABLE_ATTR:
>   	case OCS_INVALID_PRDT_ATTR:
> @@ -6466,26 +6468,12 @@ static bool ufshcd_abort_one(struct request *rq, void *priv)
>   	struct scsi_device *sdev = cmd->device;
>   	struct Scsi_Host *shost = sdev->host;
>   	struct ufs_hba *hba = shost_priv(shost);
> -	struct ufshcd_lrb *lrbp = &hba->lrb[tag];
> -	struct ufs_hw_queue *hwq;
> -	unsigned long flags;
>   
>   	*ret = ufshcd_try_to_abort_task(hba, tag);
>   	dev_err(hba->dev, "Aborting tag %d / CDB %#02x %s\n", tag,
>   		hba->lrb[tag].cmd ? hba->lrb[tag].cmd->cmnd[0] : -1,
>   		*ret ? "failed" : "succeeded");
>   
> -	/* Release cmd in MCQ mode if abort succeeds */
> -	if (hba->mcq_enabled && (*ret == 0)) {
> -		hwq = ufshcd_mcq_req_to_hwq(hba, scsi_cmd_to_rq(lrbp->cmd));
> -		if (!hwq)
> -			return 0;
> -		spin_lock_irqsave(&hwq->cq_lock, flags);
> -		if (ufshcd_cmd_inflight(lrbp->cmd))
> -			ufshcd_release_scsi_cmd(hba, lrbp);
> -		spin_unlock_irqrestore(&hwq->cq_lock, flags);
> -	}
> -
>   	return *ret == 0;
>   }

As mentioned before, ufshcd_try_to_abort_task() cannot handle concurrent
scsi_done() calls. ufshcd_abort_one() calls ufshcd_try_to_abort_task()
without even trying to prevent that scsi_done() is called concurrently. 
Since this could result in a kernel crash, I think that it is important 
that this gets fixed, even if it requires modifying the SCSI core.

Bart.
Peter Wang (王信友) Oct. 2, 2024, 12:42 p.m. UTC | #2
On Tue, 2024-10-01 at 10:13 -0700, Bart Van Assche wrote:
>  	 
> External email : Please do not click links or open attachments until
> you have verified the sender or the content.
>  On 10/1/24 2:19 AM, peter.wang@mediatek.com wrote:
> > From: Peter Wang <peter.wang@mediatek.com>
> > 
> > After the SQ cleanup fix, the CQ will receive a response with
> > the corresponding tag marked as OCS: ABORTED. To align with
> > the behavior of Legacy SDB mode, the handling of OCS: ABORTED
> > has been changed to match that of OCS_INVALID_COMMAND_STATUS
> > (SDB), with both returning a SCSI result of DID_REQUEUE.
> > 
> > Furthermore, the workaround implemented before the SQ cleanup
> > fix can be removed.
> > 
> > Fixes: ab248643d3d6 ("scsi: ufs: core: Add error handling for MCQ
> mode")
> > Cc: stable@vger.kernel.org
> > Signed-off-by: Peter Wang <peter.wang@mediatek.com>
> > ---
> >   drivers/ufs/core/ufshcd.c | 20 ++++----------------
> >   1 file changed, 4 insertions(+), 16 deletions(-)
> > 
> > diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
> > index 24a32e2fd75e..8e2a7889a565 100644
> > --- a/drivers/ufs/core/ufshcd.c
> > +++ b/drivers/ufs/core/ufshcd.c
> > @@ -5417,10 +5417,12 @@ ufshcd_transfer_rsp_status(struct ufs_hba
> *hba, struct ufshcd_lrb *lrbp,
> >   }
> >   break;
> >   case OCS_ABORTED:
> > -result |= DID_ABORT << 16;
> > -break;
> >   case OCS_INVALID_COMMAND_STATUS:
> >   result |= DID_REQUEUE << 16;
> > +dev_warn(hba->dev,
> > +"OCS %s from controller for tag %d\n",
> > +(ocs == OCS_ABORTED? "aborted" : "invalid"),
> > +lrbp->task_tag);
> >   break;
> >   case OCS_INVALID_CMD_TABLE_ATTR:
> >   case OCS_INVALID_PRDT_ATTR:
> > @@ -6466,26 +6468,12 @@ static bool ufshcd_abort_one(struct request
> *rq, void *priv)
> >   struct scsi_device *sdev = cmd->device;
> >   struct Scsi_Host *shost = sdev->host;
> >   struct ufs_hba *hba = shost_priv(shost);
> > -struct ufshcd_lrb *lrbp = &hba->lrb[tag];
> > -struct ufs_hw_queue *hwq;
> > -unsigned long flags;
> >   
> >   *ret = ufshcd_try_to_abort_task(hba, tag);
> >   dev_err(hba->dev, "Aborting tag %d / CDB %#02x %s\n", tag,
> >   hba->lrb[tag].cmd ? hba->lrb[tag].cmd->cmnd[0] : -1,
> >   *ret ? "failed" : "succeeded");
> >   
> > -/* Release cmd in MCQ mode if abort succeeds */
> > -if (hba->mcq_enabled && (*ret == 0)) {
> > -hwq = ufshcd_mcq_req_to_hwq(hba, scsi_cmd_to_rq(lrbp->cmd));
> > -if (!hwq)
> > -return 0;
> > -spin_lock_irqsave(&hwq->cq_lock, flags);
> > -if (ufshcd_cmd_inflight(lrbp->cmd))
> > -ufshcd_release_scsi_cmd(hba, lrbp);
> > -spin_unlock_irqrestore(&hwq->cq_lock, flags);
> > -}
> > -
> >   return *ret == 0;
> >   }
> 
> As mentioned before, ufshcd_try_to_abort_task() cannot handle
> concurrent
> scsi_done() calls. ufshcd_abort_one() calls
> ufshcd_try_to_abort_task()
> without even trying to prevent that scsi_done() is called
> concurrently. 
> Since this could result in a kernel crash, I think that it is
> important 
> that this gets fixed, even if it requires modifying the SCSI core.
> 
> Bart.
> 
> 

Hi Bart,

This patch merely aligns with the approach of SDB mode 
and does not involve the flow of scsi_done. Besides, 
I don't see any issue with concurrency between 
ufshcd_abort_one() calling ufshcd_try_to_abort_task() 
and scsi_done(). Can you point out the specific flow where 
the problem occurs? If there is one, shouldn't SDB mode
have the same issue?

Thanks
Peter
Bart Van Assche Oct. 3, 2024, 8:02 p.m. UTC | #3
On 10/2/24 5:42 AM, Peter Wang (王信友) wrote:
> This patch merely aligns with the approach of SDB mode
> and does not involve the flow of scsi_done. Besides,
> I don't see any issue with concurrency between
> ufshcd_abort_one() calling ufshcd_try_to_abort_task()
> and scsi_done(). Can you point out the specific flow where
> the problem occurs? If there is one, shouldn't SDB mode
> have the same issue?

Hi Peter,

Correct, my comment applies to both legacy mode and MCQ mode. From the 
section in the UFS standard about ABORT TASK: "A response of FUNCTION
COMPLETE shall indicate that the command was aborted or was not in the 
task set." In other words, if a command completes just before 
ufshcd_try_to_abort_task() calls ufshcd_issue_tm_cmd(), then
ufshcd_try_to_abort_task() will call ufshcd_clear_cmd() for a command
that has already completed. In legacy mode, this call will succeed.
Hence, both ufshcd_compl_one_cqe() and ufshcd_abort_all() will call
ufshcd_release(hba). This will cause hba->clk_gating.active_reqs to be
decremented twice instead of only once. Do you agree that this can
happen and also that it should be prevented that this happens?

Thanks,

Bart.
diff mbox series

Patch

diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index 24a32e2fd75e..8e2a7889a565 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -5417,10 +5417,12 @@  ufshcd_transfer_rsp_status(struct ufs_hba *hba, struct ufshcd_lrb *lrbp,
 		}
 		break;
 	case OCS_ABORTED:
-		result |= DID_ABORT << 16;
-		break;
 	case OCS_INVALID_COMMAND_STATUS:
 		result |= DID_REQUEUE << 16;
+		dev_warn(hba->dev,
+				"OCS %s from controller for tag %d\n",
+				(ocs == OCS_ABORTED? "aborted" : "invalid"),
+				lrbp->task_tag);
 		break;
 	case OCS_INVALID_CMD_TABLE_ATTR:
 	case OCS_INVALID_PRDT_ATTR:
@@ -6466,26 +6468,12 @@  static bool ufshcd_abort_one(struct request *rq, void *priv)
 	struct scsi_device *sdev = cmd->device;
 	struct Scsi_Host *shost = sdev->host;
 	struct ufs_hba *hba = shost_priv(shost);
-	struct ufshcd_lrb *lrbp = &hba->lrb[tag];
-	struct ufs_hw_queue *hwq;
-	unsigned long flags;
 
 	*ret = ufshcd_try_to_abort_task(hba, tag);
 	dev_err(hba->dev, "Aborting tag %d / CDB %#02x %s\n", tag,
 		hba->lrb[tag].cmd ? hba->lrb[tag].cmd->cmnd[0] : -1,
 		*ret ? "failed" : "succeeded");
 
-	/* Release cmd in MCQ mode if abort succeeds */
-	if (hba->mcq_enabled && (*ret == 0)) {
-		hwq = ufshcd_mcq_req_to_hwq(hba, scsi_cmd_to_rq(lrbp->cmd));
-		if (!hwq)
-			return 0;
-		spin_lock_irqsave(&hwq->cq_lock, flags);
-		if (ufshcd_cmd_inflight(lrbp->cmd))
-			ufshcd_release_scsi_cmd(hba, lrbp);
-		spin_unlock_irqrestore(&hwq->cq_lock, flags);
-	}
-
 	return *ret == 0;
 }