
lpfc: fix pt2pt nvme PRLI reject LOGO loop

Message ID 20220212163120.15385-1-jsmart2021@gmail.com (mailing list archive)
State Accepted
Series lpfc: fix pt2pt nvme PRLI reject LOGO loop

Commit Message

James Smart Feb. 12, 2022, 4:31 p.m. UTC
When connected point to point, the driver does not know which FC4s the
other end supports; in a fabric it could query the nameserver. Thus the
driver must send a PRLI for each FC4 it supports and enable support based
on the accept or reject of that FC4's PRLI. Currently the driver supports
SCSI and NVMe PRLIs.

Unfortunately, although this behavior is per standard, many devices have
come to expect only SCSI PRLIs. In this particular example, the NVMe PRLI
is properly RJT'd, but the target decides it must LOGO after seeing the
unexpected NVMe PRLI. The LOGO causes the sequence to restart, and login
enters an infinite failure loop.

Fix the problem by having the driver, on a pt2pt link, remember the NVMe
PRLI accept or reject status across logout as long as the link stays "up".
When login is retried after a prior NVMe PRLI rejection, the NVMe PRLI is
not sent.
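
In outline, the new FC_PT2PT_NO_NVME flag has three touch points in the
diff below: lpfc_els_retry() sets it when a pt2pt NVMe PRLI draws an
LS_RJT, lpfc_cmpl_reglogin_reglogin_issue() checks it before enabling the
NVMe FC4 type on the remote port, and lpfc_issue_lip() plus the FLOGI
failure path in lpfc_cmpl_els_flogi() clear it so a link bounce re-probes
NVMe support. A minimal user-space sketch of that lifecycle (illustrative
model only, not driver code; flag values are placeholders):

    /* Model of the FC_PT2PT_NO_NVME lifecycle (illustrative only).
     *
     * Failure sequence being fixed:
     *   PLOGI acc -> PRLI(FCP) acc -> PRLI(NVMe) LS_RJT -> target LOGO
     *   -> login restarts -> ... (infinite loop)
     */
    #include <stdbool.h>
    #include <stdio.h>

    #define FC_PT2PT         0x0001  /* placeholder, not the driver's value */
    #define FC_PT2PT_NO_NVME 0x1000  /* same bit as the patch adds */

    struct vport_model {
        unsigned int fc_flag;
    };

    /* This target always rejects NVMe PRLI and then LOGOs. */
    static bool send_nvme_prli(struct vport_model *v)
    {
        if (v->fc_flag & FC_PT2PT_NO_NVME)
            return false;                 /* gated: not sent at all */
        v->fc_flag |= FC_PT2PT_NO_NVME;   /* LS_RJT seen: remember it */
        return true;                      /* sent, rejected -> LOGO */
    }

    static void login(struct vport_model *v, int attempt)
    {
        printf("login %d: PLOGI acc, PRLI(FCP) acc", attempt);
        if (send_nvme_prli(v))
            printf(", PRLI(NVMe) LS_RJT -> LOGO, login restarts\n");
        else
            printf(", NVMe PRLI skipped -> login completes\n");
    }

    int main(void)
    {
        struct vport_model v = { .fc_flag = FC_PT2PT };

        login(&v, 1);   /* rejected; flag latched */
        login(&v, 2);   /* flag honored; loop broken */

        /* LIP or FLOGI failure clears the flag, so the next login
         * probes NVMe support again. */
        v.fc_flag &= ~FC_PT2PT_NO_NVME;
        login(&v, 3);
        return 0;
    }

The LIP clear path is normally reachable from userspace through the FC
transport's issue_lip host attribute (/sys/class/fc_host/hostN/issue_lip),
which is what makes the flag resettable without reloading the driver.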

Cut against 5.18/scsi-queue

Cc: <stable@vger.kernel.org> # v5.4+
Signed-off-by: James Smart <jsmart2021@gmail.com>
---
 drivers/scsi/lpfc/lpfc.h           |  1 +
 drivers/scsi/lpfc/lpfc_attr.c      |  3 +++
 drivers/scsi/lpfc/lpfc_els.c       | 20 +++++++++++++++++++-
 drivers/scsi/lpfc/lpfc_nportdisc.c |  5 +++--
 4 files changed, 26 insertions(+), 3 deletions(-)

Comments

Ewan Milne Feb. 14, 2022, 8:46 p.m. UTC | #1
Reviewed-by: Ewan D. Milne <emilne@redhat.com>

On Sat, Feb 12, 2022 at 11:32 AM James Smart <jsmart2021@gmail.com> wrote:
> [...]
Martin K. Petersen Feb. 15, 2022, 3:18 a.m. UTC | #2
On Sat, 12 Feb 2022 08:31:20 -0800, James Smart wrote:

> [...]

Applied to 5.17/scsi-fixes, thanks!

[1/1] lpfc: fix pt2pt nvme PRLI reject LOGO loop
      https://git.kernel.org/mkp/scsi/c/7f4c5a26f735

Patch

diff --git a/drivers/scsi/lpfc/lpfc.h b/drivers/scsi/lpfc/lpfc.h
index a1e0a106c132..98cabe09c040 100644
--- a/drivers/scsi/lpfc/lpfc.h
+++ b/drivers/scsi/lpfc/lpfc.h
@@ -592,6 +592,7 @@ struct lpfc_vport {
 #define FC_VPORT_LOGO_RCVD      0x200    /* LOGO received on vport */
 #define FC_RSCN_DISCOVERY       0x400	 /* Auth all devices after RSCN */
 #define FC_LOGO_RCVD_DID_CHNG   0x800    /* FDISC on phys port detect DID chng*/
+#define FC_PT2PT_NO_NVME        0x1000   /* Don't send NVME PRLI */
 #define FC_SCSI_SCAN_TMO        0x4000	 /* scsi scan timer running */
 #define FC_ABORT_DISCOVERY      0x8000	 /* we want to abort discovery */
 #define FC_NDISC_ACTIVE         0x10000	 /* NPort discovery active */
diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c
index bac78fbce8d6..fa8415259cb8 100644
--- a/drivers/scsi/lpfc/lpfc_attr.c
+++ b/drivers/scsi/lpfc/lpfc_attr.c
@@ -1315,6 +1315,9 @@ lpfc_issue_lip(struct Scsi_Host *shost)
 	pmboxq->u.mb.mbxCommand = MBX_DOWN_LINK;
 	pmboxq->u.mb.mbxOwner = OWN_HOST;
 
+	if ((vport->fc_flag & FC_PT2PT) && (vport->fc_flag & FC_PT2PT_NO_NVME))
+		vport->fc_flag &= ~FC_PT2PT_NO_NVME;
+
 	mbxstatus = lpfc_sli_issue_mbox_wait(phba, pmboxq, LPFC_MBOX_TMO * 2);
 
 	if ((mbxstatus == MBX_SUCCESS) &&
diff --git a/drivers/scsi/lpfc/lpfc_els.c b/drivers/scsi/lpfc/lpfc_els.c
index db5ccae1b63d..f936833c9909 100644
--- a/drivers/scsi/lpfc/lpfc_els.c
+++ b/drivers/scsi/lpfc/lpfc_els.c
@@ -1072,7 +1072,8 @@ lpfc_cmpl_els_flogi(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
 
 		/* FLOGI failed, so there is no fabric */
 		spin_lock_irq(shost->host_lock);
-		vport->fc_flag &= ~(FC_FABRIC | FC_PUBLIC_LOOP);
+		vport->fc_flag &= ~(FC_FABRIC | FC_PUBLIC_LOOP |
+				    FC_PT2PT_NO_NVME);
 		spin_unlock_irq(shost->host_lock);
 
 		/* If private loop, then allow max outstanding els to be
@@ -4607,6 +4608,23 @@ lpfc_els_retry(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
 		/* Added for Vendor specifc support
 		 * Just keep retrying for these Rsn / Exp codes
 		 */
+		if ((vport->fc_flag & FC_PT2PT) &&
+		    cmd == ELS_CMD_NVMEPRLI) {
+			switch (stat.un.b.lsRjtRsnCode) {
+			case LSRJT_UNABLE_TPC:
+			case LSRJT_INVALID_CMD:
+			case LSRJT_LOGICAL_ERR:
+			case LSRJT_CMD_UNSUPPORTED:
+				lpfc_printf_vlog(vport, KERN_WARNING, LOG_ELS,
+						 "0168 NVME PRLI LS_RJT "
+						 "reason %x port doesn't "
+						 "support NVME, disabling NVME\n",
+						 stat.un.b.lsRjtRsnCode);
+				retry = 0;
+				vport->fc_flag |= FC_PT2PT_NO_NVME;
+				goto out_retry;
+			}
+		}
 		switch (stat.un.b.lsRjtRsnCode) {
 		case LSRJT_UNABLE_TPC:
 			/* The driver has a VALID PLOGI but the rport has
diff --git a/drivers/scsi/lpfc/lpfc_nportdisc.c b/drivers/scsi/lpfc/lpfc_nportdisc.c
index 7d717a4ac14d..fdf5e777bf11 100644
--- a/drivers/scsi/lpfc/lpfc_nportdisc.c
+++ b/drivers/scsi/lpfc/lpfc_nportdisc.c
@@ -1961,8 +1961,9 @@ lpfc_cmpl_reglogin_reglogin_issue(struct lpfc_vport *vport,
 			 * is configured try it.
 			 */
 			ndlp->nlp_fc4_type |= NLP_FC4_FCP;
-			if ((vport->cfg_enable_fc4_type == LPFC_ENABLE_BOTH) ||
-			    (vport->cfg_enable_fc4_type == LPFC_ENABLE_NVME)) {
+			if ((!(vport->fc_flag & FC_PT2PT_NO_NVME)) &&
+			    (vport->cfg_enable_fc4_type == LPFC_ENABLE_BOTH ||
+			    vport->cfg_enable_fc4_type == LPFC_ENABLE_NVME)) {
 				ndlp->nlp_fc4_type |= NLP_FC4_NVME;
 				/* We need to update the localport also */
 				lpfc_nvme_update_localport(vport);