diff mbox series

[1/2] mpt3sas: Perform additional retries if Doorbell read returns 0

Message ID 20230628070511.27774-2-ranjan.kumar@broadcom.com (mailing list archive)
State Changes Requested
Headers show
Series mpt3sas: Contains Defect | expand

Commit Message

Ranjan Kumar June 28, 2023, 7:05 a.m. UTC
Doorbell and Host diagnostic registers could return 0 even
after 3 retries and that leads to occasional resets of the
controllers, hence increased the retry count to thirty.

Fixes: b899202901a8 ("mpt3sas: Add separate function for aero doorbell reads ")
Cc: stable@vger.kernel.org
Signed-off-by: Ranjan Kumar <ranjan.kumar@broadcom.com>
---
 drivers/scsi/mpt3sas/mpt3sas_base.c | 50 ++++++++++++++++-------------
 drivers/scsi/mpt3sas/mpt3sas_base.h |  4 ++-
 2 files changed, 31 insertions(+), 23 deletions(-)

Comments

Damien Le Moal June 28, 2023, 11:53 p.m. UTC | #1
On 6/28/23 16:05, Ranjan Kumar wrote:
> Doorbell and Host diagnostic registers could return 0 even
> after 3 retries and that leads to occasional resets of the
> controllers, hence increased the retry count to thirty.

The magic value "3" for retry count was already that, magic. Why would things
work better with 30 ? What is the reasoning ? Isn't a udelay needed to avoid
that many retries ?

> 
> Fixes: b899202901a8 ("mpt3sas: Add separate function for aero doorbell reads ")
> Cc: stable@vger.kernel.org
> Signed-off-by: Ranjan Kumar <ranjan.kumar@broadcom.com>

[..]

> diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.h b/drivers/scsi/mpt3sas/mpt3sas_base.h
> index 05364aa15ecd..3b8ec4fd2d21 100644
> --- a/drivers/scsi/mpt3sas/mpt3sas_base.h
> +++ b/drivers/scsi/mpt3sas/mpt3sas_base.h
> @@ -160,6 +160,8 @@
>  
>  #define IOC_OPERATIONAL_WAIT_COUNT	10
>  
> +#define READL_RETRY_COUNT_OF_THIRTY	30
> +#define READL_RETRY_COUNT_OF_THREE	3

Less than ideal naming I think. If the values need to be changed again, a lot of
code will need to change. What about soemthing like:

#define READL_RETRY_COUNT	30
#define READL_RETRY_SHORT_COUNT	3

>  /*
>   * NVMe defines
>   */
> @@ -994,7 +996,7 @@ typedef void (*NVME_BUILD_PRP)(struct MPT3SAS_ADAPTER *ioc, u16 smid,
>  typedef void (*PUT_SMID_IO_FP_HIP) (struct MPT3SAS_ADAPTER *ioc, u16 smid,
>  	u16 funcdep);
>  typedef void (*PUT_SMID_DEFAULT) (struct MPT3SAS_ADAPTER *ioc, u16 smid);
> -typedef u32 (*BASE_READ_REG) (const volatile void __iomem *addr);
> +typedef u32 (*BASE_READ_REG) (const volatile void __iomem *addr, u8 retry_count);
>  /*
>   * To get high iops reply queue's msix index when high iops mode is enabled
>   * else get the msix index of general reply queues.
Damien Le Moal July 5, 2023, 9:54 p.m. UTC | #2
On 7/5/23 21:37, Ranjan kumar wrote:
> Hi Damien,
> Regarding delay:
> As Sathya already mentioned  as this is our hardware specific behavior and
> we are confident that the increased retry count
> is sufficient from our hardware perspective for any new systems too. So, we
> want to go with this change .

Fine. Adding a comment above the macro definitions to explain something like
that would be nice.

> Apart from that, I will change the name as suggested .
> 
> Thanks & Regards,
> Ranjan

Please avoid top posting.

> On Thu, 29 Jun 2023 at 05:24, Damien Le Moal <dlemoal@kernel.org> wrote:
> 
>> On 6/28/23 16:05, Ranjan Kumar wrote:
>>> Doorbell and Host diagnostic registers could return 0 even
>>> after 3 retries and that leads to occasional resets of the
>>> controllers, hence increased the retry count to thirty.
>>
>> The magic value "3" for retry count was already that, magic. Why would
>> things
>> work better with 30 ? What is the reasoning ? Isn't a udelay needed to
>> avoid
>> that many retries ?
>>
>>>
>>> Fixes: b899202901a8 ("mpt3sas: Add separate function for aero doorbell
>> reads ")
>>> Cc: stable@vger.kernel.org
>>> Signed-off-by: Ranjan Kumar <ranjan.kumar@broadcom.com>
>>
>> [..]
>>
>>> diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.h
>> b/drivers/scsi/mpt3sas/mpt3sas_base.h
>>> index 05364aa15ecd..3b8ec4fd2d21 100644
>>> --- a/drivers/scsi/mpt3sas/mpt3sas_base.h
>>> +++ b/drivers/scsi/mpt3sas/mpt3sas_base.h
>>> @@ -160,6 +160,8 @@
>>>
>>>  #define IOC_OPERATIONAL_WAIT_COUNT   10
>>>
>>> +#define READL_RETRY_COUNT_OF_THIRTY  30
>>> +#define READL_RETRY_COUNT_OF_THREE   3
>>
>> Less than ideal naming I think. If the values need to be changed again, a
>> lot of
>> code will need to change. What about soemthing like:
>>
>> #define READL_RETRY_COUNT       30
>> #define READL_RETRY_SHORT_COUNT 3
>>
>>>  /*
>>>   * NVMe defines
>>>   */
>>> @@ -994,7 +996,7 @@ typedef void (*NVME_BUILD_PRP)(struct
>> MPT3SAS_ADAPTER *ioc, u16 smid,
>>>  typedef void (*PUT_SMID_IO_FP_HIP) (struct MPT3SAS_ADAPTER *ioc, u16
>> smid,
>>>       u16 funcdep);
>>>  typedef void (*PUT_SMID_DEFAULT) (struct MPT3SAS_ADAPTER *ioc, u16
>> smid);
>>> -typedef u32 (*BASE_READ_REG) (const volatile void __iomem *addr);
>>> +typedef u32 (*BASE_READ_REG) (const volatile void __iomem *addr, u8
>> retry_count);
>>>  /*
>>>   * To get high iops reply queue's msix index when high iops mode is
>> enabled
>>>   * else get the msix index of general reply queues.
>>
>> --
>> Damien Le Moal
>> Western Digital Research
>>
>>
>
diff mbox series

Patch

diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.c b/drivers/scsi/mpt3sas/mpt3sas_base.c
index 53f5492579cb..44e7ccb6f780 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_base.c
+++ b/drivers/scsi/mpt3sas/mpt3sas_base.c
@@ -201,20 +201,20 @@  module_param_call(mpt3sas_fwfault_debug, _scsih_set_fwfault_debug,
  * while reading the system interface register.
  */
 static inline u32
-_base_readl_aero(const volatile void __iomem *addr)
+_base_readl_aero(const volatile void __iomem *addr, u8 retry_count)
 {
 	u32 i = 0, ret_val;
 
 	do {
 		ret_val = readl(addr);
 		i++;
-	} while (ret_val == 0 && i < 3);
+	} while (ret_val == 0 && i < retry_count);
 
 	return ret_val;
 }
 
 static inline u32
-_base_readl(const volatile void __iomem *addr)
+_base_readl(const volatile void __iomem *addr, u8 retry_count)
 {
 	return readl(addr);
 }
@@ -940,7 +940,7 @@  mpt3sas_halt_firmware(struct MPT3SAS_ADAPTER *ioc)
 
 	dump_stack();
 
-	doorbell = ioc->base_readl(&ioc->chip->Doorbell);
+	doorbell = ioc->base_readl(&ioc->chip->Doorbell, READL_RETRY_COUNT_OF_THIRTY);
 	if ((doorbell & MPI2_IOC_STATE_MASK) == MPI2_IOC_STATE_FAULT) {
 		mpt3sas_print_fault_code(ioc, doorbell &
 		    MPI2_DOORBELL_DATA_MASK);
@@ -1617,10 +1617,10 @@  mpt3sas_base_mask_interrupts(struct MPT3SAS_ADAPTER *ioc)
 	u32 him_register;
 
 	ioc->mask_interrupts = 1;
-	him_register = ioc->base_readl(&ioc->chip->HostInterruptMask);
+	him_register = ioc->base_readl(&ioc->chip->HostInterruptMask, READL_RETRY_COUNT_OF_THREE);
 	him_register |= MPI2_HIM_DIM + MPI2_HIM_RIM + MPI2_HIM_RESET_IRQ_MASK;
 	writel(him_register, &ioc->chip->HostInterruptMask);
-	ioc->base_readl(&ioc->chip->HostInterruptMask);
+	ioc->base_readl(&ioc->chip->HostInterruptMask, READL_RETRY_COUNT_OF_THREE);
 }
 
 /**
@@ -1634,7 +1634,7 @@  mpt3sas_base_unmask_interrupts(struct MPT3SAS_ADAPTER *ioc)
 {
 	u32 him_register;
 
-	him_register = ioc->base_readl(&ioc->chip->HostInterruptMask);
+	him_register = ioc->base_readl(&ioc->chip->HostInterruptMask, READL_RETRY_COUNT_OF_THREE);
 	him_register &= ~MPI2_HIM_RIM;
 	writel(him_register, &ioc->chip->HostInterruptMask);
 	ioc->mask_interrupts = 0;
@@ -6686,7 +6686,7 @@  mpt3sas_base_get_iocstate(struct MPT3SAS_ADAPTER *ioc, int cooked)
 {
 	u32 s, sc;
 
-	s = ioc->base_readl(&ioc->chip->Doorbell);
+	s = ioc->base_readl(&ioc->chip->Doorbell, READL_RETRY_COUNT_OF_THIRTY);
 	sc = s & MPI2_IOC_STATE_MASK;
 	return cooked ? sc : s;
 }
@@ -6760,7 +6760,8 @@  _base_wait_for_doorbell_int(struct MPT3SAS_ADAPTER *ioc, int timeout)
 	count = 0;
 	cntdn = 1000 * timeout;
 	do {
-		int_status = ioc->base_readl(&ioc->chip->HostInterruptStatus);
+		int_status = ioc->base_readl(&ioc->chip->HostInterruptStatus,
+							READL_RETRY_COUNT_OF_THREE);
 		if (int_status & MPI2_HIS_IOC2SYS_DB_STATUS) {
 			dhsprintk(ioc,
 				  ioc_info(ioc, "%s: successful count(%d), timeout(%d)\n",
@@ -6786,7 +6787,8 @@  _base_spin_on_doorbell_int(struct MPT3SAS_ADAPTER *ioc, int timeout)
 	count = 0;
 	cntdn = 2000 * timeout;
 	do {
-		int_status = ioc->base_readl(&ioc->chip->HostInterruptStatus);
+		int_status = ioc->base_readl(&ioc->chip->HostInterruptStatus,
+							READL_RETRY_COUNT_OF_THREE);
 		if (int_status & MPI2_HIS_IOC2SYS_DB_STATUS) {
 			dhsprintk(ioc,
 				  ioc_info(ioc, "%s: successful count(%d), timeout(%d)\n",
@@ -6824,14 +6826,16 @@  _base_wait_for_doorbell_ack(struct MPT3SAS_ADAPTER *ioc, int timeout)
 	count = 0;
 	cntdn = 1000 * timeout;
 	do {
-		int_status = ioc->base_readl(&ioc->chip->HostInterruptStatus);
+		int_status = ioc->base_readl(&ioc->chip->HostInterruptStatus,
+							READL_RETRY_COUNT_OF_THREE);
 		if (!(int_status & MPI2_HIS_SYS2IOC_DB_STATUS)) {
 			dhsprintk(ioc,
 				  ioc_info(ioc, "%s: successful count(%d), timeout(%d)\n",
 					   __func__, count, timeout));
 			return 0;
 		} else if (int_status & MPI2_HIS_IOC2SYS_DB_STATUS) {
-			doorbell = ioc->base_readl(&ioc->chip->Doorbell);
+			doorbell = ioc->base_readl(&ioc->chip->Doorbell,
+							READL_RETRY_COUNT_OF_THIRTY);
 			if ((doorbell & MPI2_IOC_STATE_MASK) ==
 			    MPI2_IOC_STATE_FAULT) {
 				mpt3sas_print_fault_code(ioc, doorbell);
@@ -6871,7 +6875,7 @@  _base_wait_for_doorbell_not_used(struct MPT3SAS_ADAPTER *ioc, int timeout)
 	count = 0;
 	cntdn = 1000 * timeout;
 	do {
-		doorbell_reg = ioc->base_readl(&ioc->chip->Doorbell);
+		doorbell_reg = ioc->base_readl(&ioc->chip->Doorbell, READL_RETRY_COUNT_OF_THIRTY);
 		if (!(doorbell_reg & MPI2_DOORBELL_USED)) {
 			dhsprintk(ioc,
 				  ioc_info(ioc, "%s: successful count(%d), timeout(%d)\n",
@@ -7019,13 +7023,13 @@  _base_handshake_req_reply_wait(struct MPT3SAS_ADAPTER *ioc, int request_bytes,
 	__le32 *mfp;
 
 	/* make sure doorbell is not in use */
-	if ((ioc->base_readl(&ioc->chip->Doorbell) & MPI2_DOORBELL_USED)) {
+	if ((ioc->base_readl(&ioc->chip->Doorbell, READL_RETRY_COUNT_OF_THIRTY) & MPI2_DOORBELL_USED)) {
 		ioc_err(ioc, "doorbell is in use (line=%d)\n", __LINE__);
 		return -EFAULT;
 	}
 
 	/* clear pending doorbell interrupts from previous state changes */
-	if (ioc->base_readl(&ioc->chip->HostInterruptStatus) &
+	if (ioc->base_readl(&ioc->chip->HostInterruptStatus, READL_RETRY_COUNT_OF_THREE) &
 	    MPI2_HIS_IOC2SYS_DB_STATUS)
 		writel(0, &ioc->chip->HostInterruptStatus);
 
@@ -7068,7 +7072,7 @@  _base_handshake_req_reply_wait(struct MPT3SAS_ADAPTER *ioc, int request_bytes,
 	}
 
 	/* read the first two 16-bits, it gives the total length of the reply */
-	reply[0] = le16_to_cpu(ioc->base_readl(&ioc->chip->Doorbell)
+	reply[0] = le16_to_cpu(ioc->base_readl(&ioc->chip->Doorbell, READL_RETRY_COUNT_OF_THIRTY)
 	    & MPI2_DOORBELL_DATA_MASK);
 	writel(0, &ioc->chip->HostInterruptStatus);
 	if ((_base_wait_for_doorbell_int(ioc, 5))) {
@@ -7076,7 +7080,7 @@  _base_handshake_req_reply_wait(struct MPT3SAS_ADAPTER *ioc, int request_bytes,
 			__LINE__);
 		return -EFAULT;
 	}
-	reply[1] = le16_to_cpu(ioc->base_readl(&ioc->chip->Doorbell)
+	reply[1] = le16_to_cpu(ioc->base_readl(&ioc->chip->Doorbell, READL_RETRY_COUNT_OF_THIRTY)
 	    & MPI2_DOORBELL_DATA_MASK);
 	writel(0, &ioc->chip->HostInterruptStatus);
 
@@ -7087,10 +7091,10 @@  _base_handshake_req_reply_wait(struct MPT3SAS_ADAPTER *ioc, int request_bytes,
 			return -EFAULT;
 		}
 		if (i >=  reply_bytes/2) /* overflow case */
-			ioc->base_readl(&ioc->chip->Doorbell);
+			ioc->base_readl(&ioc->chip->Doorbell, READL_RETRY_COUNT_OF_THIRTY);
 		else
 			reply[i] = le16_to_cpu(
-			    ioc->base_readl(&ioc->chip->Doorbell)
+			    ioc->base_readl(&ioc->chip->Doorbell, READL_RETRY_COUNT_OF_THIRTY)
 			    & MPI2_DOORBELL_DATA_MASK);
 		writel(0, &ioc->chip->HostInterruptStatus);
 	}
@@ -7949,14 +7953,15 @@  _base_diag_reset(struct MPT3SAS_ADAPTER *ioc)
 			goto out;
 		}
 
-		host_diagnostic = ioc->base_readl(&ioc->chip->HostDiagnostic);
+		host_diagnostic = ioc->base_readl(&ioc->chip->HostDiagnostic,
+								READL_RETRY_COUNT_OF_THIRTY);
 		drsprintk(ioc,
 			  ioc_info(ioc, "wrote magic sequence: count(%d), host_diagnostic(0x%08x)\n",
 				   count, host_diagnostic));
 
 	} while ((host_diagnostic & MPI2_DIAG_DIAG_WRITE_ENABLE) == 0);
 
-	hcb_size = ioc->base_readl(&ioc->chip->HCBSize);
+	hcb_size = ioc->base_readl(&ioc->chip->HCBSize, READL_RETRY_COUNT_OF_THREE);
 
 	drsprintk(ioc, ioc_info(ioc, "diag reset: issued\n"));
 	writel(host_diagnostic | MPI2_DIAG_RESET_ADAPTER,
@@ -7969,7 +7974,8 @@  _base_diag_reset(struct MPT3SAS_ADAPTER *ioc)
 	for (count = 0; count < (300000000 /
 		MPI2_HARD_RESET_PCIE_SECOND_READ_DELAY_MICRO_SEC); count++) {
 
-		host_diagnostic = ioc->base_readl(&ioc->chip->HostDiagnostic);
+		host_diagnostic = ioc->base_readl(&ioc->chip->HostDiagnostic,
+								READL_RETRY_COUNT_OF_THIRTY);
 
 		if (host_diagnostic == 0xFFFFFFFF) {
 			ioc_info(ioc,
diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.h b/drivers/scsi/mpt3sas/mpt3sas_base.h
index 05364aa15ecd..3b8ec4fd2d21 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_base.h
+++ b/drivers/scsi/mpt3sas/mpt3sas_base.h
@@ -160,6 +160,8 @@ 
 
 #define IOC_OPERATIONAL_WAIT_COUNT	10
 
+#define READL_RETRY_COUNT_OF_THIRTY	30
+#define READL_RETRY_COUNT_OF_THREE	3
 /*
  * NVMe defines
  */
@@ -994,7 +996,7 @@  typedef void (*NVME_BUILD_PRP)(struct MPT3SAS_ADAPTER *ioc, u16 smid,
 typedef void (*PUT_SMID_IO_FP_HIP) (struct MPT3SAS_ADAPTER *ioc, u16 smid,
 	u16 funcdep);
 typedef void (*PUT_SMID_DEFAULT) (struct MPT3SAS_ADAPTER *ioc, u16 smid);
-typedef u32 (*BASE_READ_REG) (const volatile void __iomem *addr);
+typedef u32 (*BASE_READ_REG) (const volatile void __iomem *addr, u8 retry_count);
 /*
  * To get high iops reply queue's msix index when high iops mode is enabled
  * else get the msix index of general reply queues.