diff mbox series

[v2,08/11] spi: cadence-qspi: add early busywait to cqspi_wait_for_bit()

Message ID 20240405-cdns-qspi-mbly-v2-8-956679866d6d@bootlin.com (mailing list archive)
State Superseded
Headers show
Series spi: cadence-qspi: add Mobileye EyeQ5 support | expand

Commit Message

Théo Lebrun April 5, 2024, 3:02 p.m. UTC
If the CQSPI_BUSYWAIT_EARLY quirk flag is on, call
readl_relaxed_poll_timeout() with no sleep at the start of
cqspi_wait_for_bit(). If its short timeout expires, a sleeping
readl_relaxed_poll_timeout() call takes the relay.

Behavior is hidden behind a quirk flag to keep the previous behavior the
same on all platforms.

The reason is to avoid hrtimer interrupts on the system. All read
operations take less than 100µs.

Signed-off-by: Théo Lebrun <theo.lebrun@bootlin.com>
---
 drivers/spi/spi-cadence-quadspi.c | 34 +++++++++++++++++++++++++---------
 1 file changed, 25 insertions(+), 9 deletions(-)

Comments

Mark Brown April 8, 2024, 2:16 p.m. UTC | #1
On Fri, Apr 05, 2024 at 05:02:18PM +0200, Théo Lebrun wrote:

> If the CQSPI_BUSYWAIT_EARLY quirk flag is on, call
> readl_relaxed_poll_timeout() with no sleep at the start of
> cqspi_wait_for_bit(). If its short timeout expires, a sleeping
> readl_relaxed_poll_timeout() call takes the relay.
> 
> Behavior is hidden behind a quirk flag to keep the previous behavior the
> same on all platforms.
> 
> The reason is to avoid hrtimer interrupts on the system. All read
> operations take less than 100µs.

Why would this be platform specific, this seems like a very standard
optimisation technique?
Théo Lebrun April 8, 2024, 2:42 p.m. UTC | #2
Hello,

On Mon Apr 8, 2024 at 4:16 PM CEST, Mark Brown wrote:
> On Fri, Apr 05, 2024 at 05:02:18PM +0200, Théo Lebrun wrote:
>
> > If the CQSPI_BUSYWAIT_EARLY quirk flag is on, call
> > readl_relaxed_poll_timeout() with no sleep at the start of
> > cqspi_wait_for_bit(). If its short timeout expires, a sleeping
> > readl_relaxed_poll_timeout() call takes the relay.
> > 
> > Behavior is hidden behind a quirk flag to keep the previous behavior the
> > same on all platforms.
> > 
> > The reason is to avoid hrtimer interrupts on the system. All read
> > operations take less than 100µs.
>
> Why would this be platform specific, this seems like a very standard
> optimisation technique?

It does not make sense if you know that all read operations take more
than 100µs. I preferred being conservative. If you confirm it makes
sense I'll remove the quirk.

Regards,

--
Théo Lebrun, Bootlin
Embedded Linux and Kernel engineering
https://bootlin.com
Mark Brown April 8, 2024, 4:40 p.m. UTC | #3
On Mon, Apr 08, 2024 at 04:42:43PM +0200, Théo Lebrun wrote:
> On Mon Apr 8, 2024 at 4:16 PM CEST, Mark Brown wrote:
> > On Fri, Apr 05, 2024 at 05:02:18PM +0200, Théo Lebrun wrote:

> > > The reason is to avoid hrtimer interrupts on the system. All read
> > > operations take less than 100µs.

> > Why would this be platform specific, this seems like a very standard
> > optimisation technique?

> It does not make sense if you know that all read operations take more
> than 100µs. I preferred being conservative. If you confirm it makes
> sense I'll remove the quirk.

It does seem plausible at least, and the time could be made a tuneable
with quirks or otherwise if that's needed.  I think I'd expect the MIPS
platform you're working with to be towards the lower end of performance
for systems that are new enough to have this hardware.
Théo Lebrun April 9, 2024, 10:09 a.m. UTC | #4
Hello,

On Mon Apr 8, 2024 at 6:40 PM CEST, Mark Brown wrote:
> On Mon, Apr 08, 2024 at 04:42:43PM +0200, Théo Lebrun wrote:
> > On Mon Apr 8, 2024 at 4:16 PM CEST, Mark Brown wrote:
> > > On Fri, Apr 05, 2024 at 05:02:18PM +0200, Théo Lebrun wrote:
>
> > > > The reason is to avoid hrtimer interrupts on the system. All read
> > > > operations take less than 100µs.
>
> > > Why would this be platform specific, this seems like a very standard
> > > optimisation technique?
>
> > It does not make sense if you know that all read operations take more
> > than 100µs. I preferred being conservative. If you confirm it makes
> > sense I'll remove the quirk.
>
> It does seem plausible at least, and the time could be made a tuneable
> with quirks or otherwise if that's needed.  I think I'd expect the MIPS
> platform you're working with to be towards the lower end of performance
> for systems that are new enough to have this hardware.

Next revision will do the same busywait behavior unconditionally then.

Thanks!

--
Théo Lebrun, Bootlin
Embedded Linux and Kernel engineering
https://bootlin.com
diff mbox series

Patch

diff --git a/drivers/spi/spi-cadence-quadspi.c b/drivers/spi/spi-cadence-quadspi.c
index ebb8c35f50fd..230aad490e03 100644
--- a/drivers/spi/spi-cadence-quadspi.c
+++ b/drivers/spi/spi-cadence-quadspi.c
@@ -44,6 +44,7 @@  static_assert(CQSPI_MAX_CHIPSELECT <= SPI_CS_CNT_MAX);
 #define CQSPI_NEEDS_APB_AHB_HAZARD_WAR	BIT(5)
 #define CQSPI_DETECT_FIFO_DEPTH		BIT(6)
 #define CQSPI_RD_NO_IRQ			BIT(7)
+#define CQSPI_BUSYWAIT_EARLY		BIT(8)
 
 /* Capabilities */
 #define CQSPI_SUPPORTS_OCTAL		BIT(0)
@@ -110,7 +111,7 @@  struct cqspi_st {
 
 struct cqspi_driver_platdata {
 	u32 hwcaps_mask;
-	u8 quirks;
+	u16 quirks;
 	int (*indirect_read_dma)(struct cqspi_flash_pdata *f_pdata,
 				 u_char *rxbuf, loff_t from_addr, size_t n_rx);
 	u32 (*get_dma_status)(struct cqspi_st *cqspi);
@@ -121,6 +122,7 @@  struct cqspi_driver_platdata {
 /* Operation timeout value */
 #define CQSPI_TIMEOUT_MS			500
 #define CQSPI_READ_TIMEOUT_MS			10
+#define CQSPI_BUSYWAIT_TIMEOUT_US		500
 
 /* Runtime_pm autosuspend delay */
 #define CQSPI_AUTOSUSPEND_TIMEOUT		2000
@@ -299,13 +301,27 @@  struct cqspi_driver_platdata {
 
 #define CQSPI_REG_VERSAL_DMA_VAL		0x602
 
-static int cqspi_wait_for_bit(void __iomem *reg, const u32 mask, bool clr)
+static int cqspi_wait_for_bit(const struct cqspi_driver_platdata *ddata,
+			      void __iomem *reg, const u32 mask, bool clr,
+			      bool busywait)
 {
+	u64 timeout_us = CQSPI_TIMEOUT_MS * USEC_PER_MSEC;
 	u32 val;
 
+	if (busywait && ddata && ddata->quirks & CQSPI_BUSYWAIT_EARLY) {
+		int ret = readl_relaxed_poll_timeout(reg, val,
+						     (((clr ? ~val : val) & mask) == mask),
+						     0, CQSPI_BUSYWAIT_TIMEOUT_US);
+
+		if (ret != -ETIMEDOUT)
+			return ret;
+
+		timeout_us -= CQSPI_BUSYWAIT_TIMEOUT_US;
+	}
+
 	return readl_relaxed_poll_timeout(reg, val,
 					  (((clr ? ~val : val) & mask) == mask),
-					  10, CQSPI_TIMEOUT_MS * 1000);
+					  10, timeout_us);
 }
 
 static bool cqspi_is_idle(struct cqspi_st *cqspi)
@@ -435,8 +451,8 @@  static int cqspi_exec_flash_cmd(struct cqspi_st *cqspi, unsigned int reg)
 	writel(reg, reg_base + CQSPI_REG_CMDCTRL);
 
 	/* Polling for completion. */
-	ret = cqspi_wait_for_bit(reg_base + CQSPI_REG_CMDCTRL,
-				 CQSPI_REG_CMDCTRL_INPROGRESS_MASK, 1);
+	ret = cqspi_wait_for_bit(cqspi->ddata, reg_base + CQSPI_REG_CMDCTRL,
+				 CQSPI_REG_CMDCTRL_INPROGRESS_MASK, 1, true);
 	if (ret) {
 		dev_err(&cqspi->pdev->dev,
 			"Flash command execution timed out.\n");
@@ -791,8 +807,8 @@  static int cqspi_indirect_read_execute(struct cqspi_flash_pdata *f_pdata,
 	}
 
 	/* Check indirect done status */
-	ret = cqspi_wait_for_bit(reg_base + CQSPI_REG_INDIRECTRD,
-				 CQSPI_REG_INDIRECTRD_DONE_MASK, 0);
+	ret = cqspi_wait_for_bit(cqspi->ddata, reg_base + CQSPI_REG_INDIRECTRD,
+				 CQSPI_REG_INDIRECTRD_DONE_MASK, 0, true);
 	if (ret) {
 		dev_err(dev, "Indirect read completion error (%i)\n", ret);
 		goto failrd;
@@ -1092,8 +1108,8 @@  static int cqspi_indirect_write_execute(struct cqspi_flash_pdata *f_pdata,
 	}
 
 	/* Check indirect done status */
-	ret = cqspi_wait_for_bit(reg_base + CQSPI_REG_INDIRECTWR,
-				 CQSPI_REG_INDIRECTWR_DONE_MASK, 0);
+	ret = cqspi_wait_for_bit(cqspi->ddata, reg_base + CQSPI_REG_INDIRECTWR,
+				 CQSPI_REG_INDIRECTWR_DONE_MASK, 0, false);
 	if (ret) {
 		dev_err(dev, "Indirect write completion error (%i)\n", ret);
 		goto failwr;