
[3/5] arm64: refactor delay() to enable polling for value

Message ID 20241105183041.1531976-4-harisokn@amazon.com (mailing list archive)
State New
Series [1/5] asm-generic: add smp_vcond_load_relaxed()

Commit Message

Okanovic, Haris Nov. 5, 2024, 6:30 p.m. UTC
Refactor arm64's delay() to poll for a mask/value condition (vcond) in
its wfet(), wfe(), and relaxed polling loops.

Signed-off-by: Haris Okanovic <harisokn@amazon.com>
---
 arch/arm64/lib/delay.c | 70 ++++++++++++++++++++++++++++++------------
 1 file changed, 50 insertions(+), 20 deletions(-)
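
To illustrate the new interface, here is a hedged sketch of a caller; only
__delay_until_ul() and USECS_TO_CYCLES() come from the patch below, and the
wait_for_flag() wrapper is hypothetical. The helper returns the last value
read from *addr once either (*addr & mask) == val is observed or the cycle
budget expires:

	/* Hypothetical caller: spin for at most 10us waiting for bit 0 of *flag. */
	static bool wait_for_flag(unsigned long *flag)
	{
		unsigned long seen;

		seen = __delay_until_ul(USECS_TO_CYCLES(10), flag, 0x1UL, 0x1UL);
		return (seen & 0x1UL) == 0x1UL;
	}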

Comments

Christoph Lameter (Ampere) Nov. 5, 2024, 7:42 p.m. UTC | #1
On Tue, 5 Nov 2024, Haris Okanovic wrote:

> -#define USECS_TO_CYCLES(time_usecs)			\
> -	xloops_to_cycles((time_usecs) * 0x10C7UL)
> -
> -static inline unsigned long xloops_to_cycles(unsigned long xloops)
> +static inline u64 xloops_to_cycles(u64 xloops)
>  {
>  	return (xloops * loops_per_jiffy * HZ) >> 32;
>  }
>
> -void __delay(unsigned long cycles)
> +#define USECS_TO_XLOOPS(time_usecs) \
> +	((time_usecs) * 0x10C7UL)
> +
> +#define USECS_TO_CYCLES(time_usecs) \
> +	xloops_to_cycles(USECS_TO_XLOOPS(time_usecs))
> +


> +#define NSECS_TO_XLOOPS(time_nsecs) \
> +	((time_nsecs) * 0x10C7UL)

The constant here is the same value as for microseconds. If I remember
correctly it's 5UL for nanoseconds.
Catalin Marinas Nov. 6, 2024, 9:18 a.m. UTC | #2
On Tue, Nov 05, 2024 at 12:30:39PM -0600, Haris Okanovic wrote:
> +		do {
> +			cur = __READ_ONCE_EX(*addr);
> +			if ((cur & mask) == val) {
> +				break;
> +			}
>  			wfet(end);

Constructs like this need to be entirely in assembly. The compiler may
spill 'cur' onto the stack and the write could clear the exclusive
monitor which makes the wfet return immediately. That's highly CPU
implementation specific but it's the reason we have functions like
__cmpwait() in assembly (or whatever else deals with exclusives). IOW,
we can't have other memory accesses between the LDXR and whatever is
consuming the exclusive monitor armed state (typically STXR but WFE/WFET
can be another).
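
A minimal sketch of such a mask/value wait kept entirely in one asm block,
loosely modeled on __cmpwait() in arch/arm64/include/asm/cmpxchg.h (the
helper name and exact sequence here are illustrative, not part of this
patch or of mainline):

	static inline void __vcond_wait(volatile unsigned long *ptr,
					unsigned long mask, unsigned long val)
	{
		unsigned long tmp;

		asm volatile(
		/* sevl/wfe pair drains the event register so the final wfe really waits */
		"	sevl\n"
		"	wfe\n"
		/* ldxr arms the exclusive monitor on *ptr */
		"	ldxr	%[tmp], %[v]\n"
		"	and	%[tmp], %[tmp], %[mask]\n"
		"	eor	%[tmp], %[tmp], %[val]\n"
		/* condition already true: skip the wait */
		"	cbz	%[tmp], 1f\n"
		/* otherwise wait; a write to *ptr clears the monitor and wakes us */
		"	wfe\n"
		"1:"
		: [tmp] "=&r" (tmp), [v] "+Q" (*ptr)
		: [mask] "r" (mask), [val] "r" (val));
	}

With no compiler-generated memory access between the LDXR and the WFE, the
exclusive monitor stays armed for the duration of the wait.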
Okanovic, Haris Nov. 6, 2024, 5:38 p.m. UTC | #3
On Wed, 2024-11-06 at 09:18 +0000, Catalin Marinas wrote:
> On Tue, Nov 05, 2024 at 12:30:39PM -0600, Haris Okanovic wrote:
> > +             do {
> > +                     cur = __READ_ONCE_EX(*addr);
> > +                     if ((cur & mask) == val) {
> > +                             break;
> > +                     }
> >                       wfet(end);
> 
> Constructs like this need to be entirely in assembly. The compiler may
> spill 'cur' onto the stack and the write could clear the exclusive
> monitor which makes the wfet return immediately. That's highly CPU
> implementation specific but it's the reason we have functions like
> __cmpwait() in assembly (or whatever else deals with exclusives). IOW,
> we can't have other memory accesses between the LDXR and whatever is
> consuming the exclusive monitor armed state (typically STXR but WFE/WFET
> can be another).

Agreed, will rewrite parts of delay() in asm.

> 
> --
> Catalin
Okanovic, Haris Nov. 6, 2024, 5:42 p.m. UTC | #4
On Tue, 2024-11-05 at 11:42 -0800, Christoph Lameter (Ampere) wrote:
> On Tue, 5 Nov 2024, Haris Okanovic wrote:
> 
> > -#define USECS_TO_CYCLES(time_usecs)                  \
> > -     xloops_to_cycles((time_usecs) * 0x10C7UL)
> > -
> > -static inline unsigned long xloops_to_cycles(unsigned long xloops)
> > +static inline u64 xloops_to_cycles(u64 xloops)
> >  {
> >       return (xloops * loops_per_jiffy * HZ) >> 32;
> >  }
> > 
> > -void __delay(unsigned long cycles)
> > +#define USECS_TO_XLOOPS(time_usecs) \
> > +     ((time_usecs) * 0x10C7UL)
> > +
> > +#define USECS_TO_CYCLES(time_usecs) \
> > +     xloops_to_cycles(USECS_TO_XLOOPS(time_usecs))
> > +
> 
> 
> > +#define NSECS_TO_XLOOPS(time_nsecs) \
> > +     ((time_nsecs) * 0x10C7UL)
> 
> The constant here is the same value as for microseconds. If I remember
> correctly it's 5UL for nanoseconds.
> 

You're right, good catch. Should be `nsecs * 0x5UL` per old code.
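
For reference, the scale factors are ceil(2^32 / 10^6) = 0x10C7 for
microseconds and ceil(2^32 / 10^9) = 0x5 for nanoseconds, so the corrected
macro would mirror the old __ndelay() math (sketch):

	#define NSECS_TO_XLOOPS(time_nsecs) \
		((time_nsecs) * 0x5UL)	/* 2**32 / 1000000000 (rounded up) */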

Patch

diff --git a/arch/arm64/lib/delay.c b/arch/arm64/lib/delay.c
index cb2062e7e234..a7c3040af316 100644
--- a/arch/arm64/lib/delay.c
+++ b/arch/arm64/lib/delay.c
@@ -14,43 +14,73 @@ 
 #include <linux/timex.h>
 
 #include <clocksource/arm_arch_timer.h>
+#include <asm/readex.h>
 
-#define USECS_TO_CYCLES(time_usecs)			\
-	xloops_to_cycles((time_usecs) * 0x10C7UL)
-
-static inline unsigned long xloops_to_cycles(unsigned long xloops)
+static inline u64 xloops_to_cycles(u64 xloops)
 {
 	return (xloops * loops_per_jiffy * HZ) >> 32;
 }
 
-void __delay(unsigned long cycles)
+#define USECS_TO_XLOOPS(time_usecs) \
+	((time_usecs) * 0x10C7UL)
+
+#define USECS_TO_CYCLES(time_usecs) \
+	xloops_to_cycles(USECS_TO_XLOOPS(time_usecs))
+
+#define NSECS_TO_XLOOPS(time_nsecs) \
+	((time_nsecs) * 0x10C7UL)
+
+#define NSECS_TO_CYCLES(time_nsecs) \
+	xloops_to_cycles(NSECS_TO_XLOOPS(time_nsecs))
+
+static unsigned long __delay_until_ul(u64 cycles, unsigned long* addr, unsigned long mask, unsigned long val)
 {
-	cycles_t start = get_cycles();
+	u64 start = get_cycles();
+	unsigned long cur;
 
 	if (alternative_has_cap_unlikely(ARM64_HAS_WFXT)) {
 		u64 end = start + cycles;
 
-		/*
-		 * Start with WFIT. If an interrupt makes us resume
-		 * early, use a WFET loop to complete the delay.
-		 */
-		wfit(end);
-		while ((get_cycles() - start) < cycles)
+		do {
+			cur = __READ_ONCE_EX(*addr);
+			if ((cur & mask) == val) {
+				break;
+			}
 			wfet(end);
-	} else 	if (arch_timer_evtstrm_available()) {
-		const cycles_t timer_evt_period =
+		} while ((get_cycles() - start) < cycles);
+	} else if (arch_timer_evtstrm_available()) {
+		const u64 timer_evt_period =
 			USECS_TO_CYCLES(ARCH_TIMER_EVT_STREAM_PERIOD_US);
 
-		while ((get_cycles() - start + timer_evt_period) < cycles)
+		do {
+			cur = __READ_ONCE_EX(*addr);
+			if ((cur & mask) == val) {
+				break;
+			}
 			wfe();
+		} while ((get_cycles() - start + timer_evt_period) < cycles);
+	} else {
+		do {
+			cur = __READ_ONCE_EX(*addr);
+			if ((cur & mask) == val) {
+				break;
+			}
+			cpu_relax();
+		} while ((get_cycles() - start) < cycles);
 	}
 
-	while ((get_cycles() - start) < cycles)
-		cpu_relax();
+	return cur;
+}
+
+void __delay(unsigned long cycles)
+{
+	/* constant word for wfet()/wfe() to poll */
+	unsigned long dummy ____cacheline_aligned = 0;
+	__delay_until_ul(cycles, &dummy, 0, 1);
 }
 EXPORT_SYMBOL(__delay);
 
-inline void __const_udelay(unsigned long xloops)
+void __const_udelay(unsigned long xloops)
 {
 	__delay(xloops_to_cycles(xloops));
 }
@@ -58,12 +88,12 @@  EXPORT_SYMBOL(__const_udelay);
 
 void __udelay(unsigned long usecs)
 {
-	__const_udelay(usecs * 0x10C7UL); /* 2**32 / 1000000 (rounded up) */
+	__delay(USECS_TO_CYCLES(usecs));
 }
 EXPORT_SYMBOL(__udelay);
 
 void __ndelay(unsigned long nsecs)
 {
-	__const_udelay(nsecs * 0x5UL); /* 2**32 / 1000000000 (rounded up) */
+	__delay(NSECS_TO_CYCLES(nsecs));
 }
 EXPORT_SYMBOL(__ndelay);