Message ID | 20241105183041.1531976-4-harisokn@amazon.com (mailing list archive) |
---|---|
State | New |
Series | [1/5] asm-generic: add smp_vcond_load_relaxed() |
On Tue, 5 Nov 2024, Haris Okanovic wrote:

> -#define USECS_TO_CYCLES(time_usecs) \
> -        xloops_to_cycles((time_usecs) * 0x10C7UL)
> -
> -static inline unsigned long xloops_to_cycles(unsigned long xloops)
> +static inline u64 xloops_to_cycles(u64 xloops)
>  {
>          return (xloops * loops_per_jiffy * HZ) >> 32;
>  }
>
> -void __delay(unsigned long cycles)
> +#define USECS_TO_XLOOPS(time_usecs) \
> +        ((time_usecs) * 0x10C7UL)
> +
> +#define USECS_TO_CYCLES(time_usecs) \
> +        xloops_to_cycles(USECS_TO_XLOOPS(time_usecs))
> +
> +#define NSECS_TO_XLOOPS(time_nsecs) \
> +        ((time_nsecs) * 0x10C7UL)

The constant here is the same value as for microseconds. If I remember
correctly it's 5UL for nanoseconds.
On Tue, Nov 05, 2024 at 12:30:39PM -0600, Haris Okanovic wrote:
> +        do {
> +                cur = __READ_ONCE_EX(*addr);
> +                if ((cur & mask) == val) {
> +                        break;
> +                }
>                  wfet(end);

Constructs like this need to be entirely in assembly. The compiler may
spill 'cur' onto the stack and the write could clear the exclusive
monitor which makes the wfet return immediately. That's highly CPU
implementation specific but it's the reason we have functions like
__cmpwait() in assembly (or whatever else deals with exclusives). IOW,
we can't have other memory accesses between the LDXR and whatever is
consuming the exclusive monitor armed state (typically STXR but
WFE/WFET can be another).
On Wed, 2024-11-06 at 09:18 +0000, Catalin Marinas wrote:
> On Tue, Nov 05, 2024 at 12:30:39PM -0600, Haris Okanovic wrote:
> > +        do {
> > +                cur = __READ_ONCE_EX(*addr);
> > +                if ((cur & mask) == val) {
> > +                        break;
> > +                }
> >                  wfet(end);
>
> Constructs like this need to be entirely in assembly. The compiler may
> spill 'cur' onto the stack and the write could clear the exclusive
> monitor which makes the wfet return immediately. That's highly CPU
> implementation specific but it's the reason we have functions like
> __cmpwait() in assembly (or whatever else deals with exclusives). IOW,
> we can't have other memory accesses between the LDXR and whatever is
> consuming the exclusive monitor armed state (typically STXR but
> WFE/WFET can be another).

Agreed, will rewrite parts of delay() in asm.

> --
> Catalin
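For illustration, the fix Catalin describes means folding the load-exclusive,
the mask/compare, and the wait into a single asm block, the way __cmpwait()
does, so the compiler cannot emit a store between LDXR and WFET. A rough
sketch follows; the helper name and operand layout are made up for this
example, it assumes the assembler accepts the wfet mnemonic (FEAT_WFxT), and
it omits the sevl/wfe prologue that __cmpwait() uses:

/*
 * Illustrative sketch only, not the actual patch: LDXR and WFET stay
 * adjacent in one asm block so no compiler-generated memory access can
 * clear the exclusive monitor in between.
 */
static inline u64 __vcond_wait_ex(u64 *addr, u64 mask, u64 val, u64 end)
{
        u64 cur, tmp;

        asm volatile(
        "       ldxr    %[cur], %[v]\n"            /* arm the exclusive monitor */
        "       and     %[tmp], %[cur], %[mask]\n"
        "       cmp     %[tmp], %[val]\n"
        "       b.eq    1f\n"                      /* condition already met */
        "       wfet    %[end]\n"                  /* wait for event, store to *addr, or timeout */
        "1:"
        : [cur] "=&r" (cur), [tmp] "=&r" (tmp), [v] "+Q" (*addr)
        : [mask] "r" (mask), [val] "r" (val), [end] "r" (end));

        return cur;
}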
On Tue, 2024-11-05 at 11:42 -0800, Christoph Lameter (Ampere) wrote:
> On Tue, 5 Nov 2024, Haris Okanovic wrote:
>
> > -#define USECS_TO_CYCLES(time_usecs) \
> > -        xloops_to_cycles((time_usecs) * 0x10C7UL)
> > -
> > -static inline unsigned long xloops_to_cycles(unsigned long xloops)
> > +static inline u64 xloops_to_cycles(u64 xloops)
> >  {
> >          return (xloops * loops_per_jiffy * HZ) >> 32;
> >  }
> >
> > -void __delay(unsigned long cycles)
> > +#define USECS_TO_XLOOPS(time_usecs) \
> > +        ((time_usecs) * 0x10C7UL)
> > +
> > +#define USECS_TO_CYCLES(time_usecs) \
> > +        xloops_to_cycles(USECS_TO_XLOOPS(time_usecs))
> > +
> > +#define NSECS_TO_XLOOPS(time_nsecs) \
> > +        ((time_nsecs) * 0x10C7UL)
>
> The constant here is the same value as for microseconds. If I remember
> correctly it's 5UL for nanoseconds.

You're right, good catch. Should be `nsecs * 0x5UL` per old code.
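For reference, the removed __ndelay() line in the diff below carries the
right constant: 0x5UL is 2**32 / 1000000000 rounded up. The corrected macro
would presumably read:

/* 2**32 / 1000000000 (rounded up), matching the old __ndelay() scaling */
#define NSECS_TO_XLOOPS(time_nsecs) \
        ((time_nsecs) * 0x5UL)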
diff --git a/arch/arm64/lib/delay.c b/arch/arm64/lib/delay.c
index cb2062e7e234..a7c3040af316 100644
--- a/arch/arm64/lib/delay.c
+++ b/arch/arm64/lib/delay.c
@@ -14,43 +14,73 @@
 #include <linux/timex.h>
 
 #include <clocksource/arm_arch_timer.h>
+#include <asm/readex.h>
 
-#define USECS_TO_CYCLES(time_usecs)			\
-	xloops_to_cycles((time_usecs) * 0x10C7UL)
-
-static inline unsigned long xloops_to_cycles(unsigned long xloops)
+static inline u64 xloops_to_cycles(u64 xloops)
 {
 	return (xloops * loops_per_jiffy * HZ) >> 32;
 }
 
-void __delay(unsigned long cycles)
+#define USECS_TO_XLOOPS(time_usecs) \
+	((time_usecs) * 0x10C7UL)
+
+#define USECS_TO_CYCLES(time_usecs) \
+	xloops_to_cycles(USECS_TO_XLOOPS(time_usecs))
+
+#define NSECS_TO_XLOOPS(time_nsecs) \
+	((time_nsecs) * 0x10C7UL)
+
+#define NSECS_TO_CYCLES(time_nsecs) \
+	xloops_to_cycles(NSECS_TO_XLOOPS(time_nsecs))
+
+static unsigned long __delay_until_ul(u64 cycles, unsigned long* addr, unsigned long mask, unsigned long val)
 {
-	cycles_t start = get_cycles();
+	u64 start = get_cycles();
+	unsigned long cur;
 
 	if (alternative_has_cap_unlikely(ARM64_HAS_WFXT)) {
 		u64 end = start + cycles;
 
-		/*
-		 * Start with WFIT. If an interrupt makes us resume
-		 * early, use a WFET loop to complete the delay.
-		 */
-		wfit(end);
-		while ((get_cycles() - start) < cycles)
+		do {
+			cur = __READ_ONCE_EX(*addr);
+			if ((cur & mask) == val) {
+				break;
+			}
 			wfet(end);
-	} else if (arch_timer_evtstrm_available()) {
-		const cycles_t timer_evt_period =
+		} while ((get_cycles() - start) < cycles);
+	} else if (arch_timer_evtstrm_available()) {
+		const u64 timer_evt_period =
 			USECS_TO_CYCLES(ARCH_TIMER_EVT_STREAM_PERIOD_US);
 
-		while ((get_cycles() - start + timer_evt_period) < cycles)
+		do {
+			cur = __READ_ONCE_EX(*addr);
+			if ((cur & mask) == val) {
+				break;
+			}
 			wfe();
+		} while ((get_cycles() - start + timer_evt_period) < cycles);
+	} else {
+		do {
+			cur = __READ_ONCE_EX(*addr);
+			if ((cur & mask) == val) {
+				break;
+			}
+			cpu_relax();
+		} while ((get_cycles() - start) < cycles);
 	}
 
-	while ((get_cycles() - start) < cycles)
-		cpu_relax();
+	return cur;
+}
+
+void __delay(unsigned long cycles)
+{
+	/* constant word for wfet()/wfe() to poll */
+	unsigned long dummy ____cacheline_aligned = 0;
+	__delay_until_ul(cycles, &dummy, 0, 1);
 }
 EXPORT_SYMBOL(__delay);
 
-inline void __const_udelay(unsigned long xloops)
+void __const_udelay(unsigned long xloops)
 {
 	__delay(xloops_to_cycles(xloops));
 }
@@ -58,12 +88,12 @@ EXPORT_SYMBOL(__const_udelay);
 
 void __udelay(unsigned long usecs)
 {
-	__const_udelay(usecs * 0x10C7UL); /* 2**32 / 1000000 (rounded up) */
+	__delay(USECS_TO_CYCLES(usecs));
 }
 EXPORT_SYMBOL(__udelay);
 
 void __ndelay(unsigned long nsecs)
 {
-	__const_udelay(nsecs * 0x5UL); /* 2**32 / 1000000000 (rounded up) */
+	__delay(NSECS_TO_CYCLES(nsecs));
 }
 EXPORT_SYMBOL(__ndelay);
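For context, the scale factors implement a 32.32 fixed-point conversion from
time to "xloops", which xloops_to_cycles() then turns into delay-loop cycles:

        xloops  = usecs * 0x10C7          /* 0x10C7 = 4295 ~= 2**32 / 10**6, rounded up */
        cycles  = (xloops * loops_per_jiffy * HZ) >> 32
                ~= usecs * loops_per_jiffy * HZ / 10**6

For example, on a system where loops_per_jiffy * HZ comes to about 10**9
loops per second, a 1 us delay works out to roughly 1000 cycles; the
nanosecond path uses 0x5 ~= 2**32 / 10**9 in the same way.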
Refactor arm64's delay() to poll for a mask/value condition (vcond) in its
wfet(), wfe(), and relaxed polling loops.

Signed-off-by: Haris Okanovic <harisokn@amazon.com>
---
 arch/arm64/lib/delay.c | 70 ++++++++++++++++++++++++++++++------------
 1 file changed, 50 insertions(+), 20 deletions(-)