Message ID | 20240819-seq_optimize-v2-1-9d0da82b022f@gentwo.org (mailing list archive) |
---|---|
State | New, archived |
Series | [v2] Avoid memory barrier in read_seqcount() through load acquire |
On Mon, Aug 19, 2024 at 11:30:15AM -0700, Christoph Lameter via B4 Relay wrote:
> diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
> index d90d8ee29d81..353fcf32b800 100644
> --- a/include/linux/seqlock.h
> +++ b/include/linux/seqlock.h
> @@ -176,6 +176,28 @@ __seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s) \
>          return seq; \
>  } \
>  \
> +static __always_inline unsigned \
> +__seqprop_##lockname##_sequence_acquire(const seqcount_##lockname##_t *s) \
> +{ \
> +        unsigned seq = smp_load_acquire(&s->seqcount.sequence); \
> + \
> +        if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \
> +                return seq; \
> + \
> +        if (preemptible && unlikely(seq & 1)) { \
> +                __SEQ_LOCK(lockbase##_lock(s->lock)); \
> +                __SEQ_LOCK(lockbase##_unlock(s->lock)); \
> + \
> +                /* \
> +                 * Re-read the sequence counter since the (possibly \
> +                 * preempted) writer made progress. \
> +                 */ \
> +                seq = smp_load_acquire(&s->seqcount.sequence); \

We could probably do even better with LDAPR here, as that should be
sufficient for this. It's a can of worms though, as it's not implemented
on all CPUs and relaxing smp_load_acquire() might introduce subtle
breakage in places where it's used to build other types of lock. Maybe
you can hack something to see if there's any performance left behind
without it?

> +        } \
> + \
> +        return seq; \
> +} \
> + \
>  static __always_inline bool \
>  __seqprop_##lockname##_preemptible(const seqcount_##lockname##_t *s) \
>  { \
> @@ -211,6 +233,11 @@ static inline unsigned __seqprop_sequence(const seqcount_t *s)
>          return READ_ONCE(s->sequence);
>  }
>
> +static inline unsigned __seqprop_sequence_acquire(const seqcount_t *s)
> +{
> +        return smp_load_acquire(&s->sequence);
> +}
> +
>  static inline bool __seqprop_preemptible(const seqcount_t *s)
>  {
>          return false;
> @@ -259,6 +286,7 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex)
>  #define seqprop_ptr(s)                  __seqprop(s, ptr)(s)
>  #define seqprop_const_ptr(s)            __seqprop(s, const_ptr)(s)
>  #define seqprop_sequence(s)             __seqprop(s, sequence)(s)
> +#define seqprop_sequence_acquire(s)     __seqprop(s, sequence_acquire)(s)
>  #define seqprop_preemptible(s)          __seqprop(s, preemptible)(s)
>  #define seqprop_assert(s)               __seqprop(s, assert)(s)
>
> @@ -293,6 +321,18 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex)
>   *
>   * Return: count to be passed to read_seqcount_retry()
>   */
> +#ifdef CONFIG_ARCH_HAS_ACQUIRE_RELEASE
> +#define raw_read_seqcount_begin(s) \
> +({ \
> +        unsigned _seq; \
> + \
> +        while ((_seq = seqprop_sequence_acquire(s)) & 1) \
> +                cpu_relax(); \

It would also be interesting to see whether smp_cond_load_acquire()
performs any better than this loop in the !RT case.

Both things to look at separately though, so:

Acked-by: Will Deacon <will@kernel.org>

I assume this will go via -tip.

Will
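For orientation, here is a minimal sketch of the reader-side pattern this series is optimizing. The names example_state and example_read are invented for illustration; only read_seqcount_begin() and read_seqcount_retry() are the real API. The point under discussion is the ordering of the counter load in read_seqcount_begin(): today it is READ_ONCE() followed by smp_rmb(), and the patch turns it into a single acquire load on arm64.

#include <linux/seqlock.h>
#include <linux/types.h>

/* Invented example state, not part of the patch. */
struct example_state {
        seqcount_t seq;
        u64 a, b;
};

static u64 example_read(struct example_state *st)
{
        unsigned int start;
        u64 sum;

        do {
                /* Ordered load of the sequence counter: currently
                 * READ_ONCE() + smp_rmb(), with this series a single
                 * acquire load on arm64. */
                start = read_seqcount_begin(&st->seq);
                sum = st->a + st->b;    /* snapshot the protected data */
        } while (read_seqcount_retry(&st->seq, start)); /* a writer ran: retry */

        return sum;
}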
On Fri, 23 Aug 2024, Will Deacon wrote:

> On Mon, Aug 19, 2024 at 11:30:15AM -0700, Christoph Lameter via B4 Relay wrote:
> > +static __always_inline unsigned \
> > +__seqprop_##lockname##_sequence_acquire(const seqcount_##lockname##_t *s) \
> > +{ \
> > +        unsigned seq = smp_load_acquire(&s->seqcount.sequence); \
> > + \
> > +        if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \
> > +                return seq; \
> > + \
> > +        if (preemptible && unlikely(seq & 1)) { \
> > +                __SEQ_LOCK(lockbase##_lock(s->lock)); \
> > +                __SEQ_LOCK(lockbase##_unlock(s->lock)); \
> > + \
> > +                /* \
> > +                 * Re-read the sequence counter since the (possibly \
> > +                 * preempted) writer made progress. \
> > +                 */ \
> > +                seq = smp_load_acquire(&s->seqcount.sequence); \
>
> We could probably do even better with LDAPR here, as that should be
> sufficient for this. It's a can of worms though, as it's not implemented
> on all CPUs and relaxing smp_load_acquire() might introduce subtle
> breakage in places where it's used to build other types of lock. Maybe
> you can hack something to see if there's any performance left behind
> without it?

I added the following patch. Kernel booted fine. No change in the cycles
of read_seq().

LDAPR
---------------------------
Test            Single   2 CPU   4 CPU   8 CPU  16 CPU  32 CPU  64 CPU     ALL
write seq  :        13      98     385     764    1551    3043    6259   11922
read seq   :         8       8       8       8       8       8       9      10
rw seq     :         8     101     247     300     467     742    1384    2101

LDA
---------------------------
Test            Single   2 CPU   4 CPU   8 CPU  16 CPU  32 CPU  64 CPU     ALL
write seq  :        13      90     343     785    1533    3032    6315   11073
read seq   :         8       8       8       8       8       8       9      11
rw seq     :         8      79     227     313     423     755    1313    2220

Index: linux/arch/arm64/include/asm/barrier.h
===================================================================
--- linux.orig/arch/arm64/include/asm/barrier.h
+++ linux/arch/arm64/include/asm/barrier.h
@@ -167,22 +167,22 @@ do { \
         kasan_check_read(__p, sizeof(*p)); \
         switch (sizeof(*p)) { \
         case 1: \
-                asm volatile ("ldarb %w0, %1" \
+                asm volatile (".arch_extension rcpc\nldaprb %w0, %1" \
                         : "=r" (*(__u8 *)__u.__c) \
                         : "Q" (*__p) : "memory"); \
                 break; \
         case 2: \
-                asm volatile ("ldarh %w0, %1" \
+                asm volatile (".arch_extension rcpc\nldaprh %w0, %1" \
                         : "=r" (*(__u16 *)__u.__c) \
                         : "Q" (*__p) : "memory"); \
                 break; \
         case 4: \
-                asm volatile ("ldar %w0, %1" \
+                asm volatile (".arch_extension rcpc\nldapr %w0, %1" \
                         : "=r" (*(__u32 *)__u.__c) \
                         : "Q" (*__p) : "memory"); \
                 break; \
         case 8: \
-                asm volatile ("ldar %0, %1" \
+                asm volatile (".arch_extension rcpc\nldapr %0, %1" \
                         : "=r" (*(__u64 *)__u.__c) \
                         : "Q" (*__p) : "memory"); \
                 break; \
On Fri, 23 Aug 2024, Will Deacon wrote:

> > +#ifdef CONFIG_ARCH_HAS_ACQUIRE_RELEASE
> > +#define raw_read_seqcount_begin(s) \
> > +({ \
> > +        unsigned _seq; \
> > + \
> > +        while ((_seq = seqprop_sequence_acquire(s)) & 1) \
> > +                cpu_relax(); \
>
> It would also be interesting to see whether smp_cond_load_acquire()
> performs any better than this loop in the !RT case.

The hack to do this follows. Kernel boots but no change in cycles. Also
builds a kernel just fine. Another benchmark may be better. All my
synthetic tests do is run the function calls in a loop in parallel on
multiple cpus.

The main effect here may be the reduction of power since the busyloop is
no longer required. I would favor a solution like this. But the patch is
not clean given the need to get rid of the const attribute with a cast.

Index: linux/include/linux/seqlock.h
===================================================================
--- linux.orig/include/linux/seqlock.h
+++ linux/include/linux/seqlock.h
@@ -325,9 +325,9 @@ SEQCOUNT_LOCKNAME(mutex, struct m
 #define raw_read_seqcount_begin(s) \
 ({ \
         unsigned _seq; \
+        seqcount_t *e = seqprop_ptr((struct seqcount_spinlock *)s); \
 \
-        while ((_seq = seqprop_sequence_acquire(s)) & 1) \
-                cpu_relax(); \
+        _seq = smp_cond_load_acquire(&e->sequence, ((e->sequence & 1) == 0)); \
 \
         kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \
         _seq; \
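As an aside on what smp_cond_load_acquire() buys here: it waits for a condition on a memory location and returns the loaded value with acquire ordering, and on arm64 the wait can sit in WFE via the exclusive-monitor path instead of a cpu_relax() busy loop, which is where the power saving mentioned above comes from. A minimal sketch follows; the helper name wait_for_even() is invented for illustration.

#include <linux/atomic.h>       /* pulls in smp_cond_load_acquire() */

/* Invented helper: wait until the value at @seq is even, then return it
 * with acquire ordering. VAL is the token smp_cond_load_acquire() provides
 * to name the freshly loaded value inside the condition expression. */
static unsigned int wait_for_even(unsigned int *seq)
{
        return smp_cond_load_acquire(seq, (VAL & 1) == 0);
}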
On Mon, Aug 19 2024 at 11:30, Christoph Lameter via B4 Relay wrote:

> @@ -293,6 +321,18 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex)
>   *
>   * Return: count to be passed to read_seqcount_retry()
>   */
> +#ifdef CONFIG_ARCH_HAS_ACQUIRE_RELEASE
> +#define raw_read_seqcount_begin(s) \
> +({ \
> +        unsigned _seq; \
> + \
> +        while ((_seq = seqprop_sequence_acquire(s)) & 1) \
> +                cpu_relax(); \
> + \
> +        kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \
> +        _seq; \
> +})

So this covers only raw_read_seqcount_begin(), but not raw_read_seqcount()
which has the same smp_rmb() inside.

This all can be done without the extra copies of the counter accessors.
Uncompiled patch below. It's a little larger than I initially wanted to do
it, but I had to keep the raw READ_ONCE() for __read_seqcount_begin() to
not inflict the smp_load_acquire() on the only usage site in the dcache
code.

The acquire conditional in __seqprop_load_sequence() is optimized out by
the compiler as all of this is macro/__always_inline.

Thanks,

        tglx
---
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -132,6 +132,14 @@ static inline void seqcount_lockdep_read
 #define seqcount_rwlock_init(s, lock)   seqcount_LOCKNAME_init(s, lock, rwlock)
 #define seqcount_mutex_init(s, lock)    seqcount_LOCKNAME_init(s, lock, mutex)

+static __always_inline unsigned __seqprop_load_sequence(const seqcount_t *s, bool acquire)
+{
+        if (acquire && IS_ENABLED(CONFIG_ARCH_HAS_ACQUIRE_RELEASE))
+                return smp_load_acquire(&s->sequence);
+        else
+                return READ_ONCE(s->sequence);
+}
+
 /*
  * SEQCOUNT_LOCKNAME()  - Instantiate seqcount_LOCKNAME_t and helpers
  * seqprop_LOCKNAME_*() - Property accessors for seqcount_LOCKNAME_t
@@ -155,9 +163,10 @@ static __always_inline const seqcount_t
 } \
 \
 static __always_inline unsigned \
-__seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s) \
+__seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s, \
+                                bool acquire) \
 { \
-        unsigned seq = READ_ONCE(s->seqcount.sequence); \
+        unsigned seq = __seqprop_load_sequence(&s->seqcount, acquire); \
 \
         if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \
                 return seq; \
@@ -170,7 +179,7 @@ static __always_inline unsigned
                  * Re-read the sequence counter since the (possibly \
                  * preempted) writer made progress. \
                  */ \
-                seq = READ_ONCE(s->seqcount.sequence); \
+                seq = __seqprop_load_sequence(&s->seqcount, acquire); \
         } \
 \
         return seq; \
@@ -206,9 +215,9 @@ static inline const seqcount_t *__seqpro
         return s;
 }

-static inline unsigned __seqprop_sequence(const seqcount_t *s)
+static inline unsigned __seqprop_sequence(const seqcount_t *s, bool acquire)
 {
-        return READ_ONCE(s->sequence);
+        return __seqprop_load_sequence(s, acquire);
 }

 static inline bool __seqprop_preemptible(const seqcount_t *s)
@@ -258,29 +267,23 @@ SEQCOUNT_LOCKNAME(mutex, struct m

 #define seqprop_ptr(s)                  __seqprop(s, ptr)(s)
 #define seqprop_const_ptr(s)            __seqprop(s, const_ptr)(s)
-#define seqprop_sequence(s)             __seqprop(s, sequence)(s)
+#define seqprop_sequence(s, a)          __seqprop(s, sequence)(s, a)
 #define seqprop_preemptible(s)          __seqprop(s, preemptible)(s)
 #define seqprop_assert(s)               __seqprop(s, assert)(s)

 /**
- * __read_seqcount_begin() - begin a seqcount_t read section w/o barrier
- * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
- *
- * __read_seqcount_begin is like read_seqcount_begin, but has no smp_rmb()
- * barrier. Callers should ensure that smp_rmb() or equivalent ordering is
- * provided before actually loading any of the variables that are to be
- * protected in this critical section.
- *
- * Use carefully, only in critical code, and comment how the barrier is
- * provided.
+ * read_seqcount_begin_cond_acquire() - begin a seqcount_t read section
+ * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
+ * @acquire: If true, the read of the sequence count uses smp_load_acquire()
+ *           if the architecture provides and enabled it.
  *
  * Return: count to be passed to read_seqcount_retry()
  */
-#define __read_seqcount_begin(s) \
+#define read_seqcount_begin_cond_acquire(s, acquire) \
 ({ \
         unsigned __seq; \
 \
-        while ((__seq = seqprop_sequence(s)) & 1) \
+        while ((__seq = seqprop_sequence(s, acquire)) & 1) \
                 cpu_relax(); \
 \
         kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \
@@ -288,6 +291,26 @@ SEQCOUNT_LOCKNAME(mutex, struct m
 })

 /**
+ * __read_seqcount_begin() - begin a seqcount_t read section w/o barrier
+ * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
+ *
+ * __read_seqcount_begin is like read_seqcount_begin, but it neither
+ * provides a smp_rmb() barrier nor does it use smp_load_acquire() on
+ * architectures which provide it.
+ *
+ * Callers should ensure that smp_rmb() or equivalent ordering is provided
+ * before actually loading any of the variables that are to be protected in
+ * this critical section.
+ *
+ * Use carefully, only in critical code, and comment how the barrier is
+ * provided.
+ *
+ * Return: count to be passed to read_seqcount_retry()
+ */
+#define __read_seqcount_begin(s) \
+        read_seqcount_begin_cond_acquire(s, false)
+
+/**
  * raw_read_seqcount_begin() - begin a seqcount_t read section w/o lockdep
  * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
  *
@@ -295,9 +318,10 @@ SEQCOUNT_LOCKNAME(mutex, struct m
  */
 #define raw_read_seqcount_begin(s) \
 ({ \
-        unsigned _seq = __read_seqcount_begin(s); \
+        unsigned _seq = read_seqcount_begin_cond_acquire(s, true); \
 \
-        smp_rmb(); \
+        if (!IS_ENABLED(CONFIG_ARCH_HAS_ACQUIRE_RELEASE)) \
+                smp_rmb(); \
         _seq; \
 })

@@ -326,9 +350,10 @@ SEQCOUNT_LOCKNAME(mutex, struct m
  */
 #define raw_read_seqcount(s) \
 ({ \
-        unsigned __seq = seqprop_sequence(s); \
+        unsigned __seq = seqprop_sequence(s, true); \
 \
-        smp_rmb(); \
+        if (!IS_ENABLED(CONFIG_ARCH_HAS_ACQUIRE_RELEASE)) \
+                smp_rmb(); \
         kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \
         __seq; \
 })
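The compile-time folding Thomas refers to can be seen in isolation in a reduced sketch (the names load_seq_sketch, begin_no_barrier and begin_acquire are invented): every caller passes a constant for the acquire flag, and because the helper is __always_inline the untaken branch is discarded at each call site.

#include <linux/compiler.h>     /* READ_ONCE(), __always_inline */
#include <linux/kconfig.h>      /* IS_ENABLED() */
#include <asm/barrier.h>        /* smp_load_acquire() */

static __always_inline unsigned int load_seq_sketch(const unsigned int *seq, bool acquire)
{
        if (acquire && IS_ENABLED(CONFIG_ARCH_HAS_ACQUIRE_RELEASE))
                return smp_load_acquire(seq);   /* kept only when acquire == true
                                                   and the arch selects the option */

        return READ_ONCE(*seq);                 /* kept in all other configurations */
}

/* Each caller passes a literal, so the branch vanishes at compile time: */
static inline unsigned int begin_no_barrier(const unsigned int *seq)
{
        return load_seq_sketch(seq, false);     /* compiles to a plain READ_ONCE() */
}

static inline unsigned int begin_acquire(const unsigned int *seq)
{
        return load_seq_sketch(seq, true);      /* LDAR on arm64, READ_ONCE() elsewhere */
}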
On Fri, 23 Aug 2024, Thomas Gleixner wrote:

> This all can be done without the extra copies of the counter
> accessors. Uncompiled patch below.

Great. Thanks. Tried it too initially but could not make it work right.

One thing that we also want is the use of smp_cond_load_acquire() to
have the cpu power down while waiting for a cacheline change.

The code has several places where loops occur when the last bit is set in
the seqcount.

We could use smp_cond_load_acquire in load_sequence() but what do we do
about the loops at the higher level? Also this does not sync with the lock
checking logic.

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 68b3af8bd6c6..4442a97ffe9a 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -135,7 +135,7 @@ static inline void seqcount_lockdep_reader_access(const seqcount_t *s)
 static __always_inline unsigned __seqprop_load_sequence(const seqcount_t *s, bool acquire)
 {
         if (acquire && IS_ENABLED(CONFIG_ARCH_HAS_ACQUIRE_RELEASE))
-                return smp_load_acquire(&s->sequence);
+                return smp_cond_load_acquire(&s->sequence, (s->sequence & 1) == 0);
         else
                 return READ_ONCE(s->sequence);
 }
On Wed, Aug 28 2024 at 10:15, Christoph Lameter wrote:
> On Fri, 23 Aug 2024, Thomas Gleixner wrote:
>
>> This all can be done without the extra copies of the counter
>> accessors. Uncompiled patch below.
>
> Great. Thanks. Tried it too initially but could not make it work right.
>
> One thing that we also want is the use of smp_cond_load_acquire() to
> have the cpu power down while waiting for a cacheline change.
>
> The code has several places where loops occur when the last bit is set in
> the seqcount.
>
> We could use smp_cond_load_acquire in load_sequence() but what do we do
> about the loops at the higher level? Also this does not sync with the lock
> checking logic.

Come on. It's not rocket science to figure that out. Uncompiled delta
patch below.

Thanks,

        tglx
---
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -23,6 +23,13 @@

 #include <asm/processor.h>

+#ifdef CONFIG_ARCH_HAS_ACQUIRE_RELEASE
+# define USE_LOAD_ACQUIRE       true
+# define USE_COND_LOAD_ACQUIRE  !IS_ENABLED(CONFIG_PREEMPT_RT)
+#else
+# define USE_LOAD_ACQUIRE       false
+# define USE_COND_LOAD_ACQUIRE  false
+#endif
 /*
  * The seqlock seqcount_t interface does not prescribe a precise sequence of
  * read begin/retry/end. For readers, typically there is a call to
@@ -134,10 +141,13 @@ static inline void seqcount_lockdep_read

 static __always_inline unsigned __seqprop_load_sequence(const seqcount_t *s, bool acquire)
 {
-        if (acquire && IS_ENABLED(CONFIG_ARCH_HAS_ACQUIRE_RELEASE))
-                return smp_load_acquire(&s->sequence);
-        else
+        if (!acquire || !USE_LOAD_ACQUIRE)
                 return READ_ONCE(s->sequence);
+
+        if (USE_COND_LOAD_ACQUIRE)
+                return smp_cond_load_acquire(&s->sequence, (s->sequence & 1) == 0);
+
+        return smp_load_acquire(&s->sequence);
 }

 /*
@@ -283,8 +293,12 @@ SEQCOUNT_LOCKNAME(mutex, struct m
 ({ \
         unsigned __seq; \
 \
-        while ((__seq = seqprop_sequence(s, acquire)) & 1) \
-                cpu_relax(); \
+        if (acquire && USE_COND_LOAD_ACQUIRE) { \
+                __seq = seqprop_sequence(s, acquire); \
+        } else { \
+                while ((__seq = seqprop_sequence(s, acquire)) & 1) \
+                        cpu_relax(); \
+        } \
 \
         kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \
         __seq; \
diff --git a/arch/Kconfig b/arch/Kconfig
index 975dd22a2dbd..3c270f496231 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1600,6 +1600,14 @@ config ARCH_HAS_KERNEL_FPU_SUPPORT
           Architectures that select this option can run floating-point code in
           the kernel, as described in Documentation/core-api/floating-point.rst.

+config ARCH_HAS_ACQUIRE_RELEASE
+       bool
+       help
+         Setting ARCH_HAS_ACQUIRE_RELEASE indicates that the architecture
+         supports load acquire and release. Typically these are more effective
+         than memory barriers. Code will prefer the use of load acquire and
+         store release over memory barriers if this option is enabled.
+
 source "kernel/gcov/Kconfig"

 source "scripts/gcc-plugins/Kconfig"
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index a2f8ff354ca6..19e34fff145f 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -39,6 +39,7 @@ config ARM64
        select ARCH_HAS_PTE_DEVMAP
        select ARCH_HAS_PTE_SPECIAL
        select ARCH_HAS_HW_PTE_YOUNG
+       select ARCH_HAS_ACQUIRE_RELEASE
        select ARCH_HAS_SETUP_DMA_OPS
        select ARCH_HAS_SET_DIRECT_MAP
        select ARCH_HAS_SET_MEMORY
diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index d90d8ee29d81..353fcf32b800 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -176,6 +176,28 @@ __seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s) \
         return seq; \
 } \
 \
+static __always_inline unsigned \
+__seqprop_##lockname##_sequence_acquire(const seqcount_##lockname##_t *s) \
+{ \
+        unsigned seq = smp_load_acquire(&s->seqcount.sequence); \
+ \
+        if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \
+                return seq; \
+ \
+        if (preemptible && unlikely(seq & 1)) { \
+                __SEQ_LOCK(lockbase##_lock(s->lock)); \
+                __SEQ_LOCK(lockbase##_unlock(s->lock)); \
+ \
+                /* \
+                 * Re-read the sequence counter since the (possibly \
+                 * preempted) writer made progress. \
+                 */ \
+                seq = smp_load_acquire(&s->seqcount.sequence); \
+        } \
+ \
+        return seq; \
+} \
+ \
 static __always_inline bool \
 __seqprop_##lockname##_preemptible(const seqcount_##lockname##_t *s) \
 { \
@@ -211,6 +233,11 @@ static inline unsigned __seqprop_sequence(const seqcount_t *s)
         return READ_ONCE(s->sequence);
 }

+static inline unsigned __seqprop_sequence_acquire(const seqcount_t *s)
+{
+        return smp_load_acquire(&s->sequence);
+}
+
 static inline bool __seqprop_preemptible(const seqcount_t *s)
 {
         return false;
@@ -259,6 +286,7 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex)
 #define seqprop_ptr(s)                  __seqprop(s, ptr)(s)
 #define seqprop_const_ptr(s)            __seqprop(s, const_ptr)(s)
 #define seqprop_sequence(s)             __seqprop(s, sequence)(s)
+#define seqprop_sequence_acquire(s)     __seqprop(s, sequence_acquire)(s)
 #define seqprop_preemptible(s)          __seqprop(s, preemptible)(s)
 #define seqprop_assert(s)               __seqprop(s, assert)(s)

@@ -293,6 +321,18 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex)
  *
  * Return: count to be passed to read_seqcount_retry()
  */
+#ifdef CONFIG_ARCH_HAS_ACQUIRE_RELEASE
+#define raw_read_seqcount_begin(s) \
+({ \
+        unsigned _seq; \
+ \
+        while ((_seq = seqprop_sequence_acquire(s)) & 1) \
+                cpu_relax(); \
+ \
+        kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \
+        _seq; \
+})
+#else
 #define raw_read_seqcount_begin(s) \
 ({ \
         unsigned _seq = __read_seqcount_begin(s); \
@@ -300,6 +340,7 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex)
         smp_rmb(); \
         _seq; \
 })
+#endif

 /**
  * read_seqcount_begin() - begin a seqcount_t read critical section