diff mbox series

[v2] Avoid memory barrier in read_seqcount() through load acquire

Message ID 20240819-seq_optimize-v2-1-9d0da82b022f@gentwo.org (mailing list archive)
State New
Headers show
Series [v2] Avoid memory barrier in read_seqcount() through load acquire | expand

Commit Message

Christoph Lameter via B4 Relay Aug. 19, 2024, 6:30 p.m. UTC
From: "Christoph Lameter (Ampere)" <cl@gentwo.org>

Some architectures support load acquire which can save us a memory
barrier and save some cycles.

A typical sequence

	do {
		seq = read_seqcount_begin(&s);
		<something>
	} while (read_seqcount_retry(&s, seq);

requires 13 cycles on ARM64 for an empty loop. Two read memory
barriers are needed. One for each of the seqcount_* functions.

We can replace the first read barrier with a load acquire of
the seqcount which saves us one barrier.

On ARM64 doing so reduces the cycle count from 13 to 8.

This is a general improvement for the ARM64 architecture and not
specific to a certain processor. The cycle count here was
obtained on a Neoverse N1 (Ampere Altra).

The ARM documentation states that load acquire is more effective
than a load plus barrier. In general that tends to be true on all
compute platforms that support both.

See (as quoted by Linus Torvalds):
   https://developer.arm.com/documentation/102336/0100/Load-Acquire-and-Store-Release-instructions

 "Weaker ordering requirements that are imposed by Load-Acquire and
  Store-Release instructions allow for micro-architectural
  optimizations, which could reduce some of the performance impacts that
  are otherwise imposed by an explicit memory barrier.

  If the ordering requirement is satisfied using either a Load-Acquire
  or Store-Release, then it would be preferable to use these
  instructions instead of a DMB"

Signed-off-by: Christoph Lameter (Ampere) <cl@gentwo.org>
---
V1->V2
- Describe the benefit of load acquire vs barriers
- Explain the CONFIG_ARCH_HAS_ACQUIRE_RELEASE option better
---
 arch/Kconfig            |  8 ++++++++
 arch/arm64/Kconfig      |  1 +
 include/linux/seqlock.h | 41 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 50 insertions(+)


---
base-commit: b0da640826ba3b6506b4996a6b23a429235e6923
change-id: 20240813-seq_optimize-68c48696c798

Best regards,

Comments

Will Deacon Aug. 23, 2024, 10:32 a.m. UTC | #1
On Mon, Aug 19, 2024 at 11:30:15AM -0700, Christoph Lameter via B4 Relay wrote:
> diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
> index d90d8ee29d81..353fcf32b800 100644
> --- a/include/linux/seqlock.h
> +++ b/include/linux/seqlock.h
> @@ -176,6 +176,28 @@ __seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s)	\
>  	return seq;							\
>  }									\
>  									\
> +static __always_inline unsigned						\
> +__seqprop_##lockname##_sequence_acquire(const seqcount_##lockname##_t *s) \
> +{									\
> +	unsigned seq = smp_load_acquire(&s->seqcount.sequence);		\
> +									\
> +	if (!IS_ENABLED(CONFIG_PREEMPT_RT))				\
> +		return seq;						\
> +									\
> +	if (preemptible && unlikely(seq & 1)) {				\
> +		__SEQ_LOCK(lockbase##_lock(s->lock));			\
> +		__SEQ_LOCK(lockbase##_unlock(s->lock));			\
> +									\
> +		/*							\
> +		 * Re-read the sequence counter since the (possibly	\
> +		 * preempted) writer made progress.			\
> +		 */							\
> +		seq = smp_load_acquire(&s->seqcount.sequence);		\

We could probably do even better with LDAPR here, as that should be
sufficient for this. It's a can of worms though, as it's not implemented
on all CPUs and relaxing smp_load_acquire() might introduce subtle
breakage in places where it's used to build other types of lock. Maybe
you can hack something to see if there's any performance left behind
without it?

> +	}								\
> +									\
> +	return seq;							\
> +}									\
> +									\
>  static __always_inline bool						\
>  __seqprop_##lockname##_preemptible(const seqcount_##lockname##_t *s)	\
>  {									\
> @@ -211,6 +233,11 @@ static inline unsigned __seqprop_sequence(const seqcount_t *s)
>  	return READ_ONCE(s->sequence);
>  }
>  
> +static inline unsigned __seqprop_sequence_acquire(const seqcount_t *s)
> +{
> +	return smp_load_acquire(&s->sequence);
> +}
> +
>  static inline bool __seqprop_preemptible(const seqcount_t *s)
>  {
>  	return false;
> @@ -259,6 +286,7 @@ SEQCOUNT_LOCKNAME(mutex,        struct mutex,    true,     mutex)
>  #define seqprop_ptr(s)			__seqprop(s, ptr)(s)
>  #define seqprop_const_ptr(s)		__seqprop(s, const_ptr)(s)
>  #define seqprop_sequence(s)		__seqprop(s, sequence)(s)
> +#define seqprop_sequence_acquire(s)	__seqprop(s, sequence_acquire)(s)
>  #define seqprop_preemptible(s)		__seqprop(s, preemptible)(s)
>  #define seqprop_assert(s)		__seqprop(s, assert)(s)
>  
> @@ -293,6 +321,18 @@ SEQCOUNT_LOCKNAME(mutex,        struct mutex,    true,     mutex)
>   *
>   * Return: count to be passed to read_seqcount_retry()
>   */
> +#ifdef CONFIG_ARCH_HAS_ACQUIRE_RELEASE
> +#define raw_read_seqcount_begin(s)					\
> +({									\
> +	unsigned _seq;							\
> +									\
> +	while ((_seq = seqprop_sequence_acquire(s)) & 1)		\
> +		cpu_relax();						\

It would also be interesting to see whether smp_cond_load_acquire()
performs any better that this loop in the !RT case.

Both things to look at separately though, so:

Acked-by: Will Deacon <will@kernel.org>

I assume this will go via -tip.

Will
Christoph Lameter (Ampere) Aug. 23, 2024, 5:56 p.m. UTC | #2
On Fri, 23 Aug 2024, Will Deacon wrote:

> On Mon, Aug 19, 2024 at 11:30:15AM -0700, Christoph Lameter via B4 Relay wrote:
> > +static __always_inline unsigned						\
> > +__seqprop_##lockname##_sequence_acquire(const seqcount_##lockname##_t *s) \
> > +{									\
> > +	unsigned seq = smp_load_acquire(&s->seqcount.sequence);		\
> > +									\
> > +	if (!IS_ENABLED(CONFIG_PREEMPT_RT))				\
> > +		return seq;						\
> > +									\
> > +	if (preemptible && unlikely(seq & 1)) {				\
> > +		__SEQ_LOCK(lockbase##_lock(s->lock));			\
> > +		__SEQ_LOCK(lockbase##_unlock(s->lock));			\
> > +									\
> > +		/*							\
> > +		 * Re-read the sequence counter since the (possibly	\
> > +		 * preempted) writer made progress.			\
> > +		 */							\
> > +		seq = smp_load_acquire(&s->seqcount.sequence);		\
>
> We could probably do even better with LDAPR here, as that should be
> sufficient for this. It's a can of worms though, as it's not implemented
> on all CPUs and relaxing smp_load_acquire() might introduce subtle
> breakage in places where it's used to build other types of lock. Maybe
> you can hack something to see if there's any performance left behind
> without it?

I added the following patch. Kernel booted fine. No change in the cycles
of read_seq()

LDAPR
---------------------------
Test					Single	2 CPU	4 CPU	8 CPU	16 CPU	32 CPU	64 CPU	ALL
write seq			:	13	98	385	764	1551	3043	6259	11922
read seq			:	8	8	8	8	8	8	9	10
rw seq				:	8	101	247	300	467	742	1384	2101

LDA
---------------------------
Test                                     Single  2 CPU   4 CPU   8 CPU   16 CPU  32 CPU  64 CPU  ALL
write seq                        :       13      90      343     785     1533    3032    6315    11073
read seq                         :       8       8       8       8       8       8       9       11
rw seq                           :       8       79      227     313     423     755     1313    2220





Index: linux/arch/arm64/include/asm/barrier.h
===================================================================
--- linux.orig/arch/arm64/include/asm/barrier.h
+++ linux/arch/arm64/include/asm/barrier.h
@@ -167,22 +167,22 @@ do {									\
 	kasan_check_read(__p, sizeof(*p));				\
 	switch (sizeof(*p)) {						\
 	case 1:								\
-		asm volatile ("ldarb %w0, %1"				\
+		asm volatile (".arch_extension rcpc\nldaprb %w0, %1"	\
 			: "=r" (*(__u8 *)__u.__c)			\
 			: "Q" (*__p) : "memory");			\
 		break;							\
 	case 2:								\
-		asm volatile ("ldarh %w0, %1"				\
+		asm volatile (".arch_extension rcpc\nldaprh %w0, %1"				\
 			: "=r" (*(__u16 *)__u.__c)			\
 			: "Q" (*__p) : "memory");			\
 		break;							\
 	case 4:								\
-		asm volatile ("ldar %w0, %1"				\
+		asm volatile (".arch_extension rcpc\nldapr %w0, %1"				\
 			: "=r" (*(__u32 *)__u.__c)			\
 			: "Q" (*__p) : "memory");			\
 		break;							\
 	case 8:								\
-		asm volatile ("ldar %0, %1"				\
+		asm volatile (".arch_extension rcpc\nldapr %0, %1"				\
 			: "=r" (*(__u64 *)__u.__c)			\
 			: "Q" (*__p) : "memory");			\
 		break;							\
Christoph Lameter (Ampere) Aug. 23, 2024, 7:38 p.m. UTC | #3
On Fri, 23 Aug 2024, Will Deacon wrote:

> > +#ifdef CONFIG_ARCH_HAS_ACQUIRE_RELEASE
> > +#define raw_read_seqcount_begin(s)					\
> > +({									\
> > +	unsigned _seq;							\
> > +									\
> > +	while ((_seq = seqprop_sequence_acquire(s)) & 1)		\
> > +		cpu_relax();						\
>
> It would also be interesting to see whether smp_cond_load_acquire()
> performs any better that this loop in the !RT case.

The hack to do this follows. Kernel boots but no change in cycles. Also
builds a kernel just fine.

Another benchmark may be better. All my synthetic tests do is run the
function calls in a loop in parallel on multiple cpus.

The main effect here may be the reduction of power since the busyloop is
no longer required. I would favor a solution like this. But the patch is
not clean given the need to get rid of the const attribute with a cast.


Index: linux/include/linux/seqlock.h
===================================================================
--- linux.orig/include/linux/seqlock.h
+++ linux/include/linux/seqlock.h
@@ -325,9 +325,9 @@ SEQCOUNT_LOCKNAME(mutex,        struct m
 #define raw_read_seqcount_begin(s)					\
 ({									\
 	unsigned _seq;							\
+	seqcount_t *e = seqprop_ptr((struct seqcount_spinlock *)s);	\
 									\
-	while ((_seq = seqprop_sequence_acquire(s)) & 1)		\
-		cpu_relax();						\
+	_seq = smp_cond_load_acquire(&e->sequence, ((e->sequence & 1) == 0));	\
 									\
 	kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX);			\
 	_seq;								\
Thomas Gleixner Aug. 23, 2024, 9:05 p.m. UTC | #4
On Mon, Aug 19 2024 at 11:30, Christoph Lameter via wrote:
> @@ -293,6 +321,18 @@ SEQCOUNT_LOCKNAME(mutex,        struct mutex,    true,     mutex)
>   *
>   * Return: count to be passed to read_seqcount_retry()
>   */
> +#ifdef CONFIG_ARCH_HAS_ACQUIRE_RELEASE
> +#define raw_read_seqcount_begin(s)					\
> +({									\
> +	unsigned _seq;							\
> +									\
> +	while ((_seq = seqprop_sequence_acquire(s)) & 1)		\
> +		cpu_relax();						\
> +									\
> +	kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX);			\
> +	_seq;								\
> +})

So this covers only raw_read_seqcount_begin(), but not
raw_read_seqcount() which has the same smp_rmb() inside.

This all can be done without the extra copies of the counter
accessors. Uncompiled patch below.

It's a little larger than I initialy wanted to do it, but I had to keep
the raw READ_ONCE() for __read_seqcount_begin() to not inflict the
smp_load_acquire() to the only usage site in the dcache code.

The acquire conditional in __seqprop_load_sequence() is optimized out by
the compiler as all of this is macro/__always_inline.

Thanks,

        tglx
---
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -132,6 +132,14 @@ static inline void seqcount_lockdep_read
 #define seqcount_rwlock_init(s, lock)		seqcount_LOCKNAME_init(s, lock, rwlock)
 #define seqcount_mutex_init(s, lock)		seqcount_LOCKNAME_init(s, lock, mutex)
 
+static __always_inline unsigned __seqprop_load_sequence(const seqcount_t *s, bool acquire)
+{
+	if (acquire && IS_ENABLED(CONFIG_ARCH_HAS_ACQUIRE_RELEASE))
+		return smp_load_acquire(&s->sequence);
+	else
+		return READ_ONCE(s->sequence);
+}
+
 /*
  * SEQCOUNT_LOCKNAME()	- Instantiate seqcount_LOCKNAME_t and helpers
  * seqprop_LOCKNAME_*()	- Property accessors for seqcount_LOCKNAME_t
@@ -155,9 +163,10 @@ static __always_inline const seqcount_t
 }									\
 									\
 static __always_inline unsigned						\
-__seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s)	\
+__seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s,	\
+				bool acquire)				\
 {									\
-	unsigned seq = READ_ONCE(s->seqcount.sequence);			\
+	unsigned seq = __seqprop_load_sequence(&s->seqcount, acquire);	\
 									\
 	if (!IS_ENABLED(CONFIG_PREEMPT_RT))				\
 		return seq;						\
@@ -170,7 +179,7 @@ static __always_inline unsigned						\
 		 * Re-read the sequence counter since the (possibly	\
 		 * preempted) writer made progress.			\
 		 */							\
-		seq = READ_ONCE(s->seqcount.sequence);			\
+		seq = __seqprop_load_sequence(&s->seqcount, acquire);	\
 	}								\
 									\
 	return seq;							\
@@ -206,9 +215,9 @@ static inline const seqcount_t *__seqpro
 	return s;
 }
 
-static inline unsigned __seqprop_sequence(const seqcount_t *s)
+static inline unsigned __seqprop_sequence(const seqcount_t *s, bool acquire)
 {
-	return READ_ONCE(s->sequence);
+	return __seqprop_load_sequence(s, acquire);
 }
 
 static inline bool __seqprop_preemptible(const seqcount_t *s)
@@ -258,29 +267,23 @@ SEQCOUNT_LOCKNAME(mutex,        struct m
 
 #define seqprop_ptr(s)			__seqprop(s, ptr)(s)
 #define seqprop_const_ptr(s)		__seqprop(s, const_ptr)(s)
-#define seqprop_sequence(s)		__seqprop(s, sequence)(s)
+#define seqprop_sequence(s, a)		__seqprop(s, sequence)(s, a)
 #define seqprop_preemptible(s)		__seqprop(s, preemptible)(s)
 #define seqprop_assert(s)		__seqprop(s, assert)(s)
 
 /**
- * __read_seqcount_begin() - begin a seqcount_t read section w/o barrier
- * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
- *
- * __read_seqcount_begin is like read_seqcount_begin, but has no smp_rmb()
- * barrier. Callers should ensure that smp_rmb() or equivalent ordering is
- * provided before actually loading any of the variables that are to be
- * protected in this critical section.
- *
- * Use carefully, only in critical code, and comment how the barrier is
- * provided.
+ * read_seqcount_begin_cond_acquire() - begin a seqcount_t read section
+ * @s:	     Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
+ * @acquire: If true, the read of the sequence count uses smp_load_acquire()
+ *	     if the architecure provides and enabled it.
  *
  * Return: count to be passed to read_seqcount_retry()
  */
-#define __read_seqcount_begin(s)					\
+#define read_seqcount_begin_cond_acquire(s, acquire)			\
 ({									\
 	unsigned __seq;							\
 									\
-	while ((__seq = seqprop_sequence(s)) & 1)			\
+	while ((__seq = seqprop_sequence(s, acquire)) & 1)		\
 		cpu_relax();						\
 									\
 	kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX);			\
@@ -288,6 +291,26 @@ SEQCOUNT_LOCKNAME(mutex,        struct m
 })
 
 /**
+ * __read_seqcount_begin() - begin a seqcount_t read section w/o barrier
+ * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
+ *
+ * __read_seqcount_begin is like read_seqcount_begin, but it neither
+ * provides a smp_rmb() barrier nor does it use smp_load_acquire() on
+ * architectures which provide it.
+ *
+ * Callers should ensure that smp_rmb() or equivalent ordering is provided
+ * before actually loading any of the variables that are to be protected in
+ * this critical section.
+ *
+ * Use carefully, only in critical code, and comment how the barrier is
+ * provided.
+ *
+ * Return: count to be passed to read_seqcount_retry()
+ */
+#define __read_seqcount_begin(s)					\
+	read_seqcount_begin_cond_acquire(s, false)
+
+/**
  * raw_read_seqcount_begin() - begin a seqcount_t read section w/o lockdep
  * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
  *
@@ -295,9 +318,10 @@ SEQCOUNT_LOCKNAME(mutex,        struct m
  */
 #define raw_read_seqcount_begin(s)					\
 ({									\
-	unsigned _seq = __read_seqcount_begin(s);			\
+	unsigned _seq = read_seqcount_begin_cond_acquire(s, true);	\
 									\
-	smp_rmb();							\
+	if (!IS_ENABLED(CONFIG_ARCH_HAS_ACQUIRE_RELEASE))		\
+		smp_rmb();						\
 	_seq;								\
 })
 
@@ -326,9 +350,10 @@ SEQCOUNT_LOCKNAME(mutex,        struct m
  */
 #define raw_read_seqcount(s)						\
 ({									\
-	unsigned __seq = seqprop_sequence(s);				\
+	unsigned __seq = seqprop_sequence(s, true);			\
 									\
-	smp_rmb();							\
+	if (!IS_ENABLED(CONFIG_ARCH_HAS_ACQUIRE_RELEASE))		\
+		smp_rmb();						\
 	kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX);			\
 	__seq;								\
 })
Christoph Lameter (Ampere) Aug. 28, 2024, 5:15 p.m. UTC | #5
On Fri, 23 Aug 2024, Thomas Gleixner wrote:

> This all can be done without the extra copies of the counter
> accessors. Uncompiled patch below.

Great. Thanks. Tried it too initially but could not make it work right.

One thing that we also want is the use of the smp_cond_load_acquire to
have the cpu power down while waiting for a cacheline change.

The code has several places where loops occur when the last bit is set in
the seqcount.

We could use smp_cond_load_acquire in load_sequence() but what do we do
about the loops at the higher level? Also this does not sync with the lock
checking logic.


diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 68b3af8bd6c6..4442a97ffe9a 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -135,7 +135,7 @@ static inline void seqcount_lockdep_reader_access(const seqcount_t *s)
 static __always_inline unsigned __seqprop_load_sequence(const seqcount_t *s, bool acquire)
 {
 	if (acquire && IS_ENABLED(CONFIG_ARCH_HAS_ACQUIRE_RELEASE))
-		return smp_load_acquire(&s->sequence);
+		return smp_cond_load_acquire(&s->sequence, (s->sequence & 1) == 0);
 	else
 		return READ_ONCE(s->sequence);
 }
Thomas Gleixner Sept. 2, 2024, 11:55 a.m. UTC | #6
On Wed, Aug 28 2024 at 10:15, Christoph Lameter wrote:
> On Fri, 23 Aug 2024, Thomas Gleixner wrote:
>
>> This all can be done without the extra copies of the counter
>> accessors. Uncompiled patch below.
>
> Great. Thanks. Tried it too initially but could not make it work right.
>
> One thing that we also want is the use of the smp_cond_load_acquire to
> have the cpu power down while waiting for a cacheline change.
>
> The code has several places where loops occur when the last bit is set in
> the seqcount.
>
> We could use smp_cond_load_acquire in load_sequence() but what do we do
> about the loops at the higher level? Also this does not sync with the lock
> checking logic.

Come on. It's not rocket science to figure that out.

Uncompiled delta patch below.

Thanks,

        tglx
---
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -23,6 +23,13 @@
 
 #include <asm/processor.h>
 
+#ifdef CONFIG_ARCH_HAS_ACQUIRE_RELEASE
+# define USE_LOAD_ACQUIRE	true
+# define USE_COND_LOAD_ACQUIRE	!IS_ENABLED(CONFIG_PREEMPT_RT)
+#else
+# define USE_LOAD_ACQUIRE	false
+# define USE_COND_LOAD_ACQUIRE	false
+#endif
 /*
  * The seqlock seqcount_t interface does not prescribe a precise sequence of
  * read begin/retry/end. For readers, typically there is a call to
@@ -134,10 +141,13 @@ static inline void seqcount_lockdep_read
 
 static __always_inline unsigned __seqprop_load_sequence(const seqcount_t *s, bool acquire)
 {
-	if (acquire && IS_ENABLED(CONFIG_ARCH_HAS_ACQUIRE_RELEASE))
-		return smp_load_acquire(&s->sequence);
-	else
+	if (!acquire || !USE_LOAD_ACQUIRE)
 		return READ_ONCE(s->sequence);
+
+	if (USE_COND_LOAD_ACQUIRE)
+		return smp_cond_load_acquire(&s->sequence, (s->sequence & 1) == 0);
+
+	return smp_load_acquire(&s->sequence);
 }
 
 /*
@@ -283,8 +293,12 @@ SEQCOUNT_LOCKNAME(mutex,        struct m
 ({									\
 	unsigned __seq;							\
 									\
-	while ((__seq = seqprop_sequence(s, acquire)) & 1)		\
-		cpu_relax();						\
+	if (acquire && USE_COND_LOAD_ACQUIRE) {				\
+		__seq = seqprop_sequence(s, acquire);			\
+	} else {							\
+		while ((__seq = seqprop_sequence(s, acquire)) & 1)	\
+			cpu_relax();					\
+	}								\
 									\
 	kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX);			\
 	__seq;								\
diff mbox series

Patch

diff --git a/arch/Kconfig b/arch/Kconfig
index 975dd22a2dbd..3c270f496231 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1600,6 +1600,14 @@  config ARCH_HAS_KERNEL_FPU_SUPPORT
 	  Architectures that select this option can run floating-point code in
 	  the kernel, as described in Documentation/core-api/floating-point.rst.
 
+config ARCH_HAS_ACQUIRE_RELEASE
+	bool
+	help
+	  Setting ARCH_HAS_ACQUIRE_RELEASE indicates that the architecture
+	  supports load acquire and release. Typically these are more effective
+	  than memory barriers. Code will prefer the use of load acquire and
+	  store release over memory barriers if this option is enabled.
+
 source "kernel/gcov/Kconfig"
 
 source "scripts/gcc-plugins/Kconfig"
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index a2f8ff354ca6..19e34fff145f 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -39,6 +39,7 @@  config ARM64
 	select ARCH_HAS_PTE_DEVMAP
 	select ARCH_HAS_PTE_SPECIAL
 	select ARCH_HAS_HW_PTE_YOUNG
+	select ARCH_HAS_ACQUIRE_RELEASE
 	select ARCH_HAS_SETUP_DMA_OPS
 	select ARCH_HAS_SET_DIRECT_MAP
 	select ARCH_HAS_SET_MEMORY
diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index d90d8ee29d81..353fcf32b800 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -176,6 +176,28 @@  __seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s)	\
 	return seq;							\
 }									\
 									\
+static __always_inline unsigned						\
+__seqprop_##lockname##_sequence_acquire(const seqcount_##lockname##_t *s) \
+{									\
+	unsigned seq = smp_load_acquire(&s->seqcount.sequence);		\
+									\
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))				\
+		return seq;						\
+									\
+	if (preemptible && unlikely(seq & 1)) {				\
+		__SEQ_LOCK(lockbase##_lock(s->lock));			\
+		__SEQ_LOCK(lockbase##_unlock(s->lock));			\
+									\
+		/*							\
+		 * Re-read the sequence counter since the (possibly	\
+		 * preempted) writer made progress.			\
+		 */							\
+		seq = smp_load_acquire(&s->seqcount.sequence);		\
+	}								\
+									\
+	return seq;							\
+}									\
+									\
 static __always_inline bool						\
 __seqprop_##lockname##_preemptible(const seqcount_##lockname##_t *s)	\
 {									\
@@ -211,6 +233,11 @@  static inline unsigned __seqprop_sequence(const seqcount_t *s)
 	return READ_ONCE(s->sequence);
 }
 
+static inline unsigned __seqprop_sequence_acquire(const seqcount_t *s)
+{
+	return smp_load_acquire(&s->sequence);
+}
+
 static inline bool __seqprop_preemptible(const seqcount_t *s)
 {
 	return false;
@@ -259,6 +286,7 @@  SEQCOUNT_LOCKNAME(mutex,        struct mutex,    true,     mutex)
 #define seqprop_ptr(s)			__seqprop(s, ptr)(s)
 #define seqprop_const_ptr(s)		__seqprop(s, const_ptr)(s)
 #define seqprop_sequence(s)		__seqprop(s, sequence)(s)
+#define seqprop_sequence_acquire(s)	__seqprop(s, sequence_acquire)(s)
 #define seqprop_preemptible(s)		__seqprop(s, preemptible)(s)
 #define seqprop_assert(s)		__seqprop(s, assert)(s)
 
@@ -293,6 +321,18 @@  SEQCOUNT_LOCKNAME(mutex,        struct mutex,    true,     mutex)
  *
  * Return: count to be passed to read_seqcount_retry()
  */
+#ifdef CONFIG_ARCH_HAS_ACQUIRE_RELEASE
+#define raw_read_seqcount_begin(s)					\
+({									\
+	unsigned _seq;							\
+									\
+	while ((_seq = seqprop_sequence_acquire(s)) & 1)		\
+		cpu_relax();						\
+									\
+	kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX);			\
+	_seq;								\
+})
+#else
 #define raw_read_seqcount_begin(s)					\
 ({									\
 	unsigned _seq = __read_seqcount_begin(s);			\
@@ -300,6 +340,7 @@  SEQCOUNT_LOCKNAME(mutex,        struct mutex,    true,     mutex)
 	smp_rmb();							\
 	_seq;								\
 })
+#endif
 
 /**
  * read_seqcount_begin() - begin a seqcount_t read critical section