
[bpf-next,v2,17/26] rqspinlock: Hardcode cond_acquire loops to asm-generic implementation

Message ID 20250206105435.2159977-18-memxor@gmail.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Series Resilient Queued Spin Lock

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
netdev/series_format fail Series longer than 15 patches
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 6 this patch: 6
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers warning 3 maintainers not CCed: longman@redhat.com boqun.feng@gmail.com mingo@redhat.com
netdev/build_clang fail Errors and warnings before: 6 this patch: 6
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 6 this patch: 6
netdev/checkpatch warning CHECK: Macro argument 'ptr' may be better as '(ptr)' to avoid precedence issues WARNING: line length of 109 exceeds 80 columns WARNING: line length of 88 exceeds 80 columns WARNING: line length of 89 exceeds 80 columns WARNING: line length of 95 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 2 this patch: 2
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / GCC BPF
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-11 success Logs for aarch64-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-12 success Logs for aarch64-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / GCC BPF
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-18 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for s390x-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-20 success Logs for s390x-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-21 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-17 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-17 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-43 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-44 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-50 success Logs for x86_64-llvm-18 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-51 success Logs for x86_64-llvm-18 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-gcc / veristat-kernel / x86_64-gcc veristat_kernel
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-gcc / veristat-meta / x86_64-gcc veristat_meta
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / GCC BPF / GCC BPF
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-45 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-49 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / GCC BPF / GCC BPF
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / GCC BPF / GCC BPF
bpf/vmtest-bpf-next-VM_Test-46 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-47 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-48 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18

Commit Message

Kumar Kartikeya Dwivedi Feb. 6, 2025, 10:54 a.m. UTC
Currently, for rqspinlock usage, the implementations of
smp_cond_load_acquire (and thus atomic_cond_read_acquire) are
susceptible to stalls on arm64, because they do not guarantee that the
conditional expression will be repeatedly invoked if the address being
loaded from is not written to by other CPUs. When support for event
streams (which unblock stuck WFE-based loops every ~100us) is absent,
we may end up being stuck forever.

This causes a problem for us, as we need to repeatedly invoke
RES_CHECK_TIMEOUT() in the spin loop to break out when the timeout
expires.

Hardcode the implementation to the asm-generic version in rqspinlock.c
until support for smp_cond_load_acquire_timewait [0] lands upstream.

  [0]: https://lore.kernel.org/lkml/20250203214911.898276-1-ankur.a.arora@oracle.com
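
For illustration, here is a simplified sketch of the arm64 WFE-based wait
(modeled on the kernel's arm64 smp_cond_load_relaxed(), not reproduced
verbatim), showing why a timeout check folded into cond_expr may never be
evaluated again once the CPU parks in WFE:

#define arm64_style_cond_load_relaxed(ptr, cond_expr) ({	\
	typeof(ptr) __PTR = (ptr);				\
	__unqual_scalar_typeof(*ptr) VAL;			\
	for (;;) {						\
		VAL = READ_ONCE(*__PTR);			\
		if (cond_expr)					\
			break;					\
		/*						\
		 * Park in WFE until *__PTR is written or an	\
		 * event (e.g. the ~100us event stream) fires;	\
		 * with no stores and no event stream,		\
		 * cond_expr never runs again.			\
		 */						\
		__cmpwait_relaxed(__PTR, VAL);			\
	}							\
	(typeof(*ptr))VAL;					\
})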

Cc: Ankur Arora <ankur.a.arora@oracle.com>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
---
 kernel/locking/rqspinlock.c | 41 ++++++++++++++++++++++++++++++++++---
 1 file changed, 38 insertions(+), 3 deletions(-)

Comments

Alexei Starovoitov Feb. 8, 2025, 1:58 a.m. UTC | #1
On Thu, Feb 6, 2025 at 2:55 AM Kumar Kartikeya Dwivedi <memxor@gmail.com> wrote:
>
> Currently, for rqspinlock usage, the implementation of
> smp_cond_load_acquire (and thus, atomic_cond_read_acquire) are
> susceptible to stalls on arm64, because they do not guarantee that the
> conditional expression will be repeatedly invoked if the address being
> loaded from is not written to by other CPUs. When support for
> event-streams is absent (which unblocks stuck WFE-based loops every
> ~100us), we may end up being stuck forever.
>
> This causes a problem for us, as we need to repeatedly invoke the
> RES_CHECK_TIMEOUT in the spin loop to break out when the timeout
> expires.
>
> Hardcode the implementation to the asm-generic version in rqspinlock.c
> until support for smp_cond_load_acquire_timewait [0] lands upstream.
>
>   [0]: https://lore.kernel.org/lkml/20250203214911.898276-1-ankur.a.arora@oracle.com
>
> Cc: Ankur Arora <ankur.a.arora@oracle.com>
> Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
> ---
>  kernel/locking/rqspinlock.c | 41 ++++++++++++++++++++++++++++++++++---
>  1 file changed, 38 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/locking/rqspinlock.c b/kernel/locking/rqspinlock.c
> index 49b4f3c75a3e..b4cceeecf29c 100644
> --- a/kernel/locking/rqspinlock.c
> +++ b/kernel/locking/rqspinlock.c
> @@ -325,6 +325,41 @@ int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock, u64 timeout)
>   */
>  static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[_Q_MAX_NODES]);
>
> +/*
> + * Hardcode smp_cond_load_acquire and atomic_cond_read_acquire implementations
> + * to the asm-generic implementation. In rqspinlock code, our conditional
> + * expression involves checking the value _and_ additionally a timeout. However,
> + * on arm64, the WFE-based implementation may never spin again if no stores
> + * occur to the locked byte in the lock word. As such, we may be stuck forever
> + * if event-stream based unblocking is not available on the platform for WFE
> + * spin loops (arch_timer_evtstrm_available).
> + *
> + * Once support for smp_cond_load_acquire_timewait [0] lands, we can drop this
> + * workaround.
> + *
> + * [0]: https://lore.kernel.org/lkml/20250203214911.898276-1-ankur.a.arora@oracle.com
> + */

It's fine as a workaround for now to avoid being blocked
on Ankur's set (which will go via different tree too),
but in v3 pls add an extra patch that demonstrates the final result
with WFE stuff working as designed without amortizing
in RES_CHECK_TIMEOUT() macro.
Guessing RES_CHECK_TIMEOUT will have some ifdef to handle that case?
Kumar Kartikeya Dwivedi Feb. 8, 2025, 3:04 a.m. UTC | #2
On Sat, 8 Feb 2025 at 02:58, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Thu, Feb 6, 2025 at 2:55 AM Kumar Kartikeya Dwivedi <memxor@gmail.com> wrote:
> >
> > Currently, for rqspinlock usage, the implementation of
> > smp_cond_load_acquire (and thus, atomic_cond_read_acquire) are
> > susceptible to stalls on arm64, because they do not guarantee that the
> > conditional expression will be repeatedly invoked if the address being
> > loaded from is not written to by other CPUs. When support for
> > event-streams is absent (which unblocks stuck WFE-based loops every
> > ~100us), we may end up being stuck forever.
> >
> > This causes a problem for us, as we need to repeatedly invoke the
> > RES_CHECK_TIMEOUT in the spin loop to break out when the timeout
> > expires.
> >
> > Hardcode the implementation to the asm-generic version in rqspinlock.c
> > until support for smp_cond_load_acquire_timewait [0] lands upstream.
> >
> >   [0]: https://lore.kernel.org/lkml/20250203214911.898276-1-ankur.a.arora@oracle.com
> >
> > Cc: Ankur Arora <ankur.a.arora@oracle.com>
> > Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
> > ---
> >  kernel/locking/rqspinlock.c | 41 ++++++++++++++++++++++++++++++++++---
> >  1 file changed, 38 insertions(+), 3 deletions(-)
> >
> > diff --git a/kernel/locking/rqspinlock.c b/kernel/locking/rqspinlock.c
> > index 49b4f3c75a3e..b4cceeecf29c 100644
> > --- a/kernel/locking/rqspinlock.c
> > +++ b/kernel/locking/rqspinlock.c
> > @@ -325,6 +325,41 @@ int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock, u64 timeout)
> >   */
> >  static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[_Q_MAX_NODES]);
> >
> > +/*
> > + * Hardcode smp_cond_load_acquire and atomic_cond_read_acquire implementations
> > + * to the asm-generic implementation. In rqspinlock code, our conditional
> > + * expression involves checking the value _and_ additionally a timeout. However,
> > + * on arm64, the WFE-based implementation may never spin again if no stores
> > + * occur to the locked byte in the lock word. As such, we may be stuck forever
> > + * if event-stream based unblocking is not available on the platform for WFE
> > + * spin loops (arch_timer_evtstrm_available).
> > + *
> > + * Once support for smp_cond_load_acquire_timewait [0] lands, we can drop this
> > + * workaround.
> > + *
> > + * [0]: https://lore.kernel.org/lkml/20250203214911.898276-1-ankur.a.arora@oracle.com
> > + */
>
> It's fine as a workaround for now to avoid being blocked
> on Ankur's set (which will go via different tree too),
> but in v3 pls add an extra patch that demonstrates the final result
> with WFE stuff working as designed without amortizing
> in RES_CHECK_TIMEOUT() macro.
> Guessing RES_CHECK_TIMEOUT will have some ifdef to handle that case?

Yes, or we can pass in the check_timeout expression directly. I'll
make the change in v3.
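
As a hypothetical sketch only (the exact smp_cond_load_acquire_timewait()
interface proposed in [0] may differ), passing the check expression directly
at the first wait site could look roughly like this, reusing the series'
RES_RESET_TIMEOUT()/RES_CHECK_TIMEOUT() helpers:

	if (val & _Q_LOCKED_MASK) {
		RES_RESET_TIMEOUT(ts);
		/*
		 * Hypothetical interface: the timeout check is a separate
		 * argument rather than folded into cond_expr, so an arm64
		 * implementation can wake up (event stream or timer) and
		 * evaluate it even when no stores hit the lock word.
		 */
		smp_cond_load_acquire_timewait(&lock->locked, !VAL,
					       RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_MASK));
	}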
Peter Zijlstra Feb. 10, 2025, 9:53 a.m. UTC | #3
On Thu, Feb 06, 2025 at 02:54:25AM -0800, Kumar Kartikeya Dwivedi wrote:
> Currently, for rqspinlock usage, the implementation of
> smp_cond_load_acquire (and thus, atomic_cond_read_acquire) are
> susceptible to stalls on arm64, because they do not guarantee that the
> conditional expression will be repeatedly invoked if the address being
> loaded from is not written to by other CPUs. When support for
> event-streams is absent (which unblocks stuck WFE-based loops every
> ~100us), we may end up being stuck forever.
> 
> This causes a problem for us, as we need to repeatedly invoke the
> RES_CHECK_TIMEOUT in the spin loop to break out when the timeout
> expires.
> 
> Hardcode the implementation to the asm-generic version in rqspinlock.c
> until support for smp_cond_load_acquire_timewait [0] lands upstream.
> 

*sigh*.. this patch should go *before* patch 8. As is that's still
horribly broken and I was WTF-ing because your 0/n changelog said you
fixed it.
Peter Zijlstra Feb. 10, 2025, 10:03 a.m. UTC | #4
On Mon, Feb 10, 2025 at 10:53:25AM +0100, Peter Zijlstra wrote:
> On Thu, Feb 06, 2025 at 02:54:25AM -0800, Kumar Kartikeya Dwivedi wrote:
> > Currently, for rqspinlock usage, the implementation of
> > smp_cond_load_acquire (and thus, atomic_cond_read_acquire) are
> > susceptible to stalls on arm64, because they do not guarantee that the
> > conditional expression will be repeatedly invoked if the address being
> > loaded from is not written to by other CPUs. When support for
> > event-streams is absent (which unblocks stuck WFE-based loops every
> > ~100us), we may end up being stuck forever.
> > 
> > This causes a problem for us, as we need to repeatedly invoke the
> > RES_CHECK_TIMEOUT in the spin loop to break out when the timeout
> > expires.
> > 
> > Hardcode the implementation to the asm-generic version in rqspinlock.c
> > until support for smp_cond_load_acquire_timewait [0] lands upstream.
> > 
> 
> *sigh*.. this patch should go *before* patch 8. As is that's still
> horribly broken and I was WTF-ing because your 0/n changelog said you
> fixed it.

And since you're doing local copies of things, why not take a local copy
of the smp_cond_load_acquire_timewait() thing?
Kumar Kartikeya Dwivedi Feb. 13, 2025, 6:15 a.m. UTC | #5
On Mon, 10 Feb 2025 at 11:03, Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, Feb 10, 2025 at 10:53:25AM +0100, Peter Zijlstra wrote:
> > On Thu, Feb 06, 2025 at 02:54:25AM -0800, Kumar Kartikeya Dwivedi wrote:
> > > Currently, for rqspinlock usage, the implementation of
> > > smp_cond_load_acquire (and thus, atomic_cond_read_acquire) are
> > > susceptible to stalls on arm64, because they do not guarantee that the
> > > conditional expression will be repeatedly invoked if the address being
> > > loaded from is not written to by other CPUs. When support for
> > > event-streams is absent (which unblocks stuck WFE-based loops every
> > > ~100us), we may end up being stuck forever.
> > >
> > > This causes a problem for us, as we need to repeatedly invoke the
> > > RES_CHECK_TIMEOUT in the spin loop to break out when the timeout
> > > expires.
> > >
> > > Hardcode the implementation to the asm-generic version in rqspinlock.c
> > > until support for smp_cond_load_acquire_timewait [0] lands upstream.
> > >
> >
> > *sigh*.. this patch should go *before* patch 8. As is that's still
> > horribly broken and I was WTF-ing because your 0/n changelog said you
> > fixed it.
>

Sorry about that, I will move it before the patch using this.

> And since you're doing local copies of things, why not take a local copy
> of the smp_cond_load_acquire_timewait() thing?

Ack, I'll address this in v3.
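
A minimal sketch of what such a local copy could look like is below; this is
not Ankur's actual implementation, just the asm-generic polling loop from
this patch extended with an explicit time-check argument:

#define res_smp_cond_load_acquire_timewait(ptr, cond_expr, time_check_expr) ({	\
	typeof(ptr) __PTR = (ptr);						\
	__unqual_scalar_typeof(*ptr) VAL;					\
	for (;;) {								\
		VAL = READ_ONCE(*__PTR);					\
		if (cond_expr)							\
			break;							\
		/*								\
		 * Evaluated on every iteration, so a deadline check here	\
		 * cannot get stuck the way a WFE-based wait can.		\
		 */								\
		if (time_check_expr)						\
			break;							\
		cpu_relax();							\
	}									\
	smp_acquire__after_ctrl_dep();						\
	(typeof(*ptr))VAL;							\
})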

Patch

diff --git a/kernel/locking/rqspinlock.c b/kernel/locking/rqspinlock.c
index 49b4f3c75a3e..b4cceeecf29c 100644
--- a/kernel/locking/rqspinlock.c
+++ b/kernel/locking/rqspinlock.c
@@ -325,6 +325,41 @@  int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock, u64 timeout)
  */
 static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[_Q_MAX_NODES]);
 
+/*
+ * Hardcode smp_cond_load_acquire and atomic_cond_read_acquire implementations
+ * to the asm-generic implementation. In rqspinlock code, our conditional
+ * expression involves checking the value _and_ additionally a timeout. However,
+ * on arm64, the WFE-based implementation may never spin again if no stores
+ * occur to the locked byte in the lock word. As such, we may be stuck forever
+ * if event-stream based unblocking is not available on the platform for WFE
+ * spin loops (arch_timer_evtstrm_available).
+ *
+ * Once support for smp_cond_load_acquire_timewait [0] lands, we can drop this
+ * workaround.
+ *
+ * [0]: https://lore.kernel.org/lkml/20250203214911.898276-1-ankur.a.arora@oracle.com
+ */
+#define res_smp_cond_load_relaxed(ptr, cond_expr) ({		\
+	typeof(ptr) __PTR = (ptr);				\
+	__unqual_scalar_typeof(*ptr) VAL;			\
+	for (;;) {						\
+		VAL = READ_ONCE(*__PTR);			\
+		if (cond_expr)					\
+			break;					\
+		cpu_relax();					\
+	}							\
+	(typeof(*ptr))VAL;					\
+})
+
+#define res_smp_cond_load_acquire(ptr, cond_expr) ({		\
+	__unqual_scalar_typeof(*ptr) _val;			\
+	_val = res_smp_cond_load_relaxed(ptr, cond_expr);	\
+	smp_acquire__after_ctrl_dep();				\
+	(typeof(*ptr))_val;					\
+})
+
+#define res_atomic_cond_read_acquire(v, c) res_smp_cond_load_acquire(&(v)->counter, (c))
+
 /**
  * resilient_queued_spin_lock_slowpath - acquire the queued spinlock
  * @lock: Pointer to queued spinlock structure
@@ -419,7 +454,7 @@  int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val,
 	 */
 	if (val & _Q_LOCKED_MASK) {
 		RES_RESET_TIMEOUT(ts);
-		smp_cond_load_acquire(&lock->locked, !VAL || RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_MASK));
+		res_smp_cond_load_acquire(&lock->locked, !VAL || RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_MASK));
 	}
 
 	if (ret) {
@@ -568,8 +603,8 @@  int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val,
 	 * does not imply a full barrier.
 	 */
 	RES_RESET_TIMEOUT(ts);
-	val = atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK) ||
-				       RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_PENDING_MASK));
+	val = res_atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK) ||
+					   RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_PENDING_MASK));
 
 waitq_timeout:
 	if (ret) {