
[bpf-next,v1,14/22] rqspinlock: Add macros for rqspinlock usage

Message ID 20250107140004.2732830-15-memxor@gmail.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Series Resilient Queued Spin Lock

Checks

Context Check Description
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-11 success Logs for aarch64-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-12 fail Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-17 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-18 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / veristat-kernel / x86_64-gcc veristat_kernel
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / veristat-meta / x86_64-gcc veristat_meta
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-17 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-43 success Logs for x86_64-llvm-18 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-44 success Logs for x86_64-llvm-18 / veristat-meta
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-7 fail Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 fail Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 fail Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 fail Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-31 fail Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-32 fail Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-39 fail Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 fail Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-41 fail Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
netdev/series_format fail Series longer than 15 patches
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1 this patch: 1
netdev/build_tools success Errors and warnings before: 0 (+0) this patch: 0 (+0)
netdev/cc_maintainers fail 2 maintainers not CCed: arnd@arndb.de linux-arch@vger.kernel.org
netdev/build_clang success Errors and warnings before: 1 this patch: 1
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1 this patch: 1
netdev/checkpatch warning WARNING: do not add new typedefs WARNING: line length of 110 exceeds 80 columns WARNING: line length of 82 exceeds 80 columns WARNING: line length of 97 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc fail Errors and warnings before: 0 this patch: 1
netdev/source_inline success Was 0 now: 0

Commit Message

Kumar Kartikeya Dwivedi Jan. 7, 2025, 1:59 p.m. UTC
Introduce helper macros that wrap around the rqspinlock slow path and
provide an interface analogous to the raw_spin_lock API. Note that
in case of error conditions, preemption and IRQ disabling are
automatically unrolled before returning the error to the caller.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
---
 include/asm-generic/rqspinlock.h | 58 ++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)
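
For illustration, a minimal usage sketch of the API these macros provide.
The structure and function below are hypothetical; only rqspinlock_t and
the raw_res_spin_lock_*() macros come from this patch:

struct example_bucket {
	rqspinlock_t lock;
	u64 counter;
};

static int example_update(struct example_bucket *b)
{
	unsigned long flags;
	int ret;

	/*
	 * Returns 0 on success with IRQs disabled and preemption off.
	 * On failure (e.g. timeout or detected deadlock), the macro has
	 * already restored IRQs and re-enabled preemption, so the caller
	 * only needs to propagate the error.
	 */
	ret = raw_res_spin_lock_irqsave(&b->lock, flags);
	if (ret)
		return ret;

	b->counter++;		/* critical section */

	raw_res_spin_unlock_irqrestore(&b->lock, flags);
	return 0;
}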

Comments

Waiman Long Jan. 8, 2025, 4:55 p.m. UTC | #1
On 1/7/25 8:59 AM, Kumar Kartikeya Dwivedi wrote:
> Introduce helper macros that wrap around the rqspinlock slow path and
> provide an interface analogous to the raw_spin_lock API. Note that
> in case of error conditions, preemption and IRQ disabling is
> automatically unrolled before returning the error back to the caller.
>
> Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
> ---
>   include/asm-generic/rqspinlock.h | 58 ++++++++++++++++++++++++++++++++
>   1 file changed, 58 insertions(+)
>
> diff --git a/include/asm-generic/rqspinlock.h b/include/asm-generic/rqspinlock.h
> index dc436ab01471..53be8426373c 100644
> --- a/include/asm-generic/rqspinlock.h
> +++ b/include/asm-generic/rqspinlock.h
> @@ -12,8 +12,10 @@
>   #include <linux/types.h>
>   #include <vdso/time64.h>
>   #include <linux/percpu.h>
> +#include <asm/qspinlock.h>
>   
>   struct qspinlock;
> +typedef struct qspinlock rqspinlock_t;
>   
>   extern int resilient_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val, u64 timeout);
>   
> @@ -82,4 +84,60 @@ static __always_inline void release_held_lock_entry(void)
>   	this_cpu_dec(rqspinlock_held_locks.cnt);
>   }
>   
> +/**
> + * res_spin_lock - acquire a queued spinlock
> + * @lock: Pointer to queued spinlock structure
> + */
> +static __always_inline int res_spin_lock(rqspinlock_t *lock)
> +{
> +	int val = 0;
> +
> +	if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) {
> +		grab_held_lock_entry(lock);
> +		return 0;
> +	}
> +	return resilient_queued_spin_lock_slowpath(lock, val, RES_DEF_TIMEOUT);
> +}
> +
> +static __always_inline void res_spin_unlock(rqspinlock_t *lock)
> +{
> +	struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
> +
> +	if (unlikely(rqh->cnt > RES_NR_HELD))
> +		goto unlock;
> +	WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL);
> +	/*
> +	 * Release barrier, ensuring ordering. See release_held_lock_entry.
> +	 */
> +unlock:
> +	queued_spin_unlock(lock);
> +	this_cpu_dec(rqspinlock_held_locks.cnt);
> +}
> +
> +#define raw_res_spin_lock_init(lock) ({ *(lock) = (struct qspinlock)__ARCH_SPIN_LOCK_UNLOCKED; })
> +
> +#define raw_res_spin_lock(lock)                    \
> +	({                                         \
> +		int __ret;                         \
> +		preempt_disable();                 \
> +		__ret = res_spin_lock(lock);	   \
> +		if (__ret)                         \
> +			preempt_enable();          \
> +		__ret;                             \
> +	})
> +
> +#define raw_res_spin_unlock(lock) ({ res_spin_unlock(lock); preempt_enable(); })
> +
> +#define raw_res_spin_lock_irqsave(lock, flags)    \
> +	({                                        \
> +		int __ret;                        \
> +		local_irq_save(flags);            \
> +		__ret = raw_res_spin_lock(lock);  \
> +		if (__ret)                        \
> +			local_irq_restore(flags); \
> +		__ret;                            \
> +	})
> +
> +#define raw_res_spin_unlock_irqrestore(lock, flags) ({ raw_res_spin_unlock(lock); local_irq_restore(flags); })
> +
>   #endif /* __ASM_GENERIC_RQSPINLOCK_H */

Lockdep calls aren't included in the helper functions. That means all 
the *res_spin_lock*() calls will be outside the purview of lockdep. That 
also means a multi-CPU circular locking dependency involving a mixture 
of qspinlocks and rqspinlocks may not be detectable.

Cheers,
Longman
Kumar Kartikeya Dwivedi Jan. 8, 2025, 8:41 p.m. UTC | #2
On Wed, 8 Jan 2025 at 22:26, Waiman Long <llong@redhat.com> wrote:
>
> On 1/7/25 8:59 AM, Kumar Kartikeya Dwivedi wrote:
> > Introduce helper macros that wrap around the rqspinlock slow path and
> > provide an interface analogous to the raw_spin_lock API. Note that
> > in case of error conditions, preemption and IRQ disabling is
> > automatically unrolled before returning the error back to the caller.
> >
> > Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
> > [...]
>
> Lockdep calls aren't included in the helper functions. That means all
> the *res_spin_lock*() calls will be outside the purview of lockdep. That
> also means a multi-CPU circular locking dependency involving a mixture
> of qspinlocks and rqspinlocks may not be detectable.

Yes, this is true, but I am not sure whether lockdep fits well in this
case, or how to map its semantics.
Some BPF users (e.g. in patch 17) expect and rely on rqspinlock to
return errors on AA deadlocks, as nesting is possible, so we'll get
false alarms with it. Lockdep also needs to treat rqspinlock as a
trylock, since it's essentially fallible, and IIUC it skips diagnosing
in those cases.
Most of the users use rqspinlock because it is expected a deadlock may
be constructed at runtime (either due to BPF programs or by attaching
programs to the kernel), so lockdep splats will not be helpful on
debug kernels.

Say if a mix of both qspinlock and rqspinlock were involved in an ABBA
situation, as long as rqspinlock is being acquired on one of the
threads, it will still timeout even if check_deadlock fails to
establish presence of a deadlock. This will mean the qspinlock call on
the other side will make progress as long as the kernel unwinds locks
correctly on failures (by handling rqspinlock errors and releasing
held locks on the way out).
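
To make that concrete, a rough sketch with hypothetical lock names
(the exact error code returned on timeout depends on the rest of the
series):

/* Hypothetical ABBA between a regular raw_spinlock and an rqspinlock. */
static DEFINE_RAW_SPINLOCK(lock_a);
static rqspinlock_t lock_b;	/* set up with raw_res_spin_lock_init() */

static void cpu0_path(void)
{
	raw_spin_lock(&lock_a);
	/*
	 * CPU1 already holds lock_b and spins on lock_a, so this cannot
	 * succeed; it returns an error once the timeout (or the deadlock
	 * check) fires.
	 */
	if (raw_res_spin_lock(&lock_b)) {
		raw_spin_unlock(&lock_a);	/* unwind so CPU1 can progress */
		return;
	}
	raw_res_spin_unlock(&lock_b);
	raw_spin_unlock(&lock_a);
}

static void cpu1_path(void)
{
	if (raw_res_spin_lock(&lock_b))
		return;
	raw_spin_lock(&lock_a);	/* spins until CPU0 times out and unwinds */
	raw_spin_unlock(&lock_a);
	raw_res_spin_unlock(&lock_b);
}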

>
> Cheers,
> Longman
>
Waiman Long Jan. 9, 2025, 1:11 a.m. UTC | #3
On 1/8/25 3:41 PM, Kumar Kartikeya Dwivedi wrote:
> On Wed, 8 Jan 2025 at 22:26, Waiman Long <llong@redhat.com> wrote:
>> On 1/7/25 8:59 AM, Kumar Kartikeya Dwivedi wrote:
>>> Introduce helper macros that wrap around the rqspinlock slow path and
>>> provide an interface analogous to the raw_spin_lock API. Note that
>>> in case of error conditions, preemption and IRQ disabling is
>>> automatically unrolled before returning the error back to the caller.
>>>
>>> Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
>>> [...]
>> Lockdep calls aren't included in the helper functions. That means all
>> the *res_spin_lock*() calls will be outside the purview of lockdep. That
>> also means a multi-CPU circular locking dependency involving a mixture
>> of qspinlocks and rqspinlocks may not be detectable.
> Yes, this is true, but I am not sure whether lockdep fits well in this
> case, or how to map its semantics.
> Some BPF users (e.g. in patch 17) expect and rely on rqspinlock to
> return errors on AA deadlocks, as nesting is possible, so we'll get
> false alarms with it. Lockdep also needs to treat rqspinlock as a
> trylock, since it's essentially fallible, and IIUC it skips diagnosing
> in those cases.
Yes, we can certainly treat rqspinlock as a trylock.

> Most of the users use rqspinlock because it is expected a deadlock may
> be constructed at runtime (either due to BPF programs or by attaching
> programs to the kernel), so lockdep splats will not be helpful on
> debug kernels.

In most cases, lockdep will report a cyclic locking dependency
(potential deadlock) before a real deadlock happens, since an actual
deadlock requires the right combination of events happening in a
specific sequence. So lockdep can report a deadlock while the runtime
check of rqspinlock may not see it and there is no locking stall. Also,
rqspinlock will not see the other locks held in the current context.


> Say if a mix of both qspinlock and rqspinlock were involved in an ABBA
> situation, as long as rqspinlock is being acquired on one of the
> threads, it will still timeout even if check_deadlock fails to
> establish presence of a deadlock. This will mean the qspinlock call on
> the other side will make progress as long as the kernel unwinds locks
> correctly on failures (by handling rqspinlock errors and releasing
> held locks on the way out).

That is true only if the last lock to be acquired is an rqspinlock. If
all the rqspinlocks in the circular path have already been acquired, no
unwinding is possible.

That is probably not an issue with the limited rqspinlock conversion in 
this patch series. In the future when more and more locks are converted 
to use rqspinlock, this scenario may happen.

Cheers,
Longman
Alexei Starovoitov Jan. 9, 2025, 3:30 a.m. UTC | #4
On Wed, Jan 8, 2025 at 5:11 PM Waiman Long <llong@redhat.com> wrote:
>
>
> > Most of the users use rqspinlock because it is expected a deadlock may
> > be constructed at runtime (either due to BPF programs or by attaching
> > programs to the kernel), so lockdep splats will not be helpful on
> > debug kernels.
>
> In most cases, lockdep will report a cyclic locking dependency
> (potential deadlock) before a real deadlock happens as it requires the
> right combination of events happening in a specific sequence. So lockdep
> can report a deadlock while the runtime check of rqspinlock may not see
> it and there is no locking stall. Also rqspinlock will not see the other
> locks held in the current context.
>
>
> > Say if a mix of both qspinlock and rqspinlock were involved in an ABBA
> > situation, as long as rqspinlock is being acquired on one of the
> > threads, it will still timeout even if check_deadlock fails to
> > establish presence of a deadlock. This will mean the qspinlock call on
> > the other side will make progress as long as the kernel unwinds locks
> > correctly on failures (by handling rqspinlock errors and releasing
> > held locks on the way out).
>
> That is true only if the latest lock to be acquired is a rqspinlock. If
> all the rqspinlocks in the circular path have already been acquired, no
> unwinding is possible.

There is no 'last lock'. If it's not an AA deadlock, there is more
than one CPU spinning. In a hypothetical mix of rqspinlocks
and regular raw_spinlocks, at least one CPU will be spinning on an
rqspinlock and, despite the missing entries in the lock table, it will
still exit by timeout. Execution will continue and eventually
all locks will be released.

We considered annotating rqspinlock as a trylock with the
raw_spin_lock_init lock class, but the usefulness is quite limited.
It's trylock only, so it may appear in a circular dependency
only in a combination of raw_spin_locks and rqspinlocks,
which is not supposed to ever happen once we convert all bpf inner
parts to rqspinlock.
Patches 17, 18 and 19 convert the main offenders. A few remain
that need a bit more thinking.
In the end, all locks at the leaves will be rqspinlocks and
no normal locks will be taken after them
(unless NMIs are doing silly things).
And since rqspinlock is a trylock, lockdep will never complain
about rqspinlock.
Even if an NMI handler is buggy, it's unlikely that the NMI's
raw_spin_lock is in a circular dependency with an rqspinlock on the
bpf side. So rqspinlock entries would mostly add computational
overhead for the lockdep engine to filter out, and not much more.

This all assumes that rqspinlocks are limited to bpf, of course.

If rqspinlock has use cases beyond bpf then, sure, let's add
trylock lockdep annotations.

Note that if there is an actual bug on the bpf side with rqspinlock
usage, it will be reported even when lockdep is off.
This is patch 13.
Currently it's a pr_info() of the held rqspinlocks plus a stack dump,
but in the future we plan to make it more easily consumable by the
bpf side, printing into something like a special trace_pipe.
This is tbd.

> That is probably not an issue with the limited rqspinlock conversion in
> this patch series. In the future when more and more locks are converted
> to use rqspinlock, this scenario may happen.

The rqspinlock usage should be limited to bpf, and no other
normal lock should be taken afterwards.
At least that was the intent.
If folks feel that it's useful beyond bpf then we need to think harder.
The lockdep annotations are the easy part to add.
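
For reference, a rough sketch of what such a trylock-style lockdep
annotation could look like. The wrapper struct and helpers below are
hypothetical and not part of this series; dep_map initialisation
(lockdep_init_map() with a lock class key) is omitted:

struct rqspinlock_dbg {
	rqspinlock_t		lock;
	struct lockdep_map	dep_map;
};

static inline int res_spin_lock_dbg(struct rqspinlock_dbg *l)
{
	int ret = res_spin_lock(&l->lock);

	/*
	 * trylock = 1: lockdep tracks the lock as held, so later
	 * acquisitions see it in the dependency chain, but the deadlock
	 * check is skipped for this (fallible) acquisition itself.
	 */
	if (!ret)
		spin_acquire(&l->dep_map, 0, 1, _RET_IP_);
	return ret;
}

static inline void res_spin_unlock_dbg(struct rqspinlock_dbg *l)
{
	spin_release(&l->dep_map, _RET_IP_);
	res_spin_unlock(&l->lock);
}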
Waiman Long Jan. 9, 2025, 4:09 a.m. UTC | #5
On 1/8/25 10:30 PM, Alexei Starovoitov wrote:
> On Wed, Jan 8, 2025 at 5:11 PM Waiman Long <llong@redhat.com> wrote:
>>
>>> Most of the users use rqspinlock because it is expected a deadlock may
>>> be constructed at runtime (either due to BPF programs or by attaching
>>> programs to the kernel), so lockdep splats will not be helpful on
>>> debug kernels.
>> In most cases, lockdep will report a cyclic locking dependency
>> (potential deadlock) before a real deadlock happens as it requires the
>> right combination of events happening in a specific sequence. So lockdep
>> can report a deadlock while the runtime check of rqspinlock may not see
>> it and there is no locking stall. Also rqspinlock will not see the other
>> locks held in the current context.
>>
>>
>>> Say if a mix of both qspinlock and rqspinlock were involved in an ABBA
>>> situation, as long as rqspinlock is being acquired on one of the
>>> threads, it will still timeout even if check_deadlock fails to
>>> establish presence of a deadlock. This will mean the qspinlock call on
>>> the other side will make progress as long as the kernel unwinds locks
>>> correctly on failures (by handling rqspinlock errors and releasing
>>> held locks on the way out).
>> That is true only if the latest lock to be acquired is a rqspinlock. If
>> all the rqspinlocks in the circular path have already been acquired, no
>> unwinding is possible.
> There is no 'last lock'. If it's not an AA deadlock there are more
> than 1 cpu that are spinning. In a hypothetical mix of rqspinlocks
> and regular raw_spinlocks at least one cpu will be spinning on
> rqspinlock and despite missing the entries in the lock table it will
> still exit by timeout. The execution will continue and eventually
> all locks will be released.
>
> We considered annotating rqspinlock as trylock with
> raw_spin_lock_init lock class, but usefulness is quite limited.
> It's trylock only. So it may appear in a circular dependency
> only if it's a combination of raw_spin_locks and rqspinlocks
> which is not supposed to ever happen once we convert all bpf inner
> parts to rqspinlock.
> Patches 17,18,19 convert the main offenders. Few remain
> that need a bit more thinking.
> At the end all locks at the leaves will be rqspinlocks and
> no normal locks will be taken after
> (unless NMIs are doing silly things).
> And since rqspinlock is a trylock, lockdep will never complain
> on rqspinlock.
> Even if NMI handler is buggy it's unlikely that NMI's raw_spin_lock
> is in a circular dependency with rqspinlock on bpf side.
> So rqspinlock entries will be adding computational
> overhead to lockdep engine to filter out and not much more.
>
> This all assumes that rqspinlocks are limited to bpf, of course.
>
> If rqspinlock has use cases beyond bpf then, sure, let's add
> trylock lockdep annotations.
>
> Note that if there is an actual bug on bpf side with rqspinlock usage
> it will be reported even when lockdep is off.
> This is patch 13.
> Currently it's pr_info() of held rqspinlocks and dumpstack,
> but in the future we plan to make it better consumable by bpf
> side. Printing into something like a special trace_pipe.
> This is tbd.

If rqspinlock is limited to the BPF core and BPF progs, and won't
call out to other subsystems that may acquire other raw_spinlocks,
lockdep may not be needed. Once the scope is extended beyond that, we
certainly need to have lockdep enabled. Again, this has to be clearly
documented.

Cheers,
Longman

Patch

diff --git a/include/asm-generic/rqspinlock.h b/include/asm-generic/rqspinlock.h
index dc436ab01471..53be8426373c 100644
--- a/include/asm-generic/rqspinlock.h
+++ b/include/asm-generic/rqspinlock.h
@@ -12,8 +12,10 @@ 
 #include <linux/types.h>
 #include <vdso/time64.h>
 #include <linux/percpu.h>
+#include <asm/qspinlock.h>
 
 struct qspinlock;
+typedef struct qspinlock rqspinlock_t;
 
 extern int resilient_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val, u64 timeout);
 
@@ -82,4 +84,60 @@  static __always_inline void release_held_lock_entry(void)
 	this_cpu_dec(rqspinlock_held_locks.cnt);
 }
 
+/**
+ * res_spin_lock - acquire a queued spinlock
+ * @lock: Pointer to queued spinlock structure
+ */
+static __always_inline int res_spin_lock(rqspinlock_t *lock)
+{
+	int val = 0;
+
+	if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) {
+		grab_held_lock_entry(lock);
+		return 0;
+	}
+	return resilient_queued_spin_lock_slowpath(lock, val, RES_DEF_TIMEOUT);
+}
+
+static __always_inline void res_spin_unlock(rqspinlock_t *lock)
+{
+	struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
+
+	if (unlikely(rqh->cnt > RES_NR_HELD))
+		goto unlock;
+	WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL);
+	/*
+	 * Release barrier, ensuring ordering. See release_held_lock_entry.
+	 */
+unlock:
+	queued_spin_unlock(lock);
+	this_cpu_dec(rqspinlock_held_locks.cnt);
+}
+
+#define raw_res_spin_lock_init(lock) ({ *(lock) = (struct qspinlock)__ARCH_SPIN_LOCK_UNLOCKED; })
+
+#define raw_res_spin_lock(lock)                    \
+	({                                         \
+		int __ret;                         \
+		preempt_disable();                 \
+		__ret = res_spin_lock(lock);	   \
+		if (__ret)                         \
+			preempt_enable();          \
+		__ret;                             \
+	})
+
+#define raw_res_spin_unlock(lock) ({ res_spin_unlock(lock); preempt_enable(); })
+
+#define raw_res_spin_lock_irqsave(lock, flags)    \
+	({                                        \
+		int __ret;                        \
+		local_irq_save(flags);            \
+		__ret = raw_res_spin_lock(lock);  \
+		if (__ret)                        \
+			local_irq_restore(flags); \
+		__ret;                            \
+	})
+
+#define raw_res_spin_unlock_irqrestore(lock, flags) ({ raw_res_spin_unlock(lock); local_irq_restore(flags); })
+
 #endif /* __ASM_GENERIC_RQSPINLOCK_H */