[bpf-next,v5,3/7] locking/local_lock: Introduce local_trylock_irqsave()

Message ID: 20250115021746.34691-4-alexei.starovoitov@gmail.com (mailing list archive)
State: Changes Requested
Delegated to: BPF
Series: bpf, mm: Introduce try_alloc_pages()

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 560 this patch: 560
netdev/build_tools success Errors and warnings before: 0 (+0) this patch: 0 (+0)
netdev/cc_maintainers warning 3 maintainers not CCed: clrkwllms@kernel.org linux-rt-devel@lists.linux.dev kuba@kernel.org
netdev/build_clang success Errors and warnings before: 22695 this patch: 22695
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 16005 this patch: 16005
netdev/checkpatch warning WARNING: Macros with flow control statements should be avoided WARNING: labels should not be indented WARNING: line length of 81 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 1 this patch: 1
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-11 success Logs for aarch64-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-18 success Logs for s390x-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-19 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-gcc / veristat-kernel / x86_64-gcc veristat_kernel
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-gcc / veristat-meta / x86_64-gcc veristat_meta
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-17 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-17 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-43 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-44 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-45 success Logs for x86_64-llvm-18 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-46 success Logs for x86_64-llvm-18 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc

Commit Message

Alexei Starovoitov Jan. 15, 2025, 2:17 a.m. UTC
From: Alexei Starovoitov <ast@kernel.org>

Similar to local_lock_irqsave() introduce local_trylock_irqsave().
This is inspired by 'struct local_tryirq_lock' in:
https://lore.kernel.org/all/20241112-slub-percpu-caches-v1-5-ddc0bdc27e05@suse.cz/

In PREEMPT_RT, use spin_trylock when not in hard IRQ and not in NMI,
and fail instantly otherwise, since spin_trylock is not safe to use
from hard IRQ context due to PI issues.

In !PREEMPT_RT, use a simple 'active' flag to prevent IRQs or NMIs
from reentering the locked region.

Note that there is no need to use local_inc for the active flag.
If an IRQ handler grabs the same local_lock after READ_ONCE(lock->active)
has already completed, it has to unlock it before returning.
The same applies to an NMI handler, so there is strict nesting of scopes.
It's a per-CPU lock; multiple CPUs do not access it in parallel.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/local_lock.h          |  9 ++++
 include/linux/local_lock_internal.h | 76 ++++++++++++++++++++++++++---
 2 files changed, 78 insertions(+), 7 deletions(-)
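
For context, a minimal usage sketch of the new API (all names below are
made up for illustration and are not part of the patch): take the per-CPU
lock opportunistically and fall back when the trylock fails, e.g. in an
NMI that interrupted a section already holding the lock, or in hardirq/NMI
on PREEMPT_RT.

#include <linux/local_lock.h>
#include <linux/percpu.h>

struct my_pcp_cache {
	local_lock_t lock;
	unsigned int count;
};

static DEFINE_PER_CPU(struct my_pcp_cache, my_cache) = {
	.lock = INIT_LOCAL_LOCK(lock),
};

/* Hypothetical fast path: succeeds only when the lock is not already
 * held on this CPU; the caller must have a slow/deferred fallback.
 */
static bool my_cache_add_fast(unsigned int n)
{
	unsigned long flags;

	if (!local_trylock_irqsave(&my_cache.lock, flags))
		return false;

	this_cpu_ptr(&my_cache)->count += n;

	local_unlock_irqrestore(&my_cache.lock, flags);
	return true;
}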

Comments

Alexei Starovoitov Jan. 15, 2025, 2:23 a.m. UTC | #1
On Tue, Jan 14, 2025 at 6:18 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> From: Alexei Starovoitov <ast@kernel.org>
>
> Similar to local_lock_irqsave() introduce local_trylock_irqsave().
> This is inspired by 'struct local_tryirq_lock' in:
> https://lore.kernel.org/all/20241112-slub-percpu-caches-v1-5-ddc0bdc27e05@suse.cz/
>
> Use spin_trylock in PREEMPT_RT when not in hard IRQ and not in NMI
> and fail instantly otherwise, since spin_trylock is not safe from IRQ
> due to PI issues.
>
> In !PREEMPT_RT use simple active flag to prevent IRQs or NMIs
> reentering locked region.
>
> Note there is no need to use local_inc for active flag.
> If IRQ handler grabs the same local_lock after READ_ONCE(lock->active)
> already completed it has to unlock it before returning.
> Similar with NMI handler. So there is a strict nesting of scopes.
> It's a per cpu lock. Multiple cpus do not access it in parallel.
>
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> ---
>  include/linux/local_lock.h          |  9 ++++
>  include/linux/local_lock_internal.h | 76 ++++++++++++++++++++++++++---
>  2 files changed, 78 insertions(+), 7 deletions(-)
>
> diff --git a/include/linux/local_lock.h b/include/linux/local_lock.h
> index 091dc0b6bdfb..84ee560c4f51 100644
> --- a/include/linux/local_lock.h
> +++ b/include/linux/local_lock.h
> @@ -30,6 +30,15 @@
>  #define local_lock_irqsave(lock, flags)                                \
>         __local_lock_irqsave(lock, flags)
>
> +/**
> + * local_trylock_irqsave - Try to acquire a per CPU local lock, save and disable
> + *                        interrupts. Always fails in RT when in_hardirq or NMI.
> + * @lock:      The lock variable
> + * @flags:     Storage for interrupt flags
> + */
> +#define local_trylock_irqsave(lock, flags)                     \
> +       __local_trylock_irqsave(lock, flags)
> +
>  /**
>   * local_unlock - Release a per CPU local lock
>   * @lock:      The lock variable
> diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h
> index 8dd71fbbb6d2..93672127c73d 100644
> --- a/include/linux/local_lock_internal.h
> +++ b/include/linux/local_lock_internal.h
> @@ -9,6 +9,7 @@
>  #ifndef CONFIG_PREEMPT_RT
>
>  typedef struct {
> +       int active;
>  #ifdef CONFIG_DEBUG_LOCK_ALLOC
>         struct lockdep_map      dep_map;
>         struct task_struct      *owner;
> @@ -22,7 +23,7 @@ typedef struct {
>                 .wait_type_inner = LD_WAIT_CONFIG,      \
>                 .lock_type = LD_LOCK_PERCPU,            \
>         },                                              \
> -       .owner = NULL,
> +       .owner = NULL, .active = 0

Sebastian,

could you please review/ack this patch ?

I looked through all current users of local_lock and the extra active
flag will be in the noise in all cases.
So I don't see any runtime/memory concerns,
while the extra lockdep_assert to catch reentrance issues
in RT and !RT will certainly help long term.
Sebastian Andrzej Siewior Jan. 15, 2025, 7:22 a.m. UTC | #2
On 2025-01-14 18:23:21 [-0800], Alexei Starovoitov wrote:
> 
> Sebastian,
> 
> could you please review/ack this patch ?

I will look at this (and the other things sent) tomorrow, or FRI at the
latest. 

Sebastian
Vlastimil Babka Jan. 15, 2025, 2:22 p.m. UTC | #3
On 1/15/25 03:17, Alexei Starovoitov wrote:
> From: Alexei Starovoitov <ast@kernel.org>
> 
> Similar to local_lock_irqsave() introduce local_trylock_irqsave().
> This is inspired by 'struct local_tryirq_lock' in:
> https://lore.kernel.org/all/20241112-slub-percpu-caches-v1-5-ddc0bdc27e05@suse.cz/

Let's see what locking maintainers say about adding the flag to every
local_lock even if it doesn't use the trylock operation.

> Use spin_trylock in PREEMPT_RT when not in hard IRQ and not in NMI
> and fail instantly otherwise, since spin_trylock is not safe from IRQ
> due to PI issues.
> 
> In !PREEMPT_RT use simple active flag to prevent IRQs or NMIs
> reentering locked region.
> 
> Note there is no need to use local_inc for active flag.
> If IRQ handler grabs the same local_lock after READ_ONCE(lock->active)

IRQ handler AFAICS can't do that since __local_trylock_irqsave() is the only
trylock operation and it still does local_irq_save(). Could you have added a
__local_trylock() operation instead? Guess not for your use case because I
see in Patch 4 you want to use the local_unlock_irqrestore() universally for
sections that are earlier locked either by local_trylock_irqsave() or
local_lock_irqsave(). But I wonder if those can be changed (will reply on
that patch).

The motivation in my case was to avoid the overhead of irqsave/restore on
!PREEMPT_RT. If there was a separate "flavor" of local_lock that would
support the trylock operation, I think it would not need the _irq and
_irqsave variants at all, and it would also avoid adding the "active" flag
on !PREEMPT_RT. Meanwhile on PREEMPT_RT, a single implementation could
likely handle both flavors with no downsides?

> already completed it has to unlock it before returning.
> Similar with NMI handler. So there is a strict nesting of scopes.
> It's a per cpu lock. Multiple cpus do not access it in parallel.
> 
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> ---
>  include/linux/local_lock.h          |  9 ++++
>  include/linux/local_lock_internal.h | 76 ++++++++++++++++++++++++++---
>  2 files changed, 78 insertions(+), 7 deletions(-)
> 
> diff --git a/include/linux/local_lock.h b/include/linux/local_lock.h
> index 091dc0b6bdfb..84ee560c4f51 100644
> --- a/include/linux/local_lock.h
> +++ b/include/linux/local_lock.h
> @@ -30,6 +30,15 @@
>  #define local_lock_irqsave(lock, flags)				\
>  	__local_lock_irqsave(lock, flags)
>  
> +/**
> + * local_trylock_irqsave - Try to acquire a per CPU local lock, save and disable
> + *			   interrupts. Always fails in RT when in_hardirq or NMI.
> + * @lock:	The lock variable
> + * @flags:	Storage for interrupt flags
> + */
> +#define local_trylock_irqsave(lock, flags)			\
> +	__local_trylock_irqsave(lock, flags)
> +
>  /**
>   * local_unlock - Release a per CPU local lock
>   * @lock:	The lock variable
> diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h
> index 8dd71fbbb6d2..93672127c73d 100644
> --- a/include/linux/local_lock_internal.h
> +++ b/include/linux/local_lock_internal.h
> @@ -9,6 +9,7 @@
>  #ifndef CONFIG_PREEMPT_RT
>  
>  typedef struct {
> +	int active;
>  #ifdef CONFIG_DEBUG_LOCK_ALLOC
>  	struct lockdep_map	dep_map;
>  	struct task_struct	*owner;
> @@ -22,7 +23,7 @@ typedef struct {
>  		.wait_type_inner = LD_WAIT_CONFIG,	\
>  		.lock_type = LD_LOCK_PERCPU,		\
>  	},						\
> -	.owner = NULL,
> +	.owner = NULL, .active = 0
>  
>  static inline void local_lock_acquire(local_lock_t *l)
>  {
> @@ -31,6 +32,13 @@ static inline void local_lock_acquire(local_lock_t *l)
>  	l->owner = current;
>  }
>  
> +static inline void local_trylock_acquire(local_lock_t *l)
> +{
> +	lock_map_acquire_try(&l->dep_map);
> +	DEBUG_LOCKS_WARN_ON(l->owner);
> +	l->owner = current;
> +}
> +
>  static inline void local_lock_release(local_lock_t *l)
>  {
>  	DEBUG_LOCKS_WARN_ON(l->owner != current);
> @@ -45,6 +53,7 @@ static inline void local_lock_debug_init(local_lock_t *l)
>  #else /* CONFIG_DEBUG_LOCK_ALLOC */
>  # define LOCAL_LOCK_DEBUG_INIT(lockname)
>  static inline void local_lock_acquire(local_lock_t *l) { }
> +static inline void local_trylock_acquire(local_lock_t *l) { }
>  static inline void local_lock_release(local_lock_t *l) { }
>  static inline void local_lock_debug_init(local_lock_t *l) { }
>  #endif /* !CONFIG_DEBUG_LOCK_ALLOC */
> @@ -60,6 +69,7 @@ do {								\
>  			      0, LD_WAIT_CONFIG, LD_WAIT_INV,	\
>  			      LD_LOCK_PERCPU);			\
>  	local_lock_debug_init(lock);				\
> +	(lock)->active = 0;					\
>  } while (0)
>  
>  #define __spinlock_nested_bh_init(lock)				\
> @@ -75,37 +85,73 @@ do {								\
>  
>  #define __local_lock(lock)					\
>  	do {							\
> +		local_lock_t *l;				\
>  		preempt_disable();				\
> -		local_lock_acquire(this_cpu_ptr(lock));		\
> +		l = this_cpu_ptr(lock);				\
> +		lockdep_assert(l->active == 0);			\
> +		WRITE_ONCE(l->active, 1);			\
> +		local_lock_acquire(l);				\
>  	} while (0)
>  
>  #define __local_lock_irq(lock)					\
>  	do {							\
> +		local_lock_t *l;				\
>  		local_irq_disable();				\
> -		local_lock_acquire(this_cpu_ptr(lock));		\
> +		l = this_cpu_ptr(lock);				\
> +		lockdep_assert(l->active == 0);			\
> +		WRITE_ONCE(l->active, 1);			\
> +		local_lock_acquire(l);				\
>  	} while (0)
>  
>  #define __local_lock_irqsave(lock, flags)			\
>  	do {							\
> +		local_lock_t *l;				\
>  		local_irq_save(flags);				\
> -		local_lock_acquire(this_cpu_ptr(lock));		\
> +		l = this_cpu_ptr(lock);				\
> +		lockdep_assert(l->active == 0);			\
> +		WRITE_ONCE(l->active, 1);			\
> +		local_lock_acquire(l);				\
>  	} while (0)
>  
> +#define __local_trylock_irqsave(lock, flags)			\
> +	({							\
> +		local_lock_t *l;				\
> +		local_irq_save(flags);				\
> +		l = this_cpu_ptr(lock);				\
> +		if (READ_ONCE(l->active) == 1) {		\
> +			local_irq_restore(flags);		\
> +			l = NULL;				\
> +		} else {					\
> +			WRITE_ONCE(l->active, 1);		\
> +			local_trylock_acquire(l);		\
> +		}						\
> +		!!l;						\
> +	})
> +
>  #define __local_unlock(lock)					\
>  	do {							\
> -		local_lock_release(this_cpu_ptr(lock));		\
> +		local_lock_t *l = this_cpu_ptr(lock);		\
> +		lockdep_assert(l->active == 1);			\
> +		WRITE_ONCE(l->active, 0);			\
> +		local_lock_release(l);				\
>  		preempt_enable();				\
>  	} while (0)
>  
>  #define __local_unlock_irq(lock)				\
>  	do {							\
> -		local_lock_release(this_cpu_ptr(lock));		\
> +		local_lock_t *l = this_cpu_ptr(lock);		\
> +		lockdep_assert(l->active == 1);			\
> +		WRITE_ONCE(l->active, 0);			\
> +		local_lock_release(l);				\
>  		local_irq_enable();				\
>  	} while (0)
>  
>  #define __local_unlock_irqrestore(lock, flags)			\
>  	do {							\
> -		local_lock_release(this_cpu_ptr(lock));		\
> +		local_lock_t *l = this_cpu_ptr(lock);		\
> +		lockdep_assert(l->active == 1);			\
> +		WRITE_ONCE(l->active, 0);			\
> +		local_lock_release(l);				\
>  		local_irq_restore(flags);			\
>  	} while (0)
>  
> @@ -148,6 +194,22 @@ typedef spinlock_t local_lock_t;
>  		__local_lock(lock);				\
>  	} while (0)
>  
> +#define __local_trylock_irqsave(lock, flags)			\
> +	({							\
> +		__label__ out;					\
> +		int ret = 0;					\
> +		typecheck(unsigned long, flags);		\
> +		flags = 0;					\
> +		if (in_nmi() || in_hardirq())			\
> +			goto out;				\
> +		migrate_disable();				\
> +		ret = spin_trylock(this_cpu_ptr((lock)));	\
> +		if (!ret)					\
> +			migrate_enable();			\
> +	out:							\
> +		ret;						\
> +	})
> +
>  #define __local_unlock(__lock)					\
>  	do {							\
>  		spin_unlock(this_cpu_ptr((__lock)));		\
Alexei Starovoitov Jan. 16, 2025, 2:20 a.m. UTC | #4
On Wed, Jan 15, 2025 at 6:22 AM Vlastimil Babka <vbabka@suse.cz> wrote:
>
> On 1/15/25 03:17, Alexei Starovoitov wrote:
> > From: Alexei Starovoitov <ast@kernel.org>
> >
> > Similar to local_lock_irqsave() introduce local_trylock_irqsave().
> > This is inspired by 'struct local_tryirq_lock' in:
> > https://lore.kernel.org/all/20241112-slub-percpu-caches-v1-5-ddc0bdc27e05@suse.cz/
>
> Let's see what locking maintainers say about adding the flag to every
> local_lock even if it doesn't use the trylock operation.

As I replied to Sebastian, there are very few users, and the hot users
of local_lock, like networking, use it in RT only.
local_lock_nested_bh() stays a true nop in !RT.
This patch doesn't change that.
So the active flag on !RT is not on a critical path
(at least as far as I could study the code).

> > Use spin_trylock in PREEMPT_RT when not in hard IRQ and not in NMI
> > and fail instantly otherwise, since spin_trylock is not safe from IRQ
> > due to PI issues.
> >
> > In !PREEMPT_RT use simple active flag to prevent IRQs or NMIs
> > reentering locked region.
> >
> > Note there is no need to use local_inc for active flag.
> > If IRQ handler grabs the same local_lock after READ_ONCE(lock->active)
>
> IRQ handler AFAICS can't do that since __local_trylock_irqsave() is the only
> trylock operation and it still does local_irq_save(). Could you have added a
> __local_trylock() operation instead? Guess not for your use case because I
> see in Patch 4 you want to use the local_unlock_irqrestore() universally for
> sections that are earlier locked either by local_trylock_irqsave() or
> local_lock_irqsave(). But I wonder if those can be changed (will reply on
> that patch).

Pasting your reply from patch 4 here to reply to both...

Yes, I'm only adding local_trylock_irqsave() and not local_trylock(),
since memcg and slab are using local_lock_irqsave() in numerous
places, and adding local_trylock() here would be just dead code.

> The motivation in my case was to avoid the overhead of irqsave/restore on
> !PREEMPT_RT. If there was a separate "flavor" of local_lock that would
> support the trylock operation, I think it would not need the _irq and
> _irqsave variants at all, and it would also avoid adding the "active" flag
> on !PREEMPT_RT. Meanwhile on PREEMPT_RT, a single implementation could
> likely handle both flavors with no downsides?

I agree with the desire to use local_lock() in slab and memcg long term,
but this is something that definitely should _not_ be done in this patch set.
try_alloc_page() needs to learn to walk before we teach it to run.

> The last line can practically only happen on RT, right? On non-RT irqsave
> means we could only fail the trylock from a nmi and then we should have
> gfp_flags that don't allow spinning.

Correct.

> So suppose we used local_trylock(), local_lock() and local_unlock()  (no
> _irqsave) instead, as I mentioned in reply to 3/7. The RT implementation
> would be AFAICS the same. On !RT the trylock could now fail from an IRQ
> context in addition to NMI context, but that should also have a gfp_mask
> that does not allow spinning, so it should work fine.

Also correct.

> It would however mean converting all users of the lock, i.e. also
> consume_obj_stock() etc., but AFAIU that will be necessary anyway to have
> opportunistic slab allocations?

Exactly. And as soon as we do that we start to conflict between trees.
But the main concern is that a change like that needs to be
thoroughly analyzed.
I'm not convinced that stock_lock as preempt_disable() will work for memcg.

People do GFP_NOWAIT allocations from IRQ and assume they work.
If memcg's local_irqsave (aka local_lock_irqsave) is replaced
with preempt_disable, an IRQ can happen in the middle of a memcg
counter update,
so ALL of the stock_lock operations would have to local_TRYlock()
with a fallback in case a kmalloc(GFP_NOWAIT) from IRQ happens to reenter.
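
A rough sketch of what that would mean for one such operation (simplified
and hypothetical, not the actual mm/memcontrol.c code; the field names are
assumed): the fast path takes the lock with a trylock and the caller falls
back to the slow charge path when it fails.

static bool consume_stock_sketch(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	struct memcg_stock_pcp *stock;
	unsigned long flags;
	bool ret = false;

	/* Fails if this CPU is already inside a stock_lock section,
	 * e.g. a kmalloc(GFP_NOWAIT) from IRQ/NMI reentered the code.
	 */
	if (!local_trylock_irqsave(&memcg_stock.stock_lock, flags))
		return false;

	stock = this_cpu_ptr(&memcg_stock);
	if (memcg == READ_ONCE(stock->cached) && stock->nr_pages >= nr_pages) {
		stock->nr_pages -= nr_pages;
		ret = true;
	}

	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
	return ret;
}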

Same issue with slub.
local_lock_irqsave(&s->cpu_slab->lock)
as an irq-disabled region works for kmalloc(GFP_NOWAIT) users.
If it becomes preempt_disable, I suspect it will break things.

For example, perf and bpf use irq_work to do wakeups and allocations.
slub's s->cpu_slab protected by preempt_disable
would mean that 'perf record -a' could trigger in the
middle of the slab partial/deactivate-slab logic and
perf would be doing wakeups right there. I suspect it would be sad.
Right now the irq_work handler is called only
after the last local_unlock_irqrestore enables irqs.

So replacing local_lock_irqsave in slab and memcg with
local_lock is not something to take lightly.
Sebastian Andrzej Siewior Jan. 17, 2025, 8:33 p.m. UTC | #5
On 2025-01-14 18:17:42 [-0800], Alexei Starovoitov wrote:
> diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h
> index 8dd71fbbb6d2..93672127c73d 100644
> --- a/include/linux/local_lock_internal.h
> +++ b/include/linux/local_lock_internal.h
> @@ -75,37 +85,73 @@ do {								\
>  
>  #define __local_lock(lock)					\
>  	do {							\
> +		local_lock_t *l;				\
>  		preempt_disable();				\
> -		local_lock_acquire(this_cpu_ptr(lock));		\
> +		l = this_cpu_ptr(lock);				\
> +		lockdep_assert(l->active == 0);			\
> +		WRITE_ONCE(l->active, 1);			\
> +		local_lock_acquire(l);				\
>  	} while (0)
> +#define __local_trylock_irqsave(lock, flags)			\
> +	({							\
> +		local_lock_t *l;				\
> +		local_irq_save(flags);				\
> +		l = this_cpu_ptr(lock);				\
> +		if (READ_ONCE(l->active) == 1) {		\
> +			local_irq_restore(flags);		\
> +			l = NULL;				\
> +		} else {					\
> +			WRITE_ONCE(l->active, 1);		\
> +			local_trylock_acquire(l);		\
> +		}						\
> +		!!l;						\
> +	})
> +

Part of the selling point for local_lock_t was that it does not affect
!PREEMPT_RT builds. By adding `active' you extend every data structure
and you add an extra write on every local_lock(). It was meant to
replace preempt_disable()/local_irq_save() based locking with something
that actually does locking on PREEMPT_RT without risking my life once
people with pitchforks come to talk about the new locking :)

I admire that you try to make PREEMPT_RT and !PREEMPT_RT similar in a
way that both detect recursive locking, which is not meant to be supported.

Realistically speaking, as of today we don't have any recursive-lock
detection other than lockdep. So it should be enough, given that the bots
use it often and hopefully local testing does too.
Your assert in local_lock() does not work without lockdep. It will only
make local_trylock_irqsave() detect recursion, and with lockdep enabled
it leads to two splats in local_lock() (one from the assert and the second
from lockdep).

I would say you could get rid of the `active' field and solely rely on
lockdep and the owner field. So __local_trylock_irqsave() could maybe
use local_trylock_acquire() to conditionally acquire the lock if `owner'
is NULL.

This makes it possible to have recursive code without lockdep, but with
lockdep enabled the testcase should fail if it relies on recursion.
Other than that, I don't see any advantage. Would that work?
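
A rough sketch of that alternative (illustrative only, assuming
CONFIG_DEBUG_LOCK_ALLOC so that the 'owner' field exists; without it there
would be no runtime recursion check at all, which is the limitation
discussed in the follow-ups):

#define __local_trylock_irqsave_sketch(lock, flags)		\
	({							\
		local_lock_t *l;				\
		bool acquired = true;				\
		local_irq_save(flags);				\
		l = this_cpu_ptr(lock);				\
		if (READ_ONCE(l->owner)) {			\
			local_irq_restore(flags);		\
			acquired = false;			\
		} else {					\
			local_trylock_acquire(l);		\
		}						\
		acquired;					\
	})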

Sebastian
Vlastimil Babka Jan. 21, 2025, 3:59 p.m. UTC | #6
On 1/17/25 9:33 PM, Sebastian Andrzej Siewior wrote:
> On 2025-01-14 18:17:42 [-0800], Alexei Starovoitov wrote:
>> diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h
>> index 8dd71fbbb6d2..93672127c73d 100644
>> --- a/include/linux/local_lock_internal.h
>> +++ b/include/linux/local_lock_internal.h
>> @@ -75,37 +85,73 @@ do {								\
>>  
>>  #define __local_lock(lock)					\
>>  	do {							\
>> +		local_lock_t *l;				\
>>  		preempt_disable();				\
>> -		local_lock_acquire(this_cpu_ptr(lock));		\
>> +		l = this_cpu_ptr(lock);				\
>> +		lockdep_assert(l->active == 0);			\
>> +		WRITE_ONCE(l->active, 1);			\
>> +		local_lock_acquire(l);				\
>>  	} while (0)
> 
> …
> 
>> +#define __local_trylock_irqsave(lock, flags)			\
>> +	({							\
>> +		local_lock_t *l;				\
>> +		local_irq_save(flags);				\
>> +		l = this_cpu_ptr(lock);				\
>> +		if (READ_ONCE(l->active) == 1) {		\
>> +			local_irq_restore(flags);		\
>> +			l = NULL;				\
>> +		} else {					\
>> +			WRITE_ONCE(l->active, 1);		\
>> +			local_trylock_acquire(l);		\
>> +		}						\
>> +		!!l;						\
>> +	})
>> +
> 
> Part of the selling for local_lock_t was that it does not affect
> !PREEMPT_RT builds. By adding `active' you extend every data structure
> and you have an extra write on every local_lock(). It was meant to
> replace preempt_disable()/ local_irq_save() based locking with something
> that actually does locking on PREEMPT_RT without risking my life once
> people with pitchforks come talk about the new locking :)
> 
> I admire that you try to make PREEMPT_RT and !PREEMPT_RT similar in a
> way that both detect recursive locking which not meant to be supported. 
> 
> Realistically speaking as of today we don't have any recursive lock
> detection other than lockdep. So it should be enough given that the bots
> use it often and hopefully local testing.
> Your assert in local_lock() does not work without lockdep. It will only
> make local_trylock_irqsave() detect recursion and lead to two splats
> with lockdep enabled in local_lock() (one from the assert and the second
> from lockdep).
> 
> I would say you could get rid of the `active' field and solely rely on
> lockdep and the owner field. So __local_trylock_irqsave() could maybe
> use local_trylock_acquire() to conditionally acquire the lock if `owner'
> is NULL.
> 
> This makes it possible to have recursive code without lockdep, but with
> lockdep enabled the testcase should fail if it relies on recursion.
> Other than that, I don't see any advantage. Would that work?

I don't think it would work, or am I missing something? The goal is to
allow the operation (alloc, free) to opportunistically succeed in e.g.
nmi context, but only if we didn't interrupt anything that holds the
lock. Otherwise we must allow for failure - hence trylock.
(a possible extension that I mentioned is to also stop doing irqsave to
avoid its overhead and thus also operations from an irq context would be
opportunistic)
But if we detect the "trylock must fail" cases only using lockdep, we'll
deadlock without lockdep. So e.g. the "active" flag has to be there?

So yes this goes beyond the original purpose of local_lock. Do you think
it should be a different lock type then, which would mean the other
users of the current local_lock that don't want the opportunistic nesting
via trylock would not incur the "active" flag overhead?

AFAICS the RT implementation of local_lock could then be shared for both
of these types, but I might be missing some nuance there.

> Sebastian
Sebastian Andrzej Siewior Jan. 21, 2025, 4:43 p.m. UTC | #7
On 2025-01-21 16:59:40 [+0100], Vlastimil Babka wrote:
> I don't think it would work, or am I missing something? The goal is to
> allow the operation (alloc, free) to opportunistically succeed in e.g.
> nmi context, but only if we didn't interrupt anything that holds the
> lock. Otherwise we must allow for failure - hence trylock.
> (a possible extension that I mentioned is to also stop doing irqsave to
> avoid its overhead and thus also operations from an irq context would be
> oportunistic)
> But if we detect the "trylock must fail" cases only using lockdep, we'll
> deadlock without lockdep. So e.g. the "active" flag has to be there?

You are right. I noticed that myself but didn't get to reply…

> So yes this goes beyond the original purpose of local_lock. Do you think
> it should be a different lock type then, which would mean the other
> users of current local_lock that don't want the opportunistic nesting
> via trylock, would not inflict the "active" flag overhead?
> 
> AFAICS the RT implementation of local_lock could then be shared for both
> of these types, but I might be missing some nuance there.

I was thinking about this over the weekend. This implementation
extends the data structure by 4 bytes and has a mandatory read/write
on every lock/unlock operation. This is what makes it a bit different
from the original.

If the local_lock_t is replaced with spinlock_t, then the data structure
is still extended by four bytes (assuming no lockdep) and we have a
mandatory read/write operation. The whole thing still does not work on
PREEMPT_RT but it isn't much different from what we have now. This is
kind of my favorite. It could be further optimized to avoid the atomic
operation, given it is always local per-CPU memory. Maybe a
local_spinlock_t :)
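
A hypothetical sketch of that idea (names made up, not in any tree): the
spinlock's test-and-set plays the role of the 'active' flag, so a trylock
from NMI context that interrupted a holder fails on its own. The unlock
side would pair spin_unlock() with local_irq_restore().

typedef struct {
	spinlock_t slock;
} local_spinlock_t;

#define local_spin_trylock_irqsave(lock, flags)			\
	({							\
		bool __ret;					\
		local_irq_save(flags);				\
		__ret = spin_trylock(&this_cpu_ptr(lock)->slock); \
		if (!__ret)					\
			local_irq_restore(flags);		\
		__ret;						\
	})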

Sebastian
Alexei Starovoitov Jan. 22, 2025, 1:35 a.m. UTC | #8
On Tue, Jan 21, 2025 at 8:43 AM Sebastian Andrzej Siewior
<bigeasy@linutronix.de> wrote:
>
> On 2025-01-21 16:59:40 [+0100], Vlastimil Babka wrote:
> > I don't think it would work, or am I missing something? The goal is to
> > allow the operation (alloc, free) to opportunistically succeed in e.g.
> > nmi context, but only if we didn't interrupt anything that holds the
> > lock. Otherwise we must allow for failure - hence trylock.
> > (a possible extension that I mentioned is to also stop doing irqsave to
> > avoid its overhead and thus also operations from an irq context would be
> > oportunistic)
> > But if we detect the "trylock must fail" cases only using lockdep, we'll
> > deadlock without lockdep. So e.g. the "active" flag has to be there?
>
> You are right. I noticed that myself but didn't get to reply…
>
> > So yes this goes beyond the original purpose of local_lock. Do you think
> > it should be a different lock type then, which would mean the other
> > users of current local_lock that don't want the opportunistic nesting
> > via trylock, would not inflict the "active" flag overhead?
> >
> > AFAICS the RT implementation of local_lock could then be shared for both
> > of these types, but I might be missing some nuance there.
>
> I was thinking about this over the weekend and this implementation
> extends the data structure by 4 bytes and has this mandatory read/ write
> on every lock/ unlock operation. This is what makes it a bit different
> than the original.
>
> If the local_lock_t is replaced with spinlock_t then the data structure
> is still extended by four bytes (assuming no lockdep) and we have a
> mandatory read/ write operation. The whole thing still does not work on
> PREEMPT_RT but it isn't much different from what we have now. This is
> kind of my favorite. This could be further optimized to avoid the atomic
> operation given it is always local per-CPU memory. Maybe a
> local_spinlock_t :)

I think the concern of people with pitchforks is overblown.
These are the only users of local_lock_t:

drivers/block/zram/zcomp.h:     local_lock_t lock;
drivers/char/random.c:  local_lock_t lock;
drivers/connector/cn_proc.c:    local_lock_t lock;
drivers/md/raid5.h:     local_lock_t    lock;
kernel/softirq.c:       local_lock_t    lock;
mm/memcontrol.c:        local_lock_t stock_lock;
mm/mlock.c:     local_lock_t lock;
mm/slub.c:      local_lock_t lock;
mm/swap.c:      local_lock_t lock;
mm/swap.c:      local_lock_t lock_irq;
mm/zsmalloc.c:  local_lock_t lock;
net/bridge/br_netfilter_hooks.c:        local_lock_t bh_lock;
net/core/skbuff.c:      local_lock_t bh_lock;
net/ipv4/tcp_sigpool.c: local_lock_t bh_lock;

Networking is the one that really cares about performance,
and there 'int active' adds 4 bytes but no run-time overhead,
since it's using local_lock_nested_bh(), which doesn't touch 'active'.

memcontrol.c and slub.c are the ones that need the new trylock
logic with the 'active' field to protect from NMI on !RT.
They will change to the new local_trylock_t anyway.

What's left is not perf critical. A single WRITE_ONCE in lock
and another WRITE_ONCE in unlock are really in the noise.

Hence I went with the simple approach you see in this patch.
I can go with a new local_trylock_t and a _Generic() trick.
The patch would look like this:

diff --git a/include/linux/local_lock_internal.h
b/include/linux/local_lock_internal.h
index 8dd71fbbb6d2..ed4623e0c71a 100644
--- a/include/linux/local_lock_internal.h
+++ b/include/linux/local_lock_internal.h
@@ -15,6 +15,14 @@ typedef struct {
 #endif
 } local_lock_t;

+typedef struct {
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+       struct lockdep_map      dep_map;
+       struct task_struct      *owner;
+#endif
+       int active;
+} local_trylock_t;

 #define __local_lock_irqsave(lock, flags)                      \
        do {                                                    \
+               local_lock_t *l;                                \
                local_irq_save(flags);                          \
-               local_lock_acquire(this_cpu_ptr(lock));         \
+               l = (local_lock_t *)this_cpu_ptr(lock);         \
+               _Generic((lock),                                \
+                       local_trylock_t *:                      \
+                       ({                                      \
+                       lockdep_assert(((local_trylock_t *)l)->active == 0); \
+                        WRITE_ONCE(((local_trylock_t *)l)->active, 1); \
+                       }),                                     \
+                       default:(void)0);                       \
+               local_lock_acquire(l);                          \
        } while (0)

+
+#define __local_trylock_irqsave(lock, flags)                   \
+       ({                                                      \
+               local_trylock_t *l;                             \
+               local_irq_save(flags);                          \
+               l = this_cpu_ptr(lock);                         \
+               if (READ_ONCE(l->active) == 1) {                \
+                       local_irq_restore(flags);               \
+                       l = NULL;                               \
+               } else {                                        \
+                       WRITE_ONCE(l->active, 1);               \
+                       local_trylock_acquire((local_lock_t *)l); \
+               }                                               \
+               !!l;                                            \
+       })

and use the new local_trylock_t where necessary.
Like:
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7b3503d12aaf..8fe141e93a0b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1722,7 +1722,7 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
 }

 struct memcg_stock_pcp {
-       local_lock_t stock_lock;
+       local_trylock_t stock_lock;

All lines where stock_lock is used will stay as-is,
so no code churn.
The _Generic() above isn't pretty, but it's not horrible either.
I guess that's a decent trade-off.
Better ideas?
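
A self-contained userspace sketch of the _Generic() dispatch idea (type and
field names are illustrative only): the same macro accepts both lock types
and only touches ->active for the trylock-capable one.

#include <stdio.h>

typedef struct { int dummy; } local_lock_t;
typedef struct { int active; } local_trylock_t;

/* Compile-time dispatch on the pointer type of 'lock'. */
#define lock_mark_active(lock)						\
	do {								\
		_Generic((lock),					\
			 local_trylock_t *:				\
				(((local_trylock_t *)(lock))->active = 1), \
			 default: (void)0);				\
	} while (0)

int main(void)
{
	local_lock_t plain = { 0 };
	local_trylock_t trylock = { 0 };

	lock_mark_active(&plain);	/* selects 'default': a no-op */
	lock_mark_active(&trylock);	/* sets ->active to 1 */

	printf("plain.dummy=%d trylock.active=%d\n", plain.dummy, trylock.active);
	return 0;
}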

Patch

diff --git a/include/linux/local_lock.h b/include/linux/local_lock.h
index 091dc0b6bdfb..84ee560c4f51 100644
--- a/include/linux/local_lock.h
+++ b/include/linux/local_lock.h
@@ -30,6 +30,15 @@ 
 #define local_lock_irqsave(lock, flags)				\
 	__local_lock_irqsave(lock, flags)
 
+/**
+ * local_trylock_irqsave - Try to acquire a per CPU local lock, save and disable
+ *			   interrupts. Always fails in RT when in_hardirq or NMI.
+ * @lock:	The lock variable
+ * @flags:	Storage for interrupt flags
+ */
+#define local_trylock_irqsave(lock, flags)			\
+	__local_trylock_irqsave(lock, flags)
+
 /**
  * local_unlock - Release a per CPU local lock
  * @lock:	The lock variable
diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h
index 8dd71fbbb6d2..93672127c73d 100644
--- a/include/linux/local_lock_internal.h
+++ b/include/linux/local_lock_internal.h
@@ -9,6 +9,7 @@ 
 #ifndef CONFIG_PREEMPT_RT
 
 typedef struct {
+	int active;
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	struct lockdep_map	dep_map;
 	struct task_struct	*owner;
@@ -22,7 +23,7 @@  typedef struct {
 		.wait_type_inner = LD_WAIT_CONFIG,	\
 		.lock_type = LD_LOCK_PERCPU,		\
 	},						\
-	.owner = NULL,
+	.owner = NULL, .active = 0
 
 static inline void local_lock_acquire(local_lock_t *l)
 {
@@ -31,6 +32,13 @@  static inline void local_lock_acquire(local_lock_t *l)
 	l->owner = current;
 }
 
+static inline void local_trylock_acquire(local_lock_t *l)
+{
+	lock_map_acquire_try(&l->dep_map);
+	DEBUG_LOCKS_WARN_ON(l->owner);
+	l->owner = current;
+}
+
 static inline void local_lock_release(local_lock_t *l)
 {
 	DEBUG_LOCKS_WARN_ON(l->owner != current);
@@ -45,6 +53,7 @@  static inline void local_lock_debug_init(local_lock_t *l)
 #else /* CONFIG_DEBUG_LOCK_ALLOC */
 # define LOCAL_LOCK_DEBUG_INIT(lockname)
 static inline void local_lock_acquire(local_lock_t *l) { }
+static inline void local_trylock_acquire(local_lock_t *l) { }
 static inline void local_lock_release(local_lock_t *l) { }
 static inline void local_lock_debug_init(local_lock_t *l) { }
 #endif /* !CONFIG_DEBUG_LOCK_ALLOC */
@@ -60,6 +69,7 @@  do {								\
 			      0, LD_WAIT_CONFIG, LD_WAIT_INV,	\
 			      LD_LOCK_PERCPU);			\
 	local_lock_debug_init(lock);				\
+	(lock)->active = 0;					\
 } while (0)
 
 #define __spinlock_nested_bh_init(lock)				\
@@ -75,37 +85,73 @@  do {								\
 
 #define __local_lock(lock)					\
 	do {							\
+		local_lock_t *l;				\
 		preempt_disable();				\
-		local_lock_acquire(this_cpu_ptr(lock));		\
+		l = this_cpu_ptr(lock);				\
+		lockdep_assert(l->active == 0);			\
+		WRITE_ONCE(l->active, 1);			\
+		local_lock_acquire(l);				\
 	} while (0)
 
 #define __local_lock_irq(lock)					\
 	do {							\
+		local_lock_t *l;				\
 		local_irq_disable();				\
-		local_lock_acquire(this_cpu_ptr(lock));		\
+		l = this_cpu_ptr(lock);				\
+		lockdep_assert(l->active == 0);			\
+		WRITE_ONCE(l->active, 1);			\
+		local_lock_acquire(l);				\
 	} while (0)
 
 #define __local_lock_irqsave(lock, flags)			\
 	do {							\
+		local_lock_t *l;				\
 		local_irq_save(flags);				\
-		local_lock_acquire(this_cpu_ptr(lock));		\
+		l = this_cpu_ptr(lock);				\
+		lockdep_assert(l->active == 0);			\
+		WRITE_ONCE(l->active, 1);			\
+		local_lock_acquire(l);				\
 	} while (0)
 
+#define __local_trylock_irqsave(lock, flags)			\
+	({							\
+		local_lock_t *l;				\
+		local_irq_save(flags);				\
+		l = this_cpu_ptr(lock);				\
+		if (READ_ONCE(l->active) == 1) {		\
+			local_irq_restore(flags);		\
+			l = NULL;				\
+		} else {					\
+			WRITE_ONCE(l->active, 1);		\
+			local_trylock_acquire(l);		\
+		}						\
+		!!l;						\
+	})
+
 #define __local_unlock(lock)					\
 	do {							\
-		local_lock_release(this_cpu_ptr(lock));		\
+		local_lock_t *l = this_cpu_ptr(lock);		\
+		lockdep_assert(l->active == 1);			\
+		WRITE_ONCE(l->active, 0);			\
+		local_lock_release(l);				\
 		preempt_enable();				\
 	} while (0)
 
 #define __local_unlock_irq(lock)				\
 	do {							\
-		local_lock_release(this_cpu_ptr(lock));		\
+		local_lock_t *l = this_cpu_ptr(lock);		\
+		lockdep_assert(l->active == 1);			\
+		WRITE_ONCE(l->active, 0);			\
+		local_lock_release(l);				\
 		local_irq_enable();				\
 	} while (0)
 
 #define __local_unlock_irqrestore(lock, flags)			\
 	do {							\
-		local_lock_release(this_cpu_ptr(lock));		\
+		local_lock_t *l = this_cpu_ptr(lock);		\
+		lockdep_assert(l->active == 1);			\
+		WRITE_ONCE(l->active, 0);			\
+		local_lock_release(l);				\
 		local_irq_restore(flags);			\
 	} while (0)
 
@@ -148,6 +194,22 @@  typedef spinlock_t local_lock_t;
 		__local_lock(lock);				\
 	} while (0)
 
+#define __local_trylock_irqsave(lock, flags)			\
+	({							\
+		__label__ out;					\
+		int ret = 0;					\
+		typecheck(unsigned long, flags);		\
+		flags = 0;					\
+		if (in_nmi() || in_hardirq())			\
+			goto out;				\
+		migrate_disable();				\
+		ret = spin_trylock(this_cpu_ptr((lock)));	\
+		if (!ret)					\
+			migrate_enable();			\
+	out:							\
+		ret;						\
+	})
+
 #define __local_unlock(__lock)					\
 	do {							\
 		spin_unlock(this_cpu_ptr((__lock)));		\