diff mbox series

[bpf-next,v2,1/6] mm, bpf: Introduce __GFP_TRYLOCK for opportunistic page allocation

Message ID 20241210023936.46871-2-alexei.starovoitov@gmail.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series bpf, mm: Introduce __GFP_TRYLOCK | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-17 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-17 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-44 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-45 success Logs for x86_64-llvm-18 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-46 success Logs for x86_64-llvm-18 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-43 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 568 this patch: 568
netdev/build_tools success Errors and warnings before: 0 (+0) this patch: 0 (+0)
netdev/cc_maintainers warning 16 maintainers not CCed: alexander.shishkin@linux.intel.com jolsa@kernel.org mingo@redhat.com namhyung@kernel.org acme@kernel.org clrkwllms@kernel.org mhiramat@kernel.org mathieu.desnoyers@efficios.com mark.rutland@arm.com adrian.hunter@intel.com david@redhat.com linux-trace-kernel@vger.kernel.org irogers@google.com linux-perf-users@vger.kernel.org linux-rt-devel@lists.linux.dev kan.liang@linux.intel.com
netdev/build_clang success Errors and warnings before: 980 this patch: 980
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 15944 this patch: 15945
netdev/checkpatch fail CHECK: No space is necessary after a cast CHECK: Please use a blank line after function/struct/union/enum declarations ERROR: space required before the open brace '{' WARNING: line length of 81 exceeds 80 columns WARNING: line length of 88 exceeds 80 columns WARNING: line length of 96 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 5 this patch: 5
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-4 fail Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-9 fail Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / test
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-14 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-15 fail Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-17 success Logs for x86_64-gcc / test
bpf/vmtest-bpf-next-VM_Test-18 success Logs for x86_64-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-20 fail Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-llvm-17 / test
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-llvm-17 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-21 fail Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-llvm-17 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-26 fail Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-25 fail Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-llvm-18 / test
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-18 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-18 / veristat-kernel

Commit Message

Alexei Starovoitov Dec. 10, 2024, 2:39 a.m. UTC
From: Alexei Starovoitov <ast@kernel.org>

Tracing BPF programs execute from tracepoints and kprobes where running
context is unknown, but they need to request additional memory.
The prior workarounds were using pre-allocated memory and BPF specific
freelists to satisfy such allocation requests. Instead, introduce
__GFP_TRYLOCK flag that makes page allocator accessible from any context.
It relies on percpu free list of pages that rmqueue_pcplist() should be
able to pop the page from. If it fails (due to IRQ re-entrancy or list
being empty) then try_alloc_pages() attempts to spin_trylock zone->lock
and refill percpu freelist as normal.
BPF program may execute with IRQs disabled and zone->lock is sleeping in RT,
so trylock is the only option.
In theory we can introduce percpu reentrance counter and increment it
every time spin_lock_irqsave(&zone->lock, flags) is used,
but we cannot rely on it. Even if this cpu is not in page_alloc path
the spin_lock_irqsave() is not safe, since BPF prog might be called
from tracepoint where preemption is disabled. So trylock only.

Note, free_page and memcg are not taught about __GFP_TRYLOCK yet.
The support comes in the next patches.

This is a first step towards supporting BPF requirements in SLUB
and getting rid of bpf_mem_alloc.
That goal was discussed at LSFMM: https://lwn.net/Articles/974138/

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/gfp.h            | 25 +++++++++++++++++++++++++
 include/linux/gfp_types.h      |  3 +++
 include/trace/events/mmflags.h |  1 +
 mm/fail_page_alloc.c           |  6 ++++++
 mm/internal.h                  |  1 +
 mm/page_alloc.c                | 17 ++++++++++++++---
 tools/perf/builtin-kmem.c      |  1 +
 7 files changed, 51 insertions(+), 3 deletions(-)

Comments

Matthew Wilcox Dec. 10, 2024, 5:31 a.m. UTC | #1
On Mon, Dec 09, 2024 at 06:39:31PM -0800, Alexei Starovoitov wrote:
> +	if (preemptible() && !rcu_preempt_depth())
> +		return alloc_pages_node_noprof(nid,
> +					       GFP_NOWAIT | __GFP_ZERO,
> +					       order);
> +	return alloc_pages_node_noprof(nid,
> +				       __GFP_TRYLOCK | __GFP_NOWARN | __GFP_ZERO,
> +				       order);

[...]

> @@ -4009,7 +4018,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
>  	 * set both ALLOC_NON_BLOCK and ALLOC_MIN_RESERVE(__GFP_HIGH).
>  	 */
>  	alloc_flags |= (__force int)
> -		(gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM));
> +		(gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM | __GFP_TRYLOCK));

It's not quite clear to me that we need __GFP_TRYLOCK to implement this.
I was originally wondering if this wasn't a memalloc_nolock_save() /
memalloc_nolock_restore() situation (akin to memalloc_nofs_save/restore),
but I wonder if we can simply do:

	if (!preemptible() || rcu_preempt_depth())
		alloc_flags |= ALLOC_TRYLOCK;
Sebastian Andrzej Siewior Dec. 10, 2024, 9:01 a.m. UTC | #2
On 2024-12-09 18:39:31 [-0800], Alexei Starovoitov wrote:
> From: Alexei Starovoitov <ast@kernel.org>
> 
> Tracing BPF programs execute from tracepoints and kprobes where running
> context is unknown, but they need to request additional memory.
> The prior workarounds were using pre-allocated memory and BPF specific
> freelists to satisfy such allocation requests. Instead, introduce
> __GFP_TRYLOCK flag that makes page allocator accessible from any context.
> It relies on percpu free list of pages that rmqueue_pcplist() should be
> able to pop the page from. If it fails (due to IRQ re-entrancy or list
> being empty) then try_alloc_pages() attempts to spin_trylock zone->lock
> and refill percpu freelist as normal.
> BPF program may execute with IRQs disabled and zone->lock is sleeping in RT,
> so trylock is the only option.

The __GFP_TRYLOCK flag looks reasonable given the challenges for BPF
where it is not known how much memory will be needed and what the
calling context is. I hope it does not spread across the kernel where
people do ATOMIC in preempt/ IRQ-off on PREEMPT_RT and then once they
learn that this does not work, add this flag to the mix to make it work
without spending some time on reworking it.

Side note: I am in the process of hopefully getting rid of the
preempt_disable() from trace points. What remains then is attaching BPF
programs to any code/ function with a raw_spinlock_t and I am not yet
sure what to do here.

Sebastian
Michal Hocko Dec. 10, 2024, 9:05 a.m. UTC | #3
On Tue 10-12-24 05:31:30, Matthew Wilcox wrote:
> On Mon, Dec 09, 2024 at 06:39:31PM -0800, Alexei Starovoitov wrote:
> > +	if (preemptible() && !rcu_preempt_depth())
> > +		return alloc_pages_node_noprof(nid,
> > +					       GFP_NOWAIT | __GFP_ZERO,
> > +					       order);
> > +	return alloc_pages_node_noprof(nid,
> > +				       __GFP_TRYLOCK | __GFP_NOWARN | __GFP_ZERO,
> > +				       order);
> 
> [...]
> 
> > @@ -4009,7 +4018,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
> >  	 * set both ALLOC_NON_BLOCK and ALLOC_MIN_RESERVE(__GFP_HIGH).
> >  	 */
> >  	alloc_flags |= (__force int)
> > -		(gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM));
> > +		(gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM | __GFP_TRYLOCK));
> 
> It's not quite clear to me that we need __GFP_TRYLOCK to implement this.
> I was originally wondering if this wasn't a memalloc_nolock_save() /
> memalloc_nolock_restore() situation (akin to memalloc_nofs_save/restore),
> but I wonder if we can simply do:
> 
> 	if (!preemptible() || rcu_preempt_depth())
> 		alloc_flags |= ALLOC_TRYLOCK;

preemptible is unusable without CONFIG_PREEMPT_COUNT but I do agree that
__GFP_TRYLOCK is not really a preferred way to go forward. For 3
reasons. 

First I do not really like the name as it tells what it does rather than
how it should be used. This is a general pattern of many gfp flags
unfotrunatelly and historically it has turned out error prone. If a gfp
flag is really needed then something like __GFP_ANY_CONTEXT should be
used.  If the current implementation requires to use try_lock for
zone->lock or other changes is not an implementation detail but the user
should have a clear understanding that allocation is allowed from any
context (NMI, IRQ or otherwise atomic contexts).

Is there any reason why GFP_ATOMIC cannot be extended to support new
contexts? This allocation mode is already documented to be usable from
atomic contexts except from NMI and raw_spinlocks. But is it feasible to
extend the current implementation to use only trylock on zone->lock if
called from in_nmi() to reduce unexpected failures on contention for
existing users?

Third, do we even want such a strong guarantee in the generic page
allocator path and make it even more complex and harder to maintain? We
already have a precence in form of __alloc_pages_bulk which is a special
case allocator mode living outside of the page allocator path. It seems
that it covers most of your requirements except the fallback to the
regular allocation path AFAICS. Is this something you could piggy back
on?
Vlastimil Babka Dec. 10, 2024, 6:39 p.m. UTC | #4
On 12/10/24 03:39, Alexei Starovoitov wrote:
> From: Alexei Starovoitov <ast@kernel.org>
> 
> Tracing BPF programs execute from tracepoints and kprobes where running
> context is unknown, but they need to request additional memory.
> The prior workarounds were using pre-allocated memory and BPF specific
> freelists to satisfy such allocation requests. Instead, introduce
> __GFP_TRYLOCK flag that makes page allocator accessible from any context.
> It relies on percpu free list of pages that rmqueue_pcplist() should be
> able to pop the page from. If it fails (due to IRQ re-entrancy or list
> being empty) then try_alloc_pages() attempts to spin_trylock zone->lock
> and refill percpu freelist as normal.
> BPF program may execute with IRQs disabled and zone->lock is sleeping in RT,
> so trylock is the only option.
> In theory we can introduce percpu reentrance counter and increment it
> every time spin_lock_irqsave(&zone->lock, flags) is used,
> but we cannot rely on it. Even if this cpu is not in page_alloc path
> the spin_lock_irqsave() is not safe, since BPF prog might be called
> from tracepoint where preemption is disabled. So trylock only.
> 
> Note, free_page and memcg are not taught about __GFP_TRYLOCK yet.
> The support comes in the next patches.
> 
> This is a first step towards supporting BPF requirements in SLUB
> and getting rid of bpf_mem_alloc.
> That goal was discussed at LSFMM: https://lwn.net/Articles/974138/
> 
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>

I think there might be more non-try spin_locks reachable from page allocations:

- in reserve_highatomic_pageblock() which I think is reachable unless this
is limited to order-0
- try_to_accept_memory_one()
- as part of post_alloc_hook() in set_page_owner(), stack depot might do
raw_spin_lock_irqsave(), is that one ok?

hope I didn't miss anything else especially in those other debugging hooks
(KASAN etc)
Shakeel Butt Dec. 10, 2024, 8:25 p.m. UTC | #5
On Tue, Dec 10, 2024 at 10:05:22AM +0100, Michal Hocko wrote:
> On Tue 10-12-24 05:31:30, Matthew Wilcox wrote:
> > On Mon, Dec 09, 2024 at 06:39:31PM -0800, Alexei Starovoitov wrote:
> > > +	if (preemptible() && !rcu_preempt_depth())
> > > +		return alloc_pages_node_noprof(nid,
> > > +					       GFP_NOWAIT | __GFP_ZERO,
> > > +					       order);
> > > +	return alloc_pages_node_noprof(nid,
> > > +				       __GFP_TRYLOCK | __GFP_NOWARN | __GFP_ZERO,
> > > +				       order);
> > 
> > [...]
> > 
> > > @@ -4009,7 +4018,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
> > >  	 * set both ALLOC_NON_BLOCK and ALLOC_MIN_RESERVE(__GFP_HIGH).
> > >  	 */
> > >  	alloc_flags |= (__force int)
> > > -		(gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM));
> > > +		(gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM | __GFP_TRYLOCK));
> > 
> > It's not quite clear to me that we need __GFP_TRYLOCK to implement this.
> > I was originally wondering if this wasn't a memalloc_nolock_save() /
> > memalloc_nolock_restore() situation (akin to memalloc_nofs_save/restore),
> > but I wonder if we can simply do:
> > 
> > 	if (!preemptible() || rcu_preempt_depth())
> > 		alloc_flags |= ALLOC_TRYLOCK;
> 
> preemptible is unusable without CONFIG_PREEMPT_COUNT but I do agree that
> __GFP_TRYLOCK is not really a preferred way to go forward. For 3
> reasons. 
> 
> First I do not really like the name as it tells what it does rather than
> how it should be used. This is a general pattern of many gfp flags
> unfotrunatelly and historically it has turned out error prone. If a gfp
> flag is really needed then something like __GFP_ANY_CONTEXT should be
> used.  If the current implementation requires to use try_lock for
> zone->lock or other changes is not an implementation detail but the user
> should have a clear understanding that allocation is allowed from any
> context (NMI, IRQ or otherwise atomic contexts).
> 
> Is there any reason why GFP_ATOMIC cannot be extended to support new

GFP_ATOMIC has access to memory reserves. I see GFP_NOWAIT a better fit
and if someone wants access to the reserve they can use __GFP_HIGH with
GFP_NOWAIT.

> contexts? This allocation mode is already documented to be usable from
> atomic contexts except from NMI and raw_spinlocks. But is it feasible to
> extend the current implementation to use only trylock on zone->lock if
> called from in_nmi() to reduce unexpected failures on contention for
> existing users?

I think this is the question we (MM folks) need to answer, not the
users.

> 
> Third, do we even want such a strong guarantee in the generic page
> allocator path and make it even more complex and harder to maintain?

I think the alternative would be higher maintenance cost i.e. everyone
creating their own layer/solution/caching over page allocator which I
think we agree we want to avoid (Vlastimil's LSFMM talk).

> We
> already have a precence in form of __alloc_pages_bulk which is a special
> case allocator mode living outside of the page allocator path. It seems
> that it covers most of your requirements except the fallback to the
> regular allocation path AFAICS. Is this something you could piggy back
> on?

BPF already have bpf_mem_alloc() and IIUC this series is an effort to
unify and have a single solution.
Alexei Starovoitov Dec. 10, 2024, 9:42 p.m. UTC | #6
On Mon, Dec 9, 2024 at 9:31 PM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Mon, Dec 09, 2024 at 06:39:31PM -0800, Alexei Starovoitov wrote:
> > +     if (preemptible() && !rcu_preempt_depth())
> > +             return alloc_pages_node_noprof(nid,
> > +                                            GFP_NOWAIT | __GFP_ZERO,
> > +                                            order);
> > +     return alloc_pages_node_noprof(nid,
> > +                                    __GFP_TRYLOCK | __GFP_NOWARN | __GFP_ZERO,
> > +                                    order);
>
> [...]
>
> > @@ -4009,7 +4018,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
> >        * set both ALLOC_NON_BLOCK and ALLOC_MIN_RESERVE(__GFP_HIGH).
> >        */
> >       alloc_flags |= (__force int)
> > -             (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM));
> > +             (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM | __GFP_TRYLOCK));
>
> It's not quite clear to me that we need __GFP_TRYLOCK to implement this.
> I was originally wondering if this wasn't a memalloc_nolock_save() /
> memalloc_nolock_restore() situation (akin to memalloc_nofs_save/restore),

Interesting idea. It could be useful to pass extra flags into free_page
path, since it doesn't have flags today and I'm adding free_pages_nolock()
in patch 2 just to pass fpi_t fpi_flags around.

memalloc_nofs_save()-like makes the most sense when there are
multiple allocations and code path attempts to be generic.
For bpf use case it's probably overkill.
I guess we might have both __GFP_TRYLOCK and
memalloc_nolock_save() that clear many flags.
Note it needs to clear __GFP_KSWAPD_RECLAIM which is not safe
when raw spin lock is held.

> but I wonder if we can simply do:
>
>         if (!preemptible() || rcu_preempt_depth())
>                 alloc_flags |= ALLOC_TRYLOCK;

I don't think we can do that.
It will penalize existing GFP_ATOMIC/NOWAIT users.
kmalloc from RCU CS with GFP_NOWAIT is fine today.
Alexei Starovoitov Dec. 10, 2024, 9:53 p.m. UTC | #7
On Tue, Dec 10, 2024 at 1:01 AM Sebastian Andrzej Siewior
<bigeasy@linutronix.de> wrote:
>
> On 2024-12-09 18:39:31 [-0800], Alexei Starovoitov wrote:
> > From: Alexei Starovoitov <ast@kernel.org>
> >
> > Tracing BPF programs execute from tracepoints and kprobes where running
> > context is unknown, but they need to request additional memory.
> > The prior workarounds were using pre-allocated memory and BPF specific
> > freelists to satisfy such allocation requests. Instead, introduce
> > __GFP_TRYLOCK flag that makes page allocator accessible from any context.
> > It relies on percpu free list of pages that rmqueue_pcplist() should be
> > able to pop the page from. If it fails (due to IRQ re-entrancy or list
> > being empty) then try_alloc_pages() attempts to spin_trylock zone->lock
> > and refill percpu freelist as normal.
> > BPF program may execute with IRQs disabled and zone->lock is sleeping in RT,
> > so trylock is the only option.
>
> The __GFP_TRYLOCK flag looks reasonable given the challenges for BPF
> where it is not known how much memory will be needed and what the
> calling context is.

Exactly.

> I hope it does not spread across the kernel where
> people do ATOMIC in preempt/ IRQ-off on PREEMPT_RT and then once they
> learn that this does not work, add this flag to the mix to make it work
> without spending some time on reworking it.

We can call it __GFP_BPF to discourage any other usage,
but that seems like an odd "solution" to code review problem.
If people start using __GFP_TRYLOCK just to shut up lockdep splats
they will soon realize that it's an _oportunistic_ allocator.
bpf doesn't need more than a page and single page will likely
will be found in percpu free page pool, so this opportunistic approach
will work most of the time for bpf, but unlikely for others
that need order >= PAGE_ALLOC_COSTLY_ORDER (3).

> Side note: I am in the process of hopefully getting rid of the
> preempt_disable() from trace points. What remains then is attaching BPF
> programs to any code/ function with a raw_spinlock_t and I am not yet
> sure what to do here.

That won't help the bpf core.
There are tracepoints that are called after preemption is disabled.
The worst is trace_contention_begin() and people have good reasons
to attach bpf prog there to collect contention stats.
In such case bpf prog has no idea what kind of spin_lock is contending.
It might have disabled preemption and/or irqs before getting to
that tracepoint. So removal of preempt_disable from tracepoint
handling logic doesn't help bpf core. It's a good thing to do anyway.
Alexei Starovoitov Dec. 10, 2024, 10:06 p.m. UTC | #8
On Tue, Dec 10, 2024 at 1:05 AM Michal Hocko <mhocko@suse.com> wrote:
>
> On Tue 10-12-24 05:31:30, Matthew Wilcox wrote:
> > On Mon, Dec 09, 2024 at 06:39:31PM -0800, Alexei Starovoitov wrote:
> > > +   if (preemptible() && !rcu_preempt_depth())
> > > +           return alloc_pages_node_noprof(nid,
> > > +                                          GFP_NOWAIT | __GFP_ZERO,
> > > +                                          order);
> > > +   return alloc_pages_node_noprof(nid,
> > > +                                  __GFP_TRYLOCK | __GFP_NOWARN | __GFP_ZERO,
> > > +                                  order);
> >
> > [...]
> >
> > > @@ -4009,7 +4018,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
> > >      * set both ALLOC_NON_BLOCK and ALLOC_MIN_RESERVE(__GFP_HIGH).
> > >      */
> > >     alloc_flags |= (__force int)
> > > -           (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM));
> > > +           (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM | __GFP_TRYLOCK));
> >
> > It's not quite clear to me that we need __GFP_TRYLOCK to implement this.
> > I was originally wondering if this wasn't a memalloc_nolock_save() /
> > memalloc_nolock_restore() situation (akin to memalloc_nofs_save/restore),
> > but I wonder if we can simply do:
> >
> >       if (!preemptible() || rcu_preempt_depth())
> >               alloc_flags |= ALLOC_TRYLOCK;
>
> preemptible is unusable without CONFIG_PREEMPT_COUNT but I do agree that
> __GFP_TRYLOCK is not really a preferred way to go forward. For 3
> reasons.
>
> First I do not really like the name as it tells what it does rather than
> how it should be used. This is a general pattern of many gfp flags
> unfotrunatelly and historically it has turned out error prone. If a gfp
> flag is really needed then something like __GFP_ANY_CONTEXT should be
> used.  If the current implementation requires to use try_lock for
> zone->lock or other changes is not an implementation detail but the user
> should have a clear understanding that allocation is allowed from any
> context (NMI, IRQ or otherwise atomic contexts).

__GFP_ANY_CONTEXT would make sense if we wanted to make it available
for all kernel users. In this case I agree with Sebastian.
This is bpf specific feature, since it doesn't know the context.
All other kernel users should pick GFP_KERNEL or ATOMIC or NOWAIT.
Exposing GFP_ANY_CONTEXT to all may lead to sloppy code in drivers
and elsewhere.

> Is there any reason why GFP_ATOMIC cannot be extended to support new
> contexts? This allocation mode is already documented to be usable from
> atomic contexts except from NMI and raw_spinlocks. But is it feasible to
> extend the current implementation to use only trylock on zone->lock if
> called from in_nmi() to reduce unexpected failures on contention for
> existing users?

No. in_nmi() doesn't help. It's the lack of reentrance of slab and page
allocator that is an issue.
The page alloctor might grab zone lock. In !RT it will disable irqs.
In RT will stay sleepable. Both paths will be calling other
kernel code including tracepoints, potential kprobes, etc
and bpf prog may be attached somewhere.
If it calls alloc_page() it may deadlock on zone->lock.
pcpu lock is thankfully trylock already.
So !irqs_disabled() part of preemptible() guarantees that
zone->lock won't deadlock in !RT.
And rcu_preempt_depth() case just steers bpf into try lock only path in RT.
Since there is no way to tell whether it's safe to call
sleepable spin_lock(&zone->lock).

>
> Third, do we even want such a strong guarantee in the generic page
> allocator path and make it even more complex and harder to maintain?

I'm happy to add myself as R: or M: for trylock bits,
since that will be a fundamental building block for bpf.

> We
> already have a precence in form of __alloc_pages_bulk which is a special
> case allocator mode living outside of the page allocator path. It seems
> that it covers most of your requirements except the fallback to the
> regular allocation path AFAICS. Is this something you could piggy back
> on?

__alloc_pages_bulk() has all the same issues. It takes locks.
Also it doesn't support GFP_ACCOUNT which is a show stopper.
All bpf allocations are going through memcg.
Alexei Starovoitov Dec. 10, 2024, 10:42 p.m. UTC | #9
On Tue, Dec 10, 2024 at 10:39 AM Vlastimil Babka <vbabka@suse.cz> wrote:
>
> On 12/10/24 03:39, Alexei Starovoitov wrote:
> > From: Alexei Starovoitov <ast@kernel.org>
> >
> > Tracing BPF programs execute from tracepoints and kprobes where running
> > context is unknown, but they need to request additional memory.
> > The prior workarounds were using pre-allocated memory and BPF specific
> > freelists to satisfy such allocation requests. Instead, introduce
> > __GFP_TRYLOCK flag that makes page allocator accessible from any context.
> > It relies on percpu free list of pages that rmqueue_pcplist() should be
> > able to pop the page from. If it fails (due to IRQ re-entrancy or list
> > being empty) then try_alloc_pages() attempts to spin_trylock zone->lock
> > and refill percpu freelist as normal.
> > BPF program may execute with IRQs disabled and zone->lock is sleeping in RT,
> > so trylock is the only option.
> > In theory we can introduce percpu reentrance counter and increment it
> > every time spin_lock_irqsave(&zone->lock, flags) is used,
> > but we cannot rely on it. Even if this cpu is not in page_alloc path
> > the spin_lock_irqsave() is not safe, since BPF prog might be called
> > from tracepoint where preemption is disabled. So trylock only.
> >
> > Note, free_page and memcg are not taught about __GFP_TRYLOCK yet.
> > The support comes in the next patches.
> >
> > This is a first step towards supporting BPF requirements in SLUB
> > and getting rid of bpf_mem_alloc.
> > That goal was discussed at LSFMM: https://lwn.net/Articles/974138/
> >
> > Signed-off-by: Alexei Starovoitov <ast@kernel.org>
>
> I think there might be more non-try spin_locks reachable from page allocations:
>
> - in reserve_highatomic_pageblock() which I think is reachable unless this
> is limited to order-0

Good point. I missed this bit:
   if (order > 0)
     alloc_flags |= ALLOC_HIGHATOMIC;

In bpf use case it will be called with order == 0 only,
but it's better to fool proof it.
I will switch to:
__GFP_NOMEMALLOC | __GFP_TRYLOCK | __GFP_NOWARN | __GFP_ZERO | __GFP_ACCOUNT


> - try_to_accept_memory_one()

when I studied the code it looked to me that there should be no
unaccepted_pages.
I think you're saying that there could be unaccepted memory
from the previous allocation and trylock attempt just got unlucky
to reach that path?
What do you think of the following:
-               cond_accept_memory(zone, order);
+               cond_accept_memory(zone, order, alloc_flags);

                /*
                 * Detect whether the number of free pages is below high
@@ -7024,7 +7024,8 @@ static inline bool has_unaccepted_memory(void)
        return static_branch_unlikely(&zones_with_unaccepted_pages);
 }

-static bool cond_accept_memory(struct zone *zone, unsigned int order)
+static bool cond_accept_memory(struct zone *zone, unsigned int order,
+                              unsigned int alloc_flags)
 {
        long to_accept;
        bool ret = false;
@@ -7032,6 +7033,9 @@ static bool cond_accept_memory(struct zone
*zone, unsigned int order)
        if (!has_unaccepted_memory())
                return false;

+       if (unlikely(alloc_flags & ALLOC_TRYLOCK))
+               return false;
+

or is there a better approach?

Reading from current->flags the way Matthew proposed?

> - as part of post_alloc_hook() in set_page_owner(), stack depot might do
> raw_spin_lock_irqsave(), is that one ok?

Well, I looked at the stack depot and was tempted to add trylock
handling there, but it looked to be a bit dodgy in general and
I figured it should be done separately from this set.
Like:
        if (unlikely(can_alloc && !READ_ONCE(new_pool))) {
                page = alloc_pages(gfp_nested_mask(alloc_flags),
followed by:
        if (in_nmi()) {
                /* We can never allocate in NMI context. */
                WARN_ON_ONCE(can_alloc);

that warn is too late. If we were in_nmi and called alloc_pages
the kernel might be misbehaving already.

>
> hope I didn't miss anything else especially in those other debugging hooks
> (KASAN etc)

I looked through them and could be missing something, of course.
kasan usage in alloc_page path seems fine.
But for slab I found kasan_quarantine logic which needs a special treatment.
Other slab debugging bits pose issues too.
The rough idea is to do kmalloc_nolock() / kfree_nolock() that
don't call into any pre/post hooks (including slab_free_hook,
slab_pre_alloc_hook).
kmalloc_nolock() will pretty much call __slab_alloc_node() directly
and do basic kasan poison stuff that needs no locks.

I will be going over all the paths again, of course.

Thanks for the reviews so far!
Vlastimil Babka Dec. 11, 2024, 8:38 a.m. UTC | #10
On 12/10/24 22:53, Alexei Starovoitov wrote:
> On Tue, Dec 10, 2024 at 1:01 AM Sebastian Andrzej Siewior
> <bigeasy@linutronix.de> wrote:
>>
>> On 2024-12-09 18:39:31 [-0800], Alexei Starovoitov wrote:
>> > From: Alexei Starovoitov <ast@kernel.org>
>> >
>> > Tracing BPF programs execute from tracepoints and kprobes where running
>> > context is unknown, but they need to request additional memory.
>> > The prior workarounds were using pre-allocated memory and BPF specific
>> > freelists to satisfy such allocation requests. Instead, introduce
>> > __GFP_TRYLOCK flag that makes page allocator accessible from any context.
>> > It relies on percpu free list of pages that rmqueue_pcplist() should be
>> > able to pop the page from. If it fails (due to IRQ re-entrancy or list
>> > being empty) then try_alloc_pages() attempts to spin_trylock zone->lock
>> > and refill percpu freelist as normal.
>> > BPF program may execute with IRQs disabled and zone->lock is sleeping in RT,
>> > so trylock is the only option.
>>
>> The __GFP_TRYLOCK flag looks reasonable given the challenges for BPF
>> where it is not known how much memory will be needed and what the
>> calling context is.
> 
> Exactly.
> 
>> I hope it does not spread across the kernel where
>> people do ATOMIC in preempt/ IRQ-off on PREEMPT_RT and then once they
>> learn that this does not work, add this flag to the mix to make it work
>> without spending some time on reworking it.
> 
> We can call it __GFP_BPF to discourage any other usage,
> but that seems like an odd "solution" to code review problem.

Could we perhaps not expose the flag to public headers at all, and keep it
only as an internal detail of try_alloc_pages_noprof()?
Vlastimil Babka Dec. 11, 2024, 8:48 a.m. UTC | #11
On 12/10/24 23:42, Alexei Starovoitov wrote:
> On Tue, Dec 10, 2024 at 10:39 AM Vlastimil Babka <vbabka@suse.cz> wrote:
>>
>> On 12/10/24 03:39, Alexei Starovoitov wrote:
>> > From: Alexei Starovoitov <ast@kernel.org>
>> >
>> > Tracing BPF programs execute from tracepoints and kprobes where running
>> > context is unknown, but they need to request additional memory.
>> > The prior workarounds were using pre-allocated memory and BPF specific
>> > freelists to satisfy such allocation requests. Instead, introduce
>> > __GFP_TRYLOCK flag that makes page allocator accessible from any context.
>> > It relies on percpu free list of pages that rmqueue_pcplist() should be
>> > able to pop the page from. If it fails (due to IRQ re-entrancy or list
>> > being empty) then try_alloc_pages() attempts to spin_trylock zone->lock
>> > and refill percpu freelist as normal.
>> > BPF program may execute with IRQs disabled and zone->lock is sleeping in RT,
>> > so trylock is the only option.
>> > In theory we can introduce percpu reentrance counter and increment it
>> > every time spin_lock_irqsave(&zone->lock, flags) is used,
>> > but we cannot rely on it. Even if this cpu is not in page_alloc path
>> > the spin_lock_irqsave() is not safe, since BPF prog might be called
>> > from tracepoint where preemption is disabled. So trylock only.
>> >
>> > Note, free_page and memcg are not taught about __GFP_TRYLOCK yet.
>> > The support comes in the next patches.
>> >
>> > This is a first step towards supporting BPF requirements in SLUB
>> > and getting rid of bpf_mem_alloc.
>> > That goal was discussed at LSFMM: https://lwn.net/Articles/974138/
>> >
>> > Signed-off-by: Alexei Starovoitov <ast@kernel.org>
>>
>> I think there might be more non-try spin_locks reachable from page allocations:
>>
>> - in reserve_highatomic_pageblock() which I think is reachable unless this
>> is limited to order-0
> 
> Good point. I missed this bit:
>    if (order > 0)
>      alloc_flags |= ALLOC_HIGHATOMIC;
> 
> In bpf use case it will be called with order == 0 only,
> but it's better to fool proof it.
> I will switch to:
> __GFP_NOMEMALLOC | __GFP_TRYLOCK | __GFP_NOWARN | __GFP_ZERO | __GFP_ACCOUNT

Should work, yeah.

>> - try_to_accept_memory_one()
> 
> when I studied the code it looked to me that there should be no
> unaccepted_pages.
> I think you're saying that there could be unaccepted memory
> from the previous allocation and trylock attempt just got unlucky
> to reach that path?

The unaccepted memory can exist after boot in confidential computing guests
to speed up their boot, and then is accepted (encrypted for the guest) on
demand.

> What do you think of the following:
> -               cond_accept_memory(zone, order);
> +               cond_accept_memory(zone, order, alloc_flags);
> 
>                 /*
>                  * Detect whether the number of free pages is below high
> @@ -7024,7 +7024,8 @@ static inline bool has_unaccepted_memory(void)
>         return static_branch_unlikely(&zones_with_unaccepted_pages);
>  }
> 
> -static bool cond_accept_memory(struct zone *zone, unsigned int order)
> +static bool cond_accept_memory(struct zone *zone, unsigned int order,
> +                              unsigned int alloc_flags)
>  {
>         long to_accept;
>         bool ret = false;
> @@ -7032,6 +7033,9 @@ static bool cond_accept_memory(struct zone
> *zone, unsigned int order)
>         if (!has_unaccepted_memory())
>                 return false;
> 
> +       if (unlikely(alloc_flags & ALLOC_TRYLOCK))
> +               return false;
> +

Yeah that should be the straightforward fix.

>>
>> hope I didn't miss anything else especially in those other debugging hooks
>> (KASAN etc)
> 
> I looked through them and could be missing something, of course.
> kasan usage in alloc_page path seems fine.
> But for slab I found kasan_quarantine logic which needs a special treatment.
> Other slab debugging bits pose issues too.
> The rough idea is to do kmalloc_nolock() / kfree_nolock() that
> don't call into any pre/post hooks (including slab_free_hook,
> slab_pre_alloc_hook).
> kmalloc_nolock() will pretty much call __slab_alloc_node() directly
> and do basic kasan poison stuff that needs no locks.

You'll probably want the memcg handling though? Alloc tagging would be good
too. Both IIRC propagate the gfp flags when trying to allocate their
metadata and can deal with failure so maybe it will work in a straightfoward
way. Slab debugging will have to likely be limited, but it's also not
handled in the pre/post hooks but deeper.

> I will be going over all the paths again, of course.
> 
> Thanks for the reviews so far!
Michal Hocko Dec. 11, 2024, 10:08 a.m. UTC | #12
On Tue 10-12-24 12:25:04, Shakeel Butt wrote:
> On Tue, Dec 10, 2024 at 10:05:22AM +0100, Michal Hocko wrote:
> > On Tue 10-12-24 05:31:30, Matthew Wilcox wrote:
> > > On Mon, Dec 09, 2024 at 06:39:31PM -0800, Alexei Starovoitov wrote:
> > > > +	if (preemptible() && !rcu_preempt_depth())
> > > > +		return alloc_pages_node_noprof(nid,
> > > > +					       GFP_NOWAIT | __GFP_ZERO,
> > > > +					       order);
> > > > +	return alloc_pages_node_noprof(nid,
> > > > +				       __GFP_TRYLOCK | __GFP_NOWARN | __GFP_ZERO,
> > > > +				       order);
> > > 
> > > [...]
> > > 
> > > > @@ -4009,7 +4018,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
> > > >  	 * set both ALLOC_NON_BLOCK and ALLOC_MIN_RESERVE(__GFP_HIGH).
> > > >  	 */
> > > >  	alloc_flags |= (__force int)
> > > > -		(gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM));
> > > > +		(gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM | __GFP_TRYLOCK));
> > > 
> > > It's not quite clear to me that we need __GFP_TRYLOCK to implement this.
> > > I was originally wondering if this wasn't a memalloc_nolock_save() /
> > > memalloc_nolock_restore() situation (akin to memalloc_nofs_save/restore),
> > > but I wonder if we can simply do:
> > > 
> > > 	if (!preemptible() || rcu_preempt_depth())
> > > 		alloc_flags |= ALLOC_TRYLOCK;
> > 
> > preemptible is unusable without CONFIG_PREEMPT_COUNT but I do agree that
> > __GFP_TRYLOCK is not really a preferred way to go forward. For 3
> > reasons. 
> > 
> > First I do not really like the name as it tells what it does rather than
> > how it should be used. This is a general pattern of many gfp flags
> > unfotrunatelly and historically it has turned out error prone. If a gfp
> > flag is really needed then something like __GFP_ANY_CONTEXT should be
> > used.  If the current implementation requires to use try_lock for
> > zone->lock or other changes is not an implementation detail but the user
> > should have a clear understanding that allocation is allowed from any
> > context (NMI, IRQ or otherwise atomic contexts).
> > 
> > Is there any reason why GFP_ATOMIC cannot be extended to support new
> 
> GFP_ATOMIC has access to memory reserves. I see GFP_NOWAIT a better fit
> and if someone wants access to the reserve they can use __GFP_HIGH with
> GFP_NOWAIT.

Right. The problem with GFP_NOWAIT is that it is very often used as an
opportunistic allocation attempt before a more costly fallback. Failing
those just because of the zone lock (or other internal locks) contention
seems too aggressive.

> > Third, do we even want such a strong guarantee in the generic page
> > allocator path and make it even more complex and harder to maintain?
> 
> I think the alternative would be higher maintenance cost i.e. everyone
> creating their own layer/solution/caching over page allocator which I
> think we agree we want to avoid (Vlastimil's LSFMM talk).

Yes, I do agree that we do not want to grow special case allocators. I
was merely interested in an option to reuse existing bulk allocator for
this new purpose.
Michal Hocko Dec. 11, 2024, 10:19 a.m. UTC | #13
On Tue 10-12-24 14:06:32, Alexei Starovoitov wrote:
> On Tue, Dec 10, 2024 at 1:05 AM Michal Hocko <mhocko@suse.com> wrote:
> >
> > On Tue 10-12-24 05:31:30, Matthew Wilcox wrote:
> > > On Mon, Dec 09, 2024 at 06:39:31PM -0800, Alexei Starovoitov wrote:
> > > > +   if (preemptible() && !rcu_preempt_depth())
> > > > +           return alloc_pages_node_noprof(nid,
> > > > +                                          GFP_NOWAIT | __GFP_ZERO,
> > > > +                                          order);
> > > > +   return alloc_pages_node_noprof(nid,
> > > > +                                  __GFP_TRYLOCK | __GFP_NOWARN | __GFP_ZERO,
> > > > +                                  order);
> > >
> > > [...]
> > >
> > > > @@ -4009,7 +4018,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
> > > >      * set both ALLOC_NON_BLOCK and ALLOC_MIN_RESERVE(__GFP_HIGH).
> > > >      */
> > > >     alloc_flags |= (__force int)
> > > > -           (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM));
> > > > +           (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM | __GFP_TRYLOCK));
> > >
> > > It's not quite clear to me that we need __GFP_TRYLOCK to implement this.
> > > I was originally wondering if this wasn't a memalloc_nolock_save() /
> > > memalloc_nolock_restore() situation (akin to memalloc_nofs_save/restore),
> > > but I wonder if we can simply do:
> > >
> > >       if (!preemptible() || rcu_preempt_depth())
> > >               alloc_flags |= ALLOC_TRYLOCK;
> >
> > preemptible is unusable without CONFIG_PREEMPT_COUNT but I do agree that
> > __GFP_TRYLOCK is not really a preferred way to go forward. For 3
> > reasons.
> >
> > First I do not really like the name as it tells what it does rather than
> > how it should be used. This is a general pattern of many gfp flags
> > unfotrunatelly and historically it has turned out error prone. If a gfp
> > flag is really needed then something like __GFP_ANY_CONTEXT should be
> > used.  If the current implementation requires to use try_lock for
> > zone->lock or other changes is not an implementation detail but the user
> > should have a clear understanding that allocation is allowed from any
> > context (NMI, IRQ or otherwise atomic contexts).
> 
> __GFP_ANY_CONTEXT would make sense if we wanted to make it available
> for all kernel users. In this case I agree with Sebastian.
> This is bpf specific feature, since it doesn't know the context.
> All other kernel users should pick GFP_KERNEL or ATOMIC or NOWAIT.
> Exposing GFP_ANY_CONTEXT to all may lead to sloppy code in drivers
> and elsewhere.

I do not think we want a single user special allocation mode. Not only
there is no way to enforce this to remain BPF special feature, it is
also not really a good idea to have a single user feature in the
allocator.

> > Is there any reason why GFP_ATOMIC cannot be extended to support new
> > contexts? This allocation mode is already documented to be usable from
> > atomic contexts except from NMI and raw_spinlocks. But is it feasible to
> > extend the current implementation to use only trylock on zone->lock if
> > called from in_nmi() to reduce unexpected failures on contention for
> > existing users?
> 
> No. in_nmi() doesn't help. It's the lack of reentrance of slab and page
> allocator that is an issue.
> The page alloctor might grab zone lock. In !RT it will disable irqs.
> In RT will stay sleepable. Both paths will be calling other
> kernel code including tracepoints, potential kprobes, etc
> and bpf prog may be attached somewhere.
> If it calls alloc_page() it may deadlock on zone->lock.
> pcpu lock is thankfully trylock already.
> So !irqs_disabled() part of preemptible() guarantees that
> zone->lock won't deadlock in !RT.
> And rcu_preempt_depth() case just steers bpf into try lock only path in RT.
> Since there is no way to tell whether it's safe to call
> sleepable spin_lock(&zone->lock).

OK I see!

> > We
> > already have a precence in form of __alloc_pages_bulk which is a special
> > case allocator mode living outside of the page allocator path. It seems
> > that it covers most of your requirements except the fallback to the
> > regular allocation path AFAICS. Is this something you could piggy back
> > on?
> 
> __alloc_pages_bulk() has all the same issues. It takes locks.
> Also it doesn't support GFP_ACCOUNT which is a show stopper.
> All bpf allocations are going through memcg.

OK, this requirement was not clear until I've reached later patches in
the series (now).
Alexei Starovoitov Dec. 12, 2024, 2:14 a.m. UTC | #14
On Wed, Dec 11, 2024 at 12:39 AM Vlastimil Babka <vbabka@suse.cz> wrote:
>
> On 12/10/24 22:53, Alexei Starovoitov wrote:
> > On Tue, Dec 10, 2024 at 1:01 AM Sebastian Andrzej Siewior
> > <bigeasy@linutronix.de> wrote:
> >>
> >> On 2024-12-09 18:39:31 [-0800], Alexei Starovoitov wrote:
> >> > From: Alexei Starovoitov <ast@kernel.org>
> >> >
> >> > Tracing BPF programs execute from tracepoints and kprobes where running
> >> > context is unknown, but they need to request additional memory.
> >> > The prior workarounds were using pre-allocated memory and BPF specific
> >> > freelists to satisfy such allocation requests. Instead, introduce
> >> > __GFP_TRYLOCK flag that makes page allocator accessible from any context.
> >> > It relies on percpu free list of pages that rmqueue_pcplist() should be
> >> > able to pop the page from. If it fails (due to IRQ re-entrancy or list
> >> > being empty) then try_alloc_pages() attempts to spin_trylock zone->lock
> >> > and refill percpu freelist as normal.
> >> > BPF program may execute with IRQs disabled and zone->lock is sleeping in RT,
> >> > so trylock is the only option.
> >>
> >> The __GFP_TRYLOCK flag looks reasonable given the challenges for BPF
> >> where it is not known how much memory will be needed and what the
> >> calling context is.
> >
> > Exactly.
> >
> >> I hope it does not spread across the kernel where
> >> people do ATOMIC in preempt/ IRQ-off on PREEMPT_RT and then once they
> >> learn that this does not work, add this flag to the mix to make it work
> >> without spending some time on reworking it.
> >
> > We can call it __GFP_BPF to discourage any other usage,
> > but that seems like an odd "solution" to code review problem.
>
> Could we perhaps not expose the flag to public headers at all, and keep it
> only as an internal detail of try_alloc_pages_noprof()?

public headers?
To pass additional bit via gfp flags into alloc_pages
gfp_types.h has to be touched.

If you mean moving try_alloc_pages() into mm/page_alloc.c and
adding another argument to __alloc_pages_noprof then it's not pretty.
It has 'gfp_t gfp' argument. It should to be used to pass the intent.

We don't have to add GFP_TRYLOCK at all if we go with
memalloc_nolock_save() approach.
So I started looking at it,
but immediately hit trouble with bits.
There are 5 bits left in PF_ and 3 already used for mm needs.
That doesn't look sustainable long term.
How about we alias nolock concept with PF_MEMALLOC_PIN ?

As far as I could trace PF_MEMALLOC_PIN clears GFP_MOVABLE and nothing else.

The same bit plus lack of __GFP_KSWAPD_RECLAIM in gfp flags
would mean nolock mode in alloc_pages,
while PF_MEMALLOC_PIN alone would mean nolock in free_pages
and deeper inside memcg paths and such.

thoughts? too hacky?
Vlastimil Babka Dec. 12, 2024, 8:54 a.m. UTC | #15
On 12/12/24 03:14, Alexei Starovoitov wrote:
> On Wed, Dec 11, 2024 at 12:39 AM Vlastimil Babka <vbabka@suse.cz> wrote:
>>
>> On 12/10/24 22:53, Alexei Starovoitov wrote:
>> > On Tue, Dec 10, 2024 at 1:01 AM Sebastian Andrzej Siewior
>> > <bigeasy@linutronix.de> wrote:
>> >>
>> >> On 2024-12-09 18:39:31 [-0800], Alexei Starovoitov wrote:
>> >> > From: Alexei Starovoitov <ast@kernel.org>
>> >> >
>> >> > Tracing BPF programs execute from tracepoints and kprobes where running
>> >> > context is unknown, but they need to request additional memory.
>> >> > The prior workarounds were using pre-allocated memory and BPF specific
>> >> > freelists to satisfy such allocation requests. Instead, introduce
>> >> > __GFP_TRYLOCK flag that makes page allocator accessible from any context.
>> >> > It relies on percpu free list of pages that rmqueue_pcplist() should be
>> >> > able to pop the page from. If it fails (due to IRQ re-entrancy or list
>> >> > being empty) then try_alloc_pages() attempts to spin_trylock zone->lock
>> >> > and refill percpu freelist as normal.
>> >> > BPF program may execute with IRQs disabled and zone->lock is sleeping in RT,
>> >> > so trylock is the only option.
>> >>
>> >> The __GFP_TRYLOCK flag looks reasonable given the challenges for BPF
>> >> where it is not known how much memory will be needed and what the
>> >> calling context is.
>> >
>> > Exactly.
>> >
>> >> I hope it does not spread across the kernel where
>> >> people do ATOMIC in preempt/ IRQ-off on PREEMPT_RT and then once they
>> >> learn that this does not work, add this flag to the mix to make it work
>> >> without spending some time on reworking it.
>> >
>> > We can call it __GFP_BPF to discourage any other usage,
>> > but that seems like an odd "solution" to code review problem.
>>
>> Could we perhaps not expose the flag to public headers at all, and keep it
>> only as an internal detail of try_alloc_pages_noprof()?
> 
> public headers?

I mean it could be (with some work) defined only in e.g. mm/internal.h,
which the flag printing code would then need to include.

> To pass additional bit via gfp flags into alloc_pages
> gfp_types.h has to be touched.

Ah right, try_alloc_pages_noprof() would need to move to page_alloc.c
instead of being static inline in the header.

> If you mean moving try_alloc_pages() into mm/page_alloc.c and
> adding another argument to __alloc_pages_noprof then it's not pretty.
> It has 'gfp_t gfp' argument. It should to be used to pass the intent.

__GFP_TRYLOCK could be visible in page_alloc.c to do this, but not ouside mm
code.

> We don't have to add GFP_TRYLOCK at all if we go with
> memalloc_nolock_save() approach.

I have doubts about that idea. We recently rejected PF_MEMALLOC_NORECLAIM
because it could lead to allocations nested in that scope failing and they
might not expect it. Scoped trylock would have even higher chance of failing.

I think here we need to pass the flag as part of gfp flags only within
nested allocations (for metadata or debugging) within the slab/page
allocator itself, which we already mostly do. The harder problem is not
missing any place where it should affect taking a lock, and a PF_ flag won't
help with that (as we can't want all locking functions to look at it).
Maybe it could help with lockdep helping us find locks that we missed, but
I'm sure lockdep could be made to track the trylock scope even without a PF
flag?

> So I started looking at it,
> but immediately hit trouble with bits.
> There are 5 bits left in PF_ and 3 already used for mm needs.
> That doesn't look sustainable long term.
> How about we alias nolock concept with PF_MEMALLOC_PIN ?
> 
> As far as I could trace PF_MEMALLOC_PIN clears GFP_MOVABLE and nothing else.
> 
> The same bit plus lack of __GFP_KSWAPD_RECLAIM in gfp flags
> would mean nolock mode in alloc_pages,
> while PF_MEMALLOC_PIN alone would mean nolock in free_pages
> and deeper inside memcg paths and such.
> 
> thoughts? too hacky?
Sebastian Andrzej Siewior Dec. 12, 2024, 3:07 p.m. UTC | #16
On 2024-12-10 14:06:32 [-0800], Alexei Starovoitov wrote:
> > Is there any reason why GFP_ATOMIC cannot be extended to support new
> > contexts? This allocation mode is already documented to be usable from
> > atomic contexts except from NMI and raw_spinlocks. But is it feasible to
> > extend the current implementation to use only trylock on zone->lock if
> > called from in_nmi() to reduce unexpected failures on contention for
> > existing users?
> 
> No. in_nmi() doesn't help. It's the lack of reentrance of slab and page
> allocator that is an issue.
> The page alloctor might grab zone lock. In !RT it will disable irqs.
> In RT will stay sleepable. Both paths will be calling other
> kernel code including tracepoints, potential kprobes, etc
> and bpf prog may be attached somewhere.
> If it calls alloc_page() it may deadlock on zone->lock.
> pcpu lock is thankfully trylock already.
> So !irqs_disabled() part of preemptible() guarantees that
> zone->lock won't deadlock in !RT.
> And rcu_preempt_depth() case just steers bpf into try lock only path in RT.
> Since there is no way to tell whether it's safe to call
> sleepable spin_lock(&zone->lock).

Oh. You don't need to check rcu_preempt_depth() for that. On PREEMPT_RT
rcu_preempt_depth() is incremented with every spin_lock() because we
need an explicit start of a RCU section (same thing happens with
preempt_disable() spin_lock()). If there is already a RCU section
(rcu_preempt_depth() > 0) you can still try to acquire a spinlock_t and
maybe schedule out/ sleep. That is okay.

But since I see in_nmi(). You can't trylock from NMI on RT. The trylock
part is easy but unlock might need to acquire rt_mutex_base::wait_lock
and worst case is to wake a waiter via wake_up_process().

Sebastian
Michal Hocko Dec. 12, 2024, 3:21 p.m. UTC | #17
On Thu 12-12-24 16:07:44, Sebastian Sewior wrote:
> But since I see in_nmi(). You can't trylock from NMI on RT. The trylock
> part is easy but unlock might need to acquire rt_mutex_base::wait_lock
> and worst case is to wake a waiter via wake_up_process().

Ohh, I didn't realize that. So try_lock would only be safe on
raw_spin_lock right?
Sebastian Andrzej Siewior Dec. 12, 2024, 3:35 p.m. UTC | #18
On 2024-12-12 16:21:28 [+0100], Michal Hocko wrote:
> On Thu 12-12-24 16:07:44, Sebastian Sewior wrote:
> > But since I see in_nmi(). You can't trylock from NMI on RT. The trylock
> > part is easy but unlock might need to acquire rt_mutex_base::wait_lock
> > and worst case is to wake a waiter via wake_up_process().
> 
> Ohh, I didn't realize that. So try_lock would only be safe on
> raw_spin_lock right?

If NMI is one of the possible calling contexts, yes.

One thing I am not 100% sure about is how "good" a spinlock_t trylock is
if attempted from hardirq (on PREEMPT_RT). Obtaining the lock und
unlocking is doable. The lock part will assign the "current" task as the
task that owns the lock now. This task is just randomly on the CPU while
the hardirq triggered. The regular spin_lock() will see this random task
as the owner and might PI-boost it. This could work…

Sebastian
Steven Rostedt Dec. 12, 2024, 3:48 p.m. UTC | #19
On Thu, 12 Dec 2024 16:35:06 +0100
Sebastian Sewior <bigeasy@linutronix.de> wrote:

> If NMI is one of the possible calling contexts, yes.
> 
> One thing I am not 100% sure about is how "good" a spinlock_t trylock is
> if attempted from hardirq (on PREEMPT_RT). Obtaining the lock und
> unlocking is doable. The lock part will assign the "current" task as the
> task that owns the lock now. This task is just randomly on the CPU while
> the hardirq triggered. The regular spin_lock() will see this random task
> as the owner and might PI-boost it. This could work…

Looking at the unlock code, it and the slowtrylock() appears to use
raw_spin_lock_irqsave(). Hence it expects that it can be called from
irq disabled context. If it can be used in interrupt disabled context,
I don't see why it wouldn't work in actual interrupt context.

-- Steve
Sebastian Andrzej Siewior Dec. 12, 2024, 4 p.m. UTC | #20
On 2024-12-12 10:48:09 [-0500], Steven Rostedt wrote:
> On Thu, 12 Dec 2024 16:35:06 +0100
> Sebastian Sewior <bigeasy@linutronix.de> wrote:
> 
> > If NMI is one of the possible calling contexts, yes.
> > 
> > One thing I am not 100% sure about is how "good" a spinlock_t trylock is
> > if attempted from hardirq (on PREEMPT_RT). Obtaining the lock und
> > unlocking is doable. The lock part will assign the "current" task as the
> > task that owns the lock now. This task is just randomly on the CPU while
> > the hardirq triggered. The regular spin_lock() will see this random task
> > as the owner and might PI-boost it. This could work…
> 
> Looking at the unlock code, it and the slowtrylock() appears to use
> raw_spin_lock_irqsave(). Hence it expects that it can be called from
> irq disabled context. If it can be used in interrupt disabled context,
> I don't see why it wouldn't work in actual interrupt context.

trylock records current as owner. This task did not attempt to acquire
the lock.
The lock part will might PI-boost it via task_blocks_on_rt_mutex() -
this random task. That task might already wait on one lock and now this
gets added to the mix.
This could be okay since a task can own multiple locks, wait on one and
get PI boosted on any of those at any time.
However, we never used that way.

The lockig of the raw_spinlock_t has irqsave. Correct. But not because
it expects to be called in interrupt disabled context or an actual
interrupt. It was _irq() but got changed because it is used in the early
init code and would unconditionally enable interrupts which should
remain disabled.

> -- Steve

Sebastian
Alexei Starovoitov Dec. 12, 2024, 9:57 p.m. UTC | #21
On Thu, Dec 12, 2024 at 7:35 AM Sebastian Sewior <bigeasy@linutronix.de> wrote:
>
> On 2024-12-12 16:21:28 [+0100], Michal Hocko wrote:
> > On Thu 12-12-24 16:07:44, Sebastian Sewior wrote:
> > > But since I see in_nmi(). You can't trylock from NMI on RT. The trylock
> > > part is easy but unlock might need to acquire rt_mutex_base::wait_lock
> > > and worst case is to wake a waiter via wake_up_process().
> >
> > Ohh, I didn't realize that. So try_lock would only be safe on
> > raw_spin_lock right?
>
> If NMI is one of the possible calling contexts, yes.

Looks like in_nmi both trylock and unlock are not safe.

pcp_spin_trylock() calls __rt_spin_trylock() on RT,
which can deadlock inside rt_mutex_slowtrylock().
This part has a potential workaround like:

@@ -102,8 +102,11 @@ static __always_inline int
__rt_spin_trylock(spinlock_t *lock)
 {
        int ret = 1;

-       if (unlikely(!rt_mutex_cmpxchg_acquire(&lock->lock, NULL, current)))
+       if (unlikely(!rt_mutex_cmpxchg_acquire(&lock->lock, NULL, current))) {
+               if (in_nmi())
+                       return 0;
                ret = rt_mutex_slowtrylock(&lock->lock);
+       }

but when there are waiters and cmpxchg in this part fails:
        if (unlikely(!rt_mutex_cmpxchg_release(&lock->lock, current, NULL)))
                rt_mutex_slowunlock(&lock->lock);

will trigger slowunlock that is impossible to do from nmi.
We can punt it to irqwork with IRQ_WORK_HARD_IRQ to make sure
it runs as soon as nmi finishes.
Since it's hard irq debug_rt_mutex_unlock(lock); shouldn't complain.
The current will stay the same ?
Other ideas?
Steven Rostedt Dec. 13, 2024, 5:44 p.m. UTC | #22
On Thu, 12 Dec 2024 17:00:09 +0100
Sebastian Sewior <bigeasy@linutronix.de> wrote:

> 
> The lockig of the raw_spinlock_t has irqsave. Correct. But not because
> it expects to be called in interrupt disabled context or an actual
> interrupt. It was _irq() but got changed because it is used in the early
> init code and would unconditionally enable interrupts which should
> remain disabled.
> 

Yep, I understand that. My point was that because it does it this way, it
should also work in hard interrupt context. But it doesn't!

Looking deeper, I do not think this is safe from interrupt context!

I'm looking at the rt_mutex_slowlock_block():


		if (waiter == rt_mutex_top_waiter(lock))
			owner = rt_mutex_owner(lock);
		else
			owner = NULL;
		raw_spin_unlock_irq(&lock->wait_lock);

		if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner))
			rt_mutex_schedule();


If we take an interrupt right after the raw_spin_unlock_irq() and then do a
trylock on an rt_mutex in the interrupt and it gets the lock. The task is
now both blocked on a lock and also holding a lock that's later in the
chain. I'm not sure the PI logic can handle such a case. That is, we have
in the chain of the task:

 lock A (blocked-waiting-for-lock) -> lock B (taken in interrupt)

If another task blocks on B, it will reverse order the lock logic. It will
see the owner is the task, but the task is blocked on A, the PI logic
assumes that for such a case, the lock order would be:

  B -> A

But this is not the case. I'm not sure what would happen here, but it is
definitely out of scope of the requirements of the PI logic and thus,
trylock must also not be used in hard interrupt context.

-- Steve
Alexei Starovoitov Dec. 13, 2024, 6:44 p.m. UTC | #23
On Fri, Dec 13, 2024 at 9:43 AM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Thu, 12 Dec 2024 17:00:09 +0100
> Sebastian Sewior <bigeasy@linutronix.de> wrote:
>
> >
> > The lockig of the raw_spinlock_t has irqsave. Correct. But not because
> > it expects to be called in interrupt disabled context or an actual
> > interrupt. It was _irq() but got changed because it is used in the early
> > init code and would unconditionally enable interrupts which should
> > remain disabled.
> >
>
> Yep, I understand that. My point was that because it does it this way, it
> should also work in hard interrupt context. But it doesn't!
>
> Looking deeper, I do not think this is safe from interrupt context!
>
> I'm looking at the rt_mutex_slowlock_block():
>
>
>                 if (waiter == rt_mutex_top_waiter(lock))
>                         owner = rt_mutex_owner(lock);
>                 else
>                         owner = NULL;
>                 raw_spin_unlock_irq(&lock->wait_lock);
>
>                 if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner))
>                         rt_mutex_schedule();
>
>
> If we take an interrupt right after the raw_spin_unlock_irq() and then do a
> trylock on an rt_mutex in the interrupt and it gets the lock. The task is
> now both blocked on a lock and also holding a lock that's later in the
> chain. I'm not sure the PI logic can handle such a case. That is, we have
> in the chain of the task:
>
>  lock A (blocked-waiting-for-lock) -> lock B (taken in interrupt)
>
> If another task blocks on B, it will reverse order the lock logic. It will
> see the owner is the task, but the task is blocked on A, the PI logic
> assumes that for such a case, the lock order would be:
>
>   B -> A
>
> But this is not the case. I'm not sure what would happen here, but it is
> definitely out of scope of the requirements of the PI logic and thus,
> trylock must also not be used in hard interrupt context.

If hard-irq acquired rt_mutex B (spin_lock or spin_trylock doesn't
change the above analysis), the task won't schedule
and it has to release this rt_mutex B before reenabling irq.
The irqrestore without releasing the lock is a bug regardless.

What's the concern then? That PI may see an odd order of locks for this task ?
but it cannot do anything about it anyway, since the task won't schedule.
And before irq handler is over the B will be released and everything
will look normal again.
Alexei Starovoitov Dec. 13, 2024, 6:57 p.m. UTC | #24
On Fri, Dec 13, 2024 at 10:44 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> If hard-irq acquired rt_mutex B (spin_lock or spin_trylock doesn't
> change the above analysis),

attempting to spin_lock from irq is obviously not safe. scratch that part.
Steven Rostedt Dec. 13, 2024, 8:09 p.m. UTC | #25
On Fri, 13 Dec 2024 10:44:26 -0800
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:

> > But this is not the case. I'm not sure what would happen here, but it is
> > definitely out of scope of the requirements of the PI logic and thus,
> > trylock must also not be used in hard interrupt context.  
> 
> If hard-irq acquired rt_mutex B (spin_lock or spin_trylock doesn't
> change the above analysis), the task won't schedule
> and it has to release this rt_mutex B before reenabling irq.
> The irqrestore without releasing the lock is a bug regardless.
> 
> What's the concern then? That PI may see an odd order of locks for this task ?
> but it cannot do anything about it anyway, since the task won't schedule.
> And before irq handler is over the B will be released and everything
> will look normal again.

The problem is the chain walk. It could also cause unwanted side effects in RT.

If low priority task 1 has lock A and is running on another CPU and low
priority task 2 blocks on lock A and then is interrupted right before going
to sleep as being "blocked on", and takes lock B in the interrupt context.
We then have high priority task 3 on another CPU block on B which will then
see that the owner of B is blocked (even though it is not blocked for B), it
will boost its priority as well as the owner of the lock (A). The A owner
will get boosted where it is not the task that is blocking the high
priority task.

My point is that RT is all about deterministic behavior. It would require
a pretty substantial audit to the PI logic to make sure that this doesn't
cause any unexpected results.

My point is, the PI logic was not designed for taking a lock after being
blocked on another lock. It may work, but we had better prove and show all
side effects that can happen in these cases.

-- Steve
Steven Rostedt Dec. 13, 2024, 9 p.m. UTC | #26
On Fri, 13 Dec 2024 15:09:50 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:

> > What's the concern then? That PI may see an odd order of locks for this task ?
> > but it cannot do anything about it anyway, since the task won't schedule.
> > And before irq handler is over the B will be released and everything
> > will look normal again.  
> 
> The problem is the chain walk. It could also cause unwanted side effects in RT.
> 
> If low priority task 1 has lock A and is running on another CPU and low
> priority task 2 blocks on lock A and then is interrupted right before going
> to sleep as being "blocked on", and takes lock B in the interrupt context.
> We then have high priority task 3 on another CPU block on B which will then
> see that the owner of B is blocked (even though it is not blocked for B), it
> will boost its priority as well as the owner of the lock (A). The A owner
> will get boosted where it is not the task that is blocking the high
> priority task.
> 
> My point is that RT is all about deterministic behavior. It would require
> a pretty substantial audit to the PI logic to make sure that this doesn't
> cause any unexpected results.
> 
> My point is, the PI logic was not designed for taking a lock after being
> blocked on another lock. It may work, but we had better prove and show all
> side effects that can happen in these cases.

When B is released, task 2 will be unboosted, but task 1 will not. That's
because a task is only unboosted when it releases a lock (or a lock times
out, and then the waiter will unboost the chain, but that's not this case).

Task 1 will unboost when it finally releases lock A.

Another issue is that interrupts are not stopped by spin_lock_irq*() as in
RT spin_lock_irq*() is the same as spin_lock(). As spin_locks() are assumed
to only be in threaded irq context, there's no reason to actually disable
interrupts when taking one of these spin_lock_irq*().

That means we could have task 1 trying to take lock B too. If we have a
lock order of:

  A -> B

Where B is the trylock in the irq context, we have:


  CPU 1			CPU 2			CPU 3
  -----			-----			-----
 task 1 takes A
			task 2 blocks on A, gets interrupted, trylock B and succeeds:

 task 1 takes B (blocks)

						High priority task 3 blocks on B

Now we have this in the PI chain:

  Task 3 boosts tasks 2 but task 2 is blocked on A
  Task 3 then boosts owner of A which is task 1 which is blocked on lock B
  Task 3 then boosts owner of lock B which is task 2
  [ wash, rinse, repeat!!! ]

There is a safety valve in the code that will prevent an infinite loop, and
it will trigger a printk message or it may stop because the priorities are
equal, but still, this is not desirable and may even have other side
effects. If deadlock detection is enabled, this will trigger it.

Again, allowing spin_locks being taken in hard irq context in RT, even with
trylock is going to open a nasty can of worms that will make this less
deterministic and determinism is the entire point of RT. If we allow one
user to have spin_lock_trylock() in hard irq context, we have to allow
anyone to do it.

-- Steve
Alexei Starovoitov Dec. 13, 2024, 10:02 p.m. UTC | #27
On Fri, Dec 13, 2024 at 1:00 PM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> Again, allowing spin_locks being taken in hard irq context in RT, even with
> trylock is going to open a nasty can of worms that will make this less
> deterministic and determinism is the entire point of RT. If we allow one
> user to have spin_lock_trylock() in hard irq context, we have to allow
> anyone to do it.

The boosting mess is a concern indeed, but looks like it's
happening already.

See netpoll_send_udp() -> netpoll_send_skb() that does
local_irq_save() and then __netpoll_send_skb() ->
HARD_TX_TRYLOCK() -> __netif_tx_trylock() -> spin_trylock()

netconsole is the key mechanism for some hyperscalers.
It's not a niche feature.

Sounds like it's sort-of broken or rather not-working correctly
by RT standards, but replacing _xmit_lock with raw_spin_lock
is probably not an option.
So either netpoll needs to be redesigned somehow, since it has to
printk->netcons->udp->skb->tx_lock->xmit on the wire,
otherwise dmesg messages will be lost.
or make PI aware of this.

local_irq_save() is not an issue per-se,
it's printk->necons that can be called from any context
including hard irq.

For the purpose of this patch set I think I have to
if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_hardirq() || in_nmi()))
  goto out;

Looks like it's safe to call spin_trylock() on RT after
local_irq_save() and/or when irqs_disabled().
It's a hardirq/nmi context that is a problem.
diff mbox series

Patch

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index b0fe9f62d15b..f68daa9c997b 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -347,6 +347,31 @@  static inline struct page *alloc_page_vma_noprof(gfp_t gfp,
 }
 #define alloc_page_vma(...)			alloc_hooks(alloc_page_vma_noprof(__VA_ARGS__))
 
+static inline struct page *try_alloc_pages_noprof(int nid, unsigned int order)
+{
+	/*
+	 * If spin_locks are not held and interrupts are enabled, use normal
+	 * path. BPF progs run under rcu_read_lock(), so in PREEMPT_RT
+	 * rcu_preempt_depth() will be >= 1 and will use trylock path.
+	 */
+	if (preemptible() && !rcu_preempt_depth())
+		return alloc_pages_node_noprof(nid,
+					       GFP_NOWAIT | __GFP_ZERO,
+					       order);
+	/*
+	 * Best effort allocation from percpu free list.
+	 * If it's empty attempt to spin_trylock zone->lock.
+	 * Do not specify __GFP_KSWAPD_RECLAIM to avoid wakeup_kswapd
+	 * that may need to grab a lock.
+	 * Do not specify __GFP_ACCOUNT to avoid local_lock.
+	 * Do not warn either.
+	 */
+	return alloc_pages_node_noprof(nid,
+				       __GFP_TRYLOCK | __GFP_NOWARN | __GFP_ZERO,
+				       order);
+}
+#define try_alloc_pages(...)			alloc_hooks(try_alloc_pages_noprof(__VA_ARGS__))
+
 extern unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order);
 #define __get_free_pages(...)			alloc_hooks(get_free_pages_noprof(__VA_ARGS__))
 
diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h
index 65db9349f905..72b385a7888d 100644
--- a/include/linux/gfp_types.h
+++ b/include/linux/gfp_types.h
@@ -48,6 +48,7 @@  enum {
 	___GFP_THISNODE_BIT,
 	___GFP_ACCOUNT_BIT,
 	___GFP_ZEROTAGS_BIT,
+	___GFP_TRYLOCK_BIT,
 #ifdef CONFIG_KASAN_HW_TAGS
 	___GFP_SKIP_ZERO_BIT,
 	___GFP_SKIP_KASAN_BIT,
@@ -86,6 +87,7 @@  enum {
 #define ___GFP_THISNODE		BIT(___GFP_THISNODE_BIT)
 #define ___GFP_ACCOUNT		BIT(___GFP_ACCOUNT_BIT)
 #define ___GFP_ZEROTAGS		BIT(___GFP_ZEROTAGS_BIT)
+#define ___GFP_TRYLOCK		BIT(___GFP_TRYLOCK_BIT)
 #ifdef CONFIG_KASAN_HW_TAGS
 #define ___GFP_SKIP_ZERO	BIT(___GFP_SKIP_ZERO_BIT)
 #define ___GFP_SKIP_KASAN	BIT(___GFP_SKIP_KASAN_BIT)
@@ -293,6 +295,7 @@  enum {
 #define __GFP_COMP	((__force gfp_t)___GFP_COMP)
 #define __GFP_ZERO	((__force gfp_t)___GFP_ZERO)
 #define __GFP_ZEROTAGS	((__force gfp_t)___GFP_ZEROTAGS)
+#define __GFP_TRYLOCK	((__force gfp_t)___GFP_TRYLOCK)
 #define __GFP_SKIP_ZERO ((__force gfp_t)___GFP_SKIP_ZERO)
 #define __GFP_SKIP_KASAN ((__force gfp_t)___GFP_SKIP_KASAN)
 
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index bb8a59c6caa2..592c93ee5f35 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -50,6 +50,7 @@ 
 	gfpflag_string(__GFP_RECLAIM),		\
 	gfpflag_string(__GFP_DIRECT_RECLAIM),	\
 	gfpflag_string(__GFP_KSWAPD_RECLAIM),	\
+	gfpflag_string(__GFP_TRYLOCK),		\
 	gfpflag_string(__GFP_ZEROTAGS)
 
 #ifdef CONFIG_KASAN_HW_TAGS
diff --git a/mm/fail_page_alloc.c b/mm/fail_page_alloc.c
index 7647096170e9..b3b297d67909 100644
--- a/mm/fail_page_alloc.c
+++ b/mm/fail_page_alloc.c
@@ -31,6 +31,12 @@  bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
 		return false;
 	if (gfp_mask & __GFP_NOFAIL)
 		return false;
+	if (gfp_mask & __GFP_TRYLOCK)
+		/*
+		 * Internals of should_fail_ex() are not compatible
+		 * with trylock concept.
+		 */
+		return false;
 	if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
 		return false;
 	if (fail_page_alloc.ignore_gfp_reclaim &&
diff --git a/mm/internal.h b/mm/internal.h
index cb8d8e8e3ffa..c082b8fa1d71 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1175,6 +1175,7 @@  unsigned int reclaim_clean_pages_from_list(struct zone *zone,
 #endif
 #define ALLOC_HIGHATOMIC	0x200 /* Allows access to MIGRATE_HIGHATOMIC */
 #define ALLOC_KSWAPD		0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */
+#define ALLOC_TRYLOCK		0x1000000 /* Only use spin_trylock in allocation path */
 
 /* Flags that allow allocations below the min watermark. */
 #define ALLOC_RESERVES (ALLOC_NON_BLOCK|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC|ALLOC_OOM)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1cb4b8c8886d..d511e68903c6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2304,7 +2304,11 @@  static int rmqueue_bulk(struct zone *zone, unsigned int order,
 	unsigned long flags;
 	int i;
 
-	spin_lock_irqsave(&zone->lock, flags);
+	if (!spin_trylock_irqsave(&zone->lock, flags)) {
+		if (unlikely(alloc_flags & ALLOC_TRYLOCK))
+			return 0;
+		spin_lock_irqsave(&zone->lock, flags);
+	}
 	for (i = 0; i < count; ++i) {
 		struct page *page = __rmqueue(zone, order, migratetype,
 								alloc_flags);
@@ -2904,7 +2908,11 @@  struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
 
 	do {
 		page = NULL;
-		spin_lock_irqsave(&zone->lock, flags);
+		if (!spin_trylock_irqsave(&zone->lock, flags)){
+			if (unlikely(alloc_flags & ALLOC_TRYLOCK))
+				return NULL;
+			spin_lock_irqsave(&zone->lock, flags);
+		}
 		if (alloc_flags & ALLOC_HIGHATOMIC)
 			page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
 		if (!page) {
@@ -4001,6 +4009,7 @@  gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
 	 */
 	BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_MIN_RESERVE);
 	BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD);
+	BUILD_BUG_ON(__GFP_TRYLOCK != (__force gfp_t) ALLOC_TRYLOCK);
 
 	/*
 	 * The caller may dip into page reserves a bit more if the caller
@@ -4009,7 +4018,7 @@  gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
 	 * set both ALLOC_NON_BLOCK and ALLOC_MIN_RESERVE(__GFP_HIGH).
 	 */
 	alloc_flags |= (__force int)
-		(gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM));
+		(gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM | __GFP_TRYLOCK));
 
 	if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) {
 		/*
@@ -4509,6 +4518,8 @@  static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
 
 	might_alloc(gfp_mask);
 
+	*alloc_flags |= (__force int) (gfp_mask & __GFP_TRYLOCK);
+
 	if (should_fail_alloc_page(gfp_mask, order))
 		return false;
 
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index 4d8d94146f8d..1f7f4269fa10 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -682,6 +682,7 @@  static const struct {
 	{ "__GFP_RECLAIM",		"R" },
 	{ "__GFP_DIRECT_RECLAIM",	"DR" },
 	{ "__GFP_KSWAPD_RECLAIM",	"KR" },
+	{ "__GFP_TRYLOCK",		"TL" },
 };
 
 static size_t max_gfp_len;