diff mbox series

[bpf-next,v3,1/6] mm, bpf: Introduce try_alloc_pages() for opportunistic page allocation

Message ID 20241218030720.1602449-2-alexei.starovoitov@gmail.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Series bpf, mm: Introduce try_alloc_pages()

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 557 this patch: 557
netdev/build_tools success Errors and warnings before: 0 (+0) this patch: 0 (+0)
netdev/cc_maintainers warning 2 maintainers not CCed: clrkwllms@kernel.org linux-rt-devel@lists.linux.dev
netdev/build_clang fail Errors and warnings before: 22694 this patch: 22534
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 16043 this patch: 15832
netdev/checkpatch warning CHECK: No space is necessary after a cast WARNING: line length of 81 exceeds 80 columns WARNING: line length of 85 exceeds 80 columns WARNING: line length of 88 exceeds 80 columns WARNING: line length of 96 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 5 this patch: 5
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-18 success Logs for s390x-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-19 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-11 success Logs for aarch64-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-17 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-17 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-45 success Logs for x86_64-llvm-18 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-46 success Logs for x86_64-llvm-18 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-gcc / veristat-kernel / x86_64-gcc veristat_kernel
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-gcc / veristat-meta / x86_64-gcc veristat_meta
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-43 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-44 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc

Commit Message

Alexei Starovoitov Dec. 18, 2024, 3:07 a.m. UTC
From: Alexei Starovoitov <ast@kernel.org>

Tracing BPF programs execute from tracepoints and kprobes where the
running context is unknown, yet they may need to request additional
memory. Prior workarounds used pre-allocated memory and BPF-specific
freelists to satisfy such allocation requests. Instead, introduce an
internal __GFP_TRYLOCK flag that makes the page allocator accessible
from any context. It relies on the per-CPU free list of pages that
rmqueue_pcplist() should be able to pop a page from. If that fails
(due to IRQ re-entrancy or an empty list), try_alloc_pages() attempts
to spin_trylock zone->lock and refill the per-CPU freelist as usual.
A BPF program may execute with IRQs disabled, and zone->lock is a
sleeping lock on RT, so trylock is the only option. In theory we
could introduce a per-CPU reentrance counter and increment it every
time spin_lock_irqsave(&zone->lock, flags) is taken, but we cannot
rely on it: even when this CPU is not in the page_alloc path,
spin_lock_irqsave() is not safe, since the BPF prog might be called
from a tracepoint where preemption is disabled. So trylock only.

Note that free_page and memcg are not taught about __GFP_TRYLOCK yet;
that support comes in the next patches.

This is a first step towards supporting BPF requirements in SLUB
and getting rid of bpf_mem_alloc.
That goal was discussed at LSFMM: https://lwn.net/Articles/974138/

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/gfp.h       |  3 ++
 include/linux/gfp_types.h |  1 +
 mm/internal.h             |  2 ++
 mm/page_alloc.c           | 69 ++++++++++++++++++++++++++++++++++++---
 4 files changed, 71 insertions(+), 4 deletions(-)
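
As a usage sketch only (the in-kernel callers come in later patches of
the series; the snippet below is illustrative and assumes a caller that
must tolerate allocation failure):

#include <linux/gfp.h>	/* try_alloc_pages() declared by this patch */
#include <linux/mm.h>	/* page_address() */

/*
 * Hedged example, not part of the series: an order-0 allocation from a
 * context where neither sleeping nor spinning on zone->lock is allowed.
 * try_alloc_pages() returns a zeroed page or NULL; it never warns and
 * never sleeps, so the caller must handle failure gracefully.
 */
static void *grab_scratch_page(void)
{
	struct page *page = try_alloc_pages(NUMA_NO_NODE, 0);

	return page ? page_address(page) : NULL;
}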

Comments

Michal Hocko Dec. 18, 2024, 11:32 a.m. UTC | #1
I like this proposal better. I am still not convinced that we really
need internal __GFP_TRYLOCK though. 

If we reduce try_alloc_pages to the gfp usage we are at the following

On Tue 17-12-24 19:07:14, alexei.starovoitov@gmail.com wrote:
[...]
> +struct page *try_alloc_pages_noprof(int nid, unsigned int order)
> +{
> +	gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO |
> +			  __GFP_NOMEMALLOC | __GFP_TRYLOCK;
> +	unsigned int alloc_flags = ALLOC_TRYLOCK;
[...]
> +	prepare_alloc_pages(alloc_gfp, order, nid, NULL, &ac,
> +			    &alloc_gfp, &alloc_flags);
[...]
> +	page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
> +
> +	/* Unlike regular alloc_pages() there is no __alloc_pages_slowpath(). */
> +
> +	trace_mm_page_alloc(page, order, alloc_gfp & ~__GFP_TRYLOCK, ac.migratetype);
> +	kmsan_alloc_page(page, order, alloc_gfp);
[...]

From those that care about __GFP_TRYLOCK only kmsan_alloc_page doesn't
have alloc_flags. Those could make the locking decision based on
ALLOC_TRYLOCK.

I am not familiar with kmsan internals and my main question is whether
this specific usecase really needs a dedicated reentrant
kmsan_alloc_page rather than rely on gfp flag to be sufficient.
Currently kmsan_in_runtime bails out early in some contexts. The
associated comment about hooks is not completely clear to me though.
Memory allocation down the road is one of those but it is not really
clear to me whether this is the only one.
Shakeel Butt Dec. 19, 2024, 12:05 a.m. UTC | #2
On Wed, Dec 18, 2024 at 12:32:20PM +0100, Michal Hocko wrote:
> I like this proposal better. I am still not convinced that we really
> need internal __GFP_TRYLOCK though. 
> 
> If we reduce try_alloc_pages to the gfp usage we are at the following
> 
> On Tue 17-12-24 19:07:14, alexei.starovoitov@gmail.com wrote:
> [...]
> > +struct page *try_alloc_pages_noprof(int nid, unsigned int order)
> > +{
> > +	gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO |
> > +			  __GFP_NOMEMALLOC | __GFP_TRYLOCK;
> > +	unsigned int alloc_flags = ALLOC_TRYLOCK;
> [...]
> > +	prepare_alloc_pages(alloc_gfp, order, nid, NULL, &ac,
> > +			    &alloc_gfp, &alloc_flags);
> [...]
> > +	page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
> > +
> > +	/* Unlike regular alloc_pages() there is no __alloc_pages_slowpath(). */
> > +
> > +	trace_mm_page_alloc(page, order, alloc_gfp & ~__GFP_TRYLOCK, ac.migratetype);
> > +	kmsan_alloc_page(page, order, alloc_gfp);
> [...]
> 
> From those that care about __GFP_TRYLOCK only kmsan_alloc_page doesn't
> have alloc_flags. Those could make the locking decision based on
> ALLOC_TRYLOCK.
> 
> I am not familiar with kmsan internals and my main question is whether
> this specific usecase really needs a dedicated reentrant
> kmsan_alloc_page rather than rely on gfp flag to be sufficient.
> Currently kmsan_in_runtime bails out early in some contexts. The
> associated comment about hooks is not completely clear to me though.
> Memory allocation down the road is one of those but it is not really
> clear to me whether this is the only one.

Is the suggestion to just introduce and use ALLOC_TRYLOCK without the
need for __GFP_TRYLOCK?

Regarding KMSAN, __GFP_ZERO would bypass it. Maybe add a comment to
explain that.
Shakeel Butt Dec. 19, 2024, 12:10 a.m. UTC | #3
On Tue, Dec 17, 2024 at 07:07:14PM -0800, alexei.starovoitov@gmail.com wrote:
> From: Alexei Starovoitov <ast@kernel.org>
> 
[...]
> +
> +struct page *try_alloc_pages_noprof(int nid, unsigned int order)
> +{
> +	gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO |
> +			  __GFP_NOMEMALLOC | __GFP_TRYLOCK;

I think the above needs a comment to be more clear. Basically why zero,
nomemalloc and no warn? Otherwise this looks good. I don't have a strong
opinion on __GFP_TRYLOCK and maybe just ALLOC_TRYLOCK would be good
enough.
Alexei Starovoitov Dec. 19, 2024, 1:18 a.m. UTC | #4
On Wed, Dec 18, 2024 at 3:32 AM Michal Hocko <mhocko@suse.com> wrote:
>
> I like this proposal better. I am still not convinced that we really
> need internal __GFP_TRYLOCK though.
>
> If we reduce try_alloc_pages to the gfp usage we are at the following
>
> On Tue 17-12-24 19:07:14, alexei.starovoitov@gmail.com wrote:
> [...]
> > +struct page *try_alloc_pages_noprof(int nid, unsigned int order)
> > +{
> > +     gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO |
> > +                       __GFP_NOMEMALLOC | __GFP_TRYLOCK;
> > +     unsigned int alloc_flags = ALLOC_TRYLOCK;
> [...]
> > +     prepare_alloc_pages(alloc_gfp, order, nid, NULL, &ac,
> > +                         &alloc_gfp, &alloc_flags);
> [...]
> > +     page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
> > +
> > +     /* Unlike regular alloc_pages() there is no __alloc_pages_slowpath(). */
> > +
> > +     trace_mm_page_alloc(page, order, alloc_gfp & ~__GFP_TRYLOCK, ac.migratetype);
> > +     kmsan_alloc_page(page, order, alloc_gfp);
> [...]
>
> From those that care about __GFP_TRYLOCK only kmsan_alloc_page doesn't
> have alloc_flags. Those could make the locking decision based on
> ALLOC_TRYLOCK.

__GFP_TRYLOCK here sets a baseline and is used in patch 4 by inner
bits of memcg's consume_stock() logic while called from
try_alloc_pages() in patch 5.
We cannot pass alloc_flags into it.
Just too much overhead.

__memcg_kmem_charge_page()
  -> obj_cgroup_charge_pages()
      -> try_charge_memcg()
         -> consume_stock()

all of them would need an extra 'u32 alloc_flags'.
That is too high a cost just to avoid ___GFP_TRYLOCK_BIT in gfp_types.h.

> I am not familiar with kmsan internals and my main question is whether
> this specific usecase really needs a dedicated reentrant
> kmsan_alloc_page rather than rely on gfp flag to be sufficient.
> Currently kmsan_in_runtime bails out early in some contexts. The
> associated comment about hooks is not completely clear to me though.
> Memory allocation down the road is one of those but it is not really
> clear to me whether this is the only one.

As I mentioned in giant v2 thread I'm not touching kasan/kmsan
in this patch set, since it needs its own eyes
from experts in those bits,
but when it happens gfp & __GFP_TRYLOCK would be the way
to adjust whatever is necessary in kasan/kmsan internals.

As Shakeel mentioned, currently kmsan_alloc_page() is gutted,
since I'm using __GFP_ZERO unconditionally here.
We don't even get to kmsan_in_runtime() check.
For bpf use cases __GFP_ZERO and __GFP_ACCOUNT are pretty much
mandatory. When there is a 2nd user of this try_alloc_pages() api
we can consider making flags for these two, and at that time a full
analysis of kmsan reentrance would be necessary.
It works in this patch because of __GFP_ZERO.

So __GFP_TRYLOCK is needed in many cases:
- to make decisions in consume_stock()
- in the future in kasan/kmsan
- and in slab kmalloc. There I'm going to introduce try_kmalloc()
(or kmalloc_nolock(), naming is hard) that will use this
internal __GFP_TRYLOCK flag to avoid locks and when it gets
to new_slab()->allocate_slab()->alloc_slab_page()
the latter will use try_alloc_pages() instead of alloc_pages().
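
A rough sketch of that last point, for illustration only (nothing below
exists in this series; the helper name and signature are simplified
guesses at the shape, not the real SLUB code):

/*
 * Sketch: a future kmalloc_nolock()-style path would set __GFP_TRYLOCK
 * in its gfp mask, and the slab page-allocation boundary could branch
 * on it.
 */
static struct page *alloc_slab_page_sketch(gfp_t flags, int node,
					   unsigned int order)
{
	if (flags & __GFP_TRYLOCK)
		/* opportunistic: may return NULL, never spins on zone->lock */
		return try_alloc_pages(node, order);

	return alloc_pages_node(node, flags, order);
}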
Alexei Starovoitov Dec. 19, 2024, 1:39 a.m. UTC | #5
On Wed, Dec 18, 2024 at 4:10 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>
> On Tue, Dec 17, 2024 at 07:07:14PM -0800, alexei.starovoitov@gmail.com wrote:
> > From: Alexei Starovoitov <ast@kernel.org>
> >
> [...]
> > +
> > +struct page *try_alloc_pages_noprof(int nid, unsigned int order)
> > +{
> > +     gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO |
> > +                       __GFP_NOMEMALLOC | __GFP_TRYLOCK;
>
> I think the above needs a comment to be more clear. Basically why zero,
> nomemalloc and no warn? Otherwise this looks good. I don't have a strong
> opinion on __GFP_TRYLOCK and maybe just ALLOC_TRYLOCK would be good
> enough.

__GFP_NOWARN is what bpf uses almost everywhere. There is never a reason
to warn, and a warning means printk(), which is unsafe from various
places. We cannot really use printk_deferred_enter() either, since the
running ctx is unknown.

__GFP_ZERO is to make sure that call to kmsan_alloc_page() is safe
and it's necessary for bpf anyway.

Initially __GFP_NOMEMALLOC was added to avoid ALLOC_HIGHATOMIC paths
when __alloc_pages_slowpath() was still there in try_alloc_pages().
Later the slowpath was removed, but I left __GFP_NOMEMALLOC
as a self-explanatory statement that try_alloc_pages()
doesn't want to deplete reserves.

I'll add a comment in the next version.
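
Something along these lines, as a sketch of what such a comment might
look like (the wording below is illustrative, not from a posted version):

	gfp_t alloc_gfp = __GFP_NOWARN |	/* a warning means printk(), unsafe from unknown ctx */
			  __GFP_ZERO |		/* bpf needs zeroed pages; keeps kmsan_alloc_page() safe */
			  __GFP_NOMEMALLOC |	/* do not deplete memory reserves */
			  __GFP_TRYLOCK;	/* internal: only spin_trylock in the allocation path */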
Michal Hocko Dec. 19, 2024, 7:13 a.m. UTC | #6
On Wed 18-12-24 17:18:51, Alexei Starovoitov wrote:
> On Wed, Dec 18, 2024 at 3:32 AM Michal Hocko <mhocko@suse.com> wrote:
> >
> > I like this proposal better. I am still not convinced that we really
> > need internal __GFP_TRYLOCK though.
> >
> > If we reduce try_alloc_pages to the gfp usage we are at the following
> >
> > On Tue 17-12-24 19:07:14, alexei.starovoitov@gmail.com wrote:
> > [...]
> > > +struct page *try_alloc_pages_noprof(int nid, unsigned int order)
> > > +{
> > > +     gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO |
> > > +                       __GFP_NOMEMALLOC | __GFP_TRYLOCK;
> > > +     unsigned int alloc_flags = ALLOC_TRYLOCK;
> > [...]
> > > +     prepare_alloc_pages(alloc_gfp, order, nid, NULL, &ac,
> > > +                         &alloc_gfp, &alloc_flags);
> > [...]
> > > +     page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
> > > +
> > > +     /* Unlike regular alloc_pages() there is no __alloc_pages_slowpath(). */
> > > +
> > > +     trace_mm_page_alloc(page, order, alloc_gfp & ~__GFP_TRYLOCK, ac.migratetype);
> > > +     kmsan_alloc_page(page, order, alloc_gfp);
> > [...]
> >
> > From those that care about __GFP_TRYLOCK only kmsan_alloc_page doesn't
> > have alloc_flags. Those could make the locking decision based on
> > ALLOC_TRYLOCK.
> 
> __GFP_TRYLOCK here sets a baseline and is used in patch 4 by inner
> bits of memcg's consume_stock() logic while called from
> try_alloc_pages() in patch 5.

Yes, I have addressed that part in a reply. In short I believe we can
achieve reentrancy for NOWAIT/ATOMIC charges without a dedicated gfp
flag.

[...]
> > I am not familiar with kmsan internals and my main question is whether
> > this specific usecase really needs a dedicated reentrant
> > kmsan_alloc_page rather than rely on gfp flag to be sufficient.
> > Currently kmsan_in_runtime bails out early in some contexts. The
> > associated comment about hooks is not completely clear to me though.
> > Memory allocation down the road is one of those but it is not really
> > clear to me whether this is the only one.
> 
> As I mentioned in giant v2 thread I'm not touching kasan/kmsan
> in this patch set, since it needs its own eyes
> from experts in those bits,
> but when it happens gfp & __GFP_TRYLOCK would be the way
> to adjust whatever is necessary in kasan/kmsan internals.
> 
> As Shakeel mentioned, currently kmsan_alloc_page() is gutted,
> since I'm using __GFP_ZERO unconditionally here.
> We don't even get to kmsan_in_runtime() check.

I have missed that part! That means that you can drop kmsan_alloc_page
altogether, no?

[...]

> - and in slab kmalloc. There I'm going to introduce try_kmalloc()
> (or kmalloc_nolock(), naming is hard) that will use this
> internal __GFP_TRYLOCK flag to avoid locks and when it gets
> to new_slab()->allocate_slab()->alloc_slab_page()
> the latter will use try_alloc_pages() instead of alloc_pages().

I cannot really comment on the slab side of things. All I am saying is
that we should _try_ to avoid __GFP_TRYLOCK if possible/feasible. It
seems that the page allocator can do without that. Maybe slab side can
as well.
Michal Hocko Dec. 19, 2024, 7:18 a.m. UTC | #7
On Wed 18-12-24 16:05:25, Shakeel Butt wrote:
> On Wed, Dec 18, 2024 at 12:32:20PM +0100, Michal Hocko wrote:
> > I like this proposal better. I am still not convinced that we really
> > need internal __GFP_TRYLOCK though. 
> > 
> > If we reduce try_alloc_pages to the gfp usage we are at the following
> > 
> > On Tue 17-12-24 19:07:14, alexei.starovoitov@gmail.com wrote:
> > [...]
> > > +struct page *try_alloc_pages_noprof(int nid, unsigned int order)
> > > +{
> > > +	gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO |
> > > +			  __GFP_NOMEMALLOC | __GFP_TRYLOCK;
> > > +	unsigned int alloc_flags = ALLOC_TRYLOCK;
> > [...]
> > > +	prepare_alloc_pages(alloc_gfp, order, nid, NULL, &ac,
> > > +			    &alloc_gfp, &alloc_flags);
> > [...]
> > > +	page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
> > > +
> > > +	/* Unlike regular alloc_pages() there is no __alloc_pages_slowpath(). */
> > > +
> > > +	trace_mm_page_alloc(page, order, alloc_gfp & ~__GFP_TRYLOCK, ac.migratetype);
> > > +	kmsan_alloc_page(page, order, alloc_gfp);
> > [...]
> > 
> > From those that care about __GFP_TRYLOCK only kmsan_alloc_page doesn't
> > have alloc_flags. Those could make the locking decision based on
> > ALLOC_TRYLOCK.
> > 
> > I am not familiar with kmsan internals and my main question is whether
> > this specific usecase really needs a dedicated reentrant
> > kmsan_alloc_page rather than rely on gfp flag to be sufficient.
> > Currently kmsan_in_runtime bails out early in some contexts. The
> > associated comment about hooks is not completely clear to me though.
> > Memory allocation down the road is one of those but it is not really
> > clear to me whether this is the only one.
> 
> Is the suggestion to just introduce and use ALLOC_TRYLOCK without the
> need for __GFP_TRYLOCK?

Exactly! Because ALLOC_$FOO is strictly internal allocator flag that
cannot leak out to external users by design. __GFP_TRYLOCK in this
implementation tries to achieve the same by hiding it which would work
but it is both ugly and likely unnecessary.
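
For comparison, a sketch of that ALLOC_TRYLOCK-only shape (abbreviated
from the patch below, with the early-exit checks and comments dropped;
this is not a posted version):

struct page *try_alloc_pages_noprof(int nid, unsigned int order)
{
	/* no ___GFP_TRYLOCK_BIT: trylock behaviour lives only in alloc_flags */
	gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC;
	unsigned int alloc_flags = ALLOC_TRYLOCK;
	struct alloc_context ac = { };
	struct page *page;

	prepare_alloc_pages(alloc_gfp, order, nid, NULL, &ac,
			    &alloc_gfp, &alloc_flags);
	page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
	trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
	kmsan_alloc_page(page, order, alloc_gfp);
	return page;
}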
Alexei Starovoitov Dec. 20, 2024, 12:41 a.m. UTC | #8
On Wed, Dec 18, 2024 at 11:14 PM Michal Hocko <mhocko@suse.com> wrote:
>
> > As I mentioned in giant v2 thread I'm not touching kasan/kmsan
> > in this patch set, since it needs its own eyes
> > from experts in those bits,
> > but when it happens gfp & __GFP_TRYLOCK would be the way
> > to adjust whatever is necessary in kasan/kmsan internals.
> >
> > As Shakeel mentioned, currently kmsan_alloc_page() is gutted,
> > since I'm using __GFP_ZERO unconditionally here.
> > We don't even get to kmsan_in_runtime() check.
>
> I have missed that part! That means that you can drop kmsan_alloc_page
> altogether no?

I think it's still necessary.
kmsan_alloc_page() still needs to do the zeroing.

Patch

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index b0fe9f62d15b..65b8df1db26a 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -347,6 +347,9 @@  static inline struct page *alloc_page_vma_noprof(gfp_t gfp,
 }
 #define alloc_page_vma(...)			alloc_hooks(alloc_page_vma_noprof(__VA_ARGS__))
 
+struct page *try_alloc_pages_noprof(int nid, unsigned int order);
+#define try_alloc_pages(...)			alloc_hooks(try_alloc_pages_noprof(__VA_ARGS__))
+
 extern unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order);
 #define __get_free_pages(...)			alloc_hooks(get_free_pages_noprof(__VA_ARGS__))
 
diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h
index 65db9349f905..65b148ec86eb 100644
--- a/include/linux/gfp_types.h
+++ b/include/linux/gfp_types.h
@@ -48,6 +48,7 @@  enum {
 	___GFP_THISNODE_BIT,
 	___GFP_ACCOUNT_BIT,
 	___GFP_ZEROTAGS_BIT,
+	___GFP_TRYLOCK_BIT,
 #ifdef CONFIG_KASAN_HW_TAGS
 	___GFP_SKIP_ZERO_BIT,
 	___GFP_SKIP_KASAN_BIT,
diff --git a/mm/internal.h b/mm/internal.h
index cb8d8e8e3ffa..122fce7e1a9e 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1175,6 +1175,8 @@  unsigned int reclaim_clean_pages_from_list(struct zone *zone,
 #endif
 #define ALLOC_HIGHATOMIC	0x200 /* Allows access to MIGRATE_HIGHATOMIC */
 #define ALLOC_KSWAPD		0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */
+#define __GFP_TRYLOCK		((__force gfp_t)BIT(___GFP_TRYLOCK_BIT))
+#define ALLOC_TRYLOCK		0x1000000 /* Only use spin_trylock in allocation path */
 
 /* Flags that allow allocations below the min watermark. */
 #define ALLOC_RESERVES (ALLOC_NON_BLOCK|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC|ALLOC_OOM)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1cb4b8c8886d..d23545057b6e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2304,7 +2304,11 @@  static int rmqueue_bulk(struct zone *zone, unsigned int order,
 	unsigned long flags;
 	int i;
 
-	spin_lock_irqsave(&zone->lock, flags);
+	if (!spin_trylock_irqsave(&zone->lock, flags)) {
+		if (unlikely(alloc_flags & ALLOC_TRYLOCK))
+			return 0;
+		spin_lock_irqsave(&zone->lock, flags);
+	}
 	for (i = 0; i < count; ++i) {
 		struct page *page = __rmqueue(zone, order, migratetype,
 								alloc_flags);
@@ -2904,7 +2908,11 @@  struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
 
 	do {
 		page = NULL;
-		spin_lock_irqsave(&zone->lock, flags);
+		if (!spin_trylock_irqsave(&zone->lock, flags)) {
+			if (unlikely(alloc_flags & ALLOC_TRYLOCK))
+				return NULL;
+			spin_lock_irqsave(&zone->lock, flags);
+		}
 		if (alloc_flags & ALLOC_HIGHATOMIC)
 			page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
 		if (!page) {
@@ -4001,6 +4009,7 @@  gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
 	 */
 	BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_MIN_RESERVE);
 	BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD);
+	BUILD_BUG_ON(__GFP_TRYLOCK != (__force gfp_t) ALLOC_TRYLOCK);
 
 	/*
 	 * The caller may dip into page reserves a bit more if the caller
@@ -4009,7 +4018,7 @@  gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
 	 * set both ALLOC_NON_BLOCK and ALLOC_MIN_RESERVE(__GFP_HIGH).
 	 */
 	alloc_flags |= (__force int)
-		(gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM));
+		(gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM | __GFP_TRYLOCK));
 
 	if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) {
 		/*
@@ -4509,7 +4518,8 @@  static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
 
 	might_alloc(gfp_mask);
 
-	if (should_fail_alloc_page(gfp_mask, order))
+	if (!(*alloc_flags & ALLOC_TRYLOCK) &&
+	    should_fail_alloc_page(gfp_mask, order))
 		return false;
 
 	*alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags);
@@ -7023,3 +7033,54 @@  static bool __free_unaccepted(struct page *page)
 }
 
 #endif /* CONFIG_UNACCEPTED_MEMORY */
+
+struct page *try_alloc_pages_noprof(int nid, unsigned int order)
+{
+	gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO |
+			  __GFP_NOMEMALLOC | __GFP_TRYLOCK;
+	unsigned int alloc_flags = ALLOC_TRYLOCK;
+	struct alloc_context ac = { };
+	struct page *page;
+
+	/*
+	 * In RT spin_trylock() may call raw_spin_lock() which is unsafe in NMI.
+	 * If spin_trylock() is called from hard IRQ the current task may be
+	 * waiting for one rt_spin_lock, but rt_spin_trylock() will mark the
+	 * task as the owner of another rt_spin_lock which will confuse PI
+	 * logic, so return immediately if called from hard IRQ or NMI.
+	 *
+	 * Note, irqs_disabled() case is ok. This function can be called
+	 * from raw_spin_lock_irqsave region.
+	 */
+	if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq()))
+		return NULL;
+	if (!pcp_allowed_order(order))
+		return NULL;
+
+#ifdef CONFIG_UNACCEPTED_MEMORY
+	if (has_unaccepted_memory() && !list_empty(&zone->unaccepted_pages))
+		return NULL;
+#endif
+
+	if (nid == NUMA_NO_NODE)
+		nid = numa_node_id();
+
+	prepare_alloc_pages(alloc_gfp, order, nid, NULL, &ac,
+			    &alloc_gfp, &alloc_flags);
+
+	/*
+	 * Best effort allocation from percpu free list.
+	 * If it's empty attempt to spin_trylock zone->lock.
+	 * Do not specify __GFP_KSWAPD_RECLAIM to avoid wakeup_kswapd
+	 * that may need to grab a lock.
+	 * Do not specify __GFP_ACCOUNT to avoid local_lock.
+	 * Do not warn either.
+	 */
+	page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
+
+	/* Unlike regular alloc_pages() there is no __alloc_pages_slowpath(). */
+
+	trace_mm_page_alloc(page, order, alloc_gfp & ~__GFP_TRYLOCK, ac.migratetype);
+	kmsan_alloc_page(page, order, alloc_gfp);
+	return page;
+}