
[bpf-next,v3,2/6] mm, bpf: Introduce free_pages_nolock()

Message ID 20241218030720.1602449-3-alexei.starovoitov@gmail.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Series bpf, mm: Introduce try_alloc_pages()

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 557 this patch: 557
netdev/build_tools success Errors and warnings before: 0 (+0) this patch: 0 (+0)
netdev/cc_maintainers warning 2 maintainers not CCed: clrkwllms@kernel.org linux-rt-devel@lists.linux.dev
netdev/build_clang fail Errors and warnings before: 22531 this patch: 22531
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 15790 this patch: 15790
netdev/checkpatch warning CHECK: extern prototypes should be avoided in .h files WARNING: line length of 84 exceeds 80 columns WARNING: line length of 87 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 1 this patch: 1
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-19 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-18 success Logs for s390x-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-11 success Logs for aarch64-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-17 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-17 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-45 success Logs for x86_64-llvm-18 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-46 success Logs for x86_64-llvm-18 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-gcc / veristat-kernel / x86_64-gcc veristat_kernel
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-gcc / veristat-meta / x86_64-gcc veristat_meta
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-43 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-44 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc

Commit Message

Alexei Starovoitov Dec. 18, 2024, 3:07 a.m. UTC
From: Alexei Starovoitov <ast@kernel.org>

Introduce free_pages_nolock() that can free pages without taking locks.
It relies on trylock only and can be called from any context.
Since spin_trylock() cannot be used in PREEMPT_RT from hard IRQ or NMI
context, it uses a lockless linked list to stash the pages, which will be
freed by a subsequent free_pages() call from a good (lock-taking) context.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/gfp.h      |  1 +
 include/linux/mm_types.h |  4 ++
 include/linux/mmzone.h   |  3 ++
 mm/page_alloc.c          | 79 ++++++++++++++++++++++++++++++++++++----
 4 files changed, 79 insertions(+), 8 deletions(-)
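
A minimal usage sketch of the new API (the caller below is hypothetical,
and the exact try_alloc_pages() signature from later in this series is
assumed):

	/*
	 * Hypothetical caller, e.g. a bpf allocator running in NMI or
	 * hard IRQ context. Neither function spins on a lock: a failed
	 * trylock on the free path only defers the page onto
	 * zone->trylock_free_pages to be drained by a later locked free.
	 */
	struct page *page = try_alloc_pages(NUMA_NO_NODE, 0);

	if (page) {
		/* ... use the page ... */
		free_pages_nolock(page, 0);
	}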

Comments

Yosry Ahmed Dec. 18, 2024, 4:58 a.m. UTC | #1
On Tue, Dec 17, 2024 at 7:07 PM <alexei.starovoitov@gmail.com> wrote:
>
> From: Alexei Starovoitov <ast@kernel.org>
>
> Introduce free_pages_nolock() that can free pages without taking locks.
> It relies on trylock and can be called from any context.
> Since spin_trylock() cannot be used in RT from hard IRQ or NMI
> it uses lockless link list to stash the pages which will be freed
> by subsequent free_pages() from good context.
>
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> ---
>  include/linux/gfp.h      |  1 +
>  include/linux/mm_types.h |  4 ++
>  include/linux/mmzone.h   |  3 ++
>  mm/page_alloc.c          | 79 ++++++++++++++++++++++++++++++++++++----
>  4 files changed, 79 insertions(+), 8 deletions(-)
>
> diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> index 65b8df1db26a..ff9060af6295 100644
> --- a/include/linux/gfp.h
> +++ b/include/linux/gfp.h
> @@ -372,6 +372,7 @@ __meminit void *alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mas
>         __get_free_pages((gfp_mask) | GFP_DMA, (order))
>
>  extern void __free_pages(struct page *page, unsigned int order);
> +extern void free_pages_nolock(struct page *page, unsigned int order);
>  extern void free_pages(unsigned long addr, unsigned int order);
>
>  #define __free_page(page) __free_pages((page), 0)
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 7361a8f3ab68..52547b3e5fd8 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -99,6 +99,10 @@ struct page {
>                                 /* Or, free page */
>                                 struct list_head buddy_list;
>                                 struct list_head pcp_list;
> +                               struct {
> +                                       struct llist_node pcp_llist;
> +                                       unsigned int order;
> +                               };
>                         };
>                         /* See page-flags.h for PAGE_MAPPING_FLAGS */
>                         struct address_space *mapping;
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index b36124145a16..1a854e0a9e3b 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -953,6 +953,9 @@ struct zone {
>         /* Primarily protects free_area */
>         spinlock_t              lock;
>
> +       /* Pages to be freed when next trylock succeeds */
> +       struct llist_head       trylock_free_pages;
> +
>         /* Write-intensive fields used by compaction and vmstats. */
>         CACHELINE_PADDING(_pad2_);
>
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index d23545057b6e..10918bfc6734 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -88,6 +88,9 @@ typedef int __bitwise fpi_t;
>   */
>  #define FPI_TO_TAIL            ((__force fpi_t)BIT(1))
>
> +/* Free the page without taking locks. Rely on trylock only. */
> +#define FPI_TRYLOCK            ((__force fpi_t)BIT(2))
> +

The comment above the definition of fpi_t mentions that it's for
non-pcp variants of free_pages(), so I guess that needs to be updated
in this patch.

More importantly, I think the comment states this mainly because the
existing flags won't be properly handled when freeing pages to the
pcplist. The flags will be lost once the pages are added to the
pcplist, and won't be propagated when the pages are eventually freed
to the buddy allocator (e.g. through free_pcppages_bulk()).

So I think we need to at least explicitly check which flags are
allowed when freeing pages to the pcplists or something similar.

>  /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
>  static DEFINE_MUTEX(pcp_batch_high_lock);
>  #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
> @@ -1247,13 +1250,44 @@ static void split_large_buddy(struct zone *zone, struct page *page,
>         }
>  }
>
> +static void add_page_to_zone_llist(struct zone *zone, struct page *page,
> +                                  unsigned int order)
> +{
> +       /* Remember the order */
> +       page->order = order;
> +       /* Add the page to the free list */
> +       llist_add(&page->pcp_llist, &zone->trylock_free_pages);
> +}
> +
>  static void free_one_page(struct zone *zone, struct page *page,
>                           unsigned long pfn, unsigned int order,
>                           fpi_t fpi_flags)
>  {
> +       struct llist_head *llhead;
>         unsigned long flags;
>
> -       spin_lock_irqsave(&zone->lock, flags);
> +       if (!spin_trylock_irqsave(&zone->lock, flags)) {
> +               if (unlikely(fpi_flags & FPI_TRYLOCK)) {
> +                       add_page_to_zone_llist(zone, page, order);
> +                       return;
> +               }
> +               spin_lock_irqsave(&zone->lock, flags);
> +       }
> +
> +       /* The lock succeeded. Process deferred pages. */
> +       llhead = &zone->trylock_free_pages;
> +       if (unlikely(!llist_empty(llhead) && !(fpi_flags & FPI_TRYLOCK))) {
> +               struct llist_node *llnode;
> +               struct page *p, *tmp;
> +
> +               llnode = llist_del_all(llhead);
> +               llist_for_each_entry_safe(p, tmp, llnode, pcp_llist) {
> +                       unsigned int p_order = p->order;
> +
> +                       split_large_buddy(zone, p, page_to_pfn(p), p_order, fpi_flags);
> +                       __count_vm_events(PGFREE, 1 << p_order);
> +               }
> +       }
>         split_large_buddy(zone, page, pfn, order, fpi_flags);
>         spin_unlock_irqrestore(&zone->lock, flags);
>
> @@ -2596,7 +2630,7 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
>
>  static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
>                                    struct page *page, int migratetype,
> -                                  unsigned int order)
> +                                  unsigned int order, fpi_t fpi_flags)
>  {
>         int high, batch;
>         int pindex;
> @@ -2631,6 +2665,14 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
>         }
>         if (pcp->free_count < (batch << CONFIG_PCP_BATCH_SCALE_MAX))
>                 pcp->free_count += (1 << order);
> +
> +       if (unlikely(fpi_flags & FPI_TRYLOCK)) {
> +               /*
> +                * Do not attempt to take a zone lock. Let pcp->count get
> +                * over high mark temporarily.
> +                */
> +               return;
> +       }
>         high = nr_pcp_high(pcp, zone, batch, free_high);
>         if (pcp->count >= high) {
>                 free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high),
> @@ -2645,7 +2687,8 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
>  /*
>   * Free a pcp page
>   */
> -void free_unref_page(struct page *page, unsigned int order)
> +static void __free_unref_page(struct page *page, unsigned int order,
> +                             fpi_t fpi_flags)
>  {
>         unsigned long __maybe_unused UP_flags;
>         struct per_cpu_pages *pcp;
> @@ -2654,7 +2697,7 @@ void free_unref_page(struct page *page, unsigned int order)
>         int migratetype;
>
>         if (!pcp_allowed_order(order)) {
> -               __free_pages_ok(page, order, FPI_NONE);
> +               __free_pages_ok(page, order, fpi_flags);
>                 return;
>         }
>
> @@ -2671,24 +2714,33 @@ void free_unref_page(struct page *page, unsigned int order)
>         migratetype = get_pfnblock_migratetype(page, pfn);
>         if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
>                 if (unlikely(is_migrate_isolate(migratetype))) {
> -                       free_one_page(page_zone(page), page, pfn, order, FPI_NONE);
> +                       free_one_page(page_zone(page), page, pfn, order, fpi_flags);
>                         return;
>                 }
>                 migratetype = MIGRATE_MOVABLE;
>         }
>
>         zone = page_zone(page);
> +       if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq())) {
> +               add_page_to_zone_llist(zone, page, order);
> +               return;
> +       }
>         pcp_trylock_prepare(UP_flags);
>         pcp = pcp_spin_trylock(zone->per_cpu_pageset);
>         if (pcp) {
> -               free_unref_page_commit(zone, pcp, page, migratetype, order);
> +               free_unref_page_commit(zone, pcp, page, migratetype, order, fpi_flags);
>                 pcp_spin_unlock(pcp);
>         } else {
> -               free_one_page(zone, page, pfn, order, FPI_NONE);
> +               free_one_page(zone, page, pfn, order, fpi_flags);
>         }
>         pcp_trylock_finish(UP_flags);
>  }
>
> +void free_unref_page(struct page *page, unsigned int order)
> +{
> +       __free_unref_page(page, order, FPI_NONE);
> +}
> +
>  /*
>   * Free a batch of folios
>   */
> @@ -2777,7 +2829,7 @@ void free_unref_folios(struct folio_batch *folios)
>
>                 trace_mm_page_free_batched(&folio->page);
>                 free_unref_page_commit(zone, pcp, &folio->page, migratetype,
> -                               order);
> +                                      order, FPI_NONE);
>         }
>
>         if (pcp) {
> @@ -4854,6 +4906,17 @@ void __free_pages(struct page *page, unsigned int order)
>  }
>  EXPORT_SYMBOL(__free_pages);
>
> +/*
> + * Can be called while holding raw_spin_lock or from IRQ and NMI,
> + * but only for pages that came from try_alloc_pages():
> + * order <= 3, !folio, etc
> + */
> +void free_pages_nolock(struct page *page, unsigned int order)
> +{
> +       if (put_page_testzero(page))
> +               __free_unref_page(page, order, FPI_TRYLOCK);
> +}
> +
>  void free_pages(unsigned long addr, unsigned int order)
>  {
>         if (addr != 0) {
> --
> 2.43.5
>
>
Alexei Starovoitov Dec. 18, 2024, 5:33 a.m. UTC | #2
On Tue, Dec 17, 2024 at 8:59 PM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> On Tue, Dec 17, 2024 at 7:07 PM <alexei.starovoitov@gmail.com> wrote:
> >
> > From: Alexei Starovoitov <ast@kernel.org>
> >
> > Introduce free_pages_nolock() that can free pages without taking locks.
> > It relies on trylock and can be called from any context.
> > Since spin_trylock() cannot be used in RT from hard IRQ or NMI
> > it uses lockless link list to stash the pages which will be freed
> > by subsequent free_pages() from good context.
> >
> > Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> > ---
> >  include/linux/gfp.h      |  1 +
> >  include/linux/mm_types.h |  4 ++
> >  include/linux/mmzone.h   |  3 ++
> >  mm/page_alloc.c          | 79 ++++++++++++++++++++++++++++++++++++----
> >  4 files changed, 79 insertions(+), 8 deletions(-)
> >
> > diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> > index 65b8df1db26a..ff9060af6295 100644
> > --- a/include/linux/gfp.h
> > +++ b/include/linux/gfp.h
> > @@ -372,6 +372,7 @@ __meminit void *alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mas
> >         __get_free_pages((gfp_mask) | GFP_DMA, (order))
> >
> >  extern void __free_pages(struct page *page, unsigned int order);
> > +extern void free_pages_nolock(struct page *page, unsigned int order);
> >  extern void free_pages(unsigned long addr, unsigned int order);
> >
> >  #define __free_page(page) __free_pages((page), 0)
> > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> > index 7361a8f3ab68..52547b3e5fd8 100644
> > --- a/include/linux/mm_types.h
> > +++ b/include/linux/mm_types.h
> > @@ -99,6 +99,10 @@ struct page {
> >                                 /* Or, free page */
> >                                 struct list_head buddy_list;
> >                                 struct list_head pcp_list;
> > +                               struct {
> > +                                       struct llist_node pcp_llist;
> > +                                       unsigned int order;
> > +                               };
> >                         };
> >                         /* See page-flags.h for PAGE_MAPPING_FLAGS */
> >                         struct address_space *mapping;
> > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> > index b36124145a16..1a854e0a9e3b 100644
> > --- a/include/linux/mmzone.h
> > +++ b/include/linux/mmzone.h
> > @@ -953,6 +953,9 @@ struct zone {
> >         /* Primarily protects free_area */
> >         spinlock_t              lock;
> >
> > +       /* Pages to be freed when next trylock succeeds */
> > +       struct llist_head       trylock_free_pages;
> > +
> >         /* Write-intensive fields used by compaction and vmstats. */
> >         CACHELINE_PADDING(_pad2_);
> >
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index d23545057b6e..10918bfc6734 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -88,6 +88,9 @@ typedef int __bitwise fpi_t;
> >   */
> >  #define FPI_TO_TAIL            ((__force fpi_t)BIT(1))
> >
> > +/* Free the page without taking locks. Rely on trylock only. */
> > +#define FPI_TRYLOCK            ((__force fpi_t)BIT(2))
> > +
>
> The comment above the definition of fpi_t mentions that it's for
> non-pcp variants of free_pages(), so I guess that needs to be updated
> in this patch.

No. The comment:
/* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
typedef int __bitwise fpi_t;

is still valid.
Most of what the FPI_TRYLOCK flag controls takes effect after the pcp stage is over.

> More importantly, I think the comment states this mainly because the
> existing flags won't be properly handled when freeing pages to the
> pcplist. The flags will be lost once the pages are added to the
> pcplist, and won't be propagated when the pages are eventually freed
> to the buddy allocator (e.g. through free_pcppages_bulk()).

Correct. fpi_t flags have a local effect. Nothing new here.
Yosry Ahmed Dec. 18, 2024, 5:57 a.m. UTC | #3
On Tue, Dec 17, 2024 at 9:33 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Dec 17, 2024 at 8:59 PM Yosry Ahmed <yosryahmed@google.com> wrote:
> >
> > On Tue, Dec 17, 2024 at 7:07 PM <alexei.starovoitov@gmail.com> wrote:
> > >
> > > From: Alexei Starovoitov <ast@kernel.org>
> > >
> > > Introduce free_pages_nolock() that can free pages without taking locks.
> > > It relies on trylock and can be called from any context.
> > > Since spin_trylock() cannot be used in RT from hard IRQ or NMI
> > > it uses lockless link list to stash the pages which will be freed
> > > by subsequent free_pages() from good context.
> > >
> > > Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> > > ---
> > >  include/linux/gfp.h      |  1 +
> > >  include/linux/mm_types.h |  4 ++
> > >  include/linux/mmzone.h   |  3 ++
> > >  mm/page_alloc.c          | 79 ++++++++++++++++++++++++++++++++++++----
> > >  4 files changed, 79 insertions(+), 8 deletions(-)
> > >
> > > diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> > > index 65b8df1db26a..ff9060af6295 100644
> > > --- a/include/linux/gfp.h
> > > +++ b/include/linux/gfp.h
> > > @@ -372,6 +372,7 @@ __meminit void *alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mas
> > >         __get_free_pages((gfp_mask) | GFP_DMA, (order))
> > >
> > >  extern void __free_pages(struct page *page, unsigned int order);
> > > +extern void free_pages_nolock(struct page *page, unsigned int order);
> > >  extern void free_pages(unsigned long addr, unsigned int order);
> > >
> > >  #define __free_page(page) __free_pages((page), 0)
> > > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> > > index 7361a8f3ab68..52547b3e5fd8 100644
> > > --- a/include/linux/mm_types.h
> > > +++ b/include/linux/mm_types.h
> > > @@ -99,6 +99,10 @@ struct page {
> > >                                 /* Or, free page */
> > >                                 struct list_head buddy_list;
> > >                                 struct list_head pcp_list;
> > > +                               struct {
> > > +                                       struct llist_node pcp_llist;
> > > +                                       unsigned int order;
> > > +                               };
> > >                         };
> > >                         /* See page-flags.h for PAGE_MAPPING_FLAGS */
> > >                         struct address_space *mapping;
> > > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> > > index b36124145a16..1a854e0a9e3b 100644
> > > --- a/include/linux/mmzone.h
> > > +++ b/include/linux/mmzone.h
> > > @@ -953,6 +953,9 @@ struct zone {
> > >         /* Primarily protects free_area */
> > >         spinlock_t              lock;
> > >
> > > +       /* Pages to be freed when next trylock succeeds */
> > > +       struct llist_head       trylock_free_pages;
> > > +
> > >         /* Write-intensive fields used by compaction and vmstats. */
> > >         CACHELINE_PADDING(_pad2_);
> > >
> > > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > > index d23545057b6e..10918bfc6734 100644
> > > --- a/mm/page_alloc.c
> > > +++ b/mm/page_alloc.c
> > > @@ -88,6 +88,9 @@ typedef int __bitwise fpi_t;
> > >   */
> > >  #define FPI_TO_TAIL            ((__force fpi_t)BIT(1))
> > >
> > > +/* Free the page without taking locks. Rely on trylock only. */
> > > +#define FPI_TRYLOCK            ((__force fpi_t)BIT(2))
> > > +
> >
> > The comment above the definition of fpi_t mentions that it's for
> > non-pcp variants of free_pages(), so I guess that needs to be updated
> > in this patch.
>
> No. The comment:
> /* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
> typedef int __bitwise fpi_t;
>
> is still valid.
> Most of the objective of the FPI_TRYLOCK flag is used after pcp is over.

Right, but the comment says the flags are for non-pcp variants yet we
are passing them now to pcp variants. Not very clear.

>
> > More importantly, I think the comment states this mainly because the
> > existing flags won't be properly handled when freeing pages to the
> > pcplist. The flags will be lost once the pages are added to the
> > pcplist, and won't be propagated when the pages are eventually freed
> > to the buddy allocator (e.g. through free_pcppages_bulk()).
>
> Correct. fpi_t flags have a local effect. Nothing new here.

What I mean is, functions like __free_unref_page() and
free_unref_page_commit() now accept fpi_flags, but any flags other
than FPI_TRYLOCK are essentially ignored, also not very clear.

Anyway, these are just my 2c, I am just passing by and I thought it's
a bit confusing :)
Alexei Starovoitov Dec. 18, 2024, 6:37 a.m. UTC | #4
On Tue, Dec 17, 2024 at 9:58 PM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> What I mean is, functions like __free_unref_page() and
> free_unref_page_commit() now accept fpi_flags, but any flags other
> than FPI_TRYLOCK are essentially ignored, also not very clear.

They're not ignored. They are just not useful in this context.
The code rules over comment. If you have a concrete suggestion on
how to improve the comment please say so.
Yosry Ahmed Dec. 18, 2024, 6:49 a.m. UTC | #5
On Tue, Dec 17, 2024 at 10:37 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Dec 17, 2024 at 9:58 PM Yosry Ahmed <yosryahmed@google.com> wrote:
> >
> > What I mean is, functions like __free_unref_page() and
> > free_unref_page_commit() now accept fpi_flags, but any flags other
> > than FPI_TRYLOCK are essentially ignored, also not very clear.
>
> They're not ignored. They are just not useful in this context.

I think they are. For example, if you pass FPI_SKIP_REPORT_NOTIFY to
__free_unref_page(), page_reporting_notify_free() will still be called
when the page is eventually freed to the buddy allocator. Same goes
for FPI_NO_TAIL.

> The code rules over comment. If you have a concrete suggestion on
> how to improve the comment please say so.

What I had in mind is adding a WARN in the pcp freeing functions if
any FPI flag but FPI_TRYLOCK is passed, and/or explicitly calling out
that other flags should not be passed as they have no effect in this
context (whether at the function definition, above the WARN, or at the
flag definitions).
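
A minimal sketch of such a check, assuming it sits at the top of
free_unref_page_commit() (not part of this patch):

	/* Only FPI_TRYLOCK is handled on the pcp path; warn on anything else. */
	VM_WARN_ON_ONCE(fpi_flags & ~FPI_TRYLOCK);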
Alexei Starovoitov Dec. 18, 2024, 7:25 a.m. UTC | #6
On Tue, Dec 17, 2024 at 10:49 PM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> On Tue, Dec 17, 2024 at 10:37 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Tue, Dec 17, 2024 at 9:58 PM Yosry Ahmed <yosryahmed@google.com> wrote:
> > >
> > > What I mean is, functions like __free_unref_page() and
> > > free_unref_page_commit() now accept fpi_flags, but any flags other
> > > than FPI_TRYLOCK are essentially ignored, also not very clear.
> >
> > They're not ignored. They are just not useful in this context.
>
> I think they are. For example, if you pass FPI_SKIP_REPORT_NOTIFY to
> __free_unref_page(), page_reporting_notify_free() will still be called
> when the page is eventually freed to the buddy allocator. Same goes
> for FPI_NO_TAIL.

free_pcppages_bulk()->page_reporting_notify_free() will _not_ be called
when FPI_TRYLOCK is specified.
They are internal flags. The callers cannot make try_alloc_pages()
pass these extra flags.
The flags are more or less exclusive.

> > The code rules over comment. If you have a concrete suggestion on
> > how to improve the comment please say so.
>
> What I had in mind is adding a WARN in the pcp freeing functions if
> any FPI flag but FPI_TRYLOCK is passed, and/or explicitly calling out
> that other flags should not be passed as they have no effect in this
> context (whether at the function definition, above the WARN, or at the
> flag definitions).

pcp freeing functions?
In particular?
tbh this sounds like defensive programming...
BUILD_BUG_ON is a good thing when api is misused,
but WARN will be wasting run-time cycles on something that should
have been caught during code review.
free_unref_page_commit() is a shallow function when FPI_TRYLOCK is used.
There is no need to propagate fpi_flags further into free_pcppages_bulk
just to issue a WARN.
Yosry Ahmed Dec. 18, 2024, 7:40 a.m. UTC | #7
On Tue, Dec 17, 2024 at 11:25 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Dec 17, 2024 at 10:49 PM Yosry Ahmed <yosryahmed@google.com> wrote:
> >
> > On Tue, Dec 17, 2024 at 10:37 PM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Tue, Dec 17, 2024 at 9:58 PM Yosry Ahmed <yosryahmed@google.com> wrote:
> > > >
> > > > What I mean is, functions like __free_unref_page() and
> > > > free_unref_page_commit() now accept fpi_flags, but any flags other
> > > > than FPI_TRYLOCK are essentially ignored, also not very clear.
> > >
> > > They're not ignored. They are just not useful in this context.
> >
> > I think they are. For example, if you pass FPI_SKIP_REPORT_NOTIFY to
> > __free_unref_page(), page_reporting_notify_free() will still be called
> > when the page is eventually freed to the buddy allocator. Same goes
> > for FPI_NO_TAIL.
>
> free_pcppages_bulk()->page_reporting_notify_free() will _not_ be called
> when FPI_TRYLOCK is specified.
> They are internal flags. The callers cannot make try_alloc_pages()
> pass these extra flags.
> The flags are more or less exclusive.
>
> > > The code rules over comment. If you have a concrete suggestion on
> > > how to improve the comment please say so.
> >
> > What I had in mind is adding a WARN in the pcp freeing functions if
> > any FPI flag but FPI_TRYLOCK is passed, and/or explicitly calling out
> > that other flags should not be passed as they have no effect in this
> > context (whether at the function definition, above the WARN, or at the
> > flag definitions).
>
> pcp freeing functions?
> In particular?
> tbh this sounds like defensive programming...
> BUILD_BUG_ON is a good thing when api is misused,
> but WARN will be wasting run-time cycles on something that should
> have been caught during code review.
> free_unref_page_commit() is a shallow function when FPI_TRYLOCK is used.
> There is no need to propagate fpi_flags further into free_pcppages_bulk
> just to issue a WARN.

I meant WARN in free_unref_page_commit() if any other FPI flags are
used. Anyway, I don't feel strongly about this as I mentioned earlier.
Michal Hocko Dec. 18, 2024, 11:32 a.m. UTC | #8
On Tue 17-12-24 19:07:15, alexei.starovoitov@gmail.com wrote:
> From: Alexei Starovoitov <ast@kernel.org>
> 
> Introduce free_pages_nolock() that can free pages without taking locks.
> It relies on trylock and can be called from any context.
> Since spin_trylock() cannot be used in RT from hard IRQ or NMI
> it uses lockless link list to stash the pages which will be freed
> by subsequent free_pages() from good context.

Yes, this makes sense. Have you tried a simpler implementation that
would just queue on the lockless linked list unconditionally? That would
certainly reduce the complexity. Essentially something similar to what we
do in vfree_atomic (well, except the queue_work, which is likely too
heavy for this usecase and potentially not reentrant).
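
For comparison, the unconditional variant suggested here would be
roughly the following (a sketch reusing the fields this patch adds;
it is not what the patch implements, since it defers every free):

	static void free_pages_nolock_simple(struct page *page, unsigned int order)
	{
		struct zone *zone = page_zone(page);

		/* Always defer: stash the page and let a later locked free drain it. */
		page->order = order;
		llist_add(&page->pcp_llist, &zone->trylock_free_pages);
	}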
Alexei Starovoitov Dec. 19, 2024, 1:45 a.m. UTC | #9
On Wed, Dec 18, 2024 at 3:32 AM Michal Hocko <mhocko@suse.com> wrote:
>
> On Tue 17-12-24 19:07:15, alexei.starovoitov@gmail.com wrote:
> > From: Alexei Starovoitov <ast@kernel.org>
> >
> > Introduce free_pages_nolock() that can free pages without taking locks.
> > It relies on trylock and can be called from any context.
> > Since spin_trylock() cannot be used in RT from hard IRQ or NMI
> > it uses lockless link list to stash the pages which will be freed
> > by subsequent free_pages() from good context.
>
> Yes, this makes sense. Have you tried a simpler implementation that
> would just queue on the lockless link list unconditionally? That would
> certainly reduce the complexity. Essentially something similar that we
> do in vfree_atomic (well, except the queue_work which is likely too
> heavy for the usecase and potentialy not reentrant).

We cannot use the llist approach unconditionally.
One of the ways bpf maps are used is non-stop alloc/free.
We cannot delay the free part. When memory is freed it's better for it
to be available for kernel and bpf uses right away.
llist is the last resort.
Michal Hocko Dec. 19, 2024, 7:03 a.m. UTC | #10
On Wed 18-12-24 17:45:20, Alexei Starovoitov wrote:
> On Wed, Dec 18, 2024 at 3:32 AM Michal Hocko <mhocko@suse.com> wrote:
> >
> > On Tue 17-12-24 19:07:15, alexei.starovoitov@gmail.com wrote:
> > > From: Alexei Starovoitov <ast@kernel.org>
> > >
> > > Introduce free_pages_nolock() that can free pages without taking locks.
> > > It relies on trylock and can be called from any context.
> > > Since spin_trylock() cannot be used in RT from hard IRQ or NMI
> > > it uses lockless link list to stash the pages which will be freed
> > > by subsequent free_pages() from good context.
> >
> > Yes, this makes sense. Have you tried a simpler implementation that
> > would just queue on the lockless link list unconditionally? That would
> > certainly reduce the complexity. Essentially something similar that we
> > do in vfree_atomic (well, except the queue_work which is likely too
> > heavy for the usecase and potentialy not reentrant).
> 
> We cannot use llist approach unconditionally.
> One of the ways bpf maps are used is non-stop alloc/free.
> We cannot delay the free part. When memory is free it's better to
> be available for kernel and bpf uses right away.
> llist is the last resort.

This is an important detail that should be mentioned in the changelog.
Alexei Starovoitov Dec. 20, 2024, 12:42 a.m. UTC | #11
On Wed, Dec 18, 2024 at 11:03 PM Michal Hocko <mhocko@suse.com> wrote:
>
> On Wed 18-12-24 17:45:20, Alexei Starovoitov wrote:
> > On Wed, Dec 18, 2024 at 3:32 AM Michal Hocko <mhocko@suse.com> wrote:
> > >
> > > On Tue 17-12-24 19:07:15, alexei.starovoitov@gmail.com wrote:
> > > > From: Alexei Starovoitov <ast@kernel.org>
> > > >
> > > > Introduce free_pages_nolock() that can free pages without taking locks.
> > > > It relies on trylock and can be called from any context.
> > > > Since spin_trylock() cannot be used in RT from hard IRQ or NMI
> > > > it uses lockless link list to stash the pages which will be freed
> > > > by subsequent free_pages() from good context.
> > >
> > > Yes, this makes sense. Have you tried a simpler implementation that
> > > would just queue on the lockless link list unconditionally? That would
> > > certainly reduce the complexity. Essentially something similar that we
> > > do in vfree_atomic (well, except the queue_work which is likely too
> > > heavy for the usecase and potentialy not reentrant).
> >
> > We cannot use llist approach unconditionally.
> > One of the ways bpf maps are used is non-stop alloc/free.
> > We cannot delay the free part. When memory is free it's better to
> > be available for kernel and bpf uses right away.
> > llist is the last resort.
>
> This is an important detail that should be mentioned in the changelog.

yeah. The commit log is a bit too short.
Will expand in the next version.

Patch

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 65b8df1db26a..ff9060af6295 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -372,6 +372,7 @@  __meminit void *alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mas
 	__get_free_pages((gfp_mask) | GFP_DMA, (order))
 
 extern void __free_pages(struct page *page, unsigned int order);
+extern void free_pages_nolock(struct page *page, unsigned int order);
 extern void free_pages(unsigned long addr, unsigned int order);
 
 #define __free_page(page) __free_pages((page), 0)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7361a8f3ab68..52547b3e5fd8 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -99,6 +99,10 @@  struct page {
 				/* Or, free page */
 				struct list_head buddy_list;
 				struct list_head pcp_list;
+				struct {
+					struct llist_node pcp_llist;
+					unsigned int order;
+				};
 			};
 			/* See page-flags.h for PAGE_MAPPING_FLAGS */
 			struct address_space *mapping;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index b36124145a16..1a854e0a9e3b 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -953,6 +953,9 @@  struct zone {
 	/* Primarily protects free_area */
 	spinlock_t		lock;
 
+	/* Pages to be freed when next trylock succeeds */
+	struct llist_head	trylock_free_pages;
+
 	/* Write-intensive fields used by compaction and vmstats. */
 	CACHELINE_PADDING(_pad2_);
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d23545057b6e..10918bfc6734 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -88,6 +88,9 @@  typedef int __bitwise fpi_t;
  */
 #define FPI_TO_TAIL		((__force fpi_t)BIT(1))
 
+/* Free the page without taking locks. Rely on trylock only. */
+#define FPI_TRYLOCK		((__force fpi_t)BIT(2))
+
 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
 static DEFINE_MUTEX(pcp_batch_high_lock);
 #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
@@ -1247,13 +1250,44 @@  static void split_large_buddy(struct zone *zone, struct page *page,
 	}
 }
 
+static void add_page_to_zone_llist(struct zone *zone, struct page *page,
+				   unsigned int order)
+{
+	/* Remember the order */
+	page->order = order;
+	/* Add the page to the free list */
+	llist_add(&page->pcp_llist, &zone->trylock_free_pages);
+}
+
 static void free_one_page(struct zone *zone, struct page *page,
 			  unsigned long pfn, unsigned int order,
 			  fpi_t fpi_flags)
 {
+	struct llist_head *llhead;
 	unsigned long flags;
 
-	spin_lock_irqsave(&zone->lock, flags);
+	if (!spin_trylock_irqsave(&zone->lock, flags)) {
+		if (unlikely(fpi_flags & FPI_TRYLOCK)) {
+			add_page_to_zone_llist(zone, page, order);
+			return;
+		}
+		spin_lock_irqsave(&zone->lock, flags);
+	}
+
+	/* The lock succeeded. Process deferred pages. */
+	llhead = &zone->trylock_free_pages;
+	if (unlikely(!llist_empty(llhead) && !(fpi_flags & FPI_TRYLOCK))) {
+		struct llist_node *llnode;
+		struct page *p, *tmp;
+
+		llnode = llist_del_all(llhead);
+		llist_for_each_entry_safe(p, tmp, llnode, pcp_llist) {
+			unsigned int p_order = p->order;
+
+			split_large_buddy(zone, p, page_to_pfn(p), p_order, fpi_flags);
+			__count_vm_events(PGFREE, 1 << p_order);
+		}
+	}
 	split_large_buddy(zone, page, pfn, order, fpi_flags);
 	spin_unlock_irqrestore(&zone->lock, flags);
 
@@ -2596,7 +2630,7 @@  static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
 
 static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
 				   struct page *page, int migratetype,
-				   unsigned int order)
+				   unsigned int order, fpi_t fpi_flags)
 {
 	int high, batch;
 	int pindex;
@@ -2631,6 +2665,14 @@  static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
 	}
 	if (pcp->free_count < (batch << CONFIG_PCP_BATCH_SCALE_MAX))
 		pcp->free_count += (1 << order);
+
+	if (unlikely(fpi_flags & FPI_TRYLOCK)) {
+		/*
+		 * Do not attempt to take a zone lock. Let pcp->count get
+		 * over high mark temporarily.
+		 */
+		return;
+	}
 	high = nr_pcp_high(pcp, zone, batch, free_high);
 	if (pcp->count >= high) {
 		free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high),
@@ -2645,7 +2687,8 @@  static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
 /*
  * Free a pcp page
  */
-void free_unref_page(struct page *page, unsigned int order)
+static void __free_unref_page(struct page *page, unsigned int order,
+			      fpi_t fpi_flags)
 {
 	unsigned long __maybe_unused UP_flags;
 	struct per_cpu_pages *pcp;
@@ -2654,7 +2697,7 @@  void free_unref_page(struct page *page, unsigned int order)
 	int migratetype;
 
 	if (!pcp_allowed_order(order)) {
-		__free_pages_ok(page, order, FPI_NONE);
+		__free_pages_ok(page, order, fpi_flags);
 		return;
 	}
 
@@ -2671,24 +2714,33 @@  void free_unref_page(struct page *page, unsigned int order)
 	migratetype = get_pfnblock_migratetype(page, pfn);
 	if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
 		if (unlikely(is_migrate_isolate(migratetype))) {
-			free_one_page(page_zone(page), page, pfn, order, FPI_NONE);
+			free_one_page(page_zone(page), page, pfn, order, fpi_flags);
 			return;
 		}
 		migratetype = MIGRATE_MOVABLE;
 	}
 
 	zone = page_zone(page);
+	if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq())) {
+		add_page_to_zone_llist(zone, page, order);
+		return;
+	}
 	pcp_trylock_prepare(UP_flags);
 	pcp = pcp_spin_trylock(zone->per_cpu_pageset);
 	if (pcp) {
-		free_unref_page_commit(zone, pcp, page, migratetype, order);
+		free_unref_page_commit(zone, pcp, page, migratetype, order, fpi_flags);
 		pcp_spin_unlock(pcp);
 	} else {
-		free_one_page(zone, page, pfn, order, FPI_NONE);
+		free_one_page(zone, page, pfn, order, fpi_flags);
 	}
 	pcp_trylock_finish(UP_flags);
 }
 
+void free_unref_page(struct page *page, unsigned int order)
+{
+	__free_unref_page(page, order, FPI_NONE);
+}
+
 /*
  * Free a batch of folios
  */
@@ -2777,7 +2829,7 @@  void free_unref_folios(struct folio_batch *folios)
 
 		trace_mm_page_free_batched(&folio->page);
 		free_unref_page_commit(zone, pcp, &folio->page, migratetype,
-				order);
+				       order, FPI_NONE);
 	}
 
 	if (pcp) {
@@ -4854,6 +4906,17 @@  void __free_pages(struct page *page, unsigned int order)
 }
 EXPORT_SYMBOL(__free_pages);
 
+/*
+ * Can be called while holding raw_spin_lock or from IRQ and NMI,
+ * but only for pages that came from try_alloc_pages():
+ * order <= 3, !folio, etc
+ */
+void free_pages_nolock(struct page *page, unsigned int order)
+{
+	if (put_page_testzero(page))
+		__free_unref_page(page, order, FPI_TRYLOCK);
+}
+
 void free_pages(unsigned long addr, unsigned int order)
 {
 	if (addr != 0) {