
[RFC,v11,01/14] mm: page_frag: add a test module for page_frag

Message ID 20240719093338.55117-2-linyunsheng@huawei.com (mailing list archive)
State Superseded
Series Replace page_frag with page_frag_cache for sk_page_frag()

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Guessed tree name to be net-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 661 this patch: 661
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers success CCed 2 of 2 maintainers
netdev/build_clang success Errors and warnings before: 663 this patch: 663
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api fail Found: 'module_param' was: 0 now: 5
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 667 this patch: 667
netdev/checkpatch warning WARNING: added, moved or deleted file(s), does MAINTAINERS need updating? WARNING: line length of 85 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Yunsheng Lin July 19, 2024, 9:33 a.m. UTC
Based on lib/objpool.c, rework it into something like a
ptrpool, so that it can be used to test the correctness
and performance of page_frag.

The test works by having a kthread bound to a specified cpu
allocate fragments from a page_frag_cache instance and push
them into a ptrpool instance, while another kthread bound to
a specified cpu pops the fragments from the ptrpool and frees
them.

We may refactor out the common part between objpool and ptrpool
if this ptrpool turns out to be helpful for other places.

CC: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 mm/Kconfig.debug    |   8 +
 mm/Makefile         |   1 +
 mm/page_frag_test.c | 393 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 402 insertions(+)
 create mode 100644 mm/page_frag_test.c
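
A rough usage sketch (not part of the posted patch; the parameter names
come from page_frag_test.c below, while the path and values are only
illustrative):

  # Push thread pinned to CPU 0, pop thread pinned to CPU 1, 2048-byte
  # allocations through the aligned API, 5120000 iterations.
  insmod ./page_frag_test.ko test_push_cpu=0 test_pop_cpu=1 \
          test_alloc_len=2048 test_align=1 nr_test=5120000

  # The module's init function intentionally returns -EAGAIN after
  # printing the measured duration, so insmod reports an error, the
  # module never stays loaded, and the result lands in the kernel log.
  dmesg | tail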

Comments

Alexander Duyck July 21, 2024, 5:34 p.m. UTC | #1
On Fri, Jul 19, 2024 at 2:36 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> Based on lib/objpool.c, rework it into something like a
> ptrpool, so that it can be used to test the correctness
> and performance of page_frag.
>
> The test works by having a kthread bound to a specified cpu
> allocate fragments from a page_frag_cache instance and push
> them into a ptrpool instance, while another kthread bound to
> a specified cpu pops the fragments from the ptrpool and frees
> them.
>
> We may refactor out the common part between objpool and ptrpool
> if this ptrpool turns out to be helpful for other places.
>
> CC: Alexander Duyck <alexander.duyck@gmail.com>
> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
> ---
>  mm/Kconfig.debug    |   8 +
>  mm/Makefile         |   1 +
>  mm/page_frag_test.c | 393 ++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 402 insertions(+)
>  create mode 100644 mm/page_frag_test.c

I might have missed it somewhere. Is there any reason why this isn't
in the selftests/mm/ directory? Seems like that would be a better fit
for this.

> diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
> index afc72fde0f03..1ebcd45f47d4 100644
> --- a/mm/Kconfig.debug
> +++ b/mm/Kconfig.debug
> @@ -142,6 +142,14 @@ config DEBUG_PAGE_REF
>           kernel code.  However the runtime performance overhead is virtually
>           nil until the tracepoints are actually enabled.
>
> +config DEBUG_PAGE_FRAG_TEST

This isn't a "DEBUG" feature. This is a test feature.

> +       tristate "Test module for page_frag"
> +       default n
> +       depends on m && DEBUG_KERNEL

I am not sure it is valid to have a tristate depend on being built as a module.

I think if you can set it up as a selftest it will have broader use as
you could compile it against any target kernel going forward and add
it as a module rather than having to build it as a part of a debug
kernel.

> +       help
> +         This builds the "page_frag_test" module that is used to test the
> +         correctness and performance of page_frag's implementation.
> +
>  config DEBUG_RODATA_TEST
>      bool "Testcase for the marking rodata read-only"
>      depends on STRICT_KERNEL_RWX
> diff --git a/mm/Makefile b/mm/Makefile
> index 8fb85acda1b1..29d9f7618a33 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -106,6 +106,7 @@ obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
>  obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
>  obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
>  obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o
> +obj-$(CONFIG_DEBUG_PAGE_FRAG_TEST) += page_frag_test.o
>  obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o
>  obj-$(CONFIG_PAGE_OWNER) += page_owner.o
>  obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
> diff --git a/mm/page_frag_test.c b/mm/page_frag_test.c
> new file mode 100644
> index 000000000000..cf2691f60b67
> --- /dev/null
> +++ b/mm/page_frag_test.c
> @@ -0,0 +1,393 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +/*
> + * Test module for page_frag cache
> + *
> + * Copyright: linyunsheng@huawei.com
> + */
> +
> +#include <linux/mm.h>
> +#include <linux/module.h>
> +#include <linux/slab.h>
> +#include <linux/vmalloc.h>
> +#include <linux/atomic.h>
> +#include <linux/irqflags.h>
> +#include <linux/cpumask.h>
> +#include <linux/log2.h>
> +#include <linux/completion.h>
> +#include <linux/kthread.h>
> +
> +#define OBJPOOL_NR_OBJECT_MAX  BIT(24)
> +
> +struct objpool_slot {
> +       u32 head;
> +       u32 tail;
> +       u32 last;
> +       u32 mask;
> +       void *entries[];
> +} __packed;
> +
> +struct objpool_head {
> +       int nr_cpus;
> +       int capacity;
> +       struct objpool_slot **cpu_slots;
> +};
> +
> +/* initialize percpu objpool_slot */
> +static void objpool_init_percpu_slot(struct objpool_head *pool,
> +                                    struct objpool_slot *slot)
> +{
> +       /* initialize elements of percpu objpool_slot */
> +       slot->mask = pool->capacity - 1;
> +}
> +
> +/* allocate and initialize percpu slots */
> +static int objpool_init_percpu_slots(struct objpool_head *pool,
> +                                    int nr_objs, gfp_t gfp)
> +{
> +       int i;
> +
> +       for (i = 0; i < pool->nr_cpus; i++) {
> +               struct objpool_slot *slot;
> +               int size;
> +
> +               /* skip the cpu node which could never be present */
> +               if (!cpu_possible(i))
> +                       continue;
> +
> +               size = struct_size(slot, entries, pool->capacity);
> +
> +               /*
> +                * here we allocate percpu-slot & objs together in a single
> +                * allocation to make it more compact, taking advantage of
> +                * warm caches and TLB hits. in default vmalloc is used to
> +                * reduce the pressure of kernel slab system. as we know,
> +                * minimal size of vmalloc is one page since vmalloc would
> +                * always align the requested size to page size
> +                */
> +               if (gfp & GFP_ATOMIC)
> +                       slot = kmalloc_node(size, gfp, cpu_to_node(i));
> +               else
> +                       slot = __vmalloc_node(size, sizeof(void *), gfp,
> +                                             cpu_to_node(i),
> +                                             __builtin_return_address(0));

When would anyone ever call this with atomic? This is just for your
test, isn't it?

> +               if (!slot)
> +                       return -ENOMEM;
> +
> +               memset(slot, 0, size);
> +               pool->cpu_slots[i] = slot;
> +
> +               objpool_init_percpu_slot(pool, slot);
> +       }
> +
> +       return 0;
> +}
> +
> +/* cleanup all percpu slots of the object pool */
> +static void objpool_fini_percpu_slots(struct objpool_head *pool)
> +{
> +       int i;
> +
> +       if (!pool->cpu_slots)
> +               return;
> +
> +       for (i = 0; i < pool->nr_cpus; i++)
> +               kvfree(pool->cpu_slots[i]);
> +       kfree(pool->cpu_slots);
> +}
> +
> +/* initialize object pool and pre-allocate objects */
> +static int objpool_init(struct objpool_head *pool, int nr_objs, gfp_t gfp)
> +{
> +       int rc, capacity, slot_size;
> +
> +       /* check input parameters */
> +       if (nr_objs <= 0 || nr_objs > OBJPOOL_NR_OBJECT_MAX)
> +               return -EINVAL;
> +
> +       /* calculate capacity of percpu objpool_slot */
> +       capacity = roundup_pow_of_two(nr_objs);
> +       if (!capacity)
> +               return -EINVAL;
> +
> +       gfp = gfp & ~__GFP_ZERO;
> +
> +       /* initialize objpool pool */
> +       memset(pool, 0, sizeof(struct objpool_head));
> +       pool->nr_cpus = nr_cpu_ids;
> +       pool->capacity = capacity;
> +       slot_size = pool->nr_cpus * sizeof(struct objpool_slot *);
> +       pool->cpu_slots = kzalloc(slot_size, gfp);
> +       if (!pool->cpu_slots)
> +               return -ENOMEM;
> +
> +       /* initialize per-cpu slots */
> +       rc = objpool_init_percpu_slots(pool, nr_objs, gfp);
> +       if (rc)
> +               objpool_fini_percpu_slots(pool);
> +
> +       return rc;
> +}
> +
> +/* adding object to slot, abort if the slot was already full */
> +static int objpool_try_add_slot(void *obj, struct objpool_head *pool, int cpu)
> +{
> +       struct objpool_slot *slot = pool->cpu_slots[cpu];
> +       u32 head, tail;
> +
> +       /* loading tail and head as a local snapshot, tail first */
> +       tail = READ_ONCE(slot->tail);
> +
> +       do {
> +               head = READ_ONCE(slot->head);
> +               /* fault caught: something must be wrong */
> +               if (unlikely(tail - head >= pool->capacity))
> +                       return -ENOSPC;
> +       } while (!try_cmpxchg_acquire(&slot->tail, &tail, tail + 1));
> +
> +       /* now the tail position is reserved for the given obj */
> +       WRITE_ONCE(slot->entries[tail & slot->mask], obj);
> +       /* update sequence to make this obj available for pop() */
> +       smp_store_release(&slot->last, tail + 1);
> +
> +       return 0;
> +}
> +
> +/* reclaim an object to object pool */
> +static int objpool_push(void *obj, struct objpool_head *pool)
> +{
> +       unsigned long flags;
> +       int rc;
> +
> +       /* disable local irq to avoid preemption & interruption */
> +       raw_local_irq_save(flags);
> +       rc = objpool_try_add_slot(obj, pool, raw_smp_processor_id());
> +       raw_local_irq_restore(flags);
> +
> +       return rc;
> +}
> +
> +/* try to retrieve object from slot */
> +static void *objpool_try_get_slot(struct objpool_head *pool, int cpu)
> +{
> +       struct objpool_slot *slot = pool->cpu_slots[cpu];
> +       /* load head snapshot, other cpus may change it */
> +       u32 head = smp_load_acquire(&slot->head);
> +
> +       while (head != READ_ONCE(slot->last)) {
> +               void *obj;
> +
> +               /*
> +                * data visibility of 'last' and 'head' could be out of
> +                * order since memory updating of 'last' and 'head' are
> +                * performed in push() and pop() independently
> +                *
> +                * before any retrieving attempts, pop() must guarantee
> +                * 'last' is behind 'head', that is to say, there must
> +                * be available objects in slot, which could be ensured
> +                * by condition 'last != head && last - head <= nr_objs'
> +                * that is equivalent to 'last - head - 1 < nr_objs' as
> +                * 'last' and 'head' are both unsigned int32
> +                */
> +               if (READ_ONCE(slot->last) - head - 1 >= pool->capacity) {
> +                       head = READ_ONCE(slot->head);
> +                       continue;
> +               }
> +
> +               /* obj must be retrieved before moving forward head */
> +               obj = READ_ONCE(slot->entries[head & slot->mask]);
> +
> +               /* move head forward to mark it's consumption */
> +               if (try_cmpxchg_release(&slot->head, &head, head + 1))
> +                       return obj;
> +       }
> +
> +       return NULL;
> +}
> +
> +/* allocate an object from object pool */
> +static void *objpool_pop(struct objpool_head *pool)
> +{
> +       void *obj = NULL;
> +       unsigned long flags;
> +       int i, cpu;
> +
> +       /* disable local irq to avoid preemption & interruption */
> +       raw_local_irq_save(flags);
> +
> +       cpu = raw_smp_processor_id();
> +       for (i = 0; i < num_possible_cpus(); i++) {
> +               obj = objpool_try_get_slot(pool, cpu);
> +               if (obj)
> +                       break;
> +               cpu = cpumask_next_wrap(cpu, cpu_possible_mask, -1, 1);
> +       }
> +       raw_local_irq_restore(flags);
> +
> +       return obj;
> +}
> +
> +/* release whole objpool forcely */
> +static void objpool_free(struct objpool_head *pool)
> +{
> +       if (!pool->cpu_slots)
> +               return;
> +
> +       /* release percpu slots */
> +       objpool_fini_percpu_slots(pool);
> +}
> +

Why add all this extra objpool overhead? This seems like overkill for
what should be a simple test. Seems like you should just need a simple
array located on one of your CPUs. I'm not sure what is with all the
extra overhead being added here.
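
A minimal sketch of that simpler alternative, assuming exactly one
pushing kthread and one popping kthread as in this test (so a plain
single-producer/single-consumer ring is enough), could look like the
following; it is not taken from the series or from lib/objpool.c:

#define RING_SIZE 512                   /* must be a power of two */

struct spsc_ring {
        void *entries[RING_SIZE];
        unsigned int head;              /* written only by the consumer */
        unsigned int tail;              /* written only by the producer */
};

static bool spsc_push(struct spsc_ring *r, void *obj)
{
        unsigned int tail = READ_ONCE(r->tail);

        /* full when the producer is a whole ring ahead of the consumer */
        if (tail - smp_load_acquire(&r->head) >= RING_SIZE)
                return false;

        WRITE_ONCE(r->entries[tail & (RING_SIZE - 1)], obj);
        smp_store_release(&r->tail, tail + 1); /* publish the new entry */
        return true;
}

static void *spsc_pop(struct spsc_ring *r)
{
        unsigned int head = READ_ONCE(r->head);
        void *obj;

        /* empty when the consumer has caught up with the producer */
        if (head == smp_load_acquire(&r->tail))
                return NULL;

        obj = READ_ONCE(r->entries[head & (RING_SIZE - 1)]);
        smp_store_release(&r->head, head + 1); /* hand the slot back */
        return obj;
}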

> +static struct objpool_head ptr_pool;
> +static int nr_objs = 512;
> +static atomic_t nthreads;
> +static struct completion wait;
> +static struct page_frag_cache test_frag;
> +
> +static int nr_test = 5120000;
> +module_param(nr_test, int, 0);
> +MODULE_PARM_DESC(nr_test, "number of iterations to test");
> +
> +static bool test_align;
> +module_param(test_align, bool, 0);
> +MODULE_PARM_DESC(test_align, "use align API for testing");
> +
> +static int test_alloc_len = 2048;
> +module_param(test_alloc_len, int, 0);
> +MODULE_PARM_DESC(test_alloc_len, "alloc len for testing");
> +
> +static int test_push_cpu;
> +module_param(test_push_cpu, int, 0);
> +MODULE_PARM_DESC(test_push_cpu, "test cpu for pushing fragment");
> +
> +static int test_pop_cpu;
> +module_param(test_pop_cpu, int, 0);
> +MODULE_PARM_DESC(test_pop_cpu, "test cpu for popping fragment");
> +
> +static int page_frag_pop_thread(void *arg)
> +{
> +       struct objpool_head *pool = arg;
> +       int nr = nr_test;
> +
> +       pr_info("page_frag pop test thread begins on cpu %d\n",
> +               smp_processor_id());
> +
> +       while (nr > 0) {
> +               void *obj = objpool_pop(pool);
> +
> +               if (obj) {
> +                       nr--;
> +                       page_frag_free(obj);
> +               } else {
> +                       cond_resched();
> +               }
> +       }
> +
> +       if (atomic_dec_and_test(&nthreads))
> +               complete(&wait);
> +
> +       pr_info("page_frag pop test thread exits on cpu %d\n",
> +               smp_processor_id());
> +
> +       return 0;
> +}
> +
> +static int page_frag_push_thread(void *arg)
> +{
> +       struct objpool_head *pool = arg;
> +       int nr = nr_test;
> +
> +       pr_info("page_frag push test thread begins on cpu %d\n",
> +               smp_processor_id());
> +
> +       while (nr > 0) {
> +               void *va;
> +               int ret;
> +
> +               if (test_align) {
> +                       va = page_frag_alloc_align(&test_frag, test_alloc_len,
> +                                                  GFP_KERNEL, SMP_CACHE_BYTES);
> +
> +                       WARN_ONCE((unsigned long)va & (SMP_CACHE_BYTES - 1),
> +                                 "unaligned va returned\n");
> +               } else {
> +                       va = page_frag_alloc(&test_frag, test_alloc_len, GFP_KERNEL);
> +               }
> +
> +               if (!va)
> +                       continue;
> +
> +               ret = objpool_push(va, pool);
> +               if (ret) {
> +                       page_frag_free(va);
> +                       cond_resched();
> +               } else {
> +                       nr--;
> +               }
> +       }
> +
> +       pr_info("page_frag push test thread exits on cpu %d\n",
> +               smp_processor_id());
> +
> +       if (atomic_dec_and_test(&nthreads))
> +               complete(&wait);
> +
> +       return 0;
> +}
> +

So looking over these functions they seem to overlook how the network
stack works in many cases. One of the main motivations for the page
frags approach is page recycling. For example with GRO enabled the
headers allocated to record the frags might be freed for all but the
first. As such you can end up with 17 fragments being allocated, and
16 freed within the same thread as NAPI will just be recycling the
buffers.

With this setup it doesn't seem very likely to be triggered since you
are operating in two threads. One test you might want to look at
adding is a test where you are allocating and freeing in the same
thread at a fairly constant rate to test against the "ideal" scenario.
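
A rough sketch of such a single-thread case, assuming it is wired up as
an extra optional kthread in the same module and reusing the module's
existing globals (nr_test, test_alloc_len, nthreads, wait), might be:

/* Allocate a NAPI-like batch of fragments and immediately free all but
 * the "header" one in the same thread, so the cache is recycled the way
 * GRO would recycle it. Sketch only, not part of the posted patch.
 */
static int page_frag_same_thread(void *arg)
{
        struct page_frag_cache *cache = arg;
        int nr = nr_test;

        while (nr > 0) {
                void *frags[17];
                int i, n = 0;

                for (i = 0; i < ARRAY_SIZE(frags) && nr > 0; i++, nr--) {
                        frags[n] = page_frag_alloc(cache, test_alloc_len,
                                                   GFP_KERNEL);
                        if (frags[n])
                                n++;
                }

                /* free the 16 "dropped" fragments first, the header last */
                for (i = 1; i < n; i++)
                        page_frag_free(frags[i]);
                if (n)
                        page_frag_free(frags[0]);

                cond_resched();
        }

        if (atomic_dec_and_test(&nthreads))
                complete(&wait);

        return 0;
}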

> +static int __init page_frag_test_init(void)
> +{
> +       struct task_struct *tsk_push, *tsk_pop;
> +       ktime_t start;
> +       u64 duration;
> +       int ret;
> +
> +       test_frag.va = NULL;
> +       atomic_set(&nthreads, 2);
> +       init_completion(&wait);
> +
> +       if (test_alloc_len > PAGE_SIZE || test_alloc_len <= 0)
> +               return -EINVAL;
> +
> +       ret = objpool_init(&ptr_pool, nr_objs, GFP_KERNEL);
> +       if (ret)
> +               return ret;
> +
> +       tsk_push = kthread_create_on_cpu(page_frag_push_thread, &ptr_pool,
> +                                        test_push_cpu, "page_frag_push");
> +       if (IS_ERR(tsk_push))
> +               return PTR_ERR(tsk_push);
> +
> +       tsk_pop = kthread_create_on_cpu(page_frag_pop_thread, &ptr_pool,
> +                                       test_pop_cpu, "page_frag_pop");
> +       if (IS_ERR(tsk_pop)) {
> +               kthread_stop(tsk_push);
> +               return PTR_ERR(tsk_pop);
> +       }
> +
> +       start = ktime_get();
> +       wake_up_process(tsk_push);
> +       wake_up_process(tsk_pop);
> +
> +       pr_info("waiting for test to complete\n");
> +       wait_for_completion(&wait);
> +
> +       duration = (u64)ktime_us_delta(ktime_get(), start);
> +       pr_info("%d of iterations for %s testing took: %lluus\n", nr_test,
> +               test_align ? "aligned" : "non-aligned", duration);
> +
> +       objpool_free(&ptr_pool);
> +       page_frag_cache_drain(&test_frag);
> +
> +       return -EAGAIN;
> +}
> +
> +static void __exit page_frag_test_exit(void)
> +{
> +}
> +
> +module_init(page_frag_test_init);
> +module_exit(page_frag_test_exit);
> +
> +MODULE_LICENSE("GPL");
> +MODULE_AUTHOR("Yunsheng Lin <linyunsheng@huawei.com>");
> +MODULE_DESCRIPTION("Test module for page_frag");
> --
> 2.33.0
>
Yunsheng Lin July 23, 2024, 1:19 p.m. UTC | #2
On 2024/7/22 1:34, Alexander Duyck wrote:
> On Fri, Jul 19, 2024 at 2:36 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>>
>> Based on lib/objpool.c, rework it into something like a
>> ptrpool, so that it can be used to test the correctness
>> and performance of page_frag.
>>
>> The test works by having a kthread bound to a specified cpu
>> allocate fragments from a page_frag_cache instance and push
>> them into a ptrpool instance, while another kthread bound to
>> a specified cpu pops the fragments from the ptrpool and frees
>> them.
>>
>> We may refactor out the common part between objpool and ptrpool
>> if this ptrpool turns out to be helpful for other places.
>>
>> CC: Alexander Duyck <alexander.duyck@gmail.com>
>> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
>> ---
>>  mm/Kconfig.debug    |   8 +
>>  mm/Makefile         |   1 +
>>  mm/page_frag_test.c | 393 ++++++++++++++++++++++++++++++++++++++++++++
>>  3 files changed, 402 insertions(+)
>>  create mode 100644 mm/page_frag_test.c
> 
> I might have missed it somewhere. Is there any reason why this isn't
> in the selftests/mm/ directory? Seems like that would be a better fit
> for this.
> 
>> diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
>> index afc72fde0f03..1ebcd45f47d4 100644
>> --- a/mm/Kconfig.debug
>> +++ b/mm/Kconfig.debug
>> @@ -142,6 +142,14 @@ config DEBUG_PAGE_REF
>>           kernel code.  However the runtime performance overhead is virtually
>>           nil until the tracepoints are actually enabled.
>>
>> +config DEBUG_PAGE_FRAG_TEST
> 
> This isn't a "DEBUG" feature. This is a test feature.
> 
>> +       tristate "Test module for page_frag"
>> +       default n
>> +       depends on m && DEBUG_KERNEL
> 
> I am not sure it is valid to have a tristate depend on being built as a module.

Perhaps I was copying the wrong pattern from TEST_OBJPOOL in lib/Kconfig.debug.
Would mm/dmapool_test.c and DMAPOOL_TEST be a more appropriate pattern
for a page_frag test module?
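
Following that pattern, a sketch of the entry without the DEBUG_ prefix
and without the "depends on m" (illustrative only, not what was posted)
could be:

config PAGE_FRAG_TEST
        tristate "Test module for page_frag"
        depends on DEBUG_KERNEL
        help
          This builds the "page_frag_test" module that is used to test the
          correctness and performance of page_frag's implementation.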

> 
> I think if you can set it up as a selftest it will have broader use as
> you could compile it against any target kernel going forward and add
> it as a module rather than having to build it as a part of a debug
> kernel.

It seems tools/testing/selftests/mm/* are all userspace testing
tools, and in-kernel test modules seem to live in the same directory
as the code being tested?
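
If it does end up under selftests, one possible shape is a small shell
wrapper in tools/testing/selftests/mm/ that only loads the module; the
sketch below is purely illustrative and the file name, path and
parameters are assumptions:

#!/bin/sh
# Load page_frag_test.ko with a couple of parameter combinations. Its
# init intentionally returns -EAGAIN once the run completes, so a
# failing insmod here actually means the test finished.
run_one() {
        if insmod ./page_frag_test.ko "$@" 2>/dev/null; then
                # should not happen: the module is not meant to stay loaded
                rmmod page_frag_test
                echo "FAIL: page_frag_test $*"
                exit 1
        fi
        echo "ok: page_frag_test $*"
}

run_one test_push_cpu=0 test_pop_cpu=1 test_alloc_len=2048
run_one test_push_cpu=0 test_pop_cpu=1 test_alloc_len=2048 test_align=1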

> 
>> +       help
>> +         This builds the "page_frag_test" module that is used to test the
>> +         correctness and performance of page_frag's implementation.
>> +
>>  config DEBUG_RODATA_TEST
>>      bool "Testcase for the marking rodata read-only"

...

>> +
>> +               /*
>> +                * here we allocate percpu-slot & objs together in a single
>> +                * allocation to make it more compact, taking advantage of
>> +                * warm caches and TLB hits. in default vmalloc is used to
>> +                * reduce the pressure of kernel slab system. as we know,
>> +                * minimal size of vmalloc is one page since vmalloc would
>> +                * always align the requested size to page size
>> +                */
>> +               if (gfp & GFP_ATOMIC)
>> +                       slot = kmalloc_node(size, gfp, cpu_to_node(i));
>> +               else
>> +                       slot = __vmalloc_node(size, sizeof(void *), gfp,
>> +                                             cpu_to_node(i),
>> +                                             __builtin_return_address(0));
> 
> When would anyone ever call this with atomic? This is just for your
> test, isn't it?
> 
>> +               if (!slot)
>> +                       return -ENOMEM;
>> +
>> +               memset(slot, 0, size);
>> +               pool->cpu_slots[i] = slot;
>> +
>> +               objpool_init_percpu_slot(pool, slot);
>> +       }
>> +
>> +       return 0;
>> +}

...

>> +/* release whole objpool forcely */
>> +static void objpool_free(struct objpool_head *pool)
>> +{
>> +       if (!pool->cpu_slots)
>> +               return;
>> +
>> +       /* release percpu slots */
>> +       objpool_fini_percpu_slots(pool);
>> +}
>> +
> 
> Why add all this extra objpool overhead? This seems like overkill for
> what should be a simple test. Seems like you should just need a simple
> array located on one of your CPUs. I'm not sure what is with all the
> extra overhead being added here.

As mentioned in the commit log:
"We may refactor out the common part between objpool and ptrpool
if this ptrpool turns out to be helpful for other places."

The next thing I am trying to do is to use ptrpool to optimize
the pcp in the mm subsystem, so I would rather not tailor the ptrpool
for page_frag_test, and it doesn't seem to affect the testing that
much.

> 
>> +static struct objpool_head ptr_pool;
>> +static int nr_objs = 512;
>> +static atomic_t nthreads;
>> +static struct completion wait;
>> +static struct page_frag_cache test_frag;
>> +
>> +static int nr_test = 5120000;
>> +module_param(nr_test, int, 0);
>> +MODULE_PARM_DESC(nr_test, "number of iterations to test");
>> +
>> +static bool test_align;
>> +module_param(test_align, bool, 0);
>> +MODULE_PARM_DESC(test_align, "use align API for testing");
>> +
>> +static int test_alloc_len = 2048;
>> +module_param(test_alloc_len, int, 0);
>> +MODULE_PARM_DESC(test_alloc_len, "alloc len for testing");
>> +
>> +static int test_push_cpu;
>> +module_param(test_push_cpu, int, 0);
>> +MODULE_PARM_DESC(test_push_cpu, "test cpu for pushing fragment");
>> +
>> +static int test_pop_cpu;
>> +module_param(test_pop_cpu, int, 0);
>> +MODULE_PARM_DESC(test_pop_cpu, "test cpu for popping fragment");
>> +
>> +static int page_frag_pop_thread(void *arg)
>> +{
>> +       struct objpool_head *pool = arg;
>> +       int nr = nr_test;
>> +
>> +       pr_info("page_frag pop test thread begins on cpu %d\n",
>> +               smp_processor_id());
>> +
>> +       while (nr > 0) {
>> +               void *obj = objpool_pop(pool);
>> +
>> +               if (obj) {
>> +                       nr--;
>> +                       page_frag_free(obj);
>> +               } else {
>> +                       cond_resched();
>> +               }
>> +       }
>> +
>> +       if (atomic_dec_and_test(&nthreads))
>> +               complete(&wait);
>> +
>> +       pr_info("page_frag pop test thread exits on cpu %d\n",
>> +               smp_processor_id());
>> +
>> +       return 0;
>> +}
>> +
>> +static int page_frag_push_thread(void *arg)
>> +{
>> +       struct objpool_head *pool = arg;
>> +       int nr = nr_test;
>> +
>> +       pr_info("page_frag push test thread begins on cpu %d\n",
>> +               smp_processor_id());
>> +
>> +       while (nr > 0) {
>> +               void *va;
>> +               int ret;
>> +
>> +               if (test_align) {
>> +                       va = page_frag_alloc_align(&test_frag, test_alloc_len,
>> +                                                  GFP_KERNEL, SMP_CACHE_BYTES);
>> +
>> +                       WARN_ONCE((unsigned long)va & (SMP_CACHE_BYTES - 1),
>> +                                 "unaligned va returned\n");
>> +               } else {
>> +                       va = page_frag_alloc(&test_frag, test_alloc_len, GFP_KERNEL);
>> +               }
>> +
>> +               if (!va)
>> +                       continue;
>> +
>> +               ret = objpool_push(va, pool);
>> +               if (ret) {
>> +                       page_frag_free(va);
>> +                       cond_resched();
>> +               } else {
>> +                       nr--;
>> +               }
>> +       }
>> +
>> +       pr_info("page_frag push test thread exits on cpu %d\n",
>> +               smp_processor_id());
>> +
>> +       if (atomic_dec_and_test(&nthreads))
>> +               complete(&wait);
>> +
>> +       return 0;
>> +}
>> +
> 
> So looking over these functions they seem to overlook how the network
> stack works in many cases. One of the main motivations for the page
> frags approach is page recycling. For example with GRO enabled the
> headers allocated to record the frags might be freed for all but the
> first. As such you can end up with 17 fragments being allocated, and
> 16 freed within the same thread as NAPI will just be recycling the
> buffers.
> 
> With this setup it doesn't seem very likely to be triggered since you
> are operating in two threads. One test you might want to look at
> adding is a test where you are allocating and freeing in the same
> thread at a fairly constant rate to test against the "ideal" scenario.

I am not sure the above is still the "ideal" scenario; as you mentioned,
most drivers are moving to page_pool for rx, so page_frag is really
mostly for tx, or for skb->data on rx.

>

Patch

diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index afc72fde0f03..1ebcd45f47d4 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -142,6 +142,14 @@  config DEBUG_PAGE_REF
 	  kernel code.  However the runtime performance overhead is virtually
 	  nil until the tracepoints are actually enabled.
 
+config DEBUG_PAGE_FRAG_TEST
+	tristate "Test module for page_frag"
+	default n
+	depends on m && DEBUG_KERNEL
+	help
+	  This builds the "page_frag_test" module that is used to test the
+	  correctness and performance of page_frag's implementation.
+
 config DEBUG_RODATA_TEST
     bool "Testcase for the marking rodata read-only"
     depends on STRICT_KERNEL_RWX
diff --git a/mm/Makefile b/mm/Makefile
index 8fb85acda1b1..29d9f7618a33 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -106,6 +106,7 @@  obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
 obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
 obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
 obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o
+obj-$(CONFIG_DEBUG_PAGE_FRAG_TEST) += page_frag_test.o
 obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o
 obj-$(CONFIG_PAGE_OWNER) += page_owner.o
 obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
diff --git a/mm/page_frag_test.c b/mm/page_frag_test.c
new file mode 100644
index 000000000000..cf2691f60b67
--- /dev/null
+++ b/mm/page_frag_test.c
@@ -0,0 +1,393 @@ 
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Test module for page_frag cache
+ *
+ * Copyright: linyunsheng@huawei.com
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/atomic.h>
+#include <linux/irqflags.h>
+#include <linux/cpumask.h>
+#include <linux/log2.h>
+#include <linux/completion.h>
+#include <linux/kthread.h>
+
+#define OBJPOOL_NR_OBJECT_MAX	BIT(24)
+
+struct objpool_slot {
+	u32 head;
+	u32 tail;
+	u32 last;
+	u32 mask;
+	void *entries[];
+} __packed;
+
+struct objpool_head {
+	int nr_cpus;
+	int capacity;
+	struct objpool_slot **cpu_slots;
+};
+
+/* initialize percpu objpool_slot */
+static void objpool_init_percpu_slot(struct objpool_head *pool,
+				     struct objpool_slot *slot)
+{
+	/* initialize elements of percpu objpool_slot */
+	slot->mask = pool->capacity - 1;
+}
+
+/* allocate and initialize percpu slots */
+static int objpool_init_percpu_slots(struct objpool_head *pool,
+				     int nr_objs, gfp_t gfp)
+{
+	int i;
+
+	for (i = 0; i < pool->nr_cpus; i++) {
+		struct objpool_slot *slot;
+		int size;
+
+		/* skip the cpu node which could never be present */
+		if (!cpu_possible(i))
+			continue;
+
+		size = struct_size(slot, entries, pool->capacity);
+
+		/*
+		 * here we allocate percpu-slot & objs together in a single
+		 * allocation to make it more compact, taking advantage of
+		 * warm caches and TLB hits. in default vmalloc is used to
+		 * reduce the pressure of kernel slab system. as we know,
+		 * minimal size of vmalloc is one page since vmalloc would
+		 * always align the requested size to page size
+		 */
+		if (gfp & GFP_ATOMIC)
+			slot = kmalloc_node(size, gfp, cpu_to_node(i));
+		else
+			slot = __vmalloc_node(size, sizeof(void *), gfp,
+					      cpu_to_node(i),
+					      __builtin_return_address(0));
+		if (!slot)
+			return -ENOMEM;
+
+		memset(slot, 0, size);
+		pool->cpu_slots[i] = slot;
+
+		objpool_init_percpu_slot(pool, slot);
+	}
+
+	return 0;
+}
+
+/* cleanup all percpu slots of the object pool */
+static void objpool_fini_percpu_slots(struct objpool_head *pool)
+{
+	int i;
+
+	if (!pool->cpu_slots)
+		return;
+
+	for (i = 0; i < pool->nr_cpus; i++)
+		kvfree(pool->cpu_slots[i]);
+	kfree(pool->cpu_slots);
+}
+
+/* initialize object pool and pre-allocate objects */
+static int objpool_init(struct objpool_head *pool, int nr_objs, gfp_t gfp)
+{
+	int rc, capacity, slot_size;
+
+	/* check input parameters */
+	if (nr_objs <= 0 || nr_objs > OBJPOOL_NR_OBJECT_MAX)
+		return -EINVAL;
+
+	/* calculate capacity of percpu objpool_slot */
+	capacity = roundup_pow_of_two(nr_objs);
+	if (!capacity)
+		return -EINVAL;
+
+	gfp = gfp & ~__GFP_ZERO;
+
+	/* initialize objpool pool */
+	memset(pool, 0, sizeof(struct objpool_head));
+	pool->nr_cpus = nr_cpu_ids;
+	pool->capacity = capacity;
+	slot_size = pool->nr_cpus * sizeof(struct objpool_slot *);
+	pool->cpu_slots = kzalloc(slot_size, gfp);
+	if (!pool->cpu_slots)
+		return -ENOMEM;
+
+	/* initialize per-cpu slots */
+	rc = objpool_init_percpu_slots(pool, nr_objs, gfp);
+	if (rc)
+		objpool_fini_percpu_slots(pool);
+
+	return rc;
+}
+
+/* adding object to slot, abort if the slot was already full */
+static int objpool_try_add_slot(void *obj, struct objpool_head *pool, int cpu)
+{
+	struct objpool_slot *slot = pool->cpu_slots[cpu];
+	u32 head, tail;
+
+	/* loading tail and head as a local snapshot, tail first */
+	tail = READ_ONCE(slot->tail);
+
+	do {
+		head = READ_ONCE(slot->head);
+		/* fault caught: something must be wrong */
+		if (unlikely(tail - head >= pool->capacity))
+			return -ENOSPC;
+	} while (!try_cmpxchg_acquire(&slot->tail, &tail, tail + 1));
+
+	/* now the tail position is reserved for the given obj */
+	WRITE_ONCE(slot->entries[tail & slot->mask], obj);
+	/* update sequence to make this obj available for pop() */
+	smp_store_release(&slot->last, tail + 1);
+
+	return 0;
+}
+
+/* reclaim an object to object pool */
+static int objpool_push(void *obj, struct objpool_head *pool)
+{
+	unsigned long flags;
+	int rc;
+
+	/* disable local irq to avoid preemption & interruption */
+	raw_local_irq_save(flags);
+	rc = objpool_try_add_slot(obj, pool, raw_smp_processor_id());
+	raw_local_irq_restore(flags);
+
+	return rc;
+}
+
+/* try to retrieve object from slot */
+static void *objpool_try_get_slot(struct objpool_head *pool, int cpu)
+{
+	struct objpool_slot *slot = pool->cpu_slots[cpu];
+	/* load head snapshot, other cpus may change it */
+	u32 head = smp_load_acquire(&slot->head);
+
+	while (head != READ_ONCE(slot->last)) {
+		void *obj;
+
+		/*
+		 * data visibility of 'last' and 'head' could be out of
+		 * order since memory updating of 'last' and 'head' are
+		 * performed in push() and pop() independently
+		 *
+		 * before any retrieving attempts, pop() must guarantee
+		 * 'last' is behind 'head', that is to say, there must
+		 * be available objects in slot, which could be ensured
+		 * by condition 'last != head && last - head <= nr_objs'
+		 * that is equivalent to 'last - head - 1 < nr_objs' as
+		 * 'last' and 'head' are both unsigned int32
+		 */
+		if (READ_ONCE(slot->last) - head - 1 >= pool->capacity) {
+			head = READ_ONCE(slot->head);
+			continue;
+		}
+
+		/* obj must be retrieved before moving forward head */
+		obj = READ_ONCE(slot->entries[head & slot->mask]);
+
+		/* move head forward to mark it's consumption */
+		if (try_cmpxchg_release(&slot->head, &head, head + 1))
+			return obj;
+	}
+
+	return NULL;
+}
+
+/* allocate an object from object pool */
+static void *objpool_pop(struct objpool_head *pool)
+{
+	void *obj = NULL;
+	unsigned long flags;
+	int i, cpu;
+
+	/* disable local irq to avoid preemption & interruption */
+	raw_local_irq_save(flags);
+
+	cpu = raw_smp_processor_id();
+	for (i = 0; i < num_possible_cpus(); i++) {
+		obj = objpool_try_get_slot(pool, cpu);
+		if (obj)
+			break;
+		cpu = cpumask_next_wrap(cpu, cpu_possible_mask, -1, 1);
+	}
+	raw_local_irq_restore(flags);
+
+	return obj;
+}
+
+/* release whole objpool forcely */
+static void objpool_free(struct objpool_head *pool)
+{
+	if (!pool->cpu_slots)
+		return;
+
+	/* release percpu slots */
+	objpool_fini_percpu_slots(pool);
+}
+
+static struct objpool_head ptr_pool;
+static int nr_objs = 512;
+static atomic_t nthreads;
+static struct completion wait;
+static struct page_frag_cache test_frag;
+
+static int nr_test = 5120000;
+module_param(nr_test, int, 0);
+MODULE_PARM_DESC(nr_test, "number of iterations to test");
+
+static bool test_align;
+module_param(test_align, bool, 0);
+MODULE_PARM_DESC(test_align, "use align API for testing");
+
+static int test_alloc_len = 2048;
+module_param(test_alloc_len, int, 0);
+MODULE_PARM_DESC(test_alloc_len, "alloc len for testing");
+
+static int test_push_cpu;
+module_param(test_push_cpu, int, 0);
+MODULE_PARM_DESC(test_push_cpu, "test cpu for pushing fragment");
+
+static int test_pop_cpu;
+module_param(test_pop_cpu, int, 0);
+MODULE_PARM_DESC(test_pop_cpu, "test cpu for popping fragment");
+
+static int page_frag_pop_thread(void *arg)
+{
+	struct objpool_head *pool = arg;
+	int nr = nr_test;
+
+	pr_info("page_frag pop test thread begins on cpu %d\n",
+		smp_processor_id());
+
+	while (nr > 0) {
+		void *obj = objpool_pop(pool);
+
+		if (obj) {
+			nr--;
+			page_frag_free(obj);
+		} else {
+			cond_resched();
+		}
+	}
+
+	if (atomic_dec_and_test(&nthreads))
+		complete(&wait);
+
+	pr_info("page_frag pop test thread exits on cpu %d\n",
+		smp_processor_id());
+
+	return 0;
+}
+
+static int page_frag_push_thread(void *arg)
+{
+	struct objpool_head *pool = arg;
+	int nr = nr_test;
+
+	pr_info("page_frag push test thread begins on cpu %d\n",
+		smp_processor_id());
+
+	while (nr > 0) {
+		void *va;
+		int ret;
+
+		if (test_align) {
+			va = page_frag_alloc_align(&test_frag, test_alloc_len,
+						   GFP_KERNEL, SMP_CACHE_BYTES);
+
+			WARN_ONCE((unsigned long)va & (SMP_CACHE_BYTES - 1),
+				  "unaligned va returned\n");
+		} else {
+			va = page_frag_alloc(&test_frag, test_alloc_len, GFP_KERNEL);
+		}
+
+		if (!va)
+			continue;
+
+		ret = objpool_push(va, pool);
+		if (ret) {
+			page_frag_free(va);
+			cond_resched();
+		} else {
+			nr--;
+		}
+	}
+
+	pr_info("page_frag push test thread exits on cpu %d\n",
+		smp_processor_id());
+
+	if (atomic_dec_and_test(&nthreads))
+		complete(&wait);
+
+	return 0;
+}
+
+static int __init page_frag_test_init(void)
+{
+	struct task_struct *tsk_push, *tsk_pop;
+	ktime_t start;
+	u64 duration;
+	int ret;
+
+	test_frag.va = NULL;
+	atomic_set(&nthreads, 2);
+	init_completion(&wait);
+
+	if (test_alloc_len > PAGE_SIZE || test_alloc_len <= 0)
+		return -EINVAL;
+
+	ret = objpool_init(&ptr_pool, nr_objs, GFP_KERNEL);
+	if (ret)
+		return ret;
+
+	tsk_push = kthread_create_on_cpu(page_frag_push_thread, &ptr_pool,
+					 test_push_cpu, "page_frag_push");
+	if (IS_ERR(tsk_push))
+		return PTR_ERR(tsk_push);
+
+	tsk_pop = kthread_create_on_cpu(page_frag_pop_thread, &ptr_pool,
+					test_pop_cpu, "page_frag_pop");
+	if (IS_ERR(tsk_pop)) {
+		kthread_stop(tsk_push);
+		return PTR_ERR(tsk_pop);
+	}
+
+	start = ktime_get();
+	wake_up_process(tsk_push);
+	wake_up_process(tsk_pop);
+
+	pr_info("waiting for test to complete\n");
+	wait_for_completion(&wait);
+
+	duration = (u64)ktime_us_delta(ktime_get(), start);
+	pr_info("%d of iterations for %s testing took: %lluus\n", nr_test,
+		test_align ? "aligned" : "non-aligned", duration);
+
+	objpool_free(&ptr_pool);
+	page_frag_cache_drain(&test_frag);
+
+	return -EAGAIN;
+}
+
+static void __exit page_frag_test_exit(void)
+{
+}
+
+module_init(page_frag_test_init);
+module_exit(page_frag_test_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Yunsheng Lin <linyunsheng@huawei.com>");
+MODULE_DESCRIPTION("Test module for page_frag");