[bpf,v2,6/9] bpf: Switch to bpf mem allocator for LPM trie

Message ID 20241127004641.1118269-7-houtao@huaweicloud.com (mailing list archive)
State New
Delegated to: BPF
Series: Fixes for LPM trie

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag present in non-next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 3 this patch: 3
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers success CCed 13 of 13 maintainers
netdev/build_clang success Errors and warnings before: 3 this patch: 3
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 4 this patch: 4
netdev/checkpatch warning WARNING: The commit message has 'syzkaller', perhaps it also needs a 'Fixes:' tag?
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 1 this patch: 1
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-PR success PR summary
bpf/vmtest-bpf-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-VM_Test-14 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-VM_Test-15 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-10 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-11 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-VM_Test-12 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-VM_Test-6 success Logs for aarch64-gcc / test (sched_ext, false, 360) / sched_ext on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-VM_Test-13 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-VM_Test-7 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-VM_Test-21 success Logs for x86_64-gcc / test (sched_ext, false, 360) / sched_ext on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-22 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-23 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-27 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-28 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-29 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-30 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-VM_Test-31 success Logs for x86_64-llvm-17 / test (sched_ext, false, 360) / sched_ext on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-34 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-35 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-36 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-VM_Test-37 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-38 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-VM_Test-39 success Logs for x86_64-llvm-18 / test (sched_ext, false, 360) / sched_ext on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-44 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-45 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-VM_Test-9 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-8 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-26 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-42 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-43 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18

Commit Message

Hou Tao Nov. 27, 2024, 12:46 a.m. UTC
From: Hou Tao <houtao1@huawei.com>

Multiple syzbot warnings have been reported. These warnings are mainly
about the lock order between trie->lock and kmalloc()'s internal lock.
See report [1] as an example:

======================================================
WARNING: possible circular locking dependency detected
6.10.0-rc7-syzkaller-00003-g4376e966ecb7 #0 Not tainted
------------------------------------------------------
syz.3.2069/15008 is trying to acquire lock:
ffff88801544e6d8 (&n->list_lock){-.-.}-{2:2}, at: get_partial_node ...

but task is already holding lock:
ffff88802dcc89f8 (&trie->lock){-.-.}-{2:2}, at: trie_update_elem ...

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #1 (&trie->lock){-.-.}-{2:2}:
       __raw_spin_lock_irqsave
       _raw_spin_lock_irqsave+0x3a/0x60
       trie_delete_elem+0xb0/0x820
       ___bpf_prog_run+0x3e51/0xabd0
       __bpf_prog_run32+0xc1/0x100
       bpf_dispatcher_nop_func
       ......
       bpf_trace_run2+0x231/0x590
       __bpf_trace_contention_end+0xca/0x110
       trace_contention_end.constprop.0+0xea/0x170
       __pv_queued_spin_lock_slowpath+0x28e/0xcc0
       pv_queued_spin_lock_slowpath
       queued_spin_lock_slowpath
       queued_spin_lock
       do_raw_spin_lock+0x210/0x2c0
       __raw_spin_lock_irqsave
       _raw_spin_lock_irqsave+0x42/0x60
       __put_partials+0xc3/0x170
       qlink_free
       qlist_free_all+0x4e/0x140
       kasan_quarantine_reduce+0x192/0x1e0
       __kasan_slab_alloc+0x69/0x90
       kasan_slab_alloc
       slab_post_alloc_hook
       slab_alloc_node
       kmem_cache_alloc_node_noprof+0x153/0x310
       __alloc_skb+0x2b1/0x380
       ......

-> #0 (&n->list_lock){-.-.}-{2:2}:
       check_prev_add
       check_prevs_add
       validate_chain
       __lock_acquire+0x2478/0x3b30
       lock_acquire
       lock_acquire+0x1b1/0x560
       __raw_spin_lock_irqsave
       _raw_spin_lock_irqsave+0x3a/0x60
       get_partial_node.part.0+0x20/0x350
       get_partial_node
       get_partial
       ___slab_alloc+0x65b/0x1870
       __slab_alloc.constprop.0+0x56/0xb0
       __slab_alloc_node
       slab_alloc_node
       __do_kmalloc_node
       __kmalloc_node_noprof+0x35c/0x440
       kmalloc_node_noprof
       bpf_map_kmalloc_node+0x98/0x4a0
       lpm_trie_node_alloc
       trie_update_elem+0x1ef/0xe00
       bpf_map_update_value+0x2c1/0x6c0
       map_update_elem+0x623/0x910
       __sys_bpf+0x90c/0x49a0
       ...

other info that might help us debug this:

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(&trie->lock);
                               lock(&n->list_lock);
                               lock(&trie->lock);
  lock(&n->list_lock);

 *** DEADLOCK ***

[1]: https://syzkaller.appspot.com/bug?extid=9045c0a3d5a7f1b119f7

A bpf program attached to the contention_end tracepoint may run while
&n->list_lock is held. If that program invokes trie_delete_elem(), it
then acquires trie->lock. Meanwhile, another process may be invoking
trie_update_elem(): it acquires trie->lock first and then calls
kmalloc_node(), which may call get_partial_node() and try to acquire
&n->list_lock (not necessarily the same lock object). Lockdep therefore
warns about a circular locking dependency.
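
For reference, a minimal sketch of a tracing program that can set up
this dependency (hypothetical program, map and key layout, shown only
to illustrate the scenario; it is not the syzkaller reproducer):

/* contention_end.bpf.c -- illustrative sketch only */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct lpm_key {
	__u32 prefixlen;
	__u8 data[4];	/* e.g. an IPv4 prefix */
};

struct {
	__uint(type, BPF_MAP_TYPE_LPM_TRIE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__uint(max_entries, 16);
	__type(key, struct lpm_key);
	__type(value, __u32);
} lpm SEC(".maps");

/* Fires once a contended lock (possibly SLUB's n->list_lock) has been
 * acquired; the map delete below then takes trie->lock, i.e. in the
 * opposite order to trie_update_elem() -> kmalloc_node().
 */
SEC("tp_btf/contention_end")
int BPF_PROG(on_contention_end, void *lock, int ret)
{
	struct lpm_key key = { .prefixlen = 32 };

	bpf_map_delete_elem(&lpm, &key);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";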

Invoking kmalloc() before acquiring trie->lock could fix this particular
warning. However, since BPF programs can be invoked from any context
(e.g., through kprobe/tracepoint/fentry), lock ordering problems may
still arise with kmalloc()'s internal locks or with trie->lock itself.

To eliminate these potential lock ordering problems with kmalloc()'s
internal locks, replace kmalloc()/kfree()/kfree_rcu() with the
equivalent BPF memory allocator APIs, which can be invoked from any
context. The lock ordering problems with trie->lock itself (e.g.,
reentrance) will be handled separately.
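
Concretely, the conversion maps the kmalloc-family calls onto the
bpf_mem_alloc APIs roughly as follows (a summary of the diff below):

  bpf_map_kmalloc_node()       ->  bpf_mem_cache_alloc(&trie->ma)
  kfree(new_node)              ->  bpf_mem_cache_free(&trie->ma, new_node)
  kfree_rcu(node, rcu)         ->  bpf_mem_cache_free_rcu(&trie->ma, node)
  map creation (trie_alloc)    ->  bpf_mem_alloc_init(&trie->ma, leaf_size, false)
  map destruction (trie_free)  ->  bpf_mem_cache_raw_free(node) for the
                                   remaining nodes, then
                                   bpf_mem_alloc_destroy(&trie->ma)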

Two aspects of this change require explanation:

1. Intermediate and leaf nodes are allocated from the same allocator.
The value size of an LPM trie is usually small, so using a single
allocator sized for the leaf nodes reduces the memory overhead of the
BPF memory allocator.

2. Nodes are freed before spin_unlock_irqrestore() is invoked. Holding
trie->lock already prevents migration, so there is no need to add
paired migrate_{disable|enable}() calls around these free operations
(see the annotated sketch below).
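
For illustration, an annotated sketch of the trie_update_elem() cleanup
path after this change (the code matches the hunk below; the comments
are explanatory only and not part of the patch):

out:
	if (ret)
		bpf_mem_cache_free(&trie->ma, new_node);
	/* Freed while trie->lock is still held: the spinlock already
	 * prevents migration, so no migrate_{disable|enable}() pair is
	 * needed around the bpf_mem_alloc free helpers.
	 */
	bpf_mem_cache_free_rcu(&trie->ma, free_node);
	spin_unlock_irqrestore(&trie->lock, irq_flags);

	return ret;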

Signed-off-by: Hou Tao <houtao1@huawei.com>
---
 kernel/bpf/lpm_trie.c | 38 +++++++++++++++++++++++++-------------
 1 file changed, 25 insertions(+), 13 deletions(-)

Comments

Alexei Starovoitov Nov. 27, 2024, 5:51 a.m. UTC | #1
On Tue, Nov 26, 2024 at 4:34 PM Hou Tao <houtao@huaweicloud.com> wrote:
>
> 2. nodes are freed before invoking spin_unlock_irqrestore(). Therefore,
> there is no need to add paired migrate_{disable|enable}() calls for
> these free operations.

...

>         if (ret)
> -               kfree(new_node);
> +               bpf_mem_cache_free(&trie->ma, new_node);
> +       bpf_mem_cache_free_rcu(&trie->ma, free_node);
>         spin_unlock_irqrestore(&trie->lock, irq_flags);
> -       kfree_rcu(free_node, rcu);

...

> +       bpf_mem_cache_free_rcu(&trie->ma, free_parent);
> +       bpf_mem_cache_free_rcu(&trie->ma, free_node);
>         spin_unlock_irqrestore(&trie->lock, irq_flags);
> -       kfree_rcu(free_parent, rcu);
> -       kfree_rcu(free_node, rcu);

Going back to freeing under the lock wasn't obvious.
I only understood it after reading the commit log for the 2nd time.

Probably a code comment would be good.

Though I wonder whether we should add migrate_disable/enable
in the syscall path of these callbacks.
We already wrapped them with rcu_read_lock().
Extra migrate_disable() won't hurt.

And it will help this patch. bpf_mem_cache_free_rcu() can be
done outside of the bucket lock.
bpf_ma can easily exhaust the free list in an irq-disabled region,
so the more operations outside of the known irq region, the better.

Also it will help remove migrate_disable/enable from a bunch
of places in kernel/bpf where we added them due to syscall path
or map free path.

It's certainly a follow-up, if you agree.
This patch set will go through the bpf tree
(after, hopefully, a few more acks from reviewers).
Hou Tao Nov. 28, 2024, 4:12 a.m. UTC | #2
Hi,

On 11/27/2024 1:51 PM, Alexei Starovoitov wrote:
> On Tue, Nov 26, 2024 at 4:34 PM Hou Tao <houtao@huaweicloud.com> wrote:
>> 2. nodes are freed before invoking spin_unlock_irqrestore(). Therefore,
>> there is no need to add paired migrate_{disable|enable}() calls for
>> these free operations.
> ...
>
>>         if (ret)
>> -               kfree(new_node);
>> +               bpf_mem_cache_free(&trie->ma, new_node);
>> +       bpf_mem_cache_free_rcu(&trie->ma, free_node);
>>         spin_unlock_irqrestore(&trie->lock, irq_flags);
>> -       kfree_rcu(free_node, rcu);
> ...
>
>> +       bpf_mem_cache_free_rcu(&trie->ma, free_parent);
>> +       bpf_mem_cache_free_rcu(&trie->ma, free_node);
>>         spin_unlock_irqrestore(&trie->lock, irq_flags);
>> -       kfree_rcu(free_parent, rcu);
>> -       kfree_rcu(free_node, rcu);
> going back to under lock wasn't obvious.
> I only understood after reading the commit log for the 2nd time.
>
> Probably a code comment would be good.

Missed that. Will be alert next time.
>
> Though I wonder whether we should add migrate_disable/enable
> in the syscall path of these callbacks.
> We already wrapped them with rcu_read_lock().
> Extra migrate_disable() won't hurt.

It seems that bpf programs are already running with migration
disabled. I think we could do a similar thing for the syscall path,
roughly as in the sketch below.
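
Something like the following where bpf_map_update_value() invokes the
callback (a sketch only; the exact placement, and the other
syscall-path helpers, may differ):

	rcu_read_lock();
	migrate_disable();	/* hypothetical addition for the syscall path */
	err = map->ops->map_update_elem(map, key, value, flags);
	migrate_enable();
	rcu_read_unlock();
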
>
> And it will help this patch. bpf_mem_cache_free_rcu() can be
> done outside of bucket lock.
> bpf_ma can easily exhaust the free list in irq disabled region,
> so the more operations outside of the known irq region the better.
>
> Also it will help remove migrate_disable/enable from a bunch
> of places in kernel/bpf where we added them due to syscall path
> or map free path.
>
> It's certainly a follow up, if you agree.
> This patch set will go through bpf tree
> (after hopefully few more acks from reviewers)
Thanks for the suggestion. Will try it.
Toke Høiland-Jørgensen Nov. 29, 2024, 12:01 p.m. UTC | #3
Hou Tao <houtao@huaweicloud.com> writes:

> From: Hou Tao <houtao1@huawei.com>
>
> [... full commit message quoted above, trimmed ...]
>
> Signed-off-by: Hou Tao <houtao1@huawei.com>

I agree with Alexei's comments, but otherwise:

Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Patch

diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index a1d72d2ec22f..4a3d837e9c9a 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -15,6 +15,7 @@ 
 #include <net/ipv6.h>
 #include <uapi/linux/btf.h>
 #include <linux/btf_ids.h>
+#include <linux/bpf_mem_alloc.h>
 
 /* Intermediate node */
 #define LPM_TREE_NODE_FLAG_IM BIT(0)
@@ -22,7 +23,6 @@ 
 struct lpm_trie_node;
 
 struct lpm_trie_node {
-	struct rcu_head rcu;
 	struct lpm_trie_node __rcu	*child[2];
 	u32				prefixlen;
 	u32				flags;
@@ -32,6 +32,7 @@  struct lpm_trie_node {
 struct lpm_trie {
 	struct bpf_map			map;
 	struct lpm_trie_node __rcu	*root;
+	struct bpf_mem_alloc		ma;
 	size_t				n_entries;
 	size_t				max_prefixlen;
 	size_t				data_size;
@@ -287,17 +288,12 @@  static void *trie_lookup_elem(struct bpf_map *map, void *_key)
 	return found->data + trie->data_size;
 }
 
-static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
+static struct lpm_trie_node *lpm_trie_node_alloc(struct lpm_trie *trie,
 						 const void *value)
 {
 	struct lpm_trie_node *node;
-	size_t size = sizeof(struct lpm_trie_node) + trie->data_size;
 
-	if (value)
-		size += trie->map.value_size;
-
-	node = bpf_map_kmalloc_node(&trie->map, size, GFP_NOWAIT | __GFP_NOWARN,
-				    trie->map.numa_node);
+	node = bpf_mem_cache_alloc(&trie->ma);
 	if (!node)
 		return NULL;
 
@@ -448,9 +444,9 @@  static long trie_update_elem(struct bpf_map *map,
 
 out:
 	if (ret)
-		kfree(new_node);
+		bpf_mem_cache_free(&trie->ma, new_node);
+	bpf_mem_cache_free_rcu(&trie->ma, free_node);
 	spin_unlock_irqrestore(&trie->lock, irq_flags);
-	kfree_rcu(free_node, rcu);
 
 	return ret;
 }
@@ -547,9 +543,9 @@  static long trie_delete_elem(struct bpf_map *map, void *_key)
 	free_node = node;
 
 out:
+	bpf_mem_cache_free_rcu(&trie->ma, free_parent);
+	bpf_mem_cache_free_rcu(&trie->ma, free_node);
 	spin_unlock_irqrestore(&trie->lock, irq_flags);
-	kfree_rcu(free_parent, rcu);
-	kfree_rcu(free_node, rcu);
 
 	return ret;
 }
@@ -571,6 +567,8 @@  static long trie_delete_elem(struct bpf_map *map, void *_key)
 static struct bpf_map *trie_alloc(union bpf_attr *attr)
 {
 	struct lpm_trie *trie;
+	size_t leaf_size;
+	int err;
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 ||
@@ -595,7 +593,17 @@  static struct bpf_map *trie_alloc(union bpf_attr *attr)
 
 	spin_lock_init(&trie->lock);
 
+	/* Allocate intermediate and leaf nodes from the same allocator */
+	leaf_size = sizeof(struct lpm_trie_node) + trie->data_size +
+		    trie->map.value_size;
+	err = bpf_mem_alloc_init(&trie->ma, leaf_size, false);
+	if (err)
+		goto free_out;
 	return &trie->map;
+
+free_out:
+	bpf_map_area_free(trie);
+	return ERR_PTR(err);
 }
 
 static void trie_free(struct bpf_map *map)
@@ -627,13 +635,17 @@  static void trie_free(struct bpf_map *map)
 				continue;
 			}
 
-			kfree(node);
+			/* No bpf program may access the map, so freeing the
+			 * node without waiting for the extra RCU GP.
+			 */
+			bpf_mem_cache_raw_free(node);
 			RCU_INIT_POINTER(*slot, NULL);
 			break;
 		}
 	}
 
 out:
+	bpf_mem_alloc_destroy(&trie->ma);
 	bpf_map_area_free(trie);
 }