[bpf-next,07/10] bpf: Switch to bpf mem allocator for LPM trie

Message ID 20241118010808.2243555-8-houtao@huaweicloud.com (mailing list archive)
State: Changes Requested
Delegated to: BPF
Series: Fixes for LPM trie

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-17 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-18 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 3 this patch: 3
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers success CCed 13 of 13 maintainers
netdev/build_clang success Errors and warnings before: 3 this patch: 3
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 4 this patch: 4
netdev/checkpatch warning WARNING: The commit message has 'syzkaller', perhaps it also needs a 'Fixes:' tag? WARNING: line length of 90 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 1 this patch: 1
netdev/source_inline success Was 0 now: 0

Commit Message

Hou Tao Nov. 18, 2024, 1:08 a.m. UTC
From: Hou Tao <houtao1@huawei.com>

Multiple syzbot warnings have been reported. These warnings are mainly
about the lock order between trie->lock and kmalloc()'s internal lock.
See report [1] as an example:

======================================================
WARNING: possible circular locking dependency detected
6.10.0-rc7-syzkaller-00003-g4376e966ecb7 #0 Not tainted
------------------------------------------------------
syz.3.2069/15008 is trying to acquire lock:
ffff88801544e6d8 (&n->list_lock){-.-.}-{2:2}, at: get_partial_node ...

but task is already holding lock:
ffff88802dcc89f8 (&trie->lock){-.-.}-{2:2}, at: trie_update_elem ...

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #1 (&trie->lock){-.-.}-{2:2}:
       __raw_spin_lock_irqsave
       _raw_spin_lock_irqsave+0x3a/0x60
       trie_delete_elem+0xb0/0x820
       ___bpf_prog_run+0x3e51/0xabd0
       __bpf_prog_run32+0xc1/0x100
       bpf_dispatcher_nop_func
       ......
       bpf_trace_run2+0x231/0x590
       __bpf_trace_contention_end+0xca/0x110
       trace_contention_end.constprop.0+0xea/0x170
       __pv_queued_spin_lock_slowpath+0x28e/0xcc0
       pv_queued_spin_lock_slowpath
       queued_spin_lock_slowpath
       queued_spin_lock
       do_raw_spin_lock+0x210/0x2c0
       __raw_spin_lock_irqsave
       _raw_spin_lock_irqsave+0x42/0x60
       __put_partials+0xc3/0x170
       qlink_free
       qlist_free_all+0x4e/0x140
       kasan_quarantine_reduce+0x192/0x1e0
       __kasan_slab_alloc+0x69/0x90
       kasan_slab_alloc
       slab_post_alloc_hook
       slab_alloc_node
       kmem_cache_alloc_node_noprof+0x153/0x310
       __alloc_skb+0x2b1/0x380
       ......

-> #0 (&n->list_lock){-.-.}-{2:2}:
       check_prev_add
       check_prevs_add
       validate_chain
       __lock_acquire+0x2478/0x3b30
       lock_acquire
       lock_acquire+0x1b1/0x560
       __raw_spin_lock_irqsave
       _raw_spin_lock_irqsave+0x3a/0x60
       get_partial_node.part.0+0x20/0x350
       get_partial_node
       get_partial
       ___slab_alloc+0x65b/0x1870
       __slab_alloc.constprop.0+0x56/0xb0
       __slab_alloc_node
       slab_alloc_node
       __do_kmalloc_node
       __kmalloc_node_noprof+0x35c/0x440
       kmalloc_node_noprof
       bpf_map_kmalloc_node+0x98/0x4a0
       lpm_trie_node_alloc
       trie_update_elem+0x1ef/0xe00
       bpf_map_update_value+0x2c1/0x6c0
       map_update_elem+0x623/0x910
       __sys_bpf+0x90c/0x49a0
       ...

other info that might help us debug this:

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(&trie->lock);
                               lock(&n->list_lock);
                               lock(&trie->lock);
  lock(&n->list_lock);

 *** DEADLOCK ***

[1]: https://syzkaller.appspot.com/bug?extid=9045c0a3d5a7f1b119f7

A bpf program attached to the trace_contention_end() tracepoint runs
right after &n->list_lock has been acquired, while the lock is still
held. The program invokes trie_delete_elem(), which then acquires
trie->lock. Meanwhile, another process may be running
trie_update_elem(): it acquires trie->lock first and then calls
kmalloc_node(), which may in turn call get_partial_node() and try to
acquire an &n->list_lock (not necessarily the same lock object).
Lockdep therefore warns about a circular locking dependency.

Fix these warnings by replacing kmalloc()/kfree()/kfree_rcu() with the
equivalent bpf memory allocator APIs. Since intermediate nodes and leaf
nodes each have a fixed size, the fixed-size allocation APIs are used.
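
In condensed form, the switch maps the old calls onto the bpf memory
allocator APIs as follows (a sketch distilled from the patch below,
not verbatim code):

  /* one fixed-size cache per node size, set up once in trie_alloc() */
  err = bpf_mem_alloc_init(ma, node_size, false);

  node = bpf_mem_cache_alloc(ma);       /* was bpf_map_kmalloc_node() */
  bpf_mem_cache_free(ma, node);         /* was kfree() */
  bpf_mem_cache_free_rcu(ma, node);     /* was kfree_rcu(node, rcu) */
  bpf_mem_alloc_destroy(ma);            /* on map teardown */

where node_size is sizeof(struct lpm_trie_node) + data_size, plus
value_size for leaf nodes.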

Two aspects of this change require explanation:

1. A new flag, LPM_TREE_NODE_FLAG_ALLOC_LEAF, is added to record which
   allocator a node came from. This is necessary because, after a
   deletion, a leaf node may be reused as an intermediate node; such
   nodes must still be freed through the leaf allocator (see the
   condensed selector below).
2. The intermediate node allocator and the leaf node allocator may be
   merged, because value_size for an LPM trie is usually small. Merging
   them reduces the memory overhead of the bpf memory allocator.
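
Condensed from lpm_trie_node_free() in the patch below, the selector
for point 1 is simply:

  /* The flag recorded at allocation time, not the node's current
   * role in the trie, decides which cache the node is returned to.
   */
  ma = (node->flags & LPM_TREE_NODE_FLAG_ALLOC_LEAF) ? trie->leaf_ma :
                                                       trie->im_ma;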

Signed-off-by: Hou Tao <houtao1@huawei.com>
---
 kernel/bpf/lpm_trie.c | 90 +++++++++++++++++++++++++++++++++++--------
 1 file changed, 74 insertions(+), 16 deletions(-)

Comments

Sebastian Andrzej Siewior Nov. 18, 2024, 1:30 p.m. UTC | #1
On 2024-11-18 09:08:05 [+0800], Hou Tao wrote:
> diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
> index d447a6dab83b..d8995acecedf 100644
> --- a/kernel/bpf/lpm_trie.c
> +++ b/kernel/bpf/lpm_trie.c
> @@ -319,6 +326,25 @@ static int trie_check_noreplace_update(const struct lpm_trie *trie, u64 flags)
>  	return 0;
>  }
>  
> +static void lpm_trie_node_free(struct lpm_trie *trie,
> +			       struct lpm_trie_node *node, bool defer)
> +{
> +	struct bpf_mem_alloc *ma;
> +
> +	if (!node)
> +		return;
> +
> +	ma = (node->flags & LPM_TREE_NODE_FLAG_ALLOC_LEAF) ? trie->leaf_ma :
> +							     trie->im_ma;
> +
> +	migrate_disable();
> +	if (defer)
> +		bpf_mem_cache_free_rcu(ma, node);
> +	else
> +		bpf_mem_cache_free(ma, node);
> +	migrate_enable();

I guess a preempt_disable() here instead wouldn't hurt much. The inner
pieces of the allocator (unit_free()) do local_irq_save() for the
entire function, so we don't win much with migrate_disable().

> +}
> +

Sebastian
Yonghong Song Nov. 18, 2024, 4:56 p.m. UTC | #2
On 11/18/24 5:30 AM, Sebastian Andrzej Siewior wrote:
> On 2024-11-18 09:08:05 [+0800], Hou Tao wrote:
>> diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
>> index d447a6dab83b..d8995acecedf 100644
>> --- a/kernel/bpf/lpm_trie.c
>> +++ b/kernel/bpf/lpm_trie.c
>> @@ -319,6 +326,25 @@ static int trie_check_noreplace_update(const struct lpm_trie *trie, u64 flags)
>>   	return 0;
>>   }
>>   
>> +static void lpm_trie_node_free(struct lpm_trie *trie,
>> +			       struct lpm_trie_node *node, bool defer)
>> +{
>> +	struct bpf_mem_alloc *ma;
>> +
>> +	if (!node)
>> +		return;
>> +
>> +	ma = (node->flags & LPM_TREE_NODE_FLAG_ALLOC_LEAF) ? trie->leaf_ma :
>> +							     trie->im_ma;
>> +
>> +	migrate_disable();
>> +	if (defer)
>> +		bpf_mem_cache_free_rcu(ma, node);
>> +	else
>> +		bpf_mem_cache_free(ma, node);
>> +	migrate_enable();
> I guess a preempt_disable() here instead wouldn't hurt much. The inner
> pieces of the allocator (unit_free()) do local_irq_save() for the
> entire function, so we don't win much with migrate_disable().

Typically, bpf_mem_*() functions are surrounded directly
or indirectly by migrate_disable/enable. Let us just keep
this pattern to be consistent with other similar usage.
One close example is in kernel/bpf/cpumask.c.

>
>> +}
>> +
> Sebastian
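
For reference, the pattern Yonghong points to in kernel/bpf/cpumask.c
looks roughly like this (a reconstructed sketch, not a verbatim quote
of that file):

  static void cpumask_free_cb(struct rcu_head *head)
  {
          struct bpf_cpumask *cpumask;

          cpumask = container_of(head, struct bpf_cpumask, rcu);
          /* may run from RCU callback context, outside any bpf program,
           * so pin the task to keep the allocator's per-CPU state stable
           */
          migrate_disable();
          bpf_mem_cache_free(&bpf_cpumask_ma, cpumask);
          migrate_enable();
  }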

Patch

diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index d447a6dab83b..d8995acecedf 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -15,23 +15,34 @@ 
 #include <net/ipv6.h>
 #include <uapi/linux/btf.h>
 #include <linux/btf_ids.h>
+#include <linux/bpf_mem_alloc.h>
 
 /* Intermediate node */
 #define LPM_TREE_NODE_FLAG_IM BIT(0)
+/* Allocated as a leaf node. It may be used as an intermediate node after deletion */
+#define LPM_TREE_NODE_FLAG_ALLOC_LEAF BIT(1)
 
 struct lpm_trie_node;
 
 struct lpm_trie_node {
-	struct rcu_head rcu;
 	struct lpm_trie_node __rcu	*child[2];
 	u32				prefixlen;
 	u32				flags;
 	u8				data[];
 };
 
+enum {
+	LPM_TRIE_MA_IM = 0,
+	LPM_TRIE_MA_LEAF,
+	LPM_TRIE_MA_CNT,
+};
+
 struct lpm_trie {
 	struct bpf_map			map;
 	struct lpm_trie_node __rcu	*root;
+	struct bpf_mem_alloc		ma[LPM_TRIE_MA_CNT];
+	struct bpf_mem_alloc		*im_ma;
+	struct bpf_mem_alloc		*leaf_ma;
 	size_t				n_entries;
 	size_t				max_prefixlen;
 	size_t				data_size;
@@ -287,25 +298,21 @@  static void *trie_lookup_elem(struct bpf_map *map, void *_key)
 	return found->data + trie->data_size;
 }
 
-static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
-						 const void *value)
+static struct lpm_trie_node *lpm_trie_node_alloc(struct lpm_trie *trie, const void *value)
 {
+	struct bpf_mem_alloc *ma = value ? trie->leaf_ma : trie->im_ma;
 	struct lpm_trie_node *node;
-	size_t size = sizeof(struct lpm_trie_node) + trie->data_size;
-
-	if (value)
-		size += trie->map.value_size;
 
-	node = bpf_map_kmalloc_node(&trie->map, size, GFP_NOWAIT | __GFP_NOWARN,
-				    trie->map.numa_node);
+	node = bpf_mem_cache_alloc(ma);
 	if (!node)
 		return NULL;
 
 	node->flags = 0;
-
-	if (value)
+	if (value) {
+		node->flags |= LPM_TREE_NODE_FLAG_ALLOC_LEAF;
 		memcpy(node->data + trie->data_size, value,
 		       trie->map.value_size);
+	}
 
 	return node;
 }
@@ -319,6 +326,25 @@  static int trie_check_noreplace_update(const struct lpm_trie *trie, u64 flags)
 	return 0;
 }
 
+static void lpm_trie_node_free(struct lpm_trie *trie,
+			       struct lpm_trie_node *node, bool defer)
+{
+	struct bpf_mem_alloc *ma;
+
+	if (!node)
+		return;
+
+	ma = (node->flags & LPM_TREE_NODE_FLAG_ALLOC_LEAF) ? trie->leaf_ma :
+							     trie->im_ma;
+
+	migrate_disable();
+	if (defer)
+		bpf_mem_cache_free_rcu(ma, node);
+	else
+		bpf_mem_cache_free(ma, node);
+	migrate_enable();
+}
+
 /* Called from syscall or from eBPF program */
 static long trie_update_elem(struct bpf_map *map,
 			     void *_key, void *value, u64 flags)
@@ -450,9 +476,10 @@  static long trie_update_elem(struct bpf_map *map,
 
 out:
 	if (ret)
-		kfree(new_node);
+		bpf_mem_cache_free(trie->leaf_ma, new_node);
+
 	spin_unlock_irqrestore(&trie->lock, irq_flags);
-	kfree_rcu(free_node, rcu);
+	lpm_trie_node_free(trie, free_node, true);
 
 	return ret;
 }
@@ -550,8 +577,8 @@  static long trie_delete_elem(struct bpf_map *map, void *_key)
 
 out:
 	spin_unlock_irqrestore(&trie->lock, irq_flags);
-	kfree_rcu(free_parent, rcu);
-	kfree_rcu(free_node, rcu);
+	lpm_trie_node_free(trie, free_parent, true);
+	lpm_trie_node_free(trie, free_node, true);
 
 	return ret;
 }
@@ -572,7 +599,9 @@  static long trie_delete_elem(struct bpf_map *map, void *_key)
 
 static struct bpf_map *trie_alloc(union bpf_attr *attr)
 {
+	size_t size, leaf_size;
 	struct lpm_trie *trie;
+	int err;
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 ||
@@ -597,7 +626,30 @@  static struct bpf_map *trie_alloc(union bpf_attr *attr)
 
 	spin_lock_init(&trie->lock);
 
+	size = sizeof(struct lpm_trie_node) + trie->data_size;
+	err = bpf_mem_alloc_init(&trie->ma[LPM_TRIE_MA_IM], size, false);
+	if (err)
+		goto free_out;
+	trie->im_ma = &trie->ma[LPM_TRIE_MA_IM];
+
+	leaf_size = size + trie->map.value_size;
+	if (bpf_mem_cache_is_mergeable(size, leaf_size, false)) {
+		trie->leaf_ma = trie->im_ma;
+	} else {
+		err = bpf_mem_alloc_init(&trie->ma[LPM_TRIE_MA_LEAF],
+					 leaf_size, false);
+		if (err)
+			goto destroy_ma_out;
+		trie->leaf_ma = &trie->ma[LPM_TRIE_MA_LEAF];
+	}
+
 	return &trie->map;
+
+destroy_ma_out:
+	bpf_mem_alloc_destroy(trie->im_ma);
+free_out:
+	bpf_map_area_free(trie);
+	return ERR_PTR(err);
 }
 
 static void trie_free(struct bpf_map *map)
@@ -629,13 +681,19 @@  static void trie_free(struct bpf_map *map)
 				continue;
 			}
 
-			kfree(node);
+			/* No bpf program may access the map, so free the
+			 * node without waiting for an extra RCU GP.
+			 */
+			lpm_trie_node_free(trie, node, false);
 			RCU_INIT_POINTER(*slot, NULL);
 			break;
 		}
 	}
 
 out:
+	bpf_mem_alloc_destroy(trie->im_ma);
+	if (trie->leaf_ma != trie->im_ma)
+		bpf_mem_alloc_destroy(trie->leaf_ma);
 	bpf_map_area_free(trie);
 }