
[bpf-next] bpf: Alloc bpf_async_cb by using bpf_global_ma under PREEMPT_RT

Message ID 20250114081338.2375090-1-houtao@huaweicloud.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Series [bpf-next] bpf: Alloc bpf_async_cb by using bpf_global_ma under PREEMPT_RT

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
netdev/series_format success Single patches do not need cover letters
netdev/tree_selection success Clearly marked for bpf-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1 this patch: 1
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers warning 3 maintainers not CCed: clrkwllms@kernel.org linux-rt-devel@lists.linux.dev rostedt@goodmis.org
netdev/build_clang success Errors and warnings before: 19 this patch: 19
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 58 this patch: 58
netdev/checkpatch warning WARNING: The commit message has 'Call Trace:', perhaps it also needs a 'Fixes:' tag? WARNING: line length of 81 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 11 this patch: 11
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-48 success Logs for x86_64-llvm-18 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-49 success Logs for x86_64-llvm-18 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-47 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-11 success Logs for aarch64-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-18 success Logs for s390x-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-19 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-17 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-17 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-46 success Logs for x86_64-llvm-18 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-45 success Logs for x86_64-llvm-18 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-gcc / veristat-kernel / x86_64-gcc veristat_kernel
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-gcc / veristat-meta / x86_64-gcc veristat_meta
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-44 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-43 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18

Commit Message

Hou Tao Jan. 14, 2025, 8:13 a.m. UTC
From: Hou Tao <houtao1@huawei.com>

Under PREEMPT_RT, it is not safe to use GFP_ATOMIC kmalloc when
preemption or IRQs are disabled. The following warning is reported when
running test_progs under PREEMPT_RT:

  BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48
  in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 675, name: test_progs
  preempt_count: 1, expected: 0
  RCU nest depth: 0, expected: 0
  2 locks held by test_progs/675:
   #0: ffffffff864b0240 (rcu_read_lock_trace){....}-{0:0}, at: bpf_prog_test_run_syscall+0x2c0/0x830
   #1: ffff8881f4ec40c8 ((&c->lock)){....}-{2:2}, at: ___slab_alloc+0xbc/0x1280
  Preemption disabled at:
  [<ffffffff8175ae2b>] __bpf_async_init+0xbb/0xb10
  CPU: 1 UID: 0 PID: 675 Comm: test_progs Tainted: G           O       6.12.0+ #11
  Tainted: [O]=OOT_MODULE
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) ...
  Call Trace:
   <TASK>
   dump_stack_lvl+0x57/0x70
   dump_stack+0x10/0x20
   __might_resched+0x337/0x4d0
   rt_spin_lock+0xd4/0x230
   ___slab_alloc+0xbc/0x1280
   __slab_alloc.isra.0+0x5d/0xa0
   __kmalloc_node_noprof+0xf7/0x4f0
   bpf_map_kmalloc_node+0xf5/0x6b0
   __bpf_async_init+0x20e/0xb10
   bpf_timer_init+0x30/0x40
   bpf_prog_c7e2dc9ff3d5ba62_start_cb+0x55/0x85
   bpf_prog_4eb421be69ae82fa_start_timer+0x5d/0x7e
   bpf_prog_test_run_syscall+0x322/0x830
   __sys_bpf+0x135d/0x3ca0
   __x64_sys_bpf+0x75/0xb0
   x64_sys_call+0x1b5/0xa10
   do_syscall_64+0x3b/0xc0
   entry_SYSCALL_64_after_hwframe+0x4b/0x53

Fix the problem by using bpf_global_ma to allocate bpf_async_cb when
PREEMPT_RT is enabled. The reason for still using kmalloc in the
non-PREEMPT_RT case is that bpf_global_ma doesn't support accounting
the allocated memory to a specific memcg. Also do the memory allocation
before invoking __bpf_spin_lock_irqsave() to reduce the possibility of
-ENOMEM from bpf_global_ma.
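
In short, the allocation moves in front of the lock and a small helper
picks the allocator; the condensed sketch below mirrors the full patch
at the end of this message:

	cb = bpf_async_alloc(map, size);	/* bpf_global_ma on RT, kmalloc otherwise */
	if (!cb)
		return -ENOMEM;

	__bpf_spin_lock_irqsave(&async->lock);
	t = async->timer;
	if (t) {
		/* lost the race against a concurrent init: drop the allocation */
		bpf_async_free(cb);
		ret = -EBUSY;
		goto out;
	}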

Signed-off-by: Hou Tao <houtao1@huawei.com>
---
 kernel/bpf/helpers.c | 48 +++++++++++++++++++++++++++++++++++---------
 1 file changed, 38 insertions(+), 10 deletions(-)

Comments

Hou Tao Jan. 17, 2025, 8:42 a.m. UTC | #1
On 1/14/2025 4:13 PM, Hou Tao wrote:
> From: Hou Tao <houtao1@huawei.com>
>
> Under PREEMPT_RT, it is not safe to use GFP_ATOMIC kmalloc when
> preemption or IRQs are disabled. The following warning is reported when
> running test_progs under PREEMPT_RT:
>
>   

SNIP
> +	bpf_async_free_rcu(&w->cb);
>  }
>  
>  static void bpf_timer_delete_work(struct work_struct *work)
> @@ -1236,7 +1266,7 @@ static void bpf_timer_delete_work(struct work_struct *work)
>  	 * bpf_timer_cancel_and_free will have been cancelled.
>  	 */
>  	hrtimer_cancel(&t->timer);
> -	kfree_rcu(t, cb.rcu);
> +	bpf_async_free_rcu(&t->cb);
>  }

Er, it is buggy here. The migrate_{disable|enable} pair is missing
because this runs in kworker context. Found it when applying the patch
on top of the patch set "Free htab element out of bucket lock v2". Will
send v2 for it.
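
For reference, the v2 fix presumably wraps the kworker-context frees in
the missing pair, along these lines (an illustrative sketch, not the
posted v2; bpf_global_ma's free path expects migration to be disabled,
which BPF program context guarantees but a kworker does not):

	static void bpf_timer_delete_work(struct work_struct *work)
	{
		struct bpf_hrtimer *t = container_of(work, struct bpf_hrtimer, cb.delete_work);

		hrtimer_cancel(&t->timer);
		/* kworker context: disable migration before handing cb to bpf_global_ma */
		migrate_disable();
		bpf_async_free_rcu(&t->cb);
		migrate_enable();
	}

bpf_wq_delete_work would need the same treatment around its
bpf_async_free_rcu() call.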

Patch

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index bcda671feafd9..5041f22812936 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1109,12 +1109,14 @@ struct bpf_async_cb {
  * freeing the timers when inner map is replaced or deleted by user space.
  */
 struct bpf_hrtimer {
+	/* cb must be the first member */
 	struct bpf_async_cb cb;
 	struct hrtimer timer;
 	atomic_t cancelling;
 };
 
 struct bpf_work {
+	/* cb must be the first member */
 	struct bpf_async_cb cb;
 	struct work_struct work;
 	struct work_struct delete_work;
@@ -1141,6 +1143,34 @@ enum bpf_async_type {
 
 static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running);
 
+static void bpf_async_free(struct bpf_async_cb *cb)
+{
+	if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		bpf_mem_free(&bpf_global_ma, cb);
+	else
+		kfree(cb);
+}
+
+static void bpf_async_free_rcu(struct bpf_async_cb *cb)
+{
+	if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		bpf_mem_free_rcu(&bpf_global_ma, cb);
+	else
+		kfree_rcu(cb, rcu);
+}
+
+static struct bpf_async_cb *bpf_async_alloc(struct bpf_map *map, size_t size)
+{
+	struct bpf_async_cb *cb;
+
+	if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		cb = bpf_mem_alloc(&bpf_global_ma, size);
+	else
+		/* allocate hrtimer via map_kmalloc to use memcg accounting */
+		cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node);
+	return cb;
+}
+
 static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
 {
 	struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer);
@@ -1221,7 +1251,7 @@ static void bpf_wq_delete_work(struct work_struct *work)
 
 	cancel_work_sync(&w->work);
 
-	kfree_rcu(w, cb.rcu);
+	bpf_async_free_rcu(&w->cb);
 }
 
 static void bpf_timer_delete_work(struct work_struct *work)
@@ -1236,7 +1266,7 @@ static void bpf_timer_delete_work(struct work_struct *work)
 	 * bpf_timer_cancel_and_free will have been cancelled.
 	 */
 	hrtimer_cancel(&t->timer);
-	kfree_rcu(t, cb.rcu);
+	bpf_async_free_rcu(&t->cb);
 }
 
 static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,
@@ -1263,20 +1293,18 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
 		return -EINVAL;
 	}
 
+	cb = bpf_async_alloc(map, size);
+	if (!cb)
+		return -ENOMEM;
+
 	__bpf_spin_lock_irqsave(&async->lock);
 	t = async->timer;
 	if (t) {
+		bpf_async_free(cb);
 		ret = -EBUSY;
 		goto out;
 	}
 
-	/* allocate hrtimer via map_kmalloc to use memcg accounting */
-	cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node);
-	if (!cb) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
 	switch (type) {
 	case BPF_ASYNC_TYPE_TIMER:
 		clockid = flags & (MAX_CLOCKS - 1);
@@ -1313,7 +1341,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
 		 * or pinned in bpffs.
 		 */
 		WRITE_ONCE(async->cb, NULL);
-		kfree(cb);
+		bpf_async_free(cb);
 		ret = -EPERM;
 	}
 out: