
[bpf,1/2] bpf, cpumap: Make sure kthread is running before map update returns

Message ID 20230729095107.1722450-2-houtao@huaweicloud.com (mailing list archive)
State Accepted
Delegated to: BPF
Headers show
Series Two fixes for cpu-map | expand

Checks

Context Check Description
bpf/vmtest-bpf-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-VM_Test-2 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-VM_Test-3 success Logs for build for s390x with gcc
bpf/vmtest-bpf-VM_Test-4 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-VM_Test-5 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-VM_Test-6 success Logs for set-matrix
bpf/vmtest-bpf-VM_Test-7 success Logs for test_maps on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-8 success Logs for test_maps on s390x with gcc
bpf/vmtest-bpf-VM_Test-9 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-10 success Logs for test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-VM_Test-11 success Logs for test_progs on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-12 success Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-VM_Test-13 success Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-14 success Logs for test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-VM_Test-15 success Logs for test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-16 success Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-VM_Test-17 success Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-18 success Logs for test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-VM_Test-19 success Logs for test_progs_no_alu32_parallel on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-20 success Logs for test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-21 success Logs for test_progs_no_alu32_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-VM_Test-22 success Logs for test_progs_parallel on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-23 success Logs for test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-24 success Logs for test_progs_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-VM_Test-25 success Logs for test_verifier on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-26 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-VM_Test-27 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-28 success Logs for test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-VM_Test-29 success Logs for veristat
bpf/vmtest-bpf-PR success PR summary
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf
netdev/fixes_present success Fixes tag present in non-next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1344 this patch: 1344
netdev/cc_maintainers warning 1 maintainers not CCed: yhs@fb.com
netdev/build_clang success Errors and warnings before: 1365 this patch: 1365
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn success Errors and warnings before: 1367 this patch: 1367
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 67 lines checked
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Hou Tao July 29, 2023, 9:51 a.m. UTC
From: Hou Tao <houtao1@huawei.com>

The following warning was reported when running stress-mode enabled
xdp_redirect_cpu with some RT threads:

  ------------[ cut here ]------------
  WARNING: CPU: 4 PID: 65 at kernel/bpf/cpumap.c:135
  CPU: 4 PID: 65 Comm: kworker/4:1 Not tainted 6.5.0-rc2+ #1
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996)
  Workqueue: events cpu_map_kthread_stop
  RIP: 0010:put_cpu_map_entry+0xda/0x220
  ......
  Call Trace:
   <TASK>
   ? show_regs+0x65/0x70
   ? __warn+0xa5/0x240
   ......
   ? put_cpu_map_entry+0xda/0x220
   cpu_map_kthread_stop+0x41/0x60
   process_one_work+0x6b0/0xb80
   worker_thread+0x96/0x720
   kthread+0x1a5/0x1f0
   ret_from_fork+0x3a/0x70
   ret_from_fork_asm+0x1b/0x30
   </TASK>

The root cause is the same as in commit 436901649731 ("bpf: cpumap: Fix
memory leak in cpu_map_update_elem"): the kthread is stopped prematurely
by kthread_stop() in cpu_map_kthread_stop(), so kthread() never calls
cpu_map_kthread_run(), but the XDP program has already queued some
frames or skbs into the ptr_ring. When __cpu_map_ring_cleanup() then
checks the ptr_ring, it finds that it was not emptied and reports a
warning.

An alternative fix would be to use __cpu_map_ring_cleanup() to drop
these pending frames or skbs when kthread_stop() returns -EINTR, but
that may confuse the user, because these frames or skbs have already
been handled correctly by the XDP program. So instead of dropping them,
just make sure the per-cpu kthread is running before
__cpu_map_entry_alloc() returns.

After applying the fix, the error handling for kthread_stop() becomes
unnecessary because it will always return 0, so just remove it.

Fixes: 6710e1126934 ("bpf: introduce new bpf cpu map type BPF_MAP_TYPE_CPUMAP")
Signed-off-by: Hou Tao <houtao1@huawei.com>
---
 kernel/bpf/cpumap.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)
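
As a reading aid, the start-up handshake that the patch adds condenses
to the following sketch (assembled from the hunks quoted below;
simplified, with unrelated error handling elided):

  /* Creator side, in __cpu_map_entry_alloc() */
  init_completion(&rcpu->kthread_running);
  rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa,
                                         "cpumap/%d/map:%d", cpu, map->id);
  ...
  kthread_bind(rcpu->kthread, cpu);
  wake_up_process(rcpu->kthread);
  /* Block until cpu_map_kthread_run() has really started, so that a
   * later kthread_stop() can no longer return -EINTR and leave frames
   * queued in the ptr_ring unhandled.
   */
  wait_for_completion(&rcpu->kthread_running);

  /* Consumer side, at the top of cpu_map_kthread_run() */
  complete(&rcpu->kthread_running);
  set_current_state(TASK_INTERRUPTIBLE);
  /* ... drain rcpu->queue until empty, honoring kthread_should_stop() ... */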

Comments

Pu Lehui July 29, 2023, 10:21 a.m. UTC | #1
On 2023/7/29 17:51, Hou Tao wrote:
> From: Hou Tao <houtao1@huawei.com>
> 
> The following warning was reported when running stress-mode enabled
> xdp_redirect_cpu with some RT threads:
> 
>    ------------[ cut here ]------------
>    WARNING: CPU: 4 PID: 65 at kernel/bpf/cpumap.c:135
>    CPU: 4 PID: 65 Comm: kworker/4:1 Not tainted 6.5.0-rc2+ #1
>    Hardware name: QEMU Standard PC (i440FX + PIIX, 1996)
>    Workqueue: events cpu_map_kthread_stop
>    RIP: 0010:put_cpu_map_entry+0xda/0x220
>    ......
>    Call Trace:
>     <TASK>
>     ? show_regs+0x65/0x70
>     ? __warn+0xa5/0x240
>     ......
>     ? put_cpu_map_entry+0xda/0x220
>     cpu_map_kthread_stop+0x41/0x60
>     process_one_work+0x6b0/0xb80
>     worker_thread+0x96/0x720
>     kthread+0x1a5/0x1f0
>     ret_from_fork+0x3a/0x70
>     ret_from_fork_asm+0x1b/0x30
>     </TASK>
> 
> The root cause is the same as in commit 436901649731 ("bpf: cpumap: Fix
> memory leak in cpu_map_update_elem"): the kthread is stopped prematurely
> by kthread_stop() in cpu_map_kthread_stop(), so kthread() never calls
> cpu_map_kthread_run(), but the XDP program has already queued some
> frames or skbs into the ptr_ring. When __cpu_map_ring_cleanup() then
> checks the ptr_ring, it finds that it was not emptied and reports a
> warning.
> 
> An alternative fix would be to use __cpu_map_ring_cleanup() to drop
> these pending frames or skbs when kthread_stop() returns -EINTR, but
> that may confuse the user, because these frames or skbs have already
> been handled correctly by the XDP program. So instead of dropping them,
> just make sure the per-cpu kthread is running before
> __cpu_map_entry_alloc() returns.
> 
> After applying the fix, the error handling for kthread_stop() becomes
> unnecessary because it will always return 0, so just remove it.
> 
> Fixes: 6710e1126934 ("bpf: introduce new bpf cpu map type BPF_MAP_TYPE_CPUMAP")
> Signed-off-by: Hou Tao <houtao1@huawei.com>
> ---
>   kernel/bpf/cpumap.c | 21 +++++++++++----------
>   1 file changed, 11 insertions(+), 10 deletions(-)
> 
> diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
> index 0a16e30b16ef..08351e0863e5 100644
> --- a/kernel/bpf/cpumap.c
> +++ b/kernel/bpf/cpumap.c
> @@ -28,6 +28,7 @@
>   #include <linux/sched.h>
>   #include <linux/workqueue.h>
>   #include <linux/kthread.h>
> +#include <linux/completion.h>
>   #include <trace/events/xdp.h>
>   #include <linux/btf_ids.h>
>   
> @@ -71,6 +72,7 @@ struct bpf_cpu_map_entry {
>   	struct rcu_head rcu;
>   
>   	struct work_struct kthread_stop_wq;
> +	struct completion kthread_running;
>   };
>   
>   struct bpf_cpu_map {
> @@ -151,7 +153,6 @@ static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
>   static void cpu_map_kthread_stop(struct work_struct *work)
>   {
>   	struct bpf_cpu_map_entry *rcpu;
> -	int err;
>   
>   	rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq);
>   
> @@ -161,14 +162,7 @@ static void cpu_map_kthread_stop(struct work_struct *work)
>   	rcu_barrier();
>   
>   	/* kthread_stop will wake_up_process and wait for it to complete */
> -	err = kthread_stop(rcpu->kthread);
> -	if (err) {
> -		/* kthread_stop may be called before cpu_map_kthread_run
> -		 * is executed, so we need to release the memory related
> -		 * to rcpu.
> -		 */
> -		put_cpu_map_entry(rcpu);
> -	}
> +	kthread_stop(rcpu->kthread);
>   }

Looks good to me.

Feel free to add:
Reviewed-by: Pu Lehui <pulehui@huawei.com>

>   
>   static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
> @@ -296,11 +290,11 @@ static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
>   	return nframes;
>   }
>   
> -
>   static int cpu_map_kthread_run(void *data)
>   {
>   	struct bpf_cpu_map_entry *rcpu = data;
>   
> +	complete(&rcpu->kthread_running);
>   	set_current_state(TASK_INTERRUPTIBLE);
>   
>   	/* When kthread gives stop order, then rcpu have been disconnected
> @@ -465,6 +459,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
>   		goto free_ptr_ring;
>   
>   	/* Setup kthread */
> +	init_completion(&rcpu->kthread_running);
>   	rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa,
>   					       "cpumap/%d/map:%d", cpu,
>   					       map->id);
> @@ -478,6 +473,12 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
>   	kthread_bind(rcpu->kthread, cpu);
>   	wake_up_process(rcpu->kthread);
>   
> +	/* Make sure kthread has been running, so kthread_stop() will not
> +	 * stop the kthread prematurely and all pending frames or skbs
> +	 * will be handled by the kthread before kthread_stop() returns.
> +	 */
> +	wait_for_completion(&rcpu->kthread_running);
> +
>   	return rcpu;
>   
>   free_prog:
Jesper Dangaard Brouer July 31, 2023, 9:51 a.m. UTC | #2
On 29/07/2023 11.51, Hou Tao wrote:
> From: Hou Tao <houtao1@huawei.com>
> 
> The following warning was reported when running stress-mode enabled
> xdp_redirect_cpu with some RT threads:

Cool stress-mode test that leverages RT to provoke this.

> 
>    ------------[ cut here ]------------
>    WARNING: CPU: 4 PID: 65 at kernel/bpf/cpumap.c:135
>    CPU: 4 PID: 65 Comm: kworker/4:1 Not tainted 6.5.0-rc2+ #1

As you mention RT, I want to mention that it is also possible to change
the sched prio on the kthread PID.
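
For example, a small userspace helper along these lines (hypothetical,
not part of this patch; the cpumap kthread's PID is assumed to be known,
e.g. from ps) can give the kthread an RT priority, roughly what
"chrt -f -p 50 <pid>" does:

  #include <sched.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <sys/types.h>

  int main(int argc, char **argv)
  {
          /* SCHED_FIFO with a mid-range RT priority; adjust as needed. */
          struct sched_param param = { .sched_priority = 50 };

          if (argc != 2) {
                  fprintf(stderr, "usage: %s <cpumap-kthread-pid>\n", argv[0]);
                  return 1;
          }
          /* Needs CAP_SYS_NICE (typically root). */
          if (sched_setscheduler((pid_t)atoi(argv[1]), SCHED_FIFO, &param)) {
                  perror("sched_setscheduler");
                  return 1;
          }
          return 0;
  }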

>    Hardware name: QEMU Standard PC (i440FX + PIIX, 1996)
>    Workqueue: events cpu_map_kthread_stop
>    RIP: 0010:put_cpu_map_entry+0xda/0x220
>    ......
>    Call Trace:
>     <TASK>
>     ? show_regs+0x65/0x70
>     ? __warn+0xa5/0x240
>     ......
>     ? put_cpu_map_entry+0xda/0x220
>     cpu_map_kthread_stop+0x41/0x60
>     process_one_work+0x6b0/0xb80
>     worker_thread+0x96/0x720
>     kthread+0x1a5/0x1f0
>     ret_from_fork+0x3a/0x70
>     ret_from_fork_asm+0x1b/0x30
>     </TASK>
> 
> The root cause is the same as in commit 436901649731 ("bpf: cpumap: Fix
> memory leak in cpu_map_update_elem"): the kthread is stopped prematurely
> by kthread_stop() in cpu_map_kthread_stop(), so kthread() never calls
> cpu_map_kthread_run(), but the XDP program has already queued some
> frames or skbs into the ptr_ring. When __cpu_map_ring_cleanup() then
> checks the ptr_ring, it finds that it was not emptied and reports a
> warning.
> 
> An alternative fix would be to use __cpu_map_ring_cleanup() to drop
> these pending frames or skbs when kthread_stop() returns -EINTR, but
> that may confuse the user, because these frames or skbs have already
> been handled correctly by the XDP program. So instead of dropping them,
> just make sure the per-cpu kthread is running before
> __cpu_map_entry_alloc() returns.
> 
> After applying the fix, the error handling for kthread_stop() becomes
> unnecessary because it will always return 0, so just remove it.
> 
> Fixes: 6710e1126934 ("bpf: introduce new bpf cpu map type BPF_MAP_TYPE_CPUMAP")
> Signed-off-by: Hou Tao <houtao1@huawei.com>

Acked-by: Jesper Dangaard Brouer <hawk@kernel.org>

Thanks for catching this!

> ---
>   kernel/bpf/cpumap.c | 21 +++++++++++----------
>   1 file changed, 11 insertions(+), 10 deletions(-)
> 
> diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
> index 0a16e30b16ef..08351e0863e5 100644
> --- a/kernel/bpf/cpumap.c
> +++ b/kernel/bpf/cpumap.c
> @@ -28,6 +28,7 @@
>   #include <linux/sched.h>
>   #include <linux/workqueue.h>
>   #include <linux/kthread.h>
> +#include <linux/completion.h>
>   #include <trace/events/xdp.h>
>   #include <linux/btf_ids.h>
>   
> @@ -71,6 +72,7 @@ struct bpf_cpu_map_entry {
>   	struct rcu_head rcu;
>   
>   	struct work_struct kthread_stop_wq;
> +	struct completion kthread_running;
>   };
>   
>   struct bpf_cpu_map {
> @@ -151,7 +153,6 @@ static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
>   static void cpu_map_kthread_stop(struct work_struct *work)
>   {
>   	struct bpf_cpu_map_entry *rcpu;
> -	int err;
>   
>   	rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq);
>   
> @@ -161,14 +162,7 @@ static void cpu_map_kthread_stop(struct work_struct *work)
>   	rcu_barrier();
>   
>   	/* kthread_stop will wake_up_process and wait for it to complete */
> -	err = kthread_stop(rcpu->kthread);
> -	if (err) {
> -		/* kthread_stop may be called before cpu_map_kthread_run
> -		 * is executed, so we need to release the memory related
> -		 * to rcpu.
> -		 */
> -		put_cpu_map_entry(rcpu);
> -	}
> +	kthread_stop(rcpu->kthread);
>   }
>   
>   static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
> @@ -296,11 +290,11 @@ static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
>   	return nframes;
>   }
>   
> -
>   static int cpu_map_kthread_run(void *data)
>   {
>   	struct bpf_cpu_map_entry *rcpu = data;
>   
> +	complete(&rcpu->kthread_running);
>   	set_current_state(TASK_INTERRUPTIBLE);
>   

The diff is missing the next lines, which show that this is correct.
I checked this manually; for other reviewers, here are the next lines:

	set_current_state(TASK_INTERRUPTIBLE);

	/* When kthread gives stop order, then rcpu have been disconnected
	 * from map, thus no new packets can enter. Remaining in-flight
	 * per CPU stored packets are flushed to this queue.  Wait honoring
	 * kthread_stop signal until queue is empty.
	 */
	while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {

The patch is correct in placing complete(&rcpu->kthread_running) before
the while loop, as the loop condition also checks whether the ptr_ring
is non-empty.
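
To make the ordering concrete, the run loop with the new complete() call
looks roughly like this (condensed from the lines above and the patch;
the loop body and final teardown are elided):

  static int cpu_map_kthread_run(void *data)
  {
          struct bpf_cpu_map_entry *rcpu = data;

          /* Lets __cpu_map_entry_alloc() return only once the thread
           * function has started; from this point on kthread_stop()
           * waits for the function to finish instead of returning
           * -EINTR.
           */
          complete(&rcpu->kthread_running);
          set_current_state(TASK_INTERRUPTIBLE);

          /* Even if kthread_should_stop() is already true, the second
           * half of the condition keeps draining rcpu->queue, so frames
           * queued by the XDP program before the stop request are still
           * processed instead of being left for __cpu_map_ring_cleanup().
           */
          while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
                  /* ... dequeue frames/skbs from rcpu->queue and process them ... */
          }

          /* ... teardown elided ... */
          return 0;
  }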


>   	/* When kthread gives stop order, then rcpu have been disconnected
> @@ -465,6 +459,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
>   		goto free_ptr_ring;
>   
>   	/* Setup kthread */
> +	init_completion(&rcpu->kthread_running);
>   	rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa,
>   					       "cpumap/%d/map:%d", cpu,
>   					       map->id);
> @@ -478,6 +473,12 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
>   	kthread_bind(rcpu->kthread, cpu);
>   	wake_up_process(rcpu->kthread);
>   
> +	/* Make sure kthread has been running, so kthread_stop() will not
> +	 * stop the kthread prematurely and all pending frames or skbs
> +	 * will be handled by the kthread before kthread_stop() returns.
> +	 */
> +	wait_for_completion(&rcpu->kthread_running);
> +
>   	return rcpu;
>   
>   free_prog:

Patch

diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 0a16e30b16ef..08351e0863e5 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -28,6 +28,7 @@ 
 #include <linux/sched.h>
 #include <linux/workqueue.h>
 #include <linux/kthread.h>
+#include <linux/completion.h>
 #include <trace/events/xdp.h>
 #include <linux/btf_ids.h>
 
@@ -71,6 +72,7 @@  struct bpf_cpu_map_entry {
 	struct rcu_head rcu;
 
 	struct work_struct kthread_stop_wq;
+	struct completion kthread_running;
 };
 
 struct bpf_cpu_map {
@@ -151,7 +153,6 @@  static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
 static void cpu_map_kthread_stop(struct work_struct *work)
 {
 	struct bpf_cpu_map_entry *rcpu;
-	int err;
 
 	rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq);
 
@@ -161,14 +162,7 @@  static void cpu_map_kthread_stop(struct work_struct *work)
 	rcu_barrier();
 
 	/* kthread_stop will wake_up_process and wait for it to complete */
-	err = kthread_stop(rcpu->kthread);
-	if (err) {
-		/* kthread_stop may be called before cpu_map_kthread_run
-		 * is executed, so we need to release the memory related
-		 * to rcpu.
-		 */
-		put_cpu_map_entry(rcpu);
-	}
+	kthread_stop(rcpu->kthread);
 }
 
 static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
@@ -296,11 +290,11 @@  static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
 	return nframes;
 }
 
-
 static int cpu_map_kthread_run(void *data)
 {
 	struct bpf_cpu_map_entry *rcpu = data;
 
+	complete(&rcpu->kthread_running);
 	set_current_state(TASK_INTERRUPTIBLE);
 
 	/* When kthread gives stop order, then rcpu have been disconnected
@@ -465,6 +459,7 @@  __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
 		goto free_ptr_ring;
 
 	/* Setup kthread */
+	init_completion(&rcpu->kthread_running);
 	rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa,
 					       "cpumap/%d/map:%d", cpu,
 					       map->id);
@@ -478,6 +473,12 @@  __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
 	kthread_bind(rcpu->kthread, cpu);
 	wake_up_process(rcpu->kthread);
 
+	/* Make sure kthread has been running, so kthread_stop() will not
+	 * stop the kthread prematurely and all pending frames or skbs
+	 * will be handled by the kthread before kthread_stop() returns.
+	 */
+	wait_for_completion(&rcpu->kthread_running);
+
 	return rcpu;
 
 free_prog: