diff mbox series

[bpf-next,v2,1/2] bpf: fix NULL pointer dereference in bpf_get_local_storage() helper

Message ID 20210319031236.3709026-1-yhs@fb.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series bpf: fix NULL pointer dereference in | expand

Checks

Context Check Description
netdev/apply fail Patch does not apply to bpf-next
netdev/tree_selection success Clearly marked for bpf-next

Commit Message

Yonghong Song March 19, 2021, 3:12 a.m. UTC
Jiri Olsa reported a bug ([1]) in kernel where cgroup local
storage pointer may be NULL in bpf_get_local_storage() helper.
There are two issues uncovered by this bug:
  (1). kprobe or tracepoint prog incorrectly sets cgroup local storage
       before prog run,
  (2). due to change from preempt_disable to migrate_disable,
       preemption is possible and percpu storage might be overwritten
       by other tasks.

Issue (1) is fixed in [2]. This patch addresses issue (2).
The following shows how things can go wrong:
  task 1:   bpf_cgroup_storage_set() for percpu local storage
         preemption happens
  task 2:   bpf_cgroup_storage_set() for percpu local storage
         preemption happens
  task 1:   run bpf program

task 1 will effectively use the percpu local storage set by task 2,
which will be either NULL or incorrect.

Instead of just one common local storage per cpu, this patch fixes
the issue by permitting 8 local storages per cpu, with each local
storage identified by a task_struct pointer. This way, we
allow at most 8 nested preemptions between bpf_cgroup_storage_set()
and bpf_cgroup_storage_unset(). The percpu local storage slot
is released (by calling bpf_cgroup_storage_unset()) by the same task
after the bpf program finishes running.

The patch is tested on top of [2] with reproducer in [1].
Without this patch, kernel will emit error in 2-3 minutes.
With this patch, after one hour, still no error.

 [1] https://lore.kernel.org/bpf/CAKH8qBuXCfUz=w8L+Fj74OaUpbosO29niYwTki7e3Ag044_aww@mail.gmail.com/T
 [2] https://lore.kernel.org/bpf/20210309185028.3763817-1-yhs@fb.com/

Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Roman Gushchin <guro@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/linux/bpf-cgroup.h | 58 ++++++++++++++++++++++++++++++++------
 include/linux/bpf.h        | 22 ++++++++++++---
 kernel/bpf/helpers.c       | 15 +++++++---
 kernel/bpf/local_storage.c |  5 ++--
 4 files changed, 82 insertions(+), 18 deletions(-)

Comments

Alexei Starovoitov March 20, 2021, 2:47 a.m. UTC | #1
On Thu, Mar 18, 2021 at 08:12:36PM -0700, Yonghong Song wrote:
> -static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage
> -					  *storage[MAX_BPF_CGROUP_STORAGE_TYPE])
> +static inline int bpf_cgroup_storage_set(struct bpf_cgroup_storage
> +					 *storage[MAX_BPF_CGROUP_STORAGE_TYPE])
>  {
>  	enum bpf_cgroup_storage_type stype;
> +	int i;
> +
> +	preempt_disable();
> +	for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) {
> +		if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != NULL))
> +			continue;
> +
> +		this_cpu_write(bpf_cgroup_storage_info[i].task, current);
> +		for_each_cgroup_storage_type(stype)
> +			this_cpu_write(bpf_cgroup_storage_info[i].storage[stype],
> +				       storage[stype]);
> +		break;
> +	}
> +	preempt_enable();
> +
> +	if (i == BPF_CGROUP_STORAGE_NEST_MAX) {
> +		WARN_ON_ONCE(1);
> +		return -EBUSY;
> +	}
> +	return 0;

The extra 'if' probably will be optimized by the compiler,
but could you write it like this instead:
+       int err = 0;
..
+		for_each_cgroup_storage_type(stype)
+			this_cpu_write(bpf_cgroup_storage_info[i].storage[stype],
+				       storage[stype]);
+		goto out;
+	}
+       err = -EBUSY;
+	WARN_ON_ONCE(1);
+    out:
+	preempt_enable();
+	return err;

Also patch 2 should be squashed into patch 1,
since patch 1 alone makes bpf_prog_test_run() broken.
(The WARN_ON_ONCE should trigger right away on test_run without patch 2).

Another nit:
Is title of the patch "fix NULL pointer dereference" actually correct?
It surely was correct before accidental tracing overwrite was fixed.
But the fix is already in bpf tree.
Do you still see it as NULL deref with that 3 min reproducer?
Yonghong Song March 20, 2021, 3:31 a.m. UTC | #2
On 3/19/21 7:47 PM, Alexei Starovoitov wrote:
> On Thu, Mar 18, 2021 at 08:12:36PM -0700, Yonghong Song wrote:
>> -static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage
>> -					  *storage[MAX_BPF_CGROUP_STORAGE_TYPE])
>> +static inline int bpf_cgroup_storage_set(struct bpf_cgroup_storage
>> +					 *storage[MAX_BPF_CGROUP_STORAGE_TYPE])
>>   {
>>   	enum bpf_cgroup_storage_type stype;
>> +	int i;
>> +
>> +	preempt_disable();
>> +	for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) {
>> +		if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != NULL))
>> +			continue;
>> +
>> +		this_cpu_write(bpf_cgroup_storage_info[i].task, current);
>> +		for_each_cgroup_storage_type(stype)
>> +			this_cpu_write(bpf_cgroup_storage_info[i].storage[stype],
>> +				       storage[stype]);
>> +		break;
>> +	}
>> +	preempt_enable();
>> +
>> +	if (i == BPF_CGROUP_STORAGE_NEST_MAX) {
>> +		WARN_ON_ONCE(1);
>> +		return -EBUSY;
>> +	}
>> +	return 0;
> 
> The extra 'if' probably will be optimized by the compiler,
> but could you write it like this instead:
> +       int err = 0;
> ..
> +		for_each_cgroup_storage_type(stype)
> +			this_cpu_write(bpf_cgroup_storage_info[i].storage[stype],
> +				       storage[stype]);
> +		goto out;
> +	}
> +       err = -EBUSY;
> +	WARN_ON_ONCE(1);
> +    out:
> +	preempt_enable();
> +	return err;

okay.

> 
> Also patch 2 should be squashed into patch 1,
> since patch 1 alone makes bpf_prog_test_run() broken.
> (The WARN_ON_ONCE should trigger right away on test_run without patch 2).

You are right. Will fold this into one patch. My original intention is
to apply patch 1 to bpf tree. Looks like folding one patch is necessary.
We can create a different patch for bpf tree if needed.

> 
> Another nit:
> Is title of the patch "fix NULL pointer dereference" actually correct?
> It surely was correct before accidental tracing overwrite was fixed.
> But the fix is already in bpf tree.
> Do you still see it as NULL deref with that 3 min reproducer?

Yes, I do. I just double checked and run again with latest bpf-next +
bpf_local_storage kprobe/tracepoint fix.

with gcc 8.4.1, kasan is enabled. I hit

[  806.571378] BUG: KASAN: null-ptr-deref in 
bpf_get_local_storage+0x29/0x70 

[  806.572393] Read of size 8 at addr 0000000000000000 by task 
test_progs/16069 

[  806.573487] 
 

[  806.573747] CPU: 1 PID: 16069 Comm: test_progs Tainted: G           O 
      5.12.0-rc2+ #59 

[  806.574964] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), 
BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 

[  806.576627] Call Trace: 
 

[  806.577045]  dump_stack+0xa4/0xe5 
 

[  806.577572]  ? bpf_get_local_storage+0x29/0x70 
 

[  806.578257]  ? bpf_get_local_storage+0x29/0x70 
 

[  806.578929]  kasan_report.cold.13+0x5f/0xd8 
 

[  806.579595]  ? bpf_get_local_storage+0x29/0x70 
 

[  806.580300]  bpf_get_local_storage+0x29/0x70 
 

[  806.580970] 
bpf_prog_b06a218bf1bb5278_bpf_sping_lock_test+0x2af/0xdc8 
 

[  806.581976]  bpf_test_run+0x268/0x420 
 

[  806.582602]  ? bpf_test_timer_continue+0x1c0/0x1c0 
 

[  806.583338]  ? __build_skb+0x20/0x50 
 

[  806.583871]  ? rcu_read_lock_sched_held+0xa1/0xd0 
 

[  806.584562]  ? rcu_read_lock_bh_held+0xb0/0xb0 
 

[  806.585235]  ? static_obj+0x32/0x80 
 

[  806.585801]  ? eth_gro_receive+0x3b0/0x3b0 
 

[  806.586440]  ? __build_skb+0x45/0x50 
 

[  806.587006]  bpf_prog_test_run_skb+0x69c/0xc10 
 

[  806.587721]  ? bpf_prog_test_run_raw_tp+0x2e0/0x2e0 
 

[  806.588500]  ? fput_many+0x1a/0xc0 
 

[  806.589052]  __do_sys_bpf+0x1025/0x2d30
[  806.589637]  ? check_chain_key+0x1ea/0x2f0
[  806.590277]  ? bpf_link_get_from_fd+0x80/0x80
[  806.590974]  ? __lock_acquire+0x921/0x2f80
[  806.591633]  ? register_lock_class+0x950/0x950
[  806.592354]  ? pvclock_clocksource_read+0xdc/0x180
[  806.593160]  ? rcu_read_lock_sched_held+0xa1/0xd0
[  806.593940]  ? syscall_enter_from_user_mode+0x1c/0x40
[  806.594696]  do_syscall_64+0x33/0x40
[  806.595244]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[  806.596044] RIP: 0033:0x7f7a155e67f9
[  806.596599] Code: 00 f3 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 
48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 
05 <48> 3d 01 f0 ff ff 73 01 c3 488
[  806.599404] RSP: 002b:00007f7a144ffe68 EFLAGS: 00000202 ORIG_RAX: 
0000000000000141
[  806.600552] RAX: ffffffffffffffda RBX: 00007f7a144fff2c RCX: 
00007f7a155e67f9
[  806.601627] RDX: 0000000000000078 RSI: 00007f7a144ffe70 RDI: 
000000000000000a
[  806.602694] RBP: 00007f7a144fff28 R08: 0000000000000000 R09: 
0000000000000008
[  806.603795] R10: 0000000000000000 R11: 0000000000000202 R12: 
0000000000000000
[  806.604833] R13: 00007ffd8b062d9f R14: 0000000000000003 R15: 
0000000000000000
diff mbox series

Patch

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index c42e02b4d84b..24689416b534 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -20,14 +20,25 @@  struct bpf_sock_ops_kern;
 struct bpf_cgroup_storage;
 struct ctl_table;
 struct ctl_table_header;
+struct task_struct;
 
 #ifdef CONFIG_CGROUP_BPF
 
 extern struct static_key_false cgroup_bpf_enabled_key[MAX_BPF_ATTACH_TYPE];
 #define cgroup_bpf_enabled(type) static_branch_unlikely(&cgroup_bpf_enabled_key[type])
 
-DECLARE_PER_CPU(struct bpf_cgroup_storage*,
-		bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
+#define BPF_CGROUP_STORAGE_NEST_MAX	8
+
+struct bpf_cgroup_storage_info {
+	struct task_struct *task;
+	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE];
+};
+
+/* For each cpu, permit maximum BPF_CGROUP_STORAGE_NEST_MAX number of tasks
+ * to use bpf cgroup storage simultaneously.
+ */
+DECLARE_PER_CPU(struct bpf_cgroup_storage_info,
+		bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]);
 
 #define for_each_cgroup_storage_type(stype) \
 	for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++)
@@ -161,13 +172,43 @@  static inline enum bpf_cgroup_storage_type cgroup_storage_type(
 	return BPF_CGROUP_STORAGE_SHARED;
 }
 
-static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage
-					  *storage[MAX_BPF_CGROUP_STORAGE_TYPE])
+static inline int bpf_cgroup_storage_set(struct bpf_cgroup_storage
+					 *storage[MAX_BPF_CGROUP_STORAGE_TYPE])
 {
 	enum bpf_cgroup_storage_type stype;
+	int i;
+
+	preempt_disable();
+	for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) {
+		if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != NULL))
+			continue;
+
+		this_cpu_write(bpf_cgroup_storage_info[i].task, current);
+		for_each_cgroup_storage_type(stype)
+			this_cpu_write(bpf_cgroup_storage_info[i].storage[stype],
+				       storage[stype]);
+		break;
+	}
+	preempt_enable();
+
+	if (i == BPF_CGROUP_STORAGE_NEST_MAX) {
+		WARN_ON_ONCE(1);
+		return -EBUSY;
+	}
+	return 0;
+}
+
+static inline void bpf_cgroup_storage_unset(void)
+{
+	int i;
+
+	for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) {
+		if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current))
+			continue;
 
-	for_each_cgroup_storage_type(stype)
-		this_cpu_write(bpf_cgroup_storage[stype], storage[stype]);
+		this_cpu_write(bpf_cgroup_storage_info[i].task, NULL);
+		return;
+	}
 }
 
 struct bpf_cgroup_storage *
@@ -448,8 +489,9 @@  static inline int cgroup_bpf_prog_query(const union bpf_attr *attr,
 	return -EINVAL;
 }
 
-static inline void bpf_cgroup_storage_set(
-	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) {}
+static inline int bpf_cgroup_storage_set(
+	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) { return 0; }
+static inline void bpf_cgroup_storage_unset(void) {}
 static inline int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux,
 					    struct bpf_map *map) { return 0; }
 static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a47285cd39c2..3a6ae69743ff 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1090,6 +1090,13 @@  int bpf_prog_array_copy(struct bpf_prog_array *old_array,
 /* BPF program asks to set CN on the packet. */
 #define BPF_RET_SET_CN						(1 << 0)
 
+/* For BPF_PROG_RUN_ARRAY_FLAGS and __BPF_PROG_RUN_ARRAY,
+ * if bpf_cgroup_storage_set() failed, the rest of programs
+ * will not execute. This should be a really rare scenario
+ * as it requires BPF_CGROUP_STORAGE_NEST_MAX number of
+ * preemptions all between bpf_cgroup_storage_set() and
+ * bpf_cgroup_storage_unset() on the same cpu.
+ */
 #define BPF_PROG_RUN_ARRAY_FLAGS(array, ctx, func, ret_flags)		\
 	({								\
 		struct bpf_prog_array_item *_item;			\
@@ -1102,10 +1109,12 @@  int bpf_prog_array_copy(struct bpf_prog_array *old_array,
 		_array = rcu_dereference(array);			\
 		_item = &_array->items[0];				\
 		while ((_prog = READ_ONCE(_item->prog))) {		\
-			bpf_cgroup_storage_set(_item->cgroup_storage);	\
+			if (unlikely(bpf_cgroup_storage_set(_item->cgroup_storage)))	\
+				break;					\
 			func_ret = func(_prog, ctx);			\
 			_ret &= (func_ret & 1);				\
 			*(ret_flags) |= (func_ret >> 1);			\
+			bpf_cgroup_storage_unset();			\
 			_item++;					\
 		}							\
 		rcu_read_unlock();					\
@@ -1126,9 +1135,14 @@  int bpf_prog_array_copy(struct bpf_prog_array *old_array,
 			goto _out;			\
 		_item = &_array->items[0];		\
 		while ((_prog = READ_ONCE(_item->prog))) {		\
-			if (set_cg_storage)		\
-				bpf_cgroup_storage_set(_item->cgroup_storage);	\
-			_ret &= func(_prog, ctx);	\
+			if (!set_cg_storage) {			\
+				_ret &= func(_prog, ctx);	\
+			} else {				\
+				if (unlikely(bpf_cgroup_storage_set(_item->cgroup_storage)))	\
+					break;			\
+				_ret &= func(_prog, ctx);	\
+				bpf_cgroup_storage_unset();	\
+			}				\
 			_item++;			\
 		}					\
 _out:							\
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 074800226327..f306611c4ddf 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -382,8 +382,8 @@  const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = {
 };
 
 #ifdef CONFIG_CGROUP_BPF
-DECLARE_PER_CPU(struct bpf_cgroup_storage*,
-		bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
+DECLARE_PER_CPU(struct bpf_cgroup_storage_info,
+		bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]);
 
 BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
 {
@@ -392,10 +392,17 @@  BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
 	 * verifier checks that its value is correct.
 	 */
 	enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
-	struct bpf_cgroup_storage *storage;
+	struct bpf_cgroup_storage *storage = NULL;
 	void *ptr;
+	int i;
 
-	storage = this_cpu_read(bpf_cgroup_storage[stype]);
+	for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) {
+		if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current))
+			continue;
+
+		storage = this_cpu_read(bpf_cgroup_storage_info[i].storage[stype]);
+		break;
+	}
 
 	if (stype == BPF_CGROUP_STORAGE_SHARED)
 		ptr = &READ_ONCE(storage->buf)->data[0];
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 2d4f9ac12377..bd11db9774c3 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -9,10 +9,11 @@ 
 #include <linux/slab.h>
 #include <uapi/linux/btf.h>
 
-DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
-
 #ifdef CONFIG_CGROUP_BPF
 
+DEFINE_PER_CPU(struct bpf_cgroup_storage_info,
+	       bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]);
+
 #include "../cgroup/cgroup-internal.h"
 
 #define LOCAL_STORAGE_CREATE_FLAG_MASK					\