From patchwork Fri Nov 24 11:30:28 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Hou Tao X-Patchwork-Id: 13467556 X-Patchwork-Delegate: bpf@iogearbox.net Authentication-Results: smtp.subspace.kernel.org; dkim=none Received: from dggsgout11.his.huawei.com (dggsgout11.his.huawei.com [45.249.212.51]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 197BF10F4 for ; Fri, 24 Nov 2023 03:29:31 -0800 (PST) Received: from mail.maildlp.com (unknown [172.19.163.216]) by dggsgout11.his.huawei.com (SkyGuard) with ESMTP id 4ScCS62QTCz4f3k68 for ; Fri, 24 Nov 2023 19:29:26 +0800 (CST) Received: from mail02.huawei.com (unknown [10.116.40.112]) by mail.maildlp.com (Postfix) with ESMTP id 1BC891A016F for ; Fri, 24 Nov 2023 19:29:29 +0800 (CST) Received: from huaweicloud.com (unknown [10.175.124.27]) by APP1 (Coremail) with SMTP id cCh0CgDn6xEUiWBloME3Bw--.31197S5; Fri, 24 Nov 2023 19:29:28 +0800 (CST) From: Hou Tao To: bpf@vger.kernel.org Cc: Martin KaFai Lau , Alexei Starovoitov , Andrii Nakryiko , Song Liu , Hao Luo , Yonghong Song , Daniel Borkmann , KP Singh , Stanislav Fomichev , Jiri Olsa , John Fastabend , houtao1@huawei.com Subject: [PATCH bpf v3 1/6] bpf: Check rcu_read_lock_trace_held() before calling bpf map helpers Date: Fri, 24 Nov 2023 19:30:28 +0800 Message-Id: <20231124113033.503338-2-houtao@huaweicloud.com> X-Mailer: git-send-email 2.29.2 In-Reply-To: <20231124113033.503338-1-houtao@huaweicloud.com> References: <20231124113033.503338-1-houtao@huaweicloud.com> Precedence: bulk X-Mailing-List: bpf@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 X-CM-TRANSID: cCh0CgDn6xEUiWBloME3Bw--.31197S5 X-Coremail-Antispam: 1UD129KBjvJXoWxCFWUXFWUXr45tFy3Kw1fCrg_yoW5ZF4xpF y09a4Fkr1jqFsrZ3yYva92yry5Ka90ganrJws7Ww4YvF4UWFn7XryxJFnIvF98KrWUAr48 Z3W2qwnrA348Aa7anT9S1TB71UUUUUUqnTZGkaVYY2UrUUUUjbIjqfuFe4nvWSU5nxnvy2 9KBjDU0xBIdaVrnRJUUUB0b4IE77IF4wAFF20E14v26rWj6s0DM7CY07I20VC2zVCF04k2 6cxKx2IYs7xG6rWj6s0DM7CIcVAFz4kK6r1j6r18M28IrcIa0xkI8VA2jI8067AKxVWUGw A2048vs2IY020Ec7CjxVAFwI0_Gr0_Xr1l8cAvFVAK0II2c7xJM28CjxkF64kEwVA0rcxS w2x7M28EF7xvwVC0I7IYx2IY67AKxVW7JVWDJwA2z4x0Y4vE2Ix0cI8IcVCY1x0267AKxV W8Jr0_Cr1UM28EF7xvwVC2z280aVAFwI0_GcCE3s1l84ACjcxK6I8E87Iv6xkF7I0E14v2 6rxl6s0DM2AIxVAIcxkEcVAq07x20xvEncxIr21l5I8CrVACY4xI64kE6c02F40Ex7xfMc Ij6xIIjxv20xvE14v26r1j6r18McIj6I8E87Iv67AKxVWUJVW8JwAm72CE4IkC6x0Yz7v_ Jr0_Gr1lF7xvr2IYc2Ij64vIr41lFIxGxcIEc7CjxVA2Y2ka0xkIwI1l42xK82IYc2Ij64 vIr41l4I8I3I0E4IkC6x0Yz7v_Jr0_Gr1lx2IqxVAqx4xG67AKxVWUJVWUGwC20s026x8G jcxK67AKxVWUGVWUWwC2zVAF1VAY17CE14v26r1q6r43MIIYrxkI7VAKI48JMIIF0xvE2I x0cI8IcVAFwI0_Jr0_JF4lIxAIcVC0I7IYx2IY6xkF7I0E14v26r4j6F4UMIIF0xvE42xK 8VAvwI8IcIk0rVWUJVWUCwCI42IY6I8E87Iv67AKxVWUJVW8JwCI42IY6I8E87Iv6xkF7I 0E14v26r4j6r4UJbIYCTnIWIevJa73UjIFyTuYvjxU2mL9UUUUU X-CM-SenderInfo: xkrx3t3r6k3tpzhluzxrxghudrp/ X-Patchwork-Delegate: bpf@iogearbox.net From: Hou Tao These three bpf_map_{lookup,update,delete}_elem() helpers are also available for sleepable bpf program, so add the corresponding lock assertion for sleepable bpf program, otherwise the following warning will be reported when a sleepable bpf program manipulates bpf map under interpreter mode (aka bpf_jit_enable=0): WARNING: CPU: 3 PID: 4985 at kernel/bpf/helpers.c:40 ...... CPU: 3 PID: 4985 Comm: test_progs Not tainted 6.6.0+ #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) ...... RIP: 0010:bpf_map_lookup_elem+0x54/0x60 ...... Call Trace: ? __warn+0xa5/0x240 ? bpf_map_lookup_elem+0x54/0x60 ? report_bug+0x1ba/0x1f0 ? handle_bug+0x40/0x80 ? exc_invalid_op+0x18/0x50 ? asm_exc_invalid_op+0x1b/0x20 ? __pfx_bpf_map_lookup_elem+0x10/0x10 ? rcu_lockdep_current_cpu_online+0x65/0xb0 ? rcu_is_watching+0x23/0x50 ? bpf_map_lookup_elem+0x54/0x60 ? __pfx_bpf_map_lookup_elem+0x10/0x10 ___bpf_prog_run+0x513/0x3b70 __bpf_prog_run32+0x9d/0xd0 ? __bpf_prog_enter_sleepable_recur+0xad/0x120 ? __bpf_prog_enter_sleepable_recur+0x3e/0x120 bpf_trampoline_6442580665+0x4d/0x1000 __x64_sys_getpgid+0x5/0x30 ? do_syscall_64+0x36/0xb0 entry_SYSCALL_64_after_hwframe+0x6e/0x76 Signed-off-by: Hou Tao --- kernel/bpf/helpers.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 56b0c1f678ee..f43038931935 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -32,12 +32,13 @@ * * Different map implementations will rely on rcu in map methods * lookup/update/delete, therefore eBPF programs must run under rcu lock - * if program is allowed to access maps, so check rcu_read_lock_held in - * all three functions. + * if program is allowed to access maps, so check rcu_read_lock_held() or + * rcu_read_lock_trace_held() in all three functions. */ BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key) { - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && + !rcu_read_lock_bh_held()); return (unsigned long) map->ops->map_lookup_elem(map, key); } @@ -53,7 +54,8 @@ const struct bpf_func_proto bpf_map_lookup_elem_proto = { BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key, void *, value, u64, flags) { - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && + !rcu_read_lock_bh_held()); return map->ops->map_update_elem(map, key, value, flags); } @@ -70,7 +72,8 @@ const struct bpf_func_proto bpf_map_update_elem_proto = { BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key) { - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && + !rcu_read_lock_bh_held()); return map->ops->map_delete_elem(map, key); } From patchwork Fri Nov 24 11:30:29 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Hou Tao X-Patchwork-Id: 13467558 X-Patchwork-Delegate: bpf@iogearbox.net Authentication-Results: smtp.subspace.kernel.org; dkim=none Received: from dggsgout11.his.huawei.com (dggsgout11.his.huawei.com [45.249.212.51]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 52FB210CA for ; Fri, 24 Nov 2023 03:29:32 -0800 (PST) Received: from mail.maildlp.com (unknown [172.19.163.216]) by dggsgout11.his.huawei.com (SkyGuard) with ESMTP id 4ScCS6614mz4f3k69 for ; Fri, 24 Nov 2023 19:29:26 +0800 (CST) Received: from mail02.huawei.com (unknown [10.116.40.112]) by mail.maildlp.com (Postfix) with ESMTP id 96F731A016F for ; Fri, 24 Nov 2023 19:29:29 +0800 (CST) Received: from huaweicloud.com (unknown [10.175.124.27]) by APP1 (Coremail) with SMTP id cCh0CgDn6xEUiWBloME3Bw--.31197S6; Fri, 24 Nov 2023 19:29:29 +0800 (CST) From: Hou Tao To: bpf@vger.kernel.org Cc: Martin KaFai Lau , Alexei Starovoitov , Andrii Nakryiko , Song Liu , Hao Luo , Yonghong Song , Daniel Borkmann , KP Singh , Stanislav Fomichev , Jiri Olsa , John Fastabend , houtao1@huawei.com Subject: [PATCH bpf v3 2/6] bpf: Add map and need_defer parameters to .map_fd_put_ptr() Date: Fri, 24 Nov 2023 19:30:29 +0800 Message-Id: <20231124113033.503338-3-houtao@huaweicloud.com> X-Mailer: git-send-email 2.29.2 In-Reply-To: <20231124113033.503338-1-houtao@huaweicloud.com> References: <20231124113033.503338-1-houtao@huaweicloud.com> Precedence: bulk X-Mailing-List: bpf@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 X-CM-TRANSID: cCh0CgDn6xEUiWBloME3Bw--.31197S6 X-Coremail-Antispam: 1UD129KBjvJXoW3GF4kAw13Ww4rWF43WFWxJFb_yoW7Kr1kpF WrKF42kw4kXF4UX3yrAw4kurWYyw1fZ345CF95Ka4Fvr15Xr9rXF4kGa9xGry5W3ykGF4q yFnrtryFk348ArDanT9S1TB71UUUUUUqnTZGkaVYY2UrUUUUjbIjqfuFe4nvWSU5nxnvy2 9KBjDU0xBIdaVrnRJUUUB0b4IE77IF4wAFF20E14v26rWj6s0DM7CY07I20VC2zVCF04k2 6cxKx2IYs7xG6rWj6s0DM7CIcVAFz4kK6r1j6r18M28IrcIa0xkI8VA2jI8067AKxVWUXw A2048vs2IY020Ec7CjxVAFwI0_Xr0E3s1l8cAvFVAK0II2c7xJM28CjxkF64kEwVA0rcxS w2x7M28EF7xvwVC0I7IYx2IY67AKxVW7JVWDJwA2z4x0Y4vE2Ix0cI8IcVCY1x0267AKxV W8Jr0_Cr1UM28EF7xvwVC2z280aVAFwI0_GcCE3s1l84ACjcxK6I8E87Iv6xkF7I0E14v2 6rxl6s0DM2AIxVAIcxkEcVAq07x20xvEncxIr21l5I8CrVACY4xI64kE6c02F40Ex7xfMc Ij6xIIjxv20xvE14v26r1j6r18McIj6I8E87Iv67AKxVWUJVW8JwAm72CE4IkC6x0Yz7v_ Jr0_Gr1lF7xvr2IYc2Ij64vIr41lFIxGxcIEc7CjxVA2Y2ka0xkIwI1l42xK82IYc2Ij64 vIr41l4I8I3I0E4IkC6x0Yz7v_Jr0_Gr1lx2IqxVAqx4xG67AKxVWUJVWUGwC20s026x8G jcxK67AKxVWUGVWUWwC2zVAF1VAY17CE14v26r1q6r43MIIYrxkI7VAKI48JMIIF0xvE2I x0cI8IcVAFwI0_Jr0_JF4lIxAIcVC0I7IYx2IY6xkF7I0E14v26r4j6F4UMIIF0xvE42xK 8VAvwI8IcIk0rVWUJVWUCwCI42IY6I8E87Iv67AKxVWUJVW8JwCI42IY6I8E87Iv6xkF7I 0E14v26r4j6r4UJbIYCTnIWIevJa73UjIFyTuYvjxUFa9-UUUUU X-CM-SenderInfo: xkrx3t3r6k3tpzhluzxrxghudrp/ X-Patchwork-Delegate: bpf@iogearbox.net From: Hou Tao map is the pointer of outer map, and need_defer needs some explanation. need_defer tells the implementation to defer the reference release of the passed element and ensure that the element is still alive before the bpf program, which may manipulate it, exits. The following three cases will invoke map_fd_put_ptr() and different need_defer values will be passed to these callers: 1) release the reference of the old element in the map during map update or map deletion. The release must be deferred, otherwise the bpf program may incur use-after-free problem, so need_defer needs to be true. 2) release the reference of the to-be-added element in the error path of map update. The to-be-added element is not visible to any bpf program, so it is OK to pass false for need_defer parameter. 3) release the references of all elements in the map during map release. Any bpf program which has access to the map must have been exited and released, so need_defer=false will be OK. These two parameters will be used by the following patches to fix the potential use-after-free problem for map-in-map. Signed-off-by: Hou Tao --- include/linux/bpf.h | 6 +++++- kernel/bpf/arraymap.c | 12 +++++++----- kernel/bpf/hashtab.c | 6 +++--- kernel/bpf/map_in_map.c | 2 +- kernel/bpf/map_in_map.h | 2 +- 5 files changed, 17 insertions(+), 11 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6762dac3ef76..6f68ac24bb9a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -106,7 +106,11 @@ struct bpf_map_ops { /* funcs called by prog_array and perf_event_array map */ void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, int fd); - void (*map_fd_put_ptr)(void *ptr); + /* If need_defer is true, the implementation should guarantee that + * the to-be-put element is still alive before the bpf program, which + * may manipulate it, exists. + */ + void (*map_fd_put_ptr)(struct bpf_map *map, void *ptr, bool need_defer); int (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); u32 (*map_fd_sys_lookup_elem)(void *ptr); void (*map_seq_show_elem)(struct bpf_map *map, void *key, diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 2058e89b5ddd..bd90c3c09032 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -867,7 +867,7 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, } if (old_ptr) - map->ops->map_fd_put_ptr(old_ptr); + map->ops->map_fd_put_ptr(map, old_ptr, true); return 0; } @@ -890,7 +890,7 @@ static long fd_array_map_delete_elem(struct bpf_map *map, void *key) } if (old_ptr) { - map->ops->map_fd_put_ptr(old_ptr); + map->ops->map_fd_put_ptr(map, old_ptr, true); return 0; } else { return -ENOENT; @@ -913,8 +913,9 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map, return prog; } -static void prog_fd_array_put_ptr(void *ptr) +static void prog_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool deferred) { + /* bpf_prog is freed after one RCU or tasks trace grace period */ bpf_prog_put(ptr); } @@ -1239,8 +1240,9 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, return ee; } -static void perf_event_fd_array_put_ptr(void *ptr) +static void perf_event_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool deferred) { + /* bpf_perf_event is freed after one RCU grace period */ bpf_event_entry_free_rcu(ptr); } @@ -1294,7 +1296,7 @@ static void *cgroup_fd_array_get_ptr(struct bpf_map *map, return cgroup_get_from_fd(fd); } -static void cgroup_fd_array_put_ptr(void *ptr) +static void cgroup_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool deferred) { /* cgroup_put free cgrp after a rcu grace period */ cgroup_put(ptr); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index fd8d4b0addfc..5b9146fa825f 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -897,7 +897,7 @@ static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l) if (map->ops->map_fd_put_ptr) { ptr = fd_htab_map_get_ptr(map, l); - map->ops->map_fd_put_ptr(ptr); + map->ops->map_fd_put_ptr(map, ptr, true); } } @@ -2484,7 +2484,7 @@ static void fd_htab_map_free(struct bpf_map *map) hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { void *ptr = fd_htab_map_get_ptr(map, l); - map->ops->map_fd_put_ptr(ptr); + map->ops->map_fd_put_ptr(map, ptr, false); } } @@ -2525,7 +2525,7 @@ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, ret = htab_map_update_elem(map, key, &ptr, map_flags); if (ret) - map->ops->map_fd_put_ptr(ptr); + map->ops->map_fd_put_ptr(map, ptr, false); return ret; } diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index cd5eafaba97e..7d5754d18fd0 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -127,7 +127,7 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map, return inner_map; } -void bpf_map_fd_put_ptr(void *ptr) +void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool deferred) { /* ptr->ops->map_free() has to go through one * rcu grace period by itself. diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h index bcb7534afb3c..7d61602354de 100644 --- a/kernel/bpf/map_in_map.h +++ b/kernel/bpf/map_in_map.h @@ -13,7 +13,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd); void bpf_map_meta_free(struct bpf_map *map_meta); void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, int ufd); -void bpf_map_fd_put_ptr(void *ptr); +void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer); u32 bpf_map_fd_sys_lookup_elem(void *ptr); #endif From patchwork Fri Nov 24 11:30:30 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Hou Tao X-Patchwork-Id: 13467559 X-Patchwork-Delegate: bpf@iogearbox.net Authentication-Results: smtp.subspace.kernel.org; dkim=none Received: from dggsgout11.his.huawei.com (dggsgout11.his.huawei.com [45.249.212.51]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 2593B10E4 for ; Fri, 24 Nov 2023 03:29:32 -0800 (PST) Received: from mail.maildlp.com (unknown [172.19.93.142]) by dggsgout11.his.huawei.com (SkyGuard) with ESMTP id 4ScCS72XGGz4f3k6S for ; Fri, 24 Nov 2023 19:29:27 +0800 (CST) Received: from mail02.huawei.com (unknown [10.116.40.112]) by mail.maildlp.com (Postfix) with ESMTP id 1DE8C1A06C9 for ; Fri, 24 Nov 2023 19:29:30 +0800 (CST) Received: from huaweicloud.com (unknown [10.175.124.27]) by APP1 (Coremail) with SMTP id cCh0CgDn6xEUiWBloME3Bw--.31197S7; Fri, 24 Nov 2023 19:29:29 +0800 (CST) From: Hou Tao To: bpf@vger.kernel.org Cc: Martin KaFai Lau , Alexei Starovoitov , Andrii Nakryiko , Song Liu , Hao Luo , Yonghong Song , Daniel Borkmann , KP Singh , Stanislav Fomichev , Jiri Olsa , John Fastabend , houtao1@huawei.com Subject: [PATCH bpf v3 3/6] bpf: Defer the free of inner map when necessary Date: Fri, 24 Nov 2023 19:30:30 +0800 Message-Id: <20231124113033.503338-4-houtao@huaweicloud.com> X-Mailer: git-send-email 2.29.2 In-Reply-To: <20231124113033.503338-1-houtao@huaweicloud.com> References: <20231124113033.503338-1-houtao@huaweicloud.com> Precedence: bulk X-Mailing-List: bpf@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 X-CM-TRANSID: cCh0CgDn6xEUiWBloME3Bw--.31197S7 X-Coremail-Antispam: 1UD129KBjvJXoWxZr4UGry7GFWkur4xXrWrXwb_yoWrtrW5pF Z8K348Cw40qr429rZxZa17ZrWYyw4fW34DCas5Ga4YyrnxWr9Fv3W0gFy3AF15Ar4kJrW2 vrnFg34Fk3yUZFJanT9S1TB71UUUUUUqnTZGkaVYY2UrUUUUjbIjqfuFe4nvWSU5nxnvy2 9KBjDU0xBIdaVrnRJUUUBYb4IE77IF4wAFF20E14v26rWj6s0DM7CY07I20VC2zVCF04k2 6cxKx2IYs7xG6rWj6s0DM7CIcVAFz4kK6r1j6r18M28IrcIa0xkI8VA2jI8067AKxVWUWw A2048vs2IY020Ec7CjxVAFwI0_Xr0E3s1l8cAvFVAK0II2c7xJM28CjxkF64kEwVA0rcxS w2x7M28EF7xvwVC0I7IYx2IY67AKxVW7JVWDJwA2z4x0Y4vE2Ix0cI8IcVCY1x0267AKxV W8Jr0_Cr1UM28EF7xvwVC2z280aVAFwI0_GcCE3s1l84ACjcxK6I8E87Iv6xkF7I0E14v2 6rxl6s0DM2AIxVAIcxkEcVAq07x20xvEncxIr21l5I8CrVACY4xI64kE6c02F40Ex7xfMc Ij6xIIjxv20xvE14v26r1j6r18McIj6I8E87Iv67AKxVWUJVW8JwAm72CE4IkC6x0Yz7v_ Jr0_Gr1lF7xvr2IYc2Ij64vIr41lFIxGxcIEc7CjxVA2Y2ka0xkIwI1l42xK82IYc2Ij64 vIr41l4I8I3I0E4IkC6x0Yz7v_Jr0_Gr1lx2IqxVAqx4xG67AKxVWUJVWUGwC20s026x8G jcxK67AKxVWUGVWUWwC2zVAF1VAY17CE14v26r1q6r43MIIYrxkI7VAKI48JMIIF0xvE2I x0cI8IcVAFwI0_Jr0_JF4lIxAIcVC0I7IYx2IY6xkF7I0E14v26F4j6r4UJwCI42IY6xAI w20EY4v20xvaj40_Jr0_JF4lIxAIcVC2z280aVAFwI0_Jr0_Gr1lIxAIcVC2z280aVCY1x 0267AKxVW8JVW8JrUvcSsGvfC2KfnxnUUI43ZEXa7IU1c4S7UUUUU== X-CM-SenderInfo: xkrx3t3r6k3tpzhluzxrxghudrp/ X-Patchwork-Delegate: bpf@iogearbox.net From: Hou Tao When updating or deleting an inner map in map array or map htab, the map may still be accessed by non-sleepable program or sleepable program. However bpf_map_fd_put_ptr() decreases the ref-counter of the inner map directly through bpf_map_put(), if the ref-counter is the last one (which is true for most cases), the inner map will be freed by ops->map_free() in a kworker. But for now, most .map_free() callbacks don't use synchronize_rcu() or its variants to wait for the elapse of a RCU grace period, so after the invocation of ops->map_free completes, the bpf program which is accessing the inner map may incur use-after-free problem. Fix the free of inner map by invoking bpf_map_free_deferred() after both one RCU grace period and one tasks trace RCU grace period if the inner map has been removed from the outer map before. The deferment is accomplished by using call_rcu() or call_rcu_tasks_trace() when freeing the last ref-counter of bpf map, and the newly-added rcu_head field in bpf_map shares the same space with work field to reduce the size of bpf_map. Fixes: bba1dc0b55ac ("bpf: Remove redundant synchronize_rcu.") Fixes: 638e4b825d52 ("bpf: Allows per-cpu maps and map-in-map in sleepable programs") Signed-off-by: Hou Tao --- include/linux/bpf.h | 7 ++++++- kernel/bpf/map_in_map.c | 11 ++++++++--- kernel/bpf/syscall.c | 32 +++++++++++++++++++++++++++----- 3 files changed, 41 insertions(+), 9 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6f68ac24bb9a..15a6bb951b70 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -276,7 +276,11 @@ struct bpf_map { */ atomic64_t refcnt ____cacheline_aligned; atomic64_t usercnt; - struct work_struct work; + /* rcu is used before freeing and work is only used during freeing */ + union { + struct work_struct work; + struct rcu_head rcu; + }; struct mutex freeze_mutex; atomic64_t writecnt; /* 'Ownership' of program-containing map is claimed by the first program @@ -292,6 +296,7 @@ struct bpf_map { } owner; bool bypass_spec_v1; bool frozen; /* write-once; write-protected by freeze_mutex */ + bool free_after_mult_rcu_gp; s64 __percpu *elem_count; }; diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 7d5754d18fd0..cf3363065566 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -129,10 +129,15 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map, void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool deferred) { - /* ptr->ops->map_free() has to go through one - * rcu grace period by itself. + struct bpf_map *inner_map = ptr; + + /* The inner map may still be used by both non-sleepable and sleepable + * bpf program, so free it after one RCU grace period and one tasks + * trace RCU grace period. */ - bpf_map_put(ptr); + if (deferred) + WRITE_ONCE(inner_map->free_after_mult_rcu_gp, true); + bpf_map_put(inner_map); } u32 bpf_map_fd_sys_lookup_elem(void *ptr) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a144eb286974..88882cb58121 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -718,6 +718,28 @@ static void bpf_map_put_uref(struct bpf_map *map) } } +static void bpf_map_free_in_work(struct bpf_map *map) +{ + INIT_WORK(&map->work, bpf_map_free_deferred); + /* Avoid spawning kworkers, since they all might contend + * for the same mutex like slab_mutex. + */ + queue_work(system_unbound_wq, &map->work); +} + +static void bpf_map_free_rcu_gp(struct rcu_head *rcu) +{ + bpf_map_free_in_work(container_of(rcu, struct bpf_map, rcu)); +} + +static void bpf_map_free_mult_rcu_gp(struct rcu_head *rcu) +{ + if (rcu_trace_implies_rcu_gp()) + bpf_map_free_rcu_gp(rcu); + else + call_rcu(rcu, bpf_map_free_rcu_gp); +} + /* decrement map refcnt and schedule it for freeing via workqueue * (underlying map implementation ops->map_free() might sleep) */ @@ -727,11 +749,11 @@ void bpf_map_put(struct bpf_map *map) /* bpf_map_free_id() must be called first */ bpf_map_free_id(map); btf_put(map->btf); - INIT_WORK(&map->work, bpf_map_free_deferred); - /* Avoid spawning kworkers, since they all might contend - * for the same mutex like slab_mutex. - */ - queue_work(system_unbound_wq, &map->work); + + if (READ_ONCE(map->free_after_mult_rcu_gp)) + call_rcu_tasks_trace(&map->rcu, bpf_map_free_mult_rcu_gp); + else + bpf_map_free_in_work(map); } } EXPORT_SYMBOL_GPL(bpf_map_put); From patchwork Fri Nov 24 11:30:31 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Hou Tao X-Patchwork-Id: 13467560 X-Patchwork-Delegate: bpf@iogearbox.net Authentication-Results: smtp.subspace.kernel.org; dkim=none Received: from dggsgout11.his.huawei.com (dggsgout11.his.huawei.com [45.249.212.51]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 2F41810FD for ; Fri, 24 Nov 2023 03:29:33 -0800 (PST) Received: from mail.maildlp.com (unknown [172.19.93.142]) by dggsgout11.his.huawei.com (SkyGuard) with ESMTP id 4ScCS768stz4f3k6Y for ; Fri, 24 Nov 2023 19:29:27 +0800 (CST) Received: from mail02.huawei.com (unknown [10.116.40.112]) by mail.maildlp.com (Postfix) with ESMTP id 9A0761A06D2 for ; Fri, 24 Nov 2023 19:29:30 +0800 (CST) Received: from huaweicloud.com (unknown [10.175.124.27]) by APP1 (Coremail) with SMTP id cCh0CgDn6xEUiWBloME3Bw--.31197S8; Fri, 24 Nov 2023 19:29:30 +0800 (CST) From: Hou Tao To: bpf@vger.kernel.org Cc: Martin KaFai Lau , Alexei Starovoitov , Andrii Nakryiko , Song Liu , Hao Luo , Yonghong Song , Daniel Borkmann , KP Singh , Stanislav Fomichev , Jiri Olsa , John Fastabend , houtao1@huawei.com Subject: [PATCH bpf v3 4/6] bpf: Optimize the free of inner map Date: Fri, 24 Nov 2023 19:30:31 +0800 Message-Id: <20231124113033.503338-5-houtao@huaweicloud.com> X-Mailer: git-send-email 2.29.2 In-Reply-To: <20231124113033.503338-1-houtao@huaweicloud.com> References: <20231124113033.503338-1-houtao@huaweicloud.com> Precedence: bulk X-Mailing-List: bpf@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 X-CM-TRANSID: cCh0CgDn6xEUiWBloME3Bw--.31197S8 X-Coremail-Antispam: 1UD129KBjvJXoW3AFyxCF43KF43KFyxAr43KFg_yoW7Ww4DpF W5KrW8Gr4kXr42k39Iyw47Z345XwsYgw45Gas5C345Ar15Wr9FqFy0gFW5Jr98AF4kJ3yI vrnIgryUK3yUZFDanT9S1TB71UUUUUUqnTZGkaVYY2UrUUUUjbIjqfuFe4nvWSU5nxnvy2 9KBjDU0xBIdaVrnRJUUUBIb4IE77IF4wAFF20E14v26rWj6s0DM7CY07I20VC2zVCF04k2 6cxKx2IYs7xG6rWj6s0DM7CIcVAFz4kK6r1j6r18M28IrcIa0xkI8VA2jI8067AKxVWUAV Cq3wA2048vs2IY020Ec7CjxVAFwI0_Xr0E3s1l8cAvFVAK0II2c7xJM28CjxkF64kEwVA0 rcxSw2x7M28EF7xvwVC0I7IYx2IY67AKxVW7JVWDJwA2z4x0Y4vE2Ix0cI8IcVCY1x0267 AKxVW8Jr0_Cr1UM28EF7xvwVC2z280aVAFwI0_GcCE3s1l84ACjcxK6I8E87Iv6xkF7I0E 14v26rxl6s0DM2AIxVAIcxkEcVAq07x20xvEncxIr21l5I8CrVACY4xI64kE6c02F40Ex7 xfMcIj6xIIjxv20xvE14v26r1j6r18McIj6I8E87Iv67AKxVWUJVW8JwAm72CE4IkC6x0Y z7v_Jr0_Gr1lF7xvr2IYc2Ij64vIr41lFIxGxcIEc7CjxVA2Y2ka0xkIwI1l42xK82IYc2 Ij64vIr41l4I8I3I0E4IkC6x0Yz7v_Jr0_Gr1lx2IqxVAqx4xG67AKxVWUJVWUGwC20s02 6x8GjcxK67AKxVWUGVWUWwC2zVAF1VAY17CE14v26r1q6r43MIIYrxkI7VAKI48JMIIF0x vE2Ix0cI8IcVAFwI0_Jr0_JF4lIxAIcVC0I7IYx2IY6xkF7I0E14v26F4j6r4UJwCI42IY 6xAIw20EY4v20xvaj40_Jr0_JF4lIxAIcVC2z280aVAFwI0_Jr0_Gr1lIxAIcVC2z280aV CY1x0267AKxVW8JVW8JrUvcSsGvfC2KfnxnUUI43ZEXa7IU13l1DUUUUU== X-CM-SenderInfo: xkrx3t3r6k3tpzhluzxrxghudrp/ X-Patchwork-Delegate: bpf@iogearbox.net From: Hou Tao When removing the inner map from the outer map, the inner map will be freed after one RCU grace period and one RCU tasks trace grace period, so it is certain that the bpf program, which may access the inner map, has exited before the inner map is freed. However there is unnecessary to wait for any RCU grace period, one RCU grace period or one RCU tasks trace grace period if the outer map is only accessed by userspace, sleepable program or non-sleepable program. So recording the sleepable attributes of the owned bpf programs when adding the outer map into env->used_maps, copying the recorded attributes to inner map atomically when removing inner map from the outer map and using the recorded attributes in the inner map to decide which, and how many, RCU grace periods are needed when freeing the inner map. Signed-off-by: Hou Tao --- include/linux/bpf.h | 8 +++++++- kernel/bpf/map_in_map.c | 19 ++++++++++++++----- kernel/bpf/syscall.c | 15 +++++++++++++-- kernel/bpf/verifier.c | 4 ++++ 4 files changed, 38 insertions(+), 8 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 15a6bb951b70..c5b549f352d7 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -245,6 +245,11 @@ struct bpf_list_node_kern { void *owner; } __attribute__((aligned(8))); +enum { + BPF_MAP_RCU_GP = BIT(0), + BPF_MAP_RCU_TT_GP = BIT(1), +}; + struct bpf_map { /* The first two cachelines with read-mostly members of which some * are also accessed in fast-path (e.g. ops, max_entries). @@ -296,7 +301,8 @@ struct bpf_map { } owner; bool bypass_spec_v1; bool frozen; /* write-once; write-protected by freeze_mutex */ - bool free_after_mult_rcu_gp; + atomic_t used_in_rcu_gp; + atomic_t free_by_rcu_gp; s64 __percpu *elem_count; }; diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index cf3363065566..d044ee677107 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -131,12 +131,21 @@ void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool deferred) { struct bpf_map *inner_map = ptr; - /* The inner map may still be used by both non-sleepable and sleepable - * bpf program, so free it after one RCU grace period and one tasks - * trace RCU grace period. + /* Defer the freeing of inner map according to the attribute of bpf + * program which owns the outer map, so unnecessary multiple RCU GP + * waitings can be avoided. */ - if (deferred) - WRITE_ONCE(inner_map->free_after_mult_rcu_gp, true); + if (deferred) { + /* used_in_rcu_gp may be updated concurrently by new bpf + * program, so add smp_mb() to guarantee the order between + * used_in_rcu_gp and lookup/deletion operation of inner map. + * If a new bpf program finds the inner map before it is + * removed from outer map, reading used_in_rcu_gp below will + * return the newly-set bit set by the new bpf program. + */ + smp_mb(); + atomic_or(atomic_read(&map->used_in_rcu_gp), &inner_map->free_by_rcu_gp); + } bpf_map_put(inner_map); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 88882cb58121..014a8cd55a41 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -734,7 +734,10 @@ static void bpf_map_free_rcu_gp(struct rcu_head *rcu) static void bpf_map_free_mult_rcu_gp(struct rcu_head *rcu) { - if (rcu_trace_implies_rcu_gp()) + struct bpf_map *map = container_of(rcu, struct bpf_map, rcu); + + if (!(atomic_read(&map->free_by_rcu_gp) & BPF_MAP_RCU_GP) || + rcu_trace_implies_rcu_gp()) bpf_map_free_rcu_gp(rcu); else call_rcu(rcu, bpf_map_free_rcu_gp); @@ -746,11 +749,16 @@ static void bpf_map_free_mult_rcu_gp(struct rcu_head *rcu) void bpf_map_put(struct bpf_map *map) { if (atomic64_dec_and_test(&map->refcnt)) { + int free_by_rcu_gp; + /* bpf_map_free_id() must be called first */ bpf_map_free_id(map); btf_put(map->btf); - if (READ_ONCE(map->free_after_mult_rcu_gp)) + free_by_rcu_gp = atomic_read(&map->free_by_rcu_gp); + if (free_by_rcu_gp == BPF_MAP_RCU_GP) + call_rcu(&map->rcu, bpf_map_free_rcu_gp); + else if (free_by_rcu_gp) call_rcu_tasks_trace(&map->rcu, bpf_map_free_mult_rcu_gp); else bpf_map_free_in_work(map); @@ -5343,6 +5351,9 @@ static int bpf_prog_bind_map(union bpf_attr *attr) goto out_unlock; } + /* No need to update used_in_rcu_gp, because the bpf program doesn't + * access the map. + */ memcpy(used_maps_new, used_maps_old, sizeof(used_maps_old[0]) * prog->aux->used_map_cnt); used_maps_new[prog->aux->used_map_cnt] = map; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6da370a047fe..3b86c02077f1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -18051,6 +18051,10 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) return -E2BIG; } + atomic_or(env->prog->aux->sleepable ? BPF_MAP_RCU_TT_GP : BPF_MAP_RCU_GP, + &map->used_in_rcu_gp); + /* Pairs with smp_mb() in bpf_map_fd_put_ptr() */ + smp_mb__before_atomic(); /* hold the map. If the program is rejected by verifier, * the map will be released by release_maps() or it * will be used by the valid program until it's unloaded From patchwork Fri Nov 24 11:30:32 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Hou Tao X-Patchwork-Id: 13467561 X-Patchwork-Delegate: bpf@iogearbox.net Authentication-Results: smtp.subspace.kernel.org; dkim=none Received: from dggsgout11.his.huawei.com (dggsgout11.his.huawei.com [45.249.212.51]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 9D3EC10EC for ; Fri, 24 Nov 2023 03:29:33 -0800 (PST) Received: from mail.maildlp.com (unknown [172.19.163.216]) by dggsgout11.his.huawei.com (SkyGuard) with ESMTP id 4ScCS82frhz4f3jM3 for ; Fri, 24 Nov 2023 19:29:28 +0800 (CST) Received: from mail02.huawei.com (unknown [10.116.40.112]) by mail.maildlp.com (Postfix) with ESMTP id 240DF1A071C for ; Fri, 24 Nov 2023 19:29:31 +0800 (CST) Received: from huaweicloud.com (unknown [10.175.124.27]) by APP1 (Coremail) with SMTP id cCh0CgDn6xEUiWBloME3Bw--.31197S9; Fri, 24 Nov 2023 19:29:30 +0800 (CST) From: Hou Tao To: bpf@vger.kernel.org Cc: Martin KaFai Lau , Alexei Starovoitov , Andrii Nakryiko , Song Liu , Hao Luo , Yonghong Song , Daniel Borkmann , KP Singh , Stanislav Fomichev , Jiri Olsa , John Fastabend , houtao1@huawei.com Subject: [PATCH bpf v3 5/6] selftests/bpf: Add test cases for inner map Date: Fri, 24 Nov 2023 19:30:32 +0800 Message-Id: <20231124113033.503338-6-houtao@huaweicloud.com> X-Mailer: git-send-email 2.29.2 In-Reply-To: <20231124113033.503338-1-houtao@huaweicloud.com> References: <20231124113033.503338-1-houtao@huaweicloud.com> Precedence: bulk X-Mailing-List: bpf@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 X-CM-TRANSID: cCh0CgDn6xEUiWBloME3Bw--.31197S9 X-Coremail-Antispam: 1UD129KBjvJXoW3Wry8uw1fGw15KrWkGrWUtwb_yoWftry5pF WrCry3CrW8JrWUXw4Uta17uFW8Krs5Ww1UtanYkr15ZF1DWr9xXrZ7WFWUXF13urWkAryF 93ZFyay8G3ykAFJanT9S1TB71UUUUUUqnTZGkaVYY2UrUUUUjbIjqfuFe4nvWSU5nxnvy2 9KBjDU0xBIdaVrnRJUUUBIb4IE77IF4wAFF20E14v26rWj6s0DM7CY07I20VC2zVCF04k2 6cxKx2IYs7xG6rWj6s0DM7CIcVAFz4kK6r1j6r18M28IrcIa0xkI8VA2jI8067AKxVWUAV Cq3wA2048vs2IY020Ec7CjxVAFwI0_Xr0E3s1l8cAvFVAK0II2c7xJM28CjxkF64kEwVA0 rcxSw2x7M28EF7xvwVC0I7IYx2IY67AKxVW7JVWDJwA2z4x0Y4vE2Ix0cI8IcVCY1x0267 AKxVW8Jr0_Cr1UM28EF7xvwVC2z280aVAFwI0_GcCE3s1l84ACjcxK6I8E87Iv6xkF7I0E 14v26rxl6s0DM2AIxVAIcxkEcVAq07x20xvEncxIr21l5I8CrVACY4xI64kE6c02F40Ex7 xfMcIj6xIIjxv20xvE14v26r1j6r18McIj6I8E87Iv67AKxVWUJVW8JwAm72CE4IkC6x0Y z7v_Jr0_Gr1lF7xvr2IYc2Ij64vIr41lFIxGxcIEc7CjxVA2Y2ka0xkIwI1l42xK82IYc2 Ij64vIr41l4I8I3I0E4IkC6x0Yz7v_Jr0_Gr1lx2IqxVAqx4xG67AKxVWUJVWUGwC20s02 6x8GjcxK67AKxVWUGVWUWwC2zVAF1VAY17CE14v26r1q6r43MIIYrxkI7VAKI48JMIIF0x vE2Ix0cI8IcVAFwI0_JFI_Gr1lIxAIcVC0I7IYx2IY6xkF7I0E14v26F4j6r4UJwCI42IY 6xAIw20EY4v20xvaj40_Jr0_JF4lIxAIcVC2z280aVAFwI0_Jr0_Gr1lIxAIcVC2z280aV CY1x0267AKxVW8JVW8JrUvcSsGvfC2KfnxnUUI43ZEXa7IU13l1DUUUUU== X-CM-SenderInfo: xkrx3t3r6k3tpzhluzxrxghudrp/ X-Patchwork-Delegate: bpf@iogearbox.net From: Hou Tao Add test cases to test the race between the destroy of inner map due to map-in-map update and the access of inner map in bpf program. The following 4 combinations are added: (1) array map in map array + bpf program (2) array map in map array + sleepable bpf program (3) array map in map htab + bpf program (4) array map in map htab + sleepable bpf program Before applying the fixes, when running `./test_prog -a map_in_map`, the following error was reported: ================================================================== BUG: KASAN: slab-use-after-free in array_map_update_elem+0x48/0x3e0 Read of size 4 at addr ffff888114f33824 by task test_progs/1858 CPU: 1 PID: 1858 Comm: test_progs Tainted: G O 6.6.0+ #7 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) ...... Call Trace: dump_stack_lvl+0x4a/0x90 print_report+0xd2/0x620 kasan_report+0xd1/0x110 __asan_load4+0x81/0xa0 array_map_update_elem+0x48/0x3e0 bpf_prog_be94a9f26772f5b7_access_map_in_array+0xe6/0xf6 trace_call_bpf+0x1aa/0x580 kprobe_perf_func+0xdd/0x430 kprobe_dispatcher+0xa0/0xb0 kprobe_ftrace_handler+0x18b/0x2e0 0xffffffffc02280f7 RIP: 0010:__x64_sys_getpgid+0x1/0x30 ...... Allocated by task 1857: kasan_save_stack+0x26/0x50 kasan_set_track+0x25/0x40 kasan_save_alloc_info+0x1e/0x30 __kasan_kmalloc+0x98/0xa0 __kmalloc_node+0x6a/0x150 __bpf_map_area_alloc+0x141/0x170 bpf_map_area_alloc+0x10/0x20 array_map_alloc+0x11f/0x310 map_create+0x28a/0xb40 __sys_bpf+0x753/0x37c0 __x64_sys_bpf+0x44/0x60 do_syscall_64+0x36/0xb0 entry_SYSCALL_64_after_hwframe+0x6e/0x76 Freed by task 11: kasan_save_stack+0x26/0x50 kasan_set_track+0x25/0x40 kasan_save_free_info+0x2b/0x50 __kasan_slab_free+0x113/0x190 slab_free_freelist_hook+0xd7/0x1e0 __kmem_cache_free+0x170/0x260 kfree+0x9b/0x160 kvfree+0x2d/0x40 bpf_map_area_free+0xe/0x20 array_map_free+0x120/0x2c0 bpf_map_free_deferred+0xd7/0x1e0 process_one_work+0x462/0x990 worker_thread+0x370/0x670 kthread+0x1b0/0x200 ret_from_fork+0x3a/0x70 ret_from_fork_asm+0x1b/0x30 Last potentially related work creation: kasan_save_stack+0x26/0x50 __kasan_record_aux_stack+0x94/0xb0 kasan_record_aux_stack_noalloc+0xb/0x20 __queue_work+0x331/0x950 queue_work_on+0x75/0x80 bpf_map_put+0xfa/0x160 bpf_map_fd_put_ptr+0xe/0x20 bpf_fd_array_map_update_elem+0x174/0x1b0 bpf_map_update_value+0x2b7/0x4a0 __sys_bpf+0x2551/0x37c0 __x64_sys_bpf+0x44/0x60 do_syscall_64+0x36/0xb0 entry_SYSCALL_64_after_hwframe+0x6e/0x76 Signed-off-by: Hou Tao --- .../selftests/bpf/prog_tests/map_in_map.c | 141 ++++++++++++++++++ .../selftests/bpf/progs/access_map_in_map.c | 93 ++++++++++++ 2 files changed, 234 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/map_in_map.c create mode 100644 tools/testing/selftests/bpf/progs/access_map_in_map.c diff --git a/tools/testing/selftests/bpf/prog_tests/map_in_map.c b/tools/testing/selftests/bpf/prog_tests/map_in_map.c new file mode 100644 index 000000000000..d2a10eb4e5b5 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/map_in_map.c @@ -0,0 +1,141 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2023. Huawei Technologies Co., Ltd */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include "access_map_in_map.skel.h" + +struct thread_ctx { + pthread_barrier_t barrier; + int outer_map_fd; + int start, abort; + int loop, err; +}; + +static int wait_for_start_or_abort(struct thread_ctx *ctx) +{ + while (!ctx->start && !ctx->abort) + usleep(1); + return ctx->abort ? -1 : 0; +} + +static void *update_map_fn(void *data) +{ + struct thread_ctx *ctx = data; + int loop = ctx->loop, err = 0; + + if (wait_for_start_or_abort(ctx) < 0) + return NULL; + pthread_barrier_wait(&ctx->barrier); + + while (loop-- > 0) { + int fd, zero = 0; + + fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, 4, 4, 1, NULL); + if (fd < 0) { + err |= 1; + pthread_barrier_wait(&ctx->barrier); + continue; + } + + /* Remove the old inner map */ + if (bpf_map_update_elem(ctx->outer_map_fd, &zero, &fd, 0) < 0) + err |= 2; + close(fd); + pthread_barrier_wait(&ctx->barrier); + } + + ctx->err = err; + + return NULL; +} + +static void *access_map_fn(void *data) +{ + struct thread_ctx *ctx = data; + int loop = ctx->loop; + + if (wait_for_start_or_abort(ctx) < 0) + return NULL; + pthread_barrier_wait(&ctx->barrier); + + while (loop-- > 0) { + /* Access the old inner map */ + syscall(SYS_getpgid); + pthread_barrier_wait(&ctx->barrier); + } + + return NULL; +} + +static void test_map_in_map_access(const char *prog_name, const char *map_name) +{ + struct access_map_in_map *skel; + struct bpf_map *outer_map; + struct bpf_program *prog; + struct thread_ctx ctx; + pthread_t tid[2]; + int err; + + skel = access_map_in_map__open(); + if (!ASSERT_OK_PTR(skel, "access_map_in_map open")) + return; + + prog = bpf_object__find_program_by_name(skel->obj, prog_name); + if (!ASSERT_OK_PTR(prog, "find program")) + goto out; + bpf_program__set_autoload(prog, true); + + outer_map = bpf_object__find_map_by_name(skel->obj, map_name); + if (!ASSERT_OK_PTR(outer_map, "find map")) + goto out; + + err = access_map_in_map__load(skel); + if (!ASSERT_OK(err, "access_map_in_map load")) + goto out; + + err = access_map_in_map__attach(skel); + if (!ASSERT_OK(err, "access_map_in_map attach")) + goto out; + + skel->bss->tgid = getpid(); + + memset(&ctx, 0, sizeof(ctx)); + pthread_barrier_init(&ctx.barrier, NULL, 2); + ctx.outer_map_fd = bpf_map__fd(outer_map); + ctx.loop = 4; + + err = pthread_create(&tid[0], NULL, update_map_fn, &ctx); + if (!ASSERT_OK(err, "close_thread")) + goto out; + + err = pthread_create(&tid[1], NULL, access_map_fn, &ctx); + if (!ASSERT_OK(err, "read_thread")) { + ctx.abort = 1; + pthread_join(tid[0], NULL); + goto out; + } + + ctx.start = 1; + pthread_join(tid[0], NULL); + pthread_join(tid[1], NULL); + + ASSERT_OK(ctx.err, "err"); +out: + access_map_in_map__destroy(skel); +} + +void test_map_in_map(void) +{ + if (test__start_subtest("acc_map_in_array")) + test_map_in_map_access("access_map_in_array", "outer_array_map"); + if (test__start_subtest("sleepable_acc_map_in_array")) + test_map_in_map_access("sleepable_access_map_in_array", "outer_array_map"); + if (test__start_subtest("acc_map_in_htab")) + test_map_in_map_access("access_map_in_htab", "outer_htab_map"); + if (test__start_subtest("sleepable_acc_map_in_htab")) + test_map_in_map_access("sleepable_access_map_in_htab", "outer_htab_map"); +} + diff --git a/tools/testing/selftests/bpf/progs/access_map_in_map.c b/tools/testing/selftests/bpf/progs/access_map_in_map.c new file mode 100644 index 000000000000..1126871c2ebd --- /dev/null +++ b/tools/testing/selftests/bpf/progs/access_map_in_map.c @@ -0,0 +1,93 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2023. Huawei Technologies Co., Ltd */ +#include +#include +#include + +#include "bpf_misc.h" + +struct inner_map_type { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(key_size, 4); + __uint(value_size, 4); + __uint(max_entries, 1); +} inner_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __type(key, int); + __type(value, int); + __uint(max_entries, 1); + __array(values, struct inner_map_type); +} outer_array_map SEC(".maps") = { + .values = { + [0] = &inner_map, + }, +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS); + __type(key, int); + __type(value, int); + __uint(max_entries, 1); + __array(values, struct inner_map_type); +} outer_htab_map SEC(".maps") = { + .values = { + [0] = &inner_map, + }, +}; + +char _license[] SEC("license") = "GPL"; + +int tgid = 0; + +static int acc_map_in_map(void *outer_map) +{ + int i, key, value = 0xdeadbeef; + void *inner_map; + + if ((bpf_get_current_pid_tgid() >> 32) != tgid) + return 0; + + /* Find nonexistent inner map */ + key = 1; + inner_map = bpf_map_lookup_elem(outer_map, &key); + if (inner_map) + return 0; + + /* Find the old inner map */ + key = 0; + inner_map = bpf_map_lookup_elem(outer_map, &key); + if (!inner_map) + return 0; + + /* Wait for the old inner map to be replaced */ + for (i = 0; i < 2048; i++) + bpf_map_update_elem(inner_map, &key, &value, 0); + + return 0; +} + +SEC("?kprobe/" SYS_PREFIX "sys_getpgid") +int access_map_in_array(void *ctx) +{ + return acc_map_in_map(&outer_array_map); +} + +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +int sleepable_access_map_in_array(void *ctx) +{ + return acc_map_in_map(&outer_array_map); +} + +SEC("?kprobe/" SYS_PREFIX "sys_getpgid") +int access_map_in_htab(void *ctx) +{ + return acc_map_in_map(&outer_htab_map); +} + +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +int sleepable_access_map_in_htab(void *ctx) +{ + return acc_map_in_map(&outer_htab_map); +} From patchwork Fri Nov 24 11:30:33 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Hou Tao X-Patchwork-Id: 13467562 X-Patchwork-Delegate: bpf@iogearbox.net Authentication-Results: smtp.subspace.kernel.org; dkim=none Received: from dggsgout11.his.huawei.com (dggsgout11.his.huawei.com [45.249.212.51]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 8B49710EB for ; Fri, 24 Nov 2023 03:29:35 -0800 (PST) Received: from mail.maildlp.com (unknown [172.19.93.142]) by dggsgout11.his.huawei.com (SkyGuard) with ESMTP id 4ScCS86FdSz4f3jZ6 for ; Fri, 24 Nov 2023 19:29:28 +0800 (CST) Received: from mail02.huawei.com (unknown [10.116.40.112]) by mail.maildlp.com (Postfix) with ESMTP id 9E09A1A06DC for ; Fri, 24 Nov 2023 19:29:31 +0800 (CST) Received: from huaweicloud.com (unknown [10.175.124.27]) by APP1 (Coremail) with SMTP id cCh0CgDn6xEUiWBloME3Bw--.31197S10; Fri, 24 Nov 2023 19:29:31 +0800 (CST) From: Hou Tao To: bpf@vger.kernel.org Cc: Martin KaFai Lau , Alexei Starovoitov , Andrii Nakryiko , Song Liu , Hao Luo , Yonghong Song , Daniel Borkmann , KP Singh , Stanislav Fomichev , Jiri Olsa , John Fastabend , houtao1@huawei.com Subject: [PATCH bpf v3 6/6] selftests/bpf: Test outer map update operations in syscall program Date: Fri, 24 Nov 2023 19:30:33 +0800 Message-Id: <20231124113033.503338-7-houtao@huaweicloud.com> X-Mailer: git-send-email 2.29.2 In-Reply-To: <20231124113033.503338-1-houtao@huaweicloud.com> References: <20231124113033.503338-1-houtao@huaweicloud.com> Precedence: bulk X-Mailing-List: bpf@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 X-CM-TRANSID: cCh0CgDn6xEUiWBloME3Bw--.31197S10 X-Coremail-Antispam: 1UD129KBjvJXoW3WF4kXr1kJFW7KryxtF17Awb_yoW7Kry8pa yrGa4UtF4rXF43Gry8Xa17WayfKF1vg343trsrtw1YvFnrXr1SqFyIgay7tF13WrZ3Zr4F vay7trZ5Aw4UZFDanT9S1TB71UUUUUUqnTZGkaVYY2UrUUUUjbIjqfuFe4nvWSU5nxnvy2 9KBjDU0xBIdaVrnRJUUUBIb4IE77IF4wAFF20E14v26rWj6s0DM7CY07I20VC2zVCF04k2 6cxKx2IYs7xG6rWj6s0DM7CIcVAFz4kK6r1j6r18M28IrcIa0xkI8VA2jI8067AKxVWUAV Cq3wA2048vs2IY020Ec7CjxVAFwI0_Xr0E3s1l8cAvFVAK0II2c7xJM28CjxkF64kEwVA0 rcxSw2x7M28EF7xvwVC0I7IYx2IY67AKxVW7JVWDJwA2z4x0Y4vE2Ix0cI8IcVCY1x0267 AKxVW8Jr0_Cr1UM28EF7xvwVC2z280aVAFwI0_GcCE3s1l84ACjcxK6I8E87Iv6xkF7I0E 14v26rxl6s0DM2AIxVAIcxkEcVAq07x20xvEncxIr21l5I8CrVACY4xI64kE6c02F40Ex7 xfMcIj6xIIjxv20xvE14v26r1j6r18McIj6I8E87Iv67AKxVWUJVW8JwAm72CE4IkC6x0Y z7v_Jr0_Gr1lF7xvr2IYc2Ij64vIr41lFIxGxcIEc7CjxVA2Y2ka0xkIwI1l42xK82IYc2 Ij64vIr41l4I8I3I0E4IkC6x0Yz7v_Jr0_Gr1lx2IqxVAqx4xG67AKxVWUJVWUGwC20s02 6x8GjcxK67AKxVWUGVWUWwC2zVAF1VAY17CE14v26r1q6r43MIIYrxkI7VAKI48JMIIF0x vE2Ix0cI8IcVAFwI0_JFI_Gr1lIxAIcVC0I7IYx2IY6xkF7I0E14v26F4j6r4UJwCI42IY 6xAIw20EY4v20xvaj40_Jr0_JF4lIxAIcVC2z280aVAFwI0_Jr0_Gr1lIxAIcVC2z280aV CY1x0267AKxVW8JVW8JrUvcSsGvfC2KfnxnUUI43ZEXa7IU13l1DUUUUU== X-CM-SenderInfo: xkrx3t3r6k3tpzhluzxrxghudrp/ X-Patchwork-Delegate: bpf@iogearbox.net From: Hou Tao Syscall program is running with rcu_read_lock_trace being held, so if bpf_map_update_elem() or bpf_map_delete_elem() invokes synchronize_rcu_tasks_trace() when operating on an outer map, there will be dead-lock, so add a test to guarantee that it is dead-lock free. Signed-off-by: Hou Tao --- .../selftests/bpf/prog_tests/syscall.c | 30 +++++- tools/testing/selftests/bpf/progs/syscall.c | 91 ++++++++++++++++++- 2 files changed, 114 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/syscall.c b/tools/testing/selftests/bpf/prog_tests/syscall.c index f4d40001155a..0be8301c0ffd 100644 --- a/tools/testing/selftests/bpf/prog_tests/syscall.c +++ b/tools/testing/selftests/bpf/prog_tests/syscall.c @@ -12,7 +12,7 @@ struct args { int btf_fd; }; -void test_syscall(void) +static void test_syscall_load_prog(void) { static char verifier_log[8192]; struct args ctx = { @@ -32,7 +32,7 @@ void test_syscall(void) if (!ASSERT_OK_PTR(skel, "skel_load")) goto cleanup; - prog_fd = bpf_program__fd(skel->progs.bpf_prog); + prog_fd = bpf_program__fd(skel->progs.load_prog); err = bpf_prog_test_run_opts(prog_fd, &tattr); ASSERT_EQ(err, 0, "err"); ASSERT_EQ(tattr.retval, 1, "retval"); @@ -53,3 +53,29 @@ void test_syscall(void) if (ctx.btf_fd > 0) close(ctx.btf_fd); } + +static void test_syscall_update_outer_map(void) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + struct syscall *skel; + int err, prog_fd; + + skel = syscall__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_load")) + goto cleanup; + + prog_fd = bpf_program__fd(skel->progs.update_outer_map); + err = bpf_prog_test_run_opts(prog_fd, &opts); + ASSERT_EQ(err, 0, "err"); + ASSERT_EQ(opts.retval, 1, "retval"); +cleanup: + syscall__destroy(skel); +} + +void test_syscall(void) +{ + if (test__start_subtest("load_prog")) + test_syscall_load_prog(); + if (test__start_subtest("update_outer_map")) + test_syscall_update_outer_map(); +} diff --git a/tools/testing/selftests/bpf/progs/syscall.c b/tools/testing/selftests/bpf/progs/syscall.c index e550f728962d..23058abbaabd 100644 --- a/tools/testing/selftests/bpf/progs/syscall.c +++ b/tools/testing/selftests/bpf/progs/syscall.c @@ -6,9 +6,15 @@ #include #include <../../../tools/include/linux/filter.h> #include +#include +#include char _license[] SEC("license") = "GPL"; +struct bpf_map { + int id; +} __attribute__((preserve_access_index)); + struct args { __u64 log_buf; __u32 log_size; @@ -27,6 +33,32 @@ struct args { BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_INT, 0, 0), sz), \ BTF_INT_ENC(encoding, bits_offset, bits) +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, int); + __type(value, union bpf_attr); + __uint(max_entries, 1); +} bpf_attr_array SEC(".maps"); + +struct inner_map_type { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(key_size, 4); + __uint(value_size, 4); + __uint(max_entries, 1); +} inner_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __type(key, int); + __type(value, int); + __uint(max_entries, 1); + __array(values, struct inner_map_type); +} outer_array_map SEC(".maps") = { + .values = { + [0] = &inner_map, + }, +}; + static int btf_load(void) { struct btf_blob { @@ -58,7 +90,7 @@ static int btf_load(void) } SEC("syscall") -int bpf_prog(struct args *ctx) +int load_prog(struct args *ctx) { static char license[] = "GPL"; static struct bpf_insn insns[] = { @@ -94,8 +126,8 @@ int bpf_prog(struct args *ctx) map_create_attr.max_entries = ctx->max_entries; map_create_attr.btf_fd = ret; - prog_load_attr.license = (long) license; - prog_load_attr.insns = (long) insns; + prog_load_attr.license = (unsigned long)license; + prog_load_attr.insns = (unsigned long)insns; prog_load_attr.log_buf = ctx->log_buf; prog_load_attr.log_size = ctx->log_size; prog_load_attr.log_level = 1; @@ -107,8 +139,8 @@ int bpf_prog(struct args *ctx) insns[3].imm = ret; map_update_attr.map_fd = ret; - map_update_attr.key = (long) &key; - map_update_attr.value = (long) &value; + map_update_attr.key = (unsigned long)&key; + map_update_attr.value = (unsigned long)&value; ret = bpf_sys_bpf(BPF_MAP_UPDATE_ELEM, &map_update_attr, sizeof(map_update_attr)); if (ret < 0) return ret; @@ -119,3 +151,52 @@ int bpf_prog(struct args *ctx) ctx->prog_fd = ret; return 1; } + +SEC("syscall") +int update_outer_map(void *ctx) +{ + int zero = 0, ret = 0, outer_fd = -1, inner_fd = -1, err; + const int attr_sz = sizeof(union bpf_attr); + union bpf_attr *attr; + + attr = bpf_map_lookup_elem((struct bpf_map *)&bpf_attr_array, &zero); + if (!attr) + goto out; + + memset(attr, 0, attr_sz); + attr->map_id = ((struct bpf_map *)&outer_array_map)->id; + outer_fd = bpf_sys_bpf(BPF_MAP_GET_FD_BY_ID, attr, attr_sz); + if (outer_fd < 0) + goto out; + + memset(attr, 0, attr_sz); + attr->map_type = BPF_MAP_TYPE_ARRAY; + attr->key_size = 4; + attr->value_size = 4; + attr->max_entries = 1; + inner_fd = bpf_sys_bpf(BPF_MAP_CREATE, attr, attr_sz); + if (inner_fd < 0) + goto out; + + memset(attr, 0, attr_sz); + attr->map_fd = outer_fd; + attr->key = (unsigned long)&zero; + attr->value = (unsigned long)&inner_fd; + err = bpf_sys_bpf(BPF_MAP_UPDATE_ELEM, attr, attr_sz); + if (err) + goto out; + + memset(attr, 0, attr_sz); + attr->map_fd = outer_fd; + attr->key = (unsigned long)&zero; + err = bpf_sys_bpf(BPF_MAP_DELETE_ELEM, attr, attr_sz); + if (err) + goto out; + ret = 1; +out: + if (inner_fd >= 0) + bpf_sys_close(inner_fd); + if (outer_fd >= 0) + bpf_sys_close(outer_fd); + return ret; +}