diff mbox series

[bpf,05/11] bpf: Add bpf_map_of_map_fd_{get,put}_ptr() helpers

Message ID 20231107140702.1891778-6-houtao@huaweicloud.com (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series bpf: Fix the release of inner map | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf, async
netdev/fixes_present success Fixes tag present in non-next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1328 this patch: 1328
netdev/cc_maintainers success CCed 12 of 12 maintainers
netdev/build_clang success Errors and warnings before: 1342 this patch: 1342
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1356 this patch: 1356
netdev/checkpatch warning CHECK: Alignment should match open parenthesis WARNING: line length of 85 exceeds 80 columns WARNING: line length of 87 exceeds 80 columns WARNING: line length of 98 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-VM_Test-2 success Logs for Validate matrix.py
bpf/vmtest-bpf-VM_Test-3 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-VM_Test-4 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-8 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-VM_Test-7 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-5 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-6 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-9 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-VM_Test-14 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-VM_Test-15 success Logs for set-matrix
bpf/vmtest-bpf-VM_Test-16 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-VM_Test-17 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-18 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-19 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-20 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-21 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-22 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-23 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-24 success Logs for x86_64-llvm-16 / build / build for x86_64 with llvm-16
bpf/vmtest-bpf-VM_Test-25 success Logs for x86_64-llvm-16 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-VM_Test-26 success Logs for x86_64-llvm-16 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-VM_Test-27 success Logs for x86_64-llvm-16 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-VM_Test-28 success Logs for x86_64-llvm-16 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-VM_Test-29 success Logs for x86_64-llvm-16 / veristat
bpf/vmtest-bpf-VM_Test-13 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-VM_Test-12 fail Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-PR fail PR summary
bpf/vmtest-bpf-VM_Test-10 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc
bpf/vmtest-bpf-VM_Test-11 fail Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc

Commit Message

Hou Tao Nov. 7, 2023, 2:06 p.m. UTC
From: Hou Tao <houtao1@huawei.com>

bpf_map_of_map_fd_get_ptr() will convert the map fd to the pointer
saved in map-in-map. bpf_map_of_map_fd_put_ptr() will release the
pointer saved in map-in-map. These two helpers will be used by the
following patches to fix the use-after-free problems for map-in-map.

Signed-off-by: Hou Tao <houtao1@huawei.com>
---
 kernel/bpf/map_in_map.c | 51 +++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/map_in_map.h | 11 +++++++--
 2 files changed, 60 insertions(+), 2 deletions(-)

Comments

Martin KaFai Lau Nov. 9, 2023, 6:36 a.m. UTC | #1
On 11/7/23 6:06 AM, Hou Tao wrote:
> From: Hou Tao <houtao1@huawei.com>
> 
> bpf_map_of_map_fd_get_ptr() will convert the map fd to the pointer
> saved in map-in-map. bpf_map_of_map_fd_put_ptr() will release the
> pointer saved in map-in-map. These two helpers will be used by the
> following patches to fix the use-after-free problems for map-in-map.
> 
> Signed-off-by: Hou Tao <houtao1@huawei.com>
> ---
>   kernel/bpf/map_in_map.c | 51 +++++++++++++++++++++++++++++++++++++++++
>   kernel/bpf/map_in_map.h | 11 +++++++--
>   2 files changed, 60 insertions(+), 2 deletions(-)
> 
> diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
> index 8323ce201159d..96e32f4167c4e 100644
> --- a/kernel/bpf/map_in_map.c
> +++ b/kernel/bpf/map_in_map.c
> @@ -4,6 +4,7 @@
>   #include <linux/slab.h>
>   #include <linux/bpf.h>
>   #include <linux/btf.h>
> +#include <linux/rcupdate.h>
>   
>   #include "map_in_map.h"
>   
> @@ -139,3 +140,53 @@ u32 bpf_map_fd_sys_lookup_elem(void *ptr)
>   {
>   	return ((struct bpf_map *)ptr)->id;
>   }
> +
> +void *bpf_map_of_map_fd_get_ptr(struct bpf_map *map, struct file *map_file,
> +			       int ufd)
> +{
> +	struct bpf_inner_map_element *element;
> +	struct bpf_map *inner_map;
> +
> +	element = kmalloc(sizeof(*element), GFP_KERNEL);
> +	if (!element)
> +		return ERR_PTR(-ENOMEM);
> +
> +	inner_map = bpf_map_fd_get_ptr(map, map_file, ufd);
> +	if (IS_ERR(inner_map)) {
> +		kfree(element);
> +		return inner_map;
> +	}
> +
> +	element->map = inner_map;
> +	return element;
> +}
> +
> +static void bpf_inner_map_element_free_rcu(struct rcu_head *rcu)
> +{
> +	struct bpf_inner_map_element *elem = container_of(rcu, struct bpf_inner_map_element, rcu);
> +
> +	bpf_map_put(elem->map);
> +	kfree(elem);
> +}
> +
> +static void bpf_inner_map_element_free_tt_rcu(struct rcu_head *rcu)
> +{
> +	if (rcu_trace_implies_rcu_gp())
> +		bpf_inner_map_element_free_rcu(rcu);
> +	else
> +		call_rcu(rcu, bpf_inner_map_element_free_rcu);
> +}
> +
> +void bpf_map_of_map_fd_put_ptr(void *ptr, bool need_defer)
> +{
> +	struct bpf_inner_map_element *element = ptr;
> +
> +	/* Do bpf_map_put() after a RCU grace period and a tasks trace
> +	 * RCU grace period, so it is certain that the bpf program which is
> +	 * manipulating the map now has exited when bpf_map_put() is called.
> +	 */
> +	if (need_defer)

"need_defer" should only happen from the syscall cmd? Instead of adding rcu_head 
to each element, how about "synchronize_rcu_mult(call_rcu, call_rcu_tasks)" here?

> +		call_rcu_tasks_trace(&element->rcu, bpf_inner_map_element_free_tt_rcu);
> +	else
> +		bpf_inner_map_element_free_rcu(&element->rcu);
> +}
> diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h
> index 63872bffd9b3c..8d38496e5179b 100644
> --- a/kernel/bpf/map_in_map.h
> +++ b/kernel/bpf/map_in_map.h
> @@ -9,11 +9,18 @@
>   struct file;
>   struct bpf_map;
>   
> +struct bpf_inner_map_element {
> +	struct bpf_map *map;
> +	struct rcu_head rcu;
> +};
> +
>   struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd);
>   void bpf_map_meta_free(struct bpf_map *map_meta);
> -void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file,
> -			 int ufd);
> +void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, int ufd);
>   void bpf_map_fd_put_ptr(void *ptr, bool need_defer);
>   u32 bpf_map_fd_sys_lookup_elem(void *ptr);
>   
> +void *bpf_map_of_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, int ufd);
> +void bpf_map_of_map_fd_put_ptr(void *ptr, bool need_defer);
> +
>   #endif
Hou Tao Nov. 9, 2023, 7:26 a.m. UTC | #2
Hi,

On 11/9/2023 2:36 PM, Martin KaFai Lau wrote:
> On 11/7/23 6:06 AM, Hou Tao wrote:
>> From: Hou Tao <houtao1@huawei.com>
>>
>> bpf_map_of_map_fd_get_ptr() will convert the map fd to the pointer
>> saved in map-in-map. bpf_map_of_map_fd_put_ptr() will release the
>> pointer saved in map-in-map. These two helpers will be used by the
>> following patches to fix the use-after-free problems for map-in-map.
>>
>> Signed-off-by: Hou Tao <houtao1@huawei.com>
>> ---
>>   kernel/bpf/map_in_map.c | 51 +++++++++++++++++++++++++++++++++++++++++
>>   kernel/bpf/map_in_map.h | 11 +++++++--
>>   2 files changed, 60 insertions(+), 2 deletions(-)
>>
>>
SNIP
>> +void bpf_map_of_map_fd_put_ptr(void *ptr, bool need_defer)
>> +{
>> +    struct bpf_inner_map_element *element = ptr;
>> +
>> +    /* Do bpf_map_put() after a RCU grace period and a tasks trace
>> +     * RCU grace period, so it is certain that the bpf program which is
>> +     * manipulating the map now has exited when bpf_map_put() is
>> called.
>> +     */
>> +    if (need_defer)
>
> "need_defer" should only happen from the syscall cmd? Instead of
> adding rcu_head to each element, how about
> "synchronize_rcu_mult(call_rcu, call_rcu_tasks)" here?

No. I have tried that method before, but it didn't work due to a
dead-lock (I will mention that in the commit message in v2). The reason
is that a bpf syscall program may also do a map update through the
sys_bpf helper. Because a bpf syscall program runs in a sleepable
context with rcu_read_lock_trace held, calling
synchronize_rcu_mult(call_rcu, call_rcu_tasks) will lead to a dead-lock.
>
>> +        call_rcu_tasks_trace(&element->rcu,
>> bpf_inner_map_element_free_tt_rcu);
>> +    else
>> +        bpf_inner_map_element_free_rcu(&element->rcu);
>> +}
>> diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h
>> index 63872bffd9b3c..8d38496e5179b 100644
>> --- a/kernel/bpf/map_in_map.h
>> +++ b/kernel/bpf/map_in_map.h
>> @@ -9,11 +9,18 @@
>>   struct file;
>>   struct bpf_map;
>>   +struct bpf_inner_map_element {
>> +    struct bpf_map *map;
>> +    struct rcu_head rcu;
>> +};
>> +
>>   struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd);
>>   void bpf_map_meta_free(struct bpf_map *map_meta);
>> -void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file,
>> -             int ufd);
>> +void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file,
>> int ufd);
>>   void bpf_map_fd_put_ptr(void *ptr, bool need_defer);
>>   u32 bpf_map_fd_sys_lookup_elem(void *ptr);
>>   +void *bpf_map_of_map_fd_get_ptr(struct bpf_map *map, struct file
>> *map_file, int ufd);
>> +void bpf_map_of_map_fd_put_ptr(void *ptr, bool need_defer);
>> +
>>   #endif
Alexei Starovoitov Nov. 9, 2023, 3:55 p.m. UTC | #3
On Wed, Nov 8, 2023 at 11:26 PM Hou Tao <houtao@huaweicloud.com> wrote:
>
> Hi,
>
> On 11/9/2023 2:36 PM, Martin KaFai Lau wrote:
> > On 11/7/23 6:06 AM, Hou Tao wrote:
> >> From: Hou Tao <houtao1@huawei.com>
> >>
> >> bpf_map_of_map_fd_get_ptr() will convert the map fd to the pointer
> >> saved in map-in-map. bpf_map_of_map_fd_put_ptr() will release the
> >> pointer saved in map-in-map. These two helpers will be used by the
> >> following patches to fix the use-after-free problems for map-in-map.
> >>
> >> Signed-off-by: Hou Tao <houtao1@huawei.com>
> >> ---
> >>   kernel/bpf/map_in_map.c | 51 +++++++++++++++++++++++++++++++++++++++++
> >>   kernel/bpf/map_in_map.h | 11 +++++++--
> >>   2 files changed, 60 insertions(+), 2 deletions(-)
> >>
> >>
> SNIP
> >> +void bpf_map_of_map_fd_put_ptr(void *ptr, bool need_defer)
> >> +{
> >> +    struct bpf_inner_map_element *element = ptr;
> >> +
> >> +    /* Do bpf_map_put() after a RCU grace period and a tasks trace
> >> +     * RCU grace period, so it is certain that the bpf program which is
> >> +     * manipulating the map now has exited when bpf_map_put() is
> >> called.
> >> +     */
> >> +    if (need_defer)
> >
> > "need_defer" should only happen from the syscall cmd? Instead of
> > adding rcu_head to each element, how about
> > "synchronize_rcu_mult(call_rcu, call_rcu_tasks)" here?
>
> No. I have tried the method before, but it didn't work due to dead-lock
> (will mention that in commit message in v2). The reason is that bpf
> syscall program may also do map update through sys_bpf helper. Because
> bpf syscall program is running with sleep-able context and has
> rcu_read_lock_trace being held, so call synchronize_rcu_mult(call_rcu,
> call_rcu_tasks) will lead to dead-lock.

Dead-lock? why?

I think it's legal to do call_rcu_tasks_trace() while inside RCU CS
or RCU tasks trace CS.
Paul E. McKenney Nov. 9, 2023, 7:55 p.m. UTC | #4
On Thu, Nov 09, 2023 at 07:55:50AM -0800, Alexei Starovoitov wrote:
> On Wed, Nov 8, 2023 at 11:26 PM Hou Tao <houtao@huaweicloud.com> wrote:
> >
> > Hi,
> >
> > On 11/9/2023 2:36 PM, Martin KaFai Lau wrote:
> > > On 11/7/23 6:06 AM, Hou Tao wrote:
> > >> From: Hou Tao <houtao1@huawei.com>
> > >>
> > >> bpf_map_of_map_fd_get_ptr() will convert the map fd to the pointer
> > >> saved in map-in-map. bpf_map_of_map_fd_put_ptr() will release the
> > >> pointer saved in map-in-map. These two helpers will be used by the
> > >> following patches to fix the use-after-free problems for map-in-map.
> > >>
> > >> Signed-off-by: Hou Tao <houtao1@huawei.com>
> > >> ---
> > >>   kernel/bpf/map_in_map.c | 51 +++++++++++++++++++++++++++++++++++++++++
> > >>   kernel/bpf/map_in_map.h | 11 +++++++--
> > >>   2 files changed, 60 insertions(+), 2 deletions(-)
> > >>
> > >>
> > SNIP
> > >> +void bpf_map_of_map_fd_put_ptr(void *ptr, bool need_defer)
> > >> +{
> > >> +    struct bpf_inner_map_element *element = ptr;
> > >> +
> > >> +    /* Do bpf_map_put() after a RCU grace period and a tasks trace
> > >> +     * RCU grace period, so it is certain that the bpf program which is
> > >> +     * manipulating the map now has exited when bpf_map_put() is
> > >> called.
> > >> +     */
> > >> +    if (need_defer)
> > >
> > > "need_defer" should only happen from the syscall cmd? Instead of
> > > adding rcu_head to each element, how about
> > > "synchronize_rcu_mult(call_rcu, call_rcu_tasks)" here?
> >
> > No. I have tried the method before, but it didn't work due to dead-lock
> > (will mention that in commit message in v2). The reason is that bpf
> > syscall program may also do map update through sys_bpf helper. Because
> > bpf syscall program is running with sleep-able context and has
> > rcu_read_lock_trace being held, so call synchronize_rcu_mult(call_rcu,
> > call_rcu_tasks) will lead to dead-lock.
> 
> Dead-lock? why?
> 
> I think it's legal to do call_rcu_tasks_trace() while inside RCU CS
> or RCU tasks trace CS.

Just confirming that this is the case.  If invoking call_rcu_tasks_trace()
while under either rcu_read_lock() or rcu_read_lock_trace() deadlocks,
then there is a bug that needs fixing.  ;-)

							Thanx, Paul
Hou Tao Nov. 10, 2023, 1:06 a.m. UTC | #5
Hi,

On 11/10/2023 3:55 AM, Paul E. McKenney wrote:
> On Thu, Nov 09, 2023 at 07:55:50AM -0800, Alexei Starovoitov wrote:
>> On Wed, Nov 8, 2023 at 11:26 PM Hou Tao <houtao@huaweicloud.com> wrote:
>>> Hi,
>>>
>>> On 11/9/2023 2:36 PM, Martin KaFai Lau wrote:
>>>> On 11/7/23 6:06 AM, Hou Tao wrote:
>>>>> From: Hou Tao <houtao1@huawei.com>
>>>>>
>>>>> bpf_map_of_map_fd_get_ptr() will convert the map fd to the pointer
>>>>> saved in map-in-map. bpf_map_of_map_fd_put_ptr() will release the
>>>>> pointer saved in map-in-map. These two helpers will be used by the
>>>>> following patches to fix the use-after-free problems for map-in-map.
>>>>>
>>>>> Signed-off-by: Hou Tao <houtao1@huawei.com>
>>>>> ---
>>>>>   kernel/bpf/map_in_map.c | 51 +++++++++++++++++++++++++++++++++++++++++
>>>>>   kernel/bpf/map_in_map.h | 11 +++++++--
>>>>>   2 files changed, 60 insertions(+), 2 deletions(-)
>>>>>
>>>>>
>>> SNIP
>>>>> +void bpf_map_of_map_fd_put_ptr(void *ptr, bool need_defer)
>>>>> +{
>>>>> +    struct bpf_inner_map_element *element = ptr;
>>>>> +
>>>>> +    /* Do bpf_map_put() after a RCU grace period and a tasks trace
>>>>> +     * RCU grace period, so it is certain that the bpf program which is
>>>>> +     * manipulating the map now has exited when bpf_map_put() is
>>>>> called.
>>>>> +     */
>>>>> +    if (need_defer)
>>>> "need_defer" should only happen from the syscall cmd? Instead of
>>>> adding rcu_head to each element, how about
>>>> "synchronize_rcu_mult(call_rcu, call_rcu_tasks)" here?
>>> No. I have tried the method before, but it didn't work due to dead-lock
>>> (will mention that in commit message in v2). The reason is that bpf
>>> syscall program may also do map update through sys_bpf helper. Because
>>> bpf syscall program is running with sleep-able context and has
>>> rcu_read_lock_trace being held, so call synchronize_rcu_mult(call_rcu,
>>> call_rcu_tasks) will lead to dead-lock.
>> Dead-lock? why?
>>
>> I think it's legal to do call_rcu_tasks_trace() while inside RCU CS
>> or RCU tasks trace CS.
> Just confirming that this is the case.  If invoking call_rcu_tasks_trace()
> within under either rcu_read_lock() or rcu_read_lock_trace() deadlocks,
> then there is a bug that needs fixing.  ;-)

The dead-lock case is calling synchronize_rcu_mult(call_rcu,
call_rcu_tasks_trace) while under rcu_read_lock_trace(), and I think it
is expected. Calling call_rcu_tasks_trace() with rcu_read_lock_trace()
held is OK.
>
> 							Thanx, Paul
>
> .
Paul E. McKenney Nov. 10, 2023, 1:45 a.m. UTC | #6
On Fri, Nov 10, 2023 at 09:06:56AM +0800, Hou Tao wrote:
> Hi,
> 
> On 11/10/2023 3:55 AM, Paul E. McKenney wrote:
> > On Thu, Nov 09, 2023 at 07:55:50AM -0800, Alexei Starovoitov wrote:
> >> On Wed, Nov 8, 2023 at 11:26 PM Hou Tao <houtao@huaweicloud.com> wrote:
> >>> Hi,
> >>>
> >>> On 11/9/2023 2:36 PM, Martin KaFai Lau wrote:
> >>>> On 11/7/23 6:06 AM, Hou Tao wrote:
> >>>>> From: Hou Tao <houtao1@huawei.com>
> >>>>>
> >>>>> bpf_map_of_map_fd_get_ptr() will convert the map fd to the pointer
> >>>>> saved in map-in-map. bpf_map_of_map_fd_put_ptr() will release the
> >>>>> pointer saved in map-in-map. These two helpers will be used by the
> >>>>> following patches to fix the use-after-free problems for map-in-map.
> >>>>>
> >>>>> Signed-off-by: Hou Tao <houtao1@huawei.com>
> >>>>> ---
> >>>>>   kernel/bpf/map_in_map.c | 51 +++++++++++++++++++++++++++++++++++++++++
> >>>>>   kernel/bpf/map_in_map.h | 11 +++++++--
> >>>>>   2 files changed, 60 insertions(+), 2 deletions(-)
> >>>>>
> >>>>>
> >>> SNIP
> >>>>> +void bpf_map_of_map_fd_put_ptr(void *ptr, bool need_defer)
> >>>>> +{
> >>>>> +    struct bpf_inner_map_element *element = ptr;
> >>>>> +
> >>>>> +    /* Do bpf_map_put() after a RCU grace period and a tasks trace
> >>>>> +     * RCU grace period, so it is certain that the bpf program which is
> >>>>> +     * manipulating the map now has exited when bpf_map_put() is
> >>>>> called.
> >>>>> +     */
> >>>>> +    if (need_defer)
> >>>> "need_defer" should only happen from the syscall cmd? Instead of
> >>>> adding rcu_head to each element, how about
> >>>> "synchronize_rcu_mult(call_rcu, call_rcu_tasks)" here?
> >>> No. I have tried the method before, but it didn't work due to dead-lock
> >>> (will mention that in commit message in v2). The reason is that bpf
> >>> syscall program may also do map update through sys_bpf helper. Because
> >>> bpf syscall program is running with sleep-able context and has
> >>> rcu_read_lock_trace being held, so call synchronize_rcu_mult(call_rcu,
> >>> call_rcu_tasks) will lead to dead-lock.
> >> Dead-lock? why?
> >>
> >> I think it's legal to do call_rcu_tasks_trace() while inside RCU CS
> >> or RCU tasks trace CS.
> > Just confirming that this is the case.  If invoking call_rcu_tasks_trace()
> > within under either rcu_read_lock() or rcu_read_lock_trace() deadlocks,
> > then there is a bug that needs fixing.  ;-)
> 
> The case for dead-lock is that calling synchronize_rcu_mult(call_rcu,
> call_rcu_tasks_trace) within under rcu_read_lock_trace() and I think it
> is expected. The case that calling call_rcu_tasks_trace() with
> rcu_read_lock_trace() being held is OK.

Very good, you are quite right.  In this particular case, deadlock is
expected behavior.

The problem here is that synchronize_rcu_mult() doesn't just invoke its
arguments, instead, it also waits for all of the corresponding grace
periods to complete.  But if you call this while under the protection of
rcu_read_lock_trace(), then synchronize_rcu_mult(call_rcu_tasks_trace)
cannot return until the corresponding rcu_read_unlock_trace() is
reached, but that rcu_read_unlock_trace() cannot be reached until after
synchronize_rcu_mult(call_rcu_tasks_trace) returns.

(I did leave out the call_rcu argument because it does not participate
in this particular deadlock.)

							Thanx, Paul
Hou Tao Nov. 10, 2023, 2:37 a.m. UTC | #7
Hi Paul,

On 11/10/2023 9:45 AM, Paul E. McKenney wrote:
> On Fri, Nov 10, 2023 at 09:06:56AM +0800, Hou Tao wrote:
>> Hi,
>>
>> On 11/10/2023 3:55 AM, Paul E. McKenney wrote:
>>> On Thu, Nov 09, 2023 at 07:55:50AM -0800, Alexei Starovoitov wrote:
>>>> On Wed, Nov 8, 2023 at 11:26 PM Hou Tao <houtao@huaweicloud.com> wrote:
>>>>> Hi,
>>>>>
>>>>> On 11/9/2023 2:36 PM, Martin KaFai Lau wrote:
>>>>>> On 11/7/23 6:06 AM, Hou Tao wrote:
>>>>>>> From: Hou Tao <houtao1@huawei.com>
>>>>>>>
>>>>>>> bpf_map_of_map_fd_get_ptr() will convert the map fd to the pointer
>>>>>>> saved in map-in-map. bpf_map_of_map_fd_put_ptr() will release the
>>>>>>> pointer saved in map-in-map. These two helpers will be used by the
>>>>>>> following patches to fix the use-after-free problems for map-in-map.
>>>>>>>
>>>>>>> Signed-off-by: Hou Tao <houtao1@huawei.com>
>>>>>>> ---
>>>>>>>   kernel/bpf/map_in_map.c | 51 +++++++++++++++++++++++++++++++++++++++++
>>>>>>>   kernel/bpf/map_in_map.h | 11 +++++++--
>>>>>>>   2 files changed, 60 insertions(+), 2 deletions(-)
>>>>>>>
>>>>>>>
>>>>> SNIP
>>>>>>> +void bpf_map_of_map_fd_put_ptr(void *ptr, bool need_defer)
>>>>>>> +{
>>>>>>> +    struct bpf_inner_map_element *element = ptr;
>>>>>>> +
>>>>>>> +    /* Do bpf_map_put() after a RCU grace period and a tasks trace
>>>>>>> +     * RCU grace period, so it is certain that the bpf program which is
>>>>>>> +     * manipulating the map now has exited when bpf_map_put() is
>>>>>>> called.
>>>>>>> +     */
>>>>>>> +    if (need_defer)
>>>>>> "need_defer" should only happen from the syscall cmd? Instead of
>>>>>> adding rcu_head to each element, how about
>>>>>> "synchronize_rcu_mult(call_rcu, call_rcu_tasks)" here?
>>>>> No. I have tried the method before, but it didn't work due to dead-lock
>>>>> (will mention that in commit message in v2). The reason is that bpf
>>>>> syscall program may also do map update through sys_bpf helper. Because
>>>>> bpf syscall program is running with sleep-able context and has
>>>>> rcu_read_lock_trace being held, so call synchronize_rcu_mult(call_rcu,
>>>>> call_rcu_tasks) will lead to dead-lock.
>>>> Dead-lock? why?
>>>>
>>>> I think it's legal to do call_rcu_tasks_trace() while inside RCU CS
>>>> or RCU tasks trace CS.
>>> Just confirming that this is the case.  If invoking call_rcu_tasks_trace()
>>> within under either rcu_read_lock() or rcu_read_lock_trace() deadlocks,
>>> then there is a bug that needs fixing.  ;-)
>> The case for dead-lock is that calling synchronize_rcu_mult(call_rcu,
>> call_rcu_tasks_trace) within under rcu_read_lock_trace() and I think it
>> is expected. The case that calling call_rcu_tasks_trace() with
>> rcu_read_lock_trace() being held is OK.
> Very good, you are quite right.  In this particular case, deadlock is
> expected behavior.
>
> The problem here is that synchronize_rcu_mult() doesn't just invoke its
> arguments, instead, it also waits for all of the corresponding grace
> periods to complete.  But if you call this while under the protection of
> rcu_read_lock_trace(), then synchronize_rcu_mult(call_rcu_tasks_trace)
> cannot return until the corresponding rcu_read_unlock_trace() is
> reached, but that rcu_read_unlock_trace() cannot be reached until after
> synchronize_rcu_mult(call_rcu_tasks_trace) returns.
>
> (I did leave out the call_rcu argument because it does not participate
> in this particular deadlock.)

Got it. Thanks for the detailed explanation.
>
> 							Thanx, Paul
>
> .
Martin KaFai Lau Nov. 10, 2023, 2:48 a.m. UTC | #8
On 11/9/23 5:45 PM, Paul E. McKenney wrote:
>>>>>>> +void bpf_map_of_map_fd_put_ptr(void *ptr, bool need_defer)
>>>>>>> +{
>>>>>>> +    struct bpf_inner_map_element *element = ptr;
>>>>>>> +
>>>>>>> +    /* Do bpf_map_put() after a RCU grace period and a tasks trace
>>>>>>> +     * RCU grace period, so it is certain that the bpf program which is
>>>>>>> +     * manipulating the map now has exited when bpf_map_put() is
>>>>>>> called.
>>>>>>> +     */
>>>>>>> +    if (need_defer)
>>>>>> "need_defer" should only happen from the syscall cmd? Instead of
>>>>>> adding rcu_head to each element, how about
>>>>>> "synchronize_rcu_mult(call_rcu, call_rcu_tasks)" here?
>>>>> No. I have tried the method before, but it didn't work due to dead-lock
>>>>> (will mention that in commit message in v2). The reason is that bpf
>>>>> syscall program may also do map update through sys_bpf helper. Because
>>>>> bpf syscall program is running with sleep-able context and has
>>>>> rcu_read_lock_trace being held, so call synchronize_rcu_mult(call_rcu,
>>>>> call_rcu_tasks) will lead to dead-lock.

We need to think of a less intrusive solution instead of adding an rcu_head to
each element; lookup would also need an extra dereference.

May be the bpf_map_{update,delete}_elem(&outer_map, ....) should not be done by 
the syscall program? Which selftest does it?

Can the inner_map learn that it has been deleted from an outer map that is used 
in a sleepable prog->aux->used_maps? The bpf_map_free_deferred() will then wait 
for a task_trace gp?
Hou Tao Nov. 10, 2023, 3:34 a.m. UTC | #9
Hi Martin,

On 11/10/2023 10:48 AM, Martin KaFai Lau wrote:
> On 11/9/23 5:45 PM, Paul E. McKenney wrote:
>>>>>>>> +void bpf_map_of_map_fd_put_ptr(void *ptr, bool need_defer)
>>>>>>>> +{
>>>>>>>> +    struct bpf_inner_map_element *element = ptr;
>>>>>>>> +
>>>>>>>> +    /* Do bpf_map_put() after a RCU grace period and a tasks
>>>>>>>> trace
>>>>>>>> +     * RCU grace period, so it is certain that the bpf program
>>>>>>>> which is
>>>>>>>> +     * manipulating the map now has exited when bpf_map_put() is
>>>>>>>> called.
>>>>>>>> +     */
>>>>>>>> +    if (need_defer)
>>>>>>> "need_defer" should only happen from the syscall cmd? Instead of
>>>>>>> adding rcu_head to each element, how about
>>>>>>> "synchronize_rcu_mult(call_rcu, call_rcu_tasks)" here?
>>>>>> No. I have tried the method before, but it didn't work due to
>>>>>> dead-lock
>>>>>> (will mention that in commit message in v2). The reason is that bpf
>>>>>> syscall program may also do map update through sys_bpf helper.
>>>>>> Because
>>>>>> bpf syscall program is running with sleep-able context and has
>>>>>> rcu_read_lock_trace being held, so call
>>>>>> synchronize_rcu_mult(call_rcu,
>>>>>> call_rcu_tasks) will lead to dead-lock.
>
> Need to think of a less intrusive solution instead of adding rcu_head
> to each element and lookup also needs an extra de-referencing.

I see.
>
> May be the bpf_map_{update,delete}_elem(&outer_map, ....) should not
> be done by the syscall program? Which selftest does it?

Currently bpf_map_update_elem is allowed for the bpf_sys_bpf helper. If I
remember correctly, it was the map_ptr selftest.
>
> Can the inner_map learn that it has been deleted from an outer map
> that is used in a sleepable prog->aux->used_maps? The
> bpf_map_free_deferred() will then wait for a task_trace gp?

Considering that an inner_map may be used by multiple outer maps, the
following solution would be simpler: once the inner map has been deleted
from an outer map, its free must be delayed until after one RCU GP and
one tasks trace RCU GP. But I will check whether it is possible to wait
for only one RCU GP instead of two.
Paul E. McKenney Nov. 10, 2023, 4:58 a.m. UTC | #10
On Fri, Nov 10, 2023 at 11:34:03AM +0800, Hou Tao wrote:
> Hi Martin,
> 
> On 11/10/2023 10:48 AM, Martin KaFai Lau wrote:
> > On 11/9/23 5:45 PM, Paul E. McKenney wrote:
> >>>>>>>> +void bpf_map_of_map_fd_put_ptr(void *ptr, bool need_defer)
> >>>>>>>> +{
> >>>>>>>> +    struct bpf_inner_map_element *element = ptr;
> >>>>>>>> +
> >>>>>>>> +    /* Do bpf_map_put() after a RCU grace period and a tasks
> >>>>>>>> trace
> >>>>>>>> +     * RCU grace period, so it is certain that the bpf program
> >>>>>>>> which is
> >>>>>>>> +     * manipulating the map now has exited when bpf_map_put() is
> >>>>>>>> called.
> >>>>>>>> +     */
> >>>>>>>> +    if (need_defer)
> >>>>>>> "need_defer" should only happen from the syscall cmd? Instead of
> >>>>>>> adding rcu_head to each element, how about
> >>>>>>> "synchronize_rcu_mult(call_rcu, call_rcu_tasks)" here?
> >>>>>> No. I have tried the method before, but it didn't work due to
> >>>>>> dead-lock
> >>>>>> (will mention that in commit message in v2). The reason is that bpf
> >>>>>> syscall program may also do map update through sys_bpf helper.
> >>>>>> Because
> >>>>>> bpf syscall program is running with sleep-able context and has
> >>>>>> rcu_read_lock_trace being held, so call
> >>>>>> synchronize_rcu_mult(call_rcu,
> >>>>>> call_rcu_tasks) will lead to dead-lock.
> >
> > Need to think of a less intrusive solution instead of adding rcu_head
> > to each element and lookup also needs an extra de-referencing.
> 
> I see.
> >
> > May be the bpf_map_{update,delete}_elem(&outer_map, ....) should not
> > be done by the syscall program? Which selftest does it?
> 
> Now bpf_map_update_elem is allowed for bpf_sys_bpf helper. If I
> remembered correctly it was map_ptr.
> >
> > Can the inner_map learn that it has been deleted from an outer map
> > that is used in a sleepable prog->aux->used_maps? The
> > bpf_map_free_deferred() will then wait for a task_trace gp?
> 
> Considering an inner_map may be used by multiple outer_map, the
> following solution will be simpler: if the inner map has been deleted
> from an outer map once, its free must be delayed after one RCU GP and
> one tasks trace RCU GP. But I will check whether it is possible to only
> wait for one RCU GP instead of two.

If you are freeing a large quantity of elements at a time, one approach
is to use a single rcu_head structure for the group.  (Or, in this case,
maybe a pair of rcu_head structures, one for call_rcu() and the other
for call_rcu_tasks_trace().)

This requires that you be able to link the elements in the group
together somehow, which requires some per-element storage, but only
one word per element instead of two.

There are other variations on this theme, depending on what constraints
apply here.

							Thanx, Paul
Hou Tao Nov. 13, 2023, 12:53 a.m. UTC | #11
Hi,

On 11/10/2023 12:58 PM, Paul E. McKenney wrote:
> On Fri, Nov 10, 2023 at 11:34:03AM +0800, Hou Tao wrote:
>> Hi Martin,
>>
>> On 11/10/2023 10:48 AM, Martin KaFai Lau wrote:
>>> On 11/9/23 5:45 PM, Paul E. McKenney wrote:
>>>>>>>>>> +void bpf_map_of_map_fd_put_ptr(void *ptr, bool need_defer)
>>>>>>>>>> +{
>>>>>>>>>> +    struct bpf_inner_map_element *element = ptr;
>>>>>>>>>> +
>>>>>>>>>> +    /* Do bpf_map_put() after a RCU grace period and a tasks
>>>>>>>>>> trace
>>>>>>>>>> +     * RCU grace period, so it is certain that the bpf program
>>>>>>>>>> which is
>>>>>>>>>> +     * manipulating the map now has exited when bpf_map_put() is
>>>>>>>>>> called.
>>>>>>>>>> +     */
>>>>>>>>>> +    if (need_defer)
>>>>>>>>> "need_defer" should only happen from the syscall cmd? Instead of
>>>>>>>>> adding rcu_head to each element, how about
>>>>>>>>> "synchronize_rcu_mult(call_rcu, call_rcu_tasks)" here?
>>>>>>>> No. I have tried the method before, but it didn't work due to
>>>>>>>> dead-lock
>>>>>>>> (will mention that in commit message in v2). The reason is that bpf
>>>>>>>> syscall program may also do map update through sys_bpf helper.
>>>>>>>> Because
>>>>>>>> bpf syscall program is running with sleep-able context and has
>>>>>>>> rcu_read_lock_trace being held, so call
>>>>>>>> synchronize_rcu_mult(call_rcu,
>>>>>>>> call_rcu_tasks) will lead to dead-lock.
>>> Need to think of a less intrusive solution instead of adding rcu_head
>>> to each element and lookup also needs an extra de-referencing.
>> I see.
>>> May be the bpf_map_{update,delete}_elem(&outer_map, ....) should not
>>> be done by the syscall program? Which selftest does it?
>> Now bpf_map_update_elem is allowed for bpf_sys_bpf helper. If I
>> remembered correctly it was map_ptr.
>>> Can the inner_map learn that it has been deleted from an outer map
>>> that is used in a sleepable prog->aux->used_maps? The
>>> bpf_map_free_deferred() will then wait for a task_trace gp?
>> Considering an inner_map may be used by multiple outer_map, the
>> following solution will be simpler: if the inner map has been deleted
>> from an outer map once, its free must be delayed after one RCU GP and
>> one tasks trace RCU GP. But I will check whether it is possible to only
>> wait for one RCU GP instead of two.
> If you are freeing a large quantity of elements at a time, one approach
> is to use a single rcu_head structure for the group.  (Or, in this case,
> maybe a pair of rcu_head structures, one for call_rcu() and the other
> for call_rcu_tasks_trace().)
>
> This requires that you be able to link the elements in the group
> together somehow, which requires some per-element storage, but only
> one word per element instead of two.
>
> There are other variations on this theme, depending on what constraints
> apply here.

Thanks for your suggestions. Although there is batch update support for
the inner map, I think the inner map is updated one-by-one in most
cases. And the main concern here is the extra dereference due to the
memory allocation, so I think adding extra flags to tell
bpf_map_free_deferred() to free the map differently may be appropriate.

Regards,
Tao
> 							Thanx, Paul
Paul E. McKenney Nov. 14, 2023, 12:58 p.m. UTC | #12
On Mon, Nov 13, 2023 at 08:53:06AM +0800, Hou Tao wrote:
> Hi,
> 
> On 11/10/2023 12:58 PM, Paul E. McKenney wrote:
> > On Fri, Nov 10, 2023 at 11:34:03AM +0800, Hou Tao wrote:
> >> Hi Martin,
> >>
> >> On 11/10/2023 10:48 AM, Martin KaFai Lau wrote:
> >>> On 11/9/23 5:45 PM, Paul E. McKenney wrote:
> >>>>>>>>>> +void bpf_map_of_map_fd_put_ptr(void *ptr, bool need_defer)
> >>>>>>>>>> +{
> >>>>>>>>>> +    struct bpf_inner_map_element *element = ptr;
> >>>>>>>>>> +
> >>>>>>>>>> +    /* Do bpf_map_put() after a RCU grace period and a tasks
> >>>>>>>>>> trace
> >>>>>>>>>> +     * RCU grace period, so it is certain that the bpf program
> >>>>>>>>>> which is
> >>>>>>>>>> +     * manipulating the map now has exited when bpf_map_put() is
> >>>>>>>>>> called.
> >>>>>>>>>> +     */
> >>>>>>>>>> +    if (need_defer)
> >>>>>>>>> "need_defer" should only happen from the syscall cmd? Instead of
> >>>>>>>>> adding rcu_head to each element, how about
> >>>>>>>>> "synchronize_rcu_mult(call_rcu, call_rcu_tasks)" here?
> >>>>>>>> No. I have tried the method before, but it didn't work due to
> >>>>>>>> dead-lock
> >>>>>>>> (will mention that in commit message in v2). The reason is that bpf
> >>>>>>>> syscall program may also do map update through sys_bpf helper.
> >>>>>>>> Because
> >>>>>>>> bpf syscall program is running with sleep-able context and has
> >>>>>>>> rcu_read_lock_trace being held, so call
> >>>>>>>> synchronize_rcu_mult(call_rcu,
> >>>>>>>> call_rcu_tasks) will lead to dead-lock.
> >>> Need to think of a less intrusive solution instead of adding rcu_head
> >>> to each element and lookup also needs an extra de-referencing.
> >> I see.
> >>> May be the bpf_map_{update,delete}_elem(&outer_map, ....) should not
> >>> be done by the syscall program? Which selftest does it?
> >> Now bpf_map_update_elem is allowed for bpf_sys_bpf helper. If I
> >> remembered correctly it was map_ptr.
> >>> Can the inner_map learn that it has been deleted from an outer map
> >>> that is used in a sleepable prog->aux->used_maps? The
> >>> bpf_map_free_deferred() will then wait for a task_trace gp?
> >> Considering an inner_map may be used by multiple outer_map, the
> >> following solution will be simpler: if the inner map has been deleted
> >> from an outer map once, its free must be delayed after one RCU GP and
> >> one tasks trace RCU GP. But I will check whether it is possible to only
> >> wait for one RCU GP instead of two.
> > If you are freeing a large quantity of elements at a time, one approach
> > is to use a single rcu_head structure for the group.  (Or, in this case,
> > maybe a pair of rcu_head structures, one for call_rcu() and the other
> > for call_rcu_tasks_trace().)
> >
> > This requires that you be able to link the elements in the group
> > together somehow, which requires some per-element storage, but only
> > one word per element instead of two.
> >
> > There are other variations on this theme, depending on what constraints
> > apply here.
> 
> Thanks for your suggestions. Although there is batch update support for
> the inner map, I think the inner map is updated one-by-one in most
> cases. And the main concern here is the extra dereference due to the
> memory allocation, so I think adding extra flags to tell
> bpf_map_free_deferred() to free the map differently may be appropriate.

Whatever gets the job done is of course goodness, and yes, if the freeing
cannot be batched, my suggestion won't help much.  ;-)

							Thanx, Paul
diff mbox series

Patch

diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 8323ce201159d..96e32f4167c4e 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -4,6 +4,7 @@ 
 #include <linux/slab.h>
 #include <linux/bpf.h>
 #include <linux/btf.h>
+#include <linux/rcupdate.h>
 
 #include "map_in_map.h"
 
@@ -139,3 +140,53 @@  u32 bpf_map_fd_sys_lookup_elem(void *ptr)
 {
 	return ((struct bpf_map *)ptr)->id;
 }
+
+void *bpf_map_of_map_fd_get_ptr(struct bpf_map *map, struct file *map_file,
+			       int ufd)
+{
+	struct bpf_inner_map_element *element;
+	struct bpf_map *inner_map;
+
+	element = kmalloc(sizeof(*element), GFP_KERNEL);
+	if (!element)
+		return ERR_PTR(-ENOMEM);
+
+	inner_map = bpf_map_fd_get_ptr(map, map_file, ufd);
+	if (IS_ERR(inner_map)) {
+		kfree(element);
+		return inner_map;
+	}
+
+	element->map = inner_map;
+	return element;
+}
+
+static void bpf_inner_map_element_free_rcu(struct rcu_head *rcu)
+{
+	struct bpf_inner_map_element *elem = container_of(rcu, struct bpf_inner_map_element, rcu);
+
+	bpf_map_put(elem->map);
+	kfree(elem);
+}
+
+static void bpf_inner_map_element_free_tt_rcu(struct rcu_head *rcu)
+{
+	if (rcu_trace_implies_rcu_gp())
+		bpf_inner_map_element_free_rcu(rcu);
+	else
+		call_rcu(rcu, bpf_inner_map_element_free_rcu);
+}
+
+void bpf_map_of_map_fd_put_ptr(void *ptr, bool need_defer)
+{
+	struct bpf_inner_map_element *element = ptr;
+
+	/* Do bpf_map_put() after a RCU grace period and a tasks trace
+	 * RCU grace period, so it is certain that the bpf program which is
+	 * manipulating the map now has exited when bpf_map_put() is called.
+	 */
+	if (need_defer)
+		call_rcu_tasks_trace(&element->rcu, bpf_inner_map_element_free_tt_rcu);
+	else
+		bpf_inner_map_element_free_rcu(&element->rcu);
+}
diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h
index 63872bffd9b3c..8d38496e5179b 100644
--- a/kernel/bpf/map_in_map.h
+++ b/kernel/bpf/map_in_map.h
@@ -9,11 +9,18 @@ 
 struct file;
 struct bpf_map;
 
+struct bpf_inner_map_element {
+	struct bpf_map *map;
+	struct rcu_head rcu;
+};
+
 struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd);
 void bpf_map_meta_free(struct bpf_map *map_meta);
-void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file,
-			 int ufd);
+void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, int ufd);
 void bpf_map_fd_put_ptr(void *ptr, bool need_defer);
 u32 bpf_map_fd_sys_lookup_elem(void *ptr);
 
+void *bpf_map_of_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, int ufd);
+void bpf_map_of_map_fd_put_ptr(void *ptr, bool need_defer);
+
 #endif