Message ID | 20231107140702.1891778-6-houtao@huaweicloud.com (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | BPF |
Headers | show |
Series | bpf: Fix the release of inner map | expand |
On 11/7/23 6:06 AM, Hou Tao wrote: > From: Hou Tao <houtao1@huawei.com> > > bpf_map_of_map_fd_get_ptr() will convert the map fd to the pointer > saved in map-in-map. bpf_map_of_map_fd_put_ptr() will release the > pointer saved in map-in-map. These two helpers will be used by the > following patches to fix the use-after-free problems for map-in-map. > > Signed-off-by: Hou Tao <houtao1@huawei.com> > --- > kernel/bpf/map_in_map.c | 51 +++++++++++++++++++++++++++++++++++++++++ > kernel/bpf/map_in_map.h | 11 +++++++-- > 2 files changed, 60 insertions(+), 2 deletions(-) > > diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c > index 8323ce201159d..96e32f4167c4e 100644 > --- a/kernel/bpf/map_in_map.c > +++ b/kernel/bpf/map_in_map.c > @@ -4,6 +4,7 @@ > #include <linux/slab.h> > #include <linux/bpf.h> > #include <linux/btf.h> > +#include <linux/rcupdate.h> > > #include "map_in_map.h" > > @@ -139,3 +140,53 @@ u32 bpf_map_fd_sys_lookup_elem(void *ptr) > { > return ((struct bpf_map *)ptr)->id; > } > + > +void *bpf_map_of_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, > + int ufd) > +{ > + struct bpf_inner_map_element *element; > + struct bpf_map *inner_map; > + > + element = kmalloc(sizeof(*element), GFP_KERNEL); > + if (!element) > + return ERR_PTR(-ENOMEM); > + > + inner_map = bpf_map_fd_get_ptr(map, map_file, ufd); > + if (IS_ERR(inner_map)) { > + kfree(element); > + return inner_map; > + } > + > + element->map = inner_map; > + return element; > +} > + > +static void bpf_inner_map_element_free_rcu(struct rcu_head *rcu) > +{ > + struct bpf_inner_map_element *elem = container_of(rcu, struct bpf_inner_map_element, rcu); > + > + bpf_map_put(elem->map); > + kfree(elem); > +} > + > +static void bpf_inner_map_element_free_tt_rcu(struct rcu_head *rcu) > +{ > + if (rcu_trace_implies_rcu_gp()) > + bpf_inner_map_element_free_rcu(rcu); > + else > + call_rcu(rcu, bpf_inner_map_element_free_rcu); > +} > + > +void bpf_map_of_map_fd_put_ptr(void *ptr, bool 
need_defer) > +{ > + struct bpf_inner_map_element *element = ptr; > + > + /* Do bpf_map_put() after a RCU grace period and a tasks trace > + * RCU grace period, so it is certain that the bpf program which is > + * manipulating the map now has exited when bpf_map_put() is called. > + */ > + if (need_defer) "need_defer" should only happen from the syscall cmd? Instead of adding rcu_head to each element, how about "synchronize_rcu_mult(call_rcu, call_rcu_tasks)" here? > + call_rcu_tasks_trace(&element->rcu, bpf_inner_map_element_free_tt_rcu); > + else > + bpf_inner_map_element_free_rcu(&element->rcu); > +} > diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h > index 63872bffd9b3c..8d38496e5179b 100644 > --- a/kernel/bpf/map_in_map.h > +++ b/kernel/bpf/map_in_map.h > @@ -9,11 +9,18 @@ > struct file; > struct bpf_map; > > +struct bpf_inner_map_element { > + struct bpf_map *map; > + struct rcu_head rcu; > +}; > + > struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd); > void bpf_map_meta_free(struct bpf_map *map_meta); > -void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, > - int ufd); > +void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, int ufd); > void bpf_map_fd_put_ptr(void *ptr, bool need_defer); > u32 bpf_map_fd_sys_lookup_elem(void *ptr); > > +void *bpf_map_of_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, int ufd); > +void bpf_map_of_map_fd_put_ptr(void *ptr, bool need_defer); > + > #endif
Hi, On 11/9/2023 2:36 PM, Martin KaFai Lau wrote: > On 11/7/23 6:06 AM, Hou Tao wrote: >> From: Hou Tao <houtao1@huawei.com> >> >> bpf_map_of_map_fd_get_ptr() will convert the map fd to the pointer >> saved in map-in-map. bpf_map_of_map_fd_put_ptr() will release the >> pointer saved in map-in-map. These two helpers will be used by the >> following patches to fix the use-after-free problems for map-in-map. >> >> Signed-off-by: Hou Tao <houtao1@huawei.com> >> --- >> kernel/bpf/map_in_map.c | 51 +++++++++++++++++++++++++++++++++++++++++ >> kernel/bpf/map_in_map.h | 11 +++++++-- >> 2 files changed, 60 insertions(+), 2 deletions(-) >> >> SNIP >> +void bpf_map_of_map_fd_put_ptr(void *ptr, bool need_defer) >> +{ >> + struct bpf_inner_map_element *element = ptr; >> + >> + /* Do bpf_map_put() after a RCU grace period and a tasks trace >> + * RCU grace period, so it is certain that the bpf program which is >> + * manipulating the map now has exited when bpf_map_put() is >> called. >> + */ >> + if (need_defer) > > "need_defer" should only happen from the syscall cmd? Instead of > adding rcu_head to each element, how about > "synchronize_rcu_mult(call_rcu, call_rcu_tasks)" here? No. I have tried the method before, but it didn't work due to dead-lock (will mention that in commit message in v2). The reason is that bpf syscall program may also do map update through sys_bpf helper. Because bpf syscall program is running with sleep-able context and has rcu_read_lock_trace being held, so call synchronize_rcu_mult(call_rcu, call_rcu_tasks) will lead to dead-lock. 
> >> + call_rcu_tasks_trace(&element->rcu, >> bpf_inner_map_element_free_tt_rcu); >> + else >> + bpf_inner_map_element_free_rcu(&element->rcu); >> +} >> diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h >> index 63872bffd9b3c..8d38496e5179b 100644 >> --- a/kernel/bpf/map_in_map.h >> +++ b/kernel/bpf/map_in_map.h >> @@ -9,11 +9,18 @@ >> struct file; >> struct bpf_map; >> +struct bpf_inner_map_element { >> + struct bpf_map *map; >> + struct rcu_head rcu; >> +}; >> + >> struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd); >> void bpf_map_meta_free(struct bpf_map *map_meta); >> -void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, >> - int ufd); >> +void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, >> int ufd); >> void bpf_map_fd_put_ptr(void *ptr, bool need_defer); >> u32 bpf_map_fd_sys_lookup_elem(void *ptr); >> +void *bpf_map_of_map_fd_get_ptr(struct bpf_map *map, struct file >> *map_file, int ufd); >> +void bpf_map_of_map_fd_put_ptr(void *ptr, bool need_defer); >> + >> #endif
On Wed, Nov 8, 2023 at 11:26 PM Hou Tao <houtao@huaweicloud.com> wrote: > > Hi, > > On 11/9/2023 2:36 PM, Martin KaFai Lau wrote: > > On 11/7/23 6:06 AM, Hou Tao wrote: > >> From: Hou Tao <houtao1@huawei.com> > >> > >> bpf_map_of_map_fd_get_ptr() will convert the map fd to the pointer > >> saved in map-in-map. bpf_map_of_map_fd_put_ptr() will release the > >> pointer saved in map-in-map. These two helpers will be used by the > >> following patches to fix the use-after-free problems for map-in-map. > >> > >> Signed-off-by: Hou Tao <houtao1@huawei.com> > >> --- > >> kernel/bpf/map_in_map.c | 51 +++++++++++++++++++++++++++++++++++++++++ > >> kernel/bpf/map_in_map.h | 11 +++++++-- > >> 2 files changed, 60 insertions(+), 2 deletions(-) > >> > >> > SNIP > >> +void bpf_map_of_map_fd_put_ptr(void *ptr, bool need_defer) > >> +{ > >> + struct bpf_inner_map_element *element = ptr; > >> + > >> + /* Do bpf_map_put() after a RCU grace period and a tasks trace > >> + * RCU grace period, so it is certain that the bpf program which is > >> + * manipulating the map now has exited when bpf_map_put() is > >> called. > >> + */ > >> + if (need_defer) > > > > "need_defer" should only happen from the syscall cmd? Instead of > > adding rcu_head to each element, how about > > "synchronize_rcu_mult(call_rcu, call_rcu_tasks)" here? > > No. I have tried the method before, but it didn't work due to dead-lock > (will mention that in commit message in v2). The reason is that bpf > syscall program may also do map update through sys_bpf helper. Because > bpf syscall program is running with sleep-able context and has > rcu_read_lock_trace being held, so call synchronize_rcu_mult(call_rcu, > call_rcu_tasks) will lead to dead-lock. Dead-lock? why? I think it's legal to do call_rcu_tasks_trace() while inside RCU CS or RCU tasks trace CS.
On Thu, Nov 09, 2023 at 07:55:50AM -0800, Alexei Starovoitov wrote: > On Wed, Nov 8, 2023 at 11:26 PM Hou Tao <houtao@huaweicloud.com> wrote: > > > > Hi, > > > > On 11/9/2023 2:36 PM, Martin KaFai Lau wrote: > > > On 11/7/23 6:06 AM, Hou Tao wrote: > > >> From: Hou Tao <houtao1@huawei.com> > > >> > > >> bpf_map_of_map_fd_get_ptr() will convert the map fd to the pointer > > >> saved in map-in-map. bpf_map_of_map_fd_put_ptr() will release the > > >> pointer saved in map-in-map. These two helpers will be used by the > > >> following patches to fix the use-after-free problems for map-in-map. > > >> > > >> Signed-off-by: Hou Tao <houtao1@huawei.com> > > >> --- > > >> kernel/bpf/map_in_map.c | 51 +++++++++++++++++++++++++++++++++++++++++ > > >> kernel/bpf/map_in_map.h | 11 +++++++-- > > >> 2 files changed, 60 insertions(+), 2 deletions(-) > > >> > > >> > > SNIP > > >> +void bpf_map_of_map_fd_put_ptr(void *ptr, bool need_defer) > > >> +{ > > >> + struct bpf_inner_map_element *element = ptr; > > >> + > > >> + /* Do bpf_map_put() after a RCU grace period and a tasks trace > > >> + * RCU grace period, so it is certain that the bpf program which is > > >> + * manipulating the map now has exited when bpf_map_put() is > > >> called. > > >> + */ > > >> + if (need_defer) > > > > > > "need_defer" should only happen from the syscall cmd? Instead of > > > adding rcu_head to each element, how about > > > "synchronize_rcu_mult(call_rcu, call_rcu_tasks)" here? > > > > No. I have tried the method before, but it didn't work due to dead-lock > > (will mention that in commit message in v2). The reason is that bpf > > syscall program may also do map update through sys_bpf helper. Because > > bpf syscall program is running with sleep-able context and has > > rcu_read_lock_trace being held, so call synchronize_rcu_mult(call_rcu, > > call_rcu_tasks) will lead to dead-lock. > > Dead-lock? why? > > I think it's legal to do call_rcu_tasks_trace() while inside RCU CS > or RCU tasks trace CS. 
Just confirming that this is the case. If invoking call_rcu_tasks_trace() while under either rcu_read_lock() or rcu_read_lock_trace() deadlocks, then there is a bug that needs fixing. ;-) Thanx, Paul
Hi, On 11/10/2023 3:55 AM, Paul E. McKenney wrote: > On Thu, Nov 09, 2023 at 07:55:50AM -0800, Alexei Starovoitov wrote: >> On Wed, Nov 8, 2023 at 11:26 PM Hou Tao <houtao@huaweicloud.com> wrote: >>> Hi, >>> >>> On 11/9/2023 2:36 PM, Martin KaFai Lau wrote: >>>> On 11/7/23 6:06 AM, Hou Tao wrote: >>>>> From: Hou Tao <houtao1@huawei.com> >>>>> >>>>> bpf_map_of_map_fd_get_ptr() will convert the map fd to the pointer >>>>> saved in map-in-map. bpf_map_of_map_fd_put_ptr() will release the >>>>> pointer saved in map-in-map. These two helpers will be used by the >>>>> following patches to fix the use-after-free problems for map-in-map. >>>>> >>>>> Signed-off-by: Hou Tao <houtao1@huawei.com> >>>>> --- >>>>> kernel/bpf/map_in_map.c | 51 +++++++++++++++++++++++++++++++++++++++++ >>>>> kernel/bpf/map_in_map.h | 11 +++++++-- >>>>> 2 files changed, 60 insertions(+), 2 deletions(-) >>>>> >>>>> >>> SNIP >>>>> +void bpf_map_of_map_fd_put_ptr(void *ptr, bool need_defer) >>>>> +{ >>>>> + struct bpf_inner_map_element *element = ptr; >>>>> + >>>>> + /* Do bpf_map_put() after a RCU grace period and a tasks trace >>>>> + * RCU grace period, so it is certain that the bpf program which is >>>>> + * manipulating the map now has exited when bpf_map_put() is >>>>> called. >>>>> + */ >>>>> + if (need_defer) >>>> "need_defer" should only happen from the syscall cmd? Instead of >>>> adding rcu_head to each element, how about >>>> "synchronize_rcu_mult(call_rcu, call_rcu_tasks)" here? >>> No. I have tried the method before, but it didn't work due to dead-lock >>> (will mention that in commit message in v2). The reason is that bpf >>> syscall program may also do map update through sys_bpf helper. Because >>> bpf syscall program is running with sleep-able context and has >>> rcu_read_lock_trace being held, so call synchronize_rcu_mult(call_rcu, >>> call_rcu_tasks) will lead to dead-lock. >> Dead-lock? why? 
>> >> I think it's legal to do call_rcu_tasks_trace() while inside RCU CS >> or RCU tasks trace CS. > Just confirming that this is the case. If invoking call_rcu_tasks_trace() > within under either rcu_read_lock() or rcu_read_lock_trace() deadlocks, > then there is a bug that needs fixing. ;-) The dead-lock case is calling synchronize_rcu_mult(call_rcu, call_rcu_tasks_trace) while under rcu_read_lock_trace(), and I think that is expected. The case of calling call_rcu_tasks_trace() with rcu_read_lock_trace() being held is OK. > > Thanx, Paul > > .
On Fri, Nov 10, 2023 at 09:06:56AM +0800, Hou Tao wrote: > Hi, > > On 11/10/2023 3:55 AM, Paul E. McKenney wrote: > > On Thu, Nov 09, 2023 at 07:55:50AM -0800, Alexei Starovoitov wrote: > >> On Wed, Nov 8, 2023 at 11:26 PM Hou Tao <houtao@huaweicloud.com> wrote: > >>> Hi, > >>> > >>> On 11/9/2023 2:36 PM, Martin KaFai Lau wrote: > >>>> On 11/7/23 6:06 AM, Hou Tao wrote: > >>>>> From: Hou Tao <houtao1@huawei.com> > >>>>> > >>>>> bpf_map_of_map_fd_get_ptr() will convert the map fd to the pointer > >>>>> saved in map-in-map. bpf_map_of_map_fd_put_ptr() will release the > >>>>> pointer saved in map-in-map. These two helpers will be used by the > >>>>> following patches to fix the use-after-free problems for map-in-map. > >>>>> > >>>>> Signed-off-by: Hou Tao <houtao1@huawei.com> > >>>>> --- > >>>>> kernel/bpf/map_in_map.c | 51 +++++++++++++++++++++++++++++++++++++++++ > >>>>> kernel/bpf/map_in_map.h | 11 +++++++-- > >>>>> 2 files changed, 60 insertions(+), 2 deletions(-) > >>>>> > >>>>> > >>> SNIP > >>>>> +void bpf_map_of_map_fd_put_ptr(void *ptr, bool need_defer) > >>>>> +{ > >>>>> + struct bpf_inner_map_element *element = ptr; > >>>>> + > >>>>> + /* Do bpf_map_put() after a RCU grace period and a tasks trace > >>>>> + * RCU grace period, so it is certain that the bpf program which is > >>>>> + * manipulating the map now has exited when bpf_map_put() is > >>>>> called. > >>>>> + */ > >>>>> + if (need_defer) > >>>> "need_defer" should only happen from the syscall cmd? Instead of > >>>> adding rcu_head to each element, how about > >>>> "synchronize_rcu_mult(call_rcu, call_rcu_tasks)" here? > >>> No. I have tried the method before, but it didn't work due to dead-lock > >>> (will mention that in commit message in v2). The reason is that bpf > >>> syscall program may also do map update through sys_bpf helper. 
Because > >>> bpf syscall program is running with sleep-able context and has > >>> rcu_read_lock_trace being held, so call synchronize_rcu_mult(call_rcu, > >>> call_rcu_tasks) will lead to dead-lock. > >> Dead-lock? why? > >> > >> I think it's legal to do call_rcu_tasks_trace() while inside RCU CS > >> or RCU tasks trace CS. > > Just confirming that this is the case. If invoking call_rcu_tasks_trace() > > within under either rcu_read_lock() or rcu_read_lock_trace() deadlocks, > > then there is a bug that needs fixing. ;-) > > The case for dead-lock is that calling synchronize_rcu_mult(call_rcu, > call_rcu_tasks_trace) within under rcu_read_lock_trace() and I think it > is expected. The case that calling call_rcu_tasks_trace() with > rcu_read_lock_trace() being held is OK. Very good, you are quite right. In this particular case, deadlock is expected behavior. The problem here is that synchronize_rcu_mult() doesn't just invoke its arguments, instead, it also waits for all of the corresponding grace periods to complete. But if you call this while under the protection of rcu_read_lock_trace(), then synchronize_rcu_mult(call_rcu_tasks_trace) cannot return until the corresponding rcu_read_unlock_trace() is reached, but that rcu_read_unlock_trace() cannot be reached until after synchronize_rcu_mult(call_rcu_tasks_trace) returns. (I did leave out the call_rcu argument because it does not participate in this particular deadlock.) Thanx, Paul
Hi Paul, On 11/10/2023 9:45 AM, Paul E. McKenney wrote: > On Fri, Nov 10, 2023 at 09:06:56AM +0800, Hou Tao wrote: >> Hi, >> >> On 11/10/2023 3:55 AM, Paul E. McKenney wrote: >>> On Thu, Nov 09, 2023 at 07:55:50AM -0800, Alexei Starovoitov wrote: >>>> On Wed, Nov 8, 2023 at 11:26 PM Hou Tao <houtao@huaweicloud.com> wrote: >>>>> Hi, >>>>> >>>>> On 11/9/2023 2:36 PM, Martin KaFai Lau wrote: >>>>>> On 11/7/23 6:06 AM, Hou Tao wrote: >>>>>>> From: Hou Tao <houtao1@huawei.com> >>>>>>> >>>>>>> bpf_map_of_map_fd_get_ptr() will convert the map fd to the pointer >>>>>>> saved in map-in-map. bpf_map_of_map_fd_put_ptr() will release the >>>>>>> pointer saved in map-in-map. These two helpers will be used by the >>>>>>> following patches to fix the use-after-free problems for map-in-map. >>>>>>> >>>>>>> Signed-off-by: Hou Tao <houtao1@huawei.com> >>>>>>> --- >>>>>>> kernel/bpf/map_in_map.c | 51 +++++++++++++++++++++++++++++++++++++++++ >>>>>>> kernel/bpf/map_in_map.h | 11 +++++++-- >>>>>>> 2 files changed, 60 insertions(+), 2 deletions(-) >>>>>>> >>>>>>> >>>>> SNIP >>>>>>> +void bpf_map_of_map_fd_put_ptr(void *ptr, bool need_defer) >>>>>>> +{ >>>>>>> + struct bpf_inner_map_element *element = ptr; >>>>>>> + >>>>>>> + /* Do bpf_map_put() after a RCU grace period and a tasks trace >>>>>>> + * RCU grace period, so it is certain that the bpf program which is >>>>>>> + * manipulating the map now has exited when bpf_map_put() is >>>>>>> called. >>>>>>> + */ >>>>>>> + if (need_defer) >>>>>> "need_defer" should only happen from the syscall cmd? Instead of >>>>>> adding rcu_head to each element, how about >>>>>> "synchronize_rcu_mult(call_rcu, call_rcu_tasks)" here? >>>>> No. I have tried the method before, but it didn't work due to dead-lock >>>>> (will mention that in commit message in v2). The reason is that bpf >>>>> syscall program may also do map update through sys_bpf helper. 
Because >>>>> bpf syscall program is running with sleep-able context and has >>>>> rcu_read_lock_trace being held, so call synchronize_rcu_mult(call_rcu, >>>>> call_rcu_tasks) will lead to dead-lock. >>>> Dead-lock? why? >>>> >>>> I think it's legal to do call_rcu_tasks_trace() while inside RCU CS >>>> or RCU tasks trace CS. >>> Just confirming that this is the case. If invoking call_rcu_tasks_trace() >>> within under either rcu_read_lock() or rcu_read_lock_trace() deadlocks, >>> then there is a bug that needs fixing. ;-) >> The case for dead-lock is that calling synchronize_rcu_mult(call_rcu, >> call_rcu_tasks_trace) within under rcu_read_lock_trace() and I think it >> is expected. The case that calling call_rcu_tasks_trace() with >> rcu_read_lock_trace() being held is OK. > Very good, you are quite right. In this particular case, deadlock is > expected behavior. > > The problem here is that synchronize_rcu_mult() doesn't just invoke its > arguments, instead, it also waits for all of the corresponding grace > periods to complete. But if you call this while under the protection of > rcu_read_lock_trace(), then synchronize_rcu_mult(call_rcu_tasks_trace) > cannot return until the corresponding rcu_read_unlock_trace() is > reached, but that rcu_read_unlock_trace() cannot be reached until after > synchronize_rcu_mult(call_rcu_tasks_trace) returns. > > (I did leave out the call_rcu argument because it does not participate > in this particular deadlock.) Got it. Thanks for the detailed explanation. > > Thanx, Paul > > .
On 11/9/23 5:45 PM, Paul E. McKenney wrote: >>>>>>> +void bpf_map_of_map_fd_put_ptr(void *ptr, bool need_defer) >>>>>>> +{ >>>>>>> + struct bpf_inner_map_element *element = ptr; >>>>>>> + >>>>>>> + /* Do bpf_map_put() after a RCU grace period and a tasks trace >>>>>>> + * RCU grace period, so it is certain that the bpf program which is >>>>>>> + * manipulating the map now has exited when bpf_map_put() is >>>>>>> called. >>>>>>> + */ >>>>>>> + if (need_defer) >>>>>> "need_defer" should only happen from the syscall cmd? Instead of >>>>>> adding rcu_head to each element, how about >>>>>> "synchronize_rcu_mult(call_rcu, call_rcu_tasks)" here? >>>>> No. I have tried the method before, but it didn't work due to dead-lock >>>>> (will mention that in commit message in v2). The reason is that bpf >>>>> syscall program may also do map update through sys_bpf helper. Because >>>>> bpf syscall program is running with sleep-able context and has >>>>> rcu_read_lock_trace being held, so call synchronize_rcu_mult(call_rcu, >>>>> call_rcu_tasks) will lead to dead-lock. Need to think of a less intrusive solution instead of adding rcu_head to each element and lookup also needs an extra de-referencing. May be the bpf_map_{update,delete}_elem(&outer_map, ....) should not be done by the syscall program? Which selftest does it? Can the inner_map learn that it has been deleted from an outer map that is used in a sleepable prog->aux->used_maps? The bpf_map_free_deferred() will then wait for a task_trace gp?
Hi Martin, On 11/10/2023 10:48 AM, Martin KaFai Lau wrote: > On 11/9/23 5:45 PM, Paul E. McKenney wrote: >>>>>>>> +void bpf_map_of_map_fd_put_ptr(void *ptr, bool need_defer) >>>>>>>> +{ >>>>>>>> + struct bpf_inner_map_element *element = ptr; >>>>>>>> + >>>>>>>> + /* Do bpf_map_put() after a RCU grace period and a tasks >>>>>>>> trace >>>>>>>> + * RCU grace period, so it is certain that the bpf program >>>>>>>> which is >>>>>>>> + * manipulating the map now has exited when bpf_map_put() is >>>>>>>> called. >>>>>>>> + */ >>>>>>>> + if (need_defer) >>>>>>> "need_defer" should only happen from the syscall cmd? Instead of >>>>>>> adding rcu_head to each element, how about >>>>>>> "synchronize_rcu_mult(call_rcu, call_rcu_tasks)" here? >>>>>> No. I have tried the method before, but it didn't work due to >>>>>> dead-lock >>>>>> (will mention that in commit message in v2). The reason is that bpf >>>>>> syscall program may also do map update through sys_bpf helper. >>>>>> Because >>>>>> bpf syscall program is running with sleep-able context and has >>>>>> rcu_read_lock_trace being held, so call >>>>>> synchronize_rcu_mult(call_rcu, >>>>>> call_rcu_tasks) will lead to dead-lock. > > Need to think of a less intrusive solution instead of adding rcu_head > to each element and lookup also needs an extra de-referencing. I see. > > May be the bpf_map_{update,delete}_elem(&outer_map, ....) should not > be done by the syscall program? Which selftest does it? Now bpf_map_update_elem is allowed for bpf_sys_bpf helper. If I remembered correctly it was map_ptr. > > Can the inner_map learn that it has been deleted from an outer map > that is used in a sleepable prog->aux->used_maps? The > bpf_map_free_deferred() will then wait for a task_trace gp? Considering an inner_map may be used by multiple outer_map, the following solution will be simpler: if the inner map has been deleted from an outer map once, its free must be delayed after one RCU GP and one tasks trace RCU GP. 
But I will check whether it is possible to only wait for one RCU GP instead of two.
On Fri, Nov 10, 2023 at 11:34:03AM +0800, Hou Tao wrote: > Hi Martin, > > On 11/10/2023 10:48 AM, Martin KaFai Lau wrote: > > On 11/9/23 5:45 PM, Paul E. McKenney wrote: > >>>>>>>> +void bpf_map_of_map_fd_put_ptr(void *ptr, bool need_defer) > >>>>>>>> +{ > >>>>>>>> + struct bpf_inner_map_element *element = ptr; > >>>>>>>> + > >>>>>>>> + /* Do bpf_map_put() after a RCU grace period and a tasks > >>>>>>>> trace > >>>>>>>> + * RCU grace period, so it is certain that the bpf program > >>>>>>>> which is > >>>>>>>> + * manipulating the map now has exited when bpf_map_put() is > >>>>>>>> called. > >>>>>>>> + */ > >>>>>>>> + if (need_defer) > >>>>>>> "need_defer" should only happen from the syscall cmd? Instead of > >>>>>>> adding rcu_head to each element, how about > >>>>>>> "synchronize_rcu_mult(call_rcu, call_rcu_tasks)" here? > >>>>>> No. I have tried the method before, but it didn't work due to > >>>>>> dead-lock > >>>>>> (will mention that in commit message in v2). The reason is that bpf > >>>>>> syscall program may also do map update through sys_bpf helper. > >>>>>> Because > >>>>>> bpf syscall program is running with sleep-able context and has > >>>>>> rcu_read_lock_trace being held, so call > >>>>>> synchronize_rcu_mult(call_rcu, > >>>>>> call_rcu_tasks) will lead to dead-lock. > > > > Need to think of a less intrusive solution instead of adding rcu_head > > to each element and lookup also needs an extra de-referencing. > > I see. > > > > May be the bpf_map_{update,delete}_elem(&outer_map, ....) should not > > be done by the syscall program? Which selftest does it? > > Now bpf_map_update_elem is allowed for bpf_sys_bpf helper. If I > remembered correctly it was map_ptr. > > > > Can the inner_map learn that it has been deleted from an outer map > > that is used in a sleepable prog->aux->used_maps? The > > bpf_map_free_deferred() will then wait for a task_trace gp? 
> > Considering an inner_map may be used by multiple outer_map, the > following solution will be simpler: if the inner map has been deleted > from an outer map once, its free must be delayed after one RCU GP and > one tasks trace RCU GP. But I will check whether it is possible to only > wait for one RCU GP instead of two. If you are freeing a large quantity of elements at a time, one approach is to use a single rcu_head structure for the group. (Or, in this case, maybe a pair of rcu_head structures, one for call_rcu() and the other for call_rcu_tasks_trace().) This requires that you be able to link the elements in the group together somehow, which requires some per-element storage, but only one word per element instead of two. There are other variations on this theme, depending on what constraints apply here. Thanx, Paul
Hi, On 11/10/2023 12:58 PM, Paul E. McKenney wrote: > On Fri, Nov 10, 2023 at 11:34:03AM +0800, Hou Tao wrote: >> Hi Martin, >> >> On 11/10/2023 10:48 AM, Martin KaFai Lau wrote: >>> On 11/9/23 5:45 PM, Paul E. McKenney wrote: >>>>>>>>>> +void bpf_map_of_map_fd_put_ptr(void *ptr, bool need_defer) >>>>>>>>>> +{ >>>>>>>>>> + struct bpf_inner_map_element *element = ptr; >>>>>>>>>> + >>>>>>>>>> + /* Do bpf_map_put() after a RCU grace period and a tasks >>>>>>>>>> trace >>>>>>>>>> + * RCU grace period, so it is certain that the bpf program >>>>>>>>>> which is >>>>>>>>>> + * manipulating the map now has exited when bpf_map_put() is >>>>>>>>>> called. >>>>>>>>>> + */ >>>>>>>>>> + if (need_defer) >>>>>>>>> "need_defer" should only happen from the syscall cmd? Instead of >>>>>>>>> adding rcu_head to each element, how about >>>>>>>>> "synchronize_rcu_mult(call_rcu, call_rcu_tasks)" here? >>>>>>>> No. I have tried the method before, but it didn't work due to >>>>>>>> dead-lock >>>>>>>> (will mention that in commit message in v2). The reason is that bpf >>>>>>>> syscall program may also do map update through sys_bpf helper. >>>>>>>> Because >>>>>>>> bpf syscall program is running with sleep-able context and has >>>>>>>> rcu_read_lock_trace being held, so call >>>>>>>> synchronize_rcu_mult(call_rcu, >>>>>>>> call_rcu_tasks) will lead to dead-lock. >>> Need to think of a less intrusive solution instead of adding rcu_head >>> to each element and lookup also needs an extra de-referencing. >> I see. >>> May be the bpf_map_{update,delete}_elem(&outer_map, ....) should not >>> be done by the syscall program? Which selftest does it? >> Now bpf_map_update_elem is allowed for bpf_sys_bpf helper. If I >> remembered correctly it was map_ptr. >>> Can the inner_map learn that it has been deleted from an outer map >>> that is used in a sleepable prog->aux->used_maps? The >>> bpf_map_free_deferred() will then wait for a task_trace gp? 
>> Considering an inner_map may be used by multiple outer_map, the >> following solution will be simpler: if the inner map has been deleted >> from an outer map once, its free must be delayed after one RCU GP and >> one tasks trace RCU GP. But I will check whether it is possible to only >> wait for one RCU GP instead of two. > If you are freeing a large quantity of elements at a time, one approach > is to use a single rcu_head structure for the group. (Or, in this case, > maybe a pair of rcu_head structures, one for call_rcu() and the other > for call_rcu_tasks_trace().) > > This requires that you be able to link the elements in the group > together somehow, which requires some per-element storage, but only > one word per element instead of two. > > There are other variations on this theme, depending on what constraints > apply here. Thanks for your suggestions. Although there is batch update support for inner maps, I think the inner map is updated one-by-one in most cases. And the main concern here is the extra dereference due to the memory allocation, so I think adding extra flags to tell bpf_mem_free_deferred() to free the map differently may be appropriate. Regards, Tao > Thanx, Paul
On Mon, Nov 13, 2023 at 08:53:06AM +0800, Hou Tao wrote: > Hi, > > On 11/10/2023 12:58 PM, Paul E. McKenney wrote: > > On Fri, Nov 10, 2023 at 11:34:03AM +0800, Hou Tao wrote: > >> Hi Martin, > >> > >> On 11/10/2023 10:48 AM, Martin KaFai Lau wrote: > >>> On 11/9/23 5:45 PM, Paul E. McKenney wrote: > >>>>>>>>>> +void bpf_map_of_map_fd_put_ptr(void *ptr, bool need_defer) > >>>>>>>>>> +{ > >>>>>>>>>> + struct bpf_inner_map_element *element = ptr; > >>>>>>>>>> + > >>>>>>>>>> + /* Do bpf_map_put() after a RCU grace period and a tasks > >>>>>>>>>> trace > >>>>>>>>>> + * RCU grace period, so it is certain that the bpf program > >>>>>>>>>> which is > >>>>>>>>>> + * manipulating the map now has exited when bpf_map_put() is > >>>>>>>>>> called. > >>>>>>>>>> + */ > >>>>>>>>>> + if (need_defer) > >>>>>>>>> "need_defer" should only happen from the syscall cmd? Instead of > >>>>>>>>> adding rcu_head to each element, how about > >>>>>>>>> "synchronize_rcu_mult(call_rcu, call_rcu_tasks)" here? > >>>>>>>> No. I have tried the method before, but it didn't work due to > >>>>>>>> dead-lock > >>>>>>>> (will mention that in commit message in v2). The reason is that bpf > >>>>>>>> syscall program may also do map update through sys_bpf helper. > >>>>>>>> Because > >>>>>>>> bpf syscall program is running with sleep-able context and has > >>>>>>>> rcu_read_lock_trace being held, so call > >>>>>>>> synchronize_rcu_mult(call_rcu, > >>>>>>>> call_rcu_tasks) will lead to dead-lock. > >>> Need to think of a less intrusive solution instead of adding rcu_head > >>> to each element and lookup also needs an extra de-referencing. > >> I see. > >>> May be the bpf_map_{update,delete}_elem(&outer_map, ....) should not > >>> be done by the syscall program? Which selftest does it? > >> Now bpf_map_update_elem is allowed for bpf_sys_bpf helper. If I > >> remembered correctly it was map_ptr. 
> >>> Can the inner_map learn that it has been deleted from an outer map > >>> that is used in a sleepable prog->aux->used_maps? The > >>> bpf_map_free_deferred() will then wait for a task_trace gp? > >> Considering an inner_map may be used by multiple outer_map, the > >> following solution will be simpler: if the inner map has been deleted > >> from an outer map once, its free must be delayed after one RCU GP and > >> one tasks trace RCU GP. But I will check whether it is possible to only > >> wait for one RCU GP instead of two. > > If you are freeing a large quantity of elements at a time, one approach > > is to use a single rcu_head structure for the group. (Or, in this case, > > maybe a pair of rcu_head structures, one for call_rcu() and the other > > for call_rcu_tasks_trace().) > > > > This requires that you be able to link the elements in the group > > together somehow, which requires some per-element storage, but only > > one word per element instead of two. > > > > There are other variations on this theme, depending on what constraints > > apply here. > > Thanks for your suggestions. Although there are batch update support for > inner map, but I think inner map is updated one-by-one at most case. And > the main concern here is the extra dereference due to memory allocation, > so I think adding extra flags to indicate bpf_mem_free_deferred() to > free the map differently may be appropriate. Whatever gets the job done is of course goodness, and yes, if the freeing cannot be batched, my suggestion won't help much. ;-) Thanx, Paul
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 8323ce201159d..96e32f4167c4e 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -4,6 +4,7 @@ #include <linux/slab.h> #include <linux/bpf.h> #include <linux/btf.h> +#include <linux/rcupdate.h> #include "map_in_map.h" @@ -139,3 +140,53 @@ u32 bpf_map_fd_sys_lookup_elem(void *ptr) { return ((struct bpf_map *)ptr)->id; } + +void *bpf_map_of_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, + int ufd) +{ + struct bpf_inner_map_element *element; + struct bpf_map *inner_map; + + element = kmalloc(sizeof(*element), GFP_KERNEL); + if (!element) + return ERR_PTR(-ENOMEM); + + inner_map = bpf_map_fd_get_ptr(map, map_file, ufd); + if (IS_ERR(inner_map)) { + kfree(element); + return inner_map; + } + + element->map = inner_map; + return element; +} + +static void bpf_inner_map_element_free_rcu(struct rcu_head *rcu) +{ + struct bpf_inner_map_element *elem = container_of(rcu, struct bpf_inner_map_element, rcu); + + bpf_map_put(elem->map); + kfree(elem); +} + +static void bpf_inner_map_element_free_tt_rcu(struct rcu_head *rcu) +{ + if (rcu_trace_implies_rcu_gp()) + bpf_inner_map_element_free_rcu(rcu); + else + call_rcu(rcu, bpf_inner_map_element_free_rcu); +} + +void bpf_map_of_map_fd_put_ptr(void *ptr, bool need_defer) +{ + struct bpf_inner_map_element *element = ptr; + + /* Do bpf_map_put() after a RCU grace period and a tasks trace + * RCU grace period, so it is certain that the bpf program which is + * manipulating the map now has exited when bpf_map_put() is called. 
+ */ + if (need_defer) + call_rcu_tasks_trace(&element->rcu, bpf_inner_map_element_free_tt_rcu); + else + bpf_inner_map_element_free_rcu(&element->rcu); +} diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h index 63872bffd9b3c..8d38496e5179b 100644 --- a/kernel/bpf/map_in_map.h +++ b/kernel/bpf/map_in_map.h @@ -9,11 +9,18 @@ struct file; struct bpf_map; +struct bpf_inner_map_element { + struct bpf_map *map; + struct rcu_head rcu; +}; + struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd); void bpf_map_meta_free(struct bpf_map *map_meta); -void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, - int ufd); +void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, int ufd); void bpf_map_fd_put_ptr(void *ptr, bool need_defer); u32 bpf_map_fd_sys_lookup_elem(void *ptr); +void *bpf_map_of_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, int ufd); +void bpf_map_of_map_fd_put_ptr(void *ptr, bool need_defer); + #endif