
[RFC,bpf-next,05/11] bpf: Add bpf_spin_lock member to rbtree

Message ID 20220722183438.3319790-6-davemarchevsky@fb.com (mailing list archive)
State RFC
Delegated to: BPF
Series bpf: Introduce rbtree map

Checks

Context Check Description
bpf/vmtest-bpf-next-PR fail merge-conflict
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/apply fail Patch does not apply to bpf-next

Commit Message

Dave Marchevsky July 22, 2022, 6:34 p.m. UTC
This patch adds a struct bpf_spin_lock *lock member to bpf_rbtree, as
well as a bpf_rbtree_get_lock helper which allows bpf programs to access
the lock.

Ideally the bpf_spin_lock would be created independently outside of the
tree and associated with it before the tree is used, either as part of
the map definition or via some call like rbtree_init(&rbtree, &lock). Doing
this in an ergonomic way is proving harder than expected, so for now use
this workaround.

Why is creating the bpf_spin_lock independently and associating it with
the tree preferable? Because we want to be able to transfer nodes
between trees atomically, and for this to work we need the same lock
associated with both trees.

Further locking-related patches will make it possible for the lock to be
used in BPF programs and add code which enforces that the lock is held
when doing any operation on the tree.

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
---
 include/uapi/linux/bpf.h       |  7 +++++++
 kernel/bpf/helpers.c           |  3 +++
 kernel/bpf/rbtree.c            | 24 ++++++++++++++++++++++++
 tools/include/uapi/linux/bpf.h |  7 +++++++
 4 files changed, 41 insertions(+)
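
For context, a rough sketch of how a program could combine the new helper
with the lock/unlock and add helpers introduced elsewhere in this series.
Only bpf_rbtree_get_lock() is defined by this patch; the other helper names
and signatures below are assumptions and may differ from the actual patches:

	void *lock;

	lock = bpf_rbtree_get_lock(&rbtree_map);     /* lock allocated by the map itself */
	bpf_rbtree_lock(lock);                       /* helper from a later patch in the series */
	bpf_rbtree_add(&rbtree_map, node, node_cb);  /* node, node_cb are placeholders */
	bpf_rbtree_unlock(lock);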

Comments

Alexei Starovoitov Aug. 1, 2022, 10:17 p.m. UTC | #1
On 7/22/22 11:34 AM, Dave Marchevsky wrote:
> This patch adds a struct bpf_spin_lock *lock member to bpf_rbtree, as
> well as a bpf_rbtree_get_lock helper which allows bpf programs to access
> the lock.
> 
> Ideally the bpf_spin_lock would be created independently oustide of the
> tree and associated with it before the tree is used, either as part of
> map definition or via some call like rbtree_init(&rbtree, &lock). Doing
> this in an ergonomic way is proving harder than expected, so for now use
> this workaround.
> 
> Why is creating the bpf_spin_lock independently and associating it with
> the tree preferable? Because we want to be able to transfer nodes
> between trees atomically, and for this to work need same lock associated
> with 2 trees.

Right. We need one lock to protect multiple rbtrees.
Since the add/find/remove helpers will look into rbtree->lock,
the two different rbtrees (== maps) have to point to the same lock.
Other than rbtree_init(&rbtree, &lock), what would be an alternative?

> 
> Further locking-related patches will make it possible for the lock to be
> used in BPF programs and add code which enforces that the lock is held
> when doing any operation on the tree.
> 
> Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
> ---
>   include/uapi/linux/bpf.h       |  7 +++++++
>   kernel/bpf/helpers.c           |  3 +++
>   kernel/bpf/rbtree.c            | 24 ++++++++++++++++++++++++
>   tools/include/uapi/linux/bpf.h |  7 +++++++
>   4 files changed, 41 insertions(+)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 4688ce88caf4..c677d92de3bc 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -5385,6 +5385,12 @@ union bpf_attr {
>    *	Return
>    *		0
>    *
> + * void *bpf_rbtree_get_lock(struct bpf_map *map)
> + *	Description
> + *		Return the bpf_spin_lock associated with the rbtree
> + *
> + *	Return
> + *		Ptr to lock
>    */
>   #define __BPF_FUNC_MAPPER(FN)		\
>   	FN(unspec),			\
> @@ -5600,6 +5606,7 @@ union bpf_attr {
>   	FN(rbtree_find),		\
>   	FN(rbtree_remove),		\
>   	FN(rbtree_free_node),		\
> +	FN(rbtree_get_lock),		\
>   	/* */
>   
>   /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> index 35eb66d11bf6..257a808bb767 100644
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -1587,6 +1587,7 @@ const struct bpf_func_proto bpf_rbtree_add_proto __weak;
>   const struct bpf_func_proto bpf_rbtree_find_proto __weak;
>   const struct bpf_func_proto bpf_rbtree_remove_proto __weak;
>   const struct bpf_func_proto bpf_rbtree_free_node_proto __weak;
> +const struct bpf_func_proto bpf_rbtree_get_lock_proto __weak;
>   
>   const struct bpf_func_proto *
>   bpf_base_func_proto(enum bpf_func_id func_id)
> @@ -1686,6 +1687,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
>   		return &bpf_rbtree_remove_proto;
>   	case BPF_FUNC_rbtree_free_node:
>   		return &bpf_rbtree_free_node_proto;
> +	case BPF_FUNC_rbtree_get_lock:
> +		return &bpf_rbtree_get_lock_proto;
>   	default:
>   		break;
>   	}
> diff --git a/kernel/bpf/rbtree.c b/kernel/bpf/rbtree.c
> index 250d62210804..c6f0a2a083f6 100644
> --- a/kernel/bpf/rbtree.c
> +++ b/kernel/bpf/rbtree.c
> @@ -9,6 +9,7 @@
>   struct bpf_rbtree {
>   	struct bpf_map map;
>   	struct rb_root_cached root;
> +	struct bpf_spin_lock *lock;
>   };
>   
>   BTF_ID_LIST_SINGLE(bpf_rbtree_btf_ids, struct, rb_node);
> @@ -39,6 +40,14 @@ static struct bpf_map *rbtree_map_alloc(union bpf_attr *attr)
>   
>   	tree->root = RB_ROOT_CACHED;
>   	bpf_map_init_from_attr(&tree->map, attr);
> +
> +	tree->lock = bpf_map_kzalloc(&tree->map, sizeof(struct bpf_spin_lock),
> +				     GFP_KERNEL | __GFP_NOWARN);
> +	if (!tree->lock) {
> +		bpf_map_area_free(tree);
> +		return ERR_PTR(-ENOMEM);
> +	}
> +
>   	return &tree->map;
>   }
>   
> @@ -139,6 +148,7 @@ static void rbtree_map_free(struct bpf_map *map)
>   
>   	bpf_rbtree_postorder_for_each_entry_safe(pos, n, &tree->root.rb_root)
>   		kfree(pos);
> +	kfree(tree->lock);
>   	bpf_map_area_free(tree);
>   }
>   
> @@ -238,6 +248,20 @@ static int rbtree_map_get_next_key(struct bpf_map *map, void *key,
>   	return -ENOTSUPP;
>   }
>   
> +BPF_CALL_1(bpf_rbtree_get_lock, struct bpf_map *, map)
> +{
> +	struct bpf_rbtree *tree = container_of(map, struct bpf_rbtree, map);
> +
> +	return (u64)tree->lock;
> +}
> +
> +const struct bpf_func_proto bpf_rbtree_get_lock_proto = {
> +	.func = bpf_rbtree_get_lock,
> +	.gpl_only = true,
> +	.ret_type = RET_PTR_TO_MAP_VALUE,

This hack and

+const struct bpf_func_proto bpf_rbtree_unlock_proto = {
+	.func = bpf_rbtree_unlock,
+	.gpl_only = true,
+	.ret_type = RET_INTEGER,
+	.arg1_type = ARG_PTR_TO_SPIN_LOCK,

may be too much arm twisting to reuse bpf_spin_lock.

Maybe return a ptr_to_btf_id here and have bpf_rbtree_lock
match the type?
It could be a new 'struct bpf_lock' in the kernel's BTF.

Let's figure out how to associate locks with rbtrees.

Reiterating my proposal that was made earlier in the context
of Delyan's irq_work, but for a different type:
How about:
struct bpf_lock *l;
l = bpf_mem_alloc(allocator, bpf_core_type_id_kernel(*l));

That would allocate a ptr_to_btf_id object from the kernel's BTF.
The bpf_lock would have a constructor and destructor provided by the
kernel code.
The constructor would set bpf_lock's refcnt to 1,
then bpf_rbtree_init(&map, lock) would bump the refcnt,
and the dtor would eventually free it when all rbtrees are freed.
That would be similar to kptr's logic, with kptr_get and dtors
associated with the kernel's btf_ids.
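
For concreteness, the proposed flow could look roughly like this from a bpf
program (all helper names here are hypothetical; none of them exist yet):

	struct bpf_lock *l;

	l = bpf_mem_alloc(allocator, bpf_core_type_id_kernel(*l)); /* ctor sets refcnt to 1 */
	if (!l)
		return 0;
	bpf_rbtree_init(&rbtree_a, l);  /* each init takes a reference on the lock */
	bpf_rbtree_init(&rbtree_b, l);  /* the same lock now serializes both trees */
	/* the dtor frees the lock once every rbtree holding a reference is freed */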
Kumar Kartikeya Dwivedi Aug. 2, 2022, 1:59 p.m. UTC | #2
On Tue, 2 Aug 2022 at 00:23, Alexei Starovoitov <ast@fb.com> wrote:
>
> On 7/22/22 11:34 AM, Dave Marchevsky wrote:
> > This patch adds a struct bpf_spin_lock *lock member to bpf_rbtree, as
> > well as a bpf_rbtree_get_lock helper which allows bpf programs to access
> > the lock.
> >
> > Ideally the bpf_spin_lock would be created independently oustide of the
> > tree and associated with it before the tree is used, either as part of
> > map definition or via some call like rbtree_init(&rbtree, &lock). Doing
> > this in an ergonomic way is proving harder than expected, so for now use
> > this workaround.
> >
> > Why is creating the bpf_spin_lock independently and associating it with
> > the tree preferable? Because we want to be able to transfer nodes
> > between trees atomically, and for this to work need same lock associated
> > with 2 trees.
>
> Right. We need one lock to protect multiple rbtrees.
> Since add/find/remove helpers will look into rbtree->lock
> the two different rbtree (== map) have to point to the same lock.
> Other than rbtree_init(&rbtree, &lock) what would be an alternative ?
>

Instead of dynamically associating locks with the rbtree map, why not
have the lock embedded in the rbtree map, and construct a global locking
order among rbtree maps at prog load time, maintained by the verifier?

Prog A always takes rb_mapA.lock, then rb_mapB.lock. If we see Prog B
being loaded and it takes them in the opposite order, we reject the load
because it can lead to an ABBA deadlock. We also know the map pointer
statically, so we can ensure rb_mapA.lock is not taken recursively.

Every time a prog is loaded, it is validated against this single list of
lock orders among maps. Some of them may not have interdependencies
at all. There is a single total order, and any cycle leads to
verification failure. This might also work out for normal
bpf_spin_locks, allowing us to take more than one at a time.

Then you can do moves atomically across two maps by acquiring the
locks for both. Maybe we limit this to two locks only for now. There
could also be a C++-style std::lock{lock1, lock2} helper that takes
multiple locks to acquire in order, if you want to prevent anything
from happening between those two calls; just an idea.
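
To illustrate the ordering check (the lock/unlock helpers below are made up
for the example):

	/* Prog A establishes the global order rb_mapA.lock -> rb_mapB.lock */
	bpf_rbtree_lock(&rb_mapA);
	bpf_rbtree_lock(&rb_mapB);
	/* ... move a node from rb_mapA to rb_mapB ... */
	bpf_rbtree_unlock(&rb_mapB);
	bpf_rbtree_unlock(&rb_mapA);

	/* Prog B taking rb_mapB.lock and then rb_mapA.lock creates a cycle in
	 * that order, so the verifier rejects the load (ABBA deadlock risk).
	 */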

We probably need to make the rb_node add/find helpers notrace so that the bpf
program does not take the lock again recursively when invoked from the
helper.

Then you can embed locked state statically in map pointer reg, and you
don't need to dynamically check whether map is locked. It will be
passed into the helpers, and from there a member offset can be fixed
which indicates whether the map is 'locked'.

If you have already explored this, then please share what the
problems/limitations were that you ran into, or if this is impossible.
It does sound too good to be true, so maybe I missed something
important here.

I was prototyping something similar for the pifomap in the XDP
queueing series, where we were exploring exposing the underlying
locking of the map to the user (to be able to batch insertions to the
map). The major concern was leaking too many implementation details to
the UAPI and locking (pun intended) ourselves out of improvements to
the map implementation later, so we held off on that for now (and also
wanted to evaluate other alternatives before doubling down on it).
Alexei Starovoitov Aug. 2, 2022, 3:30 p.m. UTC | #3
On 8/2/22 6:59 AM, Kumar Kartikeya Dwivedi wrote:
> On Tue, 2 Aug 2022 at 00:23, Alexei Starovoitov <ast@fb.com> wrote:
>>
>> On 7/22/22 11:34 AM, Dave Marchevsky wrote:
>>> This patch adds a struct bpf_spin_lock *lock member to bpf_rbtree, as
>>> well as a bpf_rbtree_get_lock helper which allows bpf programs to access
>>> the lock.
>>>
>>> Ideally the bpf_spin_lock would be created independently oustide of the
>>> tree and associated with it before the tree is used, either as part of
>>> map definition or via some call like rbtree_init(&rbtree, &lock). Doing
>>> this in an ergonomic way is proving harder than expected, so for now use
>>> this workaround.
>>>
>>> Why is creating the bpf_spin_lock independently and associating it with
>>> the tree preferable? Because we want to be able to transfer nodes
>>> between trees atomically, and for this to work need same lock associated
>>> with 2 trees.
>>
>> Right. We need one lock to protect multiple rbtrees.
>> Since add/find/remove helpers will look into rbtree->lock
>> the two different rbtree (== map) have to point to the same lock.
>> Other than rbtree_init(&rbtree, &lock) what would be an alternative ?
>>
> 
> Instead of dynamically associating locks with the rbtree map, why not
> have lock embedded with the rbtree map, and construct a global locking
> order among rbtree maps during prog load time that the verifier
> maintains globally.
> 
> Prog A always takes rb_mapA.lock, then rb_mapB.lock,
> If we see Prog B being loaded and it takes them in opposite order, we
> reject the load because it can lead to ABBA deadlock. We also know the
> map pointer statically so we ensure rb_mapA.lock cannot be called
> recursively.
> 
> Everytime a prog is loaded, it validates against this single list of
> lock orders amongst maps. Some of them may not have interdependencies
> at all. There is a single total order, and any cycles lead to
> verification failure. This might also work out for normal
> bpf_spin_locks allowing us to take more than 1 at a time.
> 
> Then you can do moves atomically across two maps, by acquiring the
> locks for both. Maybe we limit this to two locks for now only. There
> could also be a C++ style std::lock{lock1, lock2} helper that takes
> multiple locks to acquire in order, if you want to prevent anything
> from happening between those two calls; just an idea.

Consider the case of N rbtrees, one for each cpu.
At run time the bpf prog might grab the lock of cpu X's rbtree and
then decide to transfer the node to a higher or lower cpu.
The order cannot be determined statically by the verifier.
The simple fix for this would be to share one lock across all rbtrees.
Not that such a 'global' lock will necessarily scale; it's just an example
of where sharing the lock is useful.
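
Sketch of why no static order exists here (pseudo-code, names invented):

	src = &percpu_rbtree[this_cpu];
	dst = &percpu_rbtree[pick_cpu(node)];  /* may be a higher or lower cpu */
	/* the src->lock vs dst->lock order flips depending on run-time data */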

> Probably need to make rb_node add/find helpers notrace so that the bpf
> program does not invoke lock again recursively when invoked from the
> helper.
> 
> Then you can embed locked state statically in map pointer reg, and you
> don't need to dynamically check whether map is locked. It will be
> passed into the helpers, and from there a member offset can be fixed
> which indicates whether the map is 'locked'.

We considered that. So far it doesn't look like we can avoid run-time checks.
And if we do run-time checks there is no need to complicate the verifier.

> If you have already explored this, then please share what the
> problems/limitations were that you ran into, or if this is impossible.
> It does sound too good to be true, so maybe I missed something
> important here.
> 
> I was prototyping something similar for the pifomap in the XDP
> queueing series, where we were exploring exposing the underlying
> locking of the map to the user (to be able to batch insertions to the
> map). The major concern was leaking too many implementation details to
> the UAPI and locking (pun intended) ourselves out of improvements to
> the map implementation later, so we held off on that for now (and also
> wanted to evaluate other alternatives before doubling down on it).

That's probably not a concern. The spin_lock concept is natural.
Its implementation could be different, but we already hide it in
bpf_spin_lock.

As far as locking...
beyond rbtree we will likely add linked lists. They would need to be
protected by locks as well.
The same bpf struct element might have two fields: bpf_rb_node and
bpf_link_list_node. It could be part of one rbtree while at the same
time linked into some other list.
Sharing one lock for rbtree operations and linked list manipulations
would be necessary.

In the past we hid locks inside maps. Exposing 'bpf_lock' as a
standalone object and associating the lock with rbtrees, lists, etc.
would allow for greater flexibility and allow bpf progs to create
more advanced data structures.
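
A sketch of the kind of element described above, with one standalone lock
serializing both containers (the type and field names are illustrative only;
none of this exists yet):

	struct bpf_lock *shared_lock;           /* standalone lock object */

	struct my_elem {
		struct bpf_rb_node rb;          /* linked into an rbtree */
		struct bpf_link_list_node ll;   /* also chained into a linked list */
		u64 key;
	};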
Kumar Kartikeya Dwivedi Aug. 10, 2022, 9:46 p.m. UTC | #4
On Tue, 2 Aug 2022 at 00:23, Alexei Starovoitov <ast@fb.com> wrote:
>
> On 7/22/22 11:34 AM, Dave Marchevsky wrote:
> > This patch adds a struct bpf_spin_lock *lock member to bpf_rbtree, as
> > well as a bpf_rbtree_get_lock helper which allows bpf programs to access
> > the lock.
> >
> > Ideally the bpf_spin_lock would be created independently oustide of the
> > tree and associated with it before the tree is used, either as part of
> > map definition or via some call like rbtree_init(&rbtree, &lock). Doing
> > this in an ergonomic way is proving harder than expected, so for now use
> > this workaround.
> >
> > Why is creating the bpf_spin_lock independently and associating it with
> > the tree preferable? Because we want to be able to transfer nodes
> > between trees atomically, and for this to work need same lock associated
> > with 2 trees.
>
> Right. We need one lock to protect multiple rbtrees.
> Since add/find/remove helpers will look into rbtree->lock
> the two different rbtree (== map) have to point to the same lock.
> Other than rbtree_init(&rbtree, &lock) what would be an alternative ?
>
> >
> > Further locking-related patches will make it possible for the lock to be
> > used in BPF programs and add code which enforces that the lock is held
> > when doing any operation on the tree.
> >
> > Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
> > ---
> >   include/uapi/linux/bpf.h       |  7 +++++++
> >   kernel/bpf/helpers.c           |  3 +++
> >   kernel/bpf/rbtree.c            | 24 ++++++++++++++++++++++++
> >   tools/include/uapi/linux/bpf.h |  7 +++++++
> >   4 files changed, 41 insertions(+)
> >
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index 4688ce88caf4..c677d92de3bc 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -5385,6 +5385,12 @@ union bpf_attr {
> >    *  Return
> >    *          0
> >    *
> > + * void *bpf_rbtree_get_lock(struct bpf_map *map)
> > + *   Description
> > + *           Return the bpf_spin_lock associated with the rbtree
> > + *
> > + *   Return
> > + *           Ptr to lock
> >    */
> >   #define __BPF_FUNC_MAPPER(FN)               \
> >       FN(unspec),                     \
> > @@ -5600,6 +5606,7 @@ union bpf_attr {
> >       FN(rbtree_find),                \
> >       FN(rbtree_remove),              \
> >       FN(rbtree_free_node),           \
> > +     FN(rbtree_get_lock),            \
> >       /* */
> >
> >   /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> > diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> > index 35eb66d11bf6..257a808bb767 100644
> > --- a/kernel/bpf/helpers.c
> > +++ b/kernel/bpf/helpers.c
> > @@ -1587,6 +1587,7 @@ const struct bpf_func_proto bpf_rbtree_add_proto __weak;
> >   const struct bpf_func_proto bpf_rbtree_find_proto __weak;
> >   const struct bpf_func_proto bpf_rbtree_remove_proto __weak;
> >   const struct bpf_func_proto bpf_rbtree_free_node_proto __weak;
> > +const struct bpf_func_proto bpf_rbtree_get_lock_proto __weak;
> >
> >   const struct bpf_func_proto *
> >   bpf_base_func_proto(enum bpf_func_id func_id)
> > @@ -1686,6 +1687,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
> >               return &bpf_rbtree_remove_proto;
> >       case BPF_FUNC_rbtree_free_node:
> >               return &bpf_rbtree_free_node_proto;
> > +     case BPF_FUNC_rbtree_get_lock:
> > +             return &bpf_rbtree_get_lock_proto;
> >       default:
> >               break;
> >       }
> > diff --git a/kernel/bpf/rbtree.c b/kernel/bpf/rbtree.c
> > index 250d62210804..c6f0a2a083f6 100644
> > --- a/kernel/bpf/rbtree.c
> > +++ b/kernel/bpf/rbtree.c
> > @@ -9,6 +9,7 @@
> >   struct bpf_rbtree {
> >       struct bpf_map map;
> >       struct rb_root_cached root;
> > +     struct bpf_spin_lock *lock;
> >   };
> >
> >   BTF_ID_LIST_SINGLE(bpf_rbtree_btf_ids, struct, rb_node);
> > @@ -39,6 +40,14 @@ static struct bpf_map *rbtree_map_alloc(union bpf_attr *attr)
> >
> >       tree->root = RB_ROOT_CACHED;
> >       bpf_map_init_from_attr(&tree->map, attr);
> > +
> > +     tree->lock = bpf_map_kzalloc(&tree->map, sizeof(struct bpf_spin_lock),
> > +                                  GFP_KERNEL | __GFP_NOWARN);
> > +     if (!tree->lock) {
> > +             bpf_map_area_free(tree);
> > +             return ERR_PTR(-ENOMEM);
> > +     }
> > +
> >       return &tree->map;
> >   }
> >
> > @@ -139,6 +148,7 @@ static void rbtree_map_free(struct bpf_map *map)
> >
> >       bpf_rbtree_postorder_for_each_entry_safe(pos, n, &tree->root.rb_root)
> >               kfree(pos);
> > +     kfree(tree->lock);
> >       bpf_map_area_free(tree);
> >   }
> >
> > @@ -238,6 +248,20 @@ static int rbtree_map_get_next_key(struct bpf_map *map, void *key,
> >       return -ENOTSUPP;
> >   }
> >
> > +BPF_CALL_1(bpf_rbtree_get_lock, struct bpf_map *, map)
> > +{
> > +     struct bpf_rbtree *tree = container_of(map, struct bpf_rbtree, map);
> > +
> > +     return (u64)tree->lock;
> > +}
> > +
> > +const struct bpf_func_proto bpf_rbtree_get_lock_proto = {
> > +     .func = bpf_rbtree_get_lock,
> > +     .gpl_only = true,
> > +     .ret_type = RET_PTR_TO_MAP_VALUE,
>
> This hack and
>
> +const struct bpf_func_proto bpf_rbtree_unlock_proto = {
> +       .func = bpf_rbtree_unlock,
> +       .gpl_only = true,
> +       .ret_type = RET_INTEGER,
> +       .arg1_type = ARG_PTR_TO_SPIN_LOCK,
>
> may be too much arm twisting to reuse bpf_spin_lock.
>
> May be return ptr_to_btf_id here and bpf_rbtree_lock
> should match the type?
> It could be new 'struct bpf_lock' in kernel's BTF.
>
> Let's figure out how to associate locks with rbtrees.
>
> Reiterating my proposal that was done earlier in the context
> of Delyan's irq_work, but for different type:
> How about:
> struct bpf_lock *l;
> l = bpf_mem_alloc(allocator, bpf_core_type_id_kernel(*l));
>
> that would allocate ptr_to_btf_id object from kernel's btf.
> The bpf_lock would have constructor and destructor provided by the
> kernel code.
> constructor will set bpf_lock's refcnt to 1.
> then bpf_rbtree_init(&map, lock) will bump the refcnt.
> and dtor will eventually free it when all rbtrees are freed.
> That would be similar to kptr's logic with kptr_get and dtor's
> associated with kernel's btf_id-s.

Just to continue brainstorming: Comments on this?

Instead of an rbtree map, you have a struct bpf_rbtree global variable
which works like an rbtree. To associate a lock with multiple
bpf_rbtrees, you do clang-style thread safety annotation in the bpf
program:

#define __guarded_by(lock) __attribute__((btf_type_tag("guarded_by:" #lock)))

struct bpf_spin_lock shared_lock;
struct bpf_rbtree rbtree1 __guarded_by(shared_lock);
struct bpf_rbtree rbtree2 __guarded_by(shared_lock);

The guarded_by tag is mandatory for the rbtree. The verifier ensures
the shared_lock spin lock is held whenever rbtree1 or rbtree2 is being
accessed, and whitelists the add/remove helpers inside the critical
section.

I don't think associating locks with rbtrees dynamically is a hard
requirement for your use case? But if you need that, you can probably
also allocate sets of rbtrees that are part of the same lock "class"
dynamically using bpf_kptr_alloc, and do similar annotation for the
struct being allocated:
struct rbtree_set {
  struct bpf_spin_lock lock;
  struct bpf_rbtree rbtree1 __guarded_by(lock);
  struct bpf_rbtree rbtree2 __guarded_by(lock);
};
struct rbtree_set *s = bpf_kptr_alloc(sizeof(*s), btf_local_type_id(*s));
// Just stash the pointer somewhere with kptr_xchg
On bpf_kptr_free, the verifier knows this is not a "trivial" struct,
so it inserts destructor calls for the bpf_rbtree fields during program
fixup.

The main insight is that the lock and rbtree are part of the same
allocation (map value for the global case, ptr_to_btf_id for the dynamic
case), so the locked state can be bound to the reg state in the verifier.
Then we can also make this new rbtree API use kfuncs instead of UAPI
helpers, to get some field experience before baking it in.

Any opinions? Any brainos or deficiencies in the scheme above?

Background: I have been thinking about how I can synchronize kptrs and
normal data together without the unneeded atomic xchg cost when a lock is
already protecting the kptr. In my tests this guarded_by
annotation has proven very useful (you mark the data and kptrs
protected by the lock as guarded_by some spin lock member in the same map
value, the verifier ensures the lock is held during access, kptr_xchg for a
guarded kptr is lowered to a non-atomic load/store assuming no
concurrent access, and kptr_xchg from outside the lock section is
rejected).
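
Rough sketch of that lowering (conceptual only, not actual verifier output):

	bpf_spin_lock(&v->lock);
	old = bpf_kptr_xchg(&v->guarded_ptr, new);  /* inside the guarded section this
						     * can be emitted as a plain load +
						     * store instead of an atomic xchg */
	bpf_spin_unlock(&v->lock);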
Alexei Starovoitov Aug. 10, 2022, 10:06 p.m. UTC | #5
On Wed, Aug 10, 2022 at 2:47 PM Kumar Kartikeya Dwivedi
<memxor@gmail.com> wrote:
>
> Just to continue brainstorming: Comments on this?
>
> Instead of a rbtree map, you have a struct bpf_rbtree global variable
> which works like a rbtree. To associate a lock with multiple
> bpf_rbtree, you do clang style thread safety annotation in the bpf
> program:
>
> #define __guarded_by(lock) __attribute__((btf_type_tag("guarded_by:" #lock))
>
> struct bpf_spin_lock shared_lock;
> struct bpf_rbtree rbtree1 __guarded_by(shared_lock);
> struct bpf_rbtree rbtree2 __guarded_by(shared_lock);
>
> guarded_by tag is mandatory for the rbtree. Verifier ensures
> shared_lock spin lock is held whenever rbtree1 or rbtree2 is being
> accessed, and whitelists add/remove helpers inside the critical
> section.

We've been discussing exactly this kind of btf_type_tag annotation
to associate an rbtree with a lock :)
Thanks for bringing it up as well.
Great that we're aligned.

> I don't think associating locks to rbtree dynamically is a hard
> requirement for your use case? But if you need that, you may probably
> also allocate sets of rbtree that are part of the same lock "class"
> dynamically using bpf_kptr_alloc, and do similar annotation for the
> struct being allocated.
> struct rbtree_set {
>   struct bpf_spin_lock lock;
>   struct bpf_rbtree rbtree1 __guarded_by(lock);
>   struct bpf_rbtree rbtree2 __guarded_by(lock);
> };
> struct rbtree_set *s = bpf_kptr_alloc(sizeof(*s), btf_local_type_id(*s));
> // Just stash the pointer somewhere with kptr_xchg
> On bpf_kptr_free, the verifier knows this is not a "trivial" struct,
> so inserts destructor calls for bpf_rbtree fields during program
> fixup.
>
> The main insight is that lock and rbtree are part of the same
> allocation (map value for global case, ptr_to_btf_id for dynamic case)
> so the locked state can be bound to the reg state in the verifier.

It works nicely in the static case, but fast-forward a bit.
We might need an rbtree of rbtrees,
pretty much like the map-in-map we have today for hash tables.

And the rbtree_node might be part of an rbtree while
being chained into a separate linked list.

We need a lock to protect operations on both the rbtree and the linked list.
Then we might need to create an rbtree dynamically for each cgroup
and store a pointer to its rb_root in cgroup local storage.
Maybe allocating a lock + rb_root + linked-list head as one
allocation will not be too restrictive.
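
I.e., roughly something like this per cgroup (the type names are
illustrative only):

	struct cgroup_queues {
		struct bpf_lock lock;
		struct bpf_rb_root root;        /* rbtree of elements */
		struct bpf_list_head pending;   /* linked list protected by the same lock */
	};
	/* one allocation per cgroup, with the pointer stashed in cgroup local storage */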

> Then we can also make this new rbtree API use kfuncs instead of UAPI
> helpers, to get some field experience before baking it in.

+1 to that.

> Any opinions? Any brainos or deficiencies in the scheme above?

It would be great if we could do the lock checks statically.
Dynamic locks mean that rbtree_add/erase, and in the future
linked list insert/remove, might fail, which is horrible from a
programmer's perspective.
We've been thinking about the "abort" concept for such cases.
When helpers detect an unsafe condition, like the correct lock not being
taken, they can abort execution of themselves and the bpf program,
and prevent the program from executing in the future.
Conceptually it sounds great and would solve all kinds of ugliness,
but it's not clear yet how to implement such an abort mechanism,
which would mean stack unwinding and calling destructors for kptrs,
refcount decrements for acquired objects like sockets, etc.
Kumar Kartikeya Dwivedi Aug. 10, 2022, 11:16 p.m. UTC | #6
On Thu, 11 Aug 2022 at 00:06, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Wed, Aug 10, 2022 at 2:47 PM Kumar Kartikeya Dwivedi
> <memxor@gmail.com> wrote:
> >
> > Just to continue brainstorming: Comments on this?
> >
> > Instead of a rbtree map, you have a struct bpf_rbtree global variable
> > which works like a rbtree. To associate a lock with multiple
> > bpf_rbtree, you do clang style thread safety annotation in the bpf
> > program:
> >
> > #define __guarded_by(lock) __attribute__((btf_type_tag("guarded_by:" #lock))
> >
> > struct bpf_spin_lock shared_lock;
> > struct bpf_rbtree rbtree1 __guarded_by(shared_lock);
> > struct bpf_rbtree rbtree2 __guarded_by(shared_lock);
> >
> > guarded_by tag is mandatory for the rbtree. Verifier ensures
> > shared_lock spin lock is held whenever rbtree1 or rbtree2 is being
> > accessed, and whitelists add/remove helpers inside the critical
> > section.
>
> We've been discussing exactly btf_type_tag annotation
> to associate rbtree with a lock :)
> Thanks for bringing it up as well.
> Great that we're aligned.
>

You know how the saying goes, "Great minds think alike" ;-)

> > I don't think associating locks to rbtree dynamically is a hard
> > requirement for your use case? But if you need that, you may probably
> > also allocate sets of rbtree that are part of the same lock "class"
> > dynamically using bpf_kptr_alloc, and do similar annotation for the
> > struct being allocated.
> > struct rbtree_set {
> >   struct bpf_spin_lock lock;
> >   struct bpf_rbtree rbtree1 __guarded_by(lock);
> >   struct bpf_rbtree rbtree2 __guarded_by(lock);
> > };
> > struct rbtree_set *s = bpf_kptr_alloc(sizeof(*s), btf_local_type_id(*s));
> > // Just stash the pointer somewhere with kptr_xchg
> > On bpf_kptr_free, the verifier knows this is not a "trivial" struct,
> > so inserts destructor calls for bpf_rbtree fields during program
> > fixup.
> >
> > The main insight is that lock and rbtree are part of the same
> > allocation (map value for global case, ptr_to_btf_id for dynamic case)
> > so the locked state can be bound to the reg state in the verifier.
>
> It works nicely in the static case, but ffwd a bit.
> We might need an rbtree of rbtrees.
> Pretty much like map-in-map that we have today for hash tables.
>

True, the map-in-map case makes things harder. But just like
inner_map_fd, you will need to parameterize such rb_root/list_head
containers with a value type (probably a type tag again) so the result
of find/remove can be known statically.
From there, we know the type of lock associated with the rbtree in the
value, _if_ we follow the convention of binding rb_root + lock + ...
in a single data item. Without that, it's pretty hard to do without
runtime checks, as you've pointed out in a previous reply.

My idea was that we should try to associate data being locked with its
lock in the same allocation, working a bit like a bpf_spin_lock<T>,
but more flexible as you can have additional unprotected data in the
struct by annotating using guarded_by style tags.

There might be some use cases where we really require dynamically
binding locks to some data structure at runtime; we should
probably add that later when a use case comes up, and keep that set
of APIs separate from the simpler case. As you mentioned, in the
dynamic case the add helper becomes fallible. The same will happen
with the linked list helpers.

> And the rbtree_node might be a part of an rbtree while
> being chained into a separate link list.
>
> We need a lock to protect operations on both rbtree and link list.
> Then we might need to create an rbtree dynamically for each cgroup.
> And store a pointer to rb_root in cgroup local storage.
> Maybe allocating a lock + rb_root + linklist_head as one
> allocation will not be too restrictive.
>
> > Then we can also make this new rbtree API use kfuncs instead of UAPI
> > helpers, to get some field experience before baking it in.
>
> +1 to that.
>
> > Any opinions? Any brainos or deficiencies in the scheme above?
>
> It would be great if we can do the lock checks statically.
> Dynamic locks means that rbtree_add/erase and in the future
> link list insert/remove might fail which is horrible from
> programmer perspective.

+1. I'll also be happy to help with code review (for as much as I
understand) when Dave reposts the series.

> We've been thinking about the "abort" concept for such cases.
> When helpers detect an unsafe condition like correct lock is not
> taken, they can abort execution of itself, the bpf program
> and prevent the program from executing in the future.
> Conceptually it sounds great and will solve all kinds of ugliness,
> but it's not clear yet how to implement such abort mechanism
> which would mean stack unwind and calling of destructors for kptr-s,
> refcnt decrements for acquired objects like sockets, etc.

Fancy. Though it certainly looks very painful to implement (just
thinking about kptrs: say the release kfunc is NMI-unsafe and we detect
such a live kptr moved out of a map in a perf prog; we would probably
then need to do the release via irq_work_queue, but also allocate a work
item for it, which might fail in NMI context during unwind, so we'd leak
memory).
Yonghong Song Aug. 15, 2022, 5:33 a.m. UTC | #7
On 8/10/22 2:46 PM, Kumar Kartikeya Dwivedi wrote:
> On Tue, 2 Aug 2022 at 00:23, Alexei Starovoitov <ast@fb.com> wrote:
>>
>> On 7/22/22 11:34 AM, Dave Marchevsky wrote:
>>> This patch adds a struct bpf_spin_lock *lock member to bpf_rbtree, as
>>> well as a bpf_rbtree_get_lock helper which allows bpf programs to access
>>> the lock.
>>>
>>> Ideally the bpf_spin_lock would be created independently oustide of the
>>> tree and associated with it before the tree is used, either as part of
>>> map definition or via some call like rbtree_init(&rbtree, &lock). Doing
>>> this in an ergonomic way is proving harder than expected, so for now use
>>> this workaround.
>>>
>>> Why is creating the bpf_spin_lock independently and associating it with
>>> the tree preferable? Because we want to be able to transfer nodes
>>> between trees atomically, and for this to work need same lock associated
>>> with 2 trees.
>>
>> Right. We need one lock to protect multiple rbtrees.
>> Since add/find/remove helpers will look into rbtree->lock
>> the two different rbtree (== map) have to point to the same lock.
>> Other than rbtree_init(&rbtree, &lock) what would be an alternative ?
>>
>>>
>>> Further locking-related patches will make it possible for the lock to be
>>> used in BPF programs and add code which enforces that the lock is held
>>> when doing any operation on the tree.
>>>
>>> Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
>>> ---
>>>    include/uapi/linux/bpf.h       |  7 +++++++
>>>    kernel/bpf/helpers.c           |  3 +++
>>>    kernel/bpf/rbtree.c            | 24 ++++++++++++++++++++++++
>>>    tools/include/uapi/linux/bpf.h |  7 +++++++
>>>    4 files changed, 41 insertions(+)
>>>
>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>>> index 4688ce88caf4..c677d92de3bc 100644
>>> --- a/include/uapi/linux/bpf.h
>>> +++ b/include/uapi/linux/bpf.h
>>> @@ -5385,6 +5385,12 @@ union bpf_attr {
>>>     *  Return
>>>     *          0
>>>     *
>>> + * void *bpf_rbtree_get_lock(struct bpf_map *map)
>>> + *   Description
>>> + *           Return the bpf_spin_lock associated with the rbtree
>>> + *
>>> + *   Return
>>> + *           Ptr to lock
>>>     */
>>>    #define __BPF_FUNC_MAPPER(FN)               \
>>>        FN(unspec),                     \
>>> @@ -5600,6 +5606,7 @@ union bpf_attr {
>>>        FN(rbtree_find),                \
>>>        FN(rbtree_remove),              \
>>>        FN(rbtree_free_node),           \
>>> +     FN(rbtree_get_lock),            \
>>>        /* */
>>>
>>>    /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>>> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
>>> index 35eb66d11bf6..257a808bb767 100644
>>> --- a/kernel/bpf/helpers.c
>>> +++ b/kernel/bpf/helpers.c
>>> @@ -1587,6 +1587,7 @@ const struct bpf_func_proto bpf_rbtree_add_proto __weak;
>>>    const struct bpf_func_proto bpf_rbtree_find_proto __weak;
>>>    const struct bpf_func_proto bpf_rbtree_remove_proto __weak;
>>>    const struct bpf_func_proto bpf_rbtree_free_node_proto __weak;
>>> +const struct bpf_func_proto bpf_rbtree_get_lock_proto __weak;
>>>
>>>    const struct bpf_func_proto *
>>>    bpf_base_func_proto(enum bpf_func_id func_id)
>>> @@ -1686,6 +1687,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
>>>                return &bpf_rbtree_remove_proto;
>>>        case BPF_FUNC_rbtree_free_node:
>>>                return &bpf_rbtree_free_node_proto;
>>> +     case BPF_FUNC_rbtree_get_lock:
>>> +             return &bpf_rbtree_get_lock_proto;
>>>        default:
>>>                break;
>>>        }
>>> diff --git a/kernel/bpf/rbtree.c b/kernel/bpf/rbtree.c
>>> index 250d62210804..c6f0a2a083f6 100644
>>> --- a/kernel/bpf/rbtree.c
>>> +++ b/kernel/bpf/rbtree.c
>>> @@ -9,6 +9,7 @@
>>>    struct bpf_rbtree {
>>>        struct bpf_map map;
>>>        struct rb_root_cached root;
>>> +     struct bpf_spin_lock *lock;
>>>    };
>>>
>>>    BTF_ID_LIST_SINGLE(bpf_rbtree_btf_ids, struct, rb_node);
>>> @@ -39,6 +40,14 @@ static struct bpf_map *rbtree_map_alloc(union bpf_attr *attr)
>>>
>>>        tree->root = RB_ROOT_CACHED;
>>>        bpf_map_init_from_attr(&tree->map, attr);
>>> +
>>> +     tree->lock = bpf_map_kzalloc(&tree->map, sizeof(struct bpf_spin_lock),
>>> +                                  GFP_KERNEL | __GFP_NOWARN);
>>> +     if (!tree->lock) {
>>> +             bpf_map_area_free(tree);
>>> +             return ERR_PTR(-ENOMEM);
>>> +     }
>>> +
>>>        return &tree->map;
>>>    }
>>>
>>> @@ -139,6 +148,7 @@ static void rbtree_map_free(struct bpf_map *map)
>>>
>>>        bpf_rbtree_postorder_for_each_entry_safe(pos, n, &tree->root.rb_root)
>>>                kfree(pos);
>>> +     kfree(tree->lock);
>>>        bpf_map_area_free(tree);
>>>    }
>>>
>>> @@ -238,6 +248,20 @@ static int rbtree_map_get_next_key(struct bpf_map *map, void *key,
>>>        return -ENOTSUPP;
>>>    }
>>>
>>> +BPF_CALL_1(bpf_rbtree_get_lock, struct bpf_map *, map)
>>> +{
>>> +     struct bpf_rbtree *tree = container_of(map, struct bpf_rbtree, map);
>>> +
>>> +     return (u64)tree->lock;
>>> +}
>>> +
>>> +const struct bpf_func_proto bpf_rbtree_get_lock_proto = {
>>> +     .func = bpf_rbtree_get_lock,
>>> +     .gpl_only = true,
>>> +     .ret_type = RET_PTR_TO_MAP_VALUE,
>>
>> This hack and
>>
>> +const struct bpf_func_proto bpf_rbtree_unlock_proto = {
>> +       .func = bpf_rbtree_unlock,
>> +       .gpl_only = true,
>> +       .ret_type = RET_INTEGER,
>> +       .arg1_type = ARG_PTR_TO_SPIN_LOCK,
>>
>> may be too much arm twisting to reuse bpf_spin_lock.
>>
>> May be return ptr_to_btf_id here and bpf_rbtree_lock
>> should match the type?
>> It could be new 'struct bpf_lock' in kernel's BTF.
>>
>> Let's figure out how to associate locks with rbtrees.
>>
>> Reiterating my proposal that was done earlier in the context
>> of Delyan's irq_work, but for different type:
>> How about:
>> struct bpf_lock *l;
>> l = bpf_mem_alloc(allocator, bpf_core_type_id_kernel(*l));
>>
>> that would allocate ptr_to_btf_id object from kernel's btf.
>> The bpf_lock would have constructor and destructor provided by the
>> kernel code.
>> constructor will set bpf_lock's refcnt to 1.
>> then bpf_rbtree_init(&map, lock) will bump the refcnt.
>> and dtor will eventually free it when all rbtrees are freed.
>> That would be similar to kptr's logic with kptr_get and dtor's
>> associated with kernel's btf_id-s.
> 
> Just to continue brainstorming: Comments on this?
> 
> Instead of a rbtree map, you have a struct bpf_rbtree global variable
> which works like a rbtree. To associate a lock with multiple
> bpf_rbtree, you do clang style thread safety annotation in the bpf
> program:
> 
> #define __guarded_by(lock) __attribute__((btf_type_tag("guarded_by:" #lock))
> 
> struct bpf_spin_lock shared_lock;
> struct bpf_rbtree rbtree1 __guarded_by(shared_lock);
> struct bpf_rbtree rbtree2 __guarded_by(shared_lock);

For the above __guarded_by macro, we should use
btf_decl_tag instead of btf_type_tag:

#define __guarded_by(lock) __attribute__((btf_decl_tag("guarded_by:" #lock)))

Currently, in the llvm implementation, btf_type_tag only applies
to pointee types. btf_decl_tag can apply to global variables,
function arguments, function return values and struct/union members.
So btf_decl_tag should work for the above global variable case and for
the struct rbtree_set member case below.
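
A minimal contrast of the two attributes (illustrative):

	/* btf_type_tag annotates the type a pointer points to */
	int __attribute__((btf_type_tag("user"))) *uptr;

	/* btf_decl_tag annotates the declaration itself: a global variable,
	 * a function argument, a return value, or a struct/union member
	 */
	struct bpf_rbtree rbtree1
		__attribute__((btf_decl_tag("guarded_by:shared_lock")));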

> 
> guarded_by tag is mandatory for the rbtree. Verifier ensures
> shared_lock spin lock is held whenever rbtree1 or rbtree2 is being
> accessed, and whitelists add/remove helpers inside the critical
> section.
> 
> I don't think associating locks to rbtree dynamically is a hard
> requirement for your use case? But if you need that, you may probably
> also allocate sets of rbtree that are part of the same lock "class"
> dynamically using bpf_kptr_alloc, and do similar annotation for the
> struct being allocated.
> struct rbtree_set {
>    struct bpf_spin_lock lock;
>    struct bpf_rbtree rbtree1 __guarded_by(lock);
>    struct bpf_rbtree rbtree2 __guarded_by(lock);
> };
> struct rbtree_set *s = bpf_kptr_alloc(sizeof(*s), btf_local_type_id(*s));
> // Just stash the pointer somewhere with kptr_xchg
> On bpf_kptr_free, the verifier knows this is not a "trivial" struct,
> so inserts destructor calls for bpf_rbtree fields during program
> fixup.
> 
> The main insight is that lock and rbtree are part of the same
> allocation (map value for global case, ptr_to_btf_id for dynamic case)
> so the locked state can be bound to the reg state in the verifier.
> Then we can also make this new rbtree API use kfuncs instead of UAPI
> helpers, to get some field experience before baking it in.
> 
> Any opinions? Any brainos or deficiencies in the scheme above?
> 
> Background: I have been thinking about how I can bind kptr and normal
> data synchronization without having unneeded atomic xchg cost when
> lock is already protecting kptr. In my tests this guarded_by
> annotation has been proving very useful (you mark data and kptr
> protected by lock as guarded_by some spin lock member in same map
> value, verifier ensures lock is held during access, and kptr_xchg for
> guarded kptr is lowered to non-atomic load/store assuming no
> concurrent access, and kptr_xchg from outside the lock section is
> rejected).
Kumar Kartikeya Dwivedi Aug. 15, 2022, 5:37 a.m. UTC | #8
On Mon, 15 Aug 2022 at 07:34, Yonghong Song <yhs@fb.com> wrote:
>
> On 8/10/22 2:46 PM, Kumar Kartikeya Dwivedi wrote:
> > On Tue, 2 Aug 2022 at 00:23, Alexei Starovoitov <ast@fb.com> wrote:
> [...]
> >
> > Just to continue brainstorming: Comments on this?
> >
> > Instead of a rbtree map, you have a struct bpf_rbtree global variable
> > which works like a rbtree. To associate a lock with multiple
> > bpf_rbtree, you do clang style thread safety annotation in the bpf
> > program:
> >
> > #define __guarded_by(lock) __attribute__((btf_type_tag("guarded_by:" #lock))
> >
> > struct bpf_spin_lock shared_lock;
> > struct bpf_rbtree rbtree1 __guarded_by(shared_lock);
> > struct bpf_rbtree rbtree2 __guarded_by(shared_lock);
>
> For the above __guarded_by macro, we should use
> btf_decl_tag instead of btf_type_tag
>
> #define __guarded_by(lock) __attribute__((btf_decl_tag("guarded_by:" #lock))
>
> Currently, in llvm implementation, btf_type_tag only applies
> to pointee type's. btf_decl_tag can apply to global variable,
> function argument, function return value and struct/union members.
> So btf_decl_tag shoul work for the above global variable case or
> below struct rbtree_set member case.
>

Yep, I actually wrote a prototype list implementation (it's very close,
so at this point I can probably post it very soon as an RFC) which
uses declaration tags like this. For now only one bpf_spin_lock is
supported in a map value or globally, so I didn't add guarded_by, but if
you look at [0] you can see it used to tag e.g. the value_type of a specific
bpf_list_head, which node in that value type can be linked, etc.

  [0]: https://github.com/kkdwivedi/linux/commits/bpf-list

Patch

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4688ce88caf4..c677d92de3bc 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5385,6 +5385,12 @@  union bpf_attr {
  *	Return
  *		0
  *
+ * void *bpf_rbtree_get_lock(struct bpf_map *map)
+ *	Description
+ *		Return the bpf_spin_lock associated with the rbtree
+ *
+ *	Return
+ *		Ptr to lock
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5600,6 +5606,7 @@  union bpf_attr {
 	FN(rbtree_find),		\
 	FN(rbtree_remove),		\
 	FN(rbtree_free_node),		\
+	FN(rbtree_get_lock),		\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 35eb66d11bf6..257a808bb767 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1587,6 +1587,7 @@  const struct bpf_func_proto bpf_rbtree_add_proto __weak;
 const struct bpf_func_proto bpf_rbtree_find_proto __weak;
 const struct bpf_func_proto bpf_rbtree_remove_proto __weak;
 const struct bpf_func_proto bpf_rbtree_free_node_proto __weak;
+const struct bpf_func_proto bpf_rbtree_get_lock_proto __weak;
 
 const struct bpf_func_proto *
 bpf_base_func_proto(enum bpf_func_id func_id)
@@ -1686,6 +1687,8 @@  bpf_base_func_proto(enum bpf_func_id func_id)
 		return &bpf_rbtree_remove_proto;
 	case BPF_FUNC_rbtree_free_node:
 		return &bpf_rbtree_free_node_proto;
+	case BPF_FUNC_rbtree_get_lock:
+		return &bpf_rbtree_get_lock_proto;
 	default:
 		break;
 	}
diff --git a/kernel/bpf/rbtree.c b/kernel/bpf/rbtree.c
index 250d62210804..c6f0a2a083f6 100644
--- a/kernel/bpf/rbtree.c
+++ b/kernel/bpf/rbtree.c
@@ -9,6 +9,7 @@ 
 struct bpf_rbtree {
 	struct bpf_map map;
 	struct rb_root_cached root;
+	struct bpf_spin_lock *lock;
 };
 
 BTF_ID_LIST_SINGLE(bpf_rbtree_btf_ids, struct, rb_node);
@@ -39,6 +40,14 @@  static struct bpf_map *rbtree_map_alloc(union bpf_attr *attr)
 
 	tree->root = RB_ROOT_CACHED;
 	bpf_map_init_from_attr(&tree->map, attr);
+
+	tree->lock = bpf_map_kzalloc(&tree->map, sizeof(struct bpf_spin_lock),
+				     GFP_KERNEL | __GFP_NOWARN);
+	if (!tree->lock) {
+		bpf_map_area_free(tree);
+		return ERR_PTR(-ENOMEM);
+	}
+
 	return &tree->map;
 }
 
@@ -139,6 +148,7 @@  static void rbtree_map_free(struct bpf_map *map)
 
 	bpf_rbtree_postorder_for_each_entry_safe(pos, n, &tree->root.rb_root)
 		kfree(pos);
+	kfree(tree->lock);
 	bpf_map_area_free(tree);
 }
 
@@ -238,6 +248,20 @@  static int rbtree_map_get_next_key(struct bpf_map *map, void *key,
 	return -ENOTSUPP;
 }
 
+BPF_CALL_1(bpf_rbtree_get_lock, struct bpf_map *, map)
+{
+	struct bpf_rbtree *tree = container_of(map, struct bpf_rbtree, map);
+
+	return (u64)tree->lock;
+}
+
+const struct bpf_func_proto bpf_rbtree_get_lock_proto = {
+	.func = bpf_rbtree_get_lock,
+	.gpl_only = true,
+	.ret_type = RET_PTR_TO_MAP_VALUE,
+	.arg1_type = ARG_CONST_MAP_PTR,
+};
+
 BTF_ID_LIST_SINGLE(bpf_rbtree_map_btf_ids, struct, bpf_rbtree)
 const struct bpf_map_ops rbtree_map_ops = {
 	.map_meta_equal = bpf_map_meta_equal,
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 4688ce88caf4..c677d92de3bc 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -5385,6 +5385,12 @@  union bpf_attr {
  *	Return
  *		0
  *
+ * void *bpf_rbtree_get_lock(struct bpf_map *map)
+ *	Description
+ *		Return the bpf_spin_lock associated with the rbtree
+ *
+ *	Return
+ *		Ptr to lock
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5600,6 +5606,7 @@  union bpf_attr {
 	FN(rbtree_find),		\
 	FN(rbtree_remove),		\
 	FN(rbtree_free_node),		\
+	FN(rbtree_get_lock),		\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper