
[v3,bpf-next,12/13] bpf: Introduce bpf_mem_free_rcu() similar to kfree_rcu().

Message ID 20230628015634.33193-13-alexei.starovoitov@gmail.com (mailing list archive)
State Superseded
Series bpf: Introduce bpf_mem_cache_free_rcu().

Commit Message

Alexei Starovoitov June 28, 2023, 1:56 a.m. UTC
From: Alexei Starovoitov <ast@kernel.org>

Introduce bpf_mem_[cache_]free_rcu(), similar to kfree_rcu().
Unlike bpf_mem_[cache_]free(), which links objects into the per-cpu free
list for immediate reuse, the _rcu() flavor waits for an RCU grace period
and then moves objects onto the free_by_rcu_ttrace list, where they wait
for an RCU tasks trace grace period before being freed into slab.

The life cycle of objects:
alloc: dequeue free_llist
free: enqueue free_llist
free_rcu: enqueue free_by_rcu -> waiting_for_gp
free_llist above high watermark -> free_by_rcu_ttrace
after RCU GP waiting_for_gp -> free_by_rcu_ttrace
free_by_rcu_ttrace -> waiting_for_gp_ttrace -> slab

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_mem_alloc.h |   2 +
 kernel/bpf/memalloc.c         | 129 +++++++++++++++++++++++++++++++++-
 2 files changed, 128 insertions(+), 3 deletions(-)
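
For illustration, here is a minimal caller-side sketch of how the new _rcu()
flavor is meant to be used. It is not part of the patch; 'struct foo' and
'foo_ma' are invented names, and the caller is assumed to run with migration
disabled, as all bpf_mem_alloc operations require:

#include <linux/bpf_mem_alloc.h>

/* Hypothetical fixed-size object managed by a kmem_cache-style bpf_mem_alloc */
struct foo {
	int val;
};

static struct bpf_mem_alloc foo_ma;

static int foo_init(void)
{
	/* a non-zero size selects the fixed-size (kmem_cache-like) flavor */
	return bpf_mem_alloc_init(&foo_ma, sizeof(struct foo), false);
}

static struct foo *foo_get(void)
{
	return bpf_mem_cache_alloc(&foo_ma);
}

static void foo_retire(struct foo *f)
{
	/* Unlike bpf_mem_cache_free(), which puts the object straight back
	 * onto the per-cpu free list for immediate reuse, the _rcu() flavor
	 * defers reuse until a regular RCU GP has passed and defers the
	 * return to slab until a further RCU tasks trace GP, as described
	 * in the life cycle above.
	 */
	bpf_mem_cache_free_rcu(&foo_ma, f);
}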

Comments

Paul E. McKenney June 28, 2023, 5:57 p.m. UTC | #1
On Tue, Jun 27, 2023 at 06:56:33PM -0700, Alexei Starovoitov wrote:
> From: Alexei Starovoitov <ast@kernel.org>
> 
> Introduce bpf_mem_[cache_]free_rcu() similar to kfree_rcu().
> Unlike bpf_mem_[cache_]free() that links objects for immediate reuse into
> per-cpu free list the _rcu() flavor waits for RCU grace period and then moves
> objects into free_by_rcu_ttrace list where they are waiting for RCU
> task trace grace period to be freed into slab.
> 
> The life cycle of objects:
> alloc: dequeue free_llist
> free: enqeueu free_llist
> free_rcu: enqueue free_by_rcu -> waiting_for_gp
> free_llist above high watermark -> free_by_rcu_ttrace
> after RCU GP waiting_for_gp -> free_by_rcu_ttrace
> free_by_rcu_ttrace -> waiting_for_gp_ttrace -> slab
> 
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> ---
>  include/linux/bpf_mem_alloc.h |   2 +
>  kernel/bpf/memalloc.c         | 129 +++++++++++++++++++++++++++++++++-
>  2 files changed, 128 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h
> index 3929be5743f4..d644bbb298af 100644
> --- a/include/linux/bpf_mem_alloc.h
> +++ b/include/linux/bpf_mem_alloc.h
> @@ -27,10 +27,12 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma);
>  /* kmalloc/kfree equivalent: */
>  void *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size);
>  void bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr);
> +void bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr);
>  
>  /* kmem_cache_alloc/free equivalent: */
>  void *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma);
>  void bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr);
> +void bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr);
>  void bpf_mem_cache_raw_free(void *ptr);
>  void *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags);
>  
> diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
> index 40524d9454c7..3081d06a434c 100644
> --- a/kernel/bpf/memalloc.c
> +++ b/kernel/bpf/memalloc.c
> @@ -101,6 +101,15 @@ struct bpf_mem_cache {
>  	bool draining;
>  	struct bpf_mem_cache *tgt;
>  
> +	/* list of objects to be freed after RCU GP */
> +	struct llist_head free_by_rcu;
> +	struct llist_node *free_by_rcu_tail;
> +	struct llist_head waiting_for_gp;
> +	struct llist_node *waiting_for_gp_tail;
> +	struct rcu_head rcu;
> +	atomic_t call_rcu_in_progress;
> +	struct llist_head free_llist_extra_rcu;
> +
>  	/* list of objects to be freed after RCU tasks trace GP */
>  	struct llist_head free_by_rcu_ttrace;
>  	struct llist_head waiting_for_gp_ttrace;
> @@ -344,6 +353,69 @@ static void free_bulk(struct bpf_mem_cache *c)
>  	do_call_rcu_ttrace(tgt);
>  }
>  
> +static void __free_by_rcu(struct rcu_head *head)
> +{
> +	struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
> +	struct bpf_mem_cache *tgt = c->tgt;
> +	struct llist_node *llnode;
> +
> +	llnode = llist_del_all(&c->waiting_for_gp);
> +	if (!llnode)
> +		goto out;
> +
> +	llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace);
> +
> +	/* Objects went through regular RCU GP. Send them to RCU tasks trace */
> +	do_call_rcu_ttrace(tgt);
> +out:
> +	atomic_set(&c->call_rcu_in_progress, 0);
> +}
> +
> +static void check_free_by_rcu(struct bpf_mem_cache *c)
> +{
> +	struct llist_node *llnode, *t;
> +	unsigned long flags;
> +
> +	/* drain free_llist_extra_rcu */
> +	if (unlikely(!llist_empty(&c->free_llist_extra_rcu))) {
> +		inc_active(c, &flags);
> +		llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra_rcu))
> +			if (__llist_add(llnode, &c->free_by_rcu))
> +				c->free_by_rcu_tail = llnode;
> +		dec_active(c, flags);
> +	}
> +
> +	if (llist_empty(&c->free_by_rcu))
> +		return;
> +
> +	if (atomic_xchg(&c->call_rcu_in_progress, 1)) {
> +		/*
> +		 * Instead of kmalloc-ing new rcu_head and triggering 10k
> +		 * call_rcu() to hit rcutree.qhimark and force RCU to notice
> +		 * the overload just ask RCU to hurry up. There could be many
> +		 * objects in free_by_rcu list.
> +		 * This hint reduces memory consumption for an artifical
> +		 * benchmark from 2 Gbyte to 150 Mbyte.
> +		 */
> +		rcu_request_urgent_qs_task(current);

I have been going back and forth on whether rcu_request_urgent_qs_task()
needs to throttle calls to itself, for example, to pay attention to only
one invocation per jiffy.  The theory here is that RCU's state machine
normally only advances about once per jiffy anyway.

The main risk of *not* throttling is if several CPUs were to invoke
rcu_request_urgent_qs_task() in tight loops while those same CPUs were
undergoing interrupt storms, which would result in heavy lock contention
in __rcu_irq_enter_check_tick().  This is not exactly a common-case
scenario, but on the other hand, if you are having this degree of trouble,
should RCU really be adding lock contention to your troubles?

Thoughts?

							Thanx, Paul

> +		return;
> +	}
> +
> +	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));
> +
> +	inc_active(c, &flags);
> +	WRITE_ONCE(c->waiting_for_gp.first, __llist_del_all(&c->free_by_rcu));
> +	c->waiting_for_gp_tail = c->free_by_rcu_tail;
> +	dec_active(c, flags);
> +
> +	if (unlikely(READ_ONCE(c->draining))) {
> +		free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size);
> +		atomic_set(&c->call_rcu_in_progress, 0);
> +	} else {
> +		call_rcu_hurry(&c->rcu, __free_by_rcu);
> +	}
> +}
> +
>  static void bpf_mem_refill(struct irq_work *work)
>  {
>  	struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work);
> @@ -358,6 +430,8 @@ static void bpf_mem_refill(struct irq_work *work)
>  		alloc_bulk(c, c->batch, NUMA_NO_NODE);
>  	else if (cnt > c->high_watermark)
>  		free_bulk(c);
> +
> +	check_free_by_rcu(c);
>  }
>  
>  static void notrace irq_work_raise(struct bpf_mem_cache *c)
> @@ -486,6 +560,9 @@ static void drain_mem_cache(struct bpf_mem_cache *c)
>  	free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu);
>  	free_all(__llist_del_all(&c->free_llist), percpu);
>  	free_all(__llist_del_all(&c->free_llist_extra), percpu);
> +	free_all(__llist_del_all(&c->free_by_rcu), percpu);
> +	free_all(__llist_del_all(&c->free_llist_extra_rcu), percpu);
> +	free_all(llist_del_all(&c->waiting_for_gp), percpu);
>  }
>  
>  static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
> @@ -498,8 +575,8 @@ static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
>  
>  static void free_mem_alloc(struct bpf_mem_alloc *ma)
>  {
> -	/* waiting_for_gp_ttrace lists was drained, but __free_rcu might
> -	 * still execute. Wait for it now before we freeing percpu caches.
> +	/* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks
> +	 * might still execute. Wait for them.
>  	 *
>  	 * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
>  	 * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
> @@ -508,7 +585,8 @@ static void free_mem_alloc(struct bpf_mem_alloc *ma)
>  	 * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by
>  	 * using rcu_trace_implies_rcu_gp() as well.
>  	 */
> -	rcu_barrier_tasks_trace();
> +	rcu_barrier(); /* wait for __free_by_rcu */
> +	rcu_barrier_tasks_trace(); /* wait for __free_rcu */
>  	if (!rcu_trace_implies_rcu_gp())
>  		rcu_barrier();
>  	free_mem_alloc_no_barrier(ma);
> @@ -561,6 +639,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
>  			irq_work_sync(&c->refill_work);
>  			drain_mem_cache(c);
>  			rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
> +			rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
>  		}
>  		/* objcg is the same across cpus */
>  		if (c->objcg)
> @@ -577,6 +656,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
>  				irq_work_sync(&c->refill_work);
>  				drain_mem_cache(c);
>  				rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
> +				rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
>  			}
>  		}
>  		if (c->objcg)
> @@ -661,6 +741,27 @@ static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
>  		irq_work_raise(c);
>  }
>  
> +static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr)
> +{
> +	struct llist_node *llnode = ptr - LLIST_NODE_SZ;
> +	unsigned long flags;
> +
> +	c->tgt = *(struct bpf_mem_cache **)llnode;
> +
> +	local_irq_save(flags);
> +	if (local_inc_return(&c->active) == 1) {
> +		if (__llist_add(llnode, &c->free_by_rcu))
> +			c->free_by_rcu_tail = llnode;
> +	} else {
> +		llist_add(llnode, &c->free_llist_extra_rcu);
> +	}
> +	local_dec(&c->active);
> +	local_irq_restore(flags);
> +
> +	if (!atomic_read(&c->call_rcu_in_progress))
> +		irq_work_raise(c);
> +}
> +
>  /* Called from BPF program or from sys_bpf syscall.
>   * In both cases migration is disabled.
>   */
> @@ -694,6 +795,20 @@ void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
>  	unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr);
>  }
>  
> +void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
> +{
> +	int idx;
> +
> +	if (!ptr)
> +		return;
> +
> +	idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ));
> +	if (idx < 0)
> +		return;
> +
> +	unit_free_rcu(this_cpu_ptr(ma->caches)->cache + idx, ptr);
> +}
> +
>  void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma)
>  {
>  	void *ret;
> @@ -710,6 +825,14 @@ void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
>  	unit_free(this_cpu_ptr(ma->cache), ptr);
>  }
>  
> +void notrace bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
> +{
> +	if (!ptr)
> +		return;
> +
> +	unit_free_rcu(this_cpu_ptr(ma->cache), ptr);
> +}
> +
>  /* Directly does a kfree() without putting 'ptr' back to the free_llist
>   * for reuse and without waiting for a rcu_tasks_trace gp.
>   * The caller must first go through the rcu_tasks_trace gp for 'ptr'
> -- 
> 2.34.1
>
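
For reference, here is one possible shape of the per-jiffy throttling Paul
raises in #1 above. It is a sketch only, not something this patch adds;
maybe_request_urgent_qs() is an invented wrapper around the existing
rcu_request_urgent_qs_task():

/* Sketch (kernel context assumed: jiffies, time_after(), cmpxchg()).
 * Issue at most one urgent-QS request per jiffy from this call site,
 * on the theory that RCU's state machine only advances about once per
 * jiffy anyway.
 */
static void maybe_request_urgent_qs(void)
{
	static unsigned long last_qs_jiffies;
	unsigned long prev = READ_ONCE(last_qs_jiffies);

	if (!time_after(jiffies, prev))
		return;
	if (cmpxchg(&last_qs_jiffies, prev, jiffies) == prev)
		rcu_request_urgent_qs_task(current);
}

As the follow-ups below note, the contention scenario is considered hard to
trigger in practice, so the patch leaves the call unthrottled.
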
Hou Tao June 29, 2023, 2:24 a.m. UTC | #2
Hi,

On 6/28/2023 9:56 AM, Alexei Starovoitov wrote:
> From: Alexei Starovoitov <ast@kernel.org>
>
> Introduce bpf_mem_[cache_]free_rcu() similar to kfree_rcu().
> Unlike bpf_mem_[cache_]free() that links objects for immediate reuse into
> per-cpu free list the _rcu() flavor waits for RCU grace period and then moves
> objects into free_by_rcu_ttrace list where they are waiting for RCU
> task trace grace period to be freed into slab.
>
> The life cycle of objects:
> alloc: dequeue free_llist
> free: enqeueu free_llist
> free_rcu: enqueue free_by_rcu -> waiting_for_gp
> free_llist above high watermark -> free_by_rcu_ttrace
> after RCU GP waiting_for_gp -> free_by_rcu_ttrace
> free_by_rcu_ttrace -> waiting_for_gp_ttrace -> slab
>
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
SNIP
>  
> +static void __free_by_rcu(struct rcu_head *head)
> +{
> +	struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
> +	struct bpf_mem_cache *tgt = c->tgt;
> +	struct llist_node *llnode;
> +
> +	llnode = llist_del_all(&c->waiting_for_gp);
> +	if (!llnode)
> +		goto out;
> +
> +	llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace);
> +
> +	/* Objects went through regular RCU GP. Send them to RCU tasks trace */
> +	do_call_rcu_ttrace(tgt);

I still got a report about leaked free_by_rcu_ttrace objects without adding
any extra hack beyond using bpf_mem_cache_free_rcu() in htab.

When a bpf ma is freed through free_mem_alloc(), the following sequence
may lead to a leak of objects on free_by_rcu_ttrace:

P1: bpf_mem_alloc_destroy()
    P2: __free_by_rcu()

    // got false
    P2: read c->draining

P1: c->draining = true
P1: llist_del_all(&c->free_by_rcu_ttrace)

    // add to free_by_rcu_ttrace again
    P2: llist_add_batch(..., &tgt->free_by_rcu_ttrace)
        P2: do_call_rcu_ttrace()
            // call_rcu_ttrace_in_progress is 1, so xchg returns 1
            // and the list doesn't get moved to waiting_for_gp_ttrace
            P2: atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)

// got 1
P1: atomic_read(&c->call_rcu_ttrace_in_progress)
// objects in free_by_rcu_ttrace are leaked

I think the race could be fixed by checking c->draining in
do_call_rcu_ttrace() when atomic_xchg() returns 1 as shown below:

diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 2bdb894392c5..9f41025560bd 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -303,8 +303,13 @@ static void do_call_rcu_ttrace(struct bpf_mem_cache *c)
 {
        struct llist_node *llnode, *t;

-       if (atomic_xchg(&c->call_rcu_ttrace_in_progress, 1))
+       if (atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)) {
+               if (READ_ONCE(c->draining)) {
+                       llnode = llist_del_all(&c->free_by_rcu_ttrace);
+                       free_all(llnode, !!c->percpu_size);
+               }
                return;
+       }
Alexei Starovoitov June 29, 2023, 3:42 a.m. UTC | #3
On Wed, Jun 28, 2023 at 7:24 PM Hou Tao <houtao@huaweicloud.com> wrote:
>
> Hi,
>
> On 6/28/2023 9:56 AM, Alexei Starovoitov wrote:
> > From: Alexei Starovoitov <ast@kernel.org>
> >
> > Introduce bpf_mem_[cache_]free_rcu() similar to kfree_rcu().
> > Unlike bpf_mem_[cache_]free() that links objects for immediate reuse into
> > per-cpu free list the _rcu() flavor waits for RCU grace period and then moves
> > objects into free_by_rcu_ttrace list where they are waiting for RCU
> > task trace grace period to be freed into slab.
> >
> > The life cycle of objects:
> > alloc: dequeue free_llist
> > free: enqeueu free_llist
> > free_rcu: enqueue free_by_rcu -> waiting_for_gp
> > free_llist above high watermark -> free_by_rcu_ttrace
> > after RCU GP waiting_for_gp -> free_by_rcu_ttrace
> > free_by_rcu_ttrace -> waiting_for_gp_ttrace -> slab
> >
> > Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> SNIP
> >
> > +static void __free_by_rcu(struct rcu_head *head)
> > +{
> > +     struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
> > +     struct bpf_mem_cache *tgt = c->tgt;
> > +     struct llist_node *llnode;
> > +
> > +     llnode = llist_del_all(&c->waiting_for_gp);
> > +     if (!llnode)
> > +             goto out;
> > +
> > +     llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace);
> > +
> > +     /* Objects went through regular RCU GP. Send them to RCU tasks trace */
> > +     do_call_rcu_ttrace(tgt);
>
> I still got report about leaked free_by_rcu_ttrace without adding any
> extra hack except using bpf_mem_cache_free_rcu() in htab.

Please share the steps to repro.

> When bpf ma is freed through free_mem_alloc(), the following sequence
> may lead to leak of free_by_rcu_ttrace:
>
> P1: bpf_mem_alloc_destroy()
>     P2: __free_by_rcu()
>
>     // got false
>     P2: read c->draining
>
> P1: c->draining = true
> P1: llist_del_all(&c->free_by_rcu_ttrace)
>
>     // add to free_by_rcu_ttrace again
>     P2: llist_add_batch(..., &tgt->free_by_rcu_ttrace)
>         P2: do_call_rcu_ttrace()
>             // call_rcu_ttrace_in_progress is 1, so xchg return 1
>             // and it doesn't being moved to waiting_for_gp_ttrace
>             P2: atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)
>
> // got 1
> P1: atomic_read(&c->call_rcu_ttrace_in_progress)
> // objects in free_by_rcu_ttrace is leaked
>
> I think the race could be fixed by checking c->draining in
> do_call_rcu_ttrace() when atomic_xchg() returns 1 as shown below:

If the theory of the bug holds true then the fix makes sense,
but did you repro without the fix and confirm it no longer repros with the fix?
We should not add extra code based on a hunch.
Alexei Starovoitov June 29, 2023, 3:52 a.m. UTC | #4
On Wed, Jun 28, 2023 at 10:57 AM Paul E. McKenney <paulmck@kernel.org> wrote:
>
> On Tue, Jun 27, 2023 at 06:56:33PM -0700, Alexei Starovoitov wrote:
> > From: Alexei Starovoitov <ast@kernel.org>
> >
> > Introduce bpf_mem_[cache_]free_rcu() similar to kfree_rcu().
> > Unlike bpf_mem_[cache_]free() that links objects for immediate reuse into
> > per-cpu free list the _rcu() flavor waits for RCU grace period and then moves
> > objects into free_by_rcu_ttrace list where they are waiting for RCU
> > task trace grace period to be freed into slab.
> >
> > The life cycle of objects:
> > alloc: dequeue free_llist
> > free: enqeueu free_llist
> > free_rcu: enqueue free_by_rcu -> waiting_for_gp
> > free_llist above high watermark -> free_by_rcu_ttrace
> > after RCU GP waiting_for_gp -> free_by_rcu_ttrace
> > free_by_rcu_ttrace -> waiting_for_gp_ttrace -> slab
> >
> > Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> > ---
> >  include/linux/bpf_mem_alloc.h |   2 +
> >  kernel/bpf/memalloc.c         | 129 +++++++++++++++++++++++++++++++++-
> >  2 files changed, 128 insertions(+), 3 deletions(-)
> >
> > diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h
> > index 3929be5743f4..d644bbb298af 100644
> > --- a/include/linux/bpf_mem_alloc.h
> > +++ b/include/linux/bpf_mem_alloc.h
> > @@ -27,10 +27,12 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma);
> >  /* kmalloc/kfree equivalent: */
> >  void *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size);
> >  void bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr);
> > +void bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr);
> >
> >  /* kmem_cache_alloc/free equivalent: */
> >  void *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma);
> >  void bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr);
> > +void bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr);
> >  void bpf_mem_cache_raw_free(void *ptr);
> >  void *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags);
> >
> > diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
> > index 40524d9454c7..3081d06a434c 100644
> > --- a/kernel/bpf/memalloc.c
> > +++ b/kernel/bpf/memalloc.c
> > @@ -101,6 +101,15 @@ struct bpf_mem_cache {
> >       bool draining;
> >       struct bpf_mem_cache *tgt;
> >
> > +     /* list of objects to be freed after RCU GP */
> > +     struct llist_head free_by_rcu;
> > +     struct llist_node *free_by_rcu_tail;
> > +     struct llist_head waiting_for_gp;
> > +     struct llist_node *waiting_for_gp_tail;
> > +     struct rcu_head rcu;
> > +     atomic_t call_rcu_in_progress;
> > +     struct llist_head free_llist_extra_rcu;
> > +
> >       /* list of objects to be freed after RCU tasks trace GP */
> >       struct llist_head free_by_rcu_ttrace;
> >       struct llist_head waiting_for_gp_ttrace;
> > @@ -344,6 +353,69 @@ static void free_bulk(struct bpf_mem_cache *c)
> >       do_call_rcu_ttrace(tgt);
> >  }
> >
> > +static void __free_by_rcu(struct rcu_head *head)
> > +{
> > +     struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
> > +     struct bpf_mem_cache *tgt = c->tgt;
> > +     struct llist_node *llnode;
> > +
> > +     llnode = llist_del_all(&c->waiting_for_gp);
> > +     if (!llnode)
> > +             goto out;
> > +
> > +     llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace);
> > +
> > +     /* Objects went through regular RCU GP. Send them to RCU tasks trace */
> > +     do_call_rcu_ttrace(tgt);
> > +out:
> > +     atomic_set(&c->call_rcu_in_progress, 0);
> > +}
> > +
> > +static void check_free_by_rcu(struct bpf_mem_cache *c)
> > +{
> > +     struct llist_node *llnode, *t;
> > +     unsigned long flags;
> > +
> > +     /* drain free_llist_extra_rcu */
> > +     if (unlikely(!llist_empty(&c->free_llist_extra_rcu))) {
> > +             inc_active(c, &flags);
> > +             llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra_rcu))
> > +                     if (__llist_add(llnode, &c->free_by_rcu))
> > +                             c->free_by_rcu_tail = llnode;
> > +             dec_active(c, flags);
> > +     }
> > +
> > +     if (llist_empty(&c->free_by_rcu))
> > +             return;
> > +
> > +     if (atomic_xchg(&c->call_rcu_in_progress, 1)) {
> > +             /*
> > +              * Instead of kmalloc-ing new rcu_head and triggering 10k
> > +              * call_rcu() to hit rcutree.qhimark and force RCU to notice
> > +              * the overload just ask RCU to hurry up. There could be many
> > +              * objects in free_by_rcu list.
> > +              * This hint reduces memory consumption for an artifical
> > +              * benchmark from 2 Gbyte to 150 Mbyte.
> > +              */
> > +             rcu_request_urgent_qs_task(current);
>
> I have been going back and forth on whether rcu_request_urgent_qs_task()
> needs to throttle calls to itself, for example, to pay attention to only
> one invocation per jiffy.  The theory here is that RCU's state machine
> normally only advances about once per jiffy anyway.
>
> The main risk of *not* throttling is if several CPUs were to invoke
> rcu_request_urgent_qs_task() in tight loops while those same CPUs were
> undergoing interrupt storms, which would result in heavy lock contention
> in __rcu_irq_enter_check_tick().  This is not exactly a common-case
> scenario, but on the other hand, if you are having this degree of trouble,
> should RCU really be adding lock contention to your troubles?

I see the spin_lock in __rcu_irq_enter_check_tick(), but I didn't observe
contention in practice even when I was calling rcu_request_urgent_qs_task()
in multiple places throughout bpf_mem_alloc.
I left it only in one place (this patch),
because it was enough to 'hurry up the RCU' and make the difference.
rdp = this_cpu_ptr(&rcu_data); is percpu, so I'm not sure why
you think that the contention is possible.
I think we should avoid extra logic either in RCU or in bpf_mem_alloc
to keep the code simple, since contention is hypothetical at this point.
I've tried preempt and no preempt configs. With and without debug.
Hou Tao June 29, 2023, 4:31 a.m. UTC | #5
Hi,

On 6/29/2023 11:42 AM, Alexei Starovoitov wrote:
> On Wed, Jun 28, 2023 at 7:24 PM Hou Tao <houtao@huaweicloud.com> wrote:
>> Hi,
>>
>> On 6/28/2023 9:56 AM, Alexei Starovoitov wrote:
>>> From: Alexei Starovoitov <ast@kernel.org>
>>>
>>> Introduce bpf_mem_[cache_]free_rcu() similar to kfree_rcu().
>>> Unlike bpf_mem_[cache_]free() that links objects for immediate reuse into
>>> per-cpu free list the _rcu() flavor waits for RCU grace period and then moves
>>> objects into free_by_rcu_ttrace list where they are waiting for RCU
>>> task trace grace period to be freed into slab.
>>>
>>> The life cycle of objects:
>>> alloc: dequeue free_llist
>>> free: enqeueu free_llist
>>> free_rcu: enqueue free_by_rcu -> waiting_for_gp
>>> free_llist above high watermark -> free_by_rcu_ttrace
>>> after RCU GP waiting_for_gp -> free_by_rcu_ttrace
>>> free_by_rcu_ttrace -> waiting_for_gp_ttrace -> slab
>>>
>>> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
>> SNIP
>>> +static void __free_by_rcu(struct rcu_head *head)
>>> +{
>>> +     struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
>>> +     struct bpf_mem_cache *tgt = c->tgt;
>>> +     struct llist_node *llnode;
>>> +
>>> +     llnode = llist_del_all(&c->waiting_for_gp);
>>> +     if (!llnode)
>>> +             goto out;
>>> +
>>> +     llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace);
>>> +
>>> +     /* Objects went through regular RCU GP. Send them to RCU tasks trace */
>>> +     do_call_rcu_ttrace(tgt);
>> I still got report about leaked free_by_rcu_ttrace without adding any
>> extra hack except using bpf_mem_cache_free_rcu() in htab.
> Please share the steps to repro.
First, I added the attached patch to check whether there are leaked
elements when calling free_mem_alloc_no_barrier(). Then I ran the
following script to do a stress test in a KVM VM with 72 CPUs and
64 GB of memory:


#!/bin/bash

BASE=/all/code/linux_working/

{
        cd $BASE/tools/testing/selftests/bpf
        for x in $(seq 2)
        do
                while true; do ./test_maps &>/dev/null; done &
        done
} &

{
        cd $BASE/samples/bpf
        for y in $(seq 8)
        do
                while true; do ./map_perf_test &>/dev/null; done &
        done
} &

{
        cd $BASE/tools/testing/selftests/bpf
        for z in $(seq 8)
        do
                while true
                do
                        for name in overwrite batch_add_batch_del add_del_on_diff_cpu
                        do
                                ./bench htab-mem -w1 -d5 -a -p32 --use-case $name
                        done
                done &>/dev/null &
                sleep 0.3
        done
} &


>
>> When bpf ma is freed through free_mem_alloc(), the following sequence
>> may lead to leak of free_by_rcu_ttrace:
>>
>> P1: bpf_mem_alloc_destroy()
>>     P2: __free_by_rcu()
>>
>>     // got false
>>     P2: read c->draining
>>
>> P1: c->draining = true
>> P1: llist_del_all(&c->free_by_rcu_ttrace)
>>
>>     // add to free_by_rcu_ttrace again
>>     P2: llist_add_batch(..., &tgt->free_by_rcu_ttrace)
>>         P2: do_call_rcu_ttrace()
>>             // call_rcu_ttrace_in_progress is 1, so xchg return 1
>>             // and it doesn't being moved to waiting_for_gp_ttrace
>>             P2: atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)
>>
>> // got 1
>> P1: atomic_read(&c->call_rcu_ttrace_in_progress)
>> // objects in free_by_rcu_ttrace is leaked
>>
>> I think the race could be fixed by checking c->draining in
>> do_call_rcu_ttrace() when atomic_xchg() returns 1 as shown below:
> If the theory of the bug holds true then the fix makes sense,
> but did you repro without fix and cannot repro with the fix?
> We should not add extra code based on a hunch.
Yes. With only the fix applied, the leak didn't occur in the last 13 hours.
>
> .
From 5e44ec29f5aa037ba8d1188ccee456ab3996d03e Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Wed, 21 Jun 2023 17:17:52 +0800
Subject: [PATCH] bpf: Check leaked objects

---
 kernel/bpf/memalloc.c | 48 ++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 45 insertions(+), 3 deletions(-)

diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 3081d06a434c..2bdb894392c5 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -565,8 +565,50 @@ static void drain_mem_cache(struct bpf_mem_cache *c)
 	free_all(llist_del_all(&c->waiting_for_gp), percpu);
 }
 
-static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
+static void check_mem_cache(struct bpf_mem_cache *c, bool direct)
+{
+	if (!llist_empty(&c->free_by_rcu_ttrace))
+		pr_warn("leak: free_by_rcu_ttrace %d\n", direct);
+	if (!llist_empty(&c->waiting_for_gp_ttrace))
+		pr_warn("leak: waiting_for_gp_ttrace %d\n", direct);
+	if (!llist_empty(&c->free_llist))
+		pr_warn("leak: free_llist %d\n", direct);
+	if (!llist_empty(&c->free_llist_extra))
+		pr_warn("leak: free_llist_extra %d\n", direct);
+	if (!llist_empty(&c->free_by_rcu))
+		pr_warn("leak: free_by_rcu %d\n", direct);
+	if (!llist_empty(&c->free_llist_extra_rcu))
+		pr_warn("leak: free_llist_extra_rcu %d\n", direct);
+	if (!llist_empty(&c->waiting_for_gp))
+		pr_warn("leak: waiting_for_gp %d\n", direct);
+}
+
+static void check_leaked_objs(struct bpf_mem_alloc *ma, bool direct)
 {
+	struct bpf_mem_caches *cc;
+	struct bpf_mem_cache *c;
+	int cpu, i;
+
+	if (ma->cache) {
+		for_each_possible_cpu(cpu) {
+			c = per_cpu_ptr(ma->cache, cpu);
+			check_mem_cache(c, direct);
+		}
+	}
+	if (ma->caches) {
+		for_each_possible_cpu(cpu) {
+			cc = per_cpu_ptr(ma->caches, cpu);
+			for (i = 0; i < NUM_CACHES; i++) {
+				c = &cc->cache[i];
+				check_mem_cache(c, direct);
+			}
+		}
+	}
+}
+
+static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma, bool direct)
+{
+	check_leaked_objs(ma, direct);
 	free_percpu(ma->cache);
 	free_percpu(ma->caches);
 	ma->cache = NULL;
@@ -589,7 +631,7 @@ static void free_mem_alloc(struct bpf_mem_alloc *ma)
 	rcu_barrier_tasks_trace(); /* wait for __free_rcu */
 	if (!rcu_trace_implies_rcu_gp())
 		rcu_barrier();
-	free_mem_alloc_no_barrier(ma);
+	free_mem_alloc_no_barrier(ma, false);
 }
 
 static void free_mem_alloc_deferred(struct work_struct *work)
@@ -608,7 +650,7 @@ static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress)
 		/* Fast path. No callbacks are pending, hence no need to do
 		 * rcu_barrier-s.
 		 */
-		free_mem_alloc_no_barrier(ma);
+		free_mem_alloc_no_barrier(ma, true);
 		return;
 	}
Paul E. McKenney June 29, 2023, 2:20 p.m. UTC | #6
On Wed, Jun 28, 2023 at 08:52:12PM -0700, Alexei Starovoitov wrote:
> On Wed, Jun 28, 2023 at 10:57 AM Paul E. McKenney <paulmck@kernel.org> wrote:
> >
> > On Tue, Jun 27, 2023 at 06:56:33PM -0700, Alexei Starovoitov wrote:
> > > From: Alexei Starovoitov <ast@kernel.org>
> > >
> > > Introduce bpf_mem_[cache_]free_rcu() similar to kfree_rcu().
> > > Unlike bpf_mem_[cache_]free() that links objects for immediate reuse into
> > > per-cpu free list the _rcu() flavor waits for RCU grace period and then moves
> > > objects into free_by_rcu_ttrace list where they are waiting for RCU
> > > task trace grace period to be freed into slab.
> > >
> > > The life cycle of objects:
> > > alloc: dequeue free_llist
> > > free: enqeueu free_llist
> > > free_rcu: enqueue free_by_rcu -> waiting_for_gp
> > > free_llist above high watermark -> free_by_rcu_ttrace
> > > after RCU GP waiting_for_gp -> free_by_rcu_ttrace
> > > free_by_rcu_ttrace -> waiting_for_gp_ttrace -> slab
> > >
> > > Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> > > ---
> > >  include/linux/bpf_mem_alloc.h |   2 +
> > >  kernel/bpf/memalloc.c         | 129 +++++++++++++++++++++++++++++++++-
> > >  2 files changed, 128 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h
> > > index 3929be5743f4..d644bbb298af 100644
> > > --- a/include/linux/bpf_mem_alloc.h
> > > +++ b/include/linux/bpf_mem_alloc.h
> > > @@ -27,10 +27,12 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma);
> > >  /* kmalloc/kfree equivalent: */
> > >  void *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size);
> > >  void bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr);
> > > +void bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr);
> > >
> > >  /* kmem_cache_alloc/free equivalent: */
> > >  void *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma);
> > >  void bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr);
> > > +void bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr);
> > >  void bpf_mem_cache_raw_free(void *ptr);
> > >  void *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags);
> > >
> > > diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
> > > index 40524d9454c7..3081d06a434c 100644
> > > --- a/kernel/bpf/memalloc.c
> > > +++ b/kernel/bpf/memalloc.c
> > > @@ -101,6 +101,15 @@ struct bpf_mem_cache {
> > >       bool draining;
> > >       struct bpf_mem_cache *tgt;
> > >
> > > +     /* list of objects to be freed after RCU GP */
> > > +     struct llist_head free_by_rcu;
> > > +     struct llist_node *free_by_rcu_tail;
> > > +     struct llist_head waiting_for_gp;
> > > +     struct llist_node *waiting_for_gp_tail;
> > > +     struct rcu_head rcu;
> > > +     atomic_t call_rcu_in_progress;
> > > +     struct llist_head free_llist_extra_rcu;
> > > +
> > >       /* list of objects to be freed after RCU tasks trace GP */
> > >       struct llist_head free_by_rcu_ttrace;
> > >       struct llist_head waiting_for_gp_ttrace;
> > > @@ -344,6 +353,69 @@ static void free_bulk(struct bpf_mem_cache *c)
> > >       do_call_rcu_ttrace(tgt);
> > >  }
> > >
> > > +static void __free_by_rcu(struct rcu_head *head)
> > > +{
> > > +     struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
> > > +     struct bpf_mem_cache *tgt = c->tgt;
> > > +     struct llist_node *llnode;
> > > +
> > > +     llnode = llist_del_all(&c->waiting_for_gp);
> > > +     if (!llnode)
> > > +             goto out;
> > > +
> > > +     llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace);
> > > +
> > > +     /* Objects went through regular RCU GP. Send them to RCU tasks trace */
> > > +     do_call_rcu_ttrace(tgt);
> > > +out:
> > > +     atomic_set(&c->call_rcu_in_progress, 0);
> > > +}
> > > +
> > > +static void check_free_by_rcu(struct bpf_mem_cache *c)
> > > +{
> > > +     struct llist_node *llnode, *t;
> > > +     unsigned long flags;
> > > +
> > > +     /* drain free_llist_extra_rcu */
> > > +     if (unlikely(!llist_empty(&c->free_llist_extra_rcu))) {
> > > +             inc_active(c, &flags);
> > > +             llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra_rcu))
> > > +                     if (__llist_add(llnode, &c->free_by_rcu))
> > > +                             c->free_by_rcu_tail = llnode;
> > > +             dec_active(c, flags);
> > > +     }
> > > +
> > > +     if (llist_empty(&c->free_by_rcu))
> > > +             return;
> > > +
> > > +     if (atomic_xchg(&c->call_rcu_in_progress, 1)) {
> > > +             /*
> > > +              * Instead of kmalloc-ing new rcu_head and triggering 10k
> > > +              * call_rcu() to hit rcutree.qhimark and force RCU to notice
> > > +              * the overload just ask RCU to hurry up. There could be many
> > > +              * objects in free_by_rcu list.
> > > +              * This hint reduces memory consumption for an artifical
> > > +              * benchmark from 2 Gbyte to 150 Mbyte.
> > > +              */
> > > +             rcu_request_urgent_qs_task(current);
> >
> > I have been going back and forth on whether rcu_request_urgent_qs_task()
> > needs to throttle calls to itself, for example, to pay attention to only
> > one invocation per jiffy.  The theory here is that RCU's state machine
> > normally only advances about once per jiffy anyway.
> >
> > The main risk of *not* throttling is if several CPUs were to invoke
> > rcu_request_urgent_qs_task() in tight loops while those same CPUs were
> > undergoing interrupt storms, which would result in heavy lock contention
> > in __rcu_irq_enter_check_tick().  This is not exactly a common-case
> > scenario, but on the other hand, if you are having this degree of trouble,
> > should RCU really be adding lock contention to your troubles?
> 
> I see spin_lock in __rcu_irq_enter_check_tick(), but I didn't observe
> it in practice even when I was calling rcu_request_urgent_qs_task()
> in multiple places through bpf_mem_alloc.
> I left it only in one place (this patch),
> because it was enough to 'hurry up the RCU' and make the difference.
> rdp = this_cpu_ptr(&rcu_data); is percpu, so I'm not sure why
> you think that the contention is possible.
> I think we should avoid extra logic either in RCU or in bpf_mem_alloc
> to keep the code simple, since contention is hypothetical at this point.
> I've tried preempt and no preempt configs. With and without debug.

Thank you for checking.

The trick in __rcu_irq_enter_check_tick() is this:

	raw_spin_lock_rcu_node(rdp->mynode);

This is not acquiring a per-CPU lock, but rather the lock on the
leaf rcu_node structure, which by default is shared by 16 CPUs.
So if there was a moderate interrupt storm on several of the CPUs
sharing a given leaf rcu_node structure while there were frequent calls
to rcu_request_urgent_qs_task()), you could see lock contention, and
possibly rather heavy lock contention.

But it would need to be a moderate interrupt storm, that is, enough
to force contention on that lock, but not enough to prevent the CPU
from executing rcu_request_urgent_qs_task() frequently enough to force
__rcu_irq_enter_check_tick() to acquire that lock almost every time.

So, given that this is likely to be quite difficult (and perhaps even
impossible) to trigger, I am happy to leave this alone unless/until
it becomes a problem.  At that point, the problem might be solved by
adjusting the new callers (who might or might not be in BPF), in which
case the code could remain the same.  Or throttling within
rcu_request_urgent_qs_task() might prove necessary at that point.

But I figured that I should at least let you guys know of this
possibility.

Yes, I am paranoid.  Which is why RCU works at all.  ;-)

							Thanx, Paul
Alexei Starovoitov July 6, 2023, 3:25 a.m. UTC | #7
On Wed, Jun 28, 2023 at 7:24 PM Hou Tao <houtao@huaweicloud.com> wrote:
>
> I think the race could be fixed by checking c->draining in
> do_call_rcu_ttrace() when atomic_xchg() returns 1 as shown below:
>
> diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
> index 2bdb894392c5..9f41025560bd 100644
> --- a/kernel/bpf/memalloc.c
> +++ b/kernel/bpf/memalloc.c
> @@ -303,8 +303,13 @@ static void do_call_rcu_ttrace(struct bpf_mem_cache *c)
>  {
>         struct llist_node *llnode, *t;
>
> -       if (atomic_xchg(&c->call_rcu_ttrace_in_progress, 1))
> +       if (atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)) {
> +               if (READ_ONCE(c->draining)) {
> +                       llnode = llist_del_all(&c->free_by_rcu_ttrace);
> +                       free_all(llnode, !!c->percpu_size);
> +               }
>                 return;
> +       }

I managed to repro with your extra check-leaks patch that I will
include in the series.
The fix also makes sense.
Thanks

Patch

diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h
index 3929be5743f4..d644bbb298af 100644
--- a/include/linux/bpf_mem_alloc.h
+++ b/include/linux/bpf_mem_alloc.h
@@ -27,10 +27,12 @@  void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma);
 /* kmalloc/kfree equivalent: */
 void *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size);
 void bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr);
+void bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr);
 
 /* kmem_cache_alloc/free equivalent: */
 void *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma);
 void bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr);
+void bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr);
 void bpf_mem_cache_raw_free(void *ptr);
 void *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags);
 
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 40524d9454c7..3081d06a434c 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -101,6 +101,15 @@  struct bpf_mem_cache {
 	bool draining;
 	struct bpf_mem_cache *tgt;
 
+	/* list of objects to be freed after RCU GP */
+	struct llist_head free_by_rcu;
+	struct llist_node *free_by_rcu_tail;
+	struct llist_head waiting_for_gp;
+	struct llist_node *waiting_for_gp_tail;
+	struct rcu_head rcu;
+	atomic_t call_rcu_in_progress;
+	struct llist_head free_llist_extra_rcu;
+
 	/* list of objects to be freed after RCU tasks trace GP */
 	struct llist_head free_by_rcu_ttrace;
 	struct llist_head waiting_for_gp_ttrace;
@@ -344,6 +353,69 @@  static void free_bulk(struct bpf_mem_cache *c)
 	do_call_rcu_ttrace(tgt);
 }
 
+static void __free_by_rcu(struct rcu_head *head)
+{
+	struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
+	struct bpf_mem_cache *tgt = c->tgt;
+	struct llist_node *llnode;
+
+	llnode = llist_del_all(&c->waiting_for_gp);
+	if (!llnode)
+		goto out;
+
+	llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace);
+
+	/* Objects went through regular RCU GP. Send them to RCU tasks trace */
+	do_call_rcu_ttrace(tgt);
+out:
+	atomic_set(&c->call_rcu_in_progress, 0);
+}
+
+static void check_free_by_rcu(struct bpf_mem_cache *c)
+{
+	struct llist_node *llnode, *t;
+	unsigned long flags;
+
+	/* drain free_llist_extra_rcu */
+	if (unlikely(!llist_empty(&c->free_llist_extra_rcu))) {
+		inc_active(c, &flags);
+		llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra_rcu))
+			if (__llist_add(llnode, &c->free_by_rcu))
+				c->free_by_rcu_tail = llnode;
+		dec_active(c, flags);
+	}
+
+	if (llist_empty(&c->free_by_rcu))
+		return;
+
+	if (atomic_xchg(&c->call_rcu_in_progress, 1)) {
+		/*
+		 * Instead of kmalloc-ing new rcu_head and triggering 10k
+		 * call_rcu() to hit rcutree.qhimark and force RCU to notice
+		 * the overload just ask RCU to hurry up. There could be many
+		 * objects in free_by_rcu list.
+		 * This hint reduces memory consumption for an artificial
+		 * benchmark from 2 Gbyte to 150 Mbyte.
+		 */
+		rcu_request_urgent_qs_task(current);
+		return;
+	}
+
+	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));
+
+	inc_active(c, &flags);
+	WRITE_ONCE(c->waiting_for_gp.first, __llist_del_all(&c->free_by_rcu));
+	c->waiting_for_gp_tail = c->free_by_rcu_tail;
+	dec_active(c, flags);
+
+	if (unlikely(READ_ONCE(c->draining))) {
+		free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size);
+		atomic_set(&c->call_rcu_in_progress, 0);
+	} else {
+		call_rcu_hurry(&c->rcu, __free_by_rcu);
+	}
+}
+
 static void bpf_mem_refill(struct irq_work *work)
 {
 	struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work);
@@ -358,6 +430,8 @@  static void bpf_mem_refill(struct irq_work *work)
 		alloc_bulk(c, c->batch, NUMA_NO_NODE);
 	else if (cnt > c->high_watermark)
 		free_bulk(c);
+
+	check_free_by_rcu(c);
 }
 
 static void notrace irq_work_raise(struct bpf_mem_cache *c)
@@ -486,6 +560,9 @@  static void drain_mem_cache(struct bpf_mem_cache *c)
 	free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu);
 	free_all(__llist_del_all(&c->free_llist), percpu);
 	free_all(__llist_del_all(&c->free_llist_extra), percpu);
+	free_all(__llist_del_all(&c->free_by_rcu), percpu);
+	free_all(__llist_del_all(&c->free_llist_extra_rcu), percpu);
+	free_all(llist_del_all(&c->waiting_for_gp), percpu);
 }
 
 static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
@@ -498,8 +575,8 @@  static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
 
 static void free_mem_alloc(struct bpf_mem_alloc *ma)
 {
-	/* waiting_for_gp_ttrace lists was drained, but __free_rcu might
-	 * still execute. Wait for it now before we freeing percpu caches.
+	/* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks
+	 * might still execute. Wait for them.
 	 *
 	 * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
 	 * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
@@ -508,7 +585,8 @@  static void free_mem_alloc(struct bpf_mem_alloc *ma)
 	 * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by
 	 * using rcu_trace_implies_rcu_gp() as well.
 	 */
-	rcu_barrier_tasks_trace();
+	rcu_barrier(); /* wait for __free_by_rcu */
+	rcu_barrier_tasks_trace(); /* wait for __free_rcu */
 	if (!rcu_trace_implies_rcu_gp())
 		rcu_barrier();
 	free_mem_alloc_no_barrier(ma);
@@ -561,6 +639,7 @@  void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
 			irq_work_sync(&c->refill_work);
 			drain_mem_cache(c);
 			rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
+			rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
 		}
 		/* objcg is the same across cpus */
 		if (c->objcg)
@@ -577,6 +656,7 @@  void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
 				irq_work_sync(&c->refill_work);
 				drain_mem_cache(c);
 				rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
+				rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
 			}
 		}
 		if (c->objcg)
@@ -661,6 +741,27 @@  static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
 		irq_work_raise(c);
 }
 
+static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr)
+{
+	struct llist_node *llnode = ptr - LLIST_NODE_SZ;
+	unsigned long flags;
+
+	c->tgt = *(struct bpf_mem_cache **)llnode;
+
+	local_irq_save(flags);
+	if (local_inc_return(&c->active) == 1) {
+		if (__llist_add(llnode, &c->free_by_rcu))
+			c->free_by_rcu_tail = llnode;
+	} else {
+		llist_add(llnode, &c->free_llist_extra_rcu);
+	}
+	local_dec(&c->active);
+	local_irq_restore(flags);
+
+	if (!atomic_read(&c->call_rcu_in_progress))
+		irq_work_raise(c);
+}
+
 /* Called from BPF program or from sys_bpf syscall.
  * In both cases migration is disabled.
  */
@@ -694,6 +795,20 @@  void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
 	unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr);
 }
 
+void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
+{
+	int idx;
+
+	if (!ptr)
+		return;
+
+	idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ));
+	if (idx < 0)
+		return;
+
+	unit_free_rcu(this_cpu_ptr(ma->caches)->cache + idx, ptr);
+}
+
 void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma)
 {
 	void *ret;
@@ -710,6 +825,14 @@  void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
 	unit_free(this_cpu_ptr(ma->cache), ptr);
 }
 
+void notrace bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
+{
+	if (!ptr)
+		return;
+
+	unit_free_rcu(this_cpu_ptr(ma->cache), ptr);
+}
+
 /* Directly does a kfree() without putting 'ptr' back to the free_llist
  * for reuse and without waiting for a rcu_tasks_trace gp.
  * The caller must first go through the rcu_tasks_trace gp for 'ptr'