
[v3,bpf-next,09/15] bpf: Batch call_rcu callbacks instead of SLAB_TYPESAFE_BY_RCU.

Message ID 20220819214232.18784-10-alexei.starovoitov@gmail.com (mailing list archive)
State New
Series bpf: BPF specific memory allocator.

Commit Message

Alexei Starovoitov Aug. 19, 2022, 9:42 p.m. UTC
From: Alexei Starovoitov <ast@kernel.org>

SLAB_TYPESAFE_BY_RCU makes kmem_caches non-mergeable and slows down
kmem_cache_destroy. All bpf_mem_cache are safe to share across different maps
and programs. Convert SLAB_TYPESAFE_BY_RCU to batched call_rcu. This change
solves the memory consumption issue, avoids kmem_cache_destroy latency and
keeps bpf hash map performance the same.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/memalloc.c | 64 +++++++++++++++++++++++++++++++++++++++++--
 kernel/bpf/syscall.c  |  5 +++-
 2 files changed, 65 insertions(+), 4 deletions(-)
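
For background on the trade-off in the commit message: SLAB_TYPESAFE_BY_RCU keeps an
object's memory type-stable across a grace period but allows the object to be freed and
immediately recycled within the same cache, so RCU readers must revalidate whatever they
looked up. A minimal sketch of that reader-side pattern, with made-up struct and helper
names (not from this series):

/* Hypothetical reader-side pattern that SLAB_TYPESAFE_BY_RCU requires:
 * the object may be kfree'd and reallocated from the same cache at any
 * time, so after the RCU-protected lookup both liveness and the key
 * must be rechecked.  elem_lookup()/elem_put() are made-up helpers.
 */
struct elem {
	u32 key;
	refcount_t ref;
};

static struct elem *elem_get(u32 key)
{
	struct elem *e;

	rcu_read_lock();
	e = elem_lookup(key);			/* hypothetical hash lookup */
	if (e && !refcount_inc_not_zero(&e->ref))
		e = NULL;			/* object is being freed */
	if (e && READ_ONCE(e->key) != key) {
		elem_put(e);			/* recycled for another key */
		e = NULL;
	}
	rcu_read_unlock();
	return e;
}

The batched call_rcu approach in this patch instead keeps freed objects on the
free_by_rcu/waiting_for_gp lists until __free_rcu() runs after a grace period, so a
concurrent RCU reader can never observe a recycled object, at the cost of holding the
memory for one grace period.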

Comments

Kumar Kartikeya Dwivedi Aug. 24, 2022, 7:58 p.m. UTC | #1
On Fri, 19 Aug 2022 at 23:43, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> From: Alexei Starovoitov <ast@kernel.org>
>
> SLAB_TYPESAFE_BY_RCU makes kmem_caches non-mergeable and slows down
> kmem_cache_destroy. All bpf_mem_cache are safe to share across different maps
> and programs. Convert SLAB_TYPESAFE_BY_RCU to batched call_rcu. This change
> solves the memory consumption issue, avoids kmem_cache_destroy latency and
> keeps bpf hash map performance the same.
>
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>

Makes sense. There was call_rcu_lazy work from Joel (CCed) on doing
this batching using a timer + max batch count instead; I wonder if
that fits our use case and could be useful in the future when it is
merged?

https://lore.kernel.org/rcu/20220713213237.1596225-2-joel@joelfernandes.org

wdyt?

> ---
>  kernel/bpf/memalloc.c | 64 +++++++++++++++++++++++++++++++++++++++++--
>  kernel/bpf/syscall.c  |  5 +++-
>  2 files changed, 65 insertions(+), 4 deletions(-)
>
> diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
> index 22b729914afe..d765a5cb24b4 100644
> --- a/kernel/bpf/memalloc.c
> +++ b/kernel/bpf/memalloc.c
> @@ -100,6 +100,11 @@ struct bpf_mem_cache {
>         /* count of objects in free_llist */
>         int free_cnt;
>         int low_watermark, high_watermark, batch;
> +
> +       struct rcu_head rcu;
> +       struct llist_head free_by_rcu;
> +       struct llist_head waiting_for_gp;
> +       atomic_t call_rcu_in_progress;
>  };
>
>  struct bpf_mem_caches {
> @@ -188,6 +193,45 @@ static void free_one(struct bpf_mem_cache *c, void *obj)
>                 kfree(obj);
>  }
>
> +static void __free_rcu(struct rcu_head *head)
> +{
> +       struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
> +       struct llist_node *llnode = llist_del_all(&c->waiting_for_gp);
> +       struct llist_node *pos, *t;
> +
> +       llist_for_each_safe(pos, t, llnode)
> +               free_one(c, pos);
> +       atomic_set(&c->call_rcu_in_progress, 0);
> +}
> +
> +static void enque_to_free(struct bpf_mem_cache *c, void *obj)
> +{
> +       struct llist_node *llnode = obj;
> +
> +       /* bpf_mem_cache is a per-cpu object. Freeing happens in irq_work.
> +        * Nothing races to add to free_by_rcu list.
> +        */
> +       __llist_add(llnode, &c->free_by_rcu);
> +}
> +
> +static void do_call_rcu(struct bpf_mem_cache *c)
> +{
> +       struct llist_node *llnode, *t;
> +
> +       if (atomic_xchg(&c->call_rcu_in_progress, 1))
> +               return;
> +
> +       WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));
> +       llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu))
> +               /* There is no concurrent __llist_add(waiting_for_gp) access.
> +                * It doesn't race with llist_del_all either.
> +                * But there could be two concurrent llist_del_all(waiting_for_gp):
> +                * from __free_rcu() and from drain_mem_cache().
> +                */
> +               __llist_add(llnode, &c->waiting_for_gp);
> +       call_rcu(&c->rcu, __free_rcu);
> +}
> +
>  static void free_bulk(struct bpf_mem_cache *c)
>  {
>         struct llist_node *llnode, *t;
> @@ -207,12 +251,13 @@ static void free_bulk(struct bpf_mem_cache *c)
>                 local_dec(&c->active);
>                 if (IS_ENABLED(CONFIG_PREEMPT_RT))
>                         local_irq_restore(flags);
> -               free_one(c, llnode);
> +               enque_to_free(c, llnode);
>         } while (cnt > (c->high_watermark + c->low_watermark) / 2);
>
>         /* and drain free_llist_extra */
>         llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra))
> -               free_one(c, llnode);
> +               enque_to_free(c, llnode);
> +       do_call_rcu(c);
>  }
>
>  static void bpf_mem_refill(struct irq_work *work)
> @@ -298,7 +343,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size)
>                         return -ENOMEM;
>                 size += LLIST_NODE_SZ; /* room for llist_node */
>                 snprintf(buf, sizeof(buf), "bpf-%u", size);
> -               kmem_cache = kmem_cache_create(buf, size, 8, SLAB_TYPESAFE_BY_RCU, NULL);
> +               kmem_cache = kmem_cache_create(buf, size, 8, 0, NULL);
>                 if (!kmem_cache) {
>                         free_percpu(pc);
>                         return -ENOMEM;
> @@ -340,6 +385,15 @@ static void drain_mem_cache(struct bpf_mem_cache *c)
>  {
>         struct llist_node *llnode, *t;
>
> +       /* The caller has done rcu_barrier() and no progs are using this
> +        * bpf_mem_cache, but htab_map_free() called bpf_mem_cache_free() for
> +        * all remaining elements and they can be in free_by_rcu or in
> +        * waiting_for_gp lists, so drain those lists now.
> +        */
> +       llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu))
> +               free_one(c, llnode);
> +       llist_for_each_safe(llnode, t, llist_del_all(&c->waiting_for_gp))
> +               free_one(c, llnode);
>         llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist))
>                 free_one(c, llnode);
>         llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra))
> @@ -361,6 +415,10 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
>                 kmem_cache_destroy(c->kmem_cache);
>                 if (c->objcg)
>                         obj_cgroup_put(c->objcg);
> +               /* c->waiting_for_gp list was drained, but __free_rcu might
> +                * still execute. Wait for it now before we free 'c'.
> +                */
> +               rcu_barrier();
>                 free_percpu(ma->cache);
>                 ma->cache = NULL;
>         }
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index a4d40d98428a..850270a72350 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -638,7 +638,10 @@ static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock)
>                 bpf_map_free_id(map, do_idr_lock);
>                 btf_put(map->btf);
>                 INIT_WORK(&map->work, bpf_map_free_deferred);
> -               schedule_work(&map->work);
> +               /* Avoid spawning kworkers, since they all might contend
> +                * for the same mutex like slab_mutex.
> +                */
> +               queue_work(system_unbound_wq, &map->work);
>         }
>  }
>
> --
> 2.30.2
>
Alexei Starovoitov Aug. 25, 2022, 12:13 a.m. UTC | #2
On Wed, Aug 24, 2022 at 12:59 PM Kumar Kartikeya Dwivedi
<memxor@gmail.com> wrote:
>
> On Fri, 19 Aug 2022 at 23:43, Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > From: Alexei Starovoitov <ast@kernel.org>
> >
> > SLAB_TYPESAFE_BY_RCU makes kmem_caches non-mergeable and slows down
> > kmem_cache_destroy. All bpf_mem_cache are safe to share across different maps
> > and programs. Convert SLAB_TYPESAFE_BY_RCU to batched call_rcu. This change
> > solves the memory consumption issue, avoids kmem_cache_destroy latency and
> > keeps bpf hash map performance the same.
> >
> > Signed-off-by: Alexei Starovoitov <ast@kernel.org>
>
> Makes sense. There was call_rcu_lazy work from Joel (CCed) on doing
> this batching using a timer + max batch count instead; I wonder if
> that fits our use case and could be useful in the future when it is
> merged?
>
> https://lore.kernel.org/rcu/20220713213237.1596225-2-joel@joelfernandes.org

Thanks for the pointer. It looks orthogonal.
Timer-based call_rcu is for power savings.
I'm not sure how it would help here. Probably wouldn't hurt.
But an explicit waiting_for_gp list is necessary here,
because two later patches (sleepable support and per-cpu rcu-safe
freeing) rely on this patch.
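
The two follow-up patches Alexei refers to are not included on this page; a hypothetical
sketch (names illustrative, not taken from those patches) of the kind of chaining an
explicit waiting_for_gp list enables for objects that sleepable programs may still
reference under RCU tasks trace:

/* Hypothetical: when objects may still be referenced by sleepable bpf
 * programs (protected by rcu_read_lock_trace()), wait for an RCU tasks
 * trace grace period first, then reuse the same rcu_head to wait for a
 * regular RCU grace period before waiting_for_gp is drained.
 */
static void __free_rcu_tasks_trace(struct rcu_head *head)
{
	struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);

	/* both grace periods must elapse before __free_rcu() frees objects */
	call_rcu(&c->rcu, __free_rcu);
}

/* do_call_rcu() would then queue this callback instead of __free_rcu():
 *	call_rcu_tasks_trace(&c->rcu, __free_rcu_tasks_trace);
 */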
Joel Fernandes Aug. 25, 2022, 12:35 a.m. UTC | #3
On Wed, Aug 24, 2022 at 8:14 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Wed, Aug 24, 2022 at 12:59 PM Kumar Kartikeya Dwivedi
> <memxor@gmail.com> wrote:
> >
> > On Fri, 19 Aug 2022 at 23:43, Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > From: Alexei Starovoitov <ast@kernel.org>
> > >
> > > SLAB_TYPESAFE_BY_RCU makes kmem_caches non-mergeable and slows down
> > > kmem_cache_destroy. All bpf_mem_cache are safe to share across different maps
> > > and programs. Convert SLAB_TYPESAFE_BY_RCU to batched call_rcu. This change
> > > solves the memory consumption issue, avoids kmem_cache_destroy latency and
> > > keeps bpf hash map performance the same.
> > >
> > > Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> >
> > Makes sense. There was call_rcu_lazy work from Joel (CCed) on doing
> > this batching using a timer + max batch count instead; I wonder if
> > that fits our use case and could be useful in the future when it is
> > merged?
> >
> > https://lore.kernel.org/rcu/20220713213237.1596225-2-joel@joelfernandes.org
>
> Thanks for the pointer. It looks orthogonal.
> Timer-based call_rcu is for power savings.
> I'm not sure how it would help here. Probably wouldn't hurt.
> But an explicit waiting_for_gp list is necessary here,
> because two later patches (sleepable support and per-cpu rcu-safe
> freeing) rely on this patch.

Hello Kumar and Alexei,

Kumar, thanks for the CC. I am seeing this BPF work for the first time,
so I have not gone over it too much - but in case the waiting is
synchronous by any chance, call_rcu_lazy() could hurt. The idea is to
only queue callbacks that are not all that important to the system
while keeping it quiet (power being the primary reason, but Daniel
Bristot would concur that it brings down OS noise and helps RT as well).

Have a good evening, Thank you,

 - Joel
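
For concreteness, a hypothetical variant of do_call_rcu() opting into the proposed API,
assuming call_rcu_lazy() keeps call_rcu()'s signature as in the linked posting; the API
is not merged and this variant is not part of the series:

/* Hypothetical variant of do_call_rcu() using the proposed
 * call_rcu_lazy().  Lazy callbacks are batched by RCU itself
 * (timer + max batch count), so freeing of the waiting_for_gp objects
 * may be deferred noticeably longer than with plain call_rcu().
 */
static void do_call_rcu_lazy(struct bpf_mem_cache *c)
{
	struct llist_node *llnode, *t;

	if (atomic_xchg(&c->call_rcu_in_progress, 1))
		return;

	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));
	llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu))
		__llist_add(llnode, &c->waiting_for_gp);
	call_rcu_lazy(&c->rcu, __free_rcu);	/* proposed API, not in mainline */
}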
Joel Fernandes Aug. 25, 2022, 12:49 a.m. UTC | #4
On Wed, Aug 24, 2022 at 8:35 PM Joel Fernandes <joel@joelfernandes.org> wrote:
>
> On Wed, Aug 24, 2022 at 8:14 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Wed, Aug 24, 2022 at 12:59 PM Kumar Kartikeya Dwivedi
> > <memxor@gmail.com> wrote:
> > >
> > > On Fri, 19 Aug 2022 at 23:43, Alexei Starovoitov
> > > <alexei.starovoitov@gmail.com> wrote:
> > > >
> > > > From: Alexei Starovoitov <ast@kernel.org>
> > > >
> > > > SLAB_TYPESAFE_BY_RCU makes kmem_caches non-mergeable and slows down
> > > > kmem_cache_destroy. All bpf_mem_cache are safe to share across different maps
> > > > and programs. Convert SLAB_TYPESAFE_BY_RCU to batched call_rcu. This change
> > > > solves the memory consumption issue, avoids kmem_cache_destroy latency and
> > > > keeps bpf hash map performance the same.
> > > >
> > > > Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> > >
> > > Makes sense. There was call_rcu_lazy work from Joel (CCed) on doing
> > > this batching using a timer + max batch count instead; I wonder if
> > > that fits our use case and could be useful in the future when it is
> > > merged?
> > >
> > > https://lore.kernel.org/rcu/20220713213237.1596225-2-joel@joelfernandes.org
> >
> > Thanks for the pointer. It looks orthogonal.
> > Timer-based call_rcu is for power savings.
> > I'm not sure how it would help here. Probably wouldn't hurt.
> > But an explicit waiting_for_gp list is necessary here,
> > because two later patches (sleepable support and per-cpu rcu-safe
> > freeing) rely on this patch.
>
> Hello Kumar and Alexei,
>
> Kumar thanks for the CC. I am seeing this BPF work for the first time
> so have not gone over it too much - but in case the waiting is
> synchronous by any chance, call_rcu_lazy() could hurt. The idea is to
> only queue callbacks that are not all that important to the system
> while keeping it quiet (power being the primary reason but Daniel
> Bristot would concur it brings down OS noise and helps RT as well).

Just as an FYI, I see rcu_barrier() used in Alexei's patch - that will
flush the lazy CBs to keep rcu_barrier() both correct and performant.
At that point call_rcu_lazy() is equivalent to call_rcu(), as we no
longer keep the callbacks a secret from the rest of the system.

Thanks,

 -  Joel
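
The guarantee Joel is pointing at, in a minimal sketch unrelated to this series:
rcu_barrier() waits until every callback queued so far has executed, which is what makes
it safe afterwards to free the memory that holds the rcu_head itself; synchronize_rcu()
only waits for a grace period, and pending callbacks could still run later. Names below
are made up:

/* Illustration only: the rcu_head is embedded in 'b', so 'b' must not
 * be freed until the callback queued on it has finished running.
 * rcu_barrier() provides exactly that guarantee.
 */
struct box {
	struct rcu_head rcu;
	void *payload;
};

static void box_free_cb(struct rcu_head *head)
{
	struct box *b = container_of(head, struct box, rcu);

	kfree(b->payload);
}

static void box_destroy(struct box *b)
{
	call_rcu(&b->rcu, box_free_cb);		/* or a lazy variant, once flushed */
	rcu_barrier();				/* box_free_cb has completed here */
	kfree(b);				/* safe: rcu_head no longer queued */
}

This mirrors the rcu_barrier() before free_percpu() in bpf_mem_alloc_destroy(), where the
rcu_head lives inside the per-cpu bpf_mem_cache.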

Patch

diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 22b729914afe..d765a5cb24b4 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -100,6 +100,11 @@  struct bpf_mem_cache {
 	/* count of objects in free_llist */
 	int free_cnt;
 	int low_watermark, high_watermark, batch;
+
+	struct rcu_head rcu;
+	struct llist_head free_by_rcu;
+	struct llist_head waiting_for_gp;
+	atomic_t call_rcu_in_progress;
 };
 
 struct bpf_mem_caches {
@@ -188,6 +193,45 @@  static void free_one(struct bpf_mem_cache *c, void *obj)
 		kfree(obj);
 }
 
+static void __free_rcu(struct rcu_head *head)
+{
+	struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
+	struct llist_node *llnode = llist_del_all(&c->waiting_for_gp);
+	struct llist_node *pos, *t;
+
+	llist_for_each_safe(pos, t, llnode)
+		free_one(c, pos);
+	atomic_set(&c->call_rcu_in_progress, 0);
+}
+
+static void enque_to_free(struct bpf_mem_cache *c, void *obj)
+{
+	struct llist_node *llnode = obj;
+
+	/* bpf_mem_cache is a per-cpu object. Freeing happens in irq_work.
+	 * Nothing races to add to free_by_rcu list.
+	 */
+	__llist_add(llnode, &c->free_by_rcu);
+}
+
+static void do_call_rcu(struct bpf_mem_cache *c)
+{
+	struct llist_node *llnode, *t;
+
+	if (atomic_xchg(&c->call_rcu_in_progress, 1))
+		return;
+
+	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));
+	llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu))
+		/* There is no concurrent __llist_add(waiting_for_gp) access.
+		 * It doesn't race with llist_del_all either.
+		 * But there could be two concurrent llist_del_all(waiting_for_gp):
+		 * from __free_rcu() and from drain_mem_cache().
+		 */
+		__llist_add(llnode, &c->waiting_for_gp);
+	call_rcu(&c->rcu, __free_rcu);
+}
+
 static void free_bulk(struct bpf_mem_cache *c)
 {
 	struct llist_node *llnode, *t;
@@ -207,12 +251,13 @@  static void free_bulk(struct bpf_mem_cache *c)
 		local_dec(&c->active);
 		if (IS_ENABLED(CONFIG_PREEMPT_RT))
 			local_irq_restore(flags);
-		free_one(c, llnode);
+		enque_to_free(c, llnode);
 	} while (cnt > (c->high_watermark + c->low_watermark) / 2);
 
 	/* and drain free_llist_extra */
 	llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra))
-		free_one(c, llnode);
+		enque_to_free(c, llnode);
+	do_call_rcu(c);
 }
 
 static void bpf_mem_refill(struct irq_work *work)
@@ -298,7 +343,7 @@  int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size)
 			return -ENOMEM;
 		size += LLIST_NODE_SZ; /* room for llist_node */
 		snprintf(buf, sizeof(buf), "bpf-%u", size);
-		kmem_cache = kmem_cache_create(buf, size, 8, SLAB_TYPESAFE_BY_RCU, NULL);
+		kmem_cache = kmem_cache_create(buf, size, 8, 0, NULL);
 		if (!kmem_cache) {
 			free_percpu(pc);
 			return -ENOMEM;
@@ -340,6 +385,15 @@  static void drain_mem_cache(struct bpf_mem_cache *c)
 {
 	struct llist_node *llnode, *t;
 
+	/* The caller has done rcu_barrier() and no progs are using this
+	 * bpf_mem_cache, but htab_map_free() called bpf_mem_cache_free() for
+	 * all remaining elements and they can be in free_by_rcu or in
+	 * waiting_for_gp lists, so drain those lists now.
+	 */
+	llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu))
+		free_one(c, llnode);
+	llist_for_each_safe(llnode, t, llist_del_all(&c->waiting_for_gp))
+		free_one(c, llnode);
 	llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist))
 		free_one(c, llnode);
 	llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra))
@@ -361,6 +415,10 @@  void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
 		kmem_cache_destroy(c->kmem_cache);
 		if (c->objcg)
 			obj_cgroup_put(c->objcg);
+		/* c->waiting_for_gp list was drained, but __free_rcu might
+		 * still execute. Wait for it now before we free 'c'.
+		 */
+		rcu_barrier();
 		free_percpu(ma->cache);
 		ma->cache = NULL;
 	}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a4d40d98428a..850270a72350 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -638,7 +638,10 @@  static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock)
 		bpf_map_free_id(map, do_idr_lock);
 		btf_put(map->btf);
 		INIT_WORK(&map->work, bpf_map_free_deferred);
-		schedule_work(&map->work);
+		/* Avoid spawning kworkers, since they all might contend
+		 * for the same mutex like slab_mutex.
+		 */
+		queue_work(system_unbound_wq, &map->work);
 	}
 }