diff mbox series

[RFC,bpf-next,v5,2/2] bpf: Call rcu_momentary_dyntick_idle() in task work periodically

Message ID 20230619143231.222536-3-houtao@huaweicloud.com (mailing list archive)
State RFC
Delegated to: BPF
Headers show
Series Handle immediate reuse in bpf memory allocator | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-4 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-5 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-6 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-11 success Logs for test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-15 success Logs for test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-18 success Logs for test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-19 success Logs for test_progs_no_alu32_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for test_progs_no_alu32_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-22 success Logs for test_progs_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for test_progs_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-25 success Logs for test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-29 success Logs for veristat
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 13 this patch: 13
netdev/cc_maintainers success CCed 12 of 12 maintainers
netdev/build_clang success Errors and warnings before: 8 this patch: 8
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 13 this patch: 13
netdev/checkpatch warning WARNING: line length of 93 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-12 success Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for test_maps on s390x with gcc

Commit Message

Hou Tao June 19, 2023, 2:32 p.m. UTC
From: Hou Tao <houtao1@huawei.com>

After doing reuse-after-RCU-GP in bpf memory allocator, if there are
intensive memory allocation and free in bpf memory allocator and RCU GP
is slow, the peak memory usage for bpf memory allocator will be high.

To reduce memory usage for bpf memory allocator, call
rcu_momentary_dyntick_idle() in task work periodically to accelerate the
expiration of RCU grace period.

The following benchmark results the memory usage reduce a lot after
applying the patch:

Before:
overwrite           per-prod-op 49.11 ± 1.30k/s, avg mem 313.09 ± 80.36MiB, peak mem 509.09MiB
batch_add_batch_del per-prod-op 76.06 ± 2.38k/s, avg mem 287.97 ± 63.59MiB, peak mem 496.81MiB
add_del_on_diff_cpu per-prod-op 18.75 ± 0.09k/s, avg mem  27.71 ±  4.92MiB, peak mem  44.54MiB

After:
overwrite           per-prod-op 51.17 ± 0.30k/s, avg mem 105.09 ±  7.74MiB, peak mem 143.60MiB
batch_add_batch_del per-prod-op 86.43 ± 0.90k/s, avg mem  85.82 ± 11.81MiB, peak mem 118.93MiB
add_del_on_diff_cpu per-prod-op 18.71 ± 0.08k/s, avg mem  26.92 ±  5.50MiB, peak mem  43.18MiB

Signed-off-by: Hou Tao <houtao1@huawei.com>
---
 kernel/bpf/memalloc.c | 46 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

Comments

Alexei Starovoitov June 20, 2023, 4:15 p.m. UTC | #1
On Mon, Jun 19, 2023 at 7:00 AM Hou Tao <houtao@huaweicloud.com> wrote:
>
> +static void bpf_rcu_gp_acc_work(struct callback_head *head)
> +{
> +       struct bpf_rcu_gp_acc_ctx *ctx = container_of(head, struct bpf_rcu_gp_acc_ctx, work);
> +
> +       local_irq_disable();
> +       rcu_momentary_dyntick_idle();
> +       local_irq_enable();

We discussed this with Paul off-line and decided to go a different route.
Paul prepared a patch for us to expose rcu_request_urgent_qs_task().
I'll be sending the series later this week.
Paul E. McKenney June 20, 2023, 5:21 p.m. UTC | #2
On Tue, Jun 20, 2023 at 09:15:03AM -0700, Alexei Starovoitov wrote:
> On Mon, Jun 19, 2023 at 7:00 AM Hou Tao <houtao@huaweicloud.com> wrote:
> >
> > +static void bpf_rcu_gp_acc_work(struct callback_head *head)
> > +{
> > +       struct bpf_rcu_gp_acc_ctx *ctx = container_of(head, struct bpf_rcu_gp_acc_ctx, work);
> > +
> > +       local_irq_disable();
> > +       rcu_momentary_dyntick_idle();
> > +       local_irq_enable();
> 
> We discussed this with Paul off-line and decided to go a different route.
> Paul prepared a patch for us to expose rcu_request_urgent_qs_task().
> I'll be sending the series later this week.

Just for completeness, this patch is in -rcu here and as shown below:

b1edaa1e6364 ("rcu: Export rcu_request_urgent_qs_task()")

							Thanx, Paul

------------------------------------------------------------------------

commit b1edaa1e6364caf9833a30b5dd7599080b6806b8
Author: Paul E. McKenney <paulmck@kernel.org>
Date:   Thu Jun 8 16:31:49 2023 -0700

    rcu: Export rcu_request_urgent_qs_task()
    
    If a CPU is executing a long series of non-sleeping system calls,
    RCU grace periods can be delayed for on the order of a couple hundred
    milliseconds.  This is normally not a problem, but if each system call
    does a call_rcu(), those callbacks can stack up.  RCU will eventually
    notice this callback storm, but use of rcu_request_urgent_qs_task()
    allows the code invoking call_rcu() to give RCU a heads up.
    
    This function is not for general use, not yet, anyway.
    
    Reported-by: Alexei Starovoitov <ast@kernel.org>
    Signed-off-by: Paul E. McKenney <paulmck@kernel.org>

diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 7f17acf29dda..7b949292908a 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -138,6 +138,8 @@ static inline int rcu_needs_cpu(void)
 	return 0;
 }
 
+static inline void rcu_request_urgent_qs_task(struct task_struct *t) { }
+
 /*
  * Take advantage of the fact that there is only one CPU, which
  * allows us to ignore virtualization-based context switches.
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 56bccb5a8fde..126f6b418f6a 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -21,6 +21,7 @@ void rcu_softirq_qs(void);
 void rcu_note_context_switch(bool preempt);
 int rcu_needs_cpu(void);
 void rcu_cpu_stall_reset(void);
+void rcu_request_urgent_qs_task(struct task_struct *t);
 
 /*
  * Note a virtualization-based context switch.  This is simply a
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 9829d8161b21..0c99ec19c8e0 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -493,7 +493,6 @@ static inline void rcu_expedite_gp(void) { }
 static inline void rcu_unexpedite_gp(void) { }
 static inline void rcu_async_hurry(void) { }
 static inline void rcu_async_relax(void) { }
-static inline void rcu_request_urgent_qs_task(struct task_struct *t) { }
 #else /* #ifdef CONFIG_TINY_RCU */
 bool rcu_gp_is_normal(void);     /* Internal RCU use. */
 bool rcu_gp_is_expedited(void);  /* Internal RCU use. */
@@ -514,7 +513,6 @@ struct task_struct *get_rcu_tasks_rude_gp_kthread(void);
 #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
 static inline void show_rcu_tasks_gp_kthreads(void) {}
 #endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */
-void rcu_request_urgent_qs_task(struct task_struct *t);
 #endif /* #else #ifdef CONFIG_TINY_RCU */
 
 #define RCU_SCHEDULER_INACTIVE	0
Hou Tao June 21, 2023, 1:07 a.m. UTC | #3
Hi,

On 6/21/2023 12:15 AM, Alexei Starovoitov wrote:
> On Mon, Jun 19, 2023 at 7:00 AM Hou Tao <houtao@huaweicloud.com> wrote:
>> +static void bpf_rcu_gp_acc_work(struct callback_head *head)
>> +{
>> +       struct bpf_rcu_gp_acc_ctx *ctx = container_of(head, struct bpf_rcu_gp_acc_ctx, work);
>> +
>> +       local_irq_disable();
>> +       rcu_momentary_dyntick_idle();
>> +       local_irq_enable();
> We discussed this with Paul off-line and decided to go a different route.
"A different route" means the method used to reduce the memory footprint
is different or the method to do reuse-after-rcu-gp is different ?
> Paul prepared a patch for us to expose rcu_request_urgent_qs_task().
> I'll be sending the series later this week.
Do you plan to take over the reuse-after-rcu-gp patchset ?

I did a quick test, it showed that the memory footprint is smaller and
the performance is similar when using rcu_request_urgent_qs_task()
instead of rcu_momentary_dyntick_idle().

overwrite:
Summary: per-prod-op   27.25 ±    0.08k/s, memory usage   29.52 ±   
1.60MiB, peak memory usage   37.84MiB

batch_add_batch_del:
Summary: per-prod-op   45.84 ±    0.29k/s, memory usage   24.18 ±   
1.44MiB, peak memory usage   30.38MiB

add_del_on_diff_cpu:
Summary: per-prod-op   13.09 ±    0.33k/s, memory usage   32.48 ±   
3.51MiB, peak memory usage   53.54MiB
Alexei Starovoitov June 21, 2023, 1:20 a.m. UTC | #4
On Tue, Jun 20, 2023 at 6:07 PM Hou Tao <houtao@huaweicloud.com> wrote:
>
> Hi,
>
> On 6/21/2023 12:15 AM, Alexei Starovoitov wrote:
> > On Mon, Jun 19, 2023 at 7:00 AM Hou Tao <houtao@huaweicloud.com> wrote:
> >> +static void bpf_rcu_gp_acc_work(struct callback_head *head)
> >> +{
> >> +       struct bpf_rcu_gp_acc_ctx *ctx = container_of(head, struct bpf_rcu_gp_acc_ctx, work);
> >> +
> >> +       local_irq_disable();
> >> +       rcu_momentary_dyntick_idle();
> >> +       local_irq_enable();
> > We discussed this with Paul off-line and decided to go a different route.
> "A different route" means the method used to reduce the memory footprint
> is different or the method to do reuse-after-rcu-gp is different ?

Pretty much everything is different.

> > Paul prepared a patch for us to expose rcu_request_urgent_qs_task().
> > I'll be sending the series later this week.
> Do you plan to take over the reuse-after-rcu-gp patchset ?

I took a different approach.
It will be easier to discuss when I post patches.
Hopefully later today or tomorrow.

> I did a quick test, it showed that the memory footprint is smaller and
> the performance is similar when using rcu_request_urgent_qs_task()
> instead of rcu_momentary_dyntick_idle().

Right. I saw a similar effect as well.
My understanding is that rcu_momentary_dyntick_idle() is a heavier mechanism
and absolutely should not be used while holding rcu_read_lock().
So calling from irq_work is not ok.
Only kworker, as you did, is ok from safety pov.
Still not recommended in general. Hence rcu_request_urgent_qs_task.
Hou Tao June 21, 2023, 1:38 a.m. UTC | #5
Hi,

On 6/21/2023 9:20 AM, Alexei Starovoitov wrote:
> On Tue, Jun 20, 2023 at 6:07 PM Hou Tao <houtao@huaweicloud.com> wrote:
>> Hi,
>>
>> On 6/21/2023 12:15 AM, Alexei Starovoitov wrote:
>>> On Mon, Jun 19, 2023 at 7:00 AM Hou Tao <houtao@huaweicloud.com> wrote:
>>>> +static void bpf_rcu_gp_acc_work(struct callback_head *head)
>>>> +{
>>>> +       struct bpf_rcu_gp_acc_ctx *ctx = container_of(head, struct bpf_rcu_gp_acc_ctx, work);
>>>> +
>>>> +       local_irq_disable();
>>>> +       rcu_momentary_dyntick_idle();
>>>> +       local_irq_enable();
>>> We discussed this with Paul off-line and decided to go a different route.
>> "A different route" means the method used to reduce the memory footprint
>> is different or the method to do reuse-after-rcu-gp is different ?
> Pretty much everything is different.
I see.
>
>>> Paul prepared a patch for us to expose rcu_request_urgent_qs_task().
>>> I'll be sending the series later this week.
>> Do you plan to take over the reuse-after-rcu-gp patchset ?
> I took a different approach.
> It will be easier to discuss when I post patches.
> Hopefully later today or tomorrow.
Look forward to the new approach.
>
>> I did a quick test, it showed that the memory footprint is smaller and
>> the performance is similar when using rcu_request_urgent_qs_task()
>> instead of rcu_momentary_dyntick_idle().
> Right. I saw a similar effect as well.
> My understanding is that rcu_momentary_dyntick_idle() is a heavier mechanism
> and absolutely should not be used while holding rcu_read_lock().
> So calling from irq_work is not ok.
> Only kworker, as you did, is ok from safety pov.
It is not kworker and it is a task work which runs when the process
returns back to userspace.
> Still not recommended in general. Hence rcu_request_urgent_qs_task.
Thanks for the explanation.
diff mbox series

Patch

diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 9b31c53fd285..c4b4cae04400 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -6,6 +6,7 @@ 
 #include <linux/irq_work.h>
 #include <linux/bpf_mem_alloc.h>
 #include <linux/memcontrol.h>
+#include <linux/task_work.h>
 #include <asm/local.h>
 
 /* Any context (including NMI) BPF specific memory allocator.
@@ -123,6 +124,16 @@  struct bpf_reuse_batch {
 	struct rcu_head rcu;
 };
 
+#define BPF_GP_ACC_RUNNING 0
+
+struct bpf_rcu_gp_acc_ctx {
+	unsigned long flags;
+	unsigned long next_run;
+	struct callback_head work;
+};
+
+static DEFINE_PER_CPU(struct bpf_rcu_gp_acc_ctx, bpf_acc_ctx);
+
 static struct llist_node notrace *__llist_del_first(struct llist_head *head)
 {
 	struct llist_node *entry, *next;
@@ -347,12 +358,47 @@  static void dyn_reuse_rcu(struct rcu_head *rcu)
 	kfree(batch);
 }
 
+static void bpf_rcu_gp_acc_work(struct callback_head *head)
+{
+	struct bpf_rcu_gp_acc_ctx *ctx = container_of(head, struct bpf_rcu_gp_acc_ctx, work);
+
+	local_irq_disable();
+	rcu_momentary_dyntick_idle();
+	local_irq_enable();
+
+	/* The interval between rcu_momentary_dyntick_idle() calls is
+	 * at least 10ms.
+	 */
+	WRITE_ONCE(ctx->next_run, jiffies + msecs_to_jiffies(10));
+	clear_bit(BPF_GP_ACC_RUNNING, &ctx->flags);
+}
+
+static void bpf_mem_rcu_gp_acc(struct bpf_mem_cache *c)
+{
+	struct bpf_rcu_gp_acc_ctx *ctx = this_cpu_ptr(&bpf_acc_ctx);
+
+	if (atomic_read(&c->dyn_reuse_rcu_cnt) < 128 ||
+	    time_before(jiffies, READ_ONCE(ctx->next_run)))
+		return;
+
+	if ((current->flags & PF_KTHREAD) ||
+	    test_and_set_bit(BPF_GP_ACC_RUNNING, &ctx->flags))
+		return;
+
+	init_task_work(&ctx->work, bpf_rcu_gp_acc_work);
+	/* Task is exiting ? */
+	if (task_work_add(current, &ctx->work, TWA_RESUME))
+		clear_bit(BPF_GP_ACC_RUNNING, &ctx->flags);
+}
+
 static void reuse_bulk(struct bpf_mem_cache *c)
 {
 	struct llist_node *head, *tail;
 	struct bpf_reuse_batch *batch;
 	unsigned long flags;
 
+	bpf_mem_rcu_gp_acc(c);
+
 	head = llist_del_all(&c->free_llist_extra);
 	tail = head;
 	while (tail && tail->next)