
bpf: added the account of BPF running time

Message ID 20220902101217.1419-1-cuiyunhui@bytedance.com (mailing list archive)
State New, archived
Series: bpf: added the account of BPF running time

Commit Message

yunhui cui Sept. 2, 2022, 10:12 a.m. UTC
The following high CPU usage problem was caused by BPF; we found it with the
perf tool:
  Children   Self    Shared Object   Symbol
    45.52%  45.51%   [kernel]        [k] native_queued_spin_lock_slowpath
    45.35%   0.10%   [kernel]        [k] _raw_spin_lock_irqsave
    45.29%   0.01%   [kernel]        [k] bpf_map_update_elem
    45.29%   0.02%   [kernel]        [k] htab_map_update_elem

The above problem was caught while a bpf_prog was executing, but we cannot
recover the load that previously executed bpf_progs placed on the system, and
we cannot continuously monitor the CPU usage of BPF.

Currently, only the run time of a single bpf_prog is accounted, via the
/proc/$pid/fdinfo/$fd file. That is not enough to account the CPU usage of
all bpf_progs, because we cannot know which processes hold them, and the
owning process may already have exited.

With the increasing use of BPF, the set of running bpf_progs may consume a
significant share of CPU time. So add a field to /proc/stat to observe the
CPU usage of BPF from a global perspective.

Signed-off-by: Yunhui Cui <cuiyunhui@bytedance.com>
---
 Documentation/filesystems/proc.rst |  1 +
 fs/proc/stat.c                     | 25 +++++++++++++++++++++++--
 include/linux/filter.h             | 17 +++++++++++++++--
 kernel/bpf/core.c                  |  1 +
 4 files changed, 40 insertions(+), 4 deletions(-)
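
As a rough illustration of how the proposed field could be consumed, here is a
minimal user-space sketch (hypothetical: the trailing "bpf" column only exists
with this patch applied, and its position after guest_nice follows the
documentation change below):

	/* Read the proposed trailing "bpf" field from the aggregate "cpu"
	 * line of /proc/stat; values are in clock ticks (USER_HZ).
	 */
	#include <stdio.h>

	int main(void)
	{
		unsigned long long user, nice, system, idle, iowait, irq, softirq;
		unsigned long long steal, guest, guest_nice, bpf;
		FILE *fp = fopen("/proc/stat", "r");

		if (!fp)
			return 1;

		if (fscanf(fp, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu",
			   &user, &nice, &system, &idle, &iowait, &irq, &softirq,
			   &steal, &guest, &guest_nice, &bpf) == 11)
			printf("bpf: %llu ticks\n", bpf);

		fclose(fp);
		return 0;
	}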

Comments

Daniel Borkmann Sept. 2, 2022, 3:38 p.m. UTC | #1
On 9/2/22 12:12 PM, Yunhui Cui wrote:
[...]
> index a5f21dc3c432..9cb072f9e32b 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -565,6 +565,12 @@ struct sk_filter {
>   	struct bpf_prog	*prog;
>   };
>   
> +struct bpf_account {
> +	u64_stats_t nsecs;
> +	struct u64_stats_sync syncp;
> +};
> +DECLARE_PER_CPU(struct bpf_account, bpftime);
> +
>   DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
>   
>   typedef unsigned int (*bpf_dispatcher_fn)(const void *ctx,
> @@ -577,12 +583,14 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
>   					  bpf_dispatcher_fn dfunc)
>   {
>   	u32 ret;
> +	struct bpf_account *bact;
> +	unsigned long flags;
> +	u64 start = 0;
>   
>   	cant_migrate();
> +	start = sched_clock();
>   	if (static_branch_unlikely(&bpf_stats_enabled_key)) {
>   		struct bpf_prog_stats *stats;
> -		u64 start = sched_clock();
> -		unsigned long flags;
>   
>   		ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
>   		stats = this_cpu_ptr(prog->stats);
> @@ -593,6 +601,11 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
>   	} else {
>   		ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
>   	}
> +	bact = this_cpu_ptr(&bpftime);
> +	flags = u64_stats_update_begin_irqsave(&bact->syncp);
> +	u64_stats_add(&bact->nsecs, sched_clock() - start);
> +	u64_stats_update_end_irqrestore(&bact->syncp, flags);
> +
>   	return ret;

The overhead this adds unconditionally is no-go. Have you tried using/improving:

commit 47c09d6a9f6794caface4ad50930460b82d7c670
Author: Song Liu <songliubraving@fb.com>
Date:   Mon Mar 9 10:32:15 2020 -0700

     bpftool: Introduce "prog profile" command

     With fentry/fexit programs, it is possible to profile BPF program with
     hardware counters. Introduce bpftool "prog profile", which measures key
     metrics of a BPF program.

     bpftool prog profile command creates per-cpu perf events. Then it attaches
     fentry/fexit programs to the target BPF program. The fentry program saves
     perf event value to a map. The fexit program reads the perf event again,
     and calculates the difference, which is the instructions/cycles used by
     the target program.

     Example input and output:

       ./bpftool prog profile id 337 duration 3 cycles instructions llc_misses

             4228 run_cnt
          3403698 cycles                                              (84.08%)
          3525294 instructions   #  1.04 insn per cycle               (84.05%)
               13 llc_misses     #  3.69 LLC misses per million isns  (83.50%)

     This command measures cycles and instructions for BPF program with id
     337 for 3 seconds. The program has triggered 4228 times. The rest of the
     output is similar to perf-stat. [...]

Thanks,
Daniel
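
As a rough sketch of the fentry/fexit approach referenced above (illustrative
only: names are made up, the attach target would be resolved at load time, e.g.
via bpf_program__set_attach_target(), and bpftool's actual profiler reads
per-cpu perf event counters rather than ktime):

	/* Illustrative fentry/fexit pair that accumulates a target BPF
	 * program's wall-clock run time into per-cpu maps.
	 */
	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_tracing.h>

	struct {
		__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
		__uint(max_entries, 1);
		__type(key, __u32);
		__type(value, __u64);
	} start_ns SEC(".maps");

	struct {
		__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
		__uint(max_entries, 1);
		__type(key, __u32);
		__type(value, __u64);
	} total_ns SEC(".maps");

	SEC("fentry/XXX")	/* placeholder; target prog set at load time */
	int BPF_PROG(prog_entry)
	{
		__u32 key = 0;
		__u64 *ts = bpf_map_lookup_elem(&start_ns, &key);

		if (ts)
			*ts = bpf_ktime_get_ns();
		return 0;
	}

	SEC("fexit/XXX")
	int BPF_PROG(prog_exit)
	{
		__u32 key = 0;
		__u64 *ts = bpf_map_lookup_elem(&start_ns, &key);
		__u64 *sum = bpf_map_lookup_elem(&total_ns, &key);

		if (ts && sum && *ts)
			*sum += bpf_ktime_get_ns() - *ts;
		return 0;
	}

	char LICENSE[] SEC("license") = "GPL";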
kernel test robot Sept. 2, 2022, 7:35 p.m. UTC | #2
Hi Yunhui,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on bpf-next/master]
[also build test ERROR on bpf/master linus/master v6.0-rc3 next-20220901]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Yunhui-Cui/bpf-added-the-account-of-BPF-running-time/20220902-181437
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: um-x86_64_defconfig (https://download.01.org/0day-ci/archive/20220903/202209030333.Goj9I0Pe-lkp@intel.com/config)
compiler: gcc-11 (Debian 11.3.0-5) 11.3.0
reproduce (this is a W=1 build):
        # https://github.com/intel-lab-lkp/linux/commit/e31420772f2a63d6bc561211b8ef0331d41b2123
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Yunhui-Cui/bpf-added-the-account-of-BPF-running-time/20220902-181437
        git checkout e31420772f2a63d6bc561211b8ef0331d41b2123
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        make W=1 O=build_dir ARCH=um SUBARCH=x86_64 SHELL=/bin/bash

If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   /usr/bin/ld: warning: arch/x86/um/vdso/vdso.o: missing .note.GNU-stack section implies executable stack
   /usr/bin/ld: NOTE: This behaviour is deprecated and will be removed in a future version of the linker
   /usr/bin/ld: warning: .tmp_vmlinux.kallsyms1 has a LOAD segment with RWX permissions
   /usr/bin/ld: kernel/seccomp.o: in function `seccomp_run_filters':
   seccomp.c:(.text+0xaa7): undefined reference to `bpftime'
   /usr/bin/ld: fs/proc/stat.o: in function `show_stat':
   stat.c:(.text+0x124): undefined reference to `bpftime'
   /usr/bin/ld: stat.c:(.text+0x332): undefined reference to `bpftime'
   /usr/bin/ld: net/core/flow_dissector.o: in function `bpf_flow_dissect':
>> flow_dissector.c:(.text+0x9f5): undefined reference to `bpftime'
   /usr/bin/ld: net/core/dev.o: in function `bpf_prog_run_generic_xdp':
>> dev.c:(.text+0x8c43): undefined reference to `bpftime'
   /usr/bin/ld: net/core/ptp_classifier.o:ptp_classifier.c:(.text+0xf8): more undefined references to `bpftime' follow
   collect2: error: ld returned 1 exit status
yunhui cui Oct. 12, 2022, 10:26 a.m. UTC | #3
Daniel Borkmann <daniel@iogearbox.net> wrote on Fri, Sept. 2, 2022 at 23:38:
>
> On 9/2/22 12:12 PM, Yunhui Cui wrote:
> [...]
> > index a5f21dc3c432..9cb072f9e32b 100644
> > --- a/include/linux/filter.h
> > +++ b/include/linux/filter.h
> > @@ -565,6 +565,12 @@ struct sk_filter {
> >       struct bpf_prog *prog;
> >   };
> >

> >               ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
> >               stats = this_cpu_ptr(prog->stats);
> > @@ -593,6 +601,11 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
> >       } else {
> >               ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
> >       }
> > +     bact = this_cpu_ptr(&bpftime);
> > +     flags = u64_stats_update_begin_irqsave(&bact->syncp);
> > +     u64_stats_add(&bact->nsecs, sched_clock() - start);
> > +     u64_stats_update_end_irqrestore(&bact->syncp, flags);
> > +
> >       return ret;
>
> The overhead this adds unconditionally is no-go. Have you tried using/improving:

Commit 47c09d6a9f6794caface4ad50930460b82d7c670 cannot meet the requirement
of directly seeing the cumulative running time of bpf_progs on each CPU.

Regarding the overhead, how about moving the above code into the
if (static_branch_unlikely(&bpf_stats_enabled_key)) {}
branch, like the prog->stats processing?

Yunhui
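
For reference, a rough sketch of what that suggestion could look like, with
the global accounting folded into the existing stats branch (placement and
reuse of the measured delta are assumptions, not the posted patch):

	static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
						  const void *ctx,
						  bpf_dispatcher_fn dfunc)
	{
		u32 ret;

		cant_migrate();
		if (static_branch_unlikely(&bpf_stats_enabled_key)) {
			struct bpf_prog_stats *stats;
			struct bpf_account *bact;
			unsigned long flags;
			u64 start = sched_clock();
			u64 delta;

			ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
			delta = sched_clock() - start;

			/* existing per-prog stats */
			stats = this_cpu_ptr(prog->stats);
			flags = u64_stats_update_begin_irqsave(&stats->syncp);
			u64_stats_inc(&stats->cnt);
			u64_stats_add(&stats->nsecs, delta);
			u64_stats_update_end_irqrestore(&stats->syncp, flags);

			/* proposed global per-cpu accounting */
			bact = this_cpu_ptr(&bpftime);
			flags = u64_stats_update_begin_irqsave(&bact->syncp);
			u64_stats_add(&bact->nsecs, delta);
			u64_stats_update_end_irqrestore(&bact->syncp, flags);
		} else {
			ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
		}
		return ret;
	}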

Patch

diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index e7aafc82be99..353f41c3e4eb 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -1477,6 +1477,7 @@  second).  The meanings of the columns are as follows, from left to right:
 - steal: involuntary wait
 - guest: running a normal guest
 - guest_nice: running a niced guest
+- bpf: running in bpf_programs
 
 The "intr" line gives counts of interrupts  serviced since boot time, for each
 of the  possible system interrupts.   The first  column  is the  total of  all
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 4fb8729a68d4..ff8ef959fb4f 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -14,6 +14,8 @@ 
 #include <linux/irqnr.h>
 #include <linux/sched/cputime.h>
 #include <linux/tick.h>
+#include <linux/filter.h>
+#include <linux/u64_stats_sync.h>
 
 #ifndef arch_irq_stat_cpu
 #define arch_irq_stat_cpu(cpu) 0
@@ -22,6 +24,20 @@ 
 #define arch_irq_stat() 0
 #endif
 
+DECLARE_PER_CPU(struct bpf_account, bpftime);
+
+static void get_bpf_time(u64 *ns, int cpu)
+{
+	unsigned int start = 0;
+	const struct bpf_account *bact;
+
+	bact = per_cpu_ptr(&bpftime, cpu);
+	do {
+		start = u64_stats_fetch_begin_irq(&bact->syncp);
+		*ns = u64_stats_read(&bact->nsecs);
+	} while (u64_stats_fetch_retry_irq(&bact->syncp, start));
+}
+
 #ifdef arch_idle_time
 
 u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
@@ -112,11 +128,12 @@  static int show_stat(struct seq_file *p, void *v)
 	u64 guest, guest_nice;
 	u64 sum = 0;
 	u64 sum_softirq = 0;
+	u64 bpf_sum, bpf;
 	unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
 	struct timespec64 boottime;
 
 	user = nice = system = idle = iowait =
-		irq = softirq = steal = 0;
+		irq = softirq = steal = bpf = bpf_sum = 0;
 	guest = guest_nice = 0;
 	getboottime64(&boottime);
 	/* shift boot timestamp according to the timens offset */
@@ -127,6 +144,7 @@  static int show_stat(struct seq_file *p, void *v)
 		u64 *cpustat = kcpustat.cpustat;
 
 		kcpustat_cpu_fetch(&kcpustat, i);
+		get_bpf_time(&bpf, i);
 
 		user		+= cpustat[CPUTIME_USER];
 		nice		+= cpustat[CPUTIME_NICE];
@@ -138,6 +156,7 @@  static int show_stat(struct seq_file *p, void *v)
 		steal		+= cpustat[CPUTIME_STEAL];
 		guest		+= cpustat[CPUTIME_GUEST];
 		guest_nice	+= cpustat[CPUTIME_GUEST_NICE];
+		bpf_sum		+= bpf;
 		sum		+= kstat_cpu_irqs_sum(i);
 		sum		+= arch_irq_stat_cpu(i);
 
@@ -160,6 +179,7 @@  static int show_stat(struct seq_file *p, void *v)
 	seq_put_decimal_ull(p, " ", nsec_to_clock_t(steal));
 	seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest));
 	seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest_nice));
+	seq_put_decimal_ull(p, " ", nsec_to_clock_t(bpf_sum));
 	seq_putc(p, '\n');
 
 	for_each_online_cpu(i) {
@@ -167,7 +187,7 @@  static int show_stat(struct seq_file *p, void *v)
 		u64 *cpustat = kcpustat.cpustat;
 
 		kcpustat_cpu_fetch(&kcpustat, i);
-
+		get_bpf_time(&bpf, i);
 		/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
 		user		= cpustat[CPUTIME_USER];
 		nice		= cpustat[CPUTIME_NICE];
@@ -190,6 +210,7 @@  static int show_stat(struct seq_file *p, void *v)
 		seq_put_decimal_ull(p, " ", nsec_to_clock_t(steal));
 		seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest));
 		seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest_nice));
+		seq_put_decimal_ull(p, " ", nsec_to_clock_t(bpf));
 		seq_putc(p, '\n');
 	}
 	seq_put_decimal_ull(p, "intr ", (unsigned long long)sum);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index a5f21dc3c432..9cb072f9e32b 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -565,6 +565,12 @@  struct sk_filter {
 	struct bpf_prog	*prog;
 };
 
+struct bpf_account {
+	u64_stats_t nsecs;
+	struct u64_stats_sync syncp;
+};
+DECLARE_PER_CPU(struct bpf_account, bpftime);
+
 DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
 
 typedef unsigned int (*bpf_dispatcher_fn)(const void *ctx,
@@ -577,12 +583,14 @@  static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
 					  bpf_dispatcher_fn dfunc)
 {
 	u32 ret;
+	struct bpf_account *bact;
+	unsigned long flags;
+	u64 start = 0;
 
 	cant_migrate();
+	start = sched_clock();
 	if (static_branch_unlikely(&bpf_stats_enabled_key)) {
 		struct bpf_prog_stats *stats;
-		u64 start = sched_clock();
-		unsigned long flags;
 
 		ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
 		stats = this_cpu_ptr(prog->stats);
@@ -593,6 +601,11 @@  static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
 	} else {
 		ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
 	}
+	bact = this_cpu_ptr(&bpftime);
+	flags = u64_stats_update_begin_irqsave(&bact->syncp);
+	u64_stats_add(&bact->nsecs, sched_clock() - start);
+	u64_stats_update_end_irqrestore(&bact->syncp, flags);
+
 	return ret;
 }
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index c1e10d088dbb..445ac1c6c01a 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -618,6 +618,7 @@  static const struct latch_tree_ops bpf_tree_ops = {
 	.comp	= bpf_tree_comp,
 };
 
+DEFINE_PER_CPU(struct bpf_account, bpftime);
 static DEFINE_SPINLOCK(bpf_lock);
 static LIST_HEAD(bpf_kallsyms);
 static struct latch_tree_root bpf_tree __cacheline_aligned;