Message ID | 20211006175106.GA295227@fuller.cnet (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | BPF |
Headers | show |
Series | [bpf-next] bpf: introduce helper bpf_raw_read_cpu_clock | expand |
On Wed, Oct 6, 2021 at 10:52 AM Marcelo Tosatti <mtosatti@redhat.com> wrote: > > > > Add bpf_raw_read_cpu_clock helper, to read architecture specific > CPU clock. In x86's case, this is the TSC. > > This is necessary to synchronize bpf traces from host and guest bpf-programs > (after subtracting guest tsc-offset from guest timestamps). Trying to understand the use case. So in a host-guest scenario, bpf_ktime_get_ns() will return different values in host and guest, but rdtsc() will give the same value. Is this correct? > > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com> > > diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig > index ab83c22d274e..832bb1f65f28 100644 > --- a/arch/x86/Kconfig > +++ b/arch/x86/Kconfig > @@ -95,6 +95,7 @@ config X86 > select ARCH_HAS_UBSAN_SANITIZE_ALL > select ARCH_HAS_DEBUG_WX > select ARCH_HAS_ZONE_DMA_SET if EXPERT > + select ARCH_HAS_BPF_RAW_CPU_CLOCK > select ARCH_HAVE_NMI_SAFE_CMPXCHG > select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI > select ARCH_MIGHT_HAVE_PC_PARPORT > diff --git a/arch/x86/include/asm/bpf_raw_cpu_clock.h b/arch/x86/include/asm/bpf_raw_cpu_clock.h > new file mode 100644 > index 000000000000..6951c399819e > --- /dev/null > +++ b/arch/x86/include/asm/bpf_raw_cpu_clock.h > @@ -0,0 +1,10 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +#ifndef _ASM_X86_BPF_RAW_CPU_CLOCK_H_ > +#define _ASM_X86_BPF_RAW_CPU_CLOCK_H_ > + > +static inline unsigned long long read_raw_cpu_clock(void) > +{ > + return rdtsc_ordered(); > +} > + > +#endif /* _ASM_X86_BPF_RAW_CPU_CLOCK_H_ */ > diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c > index 3eff08d7b8e5..844a44ff508d 100644 > --- a/drivers/media/rc/bpf-lirc.c > +++ b/drivers/media/rc/bpf-lirc.c > @@ -105,6 +105,8 @@ lirc_mode2_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) > return &bpf_ktime_get_ns_proto; > case BPF_FUNC_ktime_get_boot_ns: > return &bpf_ktime_get_boot_ns_proto; > + case BPF_FUNC_read_raw_cpu_clock: > + return 
&bpf_read_raw_cpu_clock_proto; > case BPF_FUNC_tail_call: > return &bpf_tail_call_proto; > case BPF_FUNC_get_prandom_u32: > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > index d604c8251d88..b6cb426085fb 100644 > --- a/include/linux/bpf.h > +++ b/include/linux/bpf.h > @@ -2058,6 +2058,7 @@ extern const struct bpf_func_proto bpf_get_numa_node_id_proto; > extern const struct bpf_func_proto bpf_tail_call_proto; > extern const struct bpf_func_proto bpf_ktime_get_ns_proto; > extern const struct bpf_func_proto bpf_ktime_get_boot_ns_proto; > +extern const struct bpf_func_proto bpf_read_raw_cpu_clock_proto; > extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto; > extern const struct bpf_func_proto bpf_get_current_uid_gid_proto; > extern const struct bpf_func_proto bpf_get_current_comm_proto; > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index 6fc59d61937a..52191791b089 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -4037,6 +4037,13 @@ union bpf_attr { > * Return > * Current *ktime*. > * > + * u64 bpf_read_raw_cpu_clock(void) > + * Description > + * Return the architecture specific CPU clock value. > + * For x86, this is the TSC clock. 
> + * Return > + * *CPU clock value* > + * > * long bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len) > * Description > * **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print > @@ -5089,6 +5096,7 @@ union bpf_attr { > FN(task_pt_regs), \ > FN(get_branch_snapshot), \ > FN(trace_vprintk), \ > + FN(read_raw_cpu_clock), \ > /* */ > > /* integer value in 'imm' field of BPF_CALL instruction selects which helper > diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig > index a82d6de86522..5815db157220 100644 > --- a/kernel/bpf/Kconfig > +++ b/kernel/bpf/Kconfig > @@ -21,6 +21,10 @@ config HAVE_EBPF_JIT > config ARCH_WANT_DEFAULT_BPF_JIT > bool > > +# Used by archs to tell they support reading raw CPU clock > +config ARCH_HAS_BPF_RAW_CPU_CLOCK > + bool > + > menu "BPF subsystem" > > config BPF_SYSCALL > diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c > index b6c72af64d5d..8e2359dfd582 100644 > --- a/kernel/bpf/core.c > +++ b/kernel/bpf/core.c > @@ -2345,6 +2345,8 @@ const struct bpf_func_proto bpf_get_numa_node_id_proto __weak; > const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; > const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak; > const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto __weak; > +const struct bpf_func_proto bpf_read_raw_cpu_clock_proto __weak; > + > > const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; > const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; > diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c > index 1ffd469c217f..90b9e5efaf65 100644 > --- a/kernel/bpf/helpers.c > +++ b/kernel/bpf/helpers.c > @@ -18,6 +18,10 @@ > > #include "../../lib/kstrtox.h" > > +#ifdef CONFIG_ARCH_HAS_BPF_RAW_CPU_CLOCK > +#include <asm/bpf_raw_cpu_clock.h> > +#endif > + > /* If kernel subsystem is allowing eBPF programs to call this function, > * inside its own verifier_ops->get_func_proto() callback it should return > * bpf_map_lookup_elem_proto, 
so that verifier can properly check the arguments > @@ -168,6 +172,21 @@ const struct bpf_func_proto bpf_ktime_get_boot_ns_proto = { > .ret_type = RET_INTEGER, > }; > > +BPF_CALL_0(bpf_read_raw_cpu_clock) > +{ > +#ifdef CONFIG_ARCH_HAS_BPF_RAW_CPU_CLOCK > + return read_raw_cpu_clock(); > +#else > + return sched_clock(); > +#endif > +} > + > +const struct bpf_func_proto bpf_read_raw_cpu_clock_proto = { > + .func = bpf_read_raw_cpu_clock, > + .gpl_only = false, > + .ret_type = RET_INTEGER, > +}; > + > BPF_CALL_0(bpf_ktime_get_coarse_ns) > { > return ktime_get_coarse_ns(); > @@ -1366,6 +1385,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) > return &bpf_ktime_get_boot_ns_proto; > case BPF_FUNC_ktime_get_coarse_ns: > return &bpf_ktime_get_coarse_ns_proto; > + case BPF_FUNC_read_raw_cpu_clock: > + return &bpf_read_raw_cpu_clock_proto; > case BPF_FUNC_ringbuf_output: > return &bpf_ringbuf_output_proto; > case BPF_FUNC_ringbuf_reserve: > diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c > index 6b3153841a33..047ca7c1d57a 100644 > --- a/kernel/trace/bpf_trace.c > +++ b/kernel/trace/bpf_trace.c > @@ -1113,6 +1113,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) > return &bpf_ktime_get_boot_ns_proto; > case BPF_FUNC_ktime_get_coarse_ns: > return &bpf_ktime_get_coarse_ns_proto; > + case BPF_FUNC_read_raw_cpu_clock: > + return &bpf_read_raw_cpu_clock_proto; With the change in bpf_base_func_proto, this part is not needed. > case BPF_FUNC_tail_call: > return &bpf_tail_call_proto; > case BPF_FUNC_get_current_pid_tgid: > diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h > index 6fc59d61937a..52191791b089 100644 > --- a/tools/include/uapi/linux/bpf.h > +++ b/tools/include/uapi/linux/bpf.h > @@ -4037,6 +4037,13 @@ union bpf_attr { > * Return > * Current *ktime*. > * > + * u64 bpf_read_raw_cpu_clock(void) > + * Description > + * Return the architecture specific CPU clock value. 
> + * For x86, this is the TSC clock. > + * Return > + * *CPU clock value* > + * > * long bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len) > * Description > * **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print > @@ -5089,6 +5096,7 @@ union bpf_attr { > FN(task_pt_regs), \ > FN(get_branch_snapshot), \ > FN(trace_vprintk), \ > + FN(read_raw_cpu_clock), \ > /* */ > > /* integer value in 'imm' field of BPF_CALL instruction selects which helper >
On Wed, Oct 06, 2021 at 02:37:09PM -0700, Song Liu wrote: > On Wed, Oct 6, 2021 at 10:52 AM Marcelo Tosatti <mtosatti@redhat.com> wrote: > > > > > > > > Add bpf_raw_read_cpu_clock helper, to read architecture specific > > CPU clock. In x86's case, this is the TSC. > > > > This is necessary to synchronize bpf traces from host and guest bpf-programs > > (after subtracting guest tsc-offset from guest timestamps). > > Trying to understand the use case. So in a host-guest scenario, > bpf_ktime_get_ns() > will return different values in host and guest, but rdtsc() will give > the same value. > Is this correct? No, it will not. Also, please explain if any of this stands a chance of working for anything other than x86. Or even on x86 in the face of guest migration. Also, please explain, again, what's wrong with dumping snapshots of CLOCK_MONOTONIC{,_RAW} from host and guest and correlating time that way? And also explain why BPF needs to do this differently than all the other tracers.
Hi Peter, Song, On Thu, Oct 07, 2021 at 09:18:56AM +0200, Peter Zijlstra wrote: > On Wed, Oct 06, 2021 at 02:37:09PM -0700, Song Liu wrote: > > On Wed, Oct 6, 2021 at 10:52 AM Marcelo Tosatti <mtosatti@redhat.com> wrote: > > > > > > > > > > > > Add bpf_raw_read_cpu_clock helper, to read architecture specific > > > CPU clock. In x86's case, this is the TSC. > > > > > > This is necessary to synchronize bpf traces from host and guest bpf-programs > > > (after subtracting guest tsc-offset from guest timestamps). > > > > Trying to understand the use case. So in a host-guest scenario, > > bpf_ktime_get_ns() > > will return different values in host and guest, but rdtsc() will give > > the same value. > > Is this correct? > > No, it will not. No, but we can find out the delta between host and guest TSCs. On x86, you can read the offset through a debugfs file: debugfs_create_file("tsc-offset", 0444, debugfs_dentry, vcpu, &vcpu_tsc_offset_fops); Other architectures can expose that offset. > Also, please explain if any of this stands a chance of > working for anything other than x86. Yes, the same pattern repeats on ARM: With offset between guest and host: https://developer.arm.com/documentation/ddi0595/2020-12/AArch64-Registers/CNTVCT-EL0--Counter-timer-Virtual-Count-register?lang=en Without offset: commit 051ff581ce70e822729e9474941f3c206cbf7436 PPC: https://yhbt.net/lore/all/5f267a8aec5b8199a580c96ab2b1a3c27de4eb09.camel@gmail.com/T/ (Time Base Register is read through the mftb instruction). > Or even on x86 in the face of > guest migration. It won't, but honestly we don't care about tracing at this level across migration. > Also, please explain, again, what's wrong with dumping snapshots of > CLOCK_MONOTONIC{,_RAW} from host and guest and correlating time that > way? You can't read the guest and the host clock at the same time (there will always be some variable delay between reading the two clocks). 
And that delay is not fixed, but variable (depending on scheduling of the guest vcpus, for example). So you will need an algorithm to estimate their differences, with non-zero error bounds: " Add a driver with gettime method returning hosts realtime clock. This allows Chrony to synchronize host and guest clocks with high precision (see results below). chronyc> sources MS Name/IP address Stratum Poll Reach LastRx Last sample =============================================================================== #* PHC0 0 3 377 6 +4ns[ +4ns] +/- 3ns " Now with the hardware clock (which is usually the base for CLOCK_MONOTONIC_RAW), there are no errors (offset will be 0 ns, rather than 3/4ns). > And also explain why BPF needs to do this differently than all the other > tracers. For x86 we use: echo "x86-tsc" > /sys/kernel/debug/tracing/trace_clock For this purpose, on x86, so it's not like anything different is being done?
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ab83c22d274e..832bb1f65f28 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -95,6 +95,7 @@ config X86 select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_HAS_DEBUG_WX select ARCH_HAS_ZONE_DMA_SET if EXPERT + select ARCH_HAS_BPF_RAW_CPU_CLOCK select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select ARCH_MIGHT_HAVE_PC_PARPORT diff --git a/arch/x86/include/asm/bpf_raw_cpu_clock.h b/arch/x86/include/asm/bpf_raw_cpu_clock.h new file mode 100644 index 000000000000..6951c399819e --- /dev/null +++ b/arch/x86/include/asm/bpf_raw_cpu_clock.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_BPF_RAW_CPU_CLOCK_H_ +#define _ASM_X86_BPF_RAW_CPU_CLOCK_H_ + +static inline unsigned long long read_raw_cpu_clock(void) +{ + return rdtsc_ordered(); +} + +#endif /* _ASM_X86_BPF_RAW_CPU_CLOCK_H_ */ diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c index 3eff08d7b8e5..844a44ff508d 100644 --- a/drivers/media/rc/bpf-lirc.c +++ b/drivers/media/rc/bpf-lirc.c @@ -105,6 +105,8 @@ lirc_mode2_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_ktime_get_ns_proto; case BPF_FUNC_ktime_get_boot_ns: return &bpf_ktime_get_boot_ns_proto; + case BPF_FUNC_read_raw_cpu_clock: + return &bpf_read_raw_cpu_clock_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; case BPF_FUNC_get_prandom_u32: diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d604c8251d88..b6cb426085fb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2058,6 +2058,7 @@ extern const struct bpf_func_proto bpf_get_numa_node_id_proto; extern const struct bpf_func_proto bpf_tail_call_proto; extern const struct bpf_func_proto bpf_ktime_get_ns_proto; extern const struct bpf_func_proto bpf_ktime_get_boot_ns_proto; +extern const struct bpf_func_proto bpf_read_raw_cpu_clock_proto; extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto; extern const struct 
bpf_func_proto bpf_get_current_uid_gid_proto; extern const struct bpf_func_proto bpf_get_current_comm_proto; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6fc59d61937a..52191791b089 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4037,6 +4037,13 @@ union bpf_attr { * Return * Current *ktime*. * + * u64 bpf_read_raw_cpu_clock(void) + * Description + * Return the architecture specific CPU clock value. + * For x86, this is the TSC clock. + * Return + * *CPU clock value* + * * long bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len) * Description * **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print @@ -5089,6 +5096,7 @@ union bpf_attr { FN(task_pt_regs), \ FN(get_branch_snapshot), \ FN(trace_vprintk), \ + FN(read_raw_cpu_clock), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig index a82d6de86522..5815db157220 100644 --- a/kernel/bpf/Kconfig +++ b/kernel/bpf/Kconfig @@ -21,6 +21,10 @@ config HAVE_EBPF_JIT config ARCH_WANT_DEFAULT_BPF_JIT bool +# Used by archs to tell they support reading raw CPU clock +config ARCH_HAS_BPF_RAW_CPU_CLOCK + bool + menu "BPF subsystem" config BPF_SYSCALL diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index b6c72af64d5d..8e2359dfd582 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2345,6 +2345,8 @@ const struct bpf_func_proto bpf_get_numa_node_id_proto __weak; const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak; const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto __weak; +const struct bpf_func_proto bpf_read_raw_cpu_clock_proto __weak; + const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 1ffd469c217f..90b9e5efaf65 
100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -18,6 +18,10 @@ #include "../../lib/kstrtox.h" +#ifdef CONFIG_ARCH_HAS_BPF_RAW_CPU_CLOCK +#include <asm/bpf_raw_cpu_clock.h> +#endif + /* If kernel subsystem is allowing eBPF programs to call this function, * inside its own verifier_ops->get_func_proto() callback it should return * bpf_map_lookup_elem_proto, so that verifier can properly check the arguments @@ -168,6 +172,21 @@ const struct bpf_func_proto bpf_ktime_get_boot_ns_proto = { .ret_type = RET_INTEGER, }; +BPF_CALL_0(bpf_read_raw_cpu_clock) +{ +#ifdef CONFIG_ARCH_HAS_BPF_RAW_CPU_CLOCK + return read_raw_cpu_clock(); +#else + return sched_clock(); +#endif +} + +const struct bpf_func_proto bpf_read_raw_cpu_clock_proto = { + .func = bpf_read_raw_cpu_clock, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + BPF_CALL_0(bpf_ktime_get_coarse_ns) { return ktime_get_coarse_ns(); @@ -1366,6 +1385,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ktime_get_boot_ns_proto; case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; + case BPF_FUNC_read_raw_cpu_clock: + return &bpf_read_raw_cpu_clock_proto; case BPF_FUNC_ringbuf_output: return &bpf_ringbuf_output_proto; case BPF_FUNC_ringbuf_reserve: diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 6b3153841a33..047ca7c1d57a 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1113,6 +1113,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_ktime_get_boot_ns_proto; case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; + case BPF_FUNC_read_raw_cpu_clock: + return &bpf_read_raw_cpu_clock_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; case BPF_FUNC_get_current_pid_tgid: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 6fc59d61937a..52191791b089 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ 
-4037,6 +4037,13 @@ union bpf_attr { * Return * Current *ktime*. * + * u64 bpf_read_raw_cpu_clock(void) + * Description + * Return the architecture specific CPU clock value. + * For x86, this is the TSC clock. + * Return + * *CPU clock value* + * * long bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len) * Description * **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print @@ -5089,6 +5096,7 @@ union bpf_attr { FN(task_pt_regs), \ FN(get_branch_snapshot), \ FN(trace_vprintk), \ + FN(read_raw_cpu_clock), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper
Add bpf_read_raw_cpu_clock helper, to read the architecture-specific CPU clock. In x86's case, this is the TSC. This is necessary to synchronize BPF traces from host and guest BPF programs (after subtracting the guest tsc-offset from guest timestamps). Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>