
[bpf-next] bpf: Add bpf_read_raw_record() helper

Message ID 20220823210354.1407473-1-namhyung@kernel.org (mailing list archive)
State Changes Requested
Delegated to: BPF
Series [bpf-next] bpf: Add bpf_read_raw_record() helper

Checks

Context Check Description
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Single patches do not need cover letters
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 105798 this patch: 105798
netdev/cc_maintainers warning 3 maintainers not CCed: song@kernel.org mingo@redhat.com martin.lau@linux.dev
netdev/build_clang success Errors and warnings before: 173 this patch: 173
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 107672 this patch: 107672
netdev/checkpatch warning WARNING: line length of 92 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-4 success Logs for llvm-toolchain
bpf/vmtest-bpf-next-VM_Test-5 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-1 success Logs for Kernel LATEST on ubuntu-latest with gcc
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Kernel LATEST on ubuntu-latest with llvm-16
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-3 fail Logs for Kernel LATEST on z15 with gcc

Commit Message

Namhyung Kim Aug. 23, 2022, 9:03 p.m. UTC
The helper is for BPF programs attached to a perf_event in order to read
event-specific raw data.  I followed the convention of the
bpf_read_branch_records() helper so that it can report the size of the
record using the BPF_F_GET_RAW_RECORD_SIZE flag.

The use case is to filter perf event samples based on the HW-provided
data, which carries more detailed information about the sample.

Note that it only reads the first fragment of the raw record.  That
seems mostly fine since all the existing PMU raw data has only a single
fragment, and multi-fragment records are used only for BPF output
attached to sockets.  So unless it is used in such an extreme case, it
should work for most tracing use cases.

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
I don't know how to test this, as the raw data is available only on some
hardware PMUs (e.g. AMD IBS).  I tried a tracepoint event but it was
rejected by the verifier.  Actually it needs a bpf_perf_event_data
context, so that's not an option IIUC.

 include/uapi/linux/bpf.h | 23 ++++++++++++++++++++++
 kernel/trace/bpf_trace.c | 41 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+)
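
For reference, a minimal sketch of how a perf_event BPF program could use
the proposed helper to filter samples.  The raw-data layout and the
filtering condition are hypothetical (raw data is PMU-specific), and
bpf_read_raw_record() would need a helper declaration generated from the
patched bpf.h:

// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <linux/bpf_perf_event.h>
#include <bpf/bpf_helpers.h>

/* Hypothetical layout; real raw data depends on the PMU (e.g. AMD IBS). */
struct hypothetical_raw {
	__u32 op;
	__u32 flags;
};

SEC("perf_event")
int filter_sample(struct bpf_perf_event_data *ctx)
{
	struct hypothetical_raw raw = {};
	long n;

	/* Copy the first raw-record fragment; size is a multiple of 4. */
	n = bpf_read_raw_record(ctx, &raw, sizeof(raw), 0);
	if (n < 0)
		return 0;	/* no raw data: suppress the sample */

	/* Non-zero keeps the sample, zero suppresses it. */
	return raw.op == 1;
}

char _license[] SEC("license") = "GPL";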

Comments

Song Liu Aug. 23, 2022, 10:19 p.m. UTC | #1
> On Aug 23, 2022, at 2:03 PM, Namhyung Kim <namhyung@kernel.org> wrote:
> 
> The helper is for BPF programs attached to perf_event in order to read
> event-specific raw data.  I followed the convention of the
> bpf_read_branch_records() helper so that it can tell the size of
> record using BPF_F_GET_RAW_RECORD flag.
> 
> The use case is to filter perf event samples based on the HW provided
> data which have more detailed information about the sample.
> 
> Note that it only reads the first fragment of the raw record.  But it
> seems mostly ok since all the existing PMU raw data have only single
> fragment and the multi-fragment records are only for BPF output attached
> to sockets.  So unless it's used with such an extreme case, it'd work
> for most of tracing use cases.
> 
> Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> ---
> I don't know how to test this.  As the raw data is available on some
> hardware PMU only (e.g. AMD IBS).  I tried a tracepoint event but it was
> rejected by the verifier.  Actually it needs a bpf_perf_event_data
> context so that's not an option IIUC.

Can we add a software event that generates raw data for testing? 

Thanks,
Song


> 
> include/uapi/linux/bpf.h | 23 ++++++++++++++++++++++
> kernel/trace/bpf_trace.c | 41 ++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 64 insertions(+)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 934a2a8beb87..af7f70564819 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -5355,6 +5355,23 @@ union bpf_attr {
>  *	Return
>  *		Current *ktime*.
>  *
> + * long bpf_read_raw_record(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags)
> + *	Description
> + *		For an eBPF program attached to a perf event, retrieve the
> + *		raw record associated to *ctx* and store it in the buffer
> + *		pointed by *buf* up to size *size* bytes.
> + *	Return
> + *		On success, number of bytes written to *buf*. On error, a
> + *		negative value.
> + *
> + *		The *flags* can be set to **BPF_F_GET_RAW_RECORD_SIZE** to
> + *		instead return the number of bytes required to store the raw
> + *		record. If this flag is set, *buf* may be NULL.
> + *
> + *		**-EINVAL** if arguments invalid or **size** not a multiple
> + *		of **sizeof**\ (u64\ ).
> + *
> + *		**-ENOENT** if the event does not have raw records.
>  */
> #define __BPF_FUNC_MAPPER(FN)		\
> 	FN(unspec),			\
> @@ -5566,6 +5583,7 @@ union bpf_attr {
> 	FN(tcp_raw_check_syncookie_ipv4),	\
> 	FN(tcp_raw_check_syncookie_ipv6),	\
> 	FN(ktime_get_tai_ns),		\
> +	FN(read_raw_record),		\
> 	/* */
> 
> /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> @@ -5749,6 +5767,11 @@ enum {
> 	BPF_F_EXCLUDE_INGRESS	= (1ULL << 4),
> };
> 
> +/* BPF_FUNC_read_raw_record flags. */
> +enum {
> +	BPF_F_GET_RAW_RECORD_SIZE	= (1ULL << 0),
> +};
> +
> #define __bpf_md_ptr(type, name)	\
> union {					\
> 	type name;			\
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index 68e5cdd24cef..db172b12e5f8 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -20,6 +20,7 @@
> #include <linux/fprobe.h>
> #include <linux/bsearch.h>
> #include <linux/sort.h>
> +#include <linux/perf_event.h>
> 
> #include <net/bpf_sk_storage.h>
> 
> @@ -1532,6 +1533,44 @@ static const struct bpf_func_proto bpf_read_branch_records_proto = {
> 	.arg4_type      = ARG_ANYTHING,
> };
> 
> +BPF_CALL_4(bpf_read_raw_record, struct bpf_perf_event_data_kern *, ctx,
> +	   void *, buf, u32, size, u64, flags)
> +{
> +	struct perf_raw_record *raw = ctx->data->raw;
> +	struct perf_raw_frag *frag;
> +	u32 to_copy;
> +
> +	if (unlikely(flags & ~BPF_F_GET_RAW_RECORD_SIZE))
> +		return -EINVAL;
> +
> +	if (unlikely(!raw))
> +		return -ENOENT;
> +
> +	if (flags & BPF_F_GET_RAW_RECORD_SIZE)
> +		return raw->size;
> +
> +	if (!buf || (size % sizeof(u32) != 0))
> +		return -EINVAL;
> +
> +	frag = &raw->frag;
> +	WARN_ON_ONCE(!perf_raw_frag_last(frag));
> +
> +	to_copy = min_t(u32, frag->size, size);
> +	memcpy(buf, frag->data, to_copy);
> +
> +	return to_copy;
> +}
> +
> +static const struct bpf_func_proto bpf_read_raw_record_proto = {
> +	.func           = bpf_read_raw_record,
> +	.gpl_only       = true,
> +	.ret_type       = RET_INTEGER,
> +	.arg1_type      = ARG_PTR_TO_CTX,
> +	.arg2_type      = ARG_PTR_TO_MEM_OR_NULL,
> +	.arg3_type      = ARG_CONST_SIZE_OR_ZERO,
> +	.arg4_type      = ARG_ANYTHING,
> +};
> +
> static const struct bpf_func_proto *
> pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> {
> @@ -1548,6 +1587,8 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> 		return &bpf_read_branch_records_proto;
> 	case BPF_FUNC_get_attach_cookie:
> 		return &bpf_get_attach_cookie_proto_pe;
> +	case BPF_FUNC_read_raw_record:
> +		return &bpf_read_raw_record_proto;
> 	default:
> 		return bpf_tracing_func_proto(func_id, prog);
> 	}
> -- 
> 2.37.2.609.g9ff673ca1a-goog
>
Namhyung Kim Aug. 23, 2022, 10:45 p.m. UTC | #2
Hi Song,

On Tue, Aug 23, 2022 at 3:19 PM Song Liu <songliubraving@fb.com> wrote:
>
>
>
> > On Aug 23, 2022, at 2:03 PM, Namhyung Kim <namhyung@kernel.org> wrote:
> >
> > The helper is for BPF programs attached to perf_event in order to read
> > event-specific raw data.  I followed the convention of the
> > bpf_read_branch_records() helper so that it can tell the size of
> > record using BPF_F_GET_RAW_RECORD flag.
> >
> > The use case is to filter perf event samples based on the HW provided
> > data which have more detailed information about the sample.
> >
> > Note that it only reads the first fragment of the raw record.  But it
> > seems mostly ok since all the existing PMU raw data have only single
> > fragment and the multi-fragment records are only for BPF output attached
> > to sockets.  So unless it's used with such an extreme case, it'd work
> > for most of tracing use cases.
> >
> > Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> > ---
> > I don't know how to test this.  As the raw data is available on some
> > hardware PMU only (e.g. AMD IBS).  I tried a tracepoint event but it was
> > rejected by the verifier.  Actually it needs a bpf_perf_event_data
> > context so that's not an option IIUC.
>
> Can we add a software event that generates raw data for testing?

Ok, now I think that I can use a bpf-output sw event.  It would need
another BPF program to write data to the event and the test program
can read it from BPF using this helper. :)

Thanks,
Namhyung
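
(For reference, a rough sketch of the writer side of that bpf-output idea:
emit data into a BPF_MAP_TYPE_PERF_EVENT_ARRAY with bpf_perf_event_output(),
which is delivered as the raw sample data of the bpf-output event.  The
attach point and payload are arbitrary illustration, and the follow-up below
notes that this path does not end up invoking the perf_event program.)

// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(key_size, sizeof(int));
	__uint(value_size, sizeof(int));
} events SEC(".maps");

struct test_data {
	__u64 cookie;	/* payload the reader side would check */
};

SEC("tracepoint/syscalls/sys_enter_getpid")
int writer(void *ctx)
{
	struct test_data data = { .cookie = 0x1234 };

	/* Written as the raw sample data of the bpf-output event. */
	bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU,
			      &data, sizeof(data));
	return 0;
}

char _license[] SEC("license") = "GPL";
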
John Fastabend Aug. 24, 2022, 5:31 a.m. UTC | #3
Namhyung Kim wrote:
> The helper is for BPF programs attached to perf_event in order to read
> event-specific raw data.  I followed the convention of the
> bpf_read_branch_records() helper so that it can tell the size of
> record using BPF_F_GET_RAW_RECORD flag.
> 
> The use case is to filter perf event samples based on the HW provided
> data which have more detailed information about the sample.
> 
> Note that it only reads the first fragment of the raw record.  But it
> seems mostly ok since all the existing PMU raw data have only single
> fragment and the multi-fragment records are only for BPF output attached
> to sockets.  So unless it's used with such an extreme case, it'd work
> for most of tracing use cases.
> 
> Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> ---

Acked-by: John Fastabend <john.fastabend@gmail.com>

> I don't know how to test this.  As the raw data is available on some
> hardware PMU only (e.g. AMD IBS).  I tried a tracepoint event but it was
> rejected by the verifier.  Actually it needs a bpf_perf_event_data
> context so that's not an option IIUC.

not a pmu expert but also no good ideas on my side.

...

>  
> +BPF_CALL_4(bpf_read_raw_record, struct bpf_perf_event_data_kern *, ctx,
> +	   void *, buf, u32, size, u64, flags)
> +{
> +	struct perf_raw_record *raw = ctx->data->raw;
> +	struct perf_raw_frag *frag;
> +	u32 to_copy;
> +
> +	if (unlikely(flags & ~BPF_F_GET_RAW_RECORD_SIZE))
> +		return -EINVAL;
> +
> +	if (unlikely(!raw))
> +		return -ENOENT;
> +
> +	if (flags & BPF_F_GET_RAW_RECORD_SIZE)
> +		return raw->size;
> +
> +	if (!buf || (size % sizeof(u32) != 0))
> +		return -EINVAL;
> +
> +	frag = &raw->frag;
> +	WARN_ON_ONCE(!perf_raw_frag_last(frag));
> +
> +	to_copy = min_t(u32, frag->size, size);
> +	memcpy(buf, frag->data, to_copy);
> +
> +	return to_copy;
> +}
> +
> +static const struct bpf_func_proto bpf_read_raw_record_proto = {
> +	.func           = bpf_read_raw_record,
> +	.gpl_only       = true,
> +	.ret_type       = RET_INTEGER,
> +	.arg1_type      = ARG_PTR_TO_CTX,
> +	.arg2_type      = ARG_PTR_TO_MEM_OR_NULL,
> +	.arg3_type      = ARG_CONST_SIZE_OR_ZERO,
> +	.arg4_type      = ARG_ANYTHING,
> +};

Patch lgtm but curious why allow the ARG_PTR_TO_MEM_OR_NULL from API
side instead of just ARG_PTR_TO_MEM? Maybe, just to match the
existing perf_event_read()? I acked it as I think matching existing
API is likely good enough reason.

> +
>  static const struct bpf_func_proto *
>  pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>  {
> @@ -1548,6 +1587,8 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>  		return &bpf_read_branch_records_proto;
>  	case BPF_FUNC_get_attach_cookie:
>  		return &bpf_get_attach_cookie_proto_pe;
> +	case BPF_FUNC_read_raw_record:
> +		return &bpf_read_raw_record_proto;
>  	default:
>  		return bpf_tracing_func_proto(func_id, prog);
>  	}
> -- 
> 2.37.2.609.g9ff673ca1a-goog
>
John Fastabend Aug. 24, 2022, 5:32 a.m. UTC | #4
Namhyung Kim wrote:
> Hi Song,
> 
> On Tue, Aug 23, 2022 at 3:19 PM Song Liu <songliubraving@fb.com> wrote:
> >
> >
> >
> > > On Aug 23, 2022, at 2:03 PM, Namhyung Kim <namhyung@kernel.org> wrote:
> > >
> > > The helper is for BPF programs attached to perf_event in order to read
> > > event-specific raw data.  I followed the convention of the
> > > bpf_read_branch_records() helper so that it can tell the size of
> > > record using BPF_F_GET_RAW_RECORD flag.
> > >
> > > The use case is to filter perf event samples based on the HW provided
> > > data which have more detailed information about the sample.
> > >
> > > Note that it only reads the first fragment of the raw record.  But it
> > > seems mostly ok since all the existing PMU raw data have only single
> > > fragment and the multi-fragment records are only for BPF output attached
> > > to sockets.  So unless it's used with such an extreme case, it'd work
> > > for most of tracing use cases.
> > >
> > > Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> > > ---
> > > I don't know how to test this.  As the raw data is available on some
> > > hardware PMU only (e.g. AMD IBS).  I tried a tracepoint event but it was
> > > rejected by the verifier.  Actually it needs a bpf_perf_event_data
> > > context so that's not an option IIUC.
> >
> > Can we add a software event that generates raw data for testing?
> 
> Ok, now I think that I can use a bpf-output sw event.  It would need
> another BPF program to write data to the event and the test program
> can read it from BPF using this helper. :)
> 
> Thanks,
> Namhyung

Ah good idea. Feel free to carry my ACK to the v2 with the test.
Namhyung Kim Aug. 24, 2022, 6:09 a.m. UTC | #5
Hello,

On Tue, Aug 23, 2022 at 10:31 PM John Fastabend
<john.fastabend@gmail.com> wrote:
>
> Namhyung Kim wrote:
> > The helper is for BPF programs attached to perf_event in order to read
> > event-specific raw data.  I followed the convention of the
> > bpf_read_branch_records() helper so that it can tell the size of
> > record using BPF_F_GET_RAW_RECORD flag.
> >
> > The use case is to filter perf event samples based on the HW provided
> > data which have more detailed information about the sample.
> >
> > Note that it only reads the first fragment of the raw record.  But it
> > seems mostly ok since all the existing PMU raw data have only single
> > fragment and the multi-fragment records are only for BPF output attached
> > to sockets.  So unless it's used with such an extreme case, it'd work
> > for most of tracing use cases.
> >
> > Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> > ---
>
> Acked-by: John Fastabend <john.fastabend@gmail.com>

Thanks!

>
> > I don't know how to test this.  As the raw data is available on some
> > hardware PMU only (e.g. AMD IBS).  I tried a tracepoint event but it was
> > rejected by the verifier.  Actually it needs a bpf_perf_event_data
> > context so that's not an option IIUC.
>
> not a pmu expert but also no good ideas on my side.
>
> ...
>
> >
> > +BPF_CALL_4(bpf_read_raw_record, struct bpf_perf_event_data_kern *, ctx,
> > +        void *, buf, u32, size, u64, flags)
> > +{
> > +     struct perf_raw_record *raw = ctx->data->raw;
> > +     struct perf_raw_frag *frag;
> > +     u32 to_copy;
> > +
> > +     if (unlikely(flags & ~BPF_F_GET_RAW_RECORD_SIZE))
> > +             return -EINVAL;
> > +
> > +     if (unlikely(!raw))
> > +             return -ENOENT;
> > +
> > +     if (flags & BPF_F_GET_RAW_RECORD_SIZE)
> > +             return raw->size;
> > +
> > +     if (!buf || (size % sizeof(u32) != 0))
> > +             return -EINVAL;
> > +
> > +     frag = &raw->frag;
> > +     WARN_ON_ONCE(!perf_raw_frag_last(frag));
> > +
> > +     to_copy = min_t(u32, frag->size, size);
> > +     memcpy(buf, frag->data, to_copy);
> > +
> > +     return to_copy;
> > +}
> > +
> > +static const struct bpf_func_proto bpf_read_raw_record_proto = {
> > +     .func           = bpf_read_raw_record,
> > +     .gpl_only       = true,
> > +     .ret_type       = RET_INTEGER,
> > +     .arg1_type      = ARG_PTR_TO_CTX,
> > +     .arg2_type      = ARG_PTR_TO_MEM_OR_NULL,
> > +     .arg3_type      = ARG_CONST_SIZE_OR_ZERO,
> > +     .arg4_type      = ARG_ANYTHING,
> > +};
>
> Patch lgtm but curious why allow the ARG_PTR_TO_MEM_OR_NULL from API
> side instead of just ARG_PTR_TO_MEM? Maybe, just to match the
> existing perf_event_read()? I acked it as I think matching existing
> API is likely good enough reason.

It can query the size of raw record using BPF_F_GET_RAW_RECORD_SIZE.
In that case it can pass NULL for the buffer (and 0 for the size).
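
A short sketch of that two-step pattern from inside a perf_event program,
assuming the helper as proposed in this patch:

	char buf[64];
	long sz;

	/* With BPF_F_GET_RAW_RECORD_SIZE, buf may be NULL and size 0;
	 * the return value is the size of the raw record. */
	sz = bpf_read_raw_record(ctx, NULL, 0, BPF_F_GET_RAW_RECORD_SIZE);

	/* A second call copies the data; size must stay a multiple of 4. */
	if (sz > 0 && sz <= sizeof(buf))
		sz = bpf_read_raw_record(ctx, buf, sizeof(buf), 0);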

Thanks,
Namhyung


>
> > +
> >  static const struct bpf_func_proto *
> >  pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> >  {
> > @@ -1548,6 +1587,8 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> >               return &bpf_read_branch_records_proto;
> >       case BPF_FUNC_get_attach_cookie:
> >               return &bpf_get_attach_cookie_proto_pe;
> > +     case BPF_FUNC_read_raw_record:
> > +             return &bpf_read_raw_record_proto;
> >       default:
> >               return bpf_tracing_func_proto(func_id, prog);
> >       }
> > --
> > 2.37.2.609.g9ff673ca1a-goog
> >
>
>
Namhyung Kim Aug. 25, 2022, 4:57 p.m. UTC | #6
On Tue, Aug 23, 2022 at 10:32 PM John Fastabend
<john.fastabend@gmail.com> wrote:
> Namhyung Kim wrote:
> > Ok, now I think that I can use a bpf-output sw event.  It would need
> > another BPF program to write data to the event and the test program
> > can read it from BPF using this helper. :)
>
> Ah good idea. Feel free to carry my ACK to the v2 with the test.

Hmm.. it seems not to work because:
1. the bpf-output sw event doesn't have the overflow mechanism, so it
   doesn't call the BPF program.
2. even if I added it, it couldn't run due to the recursion protection
   by bpf_prog_active.

Thanks,
Namhyung
Song Liu Aug. 25, 2022, 5:04 p.m. UTC | #7
> On Aug 25, 2022, at 9:57 AM, Namhyung Kim <namhyung@kernel.org> wrote:
> 
> On Tue, Aug 23, 2022 at 10:32 PM John Fastabend
> <john.fastabend@gmail.com> wrote:
>> Namhyung Kim wrote:
>>> Ok, now I think that I can use a bpf-output sw event.  It would need
>>> another BPF program to write data to the event and the test program
>>> can read it from BPF using this helper. :)
>> 
>> Ah good idea. Feel free to carry my ACK to the v2 with the test.
> 
> Hmm.. it seems not to work because
> 1. bpf_output sw event doesn't have the overflow mechanism and it
>   doesn't call the bpf program.
> 2. even if I added it, it couldn't run due to the recursion protection by
>   bpf_prog_active.

How about we enable some raw record for a software event? Something not 
controlled by BPF?

If this doesn't work, a self test that only runs on some hardware is also
helpful. 

Thanks,
Song
Namhyung Kim Aug. 25, 2022, 5:21 p.m. UTC | #8
On Thu, Aug 25, 2022 at 10:05 AM Song Liu <songliubraving@fb.com> wrote:
>
> > On Aug 25, 2022, at 9:57 AM, Namhyung Kim <namhyung@kernel.org> wrote:
> >
> > On Tue, Aug 23, 2022 at 10:32 PM John Fastabend
> > <john.fastabend@gmail.com> wrote:
> >> Namhyung Kim wrote:
> >>> Ok, now I think that I can use a bpf-output sw event.  It would need
> >>> another BPF program to write data to the event and the test program
> >>> can read it from BPF using this helper. :)
> >>
> >> Ah good idea. Feel free to carry my ACK to the v2 with the test.
> >
> > Hmm.. it seems not to work because
> > 1. bpf_output sw event doesn't have the overflow mechanism and it
> >   doesn't call the bpf program.
> > 2. even if I added it, it couldn't run due to the recursion protection by
> >   bpf_prog_active.
>
> How about we enable some raw record for a software event? Something not
> controlled by BPF?

Only for the test?  It'd be nice if we could have meaningful data from software
events but I don't have an idea what data it could carry on which event.

Peter, what do you think?

>
> If this doesn't work, a self test that only runs on some hardware is also
> helpful.

Yep, makes sense.

Thanks,
Namhyung
Andrii Nakryiko Aug. 25, 2022, 9:33 p.m. UTC | #9
On Tue, Aug 23, 2022 at 2:04 PM Namhyung Kim <namhyung@kernel.org> wrote:
>
> The helper is for BPF programs attached to perf_event in order to read
> event-specific raw data.  I followed the convention of the
> bpf_read_branch_records() helper so that it can tell the size of
> record using BPF_F_GET_RAW_RECORD flag.
>
> The use case is to filter perf event samples based on the HW provided
> data which have more detailed information about the sample.
>
> Note that it only reads the first fragment of the raw record.  But it
> seems mostly ok since all the existing PMU raw data have only single
> fragment and the multi-fragment records are only for BPF output attached
> to sockets.  So unless it's used with such an extreme case, it'd work
> for most of tracing use cases.
>
> Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> ---
> I don't know how to test this.  As the raw data is available on some
> hardware PMU only (e.g. AMD IBS).  I tried a tracepoint event but it was
> rejected by the verifier.  Actually it needs a bpf_perf_event_data
> context so that's not an option IIUC.
>
>  include/uapi/linux/bpf.h | 23 ++++++++++++++++++++++
>  kernel/trace/bpf_trace.c | 41 ++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 64 insertions(+)
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 934a2a8beb87..af7f70564819 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -5355,6 +5355,23 @@ union bpf_attr {
>   *     Return
>   *             Current *ktime*.
>   *
> + * long bpf_read_raw_record(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags)
> + *     Description
> + *             For an eBPF program attached to a perf event, retrieve the
> + *             raw record associated to *ctx* and store it in the buffer
> + *             pointed by *buf* up to size *size* bytes.
> + *     Return
> + *             On success, number of bytes written to *buf*. On error, a
> + *             negative value.
> + *
> + *             The *flags* can be set to **BPF_F_GET_RAW_RECORD_SIZE** to
> + *             instead return the number of bytes required to store the raw
> + *             record. If this flag is set, *buf* may be NULL.

It looks pretty ugly from a usability standpoint to have one helper
doing completely different things and returning two different values
based on BPF_F_GET_RAW_RECORD_SIZE.

I'm not sure what's best, but I have two alternative proposals:

1. Add two helpers: one to get perf record information (and size will
be one of them). Something like bpf_perf_record_query(ctx, flags)
where you pass perf ctx and what kind of information you want to read
(through flags), and u64 return result returns that (see
bpf_ringbuf_query() for such approach). And then have separate helper
to read data.

2. Keep one helper, but specify that it always returns record size,
even if user specified smaller size to read. And then allow passing
buf==NULL && size==0. So passing NULL, 0 -- you get record size.
Passing non-NULL buf -- you read data.


And also, "read_raw_record" is way too generic. We have
bpf_perf_prog_read_value(), let's use "bpf_perf_read_raw_record()" as
a name. We should have called bpf_read_branch_records() as
bpf_perf_read_branch_records(), probably, as well. But it's too late.

> + *
> + *             **-EINVAL** if arguments invalid or **size** not a multiple
> + *             of **sizeof**\ (u64\ ).
> + *
> + *             **-ENOENT** if the event does not have raw records.
>   */
>  #define __BPF_FUNC_MAPPER(FN)          \
>         FN(unspec),                     \
> @@ -5566,6 +5583,7 @@ union bpf_attr {
>         FN(tcp_raw_check_syncookie_ipv4),       \
>         FN(tcp_raw_check_syncookie_ipv6),       \
>         FN(ktime_get_tai_ns),           \
> +       FN(read_raw_record),            \
>         /* */
>

[...]
Song Liu Aug. 25, 2022, 10:08 p.m. UTC | #10
> On Aug 25, 2022, at 2:33 PM, Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote:
> 
> On Tue, Aug 23, 2022 at 2:04 PM Namhyung Kim <namhyung@kernel.org> wrote:
>> 
>> The helper is for BPF programs attached to perf_event in order to read
>> event-specific raw data.  I followed the convention of the
>> bpf_read_branch_records() helper so that it can tell the size of
>> record using BPF_F_GET_RAW_RECORD flag.
>> 
>> The use case is to filter perf event samples based on the HW provided
>> data which have more detailed information about the sample.
>> 
>> Note that it only reads the first fragment of the raw record.  But it
>> seems mostly ok since all the existing PMU raw data have only single
>> fragment and the multi-fragment records are only for BPF output attached
>> to sockets.  So unless it's used with such an extreme case, it'd work
>> for most of tracing use cases.
>> 
>> Signed-off-by: Namhyung Kim <namhyung@kernel.org>
>> ---
>> I don't know how to test this.  As the raw data is available on some
>> hardware PMU only (e.g. AMD IBS).  I tried a tracepoint event but it was
>> rejected by the verifier.  Actually it needs a bpf_perf_event_data
>> context so that's not an option IIUC.
>> 
>> include/uapi/linux/bpf.h | 23 ++++++++++++++++++++++
>> kernel/trace/bpf_trace.c | 41 ++++++++++++++++++++++++++++++++++++++++
>> 2 files changed, 64 insertions(+)
>> 
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index 934a2a8beb87..af7f70564819 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -5355,6 +5355,23 @@ union bpf_attr {
>>  *     Return
>>  *             Current *ktime*.
>>  *
>> + * long bpf_read_raw_record(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags)
>> + *     Description
>> + *             For an eBPF program attached to a perf event, retrieve the
>> + *             raw record associated to *ctx* and store it in the buffer
>> + *             pointed by *buf* up to size *size* bytes.
>> + *     Return
>> + *             On success, number of bytes written to *buf*. On error, a
>> + *             negative value.
>> + *
>> + *             The *flags* can be set to **BPF_F_GET_RAW_RECORD_SIZE** to
>> + *             instead return the number of bytes required to store the raw
>> + *             record. If this flag is set, *buf* may be NULL.
> 
> It looks pretty ugly from a usability standpoint to have one helper
> doing completely different things and returning two different values
> based on BPF_F_GET_RAW_RECORD_SIZE.

Yeah, I had the same thought when I first looked at it. But that's the 
exact syntax with bpf_read_branch_records(). Well, we still have time
to fix the new helper..

> 
> I'm not sure what's best, but I have two alternative proposals:
> 
> 1. Add two helpers: one to get perf record information (and size will
> be one of them). Something like bpf_perf_record_query(ctx, flags)
> where you pass perf ctx and what kind of information you want to read
> (through flags), and u64 return result returns that (see
> bpf_ringbuf_query() for such approach). And then have separate helper
> to read data.
> 
> 2. Keep one helper, but specify that it always returns record size,
> even if user specified smaller size to read. And then allow passing
> buf==NULL && size==0. So passing NULL, 0 -- you get record size.
> Passing non-NULL buf -- you read data.

AFAICT, this is also confusing.

Maybe we should use two kfuncs for this?

Thanks,
Song

> 
> 
> And also, "read_raw_record" is way too generic. We have
> bpf_perf_prog_read_value(), let's use "bpf_perf_read_raw_record()" as
> a name. We should have called bpf_read_branch_records() as
> bpf_perf_read_branch_records(), probably, as well. But it's too late.
> 
>> + *
>> + *             **-EINVAL** if arguments invalid or **size** not a multiple
>> + *             of **sizeof**\ (u64\ ).
>> + *
>> + *             **-ENOENT** if the event does not have raw records.
>>  */
>> #define __BPF_FUNC_MAPPER(FN)          \
>>        FN(unspec),                     \
>> @@ -5566,6 +5583,7 @@ union bpf_attr {
>>        FN(tcp_raw_check_syncookie_ipv4),       \
>>        FN(tcp_raw_check_syncookie_ipv6),       \
>>        FN(ktime_get_tai_ns),           \
>> +       FN(read_raw_record),            \
>>        /* */
>> 
> 
> [...]
Namhyung Kim Aug. 25, 2022, 10:13 p.m. UTC | #11
Hello,

On Thu, Aug 25, 2022 at 2:34 PM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Tue, Aug 23, 2022 at 2:04 PM Namhyung Kim <namhyung@kernel.org> wrote:
> >
> > The helper is for BPF programs attached to perf_event in order to read
> > event-specific raw data.  I followed the convention of the
> > bpf_read_branch_records() helper so that it can tell the size of
> > record using BPF_F_GET_RAW_RECORD flag.
> >
> > The use case is to filter perf event samples based on the HW provided
> > data which have more detailed information about the sample.
> >
> > Note that it only reads the first fragment of the raw record.  But it
> > seems mostly ok since all the existing PMU raw data have only single
> > fragment and the multi-fragment records are only for BPF output attached
> > to sockets.  So unless it's used with such an extreme case, it'd work
> > for most of tracing use cases.
> >
> > Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> > ---
> > I don't know how to test this.  As the raw data is available on some
> > hardware PMU only (e.g. AMD IBS).  I tried a tracepoint event but it was
> > rejected by the verifier.  Actually it needs a bpf_perf_event_data
> > context so that's not an option IIUC.
> >
> >  include/uapi/linux/bpf.h | 23 ++++++++++++++++++++++
> >  kernel/trace/bpf_trace.c | 41 ++++++++++++++++++++++++++++++++++++++++
> >  2 files changed, 64 insertions(+)
> >
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index 934a2a8beb87..af7f70564819 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -5355,6 +5355,23 @@ union bpf_attr {
> >   *     Return
> >   *             Current *ktime*.
> >   *
> > + * long bpf_read_raw_record(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags)
> > + *     Description
> > + *             For an eBPF program attached to a perf event, retrieve the
> > + *             raw record associated to *ctx* and store it in the buffer
> > + *             pointed by *buf* up to size *size* bytes.
> > + *     Return
> > + *             On success, number of bytes written to *buf*. On error, a
> > + *             negative value.
> > + *
> > + *             The *flags* can be set to **BPF_F_GET_RAW_RECORD_SIZE** to
> > + *             instead return the number of bytes required to store the raw
> > + *             record. If this flag is set, *buf* may be NULL.
>
> It looks pretty ugly from a usability standpoint to have one helper
> doing completely different things and returning two different values
> based on BPF_F_GET_RAW_RECORD_SIZE.

Agreed.

>
> I'm not sure what's best, but I have two alternative proposals:
>
> 1. Add two helpers: one to get perf record information (and size will
> be one of them). Something like bpf_perf_record_query(ctx, flags)
> where you pass perf ctx and what kind of information you want to read
> (through flags), and u64 return result returns that (see
> bpf_ringbuf_query() for such approach). And then have separate helper
> to read data.

I like this as I want to have more info for the perf event sample like
instruction address or sample type.  I know some of the info is
available through the context but I think this is a better approach.

>
> 2. Keep one helper, but specify that it always returns record size,
> even if user specified smaller size to read. And then allow passing
> buf==NULL && size==0. So passing NULL, 0 -- you get record size.
> Passing non-NULL buf -- you read data.
>
>
> And also, "read_raw_record" is way too generic. We have
> bpf_perf_prog_read_value(), let's use "bpf_perf_read_raw_record()" as
> a name. We should have called bpf_read_branch_records() as
> bpf_perf_read_branch_records(), probably, as well. But it's too late.

Yeah, what about this?

 * bpf_perf_event_query(ctx, flag)
 * bpf_perf_event_get(ctx, flag, buf, size)

Maybe we can use the same flag for both.  Like BPF_PERF_RAW_RECORD
can return the size (or -1 if not) on _query() and read the data on _get().
Or we can have a BPF_PERF_RAW_RECORD_SIZE only for _query().
It seems we don't need _get() for things like BPF_PERF_SAMPLE_IP.
What do you think?
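
Roughly, from the program side it could look like this; every name below
is hypothetical and only mirrors the proposal:

	char buf[64];
	long sz;

	/* _query() returns a scalar, e.g. the raw record size. */
	sz = bpf_perf_event_query(ctx, BPF_PERF_RAW_RECORD_SIZE);

	/* _get() copies variable-sized data into a buffer. */
	if (sz > 0 && sz <= sizeof(buf))
		bpf_perf_event_get(ctx, BPF_PERF_RAW_RECORD, buf, sizeof(buf));

	/* Small fixed-size values would not need _get() at all. */
	__u64 ip = bpf_perf_event_query(ctx, BPF_PERF_SAMPLE_IP);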

Thanks,
Namhyung
Andrii Nakryiko Aug. 25, 2022, 11:03 p.m. UTC | #12
On Thu, Aug 25, 2022 at 3:08 PM Song Liu <songliubraving@fb.com> wrote:
>
>
>
> > On Aug 25, 2022, at 2:33 PM, Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote:
> >
> > On Tue, Aug 23, 2022 at 2:04 PM Namhyung Kim <namhyung@kernel.org> wrote:
> >>
> >> The helper is for BPF programs attached to perf_event in order to read
> >> event-specific raw data.  I followed the convention of the
> >> bpf_read_branch_records() helper so that it can tell the size of
> >> record using BPF_F_GET_RAW_RECORD flag.
> >>
> >> The use case is to filter perf event samples based on the HW provided
> >> data which have more detailed information about the sample.
> >>
> >> Note that it only reads the first fragment of the raw record.  But it
> >> seems mostly ok since all the existing PMU raw data have only single
> >> fragment and the multi-fragment records are only for BPF output attached
> >> to sockets.  So unless it's used with such an extreme case, it'd work
> >> for most of tracing use cases.
> >>
> >> Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> >> ---
> >> I don't know how to test this.  As the raw data is available on some
> >> hardware PMU only (e.g. AMD IBS).  I tried a tracepoint event but it was
> >> rejected by the verifier.  Actually it needs a bpf_perf_event_data
> >> context so that's not an option IIUC.
> >>
> >> include/uapi/linux/bpf.h | 23 ++++++++++++++++++++++
> >> kernel/trace/bpf_trace.c | 41 ++++++++++++++++++++++++++++++++++++++++
> >> 2 files changed, 64 insertions(+)
> >>
> >> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> >> index 934a2a8beb87..af7f70564819 100644
> >> --- a/include/uapi/linux/bpf.h
> >> +++ b/include/uapi/linux/bpf.h
> >> @@ -5355,6 +5355,23 @@ union bpf_attr {
> >>  *     Return
> >>  *             Current *ktime*.
> >>  *
> >> + * long bpf_read_raw_record(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags)
> >> + *     Description
> >> + *             For an eBPF program attached to a perf event, retrieve the
> >> + *             raw record associated to *ctx* and store it in the buffer
> >> + *             pointed by *buf* up to size *size* bytes.
> >> + *     Return
> >> + *             On success, number of bytes written to *buf*. On error, a
> >> + *             negative value.
> >> + *
> >> + *             The *flags* can be set to **BPF_F_GET_RAW_RECORD_SIZE** to
> >> + *             instead return the number of bytes required to store the raw
> >> + *             record. If this flag is set, *buf* may be NULL.
> >
> > It looks pretty ugly from a usability standpoint to have one helper
> > doing completely different things and returning two different values
> > based on BPF_F_GET_RAW_RECORD_SIZE.
>
> Yeah, I had the same thought when I first looked at it. But that's the
> exact syntax with bpf_read_branch_records(). Well, we still have time
> to fix the new helper..
>
> >
> > I'm not sure what's best, but I have two alternative proposals:
> >
> > 1. Add two helpers: one to get perf record information (and size will
> > be one of them). Something like bpf_perf_record_query(ctx, flags)
> > where you pass perf ctx and what kind of information you want to read
> > (through flags), and u64 return result returns that (see
> > bpf_ringbuf_query() for such approach). And then have separate helper
> > to read data.
> >
> > 2. Keep one helper, but specify that it always returns record size,
> > even if user specified smaller size to read. And then allow passing
> > buf==NULL && size==0. So passing NULL, 0 -- you get record size.
> > Passing non-NULL buf -- you read data.
>
> AFAICT, this is also confusing.
>

this is analogous to snprintf() behavior, so not that new and
surprising when you think about it. But if query + read makes more
sense, then it's fine by me
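
In userspace C this is the familiar size-then-write dance; the names here
are only for illustration:

	/* snprintf() returns the length it would need, so a NULL/0 call
	 * can size the buffer for the real call. */
	int need = snprintf(NULL, 0, "%s:%d", name, id);
	char *s = malloc(need + 1);

	if (s)
		snprintf(s, need + 1, "%s:%d", name, id);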

> Maybe we should use two kfuncs for this?
>
> Thanks,
> Song
>
> >
> >
> > And also, "read_raw_record" is way too generic. We have
> > bpf_perf_prog_read_value(), let's use "bpf_perf_read_raw_record()" as
> > a name. We should have called bpf_read_branch_records() as
> > bpf_perf_read_branch_records(), probably, as well. But it's too late.
> >
> >> + *
> >> + *             **-EINVAL** if arguments invalid or **size** not a multiple
> >> + *             of **sizeof**\ (u64\ ).
> >> + *
> >> + *             **-ENOENT** if the event does not have raw records.
> >>  */
> >> #define __BPF_FUNC_MAPPER(FN)          \
> >>        FN(unspec),                     \
> >> @@ -5566,6 +5583,7 @@ union bpf_attr {
> >>        FN(tcp_raw_check_syncookie_ipv4),       \
> >>        FN(tcp_raw_check_syncookie_ipv6),       \
> >>        FN(ktime_get_tai_ns),           \
> >> +       FN(read_raw_record),            \
> >>        /* */
> >>
> >
> > [...]
>
Andrii Nakryiko Aug. 25, 2022, 11:09 p.m. UTC | #13
On Thu, Aug 25, 2022 at 3:13 PM Namhyung Kim <namhyung@kernel.org> wrote:
>
> Hello,
>
> On Thu, Aug 25, 2022 at 2:34 PM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > On Tue, Aug 23, 2022 at 2:04 PM Namhyung Kim <namhyung@kernel.org> wrote:
> > >
> > > The helper is for BPF programs attached to perf_event in order to read
> > > event-specific raw data.  I followed the convention of the
> > > bpf_read_branch_records() helper so that it can tell the size of
> > > record using BPF_F_GET_RAW_RECORD flag.
> > >
> > > The use case is to filter perf event samples based on the HW provided
> > > data which have more detailed information about the sample.
> > >
> > > Note that it only reads the first fragment of the raw record.  But it
> > > seems mostly ok since all the existing PMU raw data have only single
> > > fragment and the multi-fragment records are only for BPF output attached
> > > to sockets.  So unless it's used with such an extreme case, it'd work
> > > for most of tracing use cases.
> > >
> > > Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> > > ---
> > > I don't know how to test this.  As the raw data is available on some
> > > hardware PMU only (e.g. AMD IBS).  I tried a tracepoint event but it was
> > > rejected by the verifier.  Actually it needs a bpf_perf_event_data
> > > context so that's not an option IIUC.
> > >
> > >  include/uapi/linux/bpf.h | 23 ++++++++++++++++++++++
> > >  kernel/trace/bpf_trace.c | 41 ++++++++++++++++++++++++++++++++++++++++
> > >  2 files changed, 64 insertions(+)
> > >
> > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > > index 934a2a8beb87..af7f70564819 100644
> > > --- a/include/uapi/linux/bpf.h
> > > +++ b/include/uapi/linux/bpf.h
> > > @@ -5355,6 +5355,23 @@ union bpf_attr {
> > >   *     Return
> > >   *             Current *ktime*.
> > >   *
> > > + * long bpf_read_raw_record(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags)
> > > + *     Description
> > > + *             For an eBPF program attached to a perf event, retrieve the
> > > + *             raw record associated to *ctx* and store it in the buffer
> > > + *             pointed by *buf* up to size *size* bytes.
> > > + *     Return
> > > + *             On success, number of bytes written to *buf*. On error, a
> > > + *             negative value.
> > > + *
> > > + *             The *flags* can be set to **BPF_F_GET_RAW_RECORD_SIZE** to
> > > + *             instead return the number of bytes required to store the raw
> > > + *             record. If this flag is set, *buf* may be NULL.
> >
> > It looks pretty ugly from a usability standpoint to have one helper
> > doing completely different things and returning two different values
> > based on BPF_F_GET_RAW_RECORD_SIZE.
>
> Agreed.
>
> >
> > I'm not sure what's best, but I have two alternative proposals:
> >
> > 1. Add two helpers: one to get perf record information (and size will
> > be one of them). Something like bpf_perf_record_query(ctx, flags)
> > where you pass perf ctx and what kind of information you want to read
> > (through flags), and u64 return result returns that (see
> > bpf_ringbuf_query() for such approach). And then have separate helper
> > to read data.
>
> I like this as I want to have more info for the perf event sample like
> instruction address or sample type.  I know some of the info is
> available through the context but I think this is a better approach.
>
> >
> > 2. Keep one helper, but specify that it always returns record size,
> > even if user specified smaller size to read. And then allow passing
> > buf==NULL && size==0. So passing NULL, 0 -- you get record size.
> > Passing non-NULL buf -- you read data.
> >
> >
> > And also, "read_raw_record" is way too generic. We have
> > bpf_perf_prog_read_value(), let's use "bpf_perf_read_raw_record()" as
> > a name. We should have called bpf_read_branch_records() as
> > bpf_perf_read_branch_records(), probably, as well. But it's too late.
>
> Yeah, what about this?
>
>  * bpf_perf_event_query(ctx, flag)
>  * bpf_perf_event_get(ctx, flag, buf, size)
>
> Maybe we can use the same flag for both.  Like BPF_PERF_RAW_RECORD
> can return the size (or -1 if not) on _query() and read the data on _get().
> Or we can have a BPF_PERF_RAW_RECORD_SIZE only for _query().
> It seems we don't need _get() for things like BPF_PERF_SAMPLE_IP.
> What do you think?
>

probably separate flags make more sense, because I can see how we can
allow querying multiple up-to-u64-sized things (even sample_ip, for
example), while reserving bpf_perf_event_get() for variable-size or
big-sized values.

Not super keen on "get"; "read" or "load" is the verb we typically use
for such operations, but we already have bpf_perf_event_read(). And
bpf_perf_event_load() doesn't seem right either. Naming is hard. Maybe
"fetch"? Or just "get". Don't know, but maybe someone has good ideas.

> Thanks,
> Namhyung
Song Liu Aug. 26, 2022, 2:35 a.m. UTC | #14
> On Aug 25, 2022, at 4:03 PM, Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote:
> 
> On Thu, Aug 25, 2022 at 3:08 PM Song Liu <songliubraving@fb.com> wrote:
>> 
>> 
>> 
>>> On Aug 25, 2022, at 2:33 PM, Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote:
>>> 
>>> On Tue, Aug 23, 2022 at 2:04 PM Namhyung Kim <namhyung@kernel.org> wrote:
>>>> 
>>>> The helper is for BPF programs attached to perf_event in order to read
>>>> event-specific raw data.  I followed the convention of the
>>>> bpf_read_branch_records() helper so that it can tell the size of
>>>> record using BPF_F_GET_RAW_RECORD flag.
>>>> 
>>>> The use case is to filter perf event samples based on the HW provided
>>>> data which have more detailed information about the sample.
>>>> 
>>>> Note that it only reads the first fragment of the raw record.  But it
>>>> seems mostly ok since all the existing PMU raw data have only single
>>>> fragment and the multi-fragment records are only for BPF output attached
>>>> to sockets.  So unless it's used with such an extreme case, it'd work
>>>> for most of tracing use cases.
>>>> 
>>>> Signed-off-by: Namhyung Kim <namhyung@kernel.org>
>>>> ---
>>>> I don't know how to test this.  As the raw data is available on some
>>>> hardware PMU only (e.g. AMD IBS).  I tried a tracepoint event but it was
>>>> rejected by the verifier.  Actually it needs a bpf_perf_event_data
>>>> context so that's not an option IIUC.
>>>> 
>>>> include/uapi/linux/bpf.h | 23 ++++++++++++++++++++++
>>>> kernel/trace/bpf_trace.c | 41 ++++++++++++++++++++++++++++++++++++++++
>>>> 2 files changed, 64 insertions(+)
>>>> 
>>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>>>> index 934a2a8beb87..af7f70564819 100644
>>>> --- a/include/uapi/linux/bpf.h
>>>> +++ b/include/uapi/linux/bpf.h
>>>> @@ -5355,6 +5355,23 @@ union bpf_attr {
>>>> *     Return
>>>> *             Current *ktime*.
>>>> *
>>>> + * long bpf_read_raw_record(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags)
>>>> + *     Description
>>>> + *             For an eBPF program attached to a perf event, retrieve the
>>>> + *             raw record associated to *ctx* and store it in the buffer
>>>> + *             pointed by *buf* up to size *size* bytes.
>>>> + *     Return
>>>> + *             On success, number of bytes written to *buf*. On error, a
>>>> + *             negative value.
>>>> + *
>>>> + *             The *flags* can be set to **BPF_F_GET_RAW_RECORD_SIZE** to
>>>> + *             instead return the number of bytes required to store the raw
>>>> + *             record. If this flag is set, *buf* may be NULL.
>>> 
>>> It looks pretty ugly from a usability standpoint to have one helper
>>> doing completely different things and returning two different values
>>> based on BPF_F_GET_RAW_RECORD_SIZE.
>> 
>> Yeah, I had the same thought when I first looked at it. But that's the
>> exact syntax with bpf_read_branch_records(). Well, we still have time
>> to fix the new helper..
>> 
>>> 
>>> I'm not sure what's best, but I have two alternative proposals:
>>> 
>>> 1. Add two helpers: one to get perf record information (and size will
>>> be one of them). Something like bpf_perf_record_query(ctx, flags)
>>> where you pass perf ctx and what kind of information you want to read
>>> (through flags), and u64 return result returns that (see
>>> bpf_ringbuf_query() for such approach). And then have separate helper
>>> to read data.
>>> 
>>> 2. Keep one helper, but specify that it always returns record size,
>>> even if user specified smaller size to read. And then allow passing
>>> buf==NULL && size==0. So passing NULL, 0 -- you get record size.
>>> Passing non-NULL buf -- you read data.
>> 
>> AFAICT, this is also confusing.
>> 
> 
> this is analogous to snprintf() behavior, so not that new and
> surprising when you think about it. But if query + read makes more
> sense, then it's fine by me

Given the name discussion (the other email), I now like one API better.

Actually, since we are on this, can we make it more generic, and handle
all possible PERF_SAMPLE_* (in enum perf_event_sample_format)? Something
like:

long bpf_perf_event_read_sample(void *ctx, void *buf, u64 size, u64 flags);

WDYT Namhyung?

Another idea is to add another parameter, so that we can pick which 
PERF_SAMPLE_* to output via bpf_perf_event_read_sample().

I think this will cover all cases with sample perf_event. Thoughts?
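
A sketch of that shape, with the PERF_SAMPLE_* selector (values from enum
perf_event_sample_format) passed in *flags*; the helper itself is only a
proposal at this point:

	char buf[256];
	__u64 ip;
	long n;

	/* Copy one selected sample field into the buffer. */
	n = bpf_perf_event_read_sample(ctx, buf, sizeof(buf), PERF_SAMPLE_RAW);

	/* Fixed-size fields could reuse the same call with a small buffer. */
	bpf_perf_event_read_sample(ctx, &ip, sizeof(ip), PERF_SAMPLE_IP);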

Thanks,
Song



> 
>> Maybe we should use two kfuncs for this?
>> 
>> Thanks,
>> Song
>> 
>>> 
>>> 
>>> And also, "read_raw_record" is way too generic. We have
>>> bpf_perf_prog_read_value(), let's use "bpf_perf_read_raw_record()" as
>>> a name. We should have called bpf_read_branch_records() as
>>> bpf_perf_read_branch_records(), probably, as well. But it's too late.
>>> 
>>>> + *
>>>> + *             **-EINVAL** if arguments invalid or **size** not a multiple
>>>> + *             of **sizeof**\ (u64\ ).
>>>> + *
>>>> + *             **-ENOENT** if the event does not have raw records.
>>>> */
>>>> #define __BPF_FUNC_MAPPER(FN)          \
>>>>       FN(unspec),                     \
>>>> @@ -5566,6 +5583,7 @@ union bpf_attr {
>>>>       FN(tcp_raw_check_syncookie_ipv4),       \
>>>>       FN(tcp_raw_check_syncookie_ipv6),       \
>>>>       FN(ktime_get_tai_ns),           \
>>>> +       FN(read_raw_record),            \
>>>>       /* */
>>>> 
>>> 
>>> [...]
>>
Namhyung Kim Aug. 26, 2022, 5:22 a.m. UTC | #15
On Thu, Aug 25, 2022 at 7:35 PM Song Liu <songliubraving@fb.com> wrote:
>
>
>
> > On Aug 25, 2022, at 4:03 PM, Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote:
> >
> > On Thu, Aug 25, 2022 at 3:08 PM Song Liu <songliubraving@fb.com> wrote:
> >>
> >>
> >>
> >>> On Aug 25, 2022, at 2:33 PM, Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote:
> >>>
> >>> On Tue, Aug 23, 2022 at 2:04 PM Namhyung Kim <namhyung@kernel.org> wrote:
> >>>> + * long bpf_read_raw_record(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags)
> >>>> + *     Description
> >>>> + *             For an eBPF program attached to a perf event, retrieve the
> >>>> + *             raw record associated to *ctx* and store it in the buffer
> >>>> + *             pointed by *buf* up to size *size* bytes.
> >>>> + *     Return
> >>>> + *             On success, number of bytes written to *buf*. On error, a
> >>>> + *             negative value.
> >>>> + *
> >>>> + *             The *flags* can be set to **BPF_F_GET_RAW_RECORD_SIZE** to
> >>>> + *             instead return the number of bytes required to store the raw
> >>>> + *             record. If this flag is set, *buf* may be NULL.
> >>>
> >>> It looks pretty ugly from a usability standpoint to have one helper
> >>> doing completely different things and returning two different values
> >>> based on BPF_F_GET_RAW_RECORD_SIZE.
> >>
> >> Yeah, I had the same thought when I first looked at it. But that's the
> >> exact syntax with bpf_read_branch_records(). Well, we still have time
> >> to fix the new helper..
> >>
> >>>
> >>> I'm not sure what's best, but I have two alternative proposals:
> >>>
> >>> 1. Add two helpers: one to get perf record information (and size will
> >>> be one of them). Something like bpf_perf_record_query(ctx, flags)
> >>> where you pass perf ctx and what kind of information you want to read
> >>> (through flags), and u64 return result returns that (see
> >>> bpf_ringbuf_query() for such approach). And then have separate helper
> >>> to read data.
> >>>
> >>> 2. Keep one helper, but specify that it always returns record size,
> >>> even if user specified smaller size to read. And then allow passing
> >>> buf==NULL && size==0. So passing NULL, 0 -- you get record size.
> >>> Passing non-NULL buf -- you read data.
> >>
> >> AFAICT, this is also confusing.
> >>
> >
> > this is analogous to snprintf() behavior, so not that new and
> > surprising when you think about it. But if query + read makes more
> > sense, then it's fine by me
>
> Given the name discussion (the other email), I now like one API better.
>
> Actually, since we are on this, can we make it more generic, and handle
> all possible PERF_SAMPLE_* (in enum perf_event_sample_format)? Something
> like:
>
> long bpf_perf_event_read_sample(void *ctx, void *buf, u64 size, u64 flags);
>
> WDYT Namhyung?

Do you mean reading the whole sample data at once?
Then it needs to parse the sample data format properly
which is non trivial due to a number of variable length
fields like callchains and branch stack, etc.

Also I'm afraid I might need event configuration info
other than sample data like attr.type, attr.config,
attr.sample_type and so on.

Hmm.. maybe we can add it to the ctx directly like ctx.attr_type?

>
> Another idea is to add another parameter, so that we can pick which
> PERF_SAMPLE_* to output via bpf_perf_event_read_sample().
>
> I think this will cover all cases with sample perf_event. Thoughts?

Yeah, I like this more and it looks easier to use.

Thanks,
Namhyung
Song Liu Aug. 26, 2022, 5:53 a.m. UTC | #16
On Thu, Aug 25, 2022 at 10:22 PM Namhyung Kim <namhyung@kernel.org> wrote:
>
> On Thu, Aug 25, 2022 at 7:35 PM Song Liu <songliubraving@fb.com> wrote:
> >
> >
> >
> > > On Aug 25, 2022, at 4:03 PM, Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote:
> > >
> > > On Thu, Aug 25, 2022 at 3:08 PM Song Liu <songliubraving@fb.com> wrote:
> > >>
> > >>
> > >>
> > >>> On Aug 25, 2022, at 2:33 PM, Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote:
> > >>>
> > >>> On Tue, Aug 23, 2022 at 2:04 PM Namhyung Kim <namhyung@kernel.org> wrote:
> > >>>> + * long bpf_read_raw_record(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags)
> > >>>> + *     Description
> > >>>> + *             For an eBPF program attached to a perf event, retrieve the
> > >>>> + *             raw record associated to *ctx* and store it in the buffer
> > >>>> + *             pointed by *buf* up to size *size* bytes.
> > >>>> + *     Return
> > >>>> + *             On success, number of bytes written to *buf*. On error, a
> > >>>> + *             negative value.
> > >>>> + *
> > >>>> + *             The *flags* can be set to **BPF_F_GET_RAW_RECORD_SIZE** to
> > >>>> + *             instead return the number of bytes required to store the raw
> > >>>> + *             record. If this flag is set, *buf* may be NULL.
> > >>>
> > >>> It looks pretty ugly from a usability standpoint to have one helper
> > >>> doing completely different things and returning two different values
> > >>> based on BPF_F_GET_RAW_RECORD_SIZE.
> > >>
> > >> Yeah, I had the same thought when I first looked at it. But that's the
> > >> exact syntax with bpf_read_branch_records(). Well, we still have time
> > >> to fix the new helper..
> > >>
> > >>>
> > >>> I'm not sure what's best, but I have two alternative proposals:
> > >>>
> > >>> 1. Add two helpers: one to get perf record information (and size will
> > >>> be one of them). Something like bpf_perf_record_query(ctx, flags)
> > >>> where you pass perf ctx and what kind of information you want to read
> > >>> (through flags), and u64 return result returns that (see
> > >>> bpf_ringbuf_query() for such approach). And then have separate helper
> > >>> to read data.
> > >>>
> > >>> 2. Keep one helper, but specify that it always returns record size,
> > >>> even if user specified smaller size to read. And then allow passing
> > >>> buf==NULL && size==0. So passing NULL, 0 -- you get record size.
> > >>> Passing non-NULL buf -- you read data.
> > >>
> > >> AFAICT, this is also confusing.
> > >>
> > >
> > > this is analogous to snprintf() behavior, so not that new and
> > > surprising when you think about it. But if query + read makes more
> > > sense, then it's fine by me
> >
> > Given the name discussion (the other email), I now like one API better.
> >
> > Actually, since we are on this, can we make it more generic, and handle
> > all possible PERF_SAMPLE_* (in enum perf_event_sample_format)? Something
> > like:
> >
> > long bpf_perf_event_read_sample(void *ctx, void *buf, u64 size, u64 flags);
> >
> > WDYT Namhyung?
>
> Do you mean reading the whole sample data at once?
> Then it needs to parse the sample data format properly
> which is non trivial due to a number of variable length
> fields like callchains and branch stack, etc.
>
> Also I'm afraid I might need event configuration info
> other than sample data like attr.type, attr.config,
> attr.sample_type and so on.
>
> Hmm.. maybe we can add it to the ctx directly like ctx.attr_type?

The user should have access to the perf_event_attr used to
create the event. This is also available in ctx->event->attr.

Would this work?

Thanks,
Song

>
> >
> > Another idea is to add another parameter, so that we can pick which
> > PERF_SAMPLE_* to output via bpf_perf_event_read_sample().
> >
> > I think this will cover all cases with sample perf_event. Thoughts?
>
> Yeah, I like this more and it looks easier to use.
Namhyung Kim Aug. 26, 2022, 4:33 p.m. UTC | #17
On Thu, Aug 25, 2022 at 10:53 PM Song Liu <song@kernel.org> wrote:
>
> On Thu, Aug 25, 2022 at 10:22 PM Namhyung Kim <namhyung@kernel.org> wrote:
> >
> > On Thu, Aug 25, 2022 at 7:35 PM Song Liu <songliubraving@fb.com> wrote:
> > > Actually, since we are on this, can we make it more generic, and handle
> > > all possible PERF_SAMPLE_* (in enum perf_event_sample_format)? Something
> > > like:
> > >
> > > long bpf_perf_event_read_sample(void *ctx, void *buf, u64 size, u64 flags);
> > >
> > > WDYT Namhyung?
> >
> > Do you mean reading the whole sample data at once?
> > Then it needs to parse the sample data format properly
> > which is non trivial due to a number of variable length
> > fields like callchains and branch stack, etc.
> >
> > Also I'm afraid I might need event configuration info
> > other than sample data like attr.type, attr.config,
> > attr.sample_type and so on.
> >
> > Hmm.. maybe we can add it to the ctx directly like ctx.attr_type?
>
> The user should have access to the perf_event_attr used to
> create the event. This is also available in ctx->event->attr.

Do you mean from BPF?  I'd like to have a generic BPF program
that can handle various filtering according to the command line
arguments.  I'm not sure but it might do something differently
for each event based on the attr settings.

Thanks,
Namhyung
Song Liu Aug. 26, 2022, 6:09 p.m. UTC | #18
> On Aug 26, 2022, at 9:33 AM, Namhyung Kim <namhyung@kernel.org> wrote:
> 
> On Thu, Aug 25, 2022 at 10:53 PM Song Liu <song@kernel.org> wrote:
>> 
>> On Thu, Aug 25, 2022 at 10:22 PM Namhyung Kim <namhyung@kernel.org> wrote:
>>> 
>>> On Thu, Aug 25, 2022 at 7:35 PM Song Liu <songliubraving@fb.com> wrote:
>>>> Actually, since we are on this, can we make it more generic, and handle
>>>> all possible PERF_SAMPLE_* (in enum perf_event_sample_format)? Something
>>>> like:
>>>> 
>>>> long bpf_perf_event_read_sample(void *ctx, void *buf, u64 size, u64 flags);
>>>> 
>>>> WDYT Namhyung?
>>> 
>>> Do you mean reading the whole sample data at once?
>>> Then it needs to parse the sample data format properly
>>> which is non trivial due to a number of variable length
>>> fields like callchains and branch stack, etc.
>>> 
>>> Also I'm afraid I might need event configuration info
>>> other than sample data like attr.type, attr.config,
>>> attr.sample_type and so on.
>>> 
>>> Hmm.. maybe we can add it to the ctx directly like ctx.attr_type?
>> 
>> The user should have access to the perf_event_attr used to
>> create the event. This is also available in ctx->event->attr.
> 
> Do you mean from BPF?  I'd like to have a generic BPF program
> that can handle various filtering according to the command line
> arguments.  I'm not sure but it might do something differently
> for each event based on the attr settings.

Yeah, we can access perf_event_attr from BPF program. Note that
the ctx for perf_event bpf program is struct bpf_perf_event_data_kern:

SEC("perf_event")
int perf_e(struct bpf_perf_event_data_kern *ctx)
{	
	...
}

struct bpf_perf_event_data_kern {
        bpf_user_pt_regs_t *regs;
        struct perf_sample_data *data;
        struct perf_event *event;
};

Alternatively, we can also have bpf user space configure the BPF 
program via a few knobs. 

And actually, we can just read ctx->data and get the raw record, 
right..?

Thanks,
Song
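
For what it's worth, the "configure the BPF program via a few knobs" approach
usually maps onto libbpf global data: the filter parameters are plain
const volatile globals in the BPF object, and user space fills them in through
the skeleton before load.  A minimal sketch, with made-up skeleton and
variable names:

/* user-space side; perf_filter.skel.h is a hypothetical libbpf skeleton */
#include "perf_filter.skel.h"

static struct perf_filter_bpf *open_and_configure(void)
{
	struct perf_filter_bpf *skel = perf_filter_bpf__open();

	if (!skel)
		return NULL;

	/* the "knobs": read-only globals, fixed before the program is loaded */
	skel->rodata->text_start = 0x400000;
	skel->rodata->text_end   = 0x800000;

	if (perf_filter_bpf__load(skel)) {
		perf_filter_bpf__destroy(skel);
		return NULL;
	}
	return skel;
}

On the BPF side these would simply be declared as const volatile __u64
globals and used in the filter condition.
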
Song Liu Aug. 26, 2022, 6:44 p.m. UTC | #19
> On Aug 26, 2022, at 11:09 AM, Song Liu <songliubraving@fb.com> wrote:
> 
> 
> 
>> On Aug 26, 2022, at 9:33 AM, Namhyung Kim <namhyung@kernel.org> wrote:
>> 
>> On Thu, Aug 25, 2022 at 10:53 PM Song Liu <song@kernel.org> wrote:
>>> 
>>> On Thu, Aug 25, 2022 at 10:22 PM Namhyung Kim <namhyung@kernel.org> wrote:
>>>> 
>>>> On Thu, Aug 25, 2022 at 7:35 PM Song Liu <songliubraving@fb.com> wrote:
>>>>> Actually, since we are on this, can we make it more generic, and handle
>>>>> all possible PERF_SAMPLE_* (in enum perf_event_sample_format)? Something
>>>>> like:
>>>>> 
>>>>> long bpf_perf_event_read_sample(void *ctx, void *buf, u64 size, u64 flags);
>>>>> 
>>>>> WDYT Namhyung?
>>>> 
>>>> Do you mean reading the whole sample data at once?
>>>> Then it needs to parse the sample data format properly
>>>> which is non trivial due to a number of variable length
>>>> fields like callchains and branch stack, etc.
>>>> 
>>>> Also I'm afraid I might need event configuration info
>>>> other than sample data like attr.type, attr.config,
>>>> attr.sample_type and so on.
>>>> 
>>>> Hmm.. maybe we can add it to the ctx directly like ctx.attr_type?
>>> 
>>> The user should have access to the perf_event_attr used to
>>> create the event. This is also available in ctx->event->attr.
>> 
>> Do you mean from BPF?  I'd like to have a generic BPF program
>> that can handle various filtering according to the command line
>> arguments.  I'm not sure but it might do something differently
>> for each event based on the attr settings.
> 
> Yeah, we can access perf_event_attr from BPF program. Note that
> the ctx for perf_event bpf program is struct bpf_perf_event_data_kern:
> 
> SEC("perf_event")
> int perf_e(struct bpf_perf_event_data_kern *ctx)
> {	
> 	...
> }
> 
> struct bpf_perf_event_data_kern {
>        bpf_user_pt_regs_t *regs;
>        struct perf_sample_data *data;
>        struct perf_event *event;
> };
> 
> Alternatively, we can also have bpf user space configure the BPF 
> program via a few knobs. 
> 
> And actually, we can just read ctx->data and get the raw record, 
> right..?

Played with this for a little bit. ctx->data appears to be not 
reliable sometimes. I guess (not 100% sure) this is because we 
call bpf program before event->orig_overflow_handler. We can 
probably add a flag to specify we want to call orig_overflow_handler
first. 

Thanks,
Song
Namhyung Kim Aug. 26, 2022, 7:21 p.m. UTC | #20
On Fri, Aug 26, 2022 at 11:09 AM Song Liu <songliubraving@fb.com> wrote:
>
>
>
> > On Aug 26, 2022, at 9:33 AM, Namhyung Kim <namhyung@kernel.org> wrote:
> >
> > On Thu, Aug 25, 2022 at 10:53 PM Song Liu <song@kernel.org> wrote:
> >>
> >> On Thu, Aug 25, 2022 at 10:22 PM Namhyung Kim <namhyung@kernel.org> wrote:
> >>>
> >>> On Thu, Aug 25, 2022 at 7:35 PM Song Liu <songliubraving@fb.com> wrote:
> >>>> Actually, since we are on this, can we make it more generic, and handle
> >>>> all possible PERF_SAMPLE_* (in enum perf_event_sample_format)? Something
> >>>> like:
> >>>>
> >>>> long bpf_perf_event_read_sample(void *ctx, void *buf, u64 size, u64 flags);
> >>>>
> >>>> WDYT Namhyung?
> >>>
> >>> Do you mean reading the whole sample data at once?
> >>> Then it needs to parse the sample data format properly
> >>> which is non trivial due to a number of variable length
> >>> fields like callchains and branch stack, etc.
> >>>
> >>> Also I'm afraid I might need event configuration info
> >>> other than sample data like attr.type, attr.config,
> >>> attr.sample_type and so on.
> >>>
> >>> Hmm.. maybe we can add it to the ctx directly like ctx.attr_type?
> >>
> >> The user should have access to the perf_event_attr used to
> >> create the event. This is also available in ctx->event->attr.
> >
> > Do you mean from BPF?  I'd like to have a generic BPF program
> > that can handle various filtering according to the command line
> > arguments.  I'm not sure but it might do something differently
> > for each event based on the attr settings.
>
> Yeah, we can access perf_event_attr from BPF program. Note that
> the ctx for perf_event bpf program is struct bpf_perf_event_data_kern:
>
> SEC("perf_event")
> int perf_e(struct bpf_perf_event_data_kern *ctx)
> {
>         ...
> }
>
> struct bpf_perf_event_data_kern {
>         bpf_user_pt_regs_t *regs;
>         struct perf_sample_data *data;
>         struct perf_event *event;
> };

I didn't know that it's allowed to access the kernel data directly.
For some reason, I thought it should use the fields in bpf_perf_event_data
only, like sample_period and addr, and that the verifier would convert
accesses to them according to pe_prog_convert_ctx_access().

>
> Alternatively, we can also have bpf user space configure the BPF
> program via a few knobs.
>
> And actually, we can just read ctx->data and get the raw record,
> right..?

If it's possible, sure, it'd be more powerful.

Thanks,
Namhyung
Namhyung Kim Aug. 26, 2022, 7:30 p.m. UTC | #21
On Fri, Aug 26, 2022 at 11:45 AM Song Liu <songliubraving@fb.com> wrote:

> > And actually, we can just read ctx->data and get the raw record,
> > right..?
>
> Played with this for a little bit. ctx->data appears to be not
> reliable sometimes. I guess (not 100% sure) this is because we
> call bpf program before event->orig_overflow_handler. We can
> probably add a flag to specify we want to call orig_overflow_handler
> first.

I'm not sure.  The sample_data should be provided by the caller
of perf_event_overflow.  So I guess the bpf program should see
a valid ctx->data.

Also I want to control calling the orig_overflow_handler based
on the return value of the BPF program.  So calling the orig
handler before BPF won't work for me. :)

Thanks,
Namhyung
Song Liu Aug. 26, 2022, 8:52 p.m. UTC | #22
> On Aug 26, 2022, at 12:21 PM, Namhyung Kim <namhyung@kernel.org> wrote:
> 
> On Fri, Aug 26, 2022 at 11:09 AM Song Liu <songliubraving@fb.com> wrote:
>> 
>> 
>> 
>>> On Aug 26, 2022, at 9:33 AM, Namhyung Kim <namhyung@kernel.org> wrote:
>>> 
>>> On Thu, Aug 25, 2022 at 10:53 PM Song Liu <song@kernel.org> wrote:
>>>> 
>>>> On Thu, Aug 25, 2022 at 10:22 PM Namhyung Kim <namhyung@kernel.org> wrote:
>>>>> 
>>>>> On Thu, Aug 25, 2022 at 7:35 PM Song Liu <songliubraving@fb.com> wrote:
>>>>>> Actually, since we are on this, can we make it more generic, and handle
>>>>>> all possible PERF_SAMPLE_* (in enum perf_event_sample_format)? Something
>>>>>> like:
>>>>>> 
>>>>>> long bpf_perf_event_read_sample(void *ctx, void *buf, u64 size, u64 flags);
>>>>>> 
>>>>>> WDYT Namhyung?
>>>>> 
>>>>> Do you mean reading the whole sample data at once?
>>>>> Then it needs to parse the sample data format properly
>>>>> which is non trivial due to a number of variable length
>>>>> fields like callchains and branch stack, etc.
>>>>> 
>>>>> Also I'm afraid I might need event configuration info
>>>>> other than sample data like attr.type, attr.config,
>>>>> attr.sample_type and so on.
>>>>> 
>>>>> Hmm.. maybe we can add it to the ctx directly like ctx.attr_type?
>>>> 
>>>> The user should have access to the perf_event_attr used to
>>>> create the event. This is also available in ctx->event->attr.
>>> 
>>> Do you mean from BPF?  I'd like to have a generic BPF program
>>> that can handle various filtering according to the command line
>>> arguments.  I'm not sure but it might do something differently
>>> for each event based on the attr settings.
>> 
>> Yeah, we can access perf_event_attr from BPF program. Note that
>> the ctx for perf_event bpf program is struct bpf_perf_event_data_kern:
>> 
>> SEC("perf_event")
>> int perf_e(struct bpf_perf_event_data_kern *ctx)
>> {
>>        ...
>> }
>> 
>> struct bpf_perf_event_data_kern {
>>        bpf_user_pt_regs_t *regs;
>>        struct perf_sample_data *data;
>>        struct perf_event *event;
>> };
> 
> I didn't know that it's allowed to access the kernel data directly.
> For some reason, I thought it should use the fields in bpf_perf_event_data
> only, like sample_period and addr, and that the verifier would convert
> accesses to them according to pe_prog_convert_ctx_access().

We can bypass pe_prog_convert_ctx_access() with something like:

	struct perf_event *event;
	u64 config;

        bpf_probe_read_kernel(&event, sizeof(void *), &ctx->event);
        bpf_probe_read_kernel(&config, sizeof(u64), &event->attr.config);

Thanks,
Song
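
Put into a complete (if unverified) program, the snippet above would look
roughly like this.  Whether the verifier accepts passing a ctx-relative
pointer to bpf_probe_read_kernel() is exactly what is being probed in this
thread, so treat it as a sketch rather than a known-good pattern:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

SEC("perf_event")
int read_attr(struct bpf_perf_event_data_kern *ctx)
{
	struct perf_event *event = NULL;
	__u32 type = 0;
	__u64 config = 0;

	/* pull the event pointer out of the kernel-side context ... */
	bpf_probe_read_kernel(&event, sizeof(event), &ctx->event);
	/* ... then read the attr fields from it */
	bpf_probe_read_kernel(&type, sizeof(type), &event->attr.type);
	bpf_probe_read_kernel(&config, sizeof(config), &event->attr.config);

	bpf_printk("attr.type=%u attr.config=%llu", type, config);
	return 1;
}

char LICENSE[] SEC("license") = "GPL";

With CO-RE, BPF_CORE_READ(event, attr.config) would be the more portable way
to do the last two reads.
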
Song Liu Aug. 26, 2022, 8:58 p.m. UTC | #23
> On Aug 26, 2022, at 12:30 PM, Namhyung Kim <namhyung@kernel.org> wrote:
> 
> On Fri, Aug 26, 2022 at 11:45 AM Song Liu <songliubraving@fb.com> wrote:
> 
>>> And actually, we can just read ctx->data and get the raw record,
>>> right..?
>> 
>> Played with this for a little bit. ctx->data appears to be not
>> reliable sometimes. I guess (not 100% sure) this is because we
>> call bpf program before event->orig_overflow_handler. We can
>> probably add a flag to specify we want to call orig_overflow_handler
>> first.
> 
> I'm not sure.  The sample_data should be provided by the caller
> of perf_event_overflow.  So I guess the bpf program should see
> a valid ctx->data.

Let's dig into this. Maybe we need some small changes in 
pe_prog_convert_ctx_access. 

> Also I want to control calling the orig_overflow_handler based
> on the return value of the BPF program.  So calling the orig
> handler before BPF won't work for me. :)

Interesting. Could you share more information about the use case?

Thanks,
Song
Namhyung Kim Aug. 26, 2022, 9:12 p.m. UTC | #24
On Fri, Aug 26, 2022 at 1:59 PM Song Liu <songliubraving@fb.com> wrote:
>
>
>
> > On Aug 26, 2022, at 12:30 PM, Namhyung Kim <namhyung@kernel.org> wrote:
> >
> > On Fri, Aug 26, 2022 at 11:45 AM Song Liu <songliubraving@fb.com> wrote:
> >
> >>> And actually, we can just read ctx->data and get the raw record,
> >>> right..?
> >>
> >> Played with this for a little bit. ctx->data appears to be not
> >> reliable sometimes. I guess (not 100% sure) this is because we
> >> call bpf program before event->orig_overflow_handler. We can
> >> probably add a flag to specify we want to call orig_overflow_handler
> >> first.
> >
> > I'm not sure.  The sample_data should be provided by the caller
> > of perf_event_overflow.  So I guess the bpf program should see
> > a valid ctx->data.
>
> Let's dig into this. Maybe we need some small changes in
> pe_prog_convert_ctx_access.

Sure, can you explain the problem in detail and share your program?

>
> > Also I want to control calling the orig_overflow_handler based
> > on the return value of the BPF program.  So calling the orig
> > handler before BPF won't work for me. :)
>
> Interesting. Could you share more information about the use case?

Well.. it's nothing new.  The bpf_overflow_handler calls the
orig_overflow_handler (which writes the sample to the ring buffer)
only if the BPF program returns non-zero.  Then I can drop unnecessary
samples based on the sample data by returning 0.

The possible use cases are
1. when you want to sample from specific code ranges only
2. when hardware sets specific bits in raw data

Thanks,
Namhyung
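
Use case 1 can already be sketched with nothing but the existing perf_event
program context, since ctx->regs exposes the sampled register state.  A
minimal, illustrative filter follows; the text_start/text_end globals are
made-up knobs that user space would fill in via the skeleton, and PT_REGS_IP
needs the usual __TARGET_ARCH_* define:

#include <linux/ptrace.h>
#include <linux/bpf.h>
#include <linux/bpf_perf_event.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

const volatile __u64 text_start = 0;
const volatile __u64 text_end = ~0ULL;

SEC("perf_event")
int filter_ip(struct bpf_perf_event_data *ctx)
{
	__u64 ip = PT_REGS_IP(&ctx->regs);

	/* returning 0 makes bpf_overflow_handler skip the original
	 * overflow handler, i.e. the sample is dropped */
	return ip >= text_start && ip < text_end;
}

char LICENSE[] SEC("license") = "GPL";
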
Song Liu Aug. 26, 2022, 9:25 p.m. UTC | #25
> On Aug 26, 2022, at 2:12 PM, Namhyung Kim <namhyung@kernel.org> wrote:
> 
> On Fri, Aug 26, 2022 at 1:59 PM Song Liu <songliubraving@fb.com> wrote:
>> 
>> 
>> 
>>> On Aug 26, 2022, at 12:30 PM, Namhyung Kim <namhyung@kernel.org> wrote:
>>> 
>>> On Fri, Aug 26, 2022 at 11:45 AM Song Liu <songliubraving@fb.com> wrote:
>>> 
>>>>> And actually, we can just read ctx->data and get the raw record,
>>>>> right..?
>>>> 
>>>> Played with this for a little bit. ctx->data appears to be not
>>>> reliable sometimes. I guess (not 100% sure) this is because we
>>>> call bpf program before event->orig_overflow_handler. We can
>>>> probably add a flag to specify we want to call orig_overflow_handler
>>>> first.
>>> 
>>> I'm not sure.  The sample_data should be provided by the caller
>>> of perf_event_overflow.  So I guess the bpf program should see
>>> a valid ctx->data.
>> 
>> Let's dig into this. Maybe we need some small changes in
>> pe_prog_convert_ctx_access.
> 
> Sure, can you explain the problem in detail and share your program?

I pushed the code to

 https://git.kernel.org/pub/scm/linux/kernel/git/song/linux.git/log/?h=test-perf-event

The code is in tools/bpf/perf-test/. 

The problem is we cannot get reliable print of data->cpu_entry in 
/sys/kernel/tracing/trace. 

> 
>> 
>>> Also I want to control calling the orig_overflow_handler based
>>> on the return value of the BPF program.  So calling the orig
>>> handler before BPF won't work for me. :)
>> 
>> Interesting. Could you share more information about the use case?
> 
> Well.. it's nothing new.  The bpf_overflow_handler calls the
> orig_overflow_handler (which writes the sample to the ring buffer)
> only if the BPF program returns non-zero.  Then I can drop unnecessary
> samples based on the sample data by returning 0.
> 
> The possible use cases are
> 1. when you want to sample from specific code ranges only
> 2. when hardware sets specific bits in raw data

I like this idea. We already using BPF in counting perf_event. 
Now it is time to use it in sampling perf_event. :)

Thanks,
Song
Namhyung Kim Aug. 27, 2022, 6:25 a.m. UTC | #26
On Fri, Aug 26, 2022 at 2:26 PM Song Liu <songliubraving@fb.com> wrote:
>
>
>
> > On Aug 26, 2022, at 2:12 PM, Namhyung Kim <namhyung@kernel.org> wrote:
> >
> > On Fri, Aug 26, 2022 at 1:59 PM Song Liu <songliubraving@fb.com> wrote:
> >>
> >>
> >>
> >>> On Aug 26, 2022, at 12:30 PM, Namhyung Kim <namhyung@kernel.org> wrote:
> >>>
> >>> On Fri, Aug 26, 2022 at 11:45 AM Song Liu <songliubraving@fb.com> wrote:
> >>>
> >>>>> And actually, we can just read ctx->data and get the raw record,
> >>>>> right..?
> >>>>
> >>>> Played with this for a little bit. ctx->data appears to be not
> >>>> reliable sometimes. I guess (not 100% sure) this is because we
> >>>> call bpf program before event->orig_overflow_handler. We can
> >>>> probably add a flag to specify we want to call orig_overflow_handler
> >>>> first.
> >>>
> >>> I'm not sure.  The sample_data should be provided by the caller
> >>> of perf_event_overflow.  So I guess the bpf program should see
> >>> a valid ctx->data.
> >>
> >> Let's dig into this. Maybe we need some small changes in
> >> pe_prog_convert_ctx_access.
> >
> > Sure, can you explain the problem in detail and share your program?
>
> I pushed the code to
>
>  https://git.kernel.org/pub/scm/linux/kernel/git/song/linux.git/log/?h=test-perf-event
>
> The code is in tools/bpf/perf-test/.
>
> The problem is we cannot get reliable print of data->cpu_entry in
> /sys/kernel/tracing/trace.

Ah, right.  I've realized that the sample data is passed before being fully
initialized.  Please see perf_sample_data_init().  The other members
are initialized right before being written to the ring buffer in the
orig_overflow_handler (__perf_event_output).

That explains why pe_prog_convert_ctx_access() handles
data and period specially.  We need to handle it first.

Thanks,
Namhyung
Song Liu Aug. 29, 2022, 7:20 a.m. UTC | #27
> On Aug 26, 2022, at 11:25 PM, Namhyung Kim <namhyung@kernel.org> wrote:
> 
> On Fri, Aug 26, 2022 at 2:26 PM Song Liu <songliubraving@fb.com> wrote:
>> 
>> 
>> 
>>> On Aug 26, 2022, at 2:12 PM, Namhyung Kim <namhyung@kernel.org> wrote:
>>> 
>>> On Fri, Aug 26, 2022 at 1:59 PM Song Liu <songliubraving@fb.com> wrote:
>>>> 
>>>> 
>>>> 
>>>>> On Aug 26, 2022, at 12:30 PM, Namhyung Kim <namhyung@kernel.org> wrote:
>>>>> 
>>>>> On Fri, Aug 26, 2022 at 11:45 AM Song Liu <songliubraving@fb.com> wrote:
>>>>> 
>>>>>>> And actually, we can just read ctx->data and get the raw record,
>>>>>>> right..?
>>>>>> 
>>>>>> Played with this for a little bit. ctx->data appears to be not
>>>>>> reliable sometimes. I guess (not 100% sure) this is because we
>>>>>> call bpf program before event->orig_overflow_handler. We can
>>>>>> probably add a flag to specify we want to call orig_overflow_handler
>>>>>> first.
>>>>> 
>>>>> I'm not sure.  The sample_data should be provided by the caller
>>>>> of perf_event_overflow.  So I guess the bpf program should see
>>>>> a valid ctx->data.
>>>> 
>>>> Let's dig into this. Maybe we need some small changes in
>>>> pe_prog_convert_ctx_access.
>>> 
>>> Sure, can you explain the problem in detail and share your program?
>> 
>> I pushed the code to
>> 
>> https://git.kernel.org/pub/scm/linux/kernel/git/song/linux.git/log/?h=test-perf-event
>> 
>> The code is in tools/bpf/perf-test/.
>> 
>> The problem is we cannot get reliable print of data->cpu_entry in
>> /sys/kernel/tracing/trace.
> 
> Ah, right.  I've realized that the sample data is passed before being fully
> initialized.  Please see perf_sample_data_init().  The other members
> are initialized right before being written to the ring buffer in the
> orig_overflow_handler (__perf_event_output).
> 
> That explains why pe_prog_convert_ctx_access() handles
> data and period specially.  We need to handle it first.

Thanks for confirming this. I guess we will need a helper (or kfunc)
for the raw data.

Shall we make it more generic so that we can get other PERF_SAMPLE_*?

Thanks,
Song
Namhyung Kim Aug. 29, 2022, 8:11 p.m. UTC | #28
On Mon, Aug 29, 2022 at 12:21 AM Song Liu <songliubraving@fb.com> wrote:
>
>
>
> > On Aug 26, 2022, at 11:25 PM, Namhyung Kim <namhyung@kernel.org> wrote:
> >
> > On Fri, Aug 26, 2022 at 2:26 PM Song Liu <songliubraving@fb.com> wrote:
> >>
> >>
> >>
> >>> On Aug 26, 2022, at 2:12 PM, Namhyung Kim <namhyung@kernel.org> wrote:
> >>>
> >>> On Fri, Aug 26, 2022 at 1:59 PM Song Liu <songliubraving@fb.com> wrote:
> >>>>
> >>>>
> >>>>
> >>>>> On Aug 26, 2022, at 12:30 PM, Namhyung Kim <namhyung@kernel.org> wrote:
> >>>>>
> >>>>> On Fri, Aug 26, 2022 at 11:45 AM Song Liu <songliubraving@fb.com> wrote:
> >>>>>
> >>>>>>> And actually, we can just read ctx->data and get the raw record,
> >>>>>>> right..?
> >>>>>>
> >>>>>> Played with this for a little bit. ctx->data appears to be not
> >>>>>> reliable sometimes. I guess (not 100% sure) this is because we
> >>>>>> call bpf program before event->orig_overflow_handler. We can
> >>>>>> probably add a flag to specify we want to call orig_overflow_handler
> >>>>>> first.
> >>>>>
> >>>>> I'm not sure.  The sample_data should be provided by the caller
> >>>>> of perf_event_overflow.  So I guess the bpf program should see
> >>>>> a valid ctx->data.
> >>>>
> >>>> Let's dig into this. Maybe we need some small changes in
> >>>> pe_prog_convert_ctx_access.
> >>>
> >>> Sure, can you explain the problem in detail and share your program?
> >>
> >> I pushed the code to
> >>
> >> https://git.kernel.org/pub/scm/linux/kernel/git/song/linux.git/log/?h=test-perf-event
> >>
> >> The code is in tools/bpf/perf-test/.
> >>
> >> The problem is we cannot get reliable print of data->cpu_entry in
> >> /sys/kernel/tracing/trace.
> >
> > Ah, right.  I've realized that the sample data is passed before full
> > initialized.  Please see perf_sample_data_init().  The other members
> > are initialized right before written to the ring buffer in the
> > orig_overflow_handler (__perf_event_output).
> >
> > That explains why pe_prog_convert_ctx_access() handles
> > data and period specially.  We need to handle it first.
>
> Thanks for confirming this. I guess we will need a helper (or kfunc)
> for the raw data.
>
> Shall we make it more generic so that we can get other PERF_SAMPLE_*?

I don't think we can (or should be allowed to) get all the sample data, but
some of it would be useful for filtering.  Currently I'm only interested in
the raw data, but ip and page size seem useful too.

So I think it'd be better to have a generic helper rather than a specific
one.  But it'd require some refactoring to get the data before calling
BPF programs.  Let me work on it first.

Thanks,
Namhyung
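
For reference, this is roughly how the helper as posted in the patch below
would be used from a perf_event program.  The prototype and flag are taken
from the patch itself; the helper id is a placeholder, since the patch has
not been merged and no released header declares it:

#include <linux/bpf.h>
#include <linux/bpf_perf_event.h>
#include <bpf/bpf_helpers.h>

#define BPF_F_GET_RAW_RECORD_SIZE	(1ULL << 0)	/* from this patch */

/* placeholder id; a real build would get this from the updated uapi header */
static long (*bpf_read_raw_record)(void *ctx, void *buf, __u32 size,
				   __u64 flags) = (void *)212;

SEC("perf_event")
int filter_raw(struct bpf_perf_event_data *ctx)
{
	__u64 buf[8];
	long sz, copied;

	/* first call: no buffer, just ask how large the raw record is */
	sz = bpf_read_raw_record(ctx, NULL, 0, BPF_F_GET_RAW_RECORD_SIZE);
	if (sz <= 0)
		return 1;	/* no raw data; keep the sample */

	/* second call: copy up to sizeof(buf) bytes of the first fragment */
	copied = bpf_read_raw_record(ctx, buf, sizeof(buf), 0);
	if (copied < 8)
		return 1;

	/* keep the sample only when the first word is non-zero */
	return buf[0] != 0;
}

char LICENSE[] SEC("license") = "GPL";
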
diff mbox series

Patch

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 934a2a8beb87..af7f70564819 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5355,6 +5355,23 @@  union bpf_attr {
  *	Return
  *		Current *ktime*.
  *
+ * long bpf_read_raw_record(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags)
+ *	Description
+ *		For an eBPF program attached to a perf event, retrieve the
+ *		raw record associated with *ctx* and store it in the buffer
+ *		pointed to by *buf*, up to *size* bytes.
+ *	Return
+ *		On success, number of bytes written to *buf*. On error, a
+ *		negative value.
+ *
+ *		The *flags* can be set to **BPF_F_GET_RAW_RECORD_SIZE** to
+ *		instead return the number of bytes required to store the raw
+ *		record. If this flag is set, *buf* may be NULL.
+ *
+ *		**-EINVAL** if arguments are invalid or **size** is not a
+ *		multiple of **sizeof**\ (u32\ ).
+ *
+ *		**-ENOENT** if the event does not have raw records.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5566,6 +5583,7 @@  union bpf_attr {
 	FN(tcp_raw_check_syncookie_ipv4),	\
 	FN(tcp_raw_check_syncookie_ipv6),	\
 	FN(ktime_get_tai_ns),		\
+	FN(read_raw_record),		\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -5749,6 +5767,11 @@  enum {
 	BPF_F_EXCLUDE_INGRESS	= (1ULL << 4),
 };
 
+/* BPF_FUNC_read_raw_record flags. */
+enum {
+	BPF_F_GET_RAW_RECORD_SIZE	= (1ULL << 0),
+};
+
 #define __bpf_md_ptr(type, name)	\
 union {					\
 	type name;			\
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 68e5cdd24cef..db172b12e5f8 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -20,6 +20,7 @@ 
 #include <linux/fprobe.h>
 #include <linux/bsearch.h>
 #include <linux/sort.h>
+#include <linux/perf_event.h>
 
 #include <net/bpf_sk_storage.h>
 
@@ -1532,6 +1533,44 @@  static const struct bpf_func_proto bpf_read_branch_records_proto = {
 	.arg4_type      = ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_read_raw_record, struct bpf_perf_event_data_kern *, ctx,
+	   void *, buf, u32, size, u64, flags)
+{
+	struct perf_raw_record *raw = ctx->data->raw;
+	struct perf_raw_frag *frag;
+	u32 to_copy;
+
+	if (unlikely(flags & ~BPF_F_GET_RAW_RECORD_SIZE))
+		return -EINVAL;
+
+	if (unlikely(!raw))
+		return -ENOENT;
+
+	if (flags & BPF_F_GET_RAW_RECORD_SIZE)
+		return raw->size;
+
+	if (!buf || (size % sizeof(u32) != 0))
+		return -EINVAL;
+
+	frag = &raw->frag;
+	WARN_ON_ONCE(!perf_raw_frag_last(frag));
+
+	to_copy = min_t(u32, frag->size, size);
+	memcpy(buf, frag->data, to_copy);
+
+	return to_copy;
+}
+
+static const struct bpf_func_proto bpf_read_raw_record_proto = {
+	.func           = bpf_read_raw_record,
+	.gpl_only       = true,
+	.ret_type       = RET_INTEGER,
+	.arg1_type      = ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_PTR_TO_MEM_OR_NULL,
+	.arg3_type      = ARG_CONST_SIZE_OR_ZERO,
+	.arg4_type      = ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *
 pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -1548,6 +1587,8 @@  pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_read_branch_records_proto;
 	case BPF_FUNC_get_attach_cookie:
 		return &bpf_get_attach_cookie_proto_pe;
+	case BPF_FUNC_read_raw_record:
+		return &bpf_read_raw_record_proto;
 	default:
 		return bpf_tracing_func_proto(func_id, prog);
 	}