Message ID | 20220218225058.12701-2-beaub@linux.microsoft.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | libtracefs: Add APIs for user_events to libtracefs | expand |
On 19.02.22 г. 0:50 ч., Beau Belgrave wrote: > diff --git a/include/tracefs-local.h b/include/tracefs-local.h > index bf157e1..e768cba 100644 > --- a/include/tracefs-local.h > +++ b/include/tracefs-local.h > @@ -119,4 +119,28 @@ int trace_rescan_events(struct tep_handle *tep, > struct tep_event *get_tep_event(struct tep_handle *tep, > const char *system, const char *name); > > +/* Internal interface for ftrace user events */ > + > +struct tracefs_user_event_group; > + > +struct tracefs_user_event > +{ > + int write_index; > + int status_index; > + int iovecs; > + int rels; > + int len; > + struct tracefs_user_event_group *group; > + struct tracefs_user_event *next; > +}; > + > +struct tracefs_user_event_group > +{ > + int fd; > + int mmap_len; > + char *mmap; > + pthread_mutex_t lock; > + struct tracefs_user_event *events; > +}; > + > #endif /* _TRACE_FS_LOCAL_H */ > diff --git a/include/tracefs.h b/include/tracefs.h > index 1848ad0..7871dfe 100644 > --- a/include/tracefs.h > +++ b/include/tracefs.h > @@ -571,4 +571,64 @@ struct tracefs_synth *tracefs_sql(struct tep_handle *tep, const char *name, > struct tep_event * > tracefs_synth_get_event(struct tep_handle *tep, struct tracefs_synth *synth); > > +/* User events */ > +enum tracefs_uevent_type { > + TRACEFS_UEVENT_END, > + TRACEFS_UEVENT_u8, > + TRACEFS_UEVENT_s8, > + TRACEFS_UEVENT_u16, > + TRACEFS_UEVENT_s16, > + TRACEFS_UEVENT_u32, > + TRACEFS_UEVENT_s32, > + TRACEFS_UEVENT_u64, > + TRACEFS_UEVENT_s64, > + TRACEFS_UEVENT_string, > + TRACEFS_UEVENT_struct, > + TRACEFS_UEVENT_varray, > + TRACEFS_UEVENT_vstring, > +}; > + > +enum tracefs_uevent_flags { > + /* None */ > + TRACEFS_UEVENT_FLAG_NONE = 0, > + > + /* When BPF is attached, use iterator/no copy */ > + TRACEFS_UEVENT_FLAG_bpf_iter = 1 << 0, > +}; > + > +struct tracefs_uevent_item { > + /* Type of item */ > + enum tracefs_uevent_type type; > + > + /* Length of data, optional during register */ > + int len; > + > + union { > + /* Used during write */ > + const void *data; > + > + /* Used during register */ > + const char *name; > + }; > +}; > + > +struct tracefs_user_event; > +struct tracefs_user_event_group; > + We've been trying to follow certain naming convention for the APIs and to provide similar usage patterns for all types of trace events that are supported by the library so far (dynamic, synthetic and histograms). If 'XXX' is the type of the event ('user_event' in your case), the pattern looks like this: tracefs_XXX_alloc() - this constructor just allocates memory and initializes the descriptor object without modifying anything on the system. We allow for multiple constructor function, in the case when your objects has to many possible configurations and it is hard to do everything in a single API. Looking into your implementation, such constructor can do half of the work done in 'tracefs_user_event_group_create()' int tracefs_XXX_create(struct tracefs_XXX *evt) - This is the API that actually adds your event on the system. Note that it takes just one argument that is the object itself. Again, looking into your implementation, this will combine the other half of tracefs_user_event_group_create() and tracefs_user_event_register(). int tracefs_XXX_destroy(struct tracefs_XXX *evt) This API removes your event from the system. The first argument is again the object. If needed, here you can use a second argument that is 'bool force'. int tracefs_XXX_free(struct tracefs_XXX *evt) - just to free the memory > +struct tracefs_user_event_group *tracefs_user_event_group_create(void); > + > +void tracefs_user_event_group_close(struct tracefs_user_event_group *group); > + > +int tracefs_user_event_delete(const char *name); > + > +struct tracefs_user_event * > +tracefs_user_event_register(struct tracefs_user_event_group *group, > + const char *name, enum tracefs_uevent_flags flags, > + struct tracefs_uevent_item *items); > + > +bool tracefs_user_event_test(struct tracefs_user_event *event); And maybe "test" is redundant. You can do the check in tracefs_XXX_create() and return an error if it fails. > + > +int tracefs_user_event_write(struct tracefs_user_event *event, > + struct tracefs_uevent_item *items); The "write" is OK. Maybe we can change 'tracefs_user_event' to 'tracefs_usrevent' since we already use 'tracefs_dynevent' for dynamic events. What do you think? Thanks! Yordan
On Mon, Feb 21, 2022 at 01:34:00PM +0200, Yordan Karadzhov wrote: > > > On 19.02.22 г. 0:50 ч., Beau Belgrave wrote: > > diff --git a/include/tracefs-local.h b/include/tracefs-local.h > > index bf157e1..e768cba 100644 > > --- a/include/tracefs-local.h > > +++ b/include/tracefs-local.h > > @@ -119,4 +119,28 @@ int trace_rescan_events(struct tep_handle *tep, > > struct tep_event *get_tep_event(struct tep_handle *tep, > > const char *system, const char *name); > > +/* Internal interface for ftrace user events */ > > + > > +struct tracefs_user_event_group; > > + > > +struct tracefs_user_event > > +{ > > + int write_index; > > + int status_index; > > + int iovecs; > > + int rels; > > + int len; > > + struct tracefs_user_event_group *group; > > + struct tracefs_user_event *next; > > +}; > > + > > +struct tracefs_user_event_group > > +{ > > + int fd; > > + int mmap_len; > > + char *mmap; > > + pthread_mutex_t lock; > > + struct tracefs_user_event *events; > > +}; > > + > > #endif /* _TRACE_FS_LOCAL_H */ > > diff --git a/include/tracefs.h b/include/tracefs.h > > index 1848ad0..7871dfe 100644 > > --- a/include/tracefs.h > > +++ b/include/tracefs.h > > @@ -571,4 +571,64 @@ struct tracefs_synth *tracefs_sql(struct tep_handle *tep, const char *name, > > struct tep_event * > > tracefs_synth_get_event(struct tep_handle *tep, struct tracefs_synth *synth); > > +/* User events */ > > +enum tracefs_uevent_type { > > + TRACEFS_UEVENT_END, > > + TRACEFS_UEVENT_u8, > > + TRACEFS_UEVENT_s8, > > + TRACEFS_UEVENT_u16, > > + TRACEFS_UEVENT_s16, > > + TRACEFS_UEVENT_u32, > > + TRACEFS_UEVENT_s32, > > + TRACEFS_UEVENT_u64, > > + TRACEFS_UEVENT_s64, > > + TRACEFS_UEVENT_string, > > + TRACEFS_UEVENT_struct, > > + TRACEFS_UEVENT_varray, > > + TRACEFS_UEVENT_vstring, > > +}; > > + > > +enum tracefs_uevent_flags { > > + /* None */ > > + TRACEFS_UEVENT_FLAG_NONE = 0, > > + > > + /* When BPF is attached, use iterator/no copy */ > > + TRACEFS_UEVENT_FLAG_bpf_iter = 1 << 0, > > +}; > > + > > +struct tracefs_uevent_item { > > + /* Type of item */ > > + enum tracefs_uevent_type type; > > + > > + /* Length of data, optional during register */ > > + int len; > > + > > + union { > > + /* Used during write */ > > + const void *data; > > + > > + /* Used during register */ > > + const char *name; > > + }; > > +}; > > + > > +struct tracefs_user_event; > > +struct tracefs_user_event_group; > > + > > We've been trying to follow certain naming convention for the APIs and to > provide similar usage patterns for all types of trace events that are > supported by the library so far (dynamic, synthetic and histograms). If > 'XXX' is the type of the event ('user_event' in your case), the pattern > looks like this: > > tracefs_XXX_alloc() - this constructor just allocates memory and initializes > the descriptor object without modifying anything on the system. We allow for > multiple constructor function, in the case when your objects has to many > possible configurations and it is hard to do everything in a single API. > Looking into your implementation, such constructor can do half of the work > done in 'tracefs_user_event_group_create()' > > int tracefs_XXX_create(struct tracefs_XXX *evt) - This is the API that > actually adds your event on the system. Note that it takes just one argument > that is the object itself. Again, looking into your implementation, this > will combine the other half of tracefs_user_event_group_create() and > tracefs_user_event_register(). Are you and Steven aligned on this convention? The request looked slightly different: See https://lore.kernel.org/linux-trace-devel/20220121192833.GA3128@kbox/T/#m2bcf53c373fbeaba2c46d1a053b3174171167e4e > > int tracefs_XXX_destroy(struct tracefs_XXX *evt) This API removes your event > from the system. The first argument is again the object. If needed, here you > can use a second argument that is 'bool force'. > > int tracefs_XXX_free(struct tracefs_XXX *evt) - just to free the memory > > > > +struct tracefs_user_event_group *tracefs_user_event_group_create(void); > > + > > +void tracefs_user_event_group_close(struct tracefs_user_event_group *group); > > + > > +int tracefs_user_event_delete(const char *name); > > + > > +struct tracefs_user_event * > > +tracefs_user_event_register(struct tracefs_user_event_group *group, > > + const char *name, enum tracefs_uevent_flags flags, > > + struct tracefs_uevent_item *items); > > + > > +bool tracefs_user_event_test(struct tracefs_user_event *event); > > And maybe "test" is redundant. You can do the check in tracefs_XXX_create() and return an error if it fails. > Test is required so user programs can tell when write should be called. Otherwise excessive calculations and stack data are pushed for no good reason. > > + > > +int tracefs_user_event_write(struct tracefs_user_event *event, > > + struct tracefs_uevent_item *items); > > The "write" is OK. > > Maybe we can change 'tracefs_user_event' to 'tracefs_usrevent' since we already use 'tracefs_dynevent' for dynamic events. > > What do you think? > I'm happy to do whatever, I just want to ensure you and Steven are aligned. > Thanks! > Yordan Thanks, -Beau
On Mon, 21 Feb 2022 09:57:20 -0800 Beau Belgrave <beaub@linux.microsoft.com> wrote: > > We've been trying to follow certain naming convention for the APIs and to > > provide similar usage patterns for all types of trace events that are > > supported by the library so far (dynamic, synthetic and histograms). If > > 'XXX' is the type of the event ('user_event' in your case), the pattern > > looks like this: > > > > tracefs_XXX_alloc() - this constructor just allocates memory and initializes > > the descriptor object without modifying anything on the system. We allow for > > multiple constructor function, in the case when your objects has to many > > possible configurations and it is hard to do everything in a single API. > > Looking into your implementation, such constructor can do half of the work > > done in 'tracefs_user_event_group_create()' > > > > int tracefs_XXX_create(struct tracefs_XXX *evt) - This is the API that > > actually adds your event on the system. Note that it takes just one argument > > that is the object itself. Again, looking into your implementation, this > > will combine the other half of tracefs_user_event_group_create() and > > tracefs_user_event_register(). > > Are you and Steven aligned on this convention? I would say you are both correct ;-) The problem is, this doesn't really match the other events. The other events are created in the kernel, and become a normal event. The big difference here is that the event is tested in user space. It will be hard to make it match the same criteria as kprobes, dynamic events, and eprobes. > > The request looked slightly different: > See https://lore.kernel.org/linux-trace-devel/20220121192833.GA3128@kbox/T/#m2bcf53c373fbeaba2c46d1a053b3174171167e4e > > > > > int tracefs_XXX_destroy(struct tracefs_XXX *evt) This API removes your event > > from the system. The first argument is again the object. If needed, here you > > can use a second argument that is 'bool force'. > > > > int tracefs_XXX_free(struct tracefs_XXX *evt) - just to free the memory > > > > > > > +struct tracefs_user_event_group *tracefs_user_event_group_create(void); > > > + > > > +void tracefs_user_event_group_close(struct tracefs_user_event_group *group); > > > + > > > +int tracefs_user_event_delete(const char *name); > > > + > > > +struct tracefs_user_event * > > > +tracefs_user_event_register(struct tracefs_user_event_group *group, > > > + const char *name, enum tracefs_uevent_flags flags, > > > + struct tracefs_uevent_item *items); > > > + > > > +bool tracefs_user_event_test(struct tracefs_user_event *event); > > > > And maybe "test" is redundant. You can do the check in tracefs_XXX_create() and return an error if it fails. > > > > Test is required so user programs can tell when write should be called. > Otherwise excessive calculations and stack data are pushed for no good > reason. This is the part I'm talking about. The "user_event" here is actually executed in the application, and this API is to handle it. This is where the user events diverge from the other events. The user events create the event like any other dynamic event (we could look at keeping that part similar with the other APIs). But then everything is different. When the event is created, the application is given a location on a special mmapped page that represents if the event is enabled or not. Then the application is to read it *at the event site* to know if it should record the event into the ring buffer or not. That's what the test is. It's equivalent to the "static_branch" that tracepoints use in the kernel, except this isn't a static branch (although we could possibly do something like that in the future!). The application will: if (tracefs_user_event_test(event)) { tracefs_user_event_write(event, ...); } Where if the event is enabled (by an external program), then it is to record the events. This is the API that does that. And test is not redundant at all. > > > > + > > > +int tracefs_user_event_write(struct tracefs_user_event *event, > > > + struct tracefs_uevent_item *items); > > > > The "write" is OK. > > > > Maybe we can change 'tracefs_user_event' to 'tracefs_usrevent' since we already use 'tracefs_dynevent' for dynamic events. > > > > What do you think? Not really. This is closer to tracefs_printf(). But thinking about this more, perhaps "write" is a bit confusing as we use it to write to tracefs files. What about: tracefs_user_event_record() ? As this is exactly what it is doing. It's recording the event. > > > > I'm happy to do whatever, I just want to ensure you and Steven are > aligned. Thanks! -- Steve
On 21.02.22 г. 21:16 ч., Steven Rostedt wrote: > On Mon, 21 Feb 2022 09:57:20 -0800 > Beau Belgrave <beaub@linux.microsoft.com> wrote: > >>> We've been trying to follow certain naming convention for the APIs and to >>> provide similar usage patterns for all types of trace events that are >>> supported by the library so far (dynamic, synthetic and histograms). If >>> 'XXX' is the type of the event ('user_event' in your case), the pattern >>> looks like this: >>> >>> tracefs_XXX_alloc() - this constructor just allocates memory and initializes >>> the descriptor object without modifying anything on the system. We allow for >>> multiple constructor function, in the case when your objects has to many >>> possible configurations and it is hard to do everything in a single API. >>> Looking into your implementation, such constructor can do half of the work >>> done in 'tracefs_user_event_group_create()' >>> >>> int tracefs_XXX_create(struct tracefs_XXX *evt) - This is the API that >>> actually adds your event on the system. Note that it takes just one argument >>> that is the object itself. Again, looking into your implementation, this >>> will combine the other half of tracefs_user_event_group_create() and >>> tracefs_user_event_register(). >> >> Are you and Steven aligned on this convention? > > I would say you are both correct ;-) > > The problem is, this doesn't really match the other events. The other > events are created in the kernel, and become a normal event. The big > difference here is that the event is tested in user space. It will be > hard to make it match the same criteria as kprobes, dynamic events, and > eprobes. > >> >> The request looked slightly different: >> See https://lore.kernel.org/linux-trace-devel/20220121192833.GA3128@kbox/T/#m2bcf53c373fbeaba2c46d1a053b3174171167e4e >> >>> >>> int tracefs_XXX_destroy(struct tracefs_XXX *evt) This API removes your event >>> from the system. The first argument is again the object. If needed, here you >>> can use a second argument that is 'bool force'. >>> >>> int tracefs_XXX_free(struct tracefs_XXX *evt) - just to free the memory >>> >>> >>>> +struct tracefs_user_event_group *tracefs_user_event_group_create(void); >>>> + >>>> +void tracefs_user_event_group_close(struct tracefs_user_event_group *group); >>>> + >>>> +int tracefs_user_event_delete(const char *name); >>>> + >>>> +struct tracefs_user_event * >>>> +tracefs_user_event_register(struct tracefs_user_event_group *group, >>>> + const char *name, enum tracefs_uevent_flags flags, >>>> + struct tracefs_uevent_item *items); >>>> + >>>> +bool tracefs_user_event_test(struct tracefs_user_event *event); >>> >>> And maybe "test" is redundant. You can do the check in tracefs_XXX_create() and return an error if it fails. >>> >> >> Test is required so user programs can tell when write should be called. >> Otherwise excessive calculations and stack data are pushed for no good >> reason. > > This is the part I'm talking about. The "user_event" here is actually > executed in the application, and this API is to handle it. This is > where the user events diverge from the other events. > > The user events create the event like any other dynamic event (we could > look at keeping that part similar with the other APIs). But then > everything is different. > > When the event is created, the application is given a location on a > special mmapped page that represents if the event is enabled or not. > Then the application is to read it *at the event site* to know if it > should record the event into the ring buffer or not. > > That's what the test is. It's equivalent to the "static_branch" that > tracepoints use in the kernel, except this isn't a static branch > (although we could possibly do something like that in the future!). > > The application will: > > if (tracefs_user_event_test(event)) { > tracefs_user_event_write(event, ...); > } > > Where if the event is enabled (by an external program), then it is to > record the events. This is the API that does that. > > And test is not redundant at all. Thanks a lot for clarifying this and sorry about my confusion! I have one last question. Do you consider as a valid use case that the library must support, someone to do a just 'test" without writing after this, or to "write" without testing first? thanks! Yordan > >> >>>> + >>>> +int tracefs_user_event_write(struct tracefs_user_event *event, >>>> + struct tracefs_uevent_item *items); >>> >>> The "write" is OK. >>> >>> Maybe we can change 'tracefs_user_event' to 'tracefs_usrevent' since we already use 'tracefs_dynevent' for dynamic events. >>> >>> What do you think? > > Not really. This is closer to tracefs_printf(). But thinking about this > more, perhaps "write" is a bit confusing as we use it to write to > tracefs files. > > What about: tracefs_user_event_record() ? > > As this is exactly what it is doing. It's recording the event. > > >>> >> >> I'm happy to do whatever, I just want to ensure you and Steven are >> aligned. > > Thanks! > > -- Steve
On Tue, 22 Feb 2022 08:27:31 +0200 Yordan Karadzhov <y.karadz@gmail.com> wrote: > I have one last question. Do you consider as a valid use case that > the library must support, someone to do a just 'test" without writing > after this, or to "write" without testing first? Actually, that's a very good point. I was thinking that I didn't like the "test" name, and was thinking of having it be: if (tracefs_user_event_enabled(event)) { tracefs_user_event_record(event, ...); } But I think you have a good point. Perhaps we should just have: tracefs_user_event_trace(event, ...); and it do all the work. But it would need to be a macro, that does: #define tracefs_user_event_trace(event, ...) \ do { \ if (tracefs_user_event_enabled(event)) { \ tracefs_user_event_record(event, ##__VA_ARGS); \ } \ } while (0) Because we do not want to have the compiler process the arguments when the event is not enabled. That would be too much overhead. But as a macro, it would work. Thoughts? -- Steve
On 22.02.22 г. 16:00 ч., Steven Rostedt wrote: > On Tue, 22 Feb 2022 08:27:31 +0200 > Yordan Karadzhov <y.karadz@gmail.com> wrote: > >> I have one last question. Do you consider as a valid use case that >> the library must support, someone to do a just 'test" without writing >> after this, or to "write" without testing first? > > Actually, that's a very good point. > > I was thinking that I didn't like the "test" name, and was thinking of > having it be: > > if (tracefs_user_event_enabled(event)) { > tracefs_user_event_record(event, ...); > } > > But I think you have a good point. Perhaps we should just have: > > tracefs_user_event_trace(event, ...); > > and it do all the work. But it would need to be a macro, that does: > > #define tracefs_user_event_trace(event, ...) \ I personally think that, using variadic arguments in library APIs is a not a good idea, because this enforces that the caller must know the number of those arguments at compile time. For me the original solution that uses an array of items is better. Or maybe we can have both as 2 different APIs. Thanks! Y. > do { \ > if (tracefs_user_event_enabled(event)) { \ > tracefs_user_event_record(event, ##__VA_ARGS); \ > } \ > } while (0) > > Because we do not want to have the compiler process the arguments when > the event is not enabled. That would be too much overhead. > > But as a macro, it would work. > > Thoughts? > > -- Steve
On Tue, 22 Feb 2022 16:25:35 +0200 Yordan Karadzhov <y.karadz@gmail.com> wrote: > I personally think that, using variadic arguments in library APIs is a > not a good idea, because this enforces that the caller must know the > number of those arguments at compile time. > > For me the original solution that uses an array of items is better. > Or maybe we can have both as 2 different APIs. Actually, I was using the va_args as an example. I really didn't care about the implementation of the arguments, except that they need to be added after the enable check. For now, lets just keep the two functions to check for the event being enabled and recording. I think the last names were the way to go: tracefs_user_event_enabled(); tracefs_user_event_record(); And keep the record using the array. If we want a macro, we could do: #define tracefs_user_event_trace(event, ...) \ do { \ if (tracefs_user_event_enabled(event)) { \ struct tracefs_uevent_item items[] = { \ ##__VA_ARGS__, \ { TRACEFS_UEVENT_END }, \ } \ tracefs_user_event_record(event, items); \ } \ } while (0) And the user could have: tracefs_user_event_trace(event, { TRACEFS_UEVENT_vstring, strlen(msg)+1, .data = msg }); Again, for those that do not want compile time knowledge of the arguments, you just use the normal interface, and those that want the helper, to use the macro. -- Steve
On Tue, Feb 22, 2022 at 09:00:21AM -0500, Steven Rostedt wrote: > On Tue, 22 Feb 2022 08:27:31 +0200 > Yordan Karadzhov <y.karadz@gmail.com> wrote: > > > I have one last question. Do you consider as a valid use case that > > the library must support, someone to do a just 'test" without writing > > after this, or to "write" without testing first? > > Actually, that's a very good point. > > I was thinking that I didn't like the "test" name, and was thinking of > having it be: > > if (tracefs_user_event_enabled(event)) { > tracefs_user_event_record(event, ...); > } I do like the renaming of test to enabled and write to record. > > But I think you have a good point. Perhaps we should just have: > > tracefs_user_event_trace(event, ...); > > and it do all the work. But it would need to be a macro, that does: > > #define tracefs_user_event_trace(event, ...) \ > do { \ > if (tracefs_user_event_enabled(event)) { \ > tracefs_user_event_record(event, ##__VA_ARGS); \ > } \ > } while (0) > > Because we do not want to have the compiler process the arguments when > the event is not enabled. That would be too much overhead. > > But as a macro, it would work. > > Thoughts? Many times we have found that we have to do work that should only be performed if the event is enabled. As long as there is a still clear ways to check without the macro, this would be fine with me. IE: strlen(message) We should not walk strings unless the event is enabled. Thanks, -Beau
On Fri, 18 Feb 2022 14:50:56 -0800 Beau Belgrave <beaub@linux.microsoft.com> wrote: > Adds the required APIs to libtracefs to create, manage and write out > data to trace events via the user_events kernel mechanism. Just to be more explicit here. I would add a link to the conversation that you shared with Yordan. Link: https://lore.kernel.org/linux-trace-devel/20220121192833.GA3128@kbox/T/#m2bcf53c373fbeaba2c46d1a053b3174171167e4e With a little introduction: The user events are scheduled to be included into Linux 5.18, which register a special mmapped page to denote when the user event is enabled (from an external source). This API adds a wrapper to the kernel interface that makes it easy to register user events and test if they are enabled and to record the event when it is. > > Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com> > --- > Makefile | 8 + > include/tracefs-local.h | 24 ++ > include/tracefs.h | 60 +++++ > src/Makefile | 4 + > src/tracefs-userevents.c | 545 +++++++++++++++++++++++++++++++++++++++ > 5 files changed, 641 insertions(+) > create mode 100644 src/tracefs-userevents.c > > diff --git a/Makefile b/Makefile > index 544684c..a4598b4 100644 > --- a/Makefile > +++ b/Makefile > @@ -154,6 +154,14 @@ CFLAGS ?= -g -Wall > CPPFLAGS ?= > LDFLAGS ?= > > +USEREVENTS_INSTALLED := $(shell if (echo "$(pound)include <linux/user_events.h>" | $(CC) -E - >/dev/null 2>&1) ; then echo 1; else echo 0 ; fi) > +export USEREVENTS_INSTALLED > +ifeq ($(USEREVENTS_INSTALLED), 1) > +CFLAGS += -DUSEREVENTS > +else > +$(warning user_events.h not installed, skipping) > +endif > + > CUNIT_INSTALLED := $(shell if (printf "$(pound)include <CUnit/Basic.h>\n void main(){CU_initialize_registry();}" | $(CC) -x c - -lcunit -o /dev/null >/dev/null 2>&1) ; then echo 1; else echo 0 ; fi) > export CUNIT_INSTALLED > > diff --git a/include/tracefs-local.h b/include/tracefs-local.h > index bf157e1..e768cba 100644 > --- a/include/tracefs-local.h > +++ b/include/tracefs-local.h > @@ -119,4 +119,28 @@ int trace_rescan_events(struct tep_handle *tep, > struct tep_event *get_tep_event(struct tep_handle *tep, > const char *system, const char *name); > > +/* Internal interface for ftrace user events */ > + > +struct tracefs_user_event_group; > + > +struct tracefs_user_event > +{ > + int write_index; > + int status_index; > + int iovecs; > + int rels; > + int len; > + struct tracefs_user_event_group *group; > + struct tracefs_user_event *next; > +}; > + > +struct tracefs_user_event_group > +{ > + int fd; > + int mmap_len; > + char *mmap; > + pthread_mutex_t lock; > + struct tracefs_user_event *events; > +}; Nit, but can you indent the fields like we do in the kernel. struct tracefs_user_event { int write_index; int status_index; int iovecs; int rels; int len; struct tracefs_user_event_group *group; struct tracefs_user_event *next; }; struct tracefs_user_event_group { int fd; int mmap_len; char *mmap; pthread_mutex_t lock; struct tracefs_user_event *events; }; It's just easier to read and see the fields. > + > #endif /* _TRACE_FS_LOCAL_H */ > diff --git a/include/tracefs.h b/include/tracefs.h > index 1848ad0..7871dfe 100644 > --- a/include/tracefs.h > +++ b/include/tracefs.h > @@ -571,4 +571,64 @@ struct tracefs_synth *tracefs_sql(struct tep_handle *tep, const char *name, > struct tep_event * > tracefs_synth_get_event(struct tep_handle *tep, struct tracefs_synth *synth); > > +/* User events */ > +enum tracefs_uevent_type { > + TRACEFS_UEVENT_END, > + TRACEFS_UEVENT_u8, > + TRACEFS_UEVENT_s8, > + TRACEFS_UEVENT_u16, > + TRACEFS_UEVENT_s16, > + TRACEFS_UEVENT_u32, > + TRACEFS_UEVENT_s32, > + TRACEFS_UEVENT_u64, > + TRACEFS_UEVENT_s64, > + TRACEFS_UEVENT_string, > + TRACEFS_UEVENT_struct, > + TRACEFS_UEVENT_varray, > + TRACEFS_UEVENT_vstring, > +}; > + > +enum tracefs_uevent_flags { > + /* None */ > + TRACEFS_UEVENT_FLAG_NONE = 0, > + > + /* When BPF is attached, use iterator/no copy */ > + TRACEFS_UEVENT_FLAG_bpf_iter = 1 << 0, > +}; > + > +struct tracefs_uevent_item { > + /* Type of item */ > + enum tracefs_uevent_type type; > + > + /* Length of data, optional during register */ > + int len; > + > + union { > + /* Used during write */ > + const void *data; > + > + /* Used during register */ > + const char *name; > + }; > +}; Same with the above: struct tracefs_uevent_item { /* Type of item */ enum tracefs_uevent_type type; /* Length of data, optional during register */ int len; union { /* Used during write */ const void *data; /* Used during register */ const char *name; }; }; > + > +struct tracefs_user_event; > +struct tracefs_user_event_group; > + > +struct tracefs_user_event_group *tracefs_user_event_group_create(void); Naming is hard :-( Yordan has a point that we use the term "create" in the other APIs to mean "create the event on the file system". Where as this isn't doing that. But it is not just allocating either as it is doing work on the filesystem. Perhaps a better term is "init" as it is initializing the group? tracefs_user_event_group_init() ? > + > +void tracefs_user_event_group_close(struct tracefs_user_event_group *group); Or we could call it "open" as then it would match the "close" here? tracefs_user_event_group_open() ? Is that a better name for the create function? Hmm, I'm thinking calling it "open" may be the best, as it then matches the close. (Just writing out my brain thoughts here) > + > +int tracefs_user_event_delete(const char *name); > + > +struct tracefs_user_event * > +tracefs_user_event_register(struct tracefs_user_event_group *group, > + const char *name, enum tracefs_uevent_flags flags, > + struct tracefs_uevent_item *items); > + > +bool tracefs_user_event_test(struct tracefs_user_event *event); Like I said before. I think tracefs_user_event_enabled() may be a better name. I know I'm the one that suggested "test" but as I stated up front, naming is hard ;-) > + > +int tracefs_user_event_write(struct tracefs_user_event *event, > + struct tracefs_uevent_item *items); And this to be tracefs_user_event_record(), that way it will not be confused to mean something similar to what the other API "write" functions do. > + > #endif /* _TRACE_FS_H */ > diff --git a/src/Makefile b/src/Makefile > index e8afab5..984e8cf 100644 > --- a/src/Makefile > +++ b/src/Makefile > @@ -14,6 +14,10 @@ OBJS += tracefs-filter.o > OBJS += tracefs-dynevents.o > OBJS += tracefs-eprobes.o > > +ifeq ($(USEREVENTS_INSTALLED), 1) > +OBJS += tracefs-userevents.o > +endif > + > # Order matters for the the three below > OBJS += sqlhist-lex.o > OBJS += sqlhist.tab.o > diff --git a/src/tracefs-userevents.c b/src/tracefs-userevents.c > new file mode 100644 > index 0000000..4d64fd8 > --- /dev/null > +++ b/src/tracefs-userevents.c > @@ -0,0 +1,545 @@ > +// SPDX-License-Identifier: LGPL-2.1 > +/* > + * Copyright (C) 2022 Microsoft Corporation. > + * > + * Authors: > + * Beau Belgrave <beaub@linux.microsoft.com> > + */ > + > +#include <alloca.h> > +#include <stdlib.h> > +#include <unistd.h> > +#include <errno.h> > +#include <fcntl.h> > +#include <pthread.h> > +#include <sys/mman.h> > +#include <sys/ioctl.h> > +#include <sys/uio.h> > +#include <linux/user_events.h> > + > +#include "tracefs.h" > +#include "tracefs-local.h" > + > +#define STAT_FILE "user_events_status" > +#define DATA_FILE "user_events_data" > + > +static void free_user_events(struct tracefs_user_event *event) > +{ > + struct tracefs_user_event *next; > + > + while (event) { > + next = event->next; > + free(event); > + event = next; > + } > +} > + > +#define LEN_OR_ZERO (len ? len - pos : 0) > +static int append_field(struct tracefs_uevent_item *item, char *buf, > + int len, int offset, int index) > +{ > + int pos = offset; > + > + if (index != 0) > + pos += snprintf(buf + pos, LEN_OR_ZERO, ";"); You know, you can use the trace_seq functionality here, which might make things easier. In tracefs_user_event_reg() trace_seq seq; trace_seq_init(&seq); then pass the &seq into the other functions where here we would have: static int append_field(struct tracefs_uevent_item *item, struct trace_seq *s, int len, int offset, int index) > + > + switch (item->type) { > + case TRACEFS_UEVENT_u8: > + pos += snprintf(buf + pos, LEN_OR_ZERO, > + " u8 %s", item->name); trace_seq_printf(s, " u8 %s", item->name); > + break; > + > + case TRACEFS_UEVENT_s8: > + pos += snprintf(buf + pos, LEN_OR_ZERO, > + " s8 %s", item->name); And the same for all these. Then at the end, you can check for failed allocations with: if (seq.state) { /* failed allocation */ And you can get the access to the buffer with: trace_seq_terminate(&seq); // adds '\0' to the buffer printf("%s", seq.buffer); Then clean it up with: trace_seq_destroy(&seq); trace_seq is part of libtraceevent which tracefs depends on. It will make a lot of this code simpler. > + break; > + > + case TRACEFS_UEVENT_u16: > + pos += snprintf(buf + pos, LEN_OR_ZERO, > + " u16 %s", item->name); > + break; > + > + case TRACEFS_UEVENT_s16: > + pos += snprintf(buf + pos, LEN_OR_ZERO, > + " s16 %s", item->name); > + break; > + > + case TRACEFS_UEVENT_u32: > + pos += snprintf(buf + pos, LEN_OR_ZERO, > + " u32 %s", item->name); > + break; > + > + case TRACEFS_UEVENT_s32: > + pos += snprintf(buf + pos, LEN_OR_ZERO, > + " s32 %s", item->name); > + break; > + > + case TRACEFS_UEVENT_u64: > + pos += snprintf(buf + pos, LEN_OR_ZERO, > + " u64 %s", item->name); > + break; > + > + case TRACEFS_UEVENT_s64: > + pos += snprintf(buf + pos, LEN_OR_ZERO, > + " s64 %s", item->name); > + break; > + > + case TRACEFS_UEVENT_string: > + if (item->len <= 0) { > + errno = EINVAL; > + return -1; > + } > + > + pos += snprintf(buf + pos, LEN_OR_ZERO, > + " char[%d] %s", item->len, item->name); > + break; > + > + case TRACEFS_UEVENT_struct: > + /* > + * struct must have 2 strings, do simple check > + * in user, kernel will fully validate > + */ > + if (!strchr(item->name, ' ')) { > + errno = EINVAL; > + return -1; > + } > + > + if (item->len <= 0) { > + errno = EINVAL; > + return -1; > + } > + > + pos += snprintf(buf + pos, LEN_OR_ZERO, > + " struct %s %d", item->name, item->len); > + break; > + > + case TRACEFS_UEVENT_varray: > + /* Variable length array */ > + pos += snprintf(buf + pos, LEN_OR_ZERO, > + " __rel_loc u8[] %s", item->name); > + break; > + > + case TRACEFS_UEVENT_vstring: > + /* Variable length string */ > + pos += snprintf(buf + pos, LEN_OR_ZERO, > + " __rel_loc char[] %s", item->name); > + break; > + > + default: > + /* Unknown */ > + errno = ENOENT; > + return -1; > + } > + > + return pos; > +} > + > +static int create_reg_cmd(const char *name, enum tracefs_uevent_flags flags, > + struct tracefs_uevent_item *items, char *buf, int len) > +{ > + int pos = 0; > + int index = 0; > + > + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", name); > + > + if (flags & TRACEFS_UEVENT_FLAG_bpf_iter) > + pos += snprintf(buf + pos, LEN_OR_ZERO, ":BPF_ITER"); > + > + while (items->type != TRACEFS_UEVENT_END) { > + pos = append_field(items, buf, len, pos, index++); > + > + if (pos < 0) > + return pos; > + > + items++; > + } > + > + return pos + 1; > +} > +#undef LEN_OR_ZERO > + > +static int get_write_counts(struct tracefs_user_event *event, > + struct tracefs_uevent_item *item) > +{ > + event->rels = 0; > + event->len = 0; > + > + /* Start at 1, need iovec for write_index */ > + event->iovecs = 1; > + > + while (item->type != TRACEFS_UEVENT_END) { > + switch (item->type) { > + case TRACEFS_UEVENT_u8: > + case TRACEFS_UEVENT_s8: > + event->len += sizeof(__u8); > + break; > + > + case TRACEFS_UEVENT_u16: > + case TRACEFS_UEVENT_s16: > + event->len += sizeof(__u16); > + break; > + > + case TRACEFS_UEVENT_u32: > + case TRACEFS_UEVENT_s32: > + event->len += sizeof(__u32); > + break; > + > + case TRACEFS_UEVENT_u64: > + case TRACEFS_UEVENT_s64: > + event->len += sizeof(__u64); > + break; > + > + case TRACEFS_UEVENT_string: > + case TRACEFS_UEVENT_struct: > + event->len += item->len; > + break; > + > + case TRACEFS_UEVENT_varray: > + case TRACEFS_UEVENT_vstring: > + /* Requires a rel loc entry */ > + event->len += sizeof(__u32); > + event->rels++; > + break; > + > + default: > + /* Unknown */ > + errno = ENOENT; > + return -1; > + } > + > + event->iovecs++; > + item++; > + } > + > + return 0; > +} > + > +/** > + * tracefs_user_event_group_create - Create a new group to use for user events > + * > + * Returns a pointer to a group to use for user events. The pointer is valid until > + * tracefs_user_event_group_close() is called. In case of an error NULL is > + * returned. > + */ > +struct tracefs_user_event_group *tracefs_user_event_group_create(void) > +{ > + int stat, write, page_size, i; > + struct tracefs_user_event_group *group; > + > + stat = tracefs_instance_file_open(NULL, STAT_FILE, O_RDWR); > + > + if (stat < 0) > + return NULL; > + > + write = tracefs_instance_file_open(NULL, DATA_FILE, O_RDWR); > + > + if (write < 0) > + goto put_stat; > + > + group = malloc(sizeof(*group)); > + > + if (!group) > + goto put_write; > + > + if (pthread_mutex_init(&group->lock, NULL) < 0) > + goto put_group; > + > + /* Scale up to 16-bit max user events a page at a time */ > + page_size = sysconf(_SC_PAGESIZE); > + group->mmap_len = page_size; > + > + for (i = 0; i < 16; ++i) { > + group->mmap = mmap(NULL, group->mmap_len, > + PROT_READ, MAP_SHARED, stat, 0); > + > + if (group->mmap == MAP_FAILED && errno == EINVAL) { > + /* Increase by page size and try again */ > + group->mmap_len += page_size; > + continue; > + } > + > + break; > + } > + > + if (group->mmap == MAP_FAILED) > + goto put_group; > + > + group->fd = write; > + group->events = NULL; > + > + /* Status fd no longer needed */ > + close(stat); > + > + return group; > + > +put_group: > + free(group); > +put_write: > + close(write); > +put_stat: > + close(stat); > + > + return NULL; > +} > + > +/** > + * tracefs_user_event_delete - Deletes a user event from the system > + * @name: Name of the event to delete > + * > + * Deletes the event from the system if it is not used. > + */ > +int tracefs_user_event_delete(const char *name) > +{ > + int ret, write; > + The above line has white space at the end of it. > + write = tracefs_instance_file_open(NULL, DATA_FILE, O_RDWR); > + > + if (write < 0) > + return write; > + > + ret = ioctl(write, DIAG_IOCSDEL, name); > + > + close(write); > + > + return ret; > +} > + > +/** > + * tracefs_user_event_group_close - Closes a group containing user events > + * @group: Group to close > + * > + * Closes a group and all the user events within it. Any user event that has > + * been added to the group is no longer valid and cannot be used. > + */ > +void tracefs_user_event_group_close(struct tracefs_user_event_group *group) > +{ > + if (!group) > + return; > + > + if (group->mmap != MAP_FAILED) > + munmap(group->mmap, group->mmap_len); > + > + if (group->fd != -1) > + close(group->fd); > + > + free_user_events(group->events); > + free(group); > +} > + > +/** > + * tracefs_user_event_register - Registers a user event with the system > + * @group: Group to add the user event to > + * @name: Name of the event to register > + * @flags: Flags to use > + * @items: Array of items that the event contains > + * > + * Allocates and registers a user event with the system. The user event will be > + * added to the @group. The lifetime of the event is bound to the @group. When > + * the @group is closed via tracefs_user_event_group_close() the event will no > + * longer exist and should not be used. > + * > + * The @items are processed in order and the final item type must be set to > + * TRACEFS_UEVENT_END to mark the last item. Each item must have the type > + * and name defined. The string and struct type also require the len to be set > + * for the item. > + * > + * Return a pointer to a user event on success, or NULL or error. > + * > + * errno will be set to EINVAL if @group is null or unexpected @items. > + */ > +struct tracefs_user_event * > +tracefs_user_event_register(struct tracefs_user_event_group *group, > + const char *name, enum tracefs_uevent_flags flags, > + struct tracefs_uevent_item *items) > +{ > + struct tracefs_user_event *event = NULL; > + struct user_reg reg = {0}; > + char *cmd = NULL; > + int len; > + > + if (!group || !items) { > + errno = EINVAL; > + return NULL; > + } > + > + /* Determine length of cmd */ > + len = create_reg_cmd(name, flags, items, cmd, 0); > + > + if (len < 0) { > + errno = EINVAL; > + return NULL; > + } > + > + /* Allocate and fill cmd */ > + cmd = malloc(len); > + > + if (!cmd) > + return NULL; > + > + create_reg_cmd(name, flags, items, cmd, len); > + > + event = malloc(sizeof(*event)); > + > + if (!event) > + goto put_cmd; > + > + reg.size = sizeof(reg); > + reg.name_args = (__u64)cmd; > + > + /* Register event with kernel */ > + if (ioctl(group->fd, DIAG_IOCSREG, ®) == -1) > + goto put_event; > + > + /* Sanity check bounds returned */ > + if (reg.status_index >= group->mmap_len) { > + errno = EINVAL; > + goto put_event; > + } > + > + if (get_write_counts(event, items)) > + goto put_event; > + > + event->write_index = reg.write_index; > + event->status_index = reg.status_index; > + event->group = group; > + > + /* Add event into the group under lock */ > + pthread_mutex_lock(&group->lock); > + event->next = group->events; > + group->events = event->next; > + pthread_mutex_unlock(&group->lock); > + > + free(cmd); > + > + return event; > +put_event: > + free(event); > +put_cmd: > + free(cmd); > + > + return NULL; > +} > + > +/** > + * tracefs_user_event_test - Tests if an event is currently enabled > + * @event: User event to test > + * > + * Tests if the @event is valid and currently enabled on the system. > + * > + * Return true if enabled, false otherwise. > + */ > +bool tracefs_user_event_test(struct tracefs_user_event *event) > +{ > + return event && event->group->mmap[event->status_index] != 0; > +} > + > +/** > + * tracefs_user_event_write - Writes data out to an event > + * @event: User event to write data about > + * @items: Items to write for the event > + * > + * Writes out items for the event. Callers should check if the cost of writing > + * should be performed by calling tracefs_user_event_test(). Items are checked > + * to ensure they fit within the described items during register. Each item > + * must specify the length of the item being written. > + * > + * Return the number of bytes written or -1 upon error. > + * > + * errno will be set to EINVAL if @event or @items is null or @items contains > + * an item with a length of less than or equal to 0. > + * errno will be set to E2BIG if @items contains more items than previously > + * registered for the event. > + */ > +int tracefs_user_event_write(struct tracefs_user_event *event, > + struct tracefs_uevent_item *items) > +{ > + struct iovec *head, *io, *relio, *io_end; > + __u32 *rel, *rel_end; > + int len, rel_offset, data_offset, used; > + > + if (!event || !items) { > + errno = EINVAL; > + return -1; > + } > + > + head = io = alloca(sizeof(*io) * (event->iovecs + event->rels)); > + rel = alloca(sizeof(*rel) * event->rels); Heh, I forgot about "alloca". I haven't seen that in a long time! > + > + io_end = head + (event->iovecs + event->rels); > + rel_end = rel + event->rels; > + > + /* Relative offset starts at end of static data */ > + relio = io + event->iovecs; > + rel_offset = event->len; > + data_offset = 0; > + > + /* Write index must be first */ > + io->iov_base = &event->write_index; > + io->iov_len = sizeof(event->write_index); > + io++; > + used = 1; > + > + while (items->type != TRACEFS_UEVENT_END) { > + len = items->len; > + > + if (len <= 0) > + goto bad_length; > + > + if (io >= io_end) > + goto bad_count; > + > + switch (items->type) { > + case TRACEFS_UEVENT_varray: > + case TRACEFS_UEVENT_vstring: > + /* Dual vectors */ > + used += 2; > + > + if (rel >= rel_end || relio >= io_end) > + goto bad_count; > + > + /* __rel_loc types */ > + relio->iov_base = (void *)items->data; > + relio->iov_len = len; > + relio++; > + > + io->iov_base = (void *)rel; > + io->iov_len = sizeof(*rel); > + io++; > + rel_offset -= sizeof(*rel); > + > + /* Fill in rel loc data */ > + *rel = DYN_LOC(rel_offset + data_offset, len); > + data_offset += len; > + rel++; > + > + break; > + > + default: > + /* Single vector */ > + used++; > + > + /* Direct types */ > + io->iov_base = (void *)items->data; > + io->iov_len = len; > + io++; > + rel_offset -= len; > + > + break; > + } > + > + items++; > + } > + > + return writev(event->group->fd, head, used); This is fine for now, but I wonder if we want to invest time in the future to create macros for user space that are similar to the TRACE_EVENT() macros in the kernel, that would make the above a direct one to one mapping of the array to the event. That way we wouldn't use a loop, but instead would just write directly into the iovec that will be passed into the kernel. But that can be an optimization for another time. This all looks good. Thanks Beau! -- Steve > + > +bad_length: > + fprintf(stderr, "Bad user_event item length at index %d\n", > + used - 1); > + errno = EINVAL; > + return -1; > + > +bad_count: > + fprintf(stderr, "Too many user_event items passed\n"); > + errno = E2BIG; > + return -1; > +}
On Tue, 22 Feb 2022 08:59:46 -0800 Beau Belgrave <beaub@linux.microsoft.com> wrote: > > Thoughts? > > Many times we have found that we have to do work that should only be > performed if the event is enabled. As long as there is a still clear > ways to check without the macro, this would be fine with me. > > IE: strlen(message) > > We should not walk strings unless the event is enabled. Right. The macro would only be a helper macro. I would even add it as a separate patch at the end and not part of the main patch. -- Steve
On Fri, 18 Feb 2022 14:50:56 -0800 Beau Belgrave <beaub@linux.microsoft.com> wrote: > diff --git a/include/tracefs-local.h b/include/tracefs-local.h > index bf157e1..e768cba 100644 > --- a/include/tracefs-local.h > +++ b/include/tracefs-local.h > @@ -119,4 +119,28 @@ int trace_rescan_events(struct tep_handle *tep, > struct tep_event *get_tep_event(struct tep_handle *tep, > const char *system, const char *name); > > +/* Internal interface for ftrace user events */ > + > +struct tracefs_user_event_group; > + > +struct tracefs_user_event > +{ > + int write_index; > + int status_index; > + int iovecs; > + int rels; > + int len; > + struct tracefs_user_event_group *group; > + struct tracefs_user_event *next; > +}; > + > +struct tracefs_user_event_group > +{ > + int fd; > + int mmap_len; > + char *mmap; > + pthread_mutex_t lock; > + struct tracefs_user_event *events; > +}; > + > #endif /* _TRACE_FS_LOCAL_H */ > > +/** > + * tracefs_user_event_test - Tests if an event is currently enabled > + * @event: User event to test > + * > + * Tests if the @event is valid and currently enabled on the system. > + * > + * Return true if enabled, false otherwise. > + */ > +bool tracefs_user_event_test(struct tracefs_user_event *event) > +{ > + return event && event->group->mmap[event->status_index] != 0; > +} > + I was thinking we could even make the above faster by making it a static inline in the tracefs.h header file. In tracefs.h: struct tracefs_user_event { char *enable; }; static inline bool tracefs_user_event_test(struct tracefs_user_event *event) { return event && event->enable[0] != 0; } Then in tracefs-local.h: struct tracefs_user_event_internal { struct tracefs_user_event event_external; [...] }; Then have in tracefs_user_event_register(): event->write_index = reg.write_index; event->status_index = reg.status_index; event->group = group; event->event_external.enable = &event->group->mmap[event->status_index]; [..] return &event->event_external; All the other functions wouldn't even have to do a container_of() call, as the event_external will be the first field in the struct it needs. struct tracefs_user_event_internal *event = (struct tracefs_user_event_internal *)event_external; Or make the above a helper function: #define INTERNAL_EVENT(e) ((struct tracefs_user_event_internal *)e) struct tracefs_user_event_internal *event = INTERNAL_EVENT(event_external); Then we save on the function call, and allow the code to do the test inline. -- Steve
On Tue, 22 Feb 2022 12:31:43 -0500 Steven Rostedt <rostedt@goodmis.org> wrote: > static inline bool tracefs_user_event_test(struct tracefs_user_event *event) > { > return event && event->enable[0] != 0; > } I wonder if we need to add: return event && ((volatile char *)event->enable)[0] != 0; to prevent the compiler from optimizing it. -- Steve
On Tue, Feb 22, 2022 at 12:31:43PM -0500, Steven Rostedt wrote: > On Fri, 18 Feb 2022 14:50:56 -0800 > Beau Belgrave <beaub@linux.microsoft.com> wrote: > > > diff --git a/include/tracefs-local.h b/include/tracefs-local.h > > index bf157e1..e768cba 100644 > > --- a/include/tracefs-local.h > > +++ b/include/tracefs-local.h > > @@ -119,4 +119,28 @@ int trace_rescan_events(struct tep_handle *tep, > > struct tep_event *get_tep_event(struct tep_handle *tep, > > const char *system, const char *name); > > > > +/* Internal interface for ftrace user events */ > > + > > +struct tracefs_user_event_group; > > + > > +struct tracefs_user_event > > +{ > > + int write_index; > > + int status_index; > > + int iovecs; > > + int rels; > > + int len; > > + struct tracefs_user_event_group *group; > > + struct tracefs_user_event *next; > > +}; > > + > > +struct tracefs_user_event_group > > +{ > > + int fd; > > + int mmap_len; > > + char *mmap; > > + pthread_mutex_t lock; > > + struct tracefs_user_event *events; > > +}; > > + > > #endif /* _TRACE_FS_LOCAL_H */ > > > > > > > +/** > > + * tracefs_user_event_test - Tests if an event is currently enabled > > + * @event: User event to test > > + * > > + * Tests if the @event is valid and currently enabled on the system. > > + * > > + * Return true if enabled, false otherwise. > > + */ > > +bool tracefs_user_event_test(struct tracefs_user_event *event) > > +{ > > + return event && event->group->mmap[event->status_index] != 0; > > +} > > + > > I was thinking we could even make the above faster by making it a static > inline in the tracefs.h header file. > > In tracefs.h: > > struct tracefs_user_event { > char *enable; > }; > > static inline bool tracefs_user_event_test(struct tracefs_user_event *event) > { > return event && event->enable[0] != 0; > } > > Then in tracefs-local.h: > > struct tracefs_user_event_internal { > struct tracefs_user_event event_external; > [...] > }; > > Then have in tracefs_user_event_register(): > > event->write_index = reg.write_index; > event->status_index = reg.status_index; > event->group = group; > > event->event_external.enable = &event->group->mmap[event->status_index]; > > [..] > > return &event->event_external; > > All the other functions wouldn't even have to do a container_of() call, as > the event_external will be the first field in the struct it needs. > > struct tracefs_user_event_internal *event = (struct tracefs_user_event_internal *)event_external; > > Or make the above a helper function: > > #define INTERNAL_EVENT(e) ((struct tracefs_user_event_internal *)e) > > struct tracefs_user_event_internal *event = INTERNAL_EVENT(event_external); > > > Then we save on the function call, and allow the code to do the test inline. Yes, I was thinking along the same lines. I saw how things were split between private / public in the headers and wasn't sure how to accomplish this. What you have above seems like a great way to accomplish this without exposing too much out. Thanks, -Beau
On Tue, 22 Feb 2022 09:46:34 -0800 Beau Belgrave <beaub@linux.microsoft.com> wrote: > Yes, I was thinking along the same lines. I saw how things were split > between private / public in the headers and wasn't sure how to > accomplish this. > > What you have above seems like a great way to accomplish this without exposing > too much out. Just in case we ever want to extend the user view, I wonder if we should also expose the size as the next argument (or the first). struct tracefs_user_event { unsigned int size; char *enable; }; and set size to sizeof(struct tracefs_user_event). Then if we add another field, we can differentiate it from new additions, without breaking forward or backward API. -- Steve
diff --git a/Makefile b/Makefile index 544684c..a4598b4 100644 --- a/Makefile +++ b/Makefile @@ -154,6 +154,14 @@ CFLAGS ?= -g -Wall CPPFLAGS ?= LDFLAGS ?= +USEREVENTS_INSTALLED := $(shell if (echo "$(pound)include <linux/user_events.h>" | $(CC) -E - >/dev/null 2>&1) ; then echo 1; else echo 0 ; fi) +export USEREVENTS_INSTALLED +ifeq ($(USEREVENTS_INSTALLED), 1) +CFLAGS += -DUSEREVENTS +else +$(warning user_events.h not installed, skipping) +endif + CUNIT_INSTALLED := $(shell if (printf "$(pound)include <CUnit/Basic.h>\n void main(){CU_initialize_registry();}" | $(CC) -x c - -lcunit -o /dev/null >/dev/null 2>&1) ; then echo 1; else echo 0 ; fi) export CUNIT_INSTALLED diff --git a/include/tracefs-local.h b/include/tracefs-local.h index bf157e1..e768cba 100644 --- a/include/tracefs-local.h +++ b/include/tracefs-local.h @@ -119,4 +119,28 @@ int trace_rescan_events(struct tep_handle *tep, struct tep_event *get_tep_event(struct tep_handle *tep, const char *system, const char *name); +/* Internal interface for ftrace user events */ + +struct tracefs_user_event_group; + +struct tracefs_user_event +{ + int write_index; + int status_index; + int iovecs; + int rels; + int len; + struct tracefs_user_event_group *group; + struct tracefs_user_event *next; +}; + +struct tracefs_user_event_group +{ + int fd; + int mmap_len; + char *mmap; + pthread_mutex_t lock; + struct tracefs_user_event *events; +}; + #endif /* _TRACE_FS_LOCAL_H */ diff --git a/include/tracefs.h b/include/tracefs.h index 1848ad0..7871dfe 100644 --- a/include/tracefs.h +++ b/include/tracefs.h @@ -571,4 +571,64 @@ struct tracefs_synth *tracefs_sql(struct tep_handle *tep, const char *name, struct tep_event * tracefs_synth_get_event(struct tep_handle *tep, struct tracefs_synth *synth); +/* User events */ +enum tracefs_uevent_type { + TRACEFS_UEVENT_END, + TRACEFS_UEVENT_u8, + TRACEFS_UEVENT_s8, + TRACEFS_UEVENT_u16, + TRACEFS_UEVENT_s16, + TRACEFS_UEVENT_u32, + TRACEFS_UEVENT_s32, + TRACEFS_UEVENT_u64, + TRACEFS_UEVENT_s64, + TRACEFS_UEVENT_string, + TRACEFS_UEVENT_struct, + TRACEFS_UEVENT_varray, + TRACEFS_UEVENT_vstring, +}; + +enum tracefs_uevent_flags { + /* None */ + TRACEFS_UEVENT_FLAG_NONE = 0, + + /* When BPF is attached, use iterator/no copy */ + TRACEFS_UEVENT_FLAG_bpf_iter = 1 << 0, +}; + +struct tracefs_uevent_item { + /* Type of item */ + enum tracefs_uevent_type type; + + /* Length of data, optional during register */ + int len; + + union { + /* Used during write */ + const void *data; + + /* Used during register */ + const char *name; + }; +}; + +struct tracefs_user_event; +struct tracefs_user_event_group; + +struct tracefs_user_event_group *tracefs_user_event_group_create(void); + +void tracefs_user_event_group_close(struct tracefs_user_event_group *group); + +int tracefs_user_event_delete(const char *name); + +struct tracefs_user_event * +tracefs_user_event_register(struct tracefs_user_event_group *group, + const char *name, enum tracefs_uevent_flags flags, + struct tracefs_uevent_item *items); + +bool tracefs_user_event_test(struct tracefs_user_event *event); + +int tracefs_user_event_write(struct tracefs_user_event *event, + struct tracefs_uevent_item *items); + #endif /* _TRACE_FS_H */ diff --git a/src/Makefile b/src/Makefile index e8afab5..984e8cf 100644 --- a/src/Makefile +++ b/src/Makefile @@ -14,6 +14,10 @@ OBJS += tracefs-filter.o OBJS += tracefs-dynevents.o OBJS += tracefs-eprobes.o +ifeq ($(USEREVENTS_INSTALLED), 1) +OBJS += tracefs-userevents.o +endif + # Order matters for the the three below OBJS += sqlhist-lex.o OBJS += sqlhist.tab.o diff --git a/src/tracefs-userevents.c b/src/tracefs-userevents.c new file mode 100644 index 0000000..4d64fd8 --- /dev/null +++ b/src/tracefs-userevents.c @@ -0,0 +1,545 @@ +// SPDX-License-Identifier: LGPL-2.1 +/* + * Copyright (C) 2022 Microsoft Corporation. + * + * Authors: + * Beau Belgrave <beaub@linux.microsoft.com> + */ + +#include <alloca.h> +#include <stdlib.h> +#include <unistd.h> +#include <errno.h> +#include <fcntl.h> +#include <pthread.h> +#include <sys/mman.h> +#include <sys/ioctl.h> +#include <sys/uio.h> +#include <linux/user_events.h> + +#include "tracefs.h" +#include "tracefs-local.h" + +#define STAT_FILE "user_events_status" +#define DATA_FILE "user_events_data" + +static void free_user_events(struct tracefs_user_event *event) +{ + struct tracefs_user_event *next; + + while (event) { + next = event->next; + free(event); + event = next; + } +} + +#define LEN_OR_ZERO (len ? len - pos : 0) +static int append_field(struct tracefs_uevent_item *item, char *buf, + int len, int offset, int index) +{ + int pos = offset; + + if (index != 0) + pos += snprintf(buf + pos, LEN_OR_ZERO, ";"); + + switch (item->type) { + case TRACEFS_UEVENT_u8: + pos += snprintf(buf + pos, LEN_OR_ZERO, + " u8 %s", item->name); + break; + + case TRACEFS_UEVENT_s8: + pos += snprintf(buf + pos, LEN_OR_ZERO, + " s8 %s", item->name); + break; + + case TRACEFS_UEVENT_u16: + pos += snprintf(buf + pos, LEN_OR_ZERO, + " u16 %s", item->name); + break; + + case TRACEFS_UEVENT_s16: + pos += snprintf(buf + pos, LEN_OR_ZERO, + " s16 %s", item->name); + break; + + case TRACEFS_UEVENT_u32: + pos += snprintf(buf + pos, LEN_OR_ZERO, + " u32 %s", item->name); + break; + + case TRACEFS_UEVENT_s32: + pos += snprintf(buf + pos, LEN_OR_ZERO, + " s32 %s", item->name); + break; + + case TRACEFS_UEVENT_u64: + pos += snprintf(buf + pos, LEN_OR_ZERO, + " u64 %s", item->name); + break; + + case TRACEFS_UEVENT_s64: + pos += snprintf(buf + pos, LEN_OR_ZERO, + " s64 %s", item->name); + break; + + case TRACEFS_UEVENT_string: + if (item->len <= 0) { + errno = EINVAL; + return -1; + } + + pos += snprintf(buf + pos, LEN_OR_ZERO, + " char[%d] %s", item->len, item->name); + break; + + case TRACEFS_UEVENT_struct: + /* + * struct must have 2 strings, do simple check + * in user, kernel will fully validate + */ + if (!strchr(item->name, ' ')) { + errno = EINVAL; + return -1; + } + + if (item->len <= 0) { + errno = EINVAL; + return -1; + } + + pos += snprintf(buf + pos, LEN_OR_ZERO, + " struct %s %d", item->name, item->len); + break; + + case TRACEFS_UEVENT_varray: + /* Variable length array */ + pos += snprintf(buf + pos, LEN_OR_ZERO, + " __rel_loc u8[] %s", item->name); + break; + + case TRACEFS_UEVENT_vstring: + /* Variable length string */ + pos += snprintf(buf + pos, LEN_OR_ZERO, + " __rel_loc char[] %s", item->name); + break; + + default: + /* Unknown */ + errno = ENOENT; + return -1; + } + + return pos; +} + +static int create_reg_cmd(const char *name, enum tracefs_uevent_flags flags, + struct tracefs_uevent_item *items, char *buf, int len) +{ + int pos = 0; + int index = 0; + + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", name); + + if (flags & TRACEFS_UEVENT_FLAG_bpf_iter) + pos += snprintf(buf + pos, LEN_OR_ZERO, ":BPF_ITER"); + + while (items->type != TRACEFS_UEVENT_END) { + pos = append_field(items, buf, len, pos, index++); + + if (pos < 0) + return pos; + + items++; + } + + return pos + 1; +} +#undef LEN_OR_ZERO + +static int get_write_counts(struct tracefs_user_event *event, + struct tracefs_uevent_item *item) +{ + event->rels = 0; + event->len = 0; + + /* Start at 1, need iovec for write_index */ + event->iovecs = 1; + + while (item->type != TRACEFS_UEVENT_END) { + switch (item->type) { + case TRACEFS_UEVENT_u8: + case TRACEFS_UEVENT_s8: + event->len += sizeof(__u8); + break; + + case TRACEFS_UEVENT_u16: + case TRACEFS_UEVENT_s16: + event->len += sizeof(__u16); + break; + + case TRACEFS_UEVENT_u32: + case TRACEFS_UEVENT_s32: + event->len += sizeof(__u32); + break; + + case TRACEFS_UEVENT_u64: + case TRACEFS_UEVENT_s64: + event->len += sizeof(__u64); + break; + + case TRACEFS_UEVENT_string: + case TRACEFS_UEVENT_struct: + event->len += item->len; + break; + + case TRACEFS_UEVENT_varray: + case TRACEFS_UEVENT_vstring: + /* Requires a rel loc entry */ + event->len += sizeof(__u32); + event->rels++; + break; + + default: + /* Unknown */ + errno = ENOENT; + return -1; + } + + event->iovecs++; + item++; + } + + return 0; +} + +/** + * tracefs_user_event_group_create - Create a new group to use for user events + * + * Returns a pointer to a group to use for user events. The pointer is valid until + * tracefs_user_event_group_close() is called. In case of an error NULL is + * returned. + */ +struct tracefs_user_event_group *tracefs_user_event_group_create(void) +{ + int stat, write, page_size, i; + struct tracefs_user_event_group *group; + + stat = tracefs_instance_file_open(NULL, STAT_FILE, O_RDWR); + + if (stat < 0) + return NULL; + + write = tracefs_instance_file_open(NULL, DATA_FILE, O_RDWR); + + if (write < 0) + goto put_stat; + + group = malloc(sizeof(*group)); + + if (!group) + goto put_write; + + if (pthread_mutex_init(&group->lock, NULL) < 0) + goto put_group; + + /* Scale up to 16-bit max user events a page at a time */ + page_size = sysconf(_SC_PAGESIZE); + group->mmap_len = page_size; + + for (i = 0; i < 16; ++i) { + group->mmap = mmap(NULL, group->mmap_len, + PROT_READ, MAP_SHARED, stat, 0); + + if (group->mmap == MAP_FAILED && errno == EINVAL) { + /* Increase by page size and try again */ + group->mmap_len += page_size; + continue; + } + + break; + } + + if (group->mmap == MAP_FAILED) + goto put_group; + + group->fd = write; + group->events = NULL; + + /* Status fd no longer needed */ + close(stat); + + return group; + +put_group: + free(group); +put_write: + close(write); +put_stat: + close(stat); + + return NULL; +} + +/** + * tracefs_user_event_delete - Deletes a user event from the system + * @name: Name of the event to delete + * + * Deletes the event from the system if it is not used. + */ +int tracefs_user_event_delete(const char *name) +{ + int ret, write; + + write = tracefs_instance_file_open(NULL, DATA_FILE, O_RDWR); + + if (write < 0) + return write; + + ret = ioctl(write, DIAG_IOCSDEL, name); + + close(write); + + return ret; +} + +/** + * tracefs_user_event_group_close - Closes a group containing user events + * @group: Group to close + * + * Closes a group and all the user events within it. Any user event that has + * been added to the group is no longer valid and cannot be used. + */ +void tracefs_user_event_group_close(struct tracefs_user_event_group *group) +{ + if (!group) + return; + + if (group->mmap != MAP_FAILED) + munmap(group->mmap, group->mmap_len); + + if (group->fd != -1) + close(group->fd); + + free_user_events(group->events); + free(group); +} + +/** + * tracefs_user_event_register - Registers a user event with the system + * @group: Group to add the user event to + * @name: Name of the event to register + * @flags: Flags to use + * @items: Array of items that the event contains + * + * Allocates and registers a user event with the system. The user event will be + * added to the @group. The lifetime of the event is bound to the @group. When + * the @group is closed via tracefs_user_event_group_close() the event will no + * longer exist and should not be used. + * + * The @items are processed in order and the final item type must be set to + * TRACEFS_UEVENT_END to mark the last item. Each item must have the type + * and name defined. The string and struct type also require the len to be set + * for the item. + * + * Return a pointer to a user event on success, or NULL or error. + * + * errno will be set to EINVAL if @group is null or unexpected @items. + */ +struct tracefs_user_event * +tracefs_user_event_register(struct tracefs_user_event_group *group, + const char *name, enum tracefs_uevent_flags flags, + struct tracefs_uevent_item *items) +{ + struct tracefs_user_event *event = NULL; + struct user_reg reg = {0}; + char *cmd = NULL; + int len; + + if (!group || !items) { + errno = EINVAL; + return NULL; + } + + /* Determine length of cmd */ + len = create_reg_cmd(name, flags, items, cmd, 0); + + if (len < 0) { + errno = EINVAL; + return NULL; + } + + /* Allocate and fill cmd */ + cmd = malloc(len); + + if (!cmd) + return NULL; + + create_reg_cmd(name, flags, items, cmd, len); + + event = malloc(sizeof(*event)); + + if (!event) + goto put_cmd; + + reg.size = sizeof(reg); + reg.name_args = (__u64)cmd; + + /* Register event with kernel */ + if (ioctl(group->fd, DIAG_IOCSREG, ®) == -1) + goto put_event; + + /* Sanity check bounds returned */ + if (reg.status_index >= group->mmap_len) { + errno = EINVAL; + goto put_event; + } + + if (get_write_counts(event, items)) + goto put_event; + + event->write_index = reg.write_index; + event->status_index = reg.status_index; + event->group = group; + + /* Add event into the group under lock */ + pthread_mutex_lock(&group->lock); + event->next = group->events; + group->events = event->next; + pthread_mutex_unlock(&group->lock); + + free(cmd); + + return event; +put_event: + free(event); +put_cmd: + free(cmd); + + return NULL; +} + +/** + * tracefs_user_event_test - Tests if an event is currently enabled + * @event: User event to test + * + * Tests if the @event is valid and currently enabled on the system. + * + * Return true if enabled, false otherwise. + */ +bool tracefs_user_event_test(struct tracefs_user_event *event) +{ + return event && event->group->mmap[event->status_index] != 0; +} + +/** + * tracefs_user_event_write - Writes data out to an event + * @event: User event to write data about + * @items: Items to write for the event + * + * Writes out items for the event. Callers should check if the cost of writing + * should be performed by calling tracefs_user_event_test(). Items are checked + * to ensure they fit within the described items during register. Each item + * must specify the length of the item being written. + * + * Return the number of bytes written or -1 upon error. + * + * errno will be set to EINVAL if @event or @items is null or @items contains + * an item with a length of less than or equal to 0. + * errno will be set to E2BIG if @items contains more items than previously + * registered for the event. + */ +int tracefs_user_event_write(struct tracefs_user_event *event, + struct tracefs_uevent_item *items) +{ + struct iovec *head, *io, *relio, *io_end; + __u32 *rel, *rel_end; + int len, rel_offset, data_offset, used; + + if (!event || !items) { + errno = EINVAL; + return -1; + } + + head = io = alloca(sizeof(*io) * (event->iovecs + event->rels)); + rel = alloca(sizeof(*rel) * event->rels); + + io_end = head + (event->iovecs + event->rels); + rel_end = rel + event->rels; + + /* Relative offset starts at end of static data */ + relio = io + event->iovecs; + rel_offset = event->len; + data_offset = 0; + + /* Write index must be first */ + io->iov_base = &event->write_index; + io->iov_len = sizeof(event->write_index); + io++; + used = 1; + + while (items->type != TRACEFS_UEVENT_END) { + len = items->len; + + if (len <= 0) + goto bad_length; + + if (io >= io_end) + goto bad_count; + + switch (items->type) { + case TRACEFS_UEVENT_varray: + case TRACEFS_UEVENT_vstring: + /* Dual vectors */ + used += 2; + + if (rel >= rel_end || relio >= io_end) + goto bad_count; + + /* __rel_loc types */ + relio->iov_base = (void *)items->data; + relio->iov_len = len; + relio++; + + io->iov_base = (void *)rel; + io->iov_len = sizeof(*rel); + io++; + rel_offset -= sizeof(*rel); + + /* Fill in rel loc data */ + *rel = DYN_LOC(rel_offset + data_offset, len); + data_offset += len; + rel++; + + break; + + default: + /* Single vector */ + used++; + + /* Direct types */ + io->iov_base = (void *)items->data; + io->iov_len = len; + io++; + rel_offset -= len; + + break; + } + + items++; + } + + return writev(event->group->fd, head, used); + +bad_length: + fprintf(stderr, "Bad user_event item length at index %d\n", + used - 1); + errno = EINVAL; + return -1; + +bad_count: + fprintf(stderr, "Too many user_event items passed\n"); + errno = E2BIG; + return -1; +}
Adds the required APIs to libtracefs to create, manage and write out data to trace events via the user_events kernel mechanism. Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com> --- Makefile | 8 + include/tracefs-local.h | 24 ++ include/tracefs.h | 60 +++++ src/Makefile | 4 + src/tracefs-userevents.c | 545 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 641 insertions(+) create mode 100644 src/tracefs-userevents.c