
[v1,2/8] tracing/ftrace: guard syscall probe with preempt_notrace

Message ID 20241003151638.1608537-3-mathieu.desnoyers@efficios.com (mailing list archive)
State Superseded
Series tracing: Allow system call tracepoints to handle page faults

Commit Message

Mathieu Desnoyers Oct. 3, 2024, 3:16 p.m. UTC
In preparation for allowing system call enter/exit instrumentation to
handle page faults, make sure that ftrace can handle this change by
explicitly disabling preemption within the ftrace system call tracepoint
probes to respect the current expectations within ftrace ring buffer
code.

This change does not yet allow ftrace to take page faults per se within
its probe, but allows its existing probes to adapt to the upcoming
change.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Cc: Michael Jeanson <mjeanson@efficios.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Yonghong Song <yhs@fb.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Cc: bpf@vger.kernel.org
Cc: Joel Fernandes <joel@joelfernandes.org>
---
 include/trace/trace_events.h  | 38 ++++++++++++++++++++++++++++-------
 kernel/trace/trace_syscalls.c | 12 +++++++++++
 2 files changed, 43 insertions(+), 7 deletions(-)
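
As a rough illustration of the mechanism (not part of the patch itself): guard(preempt_notrace)() is a scope-based helper, and assuming the usual DEFINE_LOCK_GUARD_0()-based guard class in <linux/preempt.h>, its effect inside a probe is approximately the following sketch.

static void example_syscall_probe(void *data, struct pt_regs *regs, long id)
{
	/*
	 * Disables preemption (notrace variant) here and re-enables it
	 * automatically on every return path of the function, roughly
	 * equivalent to wrapping the rest of the body in a
	 * preempt_disable_notrace()/preempt_enable_notrace() pair.
	 */
	guard(preempt_notrace)();

	/* ... probe body that may rely on preemption being disabled ... */
}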

Comments

Steven Rostedt Oct. 3, 2024, 10:23 p.m. UTC | #1
On Thu,  3 Oct 2024 11:16:32 -0400
Mathieu Desnoyers <mathieu.desnoyers@efficios.com> wrote:

> In preparation for allowing system call enter/exit instrumentation to
> handle page faults, make sure that ftrace can handle this change by
> explicitly disabling preemption within the ftrace system call tracepoint
> probes to respect the current expectations within ftrace ring buffer
> code.

The ftrace ring buffer doesn't expect preemption to be disabled before use.
It will explicitly disable preemption itself.

I don't think this patch is needed.

-- Steve


> 
> This change does not yet allow ftrace to take page faults per se within
> its probe, but allows its existing probes to adapt to the upcoming
> change.
> 
> Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> Cc: Michael Jeanson <mjeanson@efficios.com>
> Cc: Steven Rostedt <rostedt@goodmis.org>
> Cc: Masami Hiramatsu <mhiramat@kernel.org>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Alexei Starovoitov <ast@kernel.org>
> Cc: Yonghong Song <yhs@fb.com>
> Cc: Paul E. McKenney <paulmck@kernel.org>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
> Cc: Mark Rutland <mark.rutland@arm.com>
> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
> Cc: Namhyung Kim <namhyung@kernel.org>
> Cc: Andrii Nakryiko <andrii.nakryiko@gmail.com>
> Cc: bpf@vger.kernel.org
> Cc: Joel Fernandes <joel@joelfernandes.org>
Mathieu Desnoyers Oct. 4, 2024, 12:26 a.m. UTC | #2
On 2024-10-04 00:23, Steven Rostedt wrote:
> On Thu,  3 Oct 2024 11:16:32 -0400
> Mathieu Desnoyers <mathieu.desnoyers@efficios.com> wrote:
> 
>> In preparation for allowing system call enter/exit instrumentation to
>> handle page faults, make sure that ftrace can handle this change by
>> explicitly disabling preemption within the ftrace system call tracepoint
>> probes to respect the current expectations within ftrace ring buffer
>> code.
> 
> The ftrace ring buffer doesn't expect preemption being disabled before use.
> It will explicitly disable preemption.
> 
> I don't think this patch is needed.

Steve,

Look here:

static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
{
         struct trace_array *tr = data;
         struct trace_event_file *trace_file;
         struct syscall_trace_enter *entry;
         struct syscall_metadata *sys_data;
         struct trace_event_buffer fbuffer;
         unsigned long args[6];
         int syscall_nr;
         int size;

         syscall_nr = trace_get_syscall_nr(current, regs);
         if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
                 return;

         /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */
         trace_file = rcu_dereference_sched(tr->enter_syscall_files[syscall_nr]);

^^^^ this function explicitly states that preempt needs to be disabled by
tracepoints.

         if (!trace_file)
                 return;

         if (trace_trigger_soft_disabled(trace_file))
                 return;

         sys_data = syscall_nr_to_meta(syscall_nr);
         if (!sys_data)
                 return;

         size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;

         entry = trace_event_buffer_reserve(&fbuffer, trace_file, size);
^^^^ it reserves space in the ring buffer without disabling preemption explicitly.

And also:

void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
                                  struct trace_event_file *trace_file,
                                  unsigned long len)
{
         struct trace_event_call *event_call = trace_file->event_call;

         if ((trace_file->flags & EVENT_FILE_FL_PID_FILTER) &&
             trace_event_ignore_this_pid(trace_file))
                 return NULL;

         /*
          * If CONFIG_PREEMPTION is enabled, then the tracepoint itself disables
          * preemption (adding one to the preempt_count). Since we are
          * interested in the preempt_count at the time the tracepoint was
          * hit, we need to subtract one to offset the increment.
          */
^^^ This function also explicitly expects preemption to be disabled.

So I rest my case. The change I'm introducing for tracepoints
doesn't make any assumptions about whether each tracer requires
preempt off or not: it keeps the behavior the _same_ as it was before.

Then it's up to each tracer's developer to change the behavior of their
own callbacks as they see fit. But I'm not introducing regressions in
tracers with the "big switch" change of making syscall tracepoints
faultable. This will belong to changes that are specific to each tracer.
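
To make the before/after concrete, here is a rough sketch of the calling contract, simplified from __DO_TRACE() with arguments elided:

/*
 * Before this series: every probe ran inside an RCU-sched read-side
 * critical section, which also disables preemption, so probes could
 * rely on it implicitly:
 */
rcu_read_lock_sched_notrace();
ftrace_syscall_enter(data, regs, id);	/* preemption already off here */
rcu_read_unlock_sched_notrace();

/*
 * After this series (faultable syscall tracepoints): the syscall
 * tracepoints are invoked with preemption enabled so the probe may
 * eventually take page faults, hence the probe disables preemption
 * itself:
 */
ftrace_syscall_enter(data, regs, id);	/* uses guard(preempt_notrace)() internally */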

Thanks,

Mathieu

> 
> -- Steve
> 
> 
>>
>> This change does not yet allow ftrace to take page faults per se within
>> its probe, but allows its existing probes to adapt to the upcoming
>> change.
>>
>> Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
>> Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
>> Cc: Michael Jeanson <mjeanson@efficios.com>
>> Cc: Steven Rostedt <rostedt@goodmis.org>
>> Cc: Masami Hiramatsu <mhiramat@kernel.org>
>> Cc: Peter Zijlstra <peterz@infradead.org>
>> Cc: Alexei Starovoitov <ast@kernel.org>
>> Cc: Yonghong Song <yhs@fb.com>
>> Cc: Paul E. McKenney <paulmck@kernel.org>
>> Cc: Ingo Molnar <mingo@redhat.com>
>> Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
>> Cc: Mark Rutland <mark.rutland@arm.com>
>> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
>> Cc: Namhyung Kim <namhyung@kernel.org>
>> Cc: Andrii Nakryiko <andrii.nakryiko@gmail.com>
>> Cc: bpf@vger.kernel.org
>> Cc: Joel Fernandes <joel@joelfernandes.org>
Steven Rostedt Oct. 4, 2024, 1:04 a.m. UTC | #3
On Thu, 3 Oct 2024 20:26:29 -0400
Mathieu Desnoyers <mathieu.desnoyers@efficios.com> wrote:


> static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
> {
>          struct trace_array *tr = data;
>          struct trace_event_file *trace_file;
>          struct syscall_trace_enter *entry;
>          struct syscall_metadata *sys_data;
>          struct trace_event_buffer fbuffer;
>          unsigned long args[6];
>          int syscall_nr;
>          int size;
> 
>          syscall_nr = trace_get_syscall_nr(current, regs);
>          if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
>                  return;
> 
>          /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */
>          trace_file = rcu_dereference_sched(tr->enter_syscall_files[syscall_nr]);
> 
> ^^^^ this function explicitly states that preempt needs to be disabled by
> tracepoints.

Ah, I should have known it was the syscall portion. I don't care for this
hidden dependency. I'd rather add a preempt disable here and not expect it to
be disabled when called.

> 
>          if (!trace_file)
>                  return;
> 
>          if (trace_trigger_soft_disabled(trace_file))
>                  return;
> 
>          sys_data = syscall_nr_to_meta(syscall_nr);
>          if (!sys_data)
>                  return;
> 
>          size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
> 
>          entry = trace_event_buffer_reserve(&fbuffer, trace_file, size);
> ^^^^ it reserves space in the ring buffer without disabling preemption explicitly.
> 
> And also:
> 
> void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
>                                   struct trace_event_file *trace_file,
>                                   unsigned long len)
> {
>          struct trace_event_call *event_call = trace_file->event_call;
> 
>          if ((trace_file->flags & EVENT_FILE_FL_PID_FILTER) &&
>              trace_event_ignore_this_pid(trace_file))
>                  return NULL;
> 
>          /*
>           * If CONFIG_PREEMPTION is enabled, then the tracepoint itself disables
>           * preemption (adding one to the preempt_count). Since we are
>           * interested in the preempt_count at the time the tracepoint was
>           * hit, we need to subtract one to offset the increment.
>           */
> ^^^ This function also explicitly expects preemption to be disabled.
> 
> So I rest my case. The change I'm introducing for tracepoints
> don't make any assumptions about whether or not each tracer require
> preempt off or not: it keeps the behavior the _same_ as it was before.
> 
> Then it's up to each tracer's developer to change the behavior of their
> own callbacks as they see fit. But I'm not introducing regressions in
> tracers with the "big switch" change of making syscall tracepoints
> faultable. This will belong to changes that are specific to each tracer.


I'd rather remove these dependencies at the source. So, IMHO, these places
should be "fixed" first.

At least for the ftrace users. But I think the same can be done for the
other users as well. BPF already stated it just needs "migrate_disable()".
Let's see what perf has.

We can then audit all the tracepoint users to make sure they do not need
preemption disabled.

-- Steve
Mathieu Desnoyers Oct. 4, 2024, 1:33 a.m. UTC | #4
On 2024-10-04 03:04, Steven Rostedt wrote:
> On Thu, 3 Oct 2024 20:26:29 -0400
> Mathieu Desnoyers <mathieu.desnoyers@efficios.com> wrote:
> 
> 
>> static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
>> {
>>           struct trace_array *tr = data;
>>           struct trace_event_file *trace_file;
>>           struct syscall_trace_enter *entry;
>>           struct syscall_metadata *sys_data;
>>           struct trace_event_buffer fbuffer;
>>           unsigned long args[6];
>>           int syscall_nr;
>>           int size;
>>
>>           syscall_nr = trace_get_syscall_nr(current, regs);
>>           if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
>>                   return;
>>
>>           /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */
>>           trace_file = rcu_dereference_sched(tr->enter_syscall_files[syscall_nr]);
>>
>> ^^^^ this function explicitly states that preempt needs to be disabled by
>> tracepoints.
> 
> Ah, I should have known it was the syscall portion. I don't care for this
> hidden dependency. I rather add a preempt disable here and not expect it to
> be disabled when called.

Which is exactly what this patch is doing.

> 
>>
>>           if (!trace_file)
>>                   return;
>>
>>           if (trace_trigger_soft_disabled(trace_file))
>>                   return;
>>
>>           sys_data = syscall_nr_to_meta(syscall_nr);
>>           if (!sys_data)
>>                   return;
>>
>>           size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
>>
>>           entry = trace_event_buffer_reserve(&fbuffer, trace_file, size);
>> ^^^^ it reserves space in the ring buffer without disabling preemption explicitly.
>>
>> And also:
>>
>> void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
>>                                    struct trace_event_file *trace_file,
>>                                    unsigned long len)
>> {
>>           struct trace_event_call *event_call = trace_file->event_call;
>>
>>           if ((trace_file->flags & EVENT_FILE_FL_PID_FILTER) &&
>>               trace_event_ignore_this_pid(trace_file))
>>                   return NULL;
>>
>>           /*
>>            * If CONFIG_PREEMPTION is enabled, then the tracepoint itself disables
>>            * preemption (adding one to the preempt_count). Since we are
>>            * interested in the preempt_count at the time the tracepoint was
>>            * hit, we need to subtract one to offset the increment.
>>            */
>> ^^^ This function also explicitly expects preemption to be disabled.
>>
>> So I rest my case. The change I'm introducing for tracepoints
>> don't make any assumptions about whether or not each tracer require
>> preempt off or not: it keeps the behavior the _same_ as it was before.
>>
>> Then it's up to each tracer's developer to change the behavior of their
>> own callbacks as they see fit. But I'm not introducing regressions in
>> tracers with the "big switch" change of making syscall tracepoints
>> faultable. This will belong to changes that are specific to each tracer.
> 
> 
> I rather remove these dependencies at the source. So, IMHO, these places
> should be "fixed" first.
> 
> At least for the ftrace users. But I think the same can be done for the
> other users as well. BPF already stated it just needs "migrate_disable()".
> Let's see what perf has.
> 
> We can then audit all the tracepoint users to make sure they do not need
> preemption disabled.

Why does it need to be a broad refactoring of the entire world? What is
wrong with the simple approach of introducing this tracepoint faultable
syscall support as a no-op from the tracer's perspective?

Then we can build on top and figure out if we want to relax things
on a tracer-per-tracer basis.

Thanks,

Mathieu
Steven Rostedt Oct. 4, 2024, 1:26 p.m. UTC | #5
On Thu, 3 Oct 2024 21:33:16 -0400
Mathieu Desnoyers <mathieu.desnoyers@efficios.com> wrote:

> On 2024-10-04 03:04, Steven Rostedt wrote:
> > On Thu, 3 Oct 2024 20:26:29 -0400
> > Mathieu Desnoyers <mathieu.desnoyers@efficios.com> wrote:
> > 
> >   
> >> static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
> >> {
> >>           struct trace_array *tr = data;
> >>           struct trace_event_file *trace_file;
> >>           struct syscall_trace_enter *entry;
> >>           struct syscall_metadata *sys_data;
> >>           struct trace_event_buffer fbuffer;
> >>           unsigned long args[6];
> >>           int syscall_nr;
> >>           int size;
> >>
> >>           syscall_nr = trace_get_syscall_nr(current, regs);
> >>           if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
> >>                   return;
> >>
> >>           /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */
> >>           trace_file = rcu_dereference_sched(tr->enter_syscall_files[syscall_nr]);
> >>
> >> ^^^^ this function explicitly states that preempt needs to be disabled by
> >> tracepoints.  
> > 
> > Ah, I should have known it was the syscall portion. I don't care for this
> > hidden dependency. I rather add a preempt disable here and not expect it to
> > be disabled when called.  
> 
> Which is exactly what this patch is doing.

I was thinking of putting the protection in the function and not the macro.

> 
> >   
> >>
> >>           if (!trace_file)
> >>                   return;
> >>
> >>           if (trace_trigger_soft_disabled(trace_file))
> >>                   return;
> >>
> >>           sys_data = syscall_nr_to_meta(syscall_nr);
> >>           if (!sys_data)
> >>                   return;
> >>
> >>           size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
> >>
> >>           entry = trace_event_buffer_reserve(&fbuffer, trace_file, size);
> >> ^^^^ it reserves space in the ring buffer without disabling preemption explicitly.
> >>
> >> And also:
> >>
> >> void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
> >>                                    struct trace_event_file *trace_file,
> >>                                    unsigned long len)
> >> {
> >>           struct trace_event_call *event_call = trace_file->event_call;
> >>
> >>           if ((trace_file->flags & EVENT_FILE_FL_PID_FILTER) &&
> >>               trace_event_ignore_this_pid(trace_file))
> >>                   return NULL;
> >>
> >>           /*
> >>            * If CONFIG_PREEMPTION is enabled, then the tracepoint itself disables
> >>            * preemption (adding one to the preempt_count). Since we are
> >>            * interested in the preempt_count at the time the tracepoint was
> >>            * hit, we need to subtract one to offset the increment.
> >>            */
> >> ^^^ This function also explicitly expects preemption to be disabled.
> >>
> >> So I rest my case. The change I'm introducing for tracepoints
> >> don't make any assumptions about whether or not each tracer require
> >> preempt off or not: it keeps the behavior the _same_ as it was before.
> >>
> >> Then it's up to each tracer's developer to change the behavior of their
> >> own callbacks as they see fit. But I'm not introducing regressions in
> >> tracers with the "big switch" change of making syscall tracepoints
> >> faultable. This will belong to changes that are specific to each tracer.  
> > 
> > 
> > I rather remove these dependencies at the source. So, IMHO, these places
> > should be "fixed" first.
> > 
> > At least for the ftrace users. But I think the same can be done for the
> > other users as well. BPF already stated it just needs "migrate_disable()".
> > Let's see what perf has.
> > 
> > We can then audit all the tracepoint users to make sure they do not need
> > preemption disabled.  
> 
> Why does it need to be a broad refactoring of the entire world ? What is
> wrong with the simple approach of introducing this tracepoint faultable
> syscall support as a no-op from the tracer's perspective ?

Because we want in-tree users too ;-)

> 
> Then we can build on top and figure out if we want to relax things
> on a tracer-per-tracer basis.

Looking deeper into how ftrace can implement this, it may require some more
work. Doing it your way may be fine for now, but we need this working for
something in-tree instead of having it only work for LTTng.

Note, it doesn't have to be ftrace either. It could be perf or BPF. Or
simply the sframe code (doing stack traces at the entry of system calls).

-- Steve
Mathieu Desnoyers Oct. 4, 2024, 2:18 p.m. UTC | #6
On 2024-10-04 15:26, Steven Rostedt wrote:
> On Thu, 3 Oct 2024 21:33:16 -0400
> Mathieu Desnoyers <mathieu.desnoyers@efficios.com> wrote:
> 
>> On 2024-10-04 03:04, Steven Rostedt wrote:
>>> On Thu, 3 Oct 2024 20:26:29 -0400
>>> Mathieu Desnoyers <mathieu.desnoyers@efficios.com> wrote:
>>>
>>>    
>>>> static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
>>>> {
>>>>            struct trace_array *tr = data;
>>>>            struct trace_event_file *trace_file;
>>>>            struct syscall_trace_enter *entry;
>>>>            struct syscall_metadata *sys_data;
>>>>            struct trace_event_buffer fbuffer;
>>>>            unsigned long args[6];
>>>>            int syscall_nr;
>>>>            int size;
>>>>
>>>>            syscall_nr = trace_get_syscall_nr(current, regs);
>>>>            if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
>>>>                    return;
>>>>
>>>>            /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */
>>>>            trace_file = rcu_dereference_sched(tr->enter_syscall_files[syscall_nr]);
>>>>
>>>> ^^^^ this function explicitly states that preempt needs to be disabled by
>>>> tracepoints.
>>>
>>> Ah, I should have known it was the syscall portion. I don't care for this
>>> hidden dependency. I rather add a preempt disable here and not expect it to
>>> be disabled when called.
>>
>> Which is exactly what this patch is doing.
> 
> I was thinking of putting the protection in the function and not the macro.

I'm confused by your comment. The protection is added to the function here:

diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 67ac5366f724..ab4db8c23f36 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -299,6 +299,12 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
         int syscall_nr;
         int size;
  
+       /*
+        * Syscall probe called with preemption enabled, but the ring
+        * buffer and per-cpu data require preemption to be disabled.
+        */
+       guard(preempt_notrace)();
+
         syscall_nr = trace_get_syscall_nr(current, regs);
         if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
                 return;
@@ -338,6 +344,12 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
         struct trace_event_buffer fbuffer;
         int syscall_nr;
  
+       /*
+        * Syscall probe called with preemption enabled, but the ring
+        * buffer and per-cpu data require preemption to be disabled.
+        */
+       guard(preempt_notrace)();
+
         syscall_nr = trace_get_syscall_nr(current, regs);
         if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
                 return;

(I'll answer the rest of your message in a separate email)

Thanks,

Mathieu
Mathieu Desnoyers Oct. 4, 2024, 2:19 p.m. UTC | #7
On 2024-10-04 15:26, Steven Rostedt wrote:
> On Thu, 3 Oct 2024 21:33:16 -0400
> Mathieu Desnoyers <mathieu.desnoyers@efficios.com> wrote:
> 
>> On 2024-10-04 03:04, Steven Rostedt wrote:
>>> On Thu, 3 Oct 2024 20:26:29 -0400
>>> Mathieu Desnoyers <mathieu.desnoyers@efficios.com> wrote:
>>>

[...]

>>>> So I rest my case. The change I'm introducing for tracepoints
>>>> don't make any assumptions about whether or not each tracer require
>>>> preempt off or not: it keeps the behavior the _same_ as it was before.
>>>>
>>>> Then it's up to each tracer's developer to change the behavior of their
>>>> own callbacks as they see fit. But I'm not introducing regressions in
>>>> tracers with the "big switch" change of making syscall tracepoints
>>>> faultable. This will belong to changes that are specific to each tracer.
>>>
>>>
>>> I rather remove these dependencies at the source. So, IMHO, these places
>>> should be "fixed" first.
>>>
>>> At least for the ftrace users. But I think the same can be done for the
>>> other users as well. BPF already stated it just needs "migrate_disable()".
>>> Let's see what perf has.
>>>
>>> We can then audit all the tracepoint users to make sure they do not need
>>> preemption disabled.
>>
>> Why does it need to be a broad refactoring of the entire world ? What is
>> wrong with the simple approach of introducing this tracepoint faultable
>> syscall support as a no-op from the tracer's perspective ?
> 
> Because we want in-tree users too ;-)

This series is infrastructure work that allows all in-tree tracers to
start handling page faults for sys enter/exit events. Can we simply
do the tracer-specific work on top of this infrastructure series rather
than do everything at once?

Regarding test coverage, this series modifies each tracer's syscall probe
code to add a might_fault(), which ensures a page fault can indeed be
serviced at this point.
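
For illustration, a hedged sketch of how that assertion relates to the guard added in this patch (the exact placement in the series may differ, and the function name here is only illustrative):

static void ftrace_syscall_enter_sketch(void *data, struct pt_regs *regs, long id)
{
	/*
	 * Assert (with the relevant debug options enabled) that this probe
	 * is entered in a context where a page fault could be serviced.
	 */
	might_fault();

	/*
	 * Only then narrow the context: the ring buffer and per-cpu data
	 * still require preemption to be disabled.
	 */
	guard(preempt_notrace)();

	/* ... reserve, fill and commit the event as before ... */
}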

>>
>> Then we can build on top and figure out if we want to relax things
>> on a tracer-per-tracer basis.
> 
> Looking deeper into how ftrace can implement this, it may require some more
> work. Doing it your way may be fine for now, but we need this working for
> something in-tree instead of having it only work for LTTng.

There is nothing LTTng-specific here. LTTng only has feature branches
to test it out for now. Once we have the infrastructure in place we
can discuss how each tracer can use this. I've recently enumerated
various approaches that can be taken by tracers to handle page faults:

https://lore.kernel.org/lkml/c2a2db4b-4409-4f3c-9959-53622fd8dfa7@efficios.com/

> Note, it doesn't have to be ftrace either. It could be perf or BPF. Or
> simply the sframe code (doing stack traces at the entry of system calls).

Steven, we've been working on faultable tracepoints for four years now:

https://lore.kernel.org/lkml/20201023195352.26269-1-mjeanson@efficios.com/ [2020]
https://lore.kernel.org/lkml/20210218222125.46565-1-mjeanson@efficios.com/ [2021]
https://lore.kernel.org/lkml/20231002202531.3160-1-mathieu.desnoyers@efficios.com/ [2023]
https://lore.kernel.org/lkml/20231120205418.334172-1-mathieu.desnoyers@efficios.com/ [2023]
https://lore.kernel.org/lkml/20240626185941.68420-1-mathieu.desnoyers@efficios.com/ [2024]
https://lore.kernel.org/lkml/20240828144153.829582-1-mathieu.desnoyers@efficios.com/ [2024]
https://lore.kernel.org/lkml/20240909201652.319406-1-mathieu.desnoyers@efficios.com/ [2024]
https://lore.kernel.org/lkml/20241003151638.1608537-1-mathieu.desnoyers@efficios.com/ [2024] (current)

The eBPF people want to leverage this. When I last discussed this with
eBPF maintainers, they were open to adapting eBPF after this infrastructure
series is merged. Based on this eBPF attempt from 2022:

https://lore.kernel.org/lkml/c323bce9-a04e-b1c3-580a-783fde259d60@fb.com/

The sframe code is just getting in shape (2024), but is far from being ready.

Everyone appears to be waiting for this infrastructure work to go in
before they can build on top. Once this infrastructure is available,
multiple groups can start working on introducing use of this into their
own code in parallel.

Four years into this effort, and this is the first time we're told we need
to adapt in-tree tracers to handle the page faults before this can go in.

Could you please stop moving the goal posts?

Thanks,

Mathieu
Steven Rostedt Oct. 4, 2024, 2:45 p.m. UTC | #8
On Fri, 4 Oct 2024 10:18:59 -0400
Mathieu Desnoyers <mathieu.desnoyers@efficios.com> wrote:

> On 2024-10-04 15:26, Steven Rostedt wrote:
> > On Thu, 3 Oct 2024 21:33:16 -0400
> > Mathieu Desnoyers <mathieu.desnoyers@efficios.com> wrote:
> >   
> >> On 2024-10-04 03:04, Steven Rostedt wrote:  
> >>> On Thu, 3 Oct 2024 20:26:29 -0400
> >>> Mathieu Desnoyers <mathieu.desnoyers@efficios.com> wrote:
> >>>
> >>>      
> >>>> static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
> >>>> {
> >>>>            struct trace_array *tr = data;
> >>>>            struct trace_event_file *trace_file;
> >>>>            struct syscall_trace_enter *entry;
> >>>>            struct syscall_metadata *sys_data;
> >>>>            struct trace_event_buffer fbuffer;
> >>>>            unsigned long args[6];
> >>>>            int syscall_nr;
> >>>>            int size;
> >>>>
> >>>>            syscall_nr = trace_get_syscall_nr(current, regs);
> >>>>            if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
> >>>>                    return;
> >>>>
> >>>>            /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */
> >>>>            trace_file = rcu_dereference_sched(tr->enter_syscall_files[syscall_nr]);
> >>>>
> >>>> ^^^^ this function explicitly states that preempt needs to be disabled by
> >>>> tracepoints.  
> >>>
> >>> Ah, I should have known it was the syscall portion. I don't care for this
> >>> hidden dependency. I rather add a preempt disable here and not expect it to
> >>> be disabled when called.  
> >>
> >> Which is exactly what this patch is doing.  
> > 
> > I was thinking of putting the protection in the function and not the macro.  
> 
> I'm confused by your comment. The protection is added to the function here:

Ah, sorry. I'm the one confused. I was talking about this part:

> +#undef DECLARE_EVENT_SYSCALL_CLASS
> +#define DECLARE_EVENT_SYSCALL_CLASS(call, proto, args, tstruct, assign, print) \
> +__DECLARE_EVENT_CLASS(call, PARAMS(proto), PARAMS(args), PARAMS(tstruct), \
> +		      PARAMS(assign), PARAMS(print))			\
> +static notrace void							\
> +trace_event_raw_event_##call(void *__data, proto)			\
> +{									\
> +	guard(preempt_notrace)();					\
> +	do_trace_event_raw_event_##call(__data, args);			\
> +}
> +

But that's for the non-syscall case.

This is why I shouldn't review patches just before going to bed :-p

-- Steve
Mathieu Desnoyers Oct. 4, 2024, 2:51 p.m. UTC | #9
On 2024-10-04 16:52, Steven Rostedt wrote:
> On Fri, 4 Oct 2024 10:19:36 -0400
> Mathieu Desnoyers <mathieu.desnoyers@efficios.com> wrote:
> 
>> The eBPF people want to leverage this. When I last discussed this with
>> eBPF maintainers, they were open to adapt eBPF after this infrastructure
>> series is merged. Based on this eBPF attempt from 2022:
>>
>> https://lore.kernel.org/lkml/c323bce9-a04e-b1c3-580a-783fde259d60@fb.com/
> 
> Sorry, I wasn't part of that discussion.
> 
>>
>> The sframe code is just getting in shape (2024), but is far from being ready.
>>
>> Everyone appears to be waiting for this infrastructure work to go in
>> before they can build on top. Once this infrastructure is available,
>> multiple groups can start working on introducing use of this into their
>> own code in parallel.
>>
>> Four years into this effort, and this is the first time we're told we need
>> to adapt in-tree tracers to handle the page faults before this can go in.
>>
>> Could you please stop moving the goal posts ?
> 
> I don't think I'm moving the goal posts. I was mentioning to show an
> in-tree user. If BPF wants this, I'm all for it. The only thing I saw was a
> generalization in the cover letter about perf, bpf and ftrace using
> faultible tracepoints. I just wanted to see a path for that happening.

AFAIU eBPF folks are very eager to start making use of this, so we won't
have to wait long.

Thanks,

Mathieu
Steven Rostedt Oct. 4, 2024, 2:52 p.m. UTC | #10
On Fri, 4 Oct 2024 10:19:36 -0400
Mathieu Desnoyers <mathieu.desnoyers@efficios.com> wrote:

> The eBPF people want to leverage this. When I last discussed this with
> eBPF maintainers, they were open to adapt eBPF after this infrastructure
> series is merged. Based on this eBPF attempt from 2022:
> 
> https://lore.kernel.org/lkml/c323bce9-a04e-b1c3-580a-783fde259d60@fb.com/

Sorry, I wasn't part of that discussion.

> 
> The sframe code is just getting in shape (2024), but is far from being ready.
> 
> Everyone appears to be waiting for this infrastructure work to go in
> before they can build on top. Once this infrastructure is available,
> multiple groups can start working on introducing use of this into their
> own code in parallel.
> 
> Four years into this effort, and this is the first time we're told we need
> to adapt in-tree tracers to handle the page faults before this can go in.
> 
> Could you please stop moving the goal posts ?

I don't think I'm moving the goal posts. I was mentioning it to show an
in-tree user. If BPF wants this, I'm all for it. The only thing I saw was a
generalization in the cover letter about perf, bpf and ftrace using
faultable tracepoints. I just wanted to see a path for that happening.

-- Steve
Andrii Nakryiko Oct. 4, 2024, 8:04 p.m. UTC | #11
On Fri, Oct 4, 2024 at 7:53 AM Mathieu Desnoyers
<mathieu.desnoyers@efficios.com> wrote:
>
> On 2024-10-04 16:52, Steven Rostedt wrote:
> > On Fri, 4 Oct 2024 10:19:36 -0400
> > Mathieu Desnoyers <mathieu.desnoyers@efficios.com> wrote:
> >
> >> The eBPF people want to leverage this. When I last discussed this with
> >> eBPF maintainers, they were open to adapt eBPF after this infrastructure
> >> series is merged. Based on this eBPF attempt from 2022:
> >>
> >> https://lore.kernel.org/lkml/c323bce9-a04e-b1c3-580a-783fde259d60@fb.com/
> >
> > Sorry, I wasn't part of that discussion.
> >
> >>
> >> The sframe code is just getting in shape (2024), but is far from being ready.
> >>
> >> Everyone appears to be waiting for this infrastructure work to go in
> >> before they can build on top. Once this infrastructure is available,
> >> multiple groups can start working on introducing use of this into their
> >> own code in parallel.
> >>
> >> Four years into this effort, and this is the first time we're told we need
> >> to adapt in-tree tracers to handle the page faults before this can go in.
> >>
> >> Could you please stop moving the goal posts ?
> >
> > I don't think I'm moving the goal posts. I was mentioning to show an
> > in-tree user. If BPF wants this, I'm all for it. The only thing I saw was a
> > generalization in the cover letter about perf, bpf and ftrace using
> > faultible tracepoints. I just wanted to see a path for that happening.
>
> AFAIU eBPF folks are very eager to start making use of this, so we won't
> have to wait long.

I already gave my ack on BPF parts of this patch set, but I'll
elaborate a bit more here for the record. There seem to be two things
that have been discussed.

First, preempt_disable() vs migrate_disable(). We only need the
latter, but the former just preserves current behavior and I think
it's fine; we can follow up with BPF-specific bits later to optimize
and clean this up further. No big deal.

Second, whether BPF can utilize sleepable (faultable) tracepoints
right now with these changes. No, we need a bit more work (again, in
BPF-specific parts) to allow faultable tracepoint attachment for BPF
programs. But it's a bit of a nuanced piece of code to get everything
right, and it's best done by someone more familiar with BPF internals.
So I wouldn't expect Mathieu to do this either.

So, tl;dr, I think patches are fine as-is (from BPF perspective), and
we'd like to see them applied and get to bpf-next for further
development on top of that.
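
For the record, a hypothetical sketch of the BPF-side relaxation mentioned above — it is not part of this series, and the function and argument names are illustrative only:

/*
 * Hypothetical follow-up: a BPF syscall probe only needs to stay on one
 * CPU while the program runs, so it could use migrate_disable() instead
 * of disabling preemption.
 */
static void bpf_syscall_probe_sketch(struct bpf_prog *prog, u64 *args)
{
	migrate_disable();	/* pin to this CPU; remain preemptible */
	/* run the attached BPF program on args here */
	migrate_enable();
}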

>
> Thanks,
>
> Mathieu
>
> --
> Mathieu Desnoyers
> EfficiOS Inc.
> https://www.efficios.com
>
Steven Rostedt Oct. 4, 2024, 8:59 p.m. UTC | #12
On Fri, 4 Oct 2024 13:04:21 -0700
Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote:

> > AFAIU eBPF folks are very eager to start making use of this, so we won't
> > have to wait long.  
> 
> I already gave my ack on BPF parts of this patch set, but I'll
> elaborate a bit more here for the record. There seems to be two things
> that's been discussed.
> 
> First, preempt_disable() vs migrate_disable(). We only need the
> latter, but the former just preserves current behavior and I think
> it's fine, we can follow up with BPF-specific bits later to optimize
> and clean this up further. No big deal.
> 
> Second, whether BPF can utilize sleepable (faultable) tracepoints
> right now with these changes. No, we need a bit more work (again, in
> BPF specific parts) to allow faultable tracepoint attachment for BPF
> programs. But it's a bit nuanced piece of code to get everything
> right, and it's best done by someone more familiar with BPF internals.
> So I wouldn't expect Mathieu to do this either.
> 
> So, tl;dr, I think patches are fine as-is (from BPF perspective), and
> we'd like to see them applied and get to bpf-next for further
> development on top of that.

Thanks Andrii for elaborating.

-- Steve

Patch

diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h
index 8bcbb9ee44de..0228d9ed94a3 100644
--- a/include/trace/trace_events.h
+++ b/include/trace/trace_events.h
@@ -263,6 +263,9 @@  static struct trace_event_fields trace_event_fields_##call[] = {	\
 	tstruct								\
 	{} };
 
+#undef DECLARE_EVENT_SYSCALL_CLASS
+#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS
+
 #undef DEFINE_EVENT_PRINT
 #define DEFINE_EVENT_PRINT(template, name, proto, args, print)
 
@@ -396,11 +399,11 @@  static inline notrace int trace_event_get_offsets_##call(		\
 
 #include "stages/stage6_event_callback.h"
 
-#undef DECLARE_EVENT_CLASS
-#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)	\
-									\
+
+#undef __DECLARE_EVENT_CLASS
+#define __DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
 static notrace void							\
-trace_event_raw_event_##call(void *__data, proto)			\
+do_trace_event_raw_event_##call(void *__data, proto)			\
 {									\
 	struct trace_event_file *trace_file = __data;			\
 	struct trace_event_data_offsets_##call __maybe_unused __data_offsets;\
@@ -425,15 +428,34 @@  trace_event_raw_event_##call(void *__data, proto)			\
 									\
 	trace_event_buffer_commit(&fbuffer);				\
 }
+
+#undef DECLARE_EVENT_CLASS
+#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)	\
+__DECLARE_EVENT_CLASS(call, PARAMS(proto), PARAMS(args), PARAMS(tstruct), \
+		      PARAMS(assign), PARAMS(print))			\
+static notrace void							\
+trace_event_raw_event_##call(void *__data, proto)			\
+{									\
+	do_trace_event_raw_event_##call(__data, args);			\
+}
+
+#undef DECLARE_EVENT_SYSCALL_CLASS
+#define DECLARE_EVENT_SYSCALL_CLASS(call, proto, args, tstruct, assign, print) \
+__DECLARE_EVENT_CLASS(call, PARAMS(proto), PARAMS(args), PARAMS(tstruct), \
+		      PARAMS(assign), PARAMS(print))			\
+static notrace void							\
+trace_event_raw_event_##call(void *__data, proto)			\
+{									\
+	guard(preempt_notrace)();					\
+	do_trace_event_raw_event_##call(__data, args);			\
+}
+
 /*
  * The ftrace_test_probe is compiled out, it is only here as a build time check
  * to make sure that if the tracepoint handling changes, the ftrace probe will
  * fail to compile unless it too is updated.
  */
 
-#undef DECLARE_EVENT_SYSCALL_CLASS
-#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS
-
 #undef DEFINE_EVENT
 #define DEFINE_EVENT(template, call, proto, args)			\
 static inline void ftrace_test_probe_##call(void)			\
@@ -443,6 +465,8 @@  static inline void ftrace_test_probe_##call(void)			\
 
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
+#undef __DECLARE_EVENT_CLASS
+
 #include "stages/stage7_class_define.h"
 
 #undef DECLARE_EVENT_CLASS
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 67ac5366f724..ab4db8c23f36 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -299,6 +299,12 @@  static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
 	int syscall_nr;
 	int size;
 
+	/*
+	 * Syscall probe called with preemption enabled, but the ring
+	 * buffer and per-cpu data require preemption to be disabled.
+	 */
+	guard(preempt_notrace)();
+
 	syscall_nr = trace_get_syscall_nr(current, regs);
 	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
 		return;
@@ -338,6 +344,12 @@  static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
 	struct trace_event_buffer fbuffer;
 	int syscall_nr;
 
+	/*
+	 * Syscall probe called with preemption enabled, but the ring
+	 * buffer and per-cpu data require preemption to be disabled.
+	 */
+	guard(preempt_notrace)();
+
 	syscall_nr = trace_get_syscall_nr(current, regs);
 	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
 		return;
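
For reference, a simplified sketch of the code generated by the syscall event class after the trace_events.h change above, shown here for the sys_enter event (field setup, filtering and commit are elided; the real expansion also handles dynamic arrays):

static notrace void
do_trace_event_raw_event_sys_enter(void *__data, struct pt_regs *regs, long id)
{
	/* original class body: reserve, assign and commit the trace event */
}

static notrace void
trace_event_raw_event_sys_enter(void *__data, struct pt_regs *regs, long id)
{
	guard(preempt_notrace)();	/* added by this patch for syscall events */
	do_trace_event_raw_event_sys_enter(__data, regs, id);
}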