diff mbox series

[v2,13/13] perf machine thread: Remove exited threads by default

Message ID 20231012062359.1616786-14-irogers@google.com (mailing list archive)
State Not Applicable
Headers show
Series Improvements to memory use | expand

Checks

Context Check Description
netdev/tree_selection success Not a local patch

Commit Message

Ian Rogers Oct. 12, 2023, 6:23 a.m. UTC
struct thread values hold onto references to mmaps, dsos, etc. When a
thread exits it is necessary to clean all of this memory up by
removing the thread from the machine's threads. Some tools require
this doesn't happen, such as perf report if offcpu events exist or if
a task list is being generated, so add a symbol_conf value to make the
behavior optional. When an exited thread is left in the machine's
threads, mark it as exited.

This change relates to commit 40826c45eb0b ("perf thread: Remove
notion of dead threads"). Dead threads were removed as they had a
reference count of 0 and were difficult to reason about with the
reference count checker. Here a thread is removed from threads when it
exits, unless via symbol_conf the exited thread isn't remove and is
marked as exited. Reference counting behaves as it normally does.

Signed-off-by: Ian Rogers <irogers@google.com>
---
 tools/perf/builtin-report.c   |  7 +++++++
 tools/perf/util/machine.c     | 10 +++++++---
 tools/perf/util/symbol_conf.h |  3 ++-
 tools/perf/util/thread.h      | 14 ++++++++++++++
 4 files changed, 30 insertions(+), 4 deletions(-)

Comments

Namhyung Kim Oct. 18, 2023, 11:30 p.m. UTC | #1
On Wed, Oct 11, 2023 at 11:24 PM Ian Rogers <irogers@google.com> wrote:
>
> struct thread values hold onto references to mmaps, dsos, etc. When a
> thread exits it is necessary to clean all of this memory up by
> removing the thread from the machine's threads. Some tools require
> this doesn't happen, such as perf report if offcpu events exist or if
> a task list is being generated, so add a symbol_conf value to make the
> behavior optional. When an exited thread is left in the machine's
> threads, mark it as exited.
>
> This change relates to commit 40826c45eb0b ("perf thread: Remove
> notion of dead threads"). Dead threads were removed as they had a
> reference count of 0 and were difficult to reason about with the
> reference count checker. Here a thread is removed from threads when it
> exits, unless via symbol_conf the exited thread isn't remove and is
> marked as exited. Reference counting behaves as it normally does.

Maybe we can do it the other way around.  IOW tools can access
dead threads for whatever reason if they are dealing with a data
file.  And I guess the main concern is perf top to reduce memory
footprint, right?  Then we can declare to remove the dead threads
for perf top case only IMHO.

Thanks,
Namhyung

>
> Signed-off-by: Ian Rogers <irogers@google.com>
> ---
>  tools/perf/builtin-report.c   |  7 +++++++
>  tools/perf/util/machine.c     | 10 +++++++---
>  tools/perf/util/symbol_conf.h |  3 ++-
>  tools/perf/util/thread.h      | 14 ++++++++++++++
>  4 files changed, 30 insertions(+), 4 deletions(-)
>
> diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
> index dcedfe00f04d..749246817aed 100644
> --- a/tools/perf/builtin-report.c
> +++ b/tools/perf/builtin-report.c
> @@ -1411,6 +1411,13 @@ int cmd_report(int argc, const char **argv)
>         if (ret < 0)
>                 goto exit;
>
> +       /*
> +        * tasks_mode require access to exited threads to list those that are in
> +        * the data file. Off-cpu events are synthesized after other events and
> +        * reference exited threads.
> +        */
> +       symbol_conf.keep_exited_threads = true;
> +
>         annotation_options__init(&report.annotation_opts);
>
>         ret = perf_config(report__config, &report);
> diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
> index 6ca7500e2cf4..5cda47eb337d 100644
> --- a/tools/perf/util/machine.c
> +++ b/tools/perf/util/machine.c
> @@ -2157,9 +2157,13 @@ int machine__process_exit_event(struct machine *machine, union perf_event *event
>         if (dump_trace)
>                 perf_event__fprintf_task(event, stdout);
>
> -       if (thread != NULL)
> -               thread__put(thread);
> -
> +       if (thread != NULL) {
> +               if (symbol_conf.keep_exited_threads)
> +                       thread__set_exited(thread, /*exited=*/true);
> +               else
> +                       machine__remove_thread(machine, thread);
> +       }
> +       thread__put(thread);
>         return 0;
>  }
>
> diff --git a/tools/perf/util/symbol_conf.h b/tools/perf/util/symbol_conf.h
> index 2b2fb9e224b0..6040286e07a6 100644
> --- a/tools/perf/util/symbol_conf.h
> +++ b/tools/perf/util/symbol_conf.h
> @@ -43,7 +43,8 @@ struct symbol_conf {
>                         disable_add2line_warn,
>                         buildid_mmap2,
>                         guest_code,
> -                       lazy_load_kernel_maps;
> +                       lazy_load_kernel_maps,
> +                       keep_exited_threads;
>         const char      *vmlinux_name,
>                         *kallsyms_name,
>                         *source_prefix,
> diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h
> index e79225a0ea46..0df775b5c110 100644
> --- a/tools/perf/util/thread.h
> +++ b/tools/perf/util/thread.h
> @@ -36,13 +36,22 @@ struct thread_rb_node {
>  };
>
>  DECLARE_RC_STRUCT(thread) {
> +       /** @maps: mmaps associated with this thread. */
>         struct maps             *maps;
>         pid_t                   pid_; /* Not all tools update this */
> +       /** @tid: thread ID number unique to a machine. */
>         pid_t                   tid;
> +       /** @ppid: parent process of the process this thread belongs to. */
>         pid_t                   ppid;
>         int                     cpu;
>         int                     guest_cpu; /* For QEMU thread */
>         refcount_t              refcnt;
> +       /**
> +        * @exited: Has the thread had an exit event. Such threads are usually
> +        * removed from the machine's threads but some events/tools require
> +        * access to dead threads.
> +        */
> +       bool                    exited;
>         bool                    comm_set;
>         int                     comm_len;
>         struct list_head        namespaces_list;
> @@ -189,6 +198,11 @@ static inline refcount_t *thread__refcnt(struct thread *thread)
>         return &RC_CHK_ACCESS(thread)->refcnt;
>  }
>
> +static inline void thread__set_exited(struct thread *thread, bool exited)
> +{
> +       RC_CHK_ACCESS(thread)->exited = exited;
> +}
> +
>  static inline bool thread__comm_set(const struct thread *thread)
>  {
>         return RC_CHK_ACCESS(thread)->comm_set;
> --
> 2.42.0.609.gbb76f46606-goog
>
Ian Rogers Oct. 19, 2023, 12:39 a.m. UTC | #2
On Wed, Oct 18, 2023 at 4:30 PM Namhyung Kim <namhyung@kernel.org> wrote:
>
> On Wed, Oct 11, 2023 at 11:24 PM Ian Rogers <irogers@google.com> wrote:
> >
> > struct thread values hold onto references to mmaps, dsos, etc. When a
> > thread exits it is necessary to clean all of this memory up by
> > removing the thread from the machine's threads. Some tools require
> > this doesn't happen, such as perf report if offcpu events exist or if
> > a task list is being generated, so add a symbol_conf value to make the
> > behavior optional. When an exited thread is left in the machine's
> > threads, mark it as exited.
> >
> > This change relates to commit 40826c45eb0b ("perf thread: Remove
> > notion of dead threads"). Dead threads were removed as they had a
> > reference count of 0 and were difficult to reason about with the
> > reference count checker. Here a thread is removed from threads when it
> > exits, unless via symbol_conf the exited thread isn't remove and is
> > marked as exited. Reference counting behaves as it normally does.
>
> Maybe we can do it the other way around.  IOW tools can access
> dead threads for whatever reason if they are dealing with a data
> file.  And I guess the main concern is perf top to reduce memory
> footprint, right?  Then we can declare to remove the dead threads
> for perf top case only IMHO.

Thanks I did consider this but I think this order makes most sense.
The only tool relying on access to dead threads is perf report, but
its uses are questionable:

 - task printing - the tools wants to show all threads from a perf run
and assumes they are in threads. By replacing tool.exit it would be
easy to record dead threads for this, as the maps aren't required the
memory overhead could be improved by just recording the necessary
task's data.

 - offcpu - it would be very useful to have offcpu samples be real
samples, rather than an aggregated sample at the end of the data file
with a timestamp greater-than when the thread exited. These samples
would happen before the exit event is processed and so the reason to
keep dead threads around would be removed. I think we could do some
kind of sample aggregation using BPF, but I think we need to think it
through with exit events. Perhaps we can fix things in the short-term
by generating BPF samples with aggregation when threads exit in the
offcpu BPF code, but then again, if you have this going it seems as
easy just to keep to record all offcpu events as distinct.

Other commands like perf top and perf script don't currently have
dependencies on dead threads and I'm kind of glad, for
understandability, memory footprint, etc. Having the default be that
dead threads linger on will just encourage the kind of issues we see
currently in perf report and having to have every tool, perf script
declare it doesn't want dead threads seems burdensome.

Thanks,
Ian

> Thanks,
> Namhyung
>
> >
> > Signed-off-by: Ian Rogers <irogers@google.com>
> > ---
> >  tools/perf/builtin-report.c   |  7 +++++++
> >  tools/perf/util/machine.c     | 10 +++++++---
> >  tools/perf/util/symbol_conf.h |  3 ++-
> >  tools/perf/util/thread.h      | 14 ++++++++++++++
> >  4 files changed, 30 insertions(+), 4 deletions(-)
> >
> > diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
> > index dcedfe00f04d..749246817aed 100644
> > --- a/tools/perf/builtin-report.c
> > +++ b/tools/perf/builtin-report.c
> > @@ -1411,6 +1411,13 @@ int cmd_report(int argc, const char **argv)
> >         if (ret < 0)
> >                 goto exit;
> >
> > +       /*
> > +        * tasks_mode require access to exited threads to list those that are in
> > +        * the data file. Off-cpu events are synthesized after other events and
> > +        * reference exited threads.
> > +        */
> > +       symbol_conf.keep_exited_threads = true;
> > +
> >         annotation_options__init(&report.annotation_opts);
> >
> >         ret = perf_config(report__config, &report);
> > diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
> > index 6ca7500e2cf4..5cda47eb337d 100644
> > --- a/tools/perf/util/machine.c
> > +++ b/tools/perf/util/machine.c
> > @@ -2157,9 +2157,13 @@ int machine__process_exit_event(struct machine *machine, union perf_event *event
> >         if (dump_trace)
> >                 perf_event__fprintf_task(event, stdout);
> >
> > -       if (thread != NULL)
> > -               thread__put(thread);
> > -
> > +       if (thread != NULL) {
> > +               if (symbol_conf.keep_exited_threads)
> > +                       thread__set_exited(thread, /*exited=*/true);
> > +               else
> > +                       machine__remove_thread(machine, thread);
> > +       }
> > +       thread__put(thread);
> >         return 0;
> >  }
> >
> > diff --git a/tools/perf/util/symbol_conf.h b/tools/perf/util/symbol_conf.h
> > index 2b2fb9e224b0..6040286e07a6 100644
> > --- a/tools/perf/util/symbol_conf.h
> > +++ b/tools/perf/util/symbol_conf.h
> > @@ -43,7 +43,8 @@ struct symbol_conf {
> >                         disable_add2line_warn,
> >                         buildid_mmap2,
> >                         guest_code,
> > -                       lazy_load_kernel_maps;
> > +                       lazy_load_kernel_maps,
> > +                       keep_exited_threads;
> >         const char      *vmlinux_name,
> >                         *kallsyms_name,
> >                         *source_prefix,
> > diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h
> > index e79225a0ea46..0df775b5c110 100644
> > --- a/tools/perf/util/thread.h
> > +++ b/tools/perf/util/thread.h
> > @@ -36,13 +36,22 @@ struct thread_rb_node {
> >  };
> >
> >  DECLARE_RC_STRUCT(thread) {
> > +       /** @maps: mmaps associated with this thread. */
> >         struct maps             *maps;
> >         pid_t                   pid_; /* Not all tools update this */
> > +       /** @tid: thread ID number unique to a machine. */
> >         pid_t                   tid;
> > +       /** @ppid: parent process of the process this thread belongs to. */
> >         pid_t                   ppid;
> >         int                     cpu;
> >         int                     guest_cpu; /* For QEMU thread */
> >         refcount_t              refcnt;
> > +       /**
> > +        * @exited: Has the thread had an exit event. Such threads are usually
> > +        * removed from the machine's threads but some events/tools require
> > +        * access to dead threads.
> > +        */
> > +       bool                    exited;
> >         bool                    comm_set;
> >         int                     comm_len;
> >         struct list_head        namespaces_list;
> > @@ -189,6 +198,11 @@ static inline refcount_t *thread__refcnt(struct thread *thread)
> >         return &RC_CHK_ACCESS(thread)->refcnt;
> >  }
> >
> > +static inline void thread__set_exited(struct thread *thread, bool exited)
> > +{
> > +       RC_CHK_ACCESS(thread)->exited = exited;
> > +}
> > +
> >  static inline bool thread__comm_set(const struct thread *thread)
> >  {
> >         return RC_CHK_ACCESS(thread)->comm_set;
> > --
> > 2.42.0.609.gbb76f46606-goog
> >
Namhyung Kim Oct. 20, 2023, 9:08 p.m. UTC | #3
On Wed, Oct 18, 2023 at 5:39 PM Ian Rogers <irogers@google.com> wrote:
>
> On Wed, Oct 18, 2023 at 4:30 PM Namhyung Kim <namhyung@kernel.org> wrote:
> >
> > On Wed, Oct 11, 2023 at 11:24 PM Ian Rogers <irogers@google.com> wrote:
> > >
> > > struct thread values hold onto references to mmaps, dsos, etc. When a
> > > thread exits it is necessary to clean all of this memory up by
> > > removing the thread from the machine's threads. Some tools require
> > > this doesn't happen, such as perf report if offcpu events exist or if
> > > a task list is being generated, so add a symbol_conf value to make the
> > > behavior optional. When an exited thread is left in the machine's
> > > threads, mark it as exited.
> > >
> > > This change relates to commit 40826c45eb0b ("perf thread: Remove
> > > notion of dead threads"). Dead threads were removed as they had a
> > > reference count of 0 and were difficult to reason about with the
> > > reference count checker. Here a thread is removed from threads when it
> > > exits, unless via symbol_conf the exited thread isn't remove and is
> > > marked as exited. Reference counting behaves as it normally does.
> >
> > Maybe we can do it the other way around.  IOW tools can access
> > dead threads for whatever reason if they are dealing with a data
> > file.  And I guess the main concern is perf top to reduce memory
> > footprint, right?  Then we can declare to remove the dead threads
> > for perf top case only IMHO.
>
> Thanks I did consider this but I think this order makes most sense.
> The only tool relying on access to dead threads is perf report, but
> its uses are questionable:
>
>  - task printing - the tools wants to show all threads from a perf run
> and assumes they are in threads. By replacing tool.exit it would be
> easy to record dead threads for this, as the maps aren't required the
> memory overhead could be improved by just recording the necessary
> task's data.
>
>  - offcpu - it would be very useful to have offcpu samples be real
> samples, rather than an aggregated sample at the end of the data file
> with a timestamp greater-than when the thread exited. These samples
> would happen before the exit event is processed and so the reason to
> keep dead threads around would be removed. I think we could do some
> kind of sample aggregation using BPF, but I think we need to think it
> through with exit events. Perhaps we can fix things in the short-term
> by generating BPF samples with aggregation when threads exit in the
> offcpu BPF code, but then again, if you have this going it seems as
> easy just to keep to record all offcpu events as distinct.

Saving off-cpu event on every context switch might cause event loss
due to its frequency.  Generating aggregated samples on EXIT sounds
interesting, but then it'd iterated all possible keys for that thread
including stack trace ID and few more.  So I'm not 100% sure if it's ok
doing it on a task exit path.

>
> Other commands like perf top and perf script don't currently have
> dependencies on dead threads and I'm kind of glad, for
> understandability, memory footprint, etc. Having the default be that
> dead threads linger on will just encourage the kind of issues we see
> currently in perf report and having to have every tool, perf script
> declare it doesn't want dead threads seems burdensome.

Fair enough.

Thanks,
Namhyung
Adrian Hunter Oct. 23, 2023, 2:24 p.m. UTC | #4
On 12/10/23 09:23, Ian Rogers wrote:
> struct thread values hold onto references to mmaps, dsos, etc. When a
> thread exits it is necessary to clean all of this memory up by
> removing the thread from the machine's threads. Some tools require
> this doesn't happen, such as perf report if offcpu events exist or if
> a task list is being generated, so add a symbol_conf value to make the
> behavior optional. When an exited thread is left in the machine's
> threads, mark it as exited.
> 
> This change relates to commit 40826c45eb0b ("perf thread: Remove
> notion of dead threads"). Dead threads were removed as they had a
> reference count of 0 and were difficult to reason about with the
> reference count checker. Here a thread is removed from threads when it
> exits, unless via symbol_conf the exited thread isn't remove and is
> marked as exited. Reference counting behaves as it normally does.

Can we exclude AUX area tracing?

Essentially, the EXIT event happens when the task is still running
in kernel mode, so the thread has not in fact fully exited.

Example:

# perf record -a --kcore -e intel_pt// uname

Before:

# perf script --itrace=b --show-task-events -C6 | grep -C10 EXIT
           uname   14740 [006] 26795.092638:          1   branches:  ffffffffb63124ee __perf_event_header__init_id+0x5e ([kernel.kallsyms]) => ffffffffb63124f7 __perf_event_header__init_id+0x67 ([kernel.kallsyms])
           uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6312501 __perf_event_header__init_id+0x71 ([kernel.kallsyms]) => ffffffffb6312512 __perf_event_header__init_id+0x82 ([kernel.kallsyms])
           uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6312531 __perf_event_header__init_id+0xa1 ([kernel.kallsyms]) => ffffffffb6316b3a perf_event_task_output+0x26a ([kernel.kallsyms])
           uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6316b40 perf_event_task_output+0x270 ([kernel.kallsyms]) => ffffffffb6316959 perf_event_task_output+0x89 ([kernel.kallsyms])
           uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6316966 perf_event_task_output+0x96 ([kernel.kallsyms]) => ffffffffb6322040 perf_output_begin+0x0 ([kernel.kallsyms])
           uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6322080 perf_output_begin+0x40 ([kernel.kallsyms]) => ffffffffb6194dc0 __rcu_read_lock+0x0 ([kernel.kallsyms])
           uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6194de1 __rcu_read_lock+0x21 ([kernel.kallsyms]) => ffffffffb6322085 perf_output_begin+0x45 ([kernel.kallsyms])
           uname   14740 [006] 26795.092638:          1   branches:  ffffffffb63220ce perf_output_begin+0x8e ([kernel.kallsyms]) => ffffffffb611d280 preempt_count_add+0x0 ([kernel.kallsyms])
           uname   14740 [006] 26795.092638:          1   branches:  ffffffffb611d2bf preempt_count_add+0x3f ([kernel.kallsyms]) => ffffffffb63220d3 perf_output_begin+0x93 ([kernel.kallsyms])
           uname   14740 [006] 26795.092638:          1   branches:  ffffffffb63220e8 perf_output_begin+0xa8 ([kernel.kallsyms]) => ffffffffb63220ff perf_output_begin+0xbf ([kernel.kallsyms])
           uname   14740 [006] 26795.092638: PERF_RECORD_EXIT(14740:14740):(14739:14739)
           uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6322119 perf_output_begin+0xd9 ([kernel.kallsyms]) => ffffffffb6322128 perf_output_begin+0xe8 ([kernel.kallsyms])
           uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6322146 perf_output_begin+0x106 ([kernel.kallsyms]) => ffffffffb63220ea perf_output_begin+0xaa ([kernel.kallsyms])
           uname   14740 [006] 26795.092638:          1   branches:  ffffffffb63220f9 perf_output_begin+0xb9 ([kernel.kallsyms]) => ffffffffb63221ab perf_output_begin+0x16b ([kernel.kallsyms])
           uname   14740 [006] 26795.092638:          1   branches:  ffffffffb63221ae perf_output_begin+0x16e ([kernel.kallsyms]) => ffffffffb63221b6 perf_output_begin+0x176 ([kernel.kallsyms])
           uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6322202 perf_output_begin+0x1c2 ([kernel.kallsyms]) => ffffffffb6322167 perf_output_begin+0x127 ([kernel.kallsyms])
           uname   14740 [006] 26795.092638:          1   branches:  ffffffffb632218c perf_output_begin+0x14c ([kernel.kallsyms]) => ffffffffb631696b perf_event_task_output+0x9b ([kernel.kallsyms])
           uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6316990 perf_event_task_output+0xc0 ([kernel.kallsyms]) => ffffffffb61034a0 __task_pid_nr_ns+0x0 ([kernel.kallsyms])
           uname   14740 [006] 26795.092638:          1   branches:  ffffffffb61034b7 __task_pid_nr_ns+0x17 ([kernel.kallsyms]) => ffffffffb6194dc0 __rcu_read_lock+0x0 ([kernel.kallsyms])
           uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6194de1 __rcu_read_lock+0x21 ([kernel.kallsyms]) => ffffffffb61034bc __task_pid_nr_ns+0x1c ([kernel.kallsyms])
           uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6103503 __task_pid_nr_ns+0x63 ([kernel.kallsyms]) => ffffffffb610353b __task_pid_nr_ns+0x9b ([kernel.kallsyms])

After:

$ perf script --itrace=b --show-task-events -C6 | grep -C10 EXIT
           uname   14740 [006] 26795.092638:          1   branches:  ffffffffb63124ee __perf_event_header__init_id+0x5e ([kernel.kallsyms]) => ffffffffb63124f7 __perf_event_header__init_id+0x67 ([kernel.kallsyms])
           uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6312501 __perf_event_header__init_id+0x71 ([kernel.kallsyms]) => ffffffffb6312512 __perf_event_header__init_id+0x82 ([kernel.kallsyms])
           uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6312531 __perf_event_header__init_id+0xa1 ([kernel.kallsyms]) => ffffffffb6316b3a perf_event_task_output+0x26a ([kernel.kallsyms])
           uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6316b40 perf_event_task_output+0x270 ([kernel.kallsyms]) => ffffffffb6316959 perf_event_task_output+0x89 ([kernel.kallsyms])
           uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6316966 perf_event_task_output+0x96 ([kernel.kallsyms]) => ffffffffb6322040 perf_output_begin+0x0 ([kernel.kallsyms])
           uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6322080 perf_output_begin+0x40 ([kernel.kallsyms]) => ffffffffb6194dc0 __rcu_read_lock+0x0 ([kernel.kallsyms])
           uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6194de1 __rcu_read_lock+0x21 ([kernel.kallsyms]) => ffffffffb6322085 perf_output_begin+0x45 ([kernel.kallsyms])
           uname   14740 [006] 26795.092638:          1   branches:  ffffffffb63220ce perf_output_begin+0x8e ([kernel.kallsyms]) => ffffffffb611d280 preempt_count_add+0x0 ([kernel.kallsyms])
           uname   14740 [006] 26795.092638:          1   branches:  ffffffffb611d2bf preempt_count_add+0x3f ([kernel.kallsyms]) => ffffffffb63220d3 perf_output_begin+0x93 ([kernel.kallsyms])
           uname   14740 [006] 26795.092638:          1   branches:  ffffffffb63220e8 perf_output_begin+0xa8 ([kernel.kallsyms]) => ffffffffb63220ff perf_output_begin+0xbf ([kernel.kallsyms])
           uname   14740 [006] 26795.092638: PERF_RECORD_EXIT(14740:14740):(14739:14739)
          :14740   14740 [006] 26795.092638:          1   branches:  ffffffffb6322119 perf_output_begin+0xd9 ([kernel.kallsyms]) => ffffffffb6322128 perf_output_begin+0xe8 ([kernel.kallsyms])
          :14740   14740 [006] 26795.092638:          1   branches:  ffffffffb6322146 perf_output_begin+0x106 ([kernel.kallsyms]) => ffffffffb63220ea perf_output_begin+0xaa ([kernel.kallsyms])
          :14740   14740 [006] 26795.092638:          1   branches:  ffffffffb63220f9 perf_output_begin+0xb9 ([kernel.kallsyms]) => ffffffffb63221ab perf_output_begin+0x16b ([kernel.kallsyms])
          :14740   14740 [006] 26795.092638:          1   branches:  ffffffffb63221ae perf_output_begin+0x16e ([kernel.kallsyms]) => ffffffffb63221b6 perf_output_begin+0x176 ([kernel.kallsyms])
          :14740   14740 [006] 26795.092638:          1   branches:  ffffffffb6322202 perf_output_begin+0x1c2 ([kernel.kallsyms]) => ffffffffb6322167 perf_output_begin+0x127 ([kernel.kallsyms])
          :14740   14740 [006] 26795.092638:          1   branches:  ffffffffb632218c perf_output_begin+0x14c ([kernel.kallsyms]) => ffffffffb631696b perf_event_task_output+0x9b ([kernel.kallsyms])
          :14740   14740 [006] 26795.092638:          1   branches:  ffffffffb6316990 perf_event_task_output+0xc0 ([kernel.kallsyms]) => ffffffffb61034a0 __task_pid_nr_ns+0x0 ([kernel.kallsyms])
          :14740   14740 [006] 26795.092638:          1   branches:  ffffffffb61034b7 __task_pid_nr_ns+0x17 ([kernel.kallsyms]) => ffffffffb6194dc0 __rcu_read_lock+0x0 ([kernel.kallsyms])
          :14740   14740 [006] 26795.092638:          1   branches:  ffffffffb6194de1 __rcu_read_lock+0x21 ([kernel.kallsyms]) => ffffffffb61034bc __task_pid_nr_ns+0x1c ([kernel.kallsyms])
          :14740   14740 [006] 26795.092638:          1   branches:  ffffffffb6103503 __task_pid_nr_ns+0x63 ([kernel.kallsyms]) => ffffffffb610353b __task_pid_nr_ns+0x9b ([kernel.kallsyms])

This will also affect samples made after PERF_RECORD_EXIT but before
the task finishes exiting.

> 
> Signed-off-by: Ian Rogers <irogers@google.com>
> ---
>  tools/perf/builtin-report.c   |  7 +++++++
>  tools/perf/util/machine.c     | 10 +++++++---
>  tools/perf/util/symbol_conf.h |  3 ++-
>  tools/perf/util/thread.h      | 14 ++++++++++++++
>  4 files changed, 30 insertions(+), 4 deletions(-)
> 
> diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
> index dcedfe00f04d..749246817aed 100644
> --- a/tools/perf/builtin-report.c
> +++ b/tools/perf/builtin-report.c
> @@ -1411,6 +1411,13 @@ int cmd_report(int argc, const char **argv)
>  	if (ret < 0)
>  		goto exit;
>  
> +	/*
> +	 * tasks_mode require access to exited threads to list those that are in
> +	 * the data file. Off-cpu events are synthesized after other events and
> +	 * reference exited threads.
> +	 */
> +	symbol_conf.keep_exited_threads = true;
> +
>  	annotation_options__init(&report.annotation_opts);
>  
>  	ret = perf_config(report__config, &report);
> diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
> index 6ca7500e2cf4..5cda47eb337d 100644
> --- a/tools/perf/util/machine.c
> +++ b/tools/perf/util/machine.c
> @@ -2157,9 +2157,13 @@ int machine__process_exit_event(struct machine *machine, union perf_event *event
>  	if (dump_trace)
>  		perf_event__fprintf_task(event, stdout);
>  
> -	if (thread != NULL)
> -		thread__put(thread);
> -
> +	if (thread != NULL) {
> +		if (symbol_conf.keep_exited_threads)
> +			thread__set_exited(thread, /*exited=*/true);
> +		else
> +			machine__remove_thread(machine, thread);
> +	}
> +	thread__put(thread);
>  	return 0;
>  }
>  
> diff --git a/tools/perf/util/symbol_conf.h b/tools/perf/util/symbol_conf.h
> index 2b2fb9e224b0..6040286e07a6 100644
> --- a/tools/perf/util/symbol_conf.h
> +++ b/tools/perf/util/symbol_conf.h
> @@ -43,7 +43,8 @@ struct symbol_conf {
>  			disable_add2line_warn,
>  			buildid_mmap2,
>  			guest_code,
> -			lazy_load_kernel_maps;
> +			lazy_load_kernel_maps,
> +			keep_exited_threads;
>  	const char	*vmlinux_name,
>  			*kallsyms_name,
>  			*source_prefix,
> diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h
> index e79225a0ea46..0df775b5c110 100644
> --- a/tools/perf/util/thread.h
> +++ b/tools/perf/util/thread.h
> @@ -36,13 +36,22 @@ struct thread_rb_node {
>  };
>  
>  DECLARE_RC_STRUCT(thread) {
> +	/** @maps: mmaps associated with this thread. */
>  	struct maps		*maps;
>  	pid_t			pid_; /* Not all tools update this */
> +	/** @tid: thread ID number unique to a machine. */
>  	pid_t			tid;
> +	/** @ppid: parent process of the process this thread belongs to. */
>  	pid_t			ppid;
>  	int			cpu;
>  	int			guest_cpu; /* For QEMU thread */
>  	refcount_t		refcnt;
> +	/**
> +	 * @exited: Has the thread had an exit event. Such threads are usually
> +	 * removed from the machine's threads but some events/tools require
> +	 * access to dead threads.
> +	 */
> +	bool			exited;
>  	bool			comm_set;
>  	int			comm_len;
>  	struct list_head	namespaces_list;
> @@ -189,6 +198,11 @@ static inline refcount_t *thread__refcnt(struct thread *thread)
>  	return &RC_CHK_ACCESS(thread)->refcnt;
>  }
>  
> +static inline void thread__set_exited(struct thread *thread, bool exited)
> +{
> +	RC_CHK_ACCESS(thread)->exited = exited;
> +}
> +
>  static inline bool thread__comm_set(const struct thread *thread)
>  {
>  	return RC_CHK_ACCESS(thread)->comm_set;
Ian Rogers Oct. 23, 2023, 6:49 p.m. UTC | #5
On Mon, Oct 23, 2023 at 7:24 AM Adrian Hunter <adrian.hunter@intel.com> wrote:
>
> On 12/10/23 09:23, Ian Rogers wrote:
> > struct thread values hold onto references to mmaps, dsos, etc. When a
> > thread exits it is necessary to clean all of this memory up by
> > removing the thread from the machine's threads. Some tools require
> > this doesn't happen, such as perf report if offcpu events exist or if
> > a task list is being generated, so add a symbol_conf value to make the
> > behavior optional. When an exited thread is left in the machine's
> > threads, mark it as exited.
> >
> > This change relates to commit 40826c45eb0b ("perf thread: Remove
> > notion of dead threads"). Dead threads were removed as they had a
> > reference count of 0 and were difficult to reason about with the
> > reference count checker. Here a thread is removed from threads when it
> > exits, unless via symbol_conf the exited thread isn't remove and is
> > marked as exited. Reference counting behaves as it normally does.
>
> Can we exclude AUX area tracing?
>
> Essentially, the EXIT event happens when the task is still running
> in kernel mode, so the thread has not in fact fully exited.
>
> Example:
>
> # perf record -a --kcore -e intel_pt// uname
>
> Before:
>
> # perf script --itrace=b --show-task-events -C6 | grep -C10 EXIT
>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb63124ee __perf_event_header__init_id+0x5e ([kernel.kallsyms]) => ffffffffb63124f7 __perf_event_header__init_id+0x67 ([kernel.kallsyms])
>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6312501 __perf_event_header__init_id+0x71 ([kernel.kallsyms]) => ffffffffb6312512 __perf_event_header__init_id+0x82 ([kernel.kallsyms])
>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6312531 __perf_event_header__init_id+0xa1 ([kernel.kallsyms]) => ffffffffb6316b3a perf_event_task_output+0x26a ([kernel.kallsyms])
>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6316b40 perf_event_task_output+0x270 ([kernel.kallsyms]) => ffffffffb6316959 perf_event_task_output+0x89 ([kernel.kallsyms])
>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6316966 perf_event_task_output+0x96 ([kernel.kallsyms]) => ffffffffb6322040 perf_output_begin+0x0 ([kernel.kallsyms])
>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6322080 perf_output_begin+0x40 ([kernel.kallsyms]) => ffffffffb6194dc0 __rcu_read_lock+0x0 ([kernel.kallsyms])
>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6194de1 __rcu_read_lock+0x21 ([kernel.kallsyms]) => ffffffffb6322085 perf_output_begin+0x45 ([kernel.kallsyms])
>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb63220ce perf_output_begin+0x8e ([kernel.kallsyms]) => ffffffffb611d280 preempt_count_add+0x0 ([kernel.kallsyms])
>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb611d2bf preempt_count_add+0x3f ([kernel.kallsyms]) => ffffffffb63220d3 perf_output_begin+0x93 ([kernel.kallsyms])
>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb63220e8 perf_output_begin+0xa8 ([kernel.kallsyms]) => ffffffffb63220ff perf_output_begin+0xbf ([kernel.kallsyms])
>            uname   14740 [006] 26795.092638: PERF_RECORD_EXIT(14740:14740):(14739:14739)
>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6322119 perf_output_begin+0xd9 ([kernel.kallsyms]) => ffffffffb6322128 perf_output_begin+0xe8 ([kernel.kallsyms])
>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6322146 perf_output_begin+0x106 ([kernel.kallsyms]) => ffffffffb63220ea perf_output_begin+0xaa ([kernel.kallsyms])
>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb63220f9 perf_output_begin+0xb9 ([kernel.kallsyms]) => ffffffffb63221ab perf_output_begin+0x16b ([kernel.kallsyms])
>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb63221ae perf_output_begin+0x16e ([kernel.kallsyms]) => ffffffffb63221b6 perf_output_begin+0x176 ([kernel.kallsyms])
>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6322202 perf_output_begin+0x1c2 ([kernel.kallsyms]) => ffffffffb6322167 perf_output_begin+0x127 ([kernel.kallsyms])
>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb632218c perf_output_begin+0x14c ([kernel.kallsyms]) => ffffffffb631696b perf_event_task_output+0x9b ([kernel.kallsyms])
>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6316990 perf_event_task_output+0xc0 ([kernel.kallsyms]) => ffffffffb61034a0 __task_pid_nr_ns+0x0 ([kernel.kallsyms])
>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb61034b7 __task_pid_nr_ns+0x17 ([kernel.kallsyms]) => ffffffffb6194dc0 __rcu_read_lock+0x0 ([kernel.kallsyms])
>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6194de1 __rcu_read_lock+0x21 ([kernel.kallsyms]) => ffffffffb61034bc __task_pid_nr_ns+0x1c ([kernel.kallsyms])
>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6103503 __task_pid_nr_ns+0x63 ([kernel.kallsyms]) => ffffffffb610353b __task_pid_nr_ns+0x9b ([kernel.kallsyms])
>
> After:
>
> $ perf script --itrace=b --show-task-events -C6 | grep -C10 EXIT
>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb63124ee __perf_event_header__init_id+0x5e ([kernel.kallsyms]) => ffffffffb63124f7 __perf_event_header__init_id+0x67 ([kernel.kallsyms])
>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6312501 __perf_event_header__init_id+0x71 ([kernel.kallsyms]) => ffffffffb6312512 __perf_event_header__init_id+0x82 ([kernel.kallsyms])
>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6312531 __perf_event_header__init_id+0xa1 ([kernel.kallsyms]) => ffffffffb6316b3a perf_event_task_output+0x26a ([kernel.kallsyms])
>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6316b40 perf_event_task_output+0x270 ([kernel.kallsyms]) => ffffffffb6316959 perf_event_task_output+0x89 ([kernel.kallsyms])
>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6316966 perf_event_task_output+0x96 ([kernel.kallsyms]) => ffffffffb6322040 perf_output_begin+0x0 ([kernel.kallsyms])
>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6322080 perf_output_begin+0x40 ([kernel.kallsyms]) => ffffffffb6194dc0 __rcu_read_lock+0x0 ([kernel.kallsyms])
>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6194de1 __rcu_read_lock+0x21 ([kernel.kallsyms]) => ffffffffb6322085 perf_output_begin+0x45 ([kernel.kallsyms])
>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb63220ce perf_output_begin+0x8e ([kernel.kallsyms]) => ffffffffb611d280 preempt_count_add+0x0 ([kernel.kallsyms])
>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb611d2bf preempt_count_add+0x3f ([kernel.kallsyms]) => ffffffffb63220d3 perf_output_begin+0x93 ([kernel.kallsyms])
>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb63220e8 perf_output_begin+0xa8 ([kernel.kallsyms]) => ffffffffb63220ff perf_output_begin+0xbf ([kernel.kallsyms])
>            uname   14740 [006] 26795.092638: PERF_RECORD_EXIT(14740:14740):(14739:14739)
>           :14740   14740 [006] 26795.092638:          1   branches:  ffffffffb6322119 perf_output_begin+0xd9 ([kernel.kallsyms]) => ffffffffb6322128 perf_output_begin+0xe8 ([kernel.kallsyms])
>           :14740   14740 [006] 26795.092638:          1   branches:  ffffffffb6322146 perf_output_begin+0x106 ([kernel.kallsyms]) => ffffffffb63220ea perf_output_begin+0xaa ([kernel.kallsyms])
>           :14740   14740 [006] 26795.092638:          1   branches:  ffffffffb63220f9 perf_output_begin+0xb9 ([kernel.kallsyms]) => ffffffffb63221ab perf_output_begin+0x16b ([kernel.kallsyms])
>           :14740   14740 [006] 26795.092638:          1   branches:  ffffffffb63221ae perf_output_begin+0x16e ([kernel.kallsyms]) => ffffffffb63221b6 perf_output_begin+0x176 ([kernel.kallsyms])
>           :14740   14740 [006] 26795.092638:          1   branches:  ffffffffb6322202 perf_output_begin+0x1c2 ([kernel.kallsyms]) => ffffffffb6322167 perf_output_begin+0x127 ([kernel.kallsyms])
>           :14740   14740 [006] 26795.092638:          1   branches:  ffffffffb632218c perf_output_begin+0x14c ([kernel.kallsyms]) => ffffffffb631696b perf_event_task_output+0x9b ([kernel.kallsyms])
>           :14740   14740 [006] 26795.092638:          1   branches:  ffffffffb6316990 perf_event_task_output+0xc0 ([kernel.kallsyms]) => ffffffffb61034a0 __task_pid_nr_ns+0x0 ([kernel.kallsyms])
>           :14740   14740 [006] 26795.092638:          1   branches:  ffffffffb61034b7 __task_pid_nr_ns+0x17 ([kernel.kallsyms]) => ffffffffb6194dc0 __rcu_read_lock+0x0 ([kernel.kallsyms])
>           :14740   14740 [006] 26795.092638:          1   branches:  ffffffffb6194de1 __rcu_read_lock+0x21 ([kernel.kallsyms]) => ffffffffb61034bc __task_pid_nr_ns+0x1c ([kernel.kallsyms])
>           :14740   14740 [006] 26795.092638:          1   branches:  ffffffffb6103503 __task_pid_nr_ns+0x63 ([kernel.kallsyms]) => ffffffffb610353b __task_pid_nr_ns+0x9b ([kernel.kallsyms])
>
> This will also affect samples made after PERF_RECORD_EXIT but before
> the task finishes exiting.

Makes sense. Would an appropriate fix be in perf_session__open to set:
symbol_conf.keep_exited_threads = true;

when:
perf_header__has_feat(&session->header, HEADER_AUXTRACE)

It is kind of hacky to be changing global state this way, but
symbol_conf is like that in general.

Thanks,
Ian

> >
> > Signed-off-by: Ian Rogers <irogers@google.com>
> > ---
> >  tools/perf/builtin-report.c   |  7 +++++++
> >  tools/perf/util/machine.c     | 10 +++++++---
> >  tools/perf/util/symbol_conf.h |  3 ++-
> >  tools/perf/util/thread.h      | 14 ++++++++++++++
> >  4 files changed, 30 insertions(+), 4 deletions(-)
> >
> > diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
> > index dcedfe00f04d..749246817aed 100644
> > --- a/tools/perf/builtin-report.c
> > +++ b/tools/perf/builtin-report.c
> > @@ -1411,6 +1411,13 @@ int cmd_report(int argc, const char **argv)
> >       if (ret < 0)
> >               goto exit;
> >
> > +     /*
> > +      * tasks_mode require access to exited threads to list those that are in
> > +      * the data file. Off-cpu events are synthesized after other events and
> > +      * reference exited threads.
> > +      */
> > +     symbol_conf.keep_exited_threads = true;
> > +
> >       annotation_options__init(&report.annotation_opts);
> >
> >       ret = perf_config(report__config, &report);
> > diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
> > index 6ca7500e2cf4..5cda47eb337d 100644
> > --- a/tools/perf/util/machine.c
> > +++ b/tools/perf/util/machine.c
> > @@ -2157,9 +2157,13 @@ int machine__process_exit_event(struct machine *machine, union perf_event *event
> >       if (dump_trace)
> >               perf_event__fprintf_task(event, stdout);
> >
> > -     if (thread != NULL)
> > -             thread__put(thread);
> > -
> > +     if (thread != NULL) {
> > +             if (symbol_conf.keep_exited_threads)
> > +                     thread__set_exited(thread, /*exited=*/true);
> > +             else
> > +                     machine__remove_thread(machine, thread);
> > +     }
> > +     thread__put(thread);
> >       return 0;
> >  }
> >
> > diff --git a/tools/perf/util/symbol_conf.h b/tools/perf/util/symbol_conf.h
> > index 2b2fb9e224b0..6040286e07a6 100644
> > --- a/tools/perf/util/symbol_conf.h
> > +++ b/tools/perf/util/symbol_conf.h
> > @@ -43,7 +43,8 @@ struct symbol_conf {
> >                       disable_add2line_warn,
> >                       buildid_mmap2,
> >                       guest_code,
> > -                     lazy_load_kernel_maps;
> > +                     lazy_load_kernel_maps,
> > +                     keep_exited_threads;
> >       const char      *vmlinux_name,
> >                       *kallsyms_name,
> >                       *source_prefix,
> > diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h
> > index e79225a0ea46..0df775b5c110 100644
> > --- a/tools/perf/util/thread.h
> > +++ b/tools/perf/util/thread.h
> > @@ -36,13 +36,22 @@ struct thread_rb_node {
> >  };
> >
> >  DECLARE_RC_STRUCT(thread) {
> > +     /** @maps: mmaps associated with this thread. */
> >       struct maps             *maps;
> >       pid_t                   pid_; /* Not all tools update this */
> > +     /** @tid: thread ID number unique to a machine. */
> >       pid_t                   tid;
> > +     /** @ppid: parent process of the process this thread belongs to. */
> >       pid_t                   ppid;
> >       int                     cpu;
> >       int                     guest_cpu; /* For QEMU thread */
> >       refcount_t              refcnt;
> > +     /**
> > +      * @exited: Has the thread had an exit event. Such threads are usually
> > +      * removed from the machine's threads but some events/tools require
> > +      * access to dead threads.
> > +      */
> > +     bool                    exited;
> >       bool                    comm_set;
> >       int                     comm_len;
> >       struct list_head        namespaces_list;
> > @@ -189,6 +198,11 @@ static inline refcount_t *thread__refcnt(struct thread *thread)
> >       return &RC_CHK_ACCESS(thread)->refcnt;
> >  }
> >
> > +static inline void thread__set_exited(struct thread *thread, bool exited)
> > +{
> > +     RC_CHK_ACCESS(thread)->exited = exited;
> > +}
> > +
> >  static inline bool thread__comm_set(const struct thread *thread)
> >  {
> >       return RC_CHK_ACCESS(thread)->comm_set;
>
Adrian Hunter Oct. 24, 2023, 5:19 a.m. UTC | #6
On 23/10/23 21:49, Ian Rogers wrote:
> On Mon, Oct 23, 2023 at 7:24 AM Adrian Hunter <adrian.hunter@intel.com> wrote:
>>
>> On 12/10/23 09:23, Ian Rogers wrote:
>>> struct thread values hold onto references to mmaps, dsos, etc. When a
>>> thread exits it is necessary to clean all of this memory up by
>>> removing the thread from the machine's threads. Some tools require
>>> this doesn't happen, such as perf report if offcpu events exist or if
>>> a task list is being generated, so add a symbol_conf value to make the
>>> behavior optional. When an exited thread is left in the machine's
>>> threads, mark it as exited.
>>>
>>> This change relates to commit 40826c45eb0b ("perf thread: Remove
>>> notion of dead threads"). Dead threads were removed as they had a
>>> reference count of 0 and were difficult to reason about with the
>>> reference count checker. Here a thread is removed from threads when it
>>> exits, unless via symbol_conf the exited thread isn't remove and is
>>> marked as exited. Reference counting behaves as it normally does.
>>
>> Can we exclude AUX area tracing?
>>
>> Essentially, the EXIT event happens when the task is still running
>> in kernel mode, so the thread has not in fact fully exited.
>>
>> Example:
>>
>> # perf record -a --kcore -e intel_pt// uname
>>
>> Before:
>>
>> # perf script --itrace=b --show-task-events -C6 | grep -C10 EXIT
>>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb63124ee __perf_event_header__init_id+0x5e ([kernel.kallsyms]) => ffffffffb63124f7 __perf_event_header__init_id+0x67 ([kernel.kallsyms])
>>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6312501 __perf_event_header__init_id+0x71 ([kernel.kallsyms]) => ffffffffb6312512 __perf_event_header__init_id+0x82 ([kernel.kallsyms])
>>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6312531 __perf_event_header__init_id+0xa1 ([kernel.kallsyms]) => ffffffffb6316b3a perf_event_task_output+0x26a ([kernel.kallsyms])
>>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6316b40 perf_event_task_output+0x270 ([kernel.kallsyms]) => ffffffffb6316959 perf_event_task_output+0x89 ([kernel.kallsyms])
>>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6316966 perf_event_task_output+0x96 ([kernel.kallsyms]) => ffffffffb6322040 perf_output_begin+0x0 ([kernel.kallsyms])
>>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6322080 perf_output_begin+0x40 ([kernel.kallsyms]) => ffffffffb6194dc0 __rcu_read_lock+0x0 ([kernel.kallsyms])
>>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6194de1 __rcu_read_lock+0x21 ([kernel.kallsyms]) => ffffffffb6322085 perf_output_begin+0x45 ([kernel.kallsyms])
>>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb63220ce perf_output_begin+0x8e ([kernel.kallsyms]) => ffffffffb611d280 preempt_count_add+0x0 ([kernel.kallsyms])
>>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb611d2bf preempt_count_add+0x3f ([kernel.kallsyms]) => ffffffffb63220d3 perf_output_begin+0x93 ([kernel.kallsyms])
>>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb63220e8 perf_output_begin+0xa8 ([kernel.kallsyms]) => ffffffffb63220ff perf_output_begin+0xbf ([kernel.kallsyms])
>>            uname   14740 [006] 26795.092638: PERF_RECORD_EXIT(14740:14740):(14739:14739)
>>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6322119 perf_output_begin+0xd9 ([kernel.kallsyms]) => ffffffffb6322128 perf_output_begin+0xe8 ([kernel.kallsyms])
>>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6322146 perf_output_begin+0x106 ([kernel.kallsyms]) => ffffffffb63220ea perf_output_begin+0xaa ([kernel.kallsyms])
>>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb63220f9 perf_output_begin+0xb9 ([kernel.kallsyms]) => ffffffffb63221ab perf_output_begin+0x16b ([kernel.kallsyms])
>>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb63221ae perf_output_begin+0x16e ([kernel.kallsyms]) => ffffffffb63221b6 perf_output_begin+0x176 ([kernel.kallsyms])
>>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6322202 perf_output_begin+0x1c2 ([kernel.kallsyms]) => ffffffffb6322167 perf_output_begin+0x127 ([kernel.kallsyms])
>>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb632218c perf_output_begin+0x14c ([kernel.kallsyms]) => ffffffffb631696b perf_event_task_output+0x9b ([kernel.kallsyms])
>>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6316990 perf_event_task_output+0xc0 ([kernel.kallsyms]) => ffffffffb61034a0 __task_pid_nr_ns+0x0 ([kernel.kallsyms])
>>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb61034b7 __task_pid_nr_ns+0x17 ([kernel.kallsyms]) => ffffffffb6194dc0 __rcu_read_lock+0x0 ([kernel.kallsyms])
>>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6194de1 __rcu_read_lock+0x21 ([kernel.kallsyms]) => ffffffffb61034bc __task_pid_nr_ns+0x1c ([kernel.kallsyms])
>>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6103503 __task_pid_nr_ns+0x63 ([kernel.kallsyms]) => ffffffffb610353b __task_pid_nr_ns+0x9b ([kernel.kallsyms])
>>
>> After:
>>
>> $ perf script --itrace=b --show-task-events -C6 | grep -C10 EXIT
>>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb63124ee __perf_event_header__init_id+0x5e ([kernel.kallsyms]) => ffffffffb63124f7 __perf_event_header__init_id+0x67 ([kernel.kallsyms])
>>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6312501 __perf_event_header__init_id+0x71 ([kernel.kallsyms]) => ffffffffb6312512 __perf_event_header__init_id+0x82 ([kernel.kallsyms])
>>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6312531 __perf_event_header__init_id+0xa1 ([kernel.kallsyms]) => ffffffffb6316b3a perf_event_task_output+0x26a ([kernel.kallsyms])
>>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6316b40 perf_event_task_output+0x270 ([kernel.kallsyms]) => ffffffffb6316959 perf_event_task_output+0x89 ([kernel.kallsyms])
>>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6316966 perf_event_task_output+0x96 ([kernel.kallsyms]) => ffffffffb6322040 perf_output_begin+0x0 ([kernel.kallsyms])
>>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6322080 perf_output_begin+0x40 ([kernel.kallsyms]) => ffffffffb6194dc0 __rcu_read_lock+0x0 ([kernel.kallsyms])
>>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb6194de1 __rcu_read_lock+0x21 ([kernel.kallsyms]) => ffffffffb6322085 perf_output_begin+0x45 ([kernel.kallsyms])
>>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb63220ce perf_output_begin+0x8e ([kernel.kallsyms]) => ffffffffb611d280 preempt_count_add+0x0 ([kernel.kallsyms])
>>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb611d2bf preempt_count_add+0x3f ([kernel.kallsyms]) => ffffffffb63220d3 perf_output_begin+0x93 ([kernel.kallsyms])
>>            uname   14740 [006] 26795.092638:          1   branches:  ffffffffb63220e8 perf_output_begin+0xa8 ([kernel.kallsyms]) => ffffffffb63220ff perf_output_begin+0xbf ([kernel.kallsyms])
>>            uname   14740 [006] 26795.092638: PERF_RECORD_EXIT(14740:14740):(14739:14739)
>>           :14740   14740 [006] 26795.092638:          1   branches:  ffffffffb6322119 perf_output_begin+0xd9 ([kernel.kallsyms]) => ffffffffb6322128 perf_output_begin+0xe8 ([kernel.kallsyms])
>>           :14740   14740 [006] 26795.092638:          1   branches:  ffffffffb6322146 perf_output_begin+0x106 ([kernel.kallsyms]) => ffffffffb63220ea perf_output_begin+0xaa ([kernel.kallsyms])
>>           :14740   14740 [006] 26795.092638:          1   branches:  ffffffffb63220f9 perf_output_begin+0xb9 ([kernel.kallsyms]) => ffffffffb63221ab perf_output_begin+0x16b ([kernel.kallsyms])
>>           :14740   14740 [006] 26795.092638:          1   branches:  ffffffffb63221ae perf_output_begin+0x16e ([kernel.kallsyms]) => ffffffffb63221b6 perf_output_begin+0x176 ([kernel.kallsyms])
>>           :14740   14740 [006] 26795.092638:          1   branches:  ffffffffb6322202 perf_output_begin+0x1c2 ([kernel.kallsyms]) => ffffffffb6322167 perf_output_begin+0x127 ([kernel.kallsyms])
>>           :14740   14740 [006] 26795.092638:          1   branches:  ffffffffb632218c perf_output_begin+0x14c ([kernel.kallsyms]) => ffffffffb631696b perf_event_task_output+0x9b ([kernel.kallsyms])
>>           :14740   14740 [006] 26795.092638:          1   branches:  ffffffffb6316990 perf_event_task_output+0xc0 ([kernel.kallsyms]) => ffffffffb61034a0 __task_pid_nr_ns+0x0 ([kernel.kallsyms])
>>           :14740   14740 [006] 26795.092638:          1   branches:  ffffffffb61034b7 __task_pid_nr_ns+0x17 ([kernel.kallsyms]) => ffffffffb6194dc0 __rcu_read_lock+0x0 ([kernel.kallsyms])
>>           :14740   14740 [006] 26795.092638:          1   branches:  ffffffffb6194de1 __rcu_read_lock+0x21 ([kernel.kallsyms]) => ffffffffb61034bc __task_pid_nr_ns+0x1c ([kernel.kallsyms])
>>           :14740   14740 [006] 26795.092638:          1   branches:  ffffffffb6103503 __task_pid_nr_ns+0x63 ([kernel.kallsyms]) => ffffffffb610353b __task_pid_nr_ns+0x9b ([kernel.kallsyms])
>>
>> This will also affect samples made after PERF_RECORD_EXIT but before
>> the task finishes exiting.
> 
> Makes sense. Would an appropriate fix be in perf_session__open to set:
> symbol_conf.keep_exited_threads = true;
> 
> when:
> perf_header__has_feat(&session->header, HEADER_AUXTRACE)
> 
> It is kind of hacky to be changing global state this way, but
> symbol_conf is like that in general.

That should work.  Alternatively, could be added to
perf_event__process_auxtrace_info() which would tie it
more directly to auxtrace, and wouldn't have to check
HEADER_AUXTRACE.
diff mbox series

Patch

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index dcedfe00f04d..749246817aed 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -1411,6 +1411,13 @@  int cmd_report(int argc, const char **argv)
 	if (ret < 0)
 		goto exit;
 
+	/*
+	 * tasks_mode require access to exited threads to list those that are in
+	 * the data file. Off-cpu events are synthesized after other events and
+	 * reference exited threads.
+	 */
+	symbol_conf.keep_exited_threads = true;
+
 	annotation_options__init(&report.annotation_opts);
 
 	ret = perf_config(report__config, &report);
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 6ca7500e2cf4..5cda47eb337d 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -2157,9 +2157,13 @@  int machine__process_exit_event(struct machine *machine, union perf_event *event
 	if (dump_trace)
 		perf_event__fprintf_task(event, stdout);
 
-	if (thread != NULL)
-		thread__put(thread);
-
+	if (thread != NULL) {
+		if (symbol_conf.keep_exited_threads)
+			thread__set_exited(thread, /*exited=*/true);
+		else
+			machine__remove_thread(machine, thread);
+	}
+	thread__put(thread);
 	return 0;
 }
 
diff --git a/tools/perf/util/symbol_conf.h b/tools/perf/util/symbol_conf.h
index 2b2fb9e224b0..6040286e07a6 100644
--- a/tools/perf/util/symbol_conf.h
+++ b/tools/perf/util/symbol_conf.h
@@ -43,7 +43,8 @@  struct symbol_conf {
 			disable_add2line_warn,
 			buildid_mmap2,
 			guest_code,
-			lazy_load_kernel_maps;
+			lazy_load_kernel_maps,
+			keep_exited_threads;
 	const char	*vmlinux_name,
 			*kallsyms_name,
 			*source_prefix,
diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h
index e79225a0ea46..0df775b5c110 100644
--- a/tools/perf/util/thread.h
+++ b/tools/perf/util/thread.h
@@ -36,13 +36,22 @@  struct thread_rb_node {
 };
 
 DECLARE_RC_STRUCT(thread) {
+	/** @maps: mmaps associated with this thread. */
 	struct maps		*maps;
 	pid_t			pid_; /* Not all tools update this */
+	/** @tid: thread ID number unique to a machine. */
 	pid_t			tid;
+	/** @ppid: parent process of the process this thread belongs to. */
 	pid_t			ppid;
 	int			cpu;
 	int			guest_cpu; /* For QEMU thread */
 	refcount_t		refcnt;
+	/**
+	 * @exited: Has the thread had an exit event. Such threads are usually
+	 * removed from the machine's threads but some events/tools require
+	 * access to dead threads.
+	 */
+	bool			exited;
 	bool			comm_set;
 	int			comm_len;
 	struct list_head	namespaces_list;
@@ -189,6 +198,11 @@  static inline refcount_t *thread__refcnt(struct thread *thread)
 	return &RC_CHK_ACCESS(thread)->refcnt;
 }
 
+static inline void thread__set_exited(struct thread *thread, bool exited)
+{
+	RC_CHK_ACCESS(thread)->exited = exited;
+}
+
 static inline bool thread__comm_set(const struct thread *thread)
 {
 	return RC_CHK_ACCESS(thread)->comm_set;