
[5/5,WIP] trace-cmd: Add new subcommand "trace-cmd perf"

Message ID 20201203060226.476475-6-tz.stoyanov@gmail.com (mailing list archive)
State New
Series: Initial trace-cmd perf support

Commit Message

Tzvetomir Stoyanov (VMware) Dec. 3, 2020, 6:02 a.m. UTC
The new subcommand uses perf to collect performance information about the
selected process. It has one mandatory argument:
trace-cmd perf --pid <PID>

Signed-off-by: Tzvetomir Stoyanov (VMware) <tz.stoyanov@gmail.com>
---
 tracecmd/Makefile              |   1 +
 tracecmd/include/trace-local.h |   2 +
 tracecmd/trace-cmd.c           |   1 +
 tracecmd/trace-perf.c          | 726 +++++++++++++++++++++++++++++++++
 4 files changed, 730 insertions(+)
 create mode 100644 tracecmd/trace-perf.c

Comments

Steven Rostedt Feb. 19, 2021, 2:03 a.m. UTC | #1
On Thu,  3 Dec 2020 08:02:26 +0200
"Tzvetomir Stoyanov (VMware)" <tz.stoyanov@gmail.com> wrote:

> +static int perf_mmap(struct perf_cpu *perf)
> +{
> +	mmap_mask = NUM_PAGES * getpagesize() - 1;
> +
> +	/* associate a buffer with the file */
> +	perf->mpage = mmap(NULL, (NUM_PAGES + 1) * getpagesize(),
> +			PROT_READ | PROT_WRITE, MAP_SHARED, perf->perf_fd, 0);
> +	if (perf->mpage == MAP_FAILED)
> +		return -1;
> +	return 0;
> +}

BTW, I found that the above holds the conversions we need for the local
clock!

        printf("time_shift=%d\n", perf->mpage->time_shift);
        printf("time_mult=%d\n", perf->mpage->time_mult);
        printf("time_offset=%lld\n", perf->mpage->time_offset);

Which gives me:

time_shift=31
time_mult=633046315
time_offset=-115773323084683

[ one for each CPU ]

The ftrace local clock is defined by:

u64 notrace trace_clock_local(void)
{
	u64 clock;
	preempt_disable_notrace();
	clock = sched_clock();
	preempt_enable_notrace();
	return clock;
}

Where

u64 sched_clock(void)
{
	if (static_branch_likely(&__use_tsc)) { // true
		u64 tsc_now = rdtsc();

		/* return the value in ns */
		return cycles_2_ns(tsc_now);
	}

and

static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
	struct cyc2ns_data data;
	unsigned long long ns;

	cyc2ns_read_begin(&data); // <- this is where the data comes from

	ns = data.cyc2ns_offset;
	ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift);

	cyc2ns_read_end();

	return ns;
}

__always_inline void cyc2ns_read_begin(struct cyc2ns_data *data)
{
	int seq, idx;

	preempt_disable_notrace();

	do {
		seq = this_cpu_read(cyc2ns.seq.seqcount.sequence);
		idx = seq & 1;

		data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset);
		data->cyc2ns_mul    = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul);
		data->cyc2ns_shift  = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift);

	} while (unlikely(seq != this_cpu_read(cyc2ns.seq.seqcount.sequence)));
}

The offset, multiplier and shift from the cyc2ns.data[idx] (per
cpu) are what determine the conversion from x86 cycles to nanoseconds.

Does user space have access to that? Yes! Via perf!

void arch_perf_update_userpage(struct perf_event *event,
			       struct perf_event_mmap_page *userpg, u64 now)
{
[..]
	cyc2ns_read_begin(&data);

	offset = data.cyc2ns_offset + __sched_clock_offset;

	/*
	 * Internal timekeeping for enabled/running/stopped times
	 * is always in the local_clock domain.
	 */
	userpg->cap_user_time = 1;
	userpg->time_mult = data.cyc2ns_mul;
	userpg->time_shift = data.cyc2ns_shift;
	userpg->time_offset = offset - now;

Those above values are the ones I printed at the beginning of this
email.

Hence, we can use x86-tsc as the clock for both the host and guest, and
then using perf find out how to convert that to what the 'local' clock
would produce. At least the multiplier and the shift.
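
For completeness, the conversion user space is expected to do with those
fields is spelled out in the perf_event_mmap_page comments. A minimal sketch
(only valid while cap_user_time is set; the helper name is illustrative):

#include <linux/perf_event.h>
#include <linux/types.h>

/* Convert a raw TSC value to sched_clock() nanoseconds using the
 * time_shift/time_mult/time_offset fields of the mmapped page. */
static __u64 tsc_to_local_ns(const struct perf_event_mmap_page *pc, __u64 cyc)
{
	__u64 quot = cyc >> pc->time_shift;
	__u64 rem  = cyc & (((__u64)1 << pc->time_shift) - 1);

	return pc->time_offset + quot * pc->time_mult +
	       ((rem * pc->time_mult) >> pc->time_shift);
}

Note that the offset term folds in the current time (see "offset - now" in
arch_perf_update_userpage above), so the mult/shift scaling is the part we
can actually reuse.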

-- Steve
Steven Rostedt Feb. 19, 2021, 3:53 a.m. UTC | #2
On Thu, 18 Feb 2021 21:03:52 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:

> Hence, we can use x86-tsc as the clock for both the host and guest, and
> then using perf find out how to convert that to what the 'local' clock
> would produce. At least the multiplier and the shift.

I just tried this out:

 http://rostedt.org/private/perf-trace.c

Where I did:

 # trace-cmd record -M 1 -C local -e irq -e sched taskset -c 0 ./perf-trace
time_shift=31
time_mult=633046315
time_offset=-125757047487632
 # trace-cmd report | grep print
         taskset-34857 [000] 125742.260276: print:                tracing_mark_write: [426606557411358] [125757.283123254] - <idle>-0 [0] softirq_raise: vec=9 [action=RCU]
         taskset-34857 [000] 125742.260283: print:                tracing_mark_write: [426606557431758] [125757.283129268] - <idle>-0 [0] softirq_entry: vec=9 [action=RCU]
         taskset-34857 [000] 125742.260289: print:                tracing_mark_write: [426606557580560] [125757.283173133] - <idle>-0 [0] softirq_exit: vec=9 [action=RCU]
         taskset-34857 [000] 125742.260295: print:                tracing_mark_write: [426606560784758] [125757.284117683] - <idle>-0 [0] softirq_raise: vec=7 [action=SCHED]

The time offset is slightly different, but we don't care. We only want
to convert the cycles into nanoseconds, and this appears to do the job!

-- Steve
Tzvetomir Stoyanov (VMware) Feb. 19, 2021, 7:16 a.m. UTC | #3
Hi Steven,

On Fri, Feb 19, 2021 at 4:03 AM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Thu,  3 Dec 2020 08:02:26 +0200
> "Tzvetomir Stoyanov (VMware)" <tz.stoyanov@gmail.com> wrote:
>
> > +static int perf_mmap(struct perf_cpu *perf)
> > +{
> > +     mmap_mask = NUM_PAGES * getpagesize() - 1;
> > +
> > +     /* associate a buffer with the file */
> > +     perf->mpage = mmap(NULL, (NUM_PAGES + 1) * getpagesize(),
> > +                     PROT_READ | PROT_WRITE, MAP_SHARED, perf->perf_fd, 0);
> > +     if (perf->mpage == MAP_FAILED)
> > +             return -1;
> > +     return 0;
> > +}
>
> BTW, I found that the above holds the conversions we need for the local
> clock!
>
>         printf("time_shift=%d\n", perf->mpage->time_shift);
>         printf("time_mult=%d\n", perf->mpage->time_mult);
>         printf("time_offset=%lld\n", perf->mpage->time_offset);
>
> Which gives me:
>
> time_shift=31
> time_mult=633046315
> time_offset=-115773323084683
>
> [ one for each CPU ]

This will give us time shift/mult/offset for each host CPU, right? Is the
local trace clock different for each CPU? Currently, the time offset is
calculated per VCPU, assuming that the host CPU on which this VCPU runs has
no impact on the timestamp synchronization. If the local clock depends on
the CPU, then we should calculate the time offset of each guest event
individually, depending on the host CPU and VCPU the event happened on - as
the host task which runs the VCPU can migrate between CPUs at any time.
So, we need to:
  1. Add timesync information for each host CPU in the trace.dat file.
  2. Track the migration between CPUs of each task that runs a VCPU and
     save that information in the trace.dat file.
  3. When calculating the new timestamp of each guest event (individually),
     somehow find out on which host CPU that guest event happened.

Points 1 and 2 are doable, but will break the current trace.dat file
option that holds the timesync information.
Point 3 is not clear to me - how can we get such information before the
host and guest events are synchronised?

>
> The ftrace local clock is defined by:
>
> u64 notrace trace_clock_local(void)
> {
>         u64 clock;
>         preempt_disable_notrace();
>         clock = sched_clock();
>         preempt_enable_notrace();
>         return clock;
> }
>
> Where
>
> u64 sched_clock(void)
> {
>         if (static_branch_likely(&__use_tsc)) { // true
>                 u64 tsc_now = rdtsc();
>
>                 /* return the value in ns */
>                 return cycles_2_ns(tsc_now);
>         }
>
> and
>
> static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc)
> {
>         struct cyc2ns_data data;
>         unsigned long long ns;
>
>         cyc2ns_read_begin(&data); // <- this is where the data comes from
>
>         ns = data.cyc2ns_offset;
>         ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift);
>
>         cyc2ns_read_end();
>
>         return ns;
> }
>
> __always_inline void cyc2ns_read_begin(struct cyc2ns_data *data)
> {
>         int seq, idx;
>
>         preempt_disable_notrace();
>
>         do {
>                 seq = this_cpu_read(cyc2ns.seq.seqcount.sequence);
>                 idx = seq & 1;
>
>                 data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset);
>                 data->cyc2ns_mul    = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul);
>                 data->cyc2ns_shift  = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift);
>
>         } while (unlikely(seq != this_cpu_read(cyc2ns.seq.seqcount.sequence)));
> }
>
> The offset, multiplier and shift from the cyc2ns.data[idx] (per
> cpu) are what determine the conversion from x86 cycles to nanoseconds.
>
> Does user space have access to that? Yes! Via perf!
>
> void arch_perf_update_userpage(struct perf_event *event,
>                                struct perf_event_mmap_page *userpg, u64 now)
> {
> [..]
>         cyc2ns_read_begin(&data);
>
>         offset = data.cyc2ns_offset + __sched_clock_offset;
>
>         /*
>          * Internal timekeeping for enabled/running/stopped times
>          * is always in the local_clock domain.
>          */
>         userpg->cap_user_time = 1;
>         userpg->time_mult = data.cyc2ns_mul;
>         userpg->time_shift = data.cyc2ns_shift;
>         userpg->time_offset = offset - now;
>
> Those above values are the ones I printed at the beginning of this
> email.
>
> Hence, we can use x86-tsc as the clock for both the host and guest, and
> then using perf find out how to convert that to what the 'local' clock
> would produce. At least the multiplier and the shift.
>
> -- Steve
Steven Rostedt Feb. 19, 2021, 2:36 p.m. UTC | #4
On Fri, 19 Feb 2021 09:16:26 +0200
Tzvetomir Stoyanov <tz.stoyanov@gmail.com> wrote:

> Hi Steven,
> 
> On Fri, Feb 19, 2021 at 4:03 AM Steven Rostedt <rostedt@goodmis.org> wrote:
> >
> > On Thu,  3 Dec 2020 08:02:26 +0200
> > "Tzvetomir Stoyanov (VMware)" <tz.stoyanov@gmail.com> wrote:
> >  
> > > +static int perf_mmap(struct perf_cpu *perf)
> > > +{
> > > +     mmap_mask = NUM_PAGES * getpagesize() - 1;
> > > +
> > > +     /* associate a buffer with the file */
> > > +     perf->mpage = mmap(NULL, (NUM_PAGES + 1) * getpagesize(),
> > > +                     PROT_READ | PROT_WRITE, MAP_SHARED, perf->perf_fd, 0);
> > > +     if (perf->mpage == MAP_FAILED)
> > > +             return -1;
> > > +     return 0;
> > > +}  
> >
> > BTW, I found that the above holds the conversions we need for the local
> > clock!
> >
> >         printf("time_shift=%d\n", perf->mpage->time_shift);
> >         printf("time_mult=%d\n", perf->mpage->time_mult);
> >         printf("time_offset=%lld\n", perf->mpage->time_offset);
> >
> > Which gives me:
> >
> > time_shift=31
> > time_mult=633046315
> > time_offset=-115773323084683
> >
> > [ one for each CPU ]  
> 
> This will give us time shift/mult/offset for each host CPU, right ? Is
> the local trace clock
> different for each CPU ? 

It can be. Note, the above offset is basically useless. That injects the
current time into the value and we can't rely on it. But the shift and mult
are needed.

But usually the shift and mult are identical across CPUs on most systems,
although there's no guarantee that it will always be the case.

>Currently, the time offset is calculated per
> VCPU, assuming
> that the host CPU on which this VCPU runs has no impact on the
> timestamp synchronization.
> If the local clock depends on the CPU, then we should calculate the
> time offset of each guest
> event individually, depending on host CPU and VCPU the event happened
> - as the host task which runs
> the VCPU can migrate between CPUs at any time. So, we need to:
>   1. Add timesync information for each host CPU in the trace.dat file.
>   2. Track the migration between CPUs of each task that runs VCPU and
> save that information
>     in the trace.dat file.

I was thinking about this too. And perhaps we can hold off until we find
systems that have different values for mult and shift.

That said, we can easily add this information by recording the sched_switch
events in a separate buffer. And I've been thinking about doing this by
default anyway. More below.

>   3. When calculating the new timestamp of each guest event
> (individually) - somehow find out on
>      which host CPU that guest event happened ?
> 
> Points 1 and 2 are doable, but will break the current trace.dat file
> option that holds the timesync information.

I don't think we need to have it in the timesync option. I think we can
create another option to hold guest event data.

> Point 3 is not clear to me, how we can get such information before the
> host and guest events are synchronised ?
> 


My thoughts about this are: when we enable tracing of a guest (-A), we then
create an instance on the host that records only kvm enter/exit events as
well as sched_switch events. Basically, enable all the events that we need
to synchronize and to show entering and exiting of the guest.

The synchronization logic already shows us what host thread controls each
guest VCPU. If we record the kvm enter/exit and sched_switch events in a
separate buffer, we can see when a host thread that runs a guest VCPU
migrates to another CPU. Since the timestamps of those events are recorded
in the meta events themselves (sched_switch), we know exactly where we need
to use the new mult and shift values for the guest events.
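
A rough sketch of that host-side meta instance, assuming libtracefs is
available (the instance name and the exact event set are illustrative):

#include <stdio.h>
#include <tracefs.h>

/* Create a separate instance that records only the meta events needed
 * to follow VCPU threads: kvm enter/exit plus sched_switch. */
static struct tracefs_instance *start_guest_meta_instance(void)
{
	struct tracefs_instance *meta;

	meta = tracefs_instance_create("trace-cmd-guest-meta");
	if (!meta)
		return NULL;

	if (tracefs_event_enable(meta, "kvm", "kvm_entry") < 0 ||
	    tracefs_event_enable(meta, "kvm", "kvm_exit") < 0 ||
	    tracefs_event_enable(meta, "sched", "sched_switch") < 0) {
		fprintf(stderr, "failed to enable meta events\n");
		tracefs_instance_destroy(meta);
		tracefs_instance_free(meta);
		return NULL;
	}
	return meta;
}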

Make sense?

-- Steve
Dario Faggioli Feb. 19, 2021, 5:51 p.m. UTC | #5
Hey guys,

On Thu, 2021-02-18 at 22:53 -0500, Steven Rostedt wrote:
> On Thu, 18 Feb 2021 21:03:52 -0500
> Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> > Hence, we can use x86-tsc as the clock for both the host and guest,
> > and
> > then using perf find out how to convert that to what the 'local'
> > clock
> > would produce. At least the multiplier and the shift.
> 
> I just tried this out:
> 
>  http://rostedt.org/private/perf-trace.c
> 
> Where I did:
> 
>  # trace-cmd record -M 1 -C local -e irq -e sched taskset -c 0
> ./perf-trace
> time_shift=31
> time_mult=633046315
> time_offset=-125757047487632
>
Let me ask something really really stupid: where was this run?

Host? Guest? Both?

This seems very interesting... But I definitely need to stare more at
your code (while scratching my head, at least for now :-P).

Thanks and Regards
Tzvetomir Stoyanov (VMware) Feb. 19, 2021, 5:56 p.m. UTC | #6
On Fri, Feb 19, 2021 at 4:36 PM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Fri, 19 Feb 2021 09:16:26 +0200
> Tzvetomir Stoyanov <tz.stoyanov@gmail.com> wrote:
>
> > Hi Steven,
> >
> > On Fri, Feb 19, 2021 at 4:03 AM Steven Rostedt <rostedt@goodmis.org> wrote:
> > >
> > > On Thu,  3 Dec 2020 08:02:26 +0200
> > > "Tzvetomir Stoyanov (VMware)" <tz.stoyanov@gmail.com> wrote:
> > >
> > > > +static int perf_mmap(struct perf_cpu *perf)
> > > > +{
> > > > +     mmap_mask = NUM_PAGES * getpagesize() - 1;
> > > > +
> > > > +     /* associate a buffer with the file */
> > > > +     perf->mpage = mmap(NULL, (NUM_PAGES + 1) * getpagesize(),
> > > > +                     PROT_READ | PROT_WRITE, MAP_SHARED, perf->perf_fd, 0);
> > > > +     if (perf->mpage == MAP_FAILED)
> > > > +             return -1;
> > > > +     return 0;
> > > > +}
> > >
> > > BTW, I found that the above holds the conversions we need for the local
> > > clock!
> > >
> > >         printf("time_shift=%d\n", perf->mpage->time_shift);
> > >         printf("time_mult=%d\n", perf->mpage->time_mult);
> > >         printf("time_offset=%lld\n", perf->mpage->time_offset);
> > >
> > > Which gives me:
> > >
> > > time_shift=31
> > > time_mult=633046315
> > > time_offset=-115773323084683
> > >
> > > [ one for each CPU ]
> >
> > This will give us time shift/mult/offset for each host CPU, right ? Is
> > the local trace clock
> > different for each CPU ?
>
> It can be. Note, the above offset is basically useless. That injects the
> current time into the value and we can't rely on it. But the shift and mult
> are needed.
>
> But usually the shift and mult are identical across CPUs on most systems,
> although there's no guarantee that it will always be the case.
>
> >Currently, the time offset is calculated per
> > VCPU, assuming
> > that the host CPU on which this VCPU runs has no impact on the
> > timestamp synchronization.
> > If the local clock depends on the CPU, then we should calculate the
> > time offset of each guest
> > event individually, depending on host CPU and VCPU the event happened
> > - as the host task which runs
> > the VCPU can migrate between CPUs at any time. So, we need to:
> >   1. Add timesync information for each host CPU in the trace.dat file.
> >   2. Track the migration between CPUs of each task that runs VCPU and
> > save that information
> >     in the trace.dat file.
>
> I was thinking about this too. And perhaps we can hold off until we find
> systems that have different values for mult and shift.
>
> That said, we can easily add this information by recording the sched_switch
> events in a separate buffer. And I've been thinking about doing this by
> default anyway. More below.
>
> >   3. When calculating the new timestamp of each guest event
> > (individually) - somehow find out on
> >      which host CPU that guest event happened ?
> >
> > Points 1 and 2 are doable, but will break the current trace.dat file
> > option that holds the timesync information.
>
> I don't think we need to have it in the timesync option. I think we can
> create another option to hold guest event data.
>
> > Point 3 is not clear to me, how we can get such information before the
> > host and guest events are synchronised ?
> >
>
>
> My thoughts about this is. When we enable tracing of a guest (-A), we then
> create an instance on the host that records only kvm enter / exit events as
> well as sched switch events. Basically, enable all the events that we need
> to synchronize and show entering and exiting of the guest.
>
> The synchronization logic already shows us what host thread controls each
> guest VCPU. If we record the kvm enter/exit and sched_switch events in a
> separate buffer, we can see when a host thread that runs a guest VCPU
> migrates to another CPU. Since the timestamps of those events are recorded
> in the meta events themselves (sched_switch), we know exactly where we need
> to use the new mult and shift values for the guest events.
>
> Make sense?
Yes, but I have two concerns:
 1. Recording kvm enter/exit events will make this implementation KVM
    specific - how about other hypervisors?
 2. The problem with finding the guest event -> host CPU relation. We know
    only the timestamp of the guest event, not yet synchronized with the
    host time. How will we find on what host CPU that guest event happened,
    as the host task CPU migration mapping is recorded with host time?
>
> -- Steve
Steven Rostedt Feb. 19, 2021, 7:11 p.m. UTC | #7
On Fri, 19 Feb 2021 19:56:59 +0200
Tzvetomir Stoyanov <tz.stoyanov@gmail.com> wrote:


> > Make sense?  
> Yes, but I have two concerns:
>  1. Recording kvm enter / exit events will make this implementation
> KVM specific,
>      how about other hypervisors ?

I think this should all be part of the protocol. The option can be
TRACECMD_OPTION_GUEST_METADATA, which has a field for "enter/exit"
events with CPU, vCPU and timestamp, and also a note of sched_switch
events.

That is, the implementation will enable kvm events, but other trace-cmd
plugins can implement other events. But what gets recorded into the
trace.dat file can be post processed into a generic format.
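
As a sketch of what one record in such an option could carry (the layout and
field names below are purely illustrative, nothing here exists in the file
format yet):

/* Hypothetical payload of one TRACECMD_OPTION_GUEST_METADATA record */
struct guest_meta_record {
	unsigned long long	host_ts;	/* host timestamp of the meta event */
	unsigned int		host_cpu;	/* physical CPU it was recorded on */
	unsigned int		vcpu;		/* guest VCPU backed by the host thread */
	unsigned int		host_pid;	/* host thread running that VCPU */
	unsigned int		type;		/* enter, exit or sched_switch/migration */
} __attribute__((packed));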


>  2. The problem with finding the guest event -> host CPU relation. We know
>     only the timestamp of the guest event, not yet synchronized with the
>     host time. How will we find on what host CPU that guest event happened,
>     as the host task CPU migration mapping is recorded with host time?

It would be done all post processed. Here's the workflow I envision:

 1. Run pre synchronization protocol between host/guest (ie. kvm, ptp)

 2. Start the meta data events.
	Start tracing guest enter/exit events along with scheduling.
	We may be able to even filter specifically if we know the threads
	ahead of time, but that's just an optimization.
	These events will be recorded raw into a file (with host threads
	saving the events to disk like it does for the tracing data).
	All this is in a separate instance.

 3. Run tracing
	Recorded events on the guest are sent to the host and stored in raw
	format on the host.
	Optionally, events on the host are also recorded.

 4. At the end of tracing
	Stop the recording of tracing.
	Stop the recording of meta data.
	Run post synchronization protocol between host and guest.

 5. Read the meta data events and convert them into a form of timestamp,
    enter/exit events, migration events. Yes, the timestamp will be that of
    the host.

A couple of things to understand with step 5, and assumptions we can make.

The scale factor between the host and guest for each VCPU should not
change, even if the VCPU migrates. If it does, then there had to have been a
kernel event where the kernel knew about this change, and we need to find a
way to extract that. For now, we shouldn't worry about it, as I'm guessing
that could be rather complex to manage even on the kernel end, and it would
also be expensive for migrations.

I'm making a big assumption here, that either the different VCPUs have
different scaling factors but the host scaling between physical CPUs is
the same, or the physical CPUs have different scaling factors but the VCPUs
do not. So far I ran this on 4 different machines (including my laptop) and
all had the same mult and shift on the host. Perhaps we just state that we
do not support machines that do not have this.

Anyway, when all is done, the meta data events will map to the host. Using
this mapping and the synchronization between the host and guest, we can map
these migrations and such to the guest itself. And if the vCPUs are
different (and we assume the host is the same for all), and we know when a
VCPU was scheduled away from a physical CPU (from the host time), we can then
figure out how to find the guest time when that happened.
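
To make that last step concrete, here is a post-processing sketch (the record
layout and helper name are hypothetical): given a VCPU thread's migration
history in host time, pick the physical CPU it was on when the guest event
happened (after the event's timestamp has been synchronized to host time) and
use that CPU's mult/shift pair:

/* One entry per meta event that moved the VCPU thread to another CPU,
 * sorted by host timestamp; moves[0] covers the start of the trace. */
struct vcpu_move {
	unsigned long long	host_ts;
	int			host_cpu;
};

struct cpu_scale {
	unsigned int		mult;
	unsigned int		shift;
};

/* Binary search the migration history for the last move at or before
 * 'ts' and return the mult/shift recorded for that physical CPU. */
static struct cpu_scale scale_for_guest_event(const struct vcpu_move *moves,
					      int nr_moves,
					      const struct cpu_scale *per_cpu,
					      unsigned long long ts)
{
	int lo = 0, hi = nr_moves - 1, best = 0;

	while (lo <= hi) {
		int mid = lo + (hi - lo) / 2;

		if (moves[mid].host_ts <= ts) {
			best = mid;
			lo = mid + 1;
		} else {
			hi = mid - 1;
		}
	}
	return per_cpu[moves[best].host_cpu];
}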

Or did I miss something?

-- Steve

Patch

diff --git a/tracecmd/Makefile b/tracecmd/Makefile
index 5a16a5e5..3fda6de8 100644
--- a/tracecmd/Makefile
+++ b/tracecmd/Makefile
@@ -33,6 +33,7 @@  TRACE_CMD_OBJS += trace-usage.o
 TRACE_CMD_OBJS += trace-dump.o
 TRACE_CMD_OBJS += trace-clear.o
 TRACE_CMD_OBJS += trace-obj-debug.o
+TRACE_CMD_OBJS += trace-perf.o
 ifeq ($(VSOCK_DEFINED), 1)
 TRACE_CMD_OBJS += trace-tsync.o
 endif
diff --git a/tracecmd/include/trace-local.h b/tracecmd/include/trace-local.h
index cd2d34a4..b0d03900 100644
--- a/tracecmd/include/trace-local.h
+++ b/tracecmd/include/trace-local.h
@@ -102,6 +102,8 @@  void trace_usage(int argc, char **argv);
 
 void trace_dump(int argc, char **argv);
 
+void trace_perf(int argc, char **argv);
+
 int trace_record_agent(struct tracecmd_msg_handle *msg_handle,
 		       int cpus, int *fds,
 		       int argc, char **argv, bool use_fifos,
diff --git a/tracecmd/trace-cmd.c b/tracecmd/trace-cmd.c
index 7376c5a5..0c5b324f 100644
--- a/tracecmd/trace-cmd.c
+++ b/tracecmd/trace-cmd.c
@@ -104,6 +104,7 @@  struct command commands[] = {
 	{"list", trace_list},
 	{"help", trace_usage},
 	{"dump", trace_dump},
+	{"perf", trace_perf},
 	{"-h", trace_usage},
 };
 
diff --git a/tracecmd/trace-perf.c b/tracecmd/trace-perf.c
new file mode 100644
index 00000000..9b9f3519
--- /dev/null
+++ b/tracecmd/trace-perf.c
@@ -0,0 +1,726 @@ 
+// SPDX-License-Identifier: LGPL-2.1
+/*
+ * Copyright (C) 2020, VMware, Tzvetomir Stoyanov <tz.stoyanov@gmail.com>
+ *
+ */
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <getopt.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <sys/ioctl.h>
+#include <pthread.h>
+#include <linux/perf_event.h>
+#include <asm/unistd.h>
+#include <sys/mman.h>
+#include <signal.h>
+
+#include "trace-local.h"
+#include "trace-cmd-local.h"
+
+#define SLEEP_USEC		1000
+#define SAMPLE_PER_SECOND	8000
+
+static volatile bool done;
+
+struct perf_stat {
+	__u64 count;
+	__u64 time_enabled;
+	__u64 time_running;
+	__u64 id;
+};
+
+struct perf_split_event {
+	int size;
+	int part_size;
+	char *buf;
+};
+
+struct perf_cpu {
+	int cpu;
+	int pid;
+	unsigned int prev_pos;
+	int perf_fd;
+	int sample_file_fd;
+	int mmap_file_fd;
+	char *sample_file;
+	char *mmap_file;
+	pthread_t tid;
+	struct perf_split_event split;
+	struct perf_event_mmap_page *mpage;
+};
+
+struct perf_session {
+	int pid;
+	int cpu_count;
+	struct perf_cpu *cpus;
+	struct trace_debug_object *fdebug;
+};
+
+static long perf_event_open(struct perf_event_attr *event, pid_t pid,
+			    int cpu, int group_fd, unsigned long flags)
+{
+	return syscall(__NR_perf_event_open, event, pid, cpu, group_fd, flags);
+}
+
+/*
+ * struct {
+ *	struct perf_event_header	header;
+ *
+ *	#
+ *	# Note that PERF_SAMPLE_IDENTIFIER duplicates PERF_SAMPLE_ID.
+ *	# The advantage of PERF_SAMPLE_IDENTIFIER is that its position
+ *	# is fixed relative to header.
+ *	#
+ *
+ *	{ u64			id;	  } && PERF_SAMPLE_IDENTIFIER
+ *	{ u64			ip;	  } && PERF_SAMPLE_IP
+ *	{ u32			pid, tid; } && PERF_SAMPLE_TID
+ *	{ u64			time;     } && PERF_SAMPLE_TIME
+ *	{ u64			addr;     } && PERF_SAMPLE_ADDR
+ *	{ u64			id;	  } && PERF_SAMPLE_ID
+ *	{ u64			stream_id;} && PERF_SAMPLE_STREAM_ID
+ *	{ u32			cpu, res; } && PERF_SAMPLE_CPU
+ *	{ u64			period;   } && PERF_SAMPLE_PERIOD
+ *
+ *	{ struct read_format	values;	  } && PERF_SAMPLE_READ
+ *
+ *	{ u64			nr,
+ *	  u64			ips[nr];  } && PERF_SAMPLE_CALLCHAIN
+ *
+ *	#
+ *	# The RAW record below is opaque data wrt the ABI
+ *	#
+ *	# That is, the ABI doesn't make any promises wrt to
+ *	# the stability of its content, it may vary depending
+ *	# on event, hardware, kernel version and phase of
+ *	# the moon.
+ *	#
+ *	# In other words, PERF_SAMPLE_RAW contents are not an ABI.
+ *	#
+ *
+ *	{ u32			size;
+ *	  char                  data[size];}&& PERF_SAMPLE_RAW
+ *
+ *	{ u64                   nr;
+ *	  { u64	hw_idx; } && PERF_SAMPLE_BRANCH_HW_INDEX
+ *        { u64 from, to, flags } lbr[nr];
+ *      } && PERF_SAMPLE_BRANCH_STACK
+ *
+ *	{ u64			abi; # enum perf_sample_regs_abi
+ *	  u64			regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER
+ *
+ *	{ u64			size;
+ *	  char			data[size];
+ *	  u64			dyn_size; } && PERF_SAMPLE_STACK_USER
+ *
+ *	{ u64			weight;   } && PERF_SAMPLE_WEIGHT
+ *	{ u64			data_src; } && PERF_SAMPLE_DATA_SRC
+ *	{ u64			transaction; } && PERF_SAMPLE_TRANSACTION
+ *	{ u64			abi; # enum perf_sample_regs_abi
+ *	  u64			regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
+ *	{ u64			phys_addr;} && PERF_SAMPLE_PHYS_ADDR
+ *	{ u64			size;
+ *	  char			data[size]; } && PERF_SAMPLE_AUX
+ * };
+ */
+
+struct perf_event_sample {
+	struct perf_event_header head;
+	uint64_t id;		/* PERF_SAMPLE_IDENTIFIER */
+	uint32_t pid, tid;	/* PERF_SAMPLE_TID */
+	uint64_t time;		/* PERF_SAMPLE_TIME */
+	uint32_t cpu, res;	/* PERF_SAMPLE_CPU */
+	uint64_t period;	/* PERF_SAMPLE_PERIOD */
+	uint64_t nr;
+	uint64_t ips[];		/* PERF_SAMPLE_CALLCHAIN */
+} __attribute__((packed));
+
+struct perf_event_mmap {
+	struct perf_event_header	header;
+
+	uint32_t			pid, tid;
+	uint64_t			addr;
+	uint64_t			len;
+	uint64_t			pgoff;
+	char				filename[];
+} __attribute__((packed));
+
+static int handle_event_sample(struct perf_event_sample *sample, struct perf_cpu *cpu)
+{
+	int i;
+
+	/* check if sample is for our PID */
+	if (sample->pid != cpu->pid)
+		return 0;
+
+	if (write(cpu->sample_file_fd, &sample->time, sizeof(uint64_t)) < 0)
+		return -1;
+	if (write(cpu->sample_file_fd, &sample->nr, sizeof(uint64_t)) < 0)
+		return -1;
+	for (i = 0; i < sample->nr; i++) {
+		if (write(cpu->sample_file_fd, &sample->ips[i], sizeof(uint64_t)) < 0)
+			return -1;
+	}
+
+	return 1;
+}
+
+static int handle_event_mmap(struct perf_event_mmap *event, struct perf_cpu *cpu)
+{
+	/* check if mmap is for our PID */
+	if (event->pid != cpu->pid)
+		return 0;
+	/* check if executable memory is mapped */
+	if (event->header.misc & PERF_RECORD_MISC_MMAP_DATA)
+		return 0;
+
+	if (write(cpu->mmap_file_fd, &event->addr, sizeof(uint64_t)) < 0)
+		return -1;
+	if (write(cpu->mmap_file_fd, &event->len, sizeof(uint64_t)) < 0)
+		return -1;
+	if (write(cpu->mmap_file_fd, &event->pgoff, sizeof(uint64_t)) < 0)
+		return -1;
+	if (write(cpu->mmap_file_fd, event->filename, strlen(event->filename) + 1) < 0)
+		return -1;
+
+	return 1;
+}
+
+#define NUM_PAGES 8
+static unsigned int mmap_mask;
+
+static int handle_event(struct perf_event_header *event, struct perf_cpu *cpu)
+{
+	int ret;
+
+	if (event->size == 0)
+		return -1;
+
+	switch (event->type) {
+	case PERF_RECORD_SAMPLE: /* new callstack sample */
+		ret = handle_event_sample((struct perf_event_sample *)event, cpu);
+		break;
+	case PERF_RECORD_MMAP: /* new memory mapped, maybe a library was loaded? */
+		ret = handle_event_mmap((struct perf_event_mmap *)event, cpu);
+		break;
+	default:
+		/* Got unknown perf event */
+		ret = 0;
+		break;
+	}
+	return ret;
+}
+
+static int perf_event_read_samples(struct perf_cpu *cpu, unsigned char *buf, int size)
+{
+	unsigned char *end = buf + size;
+	struct perf_event_header *h;
+	int csize = size;
+	int ret;
+
+	if (cpu->split.buf) {
+		/* partial read, check if there is enough data to complete it */
+		if ((cpu->split.size - cpu->split.part_size) > size)
+			return -1;
+		memcpy(cpu->split.buf + cpu->split.part_size,
+		       buf, cpu->split.size - cpu->split.part_size);
+		ret = handle_event((struct perf_event_header *)cpu->split.buf, cpu);
+		buf += (cpu->split.size - cpu->split.part_size);
+		free(cpu->split.buf);
+		memset(&cpu->split, 0, sizeof(struct perf_split_event));
+		if (ret < 0)
+			return -1; /* Failed to process the partial event */
+	}
+
+	while (buf < end) {
+		h = (struct perf_event_header *)buf;
+		if (h->size == 0)
+			return -1;
+		if (h->size > csize) {
+			/* part of the event is in the buffer, do partial read */
+			if (cpu->split.buf != NULL)
+				return -1; /* Previous partial read pending */
+			cpu->split.buf = calloc(1, h->size);
+			if (!cpu->split.buf)
+				return -1;
+			cpu->split.size = h->size;
+			cpu->split.part_size = csize;
+			memcpy(cpu->split.buf, buf, csize);
+			return csize;
+		}
+		if (handle_event(h, cpu) < 0)
+			return -1; /* Failed to process the event */
+		csize -= h->size;
+		buf += h->size;
+	}
+
+	return 0;
+}
+
+static int perf_event_read(struct perf_cpu *cpu)
+{
+	unsigned char *buf;
+	unsigned char *data;
+	unsigned int head;
+	int diff, size;
+	unsigned int old;
+
+	data = (unsigned char *)cpu->mpage + getpagesize();
+
+	head = cpu->mpage->data_head;
+	/* On SMP-capable platforms, after reading the data_head value,
+	 * user space should issue an rmb().
+	 */
+	__sync_synchronize();
+
+	old = cpu->prev_pos;
+	diff = head - old;
+	if (diff < 0)
+		old = head; /* buffer overrun, events lost */
+
+	size = head - old;
+	if ((old & mmap_mask) + size != (head & mmap_mask)) {
+		buf = data + (old & mmap_mask);
+		size = mmap_mask + 1 - (old & mmap_mask);
+		old += size;
+		perf_event_read_samples(cpu, buf, size);
+	}
+	buf = data + (old & mmap_mask);
+	size = head - old;
+	if (size)
+		perf_event_read_samples(cpu, buf, size);
+
+	old += size;
+	cpu->prev_pos = old;
+	/* reset the buffer */
+	cpu->mpage->data_tail = old;
+
+	return size;
+}
+
+static void perf_init_pe(struct perf_event_attr *pe)
+{
+	memset(pe, 0, sizeof(struct perf_event_attr));
+	pe->type = PERF_TYPE_HARDWARE;
+	pe->sample_type = PERF_SAMPLE_CALLCHAIN |
+			  PERF_SAMPLE_IDENTIFIER |
+			  PERF_SAMPLE_TID |
+			  PERF_SAMPLE_TIME |
+			  PERF_SAMPLE_CPU |
+			  PERF_SAMPLE_PERIOD;
+	pe->size = sizeof(struct perf_event_attr);
+	pe->config = PERF_COUNT_HW_CPU_CYCLES;
+	pe->disabled = 1;
+	pe->exclude_kernel = 1;
+	pe->freq = 1;
+	pe->sample_freq = SAMPLE_PER_SECOND;
+	pe->inherit = 1;
+	pe->mmap = 1;
+	pe->comm = 1;
+	pe->task = 1;
+	pe->precise_ip = 1;
+	pe->sample_id_all = 1;
+//	pe->mmap2 = 1;
+	pe->comm_exec = 1;
+	pe->ksymbol = 1;
+	pe->bpf_event = 1;
+	pe->read_format = PERF_FORMAT_ID |
+			PERF_FORMAT_TOTAL_TIME_ENABLED|
+			PERF_FORMAT_TOTAL_TIME_RUNNING;
+
+}
+
+static int perf_enable(struct perf_cpu *perf, bool enable)
+{
+	int ret;
+
+	if (enable) {
+		ret = ioctl(perf->perf_fd, PERF_EVENT_IOC_RESET, 0);
+		ret = ioctl(perf->perf_fd, PERF_EVENT_IOC_ENABLE, 0);
+	} else
+		ret = ioctl(perf->perf_fd, PERF_EVENT_IOC_DISABLE, 0);
+
+	return ret;
+}
+
+static int perf_mmap(struct perf_cpu *perf)
+{
+	mmap_mask = NUM_PAGES * getpagesize() - 1;
+
+	/* associate a buffer with the file */
+	perf->mpage = mmap(NULL, (NUM_PAGES + 1) * getpagesize(),
+			PROT_READ | PROT_WRITE, MAP_SHARED, perf->perf_fd, 0);
+	if (perf->mpage == MAP_FAILED)
+		return -1;
+	return 0;
+}
+
+static void perf_session_destroy(struct perf_session *perf)
+{
+	int i;
+
+	if (!perf)
+		return;
+
+	if (perf->cpu_count && perf->cpus) {
+		for (i = 0; i < perf->cpu_count; i++) {
+			if (perf->cpus[i].perf_fd >= 0)
+				close(perf->cpus[i].perf_fd);
+			if (perf->cpus[i].sample_file_fd >= 0)
+				close(perf->cpus[i].sample_file_fd);
+			if (perf->cpus[i].mmap_file_fd >= 0)
+				close(perf->cpus[i].mmap_file_fd);
+			if (perf->cpus[i].mpage)
+				munmap(perf->cpus[i].mpage,
+				       (NUM_PAGES + 1) * getpagesize());
+			if (perf->cpus[i].sample_file) {
+				remove(perf->cpus[i].sample_file);
+				free(perf->cpus[i].sample_file);
+			}
+			if (perf->cpus[i].mmap_file) {
+				remove(perf->cpus[i].mmap_file);
+				free(perf->cpus[i].mmap_file);
+			}
+			free(perf->cpus[i].split.buf);
+		}
+		free(perf->cpus);
+	}
+
+	if (perf->fdebug)
+		trace_debug_obj_destroy(perf->fdebug);
+
+	free(perf);
+}
+
+#define TMP_FILE "/tmp/perf_temp_data.XXXXXX"
+static struct perf_session *perf_session_new(int pid)
+{
+	int cpus = tracecmd_count_cpus();
+	struct perf_event_attr pe;
+	struct perf_session *perf;
+	char *template;
+	int i;
+
+	perf = calloc(1, sizeof(struct perf_session));
+	if (!perf)
+		return NULL;
+	template = strdup(TMP_FILE);
+	if (!template)
+		goto error;
+	perf->fdebug = trace_debug_obj_create_pid(pid);
+	if (!perf->fdebug)
+		goto error;
+	perf->cpus = calloc(cpus, sizeof(struct perf_cpu));
+	if (!perf->cpus)
+		goto error;
+	for (i = 0; i < cpus; i++) {
+		perf->cpus[i].perf_fd = -1;
+		perf->cpus[i].sample_file_fd = -1;
+		perf->cpus[i].mmap_file_fd = -1;
+	}
+
+	perf_init_pe(&pe);
+	for (i = 0; i < cpus; i++) {
+		perf->cpus[i].perf_fd = perf_event_open(&pe, pid, i, -1, 0);
+		if (perf->cpus[i].perf_fd < 0)
+			goto error;
+		fcntl(perf->cpus[i].perf_fd, F_SETFL, O_NONBLOCK);
+		perf->cpus[i].cpu = i;
+		if (perf_mmap(&perf->cpus[i]) < 0)
+			goto error;
+		strcpy(template, TMP_FILE);
+		mktemp(template);
+		perf->cpus[i].sample_file = strdup(template);
+		strcpy(template, TMP_FILE);
+		mktemp(template);
+		perf->cpus[i].mmap_file = strdup(template);
+		perf->cpus[i].pid = pid;
+	}
+	perf->cpu_count = cpus;
+	perf->pid = pid;
+	free(template);
+	return perf;
+error:
+	free(template);
+	perf_session_destroy(perf);
+	return NULL;
+}
+
+void *perf_read_thread(void *context)
+{
+	struct perf_cpu *cpu = (struct perf_cpu *)context;
+	struct timespec swait;
+	sigset_t sig_set;
+	int ret;
+
+	swait.tv_sec = SLEEP_USEC / 1000000;
+	swait.tv_nsec = (SLEEP_USEC % 1000000) * 1000;
+
+	sigemptyset(&sig_set);
+	sigaddset(&sig_set, SIGINT);
+	sigaddset(&sig_set, SIGTERM);
+	pthread_sigmask(SIG_BLOCK, &sig_set, NULL);
+
+	cpu->sample_file_fd = open(cpu->sample_file, O_WRONLY|O_CREAT|O_TRUNC, 0600);
+	if (cpu->sample_file_fd < 0)
+		return NULL;
+	cpu->mmap_file_fd = open(cpu->mmap_file, O_WRONLY|O_CREAT|O_TRUNC, 0600);
+	if (cpu->mmap_file_fd < 0)
+		return NULL;
+
+	perf_enable(cpu, true);
+
+	while (1) {
+		ret = perf_event_read(cpu);
+		if (!ret) /* sleep if the buffer was empty */
+			nanosleep(&swait, NULL);
+	}
+
+	return NULL;
+}
+
+#define PERF_READER_SCHEDULER SCHED_RR
+static int perf_run_readers(struct perf_session *perf)
+{
+	struct sched_param sched;
+	pthread_attr_t attr;
+	int i;
+
+	sched.sched_priority = sched_get_priority_max(PERF_READER_SCHEDULER);
+	pthread_attr_init(&attr);
+	pthread_attr_setschedpolicy(&attr, PERF_READER_SCHEDULER);
+	pthread_attr_setschedparam(&attr, &sched);
+	pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+
+	for (i = 0; i < perf->cpu_count; i++) {
+		if (pthread_create(&perf->cpus[i].tid, &attr,
+				   perf_read_thread, &perf->cpus[i]))
+			return -1;
+	}
+
+	return 0;
+}
+
+static int perf_stop_readers(struct perf_session *perf)
+{
+	int i;
+
+	for (i = 0; i < perf->cpu_count; i++)
+		pthread_cancel(perf->cpus[i].tid);
+
+	for (i = 0; i < perf->cpu_count; i++)
+		pthread_join(perf->cpus[i].tid, NULL);
+
+	return 0;
+}
+
+static void perf_collect_results(struct perf_session *perf, int cpu)
+{
+	unsigned long long addr, len, pgoff;
+	unsigned long long time, count, ip;
+	char file[PATH_MAX];
+	int sfd, mfd, i;
+	int sall = 0;
+	int mall = 0;
+	int ret;
+
+	if (!perf->cpus[cpu].mmap_file || !perf->cpus[cpu].sample_file)
+		return;
+	mfd = open(perf->cpus[cpu].mmap_file, O_RDONLY, 0600);
+	if (mfd < 0)
+		return;
+	sfd = open(perf->cpus[cpu].sample_file, O_RDONLY, 0600);
+	if (sfd < 0)
+		return;
+	do {
+		/* Loop through all dynamically loaded libraries during the trace
+		 * and attach them to the debug object, to be used for symbols resolve.
+		 */
+		ret = read(mfd, &addr, sizeof(addr));
+		if (ret != sizeof(addr))
+			break;
+		ret = read(mfd, &len, sizeof(len));
+		if (ret != sizeof(len))
+			break;
+		ret = read(mfd, &pgoff, sizeof(pgoff));
+		if (ret != sizeof(pgoff))
+			break;
+		ret = trace_read_file_string(mfd, file, PATH_MAX);
+		if (ret < 0)
+			break;
+		trace_debug_obj_add_file(perf->fdebug, file, addr, addr + len, pgoff);
+		mall++;
+	} while (1);
+
+	do {
+		/* Loop through all collected calltraces and add addresses to
+		 * the debug object for function name resolving.
+		 */
+		ret = read(sfd, &time, sizeof(time));
+		if (ret != sizeof(time))
+			break;
+
+		ret = read(sfd, &count, sizeof(count));
+		if (ret != sizeof(count))
+			break;
+
+		for (i = 0; i < count; i++) {
+			ret = read(sfd, &ip, sizeof(ip));
+			if (ret != sizeof(ip))
+				break;
+			ret = trace_debug_add_resolve_symbol(perf->fdebug, ip, NULL);
+		}
+		sall++;
+		if (i < count)
+			break;
+	} while (1);
+	close(sfd);
+	close(mfd);
+
+	printf("Got %d samples and %d mmaps for CPU %d\n", sall, mall, cpu);
+}
+
+static void stop_perf(int sig)
+{
+	done = true;
+}
+
+static int perf_collect(struct perf_session *perf)
+{
+	int i;
+
+	signal(SIGINT, stop_perf);
+	signal(SIGTERM, stop_perf);
+
+	if (perf_run_readers(perf))
+		return -1;
+
+	printf("Collecting callstack of pid %d, hit Ctrl^C to stop recording\n",
+		perf->pid);
+
+	while (!done)
+		usleep(500000);
+
+	for (i = 0; i < perf->cpu_count; i++)
+		perf_enable(&perf->cpus[i], false);
+
+	perf_stop_readers(perf);
+
+	for (i = 0; i < perf->cpu_count; i++)
+		perf_collect_results(perf, i);
+
+	return 0;
+}
+
+struct symb_walk_context {
+	struct trace_seq *s;
+	int count;
+};
+
+static int perf_symbols_walk(struct tracecmd_debug_symbols *symb, void *data)
+{
+	struct symb_walk_context *context = (struct symb_walk_context *)data;
+
+	if (!symb->name)
+		printf("\t Failed to resolve %llx in %s\n",
+			symb->vma_near, symb->fname);
+
+	trace_seq_printf(context->s, "%s %s %llx %llx\n",
+			 symb->fname, symb->name,
+			 symb->vma_start, symb->vma_near);
+	context->count++;
+
+	return 0;
+}
+
+static void
+add_pid_symbols(struct tracecmd_output *handle, struct perf_session *perf)
+{
+	struct symb_walk_context context;
+	struct trace_seq body, head;
+
+	trace_seq_init(&body);
+	trace_seq_init(&head);
+	context.s = &body;
+	context.count = 0;
+	trace_debug_walk_resolved_symbols(perf->fdebug, perf_symbols_walk, &context);
+	trace_seq_printf(&head, "%x %d\n", perf->pid, context.count);
+	trace_seq_terminate(&head);
+	trace_seq_terminate(&body);
+	trace_seq_puts(&head, body.buffer);
+	if (handle && context.count)
+		tracecmd_add_option(handle, TRACECMD_OPTION_PID_SYMBOLS,
+				    head.len + 1, head.buffer);
+	else
+		trace_seq_do_printf(&head);
+	trace_seq_destroy(&body);
+}
+
+static int perf_resolve_trace(struct perf_session *perf)
+{
+	trace_debug_resolve_symbols(perf->fdebug);
+	add_pid_symbols(NULL, perf);
+
+	return 0;
+}
+
+int perf_run(int pid)
+{
+	struct perf_session *perf;
+
+	perf = perf_session_new(pid);
+	if (perf == NULL)
+		return -1;
+
+	perf_collect(perf);
+
+	perf_resolve_trace(perf);
+
+	perf_session_destroy(perf);
+	return 0;
+}
+
+enum {
+	OPT_pid	= 255,
+};
+
+void trace_perf(int argc, char **argv)
+{
+	int pid = 0;
+	int c;
+
+
+	if (strcmp(argv[1], "perf") != 0)
+		usage(argv);
+	for (;;) {
+		int option_index = 0;
+		static struct option long_options[] = {
+			{"pid", required_argument, NULL, OPT_pid},
+			{"help", no_argument, NULL, '?'},
+			{NULL, 0, NULL, 0}
+		};
+
+		c = getopt_long (argc-1, argv+1, "+hp:",
+				long_options, &option_index);
+		if (c == -1)
+			break;
+		switch (c) {
+
+		case OPT_pid:
+		case 'p':
+			pid = atoi(optarg);
+			break;
+		case 'h':
+		case '?':
+		default:
+			usage(argv);
+		}
+	}
+
+	if (perf_run(pid) < 0)
+		printf("Failed to run perf\n");
+}