diff mbox series

perf cs-etm: Fix kernel timestamp handling

Message ID 20230910092413.53538-1-leo.yan@linaro.org (mailing list archive)
State New, archived
Headers show
Series perf cs-etm: Fix kernel timestamp handling | expand

Commit Message

Leo Yan Sept. 10, 2023, 9:24 a.m. UTC
The timestamp can originate from two sources: the kernel timestamp,
which is recorded in the event PERF_RECORD_AUX, and the Arm CoreSight
hardware trace data.  On some Arm platforms, CoreSight trace data fails
to support timestamp tracing.  This can be due to either a missed
connection between the timer counter and Arm CoreSight or the absence
of support for the virtual timestamp.  If Arm CoreSight fails to support
hardware timestamp tracing, we need to fall back on using the kernel
timestamp.

The current code can support both timestamp sources when synthesizing
samples. However, the decoding flow only relies on the hardware
timestamp.  If the hardware timestamp is zero, it becomes impossible to
decode the trace data.  Consequently, in this case, the commands below
won't output any samples:

    perf record -e cs_etm// --per-thread --timestamp -- ls
    perf script

To fix this issue, this patch unifies the method of resolving time:

1) It renames cs_etm__resolve_sample_time() to the more general name
   cs_etm__resolve_time();
2) It changes the function argument type from 'cs_etm_traceid_queue' to
   'cs_etm_packet_queue';
3) In the end, both the decoding flow and the assignment of timestamps
   to samples call cs_etm__resolve_time() to obtain timestamp.

Signed-off-by: Leo Yan <leo.yan@linaro.org>
---
 tools/perf/util/cs-etm.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

Comments

James Clark Sept. 11, 2023, 1:24 p.m. UTC | #1
On 10/09/2023 10:24, Leo Yan wrote:
> The timestamp can originate from two sources: the kernel timestamp,
> which is recorded in the event PERF_RECORD_AUX, and the Arm CoreSight
> hardware trace data.  On some Arm platforms, CoreSight trace data fails
> to support timestamp tracing.  This can be due to either a missed
> connection between the timer counter and Arm CoreSight or the absence
> of support for the virtual timestamp.  If Arm CoreSight fails to support
> hardware timestamp tracing, we need to fall back on using the kernel
> timestamp.
> 
> The current code can support both timestamp sources when synthesizing
> samples. However, the decoding flow only relies on the hardware
> timestamp.  If the hardware timestamp is zero, it becomes impossible to
> decode the trace data.  Consequently, in this case, the commands below
> won't output any samples:
> 
>     perf record -e cs_etm// --per-thread --timestamp -- ls
>     perf script

Hi Leo,

I couldn't reproduce this issue, even when hard coding the hardware
timestamp to zero in cs_etm_decoder__do_hard_timestamp() like this:

 converted_timestamp = 0;

I'm not sure why this would result in no samples being generated either,
because we don't actually use the timestamps for anything yet [1]. We
always wait until the very end of the file before decoding to ensure
that all of the mmaps are loaded. And the timestamp is just assigned to
the samples, but they shouldn't affect whether they are generated or not.

Unless there is something else I'm missing?

Also, in cs_etm__queue_first_cs_timestamp(), cs_timestamp is used for
sorting the decoding order between CPUs, but if the hardware timestamp
is 0, then it's 0 on all trace. So the sorting would be the same if you
change that to be the kernel timestamp. They're all still the same
static number, but just a different number (because we wait until the
end of the file, 'latest_kernel_timestamp' is always the timestamp of
the last AUX event in the file).

[1]: I still plan to change the decoding to decode up to the current
time in the file, rather than waiting for the end of the file before
starting. That way decoding trace when there were overlapping mmap
regions in time will be more accurate.

Thanks
James

> 
> To fix this issue, this patch unifies the method of resolving time:
> 
> 1) It renames cs_etm__resolve_sample_time() to the more general name
>    cs_etm__resolve_time();
> 2) It changes the function argument type from 'cs_etm_traceid_queue' to
>    'cs_etm_packet_queue';
> 3) In the end, both the decoding flow and the assignment of timestamps
>    to samples call cs_etm__resolve_time() to obtain timestamp.
> 
> Signed-off-by: Leo Yan <leo.yan@linaro.org>
> ---
>  tools/perf/util/cs-etm.c | 32 ++++++++++++++++----------------
>  1 file changed, 16 insertions(+), 16 deletions(-)
> 
> diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
> index 9729d006550d..fa88e731933d 100644
> --- a/tools/perf/util/cs-etm.c
> +++ b/tools/perf/util/cs-etm.c
> @@ -400,6 +400,17 @@ void cs_etm__etmq_set_traceid_queue_timestamp(struct cs_etm_queue *etmq,
>  	etmq->pending_timestamp_chan_id = trace_chan_id;
>  }
>  
> +static u64 cs_etm__resolve_time(struct cs_etm_queue *etmq,
> +				struct cs_etm_packet_queue *packet_queue)
> +{
> +	struct cs_etm_auxtrace *etm = etmq->etm;
> +
> +	if (!etm->timeless_decoding && etm->has_virtual_ts)
> +		return packet_queue->cs_timestamp;
> +	else
> +		return etm->latest_kernel_timestamp;
> +}
> +
>  static u64 cs_etm__etmq_get_timestamp(struct cs_etm_queue *etmq,
>  				      u8 *trace_chan_id)
>  {
> @@ -419,8 +430,7 @@ static u64 converted_timestamp = 0;(struct cs_etm_queue *etmq,
>  	/* Acknowledge pending status */
>  	etmq->pending_timestamp_chan_id = 0;
>  
> -	/* See function cs_etm_decoder__do_{hard|soft}_timestamp() */
> -	return packet_queue->cs_timestamp;
> +	return cs_etm__resolve_time(etmq, packet_queue);
>  }
>  
>  static void cs_etm__clear_packet_queue(struct cs_etm_packet_queue *queue)
> @@ -1434,18 +1444,6 @@ u64 cs_etm__convert_sample_time(struct cs_etm_queue *etmq, u64 cs_timestamp)
>  		return cs_timestamp;
>  }
>  
> -static inline u64 cs_etm__resolve_sample_time(struct cs_etm_queue *etmq,
> -					       struct cs_etm_traceid_queue *tidq)
> -{
> -	struct cs_etm_auxtrace *etm = etmq->etm;
> -	struct cs_etm_packet_queue *packet_queue = &tidq->packet_queue;
> -
> -	if (!etm->timeless_decoding && etm->has_virtual_ts)
> -		return packet_queue->cs_timestamp;
> -	else
> -		return etm->latest_kernel_timestamp;
> -}
> -
>  static int cs_etm__synth_instruction_sample(struct cs_etm_queue *etmq,
>  					    struct cs_etm_traceid_queue *tidq,
>  					    u64 addr, u64 period)
> @@ -1454,13 +1452,14 @@ static int cs_etm__synth_instruction_sample(struct cs_etm_queue *etmq,
>  	struct cs_etm_auxtrace *etm = etmq->etm;
>  	union perf_event *event = tidq->event_buf;
>  	struct perf_sample sample = {.ip = 0,};
> +	struct cs_etm_packet_queue *packet_queue = &tidq->packet_queue;
>  
>  	event->sample.header.type = PERF_RECORD_SAMPLE;
>  	event->sample.header.misc = cs_etm__cpu_mode(etmq, addr, tidq->el);
>  	event->sample.header.size = sizeof(struct perf_event_header);
>  
>  	/* Set time field based on etm auxtrace config. */
> -	sample.time = cs_etm__resolve_sample_time(etmq, tidq);
> +	sample.time = cs_etm__resolve_time(etmq, packet_queue);
>  
>  	sample.ip = addr;
>  	sample.pid = thread__pid(tidq->thread);
> @@ -1505,6 +1504,7 @@ static int cs_etm__synth_branch_sample(struct cs_etm_queue *etmq,
>  	struct cs_etm_auxtrace *etm = etmq->etm;
>  	struct perf_sample sample = {.ip = 0,};
>  	union perf_event *event = tidq->event_buf;
> +	struct cs_etm_packet_queue *packet_queue = &tidq->packet_queue;
>  	struct dummy_branch_stack {
>  		u64			nr;
>  		u64			hw_idx;
> @@ -1520,7 +1520,7 @@ static int cs_etm__synth_branch_sample(struct cs_etm_queue *etmq,
>  	event->sample.header.size = sizeof(struct perf_event_header);
>  
>  	/* Set time field based on etm auxtrace config. */
> -	sample.time = cs_etm__resolve_sample_time(etmq, tidq);
> +	sample.time = cs_etm__resolve_time(etmq, packet_queue);
>  
>  	sample.ip = ip;
>  	sample.pid = thread__pid(tidq->prev_packet_thread);
Leo Yan Sept. 12, 2023, 1:52 a.m. UTC | #2
Hi James,

On Mon, Sep 11, 2023 at 02:24:09PM +0100, James Clark wrote:

[...]

> > The current code can support both timestamp sources when synthesizing
> > samples. However, the decoding flow only relies on the hardware
> > timestamp.  If the hardware timestamp is zero, it becomes impossible to
> > decode the trace data.  Consequently, in this case, the commands below
> > won't output any samples:
> > 
> >     perf record -e cs_etm// --per-thread --timestamp -- ls
> >     perf script
> 
> Hi Leo,
> 
> I couldn't reproduce this issue, even when hard coding the hardware
> timestamp to zero in cs_etm_decoder__do_hard_timestamp() like this:
> 
>  converted_timestamp = 0;

I reproduced this issue on the Juno-r2 board, its "etm->has_virtual_ts"
is always false and Arm CoreSight timestamp packets are zeros.

Besides set "converted_timestamp = 0", it might need to hard code
"etm->has_virtual_ts" to false?

> I'm not sure why this would result in no samples being generated either,
> because we don't actually use the timestamps for anything yet [1]. We
> always wait until the very end of the file before decoding to ensure
> that all of the mmaps are loaded. And the timestamp is just assigned to
> the samples, but they shouldn't affect whether they are generated or not.
> 
> Unless there is something else I'm missing?

Let's review below code.

cs_etm__queue_first_cs_timestamp() retrieves trace data and decodes it,
and breaks the while loop until it find the timestamp is not zero or no
trace data is avaliable.  When the timestamp is always zero, the while
loop continues to drop the CoreSight trace data and don't synthesize
samples.

  cs_etm__queue_first_cs_timestamp()
  {
      ...

      while(1) {
          ret = cs_etm__get_data_block(etmq);
          if (ret <= 0)
              goto out;

          ret = cs_etm__decode_data_block(etmq);
          if (ret)
              goto out;

          cs_timestamp = cs_etm__etmq_get_timestamp(etmq, &trace_chan_id);
          /* We found a timestamp, no need to continue. */
          if (cs_timestamp)
              break;

          cs_etm__clear_all_packet_queues(etmq);
      }
  }

> Also, in cs_etm__queue_first_cs_timestamp(), cs_timestamp is used for
> sorting the decoding order between CPUs, but if the hardware timestamp
> is 0, then it's 0 on all trace.

Correct.

> So the sorting would be the same if you change that to be the kernel
> timestamp. They're all still the same
> static number, but just a different number (because we wait until the
> end of the file, 'latest_kernel_timestamp' is always the timestamp of
> the last AUX event in the file).

If we use the 'latest_kernel_timestamp' as timestamp, it's non-zero
timestamp rather than all timestamp '0'.  Yes, 'latest_kernel_timestamp'
is a coarse kernel timestamp which is shared by all trace data recorded
in the AUX event, though it's a static number, it can allow us to break
the while loop mentioned above.

I understand 'latest_kernel_timestamp' is inaccurate for sorting, but
as least now it exists in current code, quotes from util/cs-etm.c:

  /*
   * Record the latest kernel timestamp available in the header   
   * for samples so that synthesised samples occur from this point
   * onwards.
   */
  if (sample->time && (sample->time != (u64)-1))
        etm->latest_kernel_timestamp = sample->time;

> [1]: I still plan to change the decoding to decode up to the current
> time in the file, rather than waiting for the end of the file before
> starting. That way decoding trace when there were overlapping mmap
> regions in time will be more accurate.

Thanks for heading up.  I am not sure if I understand this correctly,
but seems to me it is a good thing to try for using overlapping mmap
events.

Just a side topic, if an Arm platform connect the Arm timer counter to
Arm CoreSight, and detect virtual timestamp is false (thus
etm->has_virtual_ts is 0).  I think this might happen on some legacy
platforms, can we use some ways to allow users to still use the Arm
CoreSight timestamp in this case?  E.g. we can force set
etm->has_virtual_ts to 1 when user specify the config
'-e cs_etm/timestamp/'.

Thanks,
Leo
James Clark Sept. 12, 2023, 10:57 a.m. UTC | #3
On 12/09/2023 02:52, Leo Yan wrote:
> Hi James,
> 
> On Mon, Sep 11, 2023 at 02:24:09PM +0100, James Clark wrote:
> 
> [...]
> 
>>> The current code can support both timestamp sources when synthesizing
>>> samples. However, the decoding flow only relies on the hardware
>>> timestamp.  If the hardware timestamp is zero, it becomes impossible to
>>> decode the trace data.  Consequently, in this case, the commands below
>>> won't output any samples:
>>>
>>>     perf record -e cs_etm// --per-thread --timestamp -- ls
>>>     perf script
>>
>> Hi Leo,
>>
>> I couldn't reproduce this issue, even when hard coding the hardware
>> timestamp to zero in cs_etm_decoder__do_hard_timestamp() like this:
>>
>>  converted_timestamp = 0;
> 
> I reproduced this issue on the Juno-r2 board, its "etm->has_virtual_ts"
> is always false and Arm CoreSight timestamp packets are zeros.
> 
> Besides set "converted_timestamp = 0", it might need to hard code
> "etm->has_virtual_ts" to false?
> >> I'm not sure why this would result in no samples being generated either,
>> because we don't actually use the timestamps for anything yet [1]. We
>> always wait until the very end of the file before decoding to ensure
>> that all of the mmaps are loaded. And the timestamp is just assigned to
>> the samples, but they shouldn't affect whether they are generated or not.
>>
>> Unless there is something else I'm missing?
> 
> Let's review below code.
> 
> cs_etm__queue_first_cs_timestamp() retrieves trace data and decodes it,
> and breaks the while loop until it find the timestamp is not zero or no
> trace data is avaliable.  When the timestamp is always zero, the while
> loop continues to drop the CoreSight trace data and don't synthesize
> samples.
> 
>   cs_etm__queue_first_cs_timestamp()
>   {
>       ...
> 
>       while(1) {
>           ret = cs_etm__get_data_block(etmq);
>           if (ret <= 0)
>               goto out;
> 
>           ret = cs_etm__decode_data_block(etmq);
>           if (ret)
>               goto out;
> 
>           cs_timestamp = cs_etm__etmq_get_timestamp(etmq, &trace_chan_id);
>           /* We found a timestamp, no need to continue. */
>           if (cs_timestamp)
>               break;
> 
>           cs_etm__clear_all_packet_queues(etmq);
>       }
>   }
> 

Ah, I couldn't reproduce it because I was missing your other patch to
add Coresight timestamps when Perf timestamps are requested. If I add
that patch or force Coresight timestamps then I can reproduce the issue:

   $ perf record -e cs_etm/timestamp=1/ --per-thread --timestamp -- ls

But when running perf script I get this warning:

 Zero Coresight timestamp found at Idx:39. Decoding may be improved by
 prepending 'Z' to your current --itrace arguments.

And then if I force timeless mode like it says to do I get samples:

 $ perf script --itrace=Zi1000i

I'm not sure if silently making it use the kernel timestamp for sorting
makes sense, especially when zero hardware timestamps are a bug, and we
already have a warning and a workaround for it.

Also it would mean that you are requesting timestamps on a platform that
isn't generating them. Maybe the fix could also just be to not request
timestamps, and then it uses the timeless decoding flow that also works:

   $ perf record -e cs_etm/timestamp=0/ --per-thread --timestamp -- ls

My worry is that mixing kernel and coresight timestamps for sorting
could make the code confusing to reason about, and sorting based on a
single static AUX timestamp doesn't make sense logically (it's almost
like a hack where the side effect is to make it work). And there isn't
really anything that there isn't already a workaround for. Unless you
want to explicitly disable the "use the Z option" warning and make it
work transparently? Which I'm not sure is the right thing to do.


>> Also, in cs_etm__queue_first_cs_timestamp(), cs_timestamp is used for
>> sorting the decoding order between CPUs, but if the hardware timestamp
>> is 0, then it's 0 on all trace.
> 
> Correct.
> 
>> So the sorting would be the same if you change that to be the kernel
>> timestamp. They're all still the same
>> static number, but just a different number (because we wait until the
>> end of the file, 'latest_kernel_timestamp' is always the timestamp of
>> the last AUX event in the file).
> 
> If we use the 'latest_kernel_timestamp' as timestamp, it's non-zero
> timestamp rather than all timestamp '0'.  Yes, 'latest_kernel_timestamp'
> is a coarse kernel timestamp which is shared by all trace data recorded
> in the AUX event, though it's a static number, it can allow us to break
> the while loop mentioned above.
> 
> I understand 'latest_kernel_timestamp' is inaccurate for sorting, but
> as least now it exists in current code, quotes from util/cs-etm.c:
> 
>   /*
>    * Record the latest kernel timestamp available in the header   
>    * for samples so that synthesised samples occur from this point
>    * onwards.
>    */
>   if (sample->time && (sample->time != (u64)-1))
>         etm->latest_kernel_timestamp = sample->time;
> 
>> [1]: I still plan to change the decoding to decode up to the current
>> time in the file, rather than waiting for the end of the file before
>> starting. That way decoding trace when there were overlapping mmap
>> regions in time will be more accurate.
> 
> Thanks for heading up.  I am not sure if I understand this correctly,
> but seems to me it is a good thing to try for using overlapping mmap
> events.
> 
> Just a side topic, if an Arm platform connect the Arm timer counter to
> Arm CoreSight, and detect virtual timestamp is false (thus
> etm->has_virtual_ts is 0).  I think this might happen on some legacy
> platforms, can we use some ways to allow users to still use the Arm
> CoreSight timestamp in this case?  E.g. we can force set
> etm->has_virtual_ts to 1 when user specify the config
> '-e cs_etm/timestamp/'.
> 
> Thanks,
> Leo
diff mbox series

Patch

diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
index 9729d006550d..fa88e731933d 100644
--- a/tools/perf/util/cs-etm.c
+++ b/tools/perf/util/cs-etm.c
@@ -400,6 +400,17 @@  void cs_etm__etmq_set_traceid_queue_timestamp(struct cs_etm_queue *etmq,
 	etmq->pending_timestamp_chan_id = trace_chan_id;
 }
 
+static u64 cs_etm__resolve_time(struct cs_etm_queue *etmq,
+				struct cs_etm_packet_queue *packet_queue)
+{
+	struct cs_etm_auxtrace *etm = etmq->etm;
+
+	if (!etm->timeless_decoding && etm->has_virtual_ts)
+		return packet_queue->cs_timestamp;
+	else
+		return etm->latest_kernel_timestamp;
+}
+
 static u64 cs_etm__etmq_get_timestamp(struct cs_etm_queue *etmq,
 				      u8 *trace_chan_id)
 {
@@ -419,8 +430,7 @@  static u64 cs_etm__etmq_get_timestamp(struct cs_etm_queue *etmq,
 	/* Acknowledge pending status */
 	etmq->pending_timestamp_chan_id = 0;
 
-	/* See function cs_etm_decoder__do_{hard|soft}_timestamp() */
-	return packet_queue->cs_timestamp;
+	return cs_etm__resolve_time(etmq, packet_queue);
 }
 
 static void cs_etm__clear_packet_queue(struct cs_etm_packet_queue *queue)
@@ -1434,18 +1444,6 @@  u64 cs_etm__convert_sample_time(struct cs_etm_queue *etmq, u64 cs_timestamp)
 		return cs_timestamp;
 }
 
-static inline u64 cs_etm__resolve_sample_time(struct cs_etm_queue *etmq,
-					       struct cs_etm_traceid_queue *tidq)
-{
-	struct cs_etm_auxtrace *etm = etmq->etm;
-	struct cs_etm_packet_queue *packet_queue = &tidq->packet_queue;
-
-	if (!etm->timeless_decoding && etm->has_virtual_ts)
-		return packet_queue->cs_timestamp;
-	else
-		return etm->latest_kernel_timestamp;
-}
-
 static int cs_etm__synth_instruction_sample(struct cs_etm_queue *etmq,
 					    struct cs_etm_traceid_queue *tidq,
 					    u64 addr, u64 period)
@@ -1454,13 +1452,14 @@  static int cs_etm__synth_instruction_sample(struct cs_etm_queue *etmq,
 	struct cs_etm_auxtrace *etm = etmq->etm;
 	union perf_event *event = tidq->event_buf;
 	struct perf_sample sample = {.ip = 0,};
+	struct cs_etm_packet_queue *packet_queue = &tidq->packet_queue;
 
 	event->sample.header.type = PERF_RECORD_SAMPLE;
 	event->sample.header.misc = cs_etm__cpu_mode(etmq, addr, tidq->el);
 	event->sample.header.size = sizeof(struct perf_event_header);
 
 	/* Set time field based on etm auxtrace config. */
-	sample.time = cs_etm__resolve_sample_time(etmq, tidq);
+	sample.time = cs_etm__resolve_time(etmq, packet_queue);
 
 	sample.ip = addr;
 	sample.pid = thread__pid(tidq->thread);
@@ -1505,6 +1504,7 @@  static int cs_etm__synth_branch_sample(struct cs_etm_queue *etmq,
 	struct cs_etm_auxtrace *etm = etmq->etm;
 	struct perf_sample sample = {.ip = 0,};
 	union perf_event *event = tidq->event_buf;
+	struct cs_etm_packet_queue *packet_queue = &tidq->packet_queue;
 	struct dummy_branch_stack {
 		u64			nr;
 		u64			hw_idx;
@@ -1520,7 +1520,7 @@  static int cs_etm__synth_branch_sample(struct cs_etm_queue *etmq,
 	event->sample.header.size = sizeof(struct perf_event_header);
 
 	/* Set time field based on etm auxtrace config. */
-	sample.time = cs_etm__resolve_sample_time(etmq, tidq);
+	sample.time = cs_etm__resolve_time(etmq, packet_queue);
 
 	sample.ip = ip;
 	sample.pid = thread__pid(tidq->prev_packet_thread);