diff mbox series

perf arm-spe: Use SPE data source for neoverse cores

Message ID 20220121182456.13538-1-alisaidi@amazon.com (mailing list archive)
State New, archived
Headers show
Series perf arm-spe: Use SPE data source for neoverse cores | expand

Commit Message

Ali Saidi Jan. 21, 2022, 6:24 p.m. UTC
When synthesizing data from SPE, augment the type with source information
for Arm Neoverse cores. The field is IMPLDEF but the Neoverse cores all use
the same encoding. I can't find encoding information for any other SPE
implementations to unify their choices with Arm's thus that is left for future
work.

This changes enables the expected behavior of perf c2c on a system with SPE where
lines that are shared among multiple cores show up in perf c2c output. 

Signed-off-by: Ali Saidi <alisaidi@amazon.com> 
---
 .../util/arm-spe-decoder/arm-spe-decoder.c    |  1 +
 .../util/arm-spe-decoder/arm-spe-decoder.h    | 12 +++++
 tools/perf/util/arm-spe.c                     | 48 ++++++++++++++-----
 3 files changed, 49 insertions(+), 12 deletions(-)

Comments

James Clark Jan. 24, 2022, 5:24 p.m. UTC | #1
On 21/01/2022 18:24, Ali Saidi wrote:
> When synthesizing data from SPE, augment the type with source information
> for Arm Neoverse cores. The field is IMPLDEF but the Neoverse cores all use
> the same encoding. I can't find encoding information for any other SPE
> implementations to unify their choices with Arm's thus that is left for future
> work.
> 
> This changes enables the expected behavior of perf c2c on a system with SPE where
> lines that are shared among multiple cores show up in perf c2c output. 
> 
> Signed-off-by: Ali Saidi <alisaidi@amazon.com> 
> ---
>  .../util/arm-spe-decoder/arm-spe-decoder.c    |  1 +
>  .../util/arm-spe-decoder/arm-spe-decoder.h    | 12 +++++
>  tools/perf/util/arm-spe.c                     | 48 ++++++++++++++-----
>  3 files changed, 49 insertions(+), 12 deletions(-)
> 
[...]
> +static u64 arm_spe__synth_data_source(const struct arm_spe_record *record, u64 midr)
>  {
>  	union perf_mem_data_src	data_src = { 0 };
> +	bool is_neoverse = is_midr_in_range(midr, neoverse_spe);
>  
>  	if (record->op == ARM_SPE_LD)
>  		data_src.mem_op = PERF_MEM_OP_LOAD;
> @@ -409,19 +418,30 @@ static u64 arm_spe__synth_data_source(const struct arm_spe_record *record)
>  		data_src.mem_op = PERF_MEM_OP_STORE;
>  
>  	if (record->type & (ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS)) {
> -		data_src.mem_lvl = PERF_MEM_LVL_L3;
> +		if (is_neoverse && record->source == ARM_SPE_NV_DRAM) {
> +			data_src.mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT;
> +		} else if (is_neoverse && record->source == ARM_SPE_NV_PEER_CLSTR) {
> +			data_src.mem_snoop = PERF_MEM_SNOOP_HITM;

I'm not following how LLC_ACCESS | LLC_MISS ends up as HITM in this case (ARM_SPE_NV_PEER_CLSTR)?
I thought there was no way to determine a HITM from SPE. Wouldn't one of the other values
like PERF_MEM_SNOOP_MISS be more accurate?

> +			data_src.mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;

This one also adds PERF_MEM_LVL_HIT even though the check of "if (record->type & ARM_SPE_LLC_MISS)"
hasn't happened yet. Maybe some comments would make it a bit clearer, but at the moment it's
not obvious how the result is derived because there are some things that don't add up like
ARM_SPE_LLC_MISS == PERF_MEM_LVL_HIT.

> +		} else {
> +			data_src.mem_lvl = PERF_MEM_LVL_L3;>  
> -		if (record->type & ARM_SPE_LLC_MISS)
> -			data_src.mem_lvl |= PERF_MEM_LVL_MISS;
> -		else
> -			data_src.mem_lvl |= PERF_MEM_LVL_HIT;
> +			if (record->type & ARM_SPE_LLC_MISS)
> +				data_src.mem_lvl |= PERF_MEM_LVL_MISS;
> +			else
> +				data_src.mem_lvl |= PERF_MEM_LVL_HIT;
> +		}
>  	} else if (record->type & (ARM_SPE_L1D_ACCESS | ARM_SPE_L1D_MISS)) {
> -		data_src.mem_lvl = PERF_MEM_LVL_L1;
> +		if (is_neoverse && record->source == ARM_SPE_NV_L2) {
> +			data_src.mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
> +		} else {
> +			data_src.mem_lvl = PERF_MEM_LVL_L1;
>  
> -		if (record->type & ARM_SPE_L1D_MISS)
> -			data_src.mem_lvl |= PERF_MEM_LVL_MISS;
> -		else
> -			data_src.mem_lvl |= PERF_MEM_LVL_HIT;
> +			if (record->type & ARM_SPE_L1D_MISS)
> +				data_src.mem_lvl |= PERF_MEM_LVL_MISS;
> +			else
> +				data_src.mem_lvl |= PERF_MEM_LVL_HIT;
> +		}
>  	}
>  
>  	if (record->type & ARM_SPE_REMOTE_ACCESS)
> @@ -446,7 +466,7 @@ static int arm_spe_sample(struct arm_spe_queue *speq)
>  	u64 data_src;
>  	int err;
>  
> -	data_src = arm_spe__synth_data_source(record);
> +	data_src = arm_spe__synth_data_source(record, spe->midr);
>  
>  	if (spe->sample_flc) {
>  		if (record->type & ARM_SPE_L1D_MISS) {
> @@ -796,6 +816,10 @@ static int arm_spe_process_event(struct perf_session *session,
>  	u64 timestamp;
>  	struct arm_spe *spe = container_of(session->auxtrace,
>  			struct arm_spe, auxtrace);
> +	const char *cpuid = perf_env__cpuid(session->evlist->env);
> +	u64 midr = strtol(cpuid, NULL, 16);
> +
> +	spe->midr = midr;
>  
>  	if (dump_trace)
>  		return 0;
>
Ali Saidi Jan. 24, 2022, 10:53 p.m. UTC | #2
On 1/24/22, 11:24 AM, "James Clark" <james.clark@arm.com> wrote:
>On 21/01/2022 18:24, Ali Saidi wrote:
>> When synthesizing data from SPE, augment the type with source information
>> for Arm Neoverse cores. The field is IMPLDEF but the Neoverse cores all use
>> the same encoding. I can't find encoding information for any other SPE
>> implementations to unify their choices with Arm's thus that is left for future
>> work.
>> 
>> This changes enables the expected behavior of perf c2c on a system with SPE where
>> lines that are shared among multiple cores show up in perf c2c output. 
>> 
>> Signed-off-by: Ali Saidi <alisaidi@amazon.com> 
>> ---
>>  .../util/arm-spe-decoder/arm-spe-decoder.c    |  1 +
>>  .../util/arm-spe-decoder/arm-spe-decoder.h    | 12 +++++
>>  tools/perf/util/arm-spe.c                     | 48 ++++++++++++++-----
>>  3 files changed, 49 insertions(+), 12 deletions(-)
>> 
>[...]
>> +static u64 arm_spe__synth_data_source(const struct arm_spe_record *record, u64 midr)
>>  {
>>  	union perf_mem_data_src	data_src = { 0 };
>> +	bool is_neoverse = is_midr_in_range(midr, neoverse_spe);
>>  
>>  	if (record->op == ARM_SPE_LD)
>>  		data_src.mem_op = PERF_MEM_OP_LOAD;
>> @@ -409,19 +418,30 @@ static u64 arm_spe__synth_data_source(const struct arm_spe_record *record)
>>  		data_src.mem_op = PERF_MEM_OP_STORE;
>>  
>>  	if (record->type & (ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS)) {
>> -		data_src.mem_lvl = PERF_MEM_LVL_L3;
>> +		if (is_neoverse && record->source == ARM_SPE_NV_DRAM) {
>> +			data_src.mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT;
>> +		} else if (is_neoverse && record->source == ARM_SPE_NV_PEER_CLSTR) {
>> +			data_src.mem_snoop = PERF_MEM_SNOOP_HITM;
>
>I'm not following how LLC_ACCESS | LLC_MISS ends up as HITM in this case (ARM_SPE_NV_PEER_CLSTR)?
>I thought there was no way to determine a HITM from SPE. Wouldn't one of the other values
>like PERF_MEM_SNOOP_MISS be more accurate?

Thanks for taking a look James.

I'd really like someone familiar with perf c2c output to also end up getting
similar output when running on an Arm system with SPE. There are obviously large
micro-architectural differences that have been abstracted away by the data_src
abstraction but fundamentally my understanding of x86 HITM is that the line
was found in the snoop filter of the LLC as being owned by another core and
therefore the request needs to go to another core to get the line.  I'm not
100% sure if on x86 it's really guaranteed to be dirty or not and it's not
always going to be dirty in a Neoverse system, but since the SPE source
indicates it was sourced from another core it is a core-2-core transfer of a
line which is currently owned by another cpu core and that is the interesting
data point that would be used to drive optimization and elimination of frequent
core-2-core transfers (true or false sharing).

>> +			data_src.mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
>
>This one also adds PERF_MEM_LVL_HIT even though the check of "if (record->type & ARM_SPE_LLC_MISS)"
>hasn't happened yet. Maybe some comments would make it a bit clearer, but at the moment it's
>not obvious how the result is derived because there are some things that don't add up like
>ARM_SPE_LLC_MISS == PERF_MEM_LVL_HIT.

Assuming the above is correct, my reading of the existing code that creates the
c2c output is that when an access is marked as an LLC hit, that doesn't
necessarily mean that the data was present in the LLC. I don't see how it could
given that LLC_HIT + HITM means the line was dirty in another CPUs cache, and so
LLC_HIT + HITM seems to mean that it was a hit in the LLC snoop filter and
required a different core to provide the line. This and the above certainly
deserve a comment as to why the miss is being attributed in this way if it's
otherwise acceptable.

Thanks,
Ali
Leo Yan Feb. 2, 2022, 3:01 p.m. UTC | #3
Hi Ali, James,

[ + Al ]

On Mon, Jan 24, 2022 at 10:53:33PM +0000, Ali Saidi wrote:

[...]

> >> +static u64 arm_spe__synth_data_source(const struct arm_spe_record *record, u64 midr)
> >>  {
> >>  	union perf_mem_data_src	data_src = { 0 };
> >> +	bool is_neoverse = is_midr_in_range(midr, neoverse_spe);
> >>  
> >>  	if (record->op == ARM_SPE_LD)
> >>  		data_src.mem_op = PERF_MEM_OP_LOAD;
> >> @@ -409,19 +418,30 @@ static u64 arm_spe__synth_data_source(const struct arm_spe_record *record)
> >>  		data_src.mem_op = PERF_MEM_OP_STORE;
> >>  
> >>  	if (record->type & (ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS)) {
> >> -		data_src.mem_lvl = PERF_MEM_LVL_L3;
> >> +		if (is_neoverse && record->source == ARM_SPE_NV_DRAM) {
> >> +			data_src.mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT;
> >> +		} else if (is_neoverse && record->source == ARM_SPE_NV_PEER_CLSTR) {
> >> +			data_src.mem_snoop = PERF_MEM_SNOOP_HITM;
> >
> >I'm not following how LLC_ACCESS | LLC_MISS ends up as HITM in this case (ARM_SPE_NV_PEER_CLSTR)?
> >I thought there was no way to determine a HITM from SPE. Wouldn't one of the other values
> >like PERF_MEM_SNOOP_MISS be more accurate?
> 
> Thanks for taking a look James.
> 
> I'd really like someone familiar with perf c2c output to also end up getting
> similar output when running on an Arm system with SPE. There are obviously large
> micro-architectural differences that have been abstracted away by the data_src
> abstraction but fundamentally my understanding of x86 HITM is that the line
> was found in the snoop filter of the LLC as being owned by another core and
> therefore the request needs to go to another core to get the line.  I'm not
> 100% sure if on x86 it's really guaranteed to be dirty or not and it's not
> always going to be dirty in a Neoverse system, but since the SPE source
> indicates it was sourced from another core it is a core-2-core transfer of a
> line which is currently owned by another cpu core and that is the interesting
> data point that would be used to drive optimization and elimination of frequent
> core-2-core transfers (true or false sharing).

Though I don't know the implementation for the hardware conherency
protocols, here I have the same understanding with Ali.

For x86 arch, it uses HITM to indicate the cache line is "modified"
state; on Arm64 Neoverse platforms, there have two data source values
can tell us if the cache line is "modified" state:
ARM_SPE_NV_PEER_CLSTR and ARM_SPE_NV_PEER_CORE.  The snooping can
happen either within the cluster or cross clusters.

> >> +			data_src.mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
> >
> >This one also adds PERF_MEM_LVL_HIT even though the check of "if (record->type & ARM_SPE_LLC_MISS)"
> >hasn't happened yet. Maybe some comments would make it a bit clearer, but at the moment it's
> >not obvious how the result is derived because there are some things that don't add up like
> >ARM_SPE_LLC_MISS == PERF_MEM_LVL_HIT.
> 
> Assuming the above is correct, my reading of the existing code that creates the
> c2c output is that when an access is marked as an LLC hit, that doesn't
> necessarily mean that the data was present in the LLC. I don't see how it could
> given that LLC_HIT + HITM means the line was dirty in another CPUs cache, and so
> LLC_HIT + HITM seems to mean that it was a hit in the LLC snoop filter and
> required a different core to provide the line. This and the above certainly
> deserve a comment as to why the miss is being attributed in this way if it's
> otherwise acceptable.

As James pointed out, this might introduce confusion.  I am wanderding
if we can extract two functions for synthesizing the data source, one is
for Neoverse platform and another is for generic purpose (which
without data source packets), below code is to demonstrate the basic
idea.

Please note, I don't verify this code piece since I have no Neoverse
platform.  Do you mind to share a perf data file which contains data
source packets so I can study more details for its format?

static void arm_spe__synth_data_source_neoverse(const struct arm_spe_record *record,
                                                union perf_mem_data_src *data_src)
{
        switch (record->source) {
        case ARM_SPE_NV_L1D:
                data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
                break;
        case ARM_SPE_NV_L2:
                data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
                break;
        case ARM_SPE_NV_PEER_CORE:
                data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
                data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
                break;
        case ARM_SPE_NV_LCL_CLSTR:
                /* To be decided */
                break;
        /* System cache is L3 cache */
        case ARM_SPE_NV_SYS_CACHE:
                data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
                break;
        /* Snooping between L2 caches crossing two clusters */
        case ARM_SPE_NV_PEER_CLSTR:
                data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
                data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
                break;
        case ARM_SPE_NV_REMOTE:
                data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1 | PERF_MEM_LVL_HIT;
                break;
        case ARM_SPE_NV_DRAM:
                data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT;
                break;
        default:
                break;
        }

        return;
}

static void arm_spe__synth_data_source_generic(const struct arm_spe_record *record,
                                               union perf_mem_data_src *data_src)
{
       if (record->type & (ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS)) {
               data_src->mem_lvl = PERF_MEM_LVL_L3;

               if (record->type & ARM_SPE_LLC_MISS)
                       data_src->mem_lvl |= PERF_MEM_LVL_MISS;
               else
                       data_src->mem_lvl |= PERF_MEM_LVL_HIT;
       } else if (record->type & (ARM_SPE_L1D_ACCESS | ARM_SPE_L1D_MISS)) {
               data_src->mem_lvl = PERF_MEM_LVL_L1;

               if (record->type & ARM_SPE_L1D_MISS)
                       data_src->mem_lvl |= PERF_MEM_LVL_MISS;
               else
                       data_src->mem_lvl |= PERF_MEM_LVL_HIT;
       }

       if (record->type & ARM_SPE_REMOTE_ACCESS)
               data_src->mem_lvl |= PERF_MEM_LVL_REM_CCE1;

       return;
}

static u64 arm_spe__synth_data_source(const struct arm_spe_record *record, u64 midr)
{
        union perf_mem_data_src data_src = { 0 };
        bool is_neoverse = is_midr_in_range(midr, neoverse_spe);

        if (record->op == ARM_SPE_LD)
                data_src.mem_op = PERF_MEM_OP_LOAD;
        else
                data_src.mem_op = PERF_MEM_OP_STORE;

        if (is_neoverse)
                arm_spe__synth_data_source_neoverse(record, &data_src);
        else
                arm_spe__synth_data_source_generic(record, &data_src);

        if (record->type & (ARM_SPE_TLB_ACCESS | ARM_SPE_TLB_MISS)) {
                data_src.mem_dtlb = PERF_MEM_TLB_WK;

                if (record->type & ARM_SPE_TLB_MISS)
                        data_src.mem_dtlb |= PERF_MEM_TLB_MISS;
                else
                        data_src.mem_dtlb |= PERF_MEM_TLB_HIT;
        }

        return data_src.val;
}

Thanks,
Leo
Ali Saidi Feb. 2, 2022, 7:51 p.m. UTC | #4
Hi Leo,

>Hi Ali, James,
>
> [...]
>> 
>> I'd really like someone familiar with perf c2c output to also end up getting
>> similar output when running on an Arm system with SPE. There are obviously large
>> micro-architectural differences that have been abstracted away by the data_src
>> abstraction but fundamentally my understanding of x86 HITM is that the line
>> was found in the snoop filter of the LLC as being owned by another core and
>> therefore the request needs to go to another core to get the line.  I'm not
>> 100% sure if on x86 it's really guaranteed to be dirty or not and it's not
>> always going to be dirty in a Neoverse system, but since the SPE source
>> indicates it was sourced from another core it is a core-2-core transfer of a
>> line which is currently owned by another cpu core and that is the interesting
>> data point that would be used to drive optimization and elimination of frequent
>> core-2-core transfers (true or false sharing).
>
>Though I don't know the implementation for the hardware conherency
>protocols, here I have the same understanding with Ali.
>
>For x86 arch, it uses HITM to indicate the cache line is "modified"
>state; on Arm64 Neoverse platforms, there have two data source values
>can tell us if the cache line is "modified" state:
>ARM_SPE_NV_PEER_CLSTR and ARM_SPE_NV_PEER_CORE.  The snooping can
>happen either within the cluster or cross clusters.

Yes, although it depends on the system topology if there are clusters.

>> >> +			data_src.mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
>> >
>> >This one also adds PERF_MEM_LVL_HIT even though the check of "if (record->type & ARM_SPE_LLC_MISS)"
>> >hasn't happened yet. Maybe some comments would make it a bit clearer, but at the moment it's
>> >not obvious how the result is derived because there are some things that don't add up like
>> >ARM_SPE_LLC_MISS == PERF_MEM_LVL_HIT.
>> 
>> Assuming the above is correct, my reading of the existing code that creates the
>> c2c output is that when an access is marked as an LLC hit, that doesn't
>> necessarily mean that the data was present in the LLC. I don't see how it could
>> given that LLC_HIT + HITM means the line was dirty in another CPUs cache, and so
>> LLC_HIT + HITM seems to mean that it was a hit in the LLC snoop filter and
>> required a different core to provide the line. This and the above certainly
>> deserve a comment as to why the miss is being attributed in this way if it's
>> otherwise acceptable.
>
>As James pointed out, this might introduce confusion.  I am wanderding
>if we can extract two functions for synthesizing the data source, one is
>for Neoverse platform and another is for generic purpose (which
>without data source packets), below code is to demonstrate the basic
>idea.

The code below is cleaner, and I'm happy to rework the patches in this way, but
I think the question still remains about unifying behavior of the tool. If we
mark something with a data source of ARM_SPE_NV_PEER_CORE as at L1 hit + HITM
certainly c2c won't show the correct thing today, but i think it also hides the
intent. The line in question missed the L1, L2, and got to the LLC where we did
find a record that it was in another cores cache (L1 or L2). Looking at the way
that c2c works today, it seems like marking this as a hit in the LLC snoop
filter is the best way to unify behaviors among architectures?

I'll send you a perf.data file OOB.

Thanks,
Ali
Leo Yan Feb. 3, 2022, 9:19 a.m. UTC | #5
On Wed, Feb 02, 2022 at 07:51:15PM +0000, Ali Saidi wrote:

[...]

> >> >> +			data_src.mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
> >> >
> >> >This one also adds PERF_MEM_LVL_HIT even though the check of "if (record->type & ARM_SPE_LLC_MISS)"
> >> >hasn't happened yet. Maybe some comments would make it a bit clearer, but at the moment it's
> >> >not obvious how the result is derived because there are some things that don't add up like
> >> >ARM_SPE_LLC_MISS == PERF_MEM_LVL_HIT.
> >> 
> >> Assuming the above is correct, my reading of the existing code that creates the
> >> c2c output is that when an access is marked as an LLC hit, that doesn't
> >> necessarily mean that the data was present in the LLC. I don't see how it could
> >> given that LLC_HIT + HITM means the line was dirty in another CPUs cache, and so
> >> LLC_HIT + HITM seems to mean that it was a hit in the LLC snoop filter and
> >> required a different core to provide the line. This and the above certainly
> >> deserve a comment as to why the miss is being attributed in this way if it's
> >> otherwise acceptable.
> >
> >As James pointed out, this might introduce confusion.  I am wanderding
> >if we can extract two functions for synthesizing the data source, one is
> >for Neoverse platform and another is for generic purpose (which
> >without data source packets), below code is to demonstrate the basic
> >idea.
> 
> The code below is cleaner, and I'm happy to rework the patches in this way, but
> I think the question still remains about unifying behavior of the tool. If we
> mark something with a data source of ARM_SPE_NV_PEER_CORE as at L1 hit + HITM
> certainly c2c won't show the correct thing today, but i think it also hides the
> intent. The line in question missed the L1, L2, and got to the LLC where we did
> find a record that it was in another cores cache (L1 or L2). Looking at the way
> that c2c works today, it seems like marking this as a hit in the LLC snoop
> filter is the best way to unify behaviors among architectures?

Thanks a lot for pointing out this.  I looked into the code and
compared the memory trace data from x86, I found the HITM tag is always
sticking to LLC from x86's memory events.  This would be the main reason
why current code in perf is only support HITM for LLC.

I don't think it's a good way to always mark LLC snoop, even it's a
snooping operation in L1 or L2 cache on Arm64 platforms; this would
introduce confusion for users when using Arm SPE for profiling.

Alternatively, we can support HITM tag for L1/L2 cache levels in perf,
this can allow us to match memory topology on Arm64 arch, and it should
not introduce any regression on x86 arch.

Could you confirm if below code can fix the issue or not?

From 56e70a9ca974ad062788cfccd2ae9ddafa13d3ae Mon Sep 17 00:00:00 2001
From: Leo Yan <leo.yan@linaro.org>
Date: Thu, 3 Feb 2022 16:53:34 +0800
Subject: [PATCH] perf mem: Support HITM statistics for L1/L2 caches

Current code only support HITM statistics for last level cache (LLC) and
remote node's cache.  This works for x86 architecture since the HITM tag
is associated with LLC but not with L1/L2 cache.

On Arm64 architectures, due to the different memory hierarchy and
topology, the snooping can happen on L1 or L2 cache line, and thus it's
possible that coherency protocol fetches data from peer core or
cluster's L1/L2 cache.  For this reason, HITM tag is not necessarily
bound to LLC anymore.

For a general solution, this patch extends to set HITM tag for L1 and L2
cache, thus this can allow perf c2c tool to work properly for Arm64
architecture.  On the other hand, since x86 architecture doesn't set
HITM tag for L1/L2 cache, thus this patch should not introduce any
functionality change for x86 platforms.

Signed-off-by: Leo Yan <leo.yan@linaro.org>
---
 tools/perf/util/mem-events.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c
index ed0ab838bcc5..7a0ab3d26843 100644
--- a/tools/perf/util/mem-events.c
+++ b/tools/perf/util/mem-events.c
@@ -527,8 +527,18 @@ do {				\
 			if (lvl & P(LVL, UNC)) stats->ld_uncache++;
 			if (lvl & P(LVL, IO))  stats->ld_io++;
 			if (lvl & P(LVL, LFB)) stats->ld_fbhit++;
-			if (lvl & P(LVL, L1 )) stats->ld_l1hit++;
-			if (lvl & P(LVL, L2 )) stats->ld_l2hit++;
+			if (lvl & P(LVL, L1 )) {
+				if (snoop & P(SNOOP, HITM))
+					HITM_INC(lcl_hitm);
+				else
+					stats->ld_l1hit++;
+			}
+			if (lvl & P(LVL, L2 )) {
+				if (snoop & P(SNOOP, HITM))
+					HITM_INC(lcl_hitm);
+				else
+					stats->ld_l2hit++;
+			}
 			if (lvl & P(LVL, L3 )) {
 				if (snoop & P(SNOOP, HITM))
 					HITM_INC(lcl_hitm);

> I'll send you a perf.data file OOB.

Very appreciate!  I will look into it and will let you know
if I have any new finding.

Thanks,
Leo
Ali Saidi Feb. 5, 2022, 12:07 a.m. UTC | #6
Hi Leo,

On 2/3/22, 3:20 AM, "Leo Yan" <leo.yan@linaro.org> wrote:
>[...]
>> >> >> +			data_src.mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
>> >> >
>> >> >This one also adds PERF_MEM_LVL_HIT even though the check of "if (record->type & ARM_SPE_LLC_MISS)"
>> >> >hasn't happened yet. Maybe some comments would make it a bit clearer, but at the moment it's
>> >> >not obvious how the result is derived because there are some things that don't add up like
>> >> >ARM_SPE_LLC_MISS == PERF_MEM_LVL_HIT.
>> >> 
>> >> Assuming the above is correct, my reading of the existing code that creates the
>> >> c2c output is that when an access is marked as an LLC hit, that doesn't
>> >> necessarily mean that the data was present in the LLC. I don't see how it could
>> >> given that LLC_HIT + HITM means the line was dirty in another CPUs cache, and so
>> >> LLC_HIT + HITM seems to mean that it was a hit in the LLC snoop filter and
>> >> required a different core to provide the line. This and the above certainly
>> >> deserve a comment as to why the miss is being attributed in this way if it's
>> >> otherwise acceptable.
>> >
>> >As James pointed out, this might introduce confusion.  I am wanderding
>> >if we can extract two functions for synthesizing the data source, one is
>> >for Neoverse platform and another is for generic purpose (which
>> >without data source packets), below code is to demonstrate the basic
>> >idea.
>> 
>> The code below is cleaner, and I'm happy to rework the patches in this way, but
>> I think the question still remains about unifying behavior of the tool. If we
>> mark something with a data source of ARM_SPE_NV_PEER_CORE as at L1 hit + HITM
>> certainly c2c won't show the correct thing today, but i think it also hides the
>> intent. The line in question missed the L1, L2, and got to the LLC where we did
>> find a record that it was in another cores cache (L1 or L2). Looking at the way
>> that c2c works today, it seems like marking this as a hit in the LLC snoop
>> filter is the best way to unify behaviors among architectures?
>
>Thanks a lot for pointing out this.  I looked into the code and
>compared the memory trace data from x86, I found the HITM tag is always
>sticking to LLC from x86's memory events.  This would be the main reason
>why current code in perf is only support HITM for LLC.
>
>I don't think it's a good way to always mark LLC snoop, even it's a
>snooping operation in L1 or L2 cache on Arm64 platforms; this would
>introduce confusion for users when using Arm SPE for profiling.
>
>Alternatively, we can support HITM tag for L1/L2 cache levels in perf,
>this can allow us to match memory topology on Arm64 arch, and it should
>not introduce any regression on x86 arch.
>
>Could you confirm if below code can fix the issue or not?

Yes, that should do it. Want me to repsin this with the changes we discussed?

Ali
Leo Yan Feb. 5, 2022, 2:18 a.m. UTC | #7
Hi Ali,

On Sat, Feb 05, 2022 at 12:07:19AM +0000, Ali Saidi wrote:

[...]

> >Alternatively, we can support HITM tag for L1/L2 cache levels in perf,
> >this can allow us to match memory topology on Arm64 arch, and it should
> >not introduce any regression on x86 arch.
> >
> >Could you confirm if below code can fix the issue or not?
> 
> Yes, that should do it. Want me to repsin this with the changes we discussed?

Yes, please go ahead and thanks a lot for your effort.

Leo
diff mbox series

Patch

diff --git a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
index 5e390a1a79ab..091987dd3966 100644
--- a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
+++ b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c
@@ -220,6 +220,7 @@  static int arm_spe_read_record(struct arm_spe_decoder *decoder)
 
 			break;
 		case ARM_SPE_DATA_SOURCE:
+			decoder->record.source = payload;
 			break;
 		case ARM_SPE_BAD:
 			break;
diff --git a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
index 69b31084d6be..1ecf4ee99415 100644
--- a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
+++ b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h
@@ -29,10 +29,22 @@  enum arm_spe_op_type {
 	ARM_SPE_ST		= 1 << 1,
 };
 
+enum arm_spe_neoverse_data_source {
+	ARM_SPE_NV_L1D        = 0x0,
+	ARM_SPE_NV_L2         = 0x8,
+	ARM_SPE_NV_PEER_CORE  = 0x9,
+	ARM_SPE_NV_LCL_CLSTR  = 0xa,
+	ARM_SPE_NV_SYS_CACHE  = 0xb,
+	ARM_SPE_NV_PEER_CLSTR = 0xc,
+	ARM_SPE_NV_REMOTE     = 0xd,
+	ARM_SPE_NV_DRAM       = 0xe,
+};
+
 struct arm_spe_record {
 	enum arm_spe_sample_type type;
 	int err;
 	u32 op;
+	u16 source;
 	u32 latency;
 	u64 from_ip;
 	u64 to_ip;
diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index d2b64e3f588b..d025af13f5e4 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -34,6 +34,7 @@ 
 #include "arm-spe-decoder/arm-spe-decoder.h"
 #include "arm-spe-decoder/arm-spe-pkt-decoder.h"
 
+#include <../../../arch/arm64/include/asm/cputype.h>
 #define MAX_TIMESTAMP (~0ULL)
 
 struct arm_spe {
@@ -45,6 +46,7 @@  struct arm_spe {
 	struct perf_session		*session;
 	struct machine			*machine;
 	u32				pmu_type;
+	u64				midr;
 
 	struct perf_tsc_conversion	tc;
 
@@ -399,9 +401,16 @@  static bool arm_spe__is_memory_event(enum arm_spe_sample_type type)
 	return false;
 }
 
-static u64 arm_spe__synth_data_source(const struct arm_spe_record *record)
+static const struct midr_range neoverse_spe[] = {
+	MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1),
+	MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2),
+	{},
+};
+
+static u64 arm_spe__synth_data_source(const struct arm_spe_record *record, u64 midr)
 {
 	union perf_mem_data_src	data_src = { 0 };
+	bool is_neoverse = is_midr_in_range(midr, neoverse_spe);
 
 	if (record->op == ARM_SPE_LD)
 		data_src.mem_op = PERF_MEM_OP_LOAD;
@@ -409,19 +418,30 @@  static u64 arm_spe__synth_data_source(const struct arm_spe_record *record)
 		data_src.mem_op = PERF_MEM_OP_STORE;
 
 	if (record->type & (ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS)) {
-		data_src.mem_lvl = PERF_MEM_LVL_L3;
+		if (is_neoverse && record->source == ARM_SPE_NV_DRAM) {
+			data_src.mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT;
+		} else if (is_neoverse && record->source == ARM_SPE_NV_PEER_CLSTR) {
+			data_src.mem_snoop = PERF_MEM_SNOOP_HITM;
+			data_src.mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
+		} else {
+			data_src.mem_lvl = PERF_MEM_LVL_L3;
 
-		if (record->type & ARM_SPE_LLC_MISS)
-			data_src.mem_lvl |= PERF_MEM_LVL_MISS;
-		else
-			data_src.mem_lvl |= PERF_MEM_LVL_HIT;
+			if (record->type & ARM_SPE_LLC_MISS)
+				data_src.mem_lvl |= PERF_MEM_LVL_MISS;
+			else
+				data_src.mem_lvl |= PERF_MEM_LVL_HIT;
+		}
 	} else if (record->type & (ARM_SPE_L1D_ACCESS | ARM_SPE_L1D_MISS)) {
-		data_src.mem_lvl = PERF_MEM_LVL_L1;
+		if (is_neoverse && record->source == ARM_SPE_NV_L2) {
+			data_src.mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
+		} else {
+			data_src.mem_lvl = PERF_MEM_LVL_L1;
 
-		if (record->type & ARM_SPE_L1D_MISS)
-			data_src.mem_lvl |= PERF_MEM_LVL_MISS;
-		else
-			data_src.mem_lvl |= PERF_MEM_LVL_HIT;
+			if (record->type & ARM_SPE_L1D_MISS)
+				data_src.mem_lvl |= PERF_MEM_LVL_MISS;
+			else
+				data_src.mem_lvl |= PERF_MEM_LVL_HIT;
+		}
 	}
 
 	if (record->type & ARM_SPE_REMOTE_ACCESS)
@@ -446,7 +466,7 @@  static int arm_spe_sample(struct arm_spe_queue *speq)
 	u64 data_src;
 	int err;
 
-	data_src = arm_spe__synth_data_source(record);
+	data_src = arm_spe__synth_data_source(record, spe->midr);
 
 	if (spe->sample_flc) {
 		if (record->type & ARM_SPE_L1D_MISS) {
@@ -796,6 +816,10 @@  static int arm_spe_process_event(struct perf_session *session,
 	u64 timestamp;
 	struct arm_spe *spe = container_of(session->auxtrace,
 			struct arm_spe, auxtrace);
+	const char *cpuid = perf_env__cpuid(session->evlist->env);
+	u64 midr = strtol(cpuid, NULL, 16);
+
+	spe->midr = midr;
 
 	if (dump_trace)
 		return 0;