diff mbox series

[v3,03/48] perf stat: Correct aggregation CPU map

Message ID 20211230072030.302559-5-irogers@google.com (mailing list archive)
State New, archived
Headers show
Series Refactor perf cpumap | expand

Commit Message

Ian Rogers Dec. 30, 2021, 7:19 a.m. UTC
Switch the perf_cpu_map in aggr_update_shadow from
the evlist to the counter's cpu map, so the index is appropriate. This
addresses a problem where uncore counts, with a cpumap like:
$ cat /sys/devices/uncore_imc_0/cpumask
0,18
Don't aggregate counts in CPUs based on the index of those values in the
cpumap (0 and 1) but on the actual CPU (0 and 18). Thereby correcting
metric calculations in per-socket mode for counters without a full
cpumask.

On a SkylakeX with a tweaked DRAM_BW_Use metric, to remove unnecessary
scaling, this gives:

Before:
$ /perf stat --per-socket -M DRAM_BW_Use -I 1000
     1.001102293 S0        1              27.01 MiB  uncore_imc/cas_count_write/ #   103.00 DRAM_BW_Use
     1.001102293 S0        1              30.22 MiB  uncore_imc/cas_count_read/
     1.001102293 S0        1      1,001,102,293 ns   duration_time
     1.001102293 S1        1              20.10 MiB  uncore_imc/cas_count_write/ #     0.00 DRAM_BW_Use
     1.001102293 S1        1              32.74 MiB  uncore_imc/cas_count_read/
     1.001102293 S1        0      <not counted> ns   duration_time
     2.003517973 S0        1              83.04 MiB  uncore_imc/cas_count_write/ #   920.00 DRAM_BW_Use
     2.003517973 S0        1             145.95 MiB  uncore_imc/cas_count_read/
     2.003517973 S0        1      1,002,415,680 ns   duration_time
     2.003517973 S1        1             302.45 MiB  uncore_imc/cas_count_write/ #     0.00 DRAM_BW_Use
     2.003517973 S1        1             290.99 MiB  uncore_imc/cas_count_read/
     2.003517973 S1        0      <not counted> ns   duration_time

After:
$ perf stat --per-socket -M DRAM_BW_Use -I 1000
     1.001080840 S0        1              24.96 MiB  uncore_imc/cas_count_write/ #    54.00 DRAM_BW_Use
     1.001080840 S0        1              33.64 MiB  uncore_imc/cas_count_read/
     1.001080840 S0        1      1,001,080,840 ns   duration_time
     1.001080840 S1        1              42.43 MiB  uncore_imc/cas_count_write/ #    84.00 DRAM_BW_Use
     1.001080840 S1        1              47.05 MiB  uncore_imc/cas_count_read/
     1.001080840 S1        0      <not counted> ns   duration_time

Signed-off-by: Ian Rogers <irogers@google.com>
---
 tools/perf/util/stat-display.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Comments

Jiri Olsa Jan. 4, 2022, 2:19 p.m. UTC | #1
On Wed, Dec 29, 2021 at 11:19:45PM -0800, Ian Rogers wrote:
> Switch the perf_cpu_map in aggr_update_shadow from
> the evlist to the counter's cpu map, so the index is appropriate. This
> addresses a problem where uncore counts, with a cpumap like:
> $ cat /sys/devices/uncore_imc_0/cpumask
> 0,18
> Don't aggregate counts in CPUs based on the index of those values in the
> cpumap (0 and 1) but on the actual CPU (0 and 18). Thereby correcting
> metric calculations in per-socket mode for counters without a full
> cpumask.
> 
> On a SkylakeX with a tweaked DRAM_BW_Use metric, to remove unnecessary
> scaling, this gives:
> 
> Before:
> $ /perf stat --per-socket -M DRAM_BW_Use -I 1000
>      1.001102293 S0        1              27.01 MiB  uncore_imc/cas_count_write/ #   103.00 DRAM_BW_Use
>      1.001102293 S0        1              30.22 MiB  uncore_imc/cas_count_read/
>      1.001102293 S0        1      1,001,102,293 ns   duration_time
>      1.001102293 S1        1              20.10 MiB  uncore_imc/cas_count_write/ #     0.00 DRAM_BW_Use
>      1.001102293 S1        1              32.74 MiB  uncore_imc/cas_count_read/
>      1.001102293 S1        0      <not counted> ns   duration_time
>      2.003517973 S0        1              83.04 MiB  uncore_imc/cas_count_write/ #   920.00 DRAM_BW_Use
>      2.003517973 S0        1             145.95 MiB  uncore_imc/cas_count_read/
>      2.003517973 S0        1      1,002,415,680 ns   duration_time
>      2.003517973 S1        1             302.45 MiB  uncore_imc/cas_count_write/ #     0.00 DRAM_BW_Use
>      2.003517973 S1        1             290.99 MiB  uncore_imc/cas_count_read/
>      2.003517973 S1        0      <not counted> ns   duration_time
> 
> After:
> $ perf stat --per-socket -M DRAM_BW_Use -I 1000
>      1.001080840 S0        1              24.96 MiB  uncore_imc/cas_count_write/ #    54.00 DRAM_BW_Use
>      1.001080840 S0        1              33.64 MiB  uncore_imc/cas_count_read/
>      1.001080840 S0        1      1,001,080,840 ns   duration_time
>      1.001080840 S1        1              42.43 MiB  uncore_imc/cas_count_write/ #    84.00 DRAM_BW_Use
>      1.001080840 S1        1              47.05 MiB  uncore_imc/cas_count_read/
>      1.001080840 S1        0      <not counted> ns   duration_time
> 
> Signed-off-by: Ian Rogers <irogers@google.com>
> ---
>  tools/perf/util/stat-display.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c
> index 588601000f3f..b0fa81ffce61 100644
> --- a/tools/perf/util/stat-display.c
> +++ b/tools/perf/util/stat-display.c
> @@ -526,7 +526,7 @@ static void aggr_update_shadow(struct perf_stat_config *config,
>  		evlist__for_each_entry(evlist, counter) {
>  			val = 0;
>  			for (cpu = 0; cpu < evsel__nr_cpus(counter); cpu++) {
> -				s2 = config->aggr_get_id(config, evlist->core.cpus, cpu);
> +				s2 = config->aggr_get_id(config, evsel__cpus(counter), cpu);
>  				if (!cpu_map__compare_aggr_cpu_id(s2, id))
>  					continue;
>  				val += perf_counts(counter->counts, cpu, 0)->val;

makes sense, there's another instance of this in first_shadow_cpu

thanks,
jirka

> -- 
> 2.34.1.448.ga2b2bfdf31-goog
>
diff mbox series

Patch

diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c
index 588601000f3f..b0fa81ffce61 100644
--- a/tools/perf/util/stat-display.c
+++ b/tools/perf/util/stat-display.c
@@ -526,7 +526,7 @@  static void aggr_update_shadow(struct perf_stat_config *config,
 		evlist__for_each_entry(evlist, counter) {
 			val = 0;
 			for (cpu = 0; cpu < evsel__nr_cpus(counter); cpu++) {
-				s2 = config->aggr_get_id(config, evlist->core.cpus, cpu);
+				s2 = config->aggr_get_id(config, evsel__cpus(counter), cpu);
 				if (!cpu_map__compare_aggr_cpu_id(s2, id))
 					continue;
 				val += perf_counts(counter->counts, cpu, 0)->val;