[v3,10/10] perf/x86/rapl: Add per-core energy counter support for AMD CPUs

Message ID 20240624055907.7720-11-Dhananjay.Ugwekar@amd.com (mailing list archive)
State Superseded
Series Add per-core RAPL energy counter support for AMD CPUs

Commit Message

Dhananjay Ugwekar June 24, 2024, 5:59 a.m. UTC
Add a new "power_per_core" PMU and "energy-per-core" event for
monitoring energy consumption by each core. The existing energy-cores
event aggregates the energy consumption at the package level.
This new event aligns with AMD's per-core energy counters.

Tested the package level and core level PMU counters with workloads
pinned to different CPUs.

Results with workload pinned to CPU 1 in core 1 on an AMD Zen4 Genoa
machine:

$ perf stat -a --per-core -e power_per_core/energy-per-core/ sleep 1

 Performance counter stats for 'system wide':

S0-D0-C0         1          0.02 Joules power_per_core/energy-per-core/
S0-D0-C1         1          5.72 Joules power_per_core/energy-per-core/
S0-D0-C2         1          0.02 Joules power_per_core/energy-per-core/
S0-D0-C3         1          0.02 Joules power_per_core/energy-per-core/
S0-D0-C4         1          0.02 Joules power_per_core/energy-per-core/
S0-D0-C5         1          0.02 Joules power_per_core/energy-per-core/
S0-D0-C6         1          0.02 Joules power_per_core/energy-per-core/
S0-D0-C7         1          0.02 Joules power_per_core/energy-per-core/
S0-D0-C8         1          0.02 Joules power_per_core/energy-per-core/
S0-D0-C9         1          0.02 Joules power_per_core/energy-per-core/
S0-D0-C10        1          0.02 Joules power_per_core/energy-per-core/

Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
---
 arch/x86/events/rapl.c | 189 +++++++++++++++++++++++++++++++++++------
 1 file changed, 164 insertions(+), 25 deletions(-)

Comments

Zhang Rui June 26, 2024, 3:18 p.m. UTC | #1
> @@ -131,8 +146,10 @@ enum rapl_unit_quirk {
>  };
>  
>  struct rapl_model {
> -       struct perf_msr *rapl_msrs;
> +       struct perf_msr *rapl_pkg_msrs;

IMO, this should be part of patch 8/10.

[...]

> @@ -685,6 +774,13 @@ static void __init rapl_advertise(void)
>                                 rapl_pkg_domain_names[i], rapl_hw_unit[i]);
>                 }
>         }
> +
> +       for (i = 0; i < NR_RAPL_CORE_DOMAINS; i++) {
> +               if (rapl_core_cntr_mask & (1 << i)) {
> +                       pr_info("hw unit of domain %s 2^-%d Joules\n",
> +                               rapl_core_domain_names[i], rapl_hw_unit[i]);

rapl_hw_unit[] is for package pmu only and
rapl_hw_unit[0] is rapl_hw_unit[PERF_RAPL_PP0] rather than
rapl_hw_unit[PERF_RAPL_PER_CORE]

you cannot use rapl_hw_unit[i] to represent per-core rapl domain unit.
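
To illustrate (a sketch based on the enums in this patch; both enums start
at zero, so index 0 names a different domain in each PMU, while
rapl_hw_unit[] is sized and indexed by the pkg enum only):

enum perf_rapl_pkg_events {
	PERF_RAPL_PP0 = 0,		/* pp0-core */
	/* ... */
};

enum perf_rapl_core_events {
	PERF_RAPL_PER_CORE = 0,		/* per-core */
	/* ... */
};

/* 1/2^hw_unit Joule, indexed by the pkg enum only */
static int rapl_hw_unit[NR_RAPL_PKG_DOMAINS] __read_mostly;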

> +               }
> +       }
>  }
>  
>  static void cleanup_rapl_pmus(struct rapl_pmus *rapl_pmus)
> @@ -705,15 +801,16 @@ static const struct attribute_group *rapl_attr_update[] = {
>         NULL,
>  };
>  
> -static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr)
> +static const struct attribute_group *rapl_per_core_attr_update[] = {
> +       &rapl_events_per_core_group,
> +};
> +
> +static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr, int nr_rapl_pmu,
> +                                const struct attribute_group **rapl_attr_groups,
> +                                const struct attribute_group **rapl_attr_update)
>  {
>         struct rapl_pmus *rapl_pmus;
>  
> -       int nr_rapl_pmu = topology_max_packages() * topology_max_dies_per_package();
> -
> -       if (rapl_pmu_is_pkg_scope())
> -               nr_rapl_pmu = topology_max_packages();
> -
>         rapl_pmus = kzalloc(struct_size(rapl_pmus, rapl_pmu, nr_rapl_pmu), GFP_KERNEL);
>         if (!rapl_pmus)
>                 return -ENOMEM;
> @@ -741,7 +838,7 @@ static struct rapl_model model_snb = {
>                           BIT(PERF_RAPL_PKG) |
>                           BIT(PERF_RAPL_PP1),
>         .msr_power_unit = MSR_RAPL_POWER_UNIT,
> -       .rapl_msrs      = intel_rapl_msrs,
> +       .rapl_pkg_msrs  = intel_rapl_msrs,
>  };
>  
>  static struct rapl_model model_snbep = {
> @@ -749,7 +846,7 @@ static struct rapl_model model_snbep = {
>                           BIT(PERF_RAPL_PKG) |
>                           BIT(PERF_RAPL_RAM),
>         .msr_power_unit = MSR_RAPL_POWER_UNIT,
> -       .rapl_msrs      = intel_rapl_msrs,
> +       .rapl_pkg_msrs  = intel_rapl_msrs,
>  };
>  
>  static struct rapl_model model_hsw = {
> @@ -758,7 +855,7 @@ static struct rapl_model model_hsw = {
>                           BIT(PERF_RAPL_RAM) |
>                           BIT(PERF_RAPL_PP1),
>         .msr_power_unit = MSR_RAPL_POWER_UNIT,
> -       .rapl_msrs      = intel_rapl_msrs,
> +       .rapl_pkg_msrs  = intel_rapl_msrs,
>  };
>  
>  static struct rapl_model model_hsx = {
> @@ -767,7 +864,7 @@ static struct rapl_model model_hsx = {
>                           BIT(PERF_RAPL_RAM),
>         .unit_quirk     = RAPL_UNIT_QUIRK_INTEL_HSW,
>         .msr_power_unit = MSR_RAPL_POWER_UNIT,
> -       .rapl_msrs      = intel_rapl_msrs,
> +       .rapl_pkg_msrs  = intel_rapl_msrs,
>  };
>  
>  static struct rapl_model model_knl = {
> @@ -775,7 +872,7 @@ static struct rapl_model model_knl = {
>                           BIT(PERF_RAPL_RAM),
>         .unit_quirk     = RAPL_UNIT_QUIRK_INTEL_HSW,
>         .msr_power_unit = MSR_RAPL_POWER_UNIT,
> -       .rapl_msrs      = intel_rapl_msrs,
> +       .rapl_pkg_msrs  = intel_rapl_msrs,
>  };
>  
>  static struct rapl_model model_skl = {
> @@ -785,7 +882,7 @@ static struct rapl_model model_skl = {
>                           BIT(PERF_RAPL_PP1) |
>                           BIT(PERF_RAPL_PSYS),
>         .msr_power_unit = MSR_RAPL_POWER_UNIT,
> -       .rapl_msrs      = intel_rapl_msrs,
> +       .rapl_pkg_msrs  = intel_rapl_msrs,
>  };
>  
>  static struct rapl_model model_spr = {
> @@ -795,13 +892,15 @@ static struct rapl_model model_spr = {
>                           BIT(PERF_RAPL_PSYS),
>         .unit_quirk     = RAPL_UNIT_QUIRK_INTEL_SPR,
>         .msr_power_unit = MSR_RAPL_POWER_UNIT,
> -       .rapl_msrs      = intel_rapl_spr_msrs,
> +       .rapl_pkg_msrs  = intel_rapl_spr_msrs,
>  };

All the above renaming code should be in patch 8/10.
Or else it is a distraction for reviewing this patch.

>  
>  static struct rapl_model model_amd_hygon = {
>         .pkg_events     = BIT(PERF_RAPL_PKG),
> +       .core_events    = BIT(PERF_RAPL_PER_CORE),
>         .msr_power_unit = MSR_AMD_RAPL_POWER_UNIT,
> -       .rapl_msrs      = amd_rapl_pkg_msrs,
> +       .rapl_pkg_msrs  = amd_rapl_pkg_msrs,
> +       .rapl_core_msrs = amd_rapl_core_msrs,
>  };
>  
>  static const struct x86_cpu_id rapl_model_match[] __initconst = {
> @@ -858,6 +957,11 @@ static int __init rapl_pmu_init(void)
>  {
>         const struct x86_cpu_id *id;
>         int ret;
> +       int nr_rapl_pmu = topology_max_packages() * topology_max_dies_per_package();
> +       int nr_cores = topology_max_packages() * topology_num_cores_per_package();

I'd suggest either using two variables nr_pkgs/nr_cores, or reusing one
variable nr_rapl_pmu for both the pkg pmu and the per-core pmu.

> +
> +       if (rapl_pmu_is_pkg_scope())
> +               nr_rapl_pmu = topology_max_packages();
>  
>         id = x86_match_cpu(rapl_model_match);
>         if (!id)
> @@ -865,17 +969,34 @@ static int __init rapl_pmu_init(void)
>  
>         rapl_model = (struct rapl_model *) id->driver_data;
>  
> -       rapl_pkg_cntr_mask = perf_msr_probe(rapl_model->rapl_msrs, PERF_RAPL_PKG_EVENTS_MAX,
> +       rapl_pkg_cntr_mask = perf_msr_probe(rapl_model->rapl_pkg_msrs, PERF_RAPL_PKG_EVENTS_MAX,
>                                         false, (void *) &rapl_model->pkg_events);
>  
>         ret = rapl_check_hw_unit();
>         if (ret)
>                 return ret;
>  
> -       ret = init_rapl_pmus(&rapl_pmus_pkg);
> +       ret = init_rapl_pmus(&rapl_pmus_pkg, nr_rapl_pmu, rapl_attr_groups, rapl_attr_update);
>         if (ret)
>                 return ret;
>  
> +       if (rapl_model->core_events) {
> +               rapl_core_cntr_mask = perf_msr_probe(rapl_model->rapl_core_msrs,
> +                                                    PERF_RAPL_CORE_EVENTS_MAX, false,
> +                                                    (void *) &rapl_model->core_events);
> +
> +               ret = init_rapl_pmus(&rapl_pmus_core, nr_cores,
> +                                    rapl_per_core_attr_groups, rapl_per_core_attr_update);
> +               if (ret) {
> +                       /*
> +                        * If initialization of per_core PMU fails, reset per_core
> +                        * flag, and continue with power PMU initialization.
> +                        */
> +                       pr_warn("Per-core PMU initialization failed (%d)\n", ret);
> +                       rapl_model->core_events = 0UL;
> +               }
> +       }
> +
>         /*
>          * Install callbacks. Core will call them for each online cpu.
>          */
> @@ -889,6 +1010,20 @@ static int __init rapl_pmu_init(void)
>         if (ret)
>                 goto out1;
>  
> +       if (rapl_model->core_events) {
> +               ret = perf_pmu_register(&rapl_pmus_core->pmu, "power_per_core", -1);
> +               if (ret) {
> +                       /*
> +                        * If registration of per_core PMU fails, cleanup per_core PMU
> +                        * variables, reset the per_core flag and keep the power PMU untouched.
> +                        */
> +                       pr_warn("Per-core PMU registration failed (%d)\n", ret);
> +                       cleanup_rapl_pmus(rapl_pmus_core);
> +                       rapl_model->core_events = 0UL;
> +               }
> +       }
> +
>         rapl_advertise();
>         return 0;
>  
> @@ -906,5 +1041,9 @@ static void __exit intel_rapl_exit(void)
>         cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE);
>         perf_pmu_unregister(&rapl_pmus_pkg->pmu);
>         cleanup_rapl_pmus(rapl_pmus_pkg);
> +       if (rapl_model->core_events) {
> +               perf_pmu_unregister(&rapl_pmus_core->pmu);
> +               cleanup_rapl_pmus(rapl_pmus_core);
> +       }

we do check rapl_pmus_core before accessing it, but we never check
rapl_pmus_pkg because the previous code assumes it always exists.

so could there be a problem if someone starts the per-core pmu when
pkg pmu is unregistered and cleaned up?

say, in rapl_pmu_event_init(),

if (event->attr.type == rapl_pmus_pkg->pmu.type ||
   (rapl_pmus_core && event->attr.type == rapl_pmus_core->pmu.type))

this can break because rapl_pmus_pkg is freed, right?

thanks,
rui


>  }
>  module_exit(intel_rapl_exit);
Dhananjay Ugwekar June 26, 2024, 4:37 p.m. UTC | #2
Hello Rui,

On 6/26/2024 8:48 PM, Zhang, Rui wrote:
> 
>> @@ -131,8 +146,10 @@ enum rapl_unit_quirk {
>>  };
>>  
>>  struct rapl_model {
>> -       struct perf_msr *rapl_msrs;
>> +       struct perf_msr *rapl_pkg_msrs;
> 
> IMO, this should be part of patch 8/10.

Makes sense, better to move all the renaming code to the 8th patch.

> 
> [...]
> 
>> @@ -685,6 +774,13 @@ static void __init rapl_advertise(void)
>>                                 rapl_pkg_domain_names[i], rapl_hw_unit[i]);
>>                 }
>>         }
>> +
>> +       for (i = 0; i < NR_RAPL_CORE_DOMAINS; i++) {
>> +               if (rapl_core_cntr_mask & (1 << i)) {
>> +                       pr_info("hw unit of domain %s 2^-%d Joules\n",
>> +                               rapl_core_domain_names[i], rapl_hw_unit[i]);
> 
> rapl_hw_unit[] is for package pmu only and
> rapl_hw_unit[0] is rapl_hw_unit[PERF_RAPL_PP0] rather than
> rapl_hw_unit[PERF_RAPL_PER_CORE]
> 
> you cannot use rapl_hw_unit[i] to represent per-core rapl domain unit.

Yes right, I saw that all the elements in the rapl_hw_unit array were actually 
using the value from the same register "MSR_RAPL_POWER_UNIT" or "MSR_AMD_RAPL_POWER_UNIT".
Except for the two quirks,
 
 737         case RAPL_UNIT_QUIRK_INTEL_HSW:
 738                 rapl_hw_unit[PERF_RAPL_RAM] = 16;
 739                 break;
 740         /* SPR uses a fixed energy unit for Psys domain. */
 741         case RAPL_UNIT_QUIRK_INTEL_SPR:
 742                 rapl_hw_unit[PERF_RAPL_PSYS] = 0;
 743                 break;

So, as for AMD systems the rapl_hw_unit[] elements will always have the same value, I ended 
up using the rapl_hw_unit[PERF_RAPL_PP0] for rapl_hw_unit[PERF_RAPL_PER_CORE], but I do realize
it is quite hacky. So, better to do it cleanly and add a separate array/variable for the core events.
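
Something like this, maybe (rough sketch, variable name not final):

/* 1/2^hw_unit Joule, one unit shared by all AMD per-core events */
static int rapl_core_hw_unit __read_mostly;
...
		pr_info("hw unit of domain %s 2^-%d Joules\n",
			rapl_core_domain_names[i], rapl_core_hw_unit);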

> 
>> +               }
>> +       }
>>  }
>>  
>>  static void cleanup_rapl_pmus(struct rapl_pmus *rapl_pmus)
>> @@ -705,15 +801,16 @@ static const struct attribute_group *rapl_attr_update[] = {
>>         NULL,
>>  };
>>  
>> -static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr)
>> +static const struct attribute_group *rapl_per_core_attr_update[] = {
>> +       &rapl_events_per_core_group,
>> +};
>> +
>> +static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr, int nr_rapl_pmu,
>> +                                const struct attribute_group **rapl_attr_groups,
>> +                                const struct attribute_group **rapl_attr_update)
>>  {
>>         struct rapl_pmus *rapl_pmus;
>>  
>> -       int nr_rapl_pmu = topology_max_packages() * topology_max_dies_per_package();
>> -
>> -       if (rapl_pmu_is_pkg_scope())
>> -               nr_rapl_pmu = topology_max_packages();
>> -
>>         rapl_pmus = kzalloc(struct_size(rapl_pmus, rapl_pmu, nr_rapl_pmu), GFP_KERNEL);
>>         if (!rapl_pmus)
>>                 return -ENOMEM;
>> @@ -741,7 +838,7 @@ static struct rapl_model model_snb = {
>>                           BIT(PERF_RAPL_PKG) |
>>                           BIT(PERF_RAPL_PP1),
>>         .msr_power_unit = MSR_RAPL_POWER_UNIT,
>> -       .rapl_msrs      = intel_rapl_msrs,
>> +       .rapl_pkg_msrs  = intel_rapl_msrs,
>>  };
>>  
>>  static struct rapl_model model_snbep = {
>> @@ -749,7 +846,7 @@ static struct rapl_model model_snbep = {
>>                           BIT(PERF_RAPL_PKG) |
>>                           BIT(PERF_RAPL_RAM),
>>         .msr_power_unit = MSR_RAPL_POWER_UNIT,
>> -       .rapl_msrs      = intel_rapl_msrs,
>> +       .rapl_pkg_msrs  = intel_rapl_msrs,
>>  };
>>  
>>  static struct rapl_model model_hsw = {
>> @@ -758,7 +855,7 @@ static struct rapl_model model_hsw = {
>>                           BIT(PERF_RAPL_RAM) |
>>                           BIT(PERF_RAPL_PP1),
>>         .msr_power_unit = MSR_RAPL_POWER_UNIT,
>> -       .rapl_msrs      = intel_rapl_msrs,
>> +       .rapl_pkg_msrs  = intel_rapl_msrs,
>>  };
>>  
>>  static struct rapl_model model_hsx = {
>> @@ -767,7 +864,7 @@ static struct rapl_model model_hsx = {
>>                           BIT(PERF_RAPL_RAM),
>>         .unit_quirk     = RAPL_UNIT_QUIRK_INTEL_HSW,
>>         .msr_power_unit = MSR_RAPL_POWER_UNIT,
>> -       .rapl_msrs      = intel_rapl_msrs,
>> +       .rapl_pkg_msrs  = intel_rapl_msrs,
>>  };
>>  
>>  static struct rapl_model model_knl = {
>> @@ -775,7 +872,7 @@ static struct rapl_model model_knl = {
>>                           BIT(PERF_RAPL_RAM),
>>         .unit_quirk     = RAPL_UNIT_QUIRK_INTEL_HSW,
>>         .msr_power_unit = MSR_RAPL_POWER_UNIT,
>> -       .rapl_msrs      = intel_rapl_msrs,
>> +       .rapl_pkg_msrs  = intel_rapl_msrs,
>>  };
>>  
>>  static struct rapl_model model_skl = {
>> @@ -785,7 +882,7 @@ static struct rapl_model model_skl = {
>>                           BIT(PERF_RAPL_PP1) |
>>                           BIT(PERF_RAPL_PSYS),
>>         .msr_power_unit = MSR_RAPL_POWER_UNIT,
>> -       .rapl_msrs      = intel_rapl_msrs,
>> +       .rapl_pkg_msrs  = intel_rapl_msrs,
>>  };
>>  
>>  static struct rapl_model model_spr = {
>> @@ -795,13 +892,15 @@ static struct rapl_model model_spr = {
>>                           BIT(PERF_RAPL_PSYS),
>>         .unit_quirk     = RAPL_UNIT_QUIRK_INTEL_SPR,
>>         .msr_power_unit = MSR_RAPL_POWER_UNIT,
>> -       .rapl_msrs      = intel_rapl_spr_msrs,
>> +       .rapl_pkg_msrs  = intel_rapl_spr_msrs,
>>  };
> 
> All the above renaming code should be in patch 8/10.
> Or else it is a distraction for reviewing this patch.

Agreed, will move it in the next version.

> 
>>  
>>  static struct rapl_model model_amd_hygon = {
>>         .pkg_events     = BIT(PERF_RAPL_PKG),
>> +       .core_events    = BIT(PERF_RAPL_PER_CORE),
>>         .msr_power_unit = MSR_AMD_RAPL_POWER_UNIT,
>> -       .rapl_msrs      = amd_rapl_pkg_msrs,
>> +       .rapl_pkg_msrs  = amd_rapl_pkg_msrs,
>> +       .rapl_core_msrs = amd_rapl_core_msrs,
>>  };
>>  
>>  static const struct x86_cpu_id rapl_model_match[] __initconst = {
>> @@ -858,6 +957,11 @@ static int __init rapl_pmu_init(void)
>>  {
>>         const struct x86_cpu_id *id;
>>         int ret;
>> +       int nr_rapl_pmu = topology_max_packages() * topology_max_dies_per_package();
>> +       int nr_cores = topology_max_packages() * topology_num_cores_per_package();
> 
> I'd suggest either using two variables nr_pkgs/nr_cores, or reusing one
> variable nr_rapl_pmu for both the pkg pmu and the per-core pmu.

I understand your point, but the problem with that is, there are actually three scopes needed here

Some Intel systems need a *die* scope for the rapl_pmus_pkg PMU
Some Intel systems and all AMD systems need a *package* scope for the rapl_pmus_pkg PMU
And AMD systems need a *core* scope for the rapl_pmus_per_core PMU

I think what we can do is three variables, nr_dies (for all Intel systems as before), 
nr_pkgs(for AMD systems rapl_pmus_pkg PMU) and nr_cores(for rapl_pmus_per_core PMU)

Sounds good?
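
i.e. something like (sketch):

	int nr_dies  = topology_max_packages() * topology_max_dies_per_package();
	int nr_pkgs  = topology_max_packages();
	int nr_cores = topology_max_packages() * topology_num_cores_per_package();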

> 
>> +
>> +       if (rapl_pmu_is_pkg_scope())
>> +               nr_rapl_pmu = topology_max_packages();
>>  
>>         id = x86_match_cpu(rapl_model_match);
>>         if (!id)
>> @@ -865,17 +969,34 @@ static int __init rapl_pmu_init(void)
>>  
>>         rapl_model = (struct rapl_model *) id->driver_data;
>>  
>> -       rapl_pkg_cntr_mask = perf_msr_probe(rapl_model->rapl_msrs, PERF_RAPL_PKG_EVENTS_MAX,
>> +       rapl_pkg_cntr_mask = perf_msr_probe(rapl_model->rapl_pkg_msrs, PERF_RAPL_PKG_EVENTS_MAX,
>>                                         false, (void *) &rapl_model->pkg_events);
>>  
>>         ret = rapl_check_hw_unit();
>>         if (ret)
>>                 return ret;
>>  
>> -       ret = init_rapl_pmus(&rapl_pmus_pkg);
>> +       ret = init_rapl_pmus(&rapl_pmus_pkg, nr_rapl_pmu, rapl_attr_groups, rapl_attr_update);
>>         if (ret)
>>                 return ret;
>>  
>> +       if (rapl_model->core_events) {
>> +               rapl_core_cntr_mask = perf_msr_probe(rapl_model->rapl_core_msrs,
>> +                                                    PERF_RAPL_CORE_EVENTS_MAX, false,
>> +                                                    (void *) &rapl_model->core_events);
>> +
>> +               ret = init_rapl_pmus(&rapl_pmus_core, nr_cores,
>> +                                    rapl_per_core_attr_groups, rapl_per_core_attr_update);
>> +               if (ret) {
>> +                       /*
>> +                        * If initialization of per_core PMU fails, reset per_core
>> +                        * flag, and continue with power PMU initialization.
>> +                        */
>> +                       pr_warn("Per-core PMU initialization failed (%d)\n", ret);
>> +                       rapl_model->core_events = 0UL;
>> +               }
>> +       }
>> +
>>         /*
>>          * Install callbacks. Core will call them for each online cpu.
>>          */
>> @@ -889,6 +1010,20 @@ static int __init rapl_pmu_init(void)
>>         if (ret)
>>                 goto out1;
>>  
>> +       if (rapl_model->core_events) {
>> +               ret = perf_pmu_register(&rapl_pmus_core->pmu, "power_per_core", -1);
>> +               if (ret) {
>> +                       /*
>> +                        * If registration of per_core PMU fails, cleanup per_core PMU
>> +                        * variables, reset the per_core flag and keep the power PMU untouched.
>> +                        */
>> +                       pr_warn("Per-core PMU registration failed (%d)\n", ret);
>> +                       cleanup_rapl_pmus(rapl_pmus_core);
>> +                       rapl_model->core_events = 0UL;
>> +               }
>> +       }
>> +
>>         rapl_advertise();
>>         return 0;
>>  
>> @@ -906,5 +1041,9 @@ static void __exit intel_rapl_exit(void)
>>         cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE);
>>         perf_pmu_unregister(&rapl_pmus_pkg->pmu);
>>         cleanup_rapl_pmus(rapl_pmus_pkg);
>> +       if (rapl_model->core_events) {
>> +               perf_pmu_unregister(&rapl_pmus_core->pmu);
>> +               cleanup_rapl_pmus(rapl_pmus_core);
>> +       }
> 
> we do check rapl_pmus_core before accessing it, but we never check
> rapl_pmus_pkg because the previous code assumes it always exists.
> 
> so could there be a problem if someone starts the per-core pmu when
> pkg pmu is unregistered and cleaned up?
> 
> say, in rapl_pmu_event_init(),
> 
> if (event->attr.type == rapl_pmus_pkg->pmu.type ||
>    (rapl_pmus_core && event->attr.type == rapl_pmus_core->pmu.type))
> 
> this can break because rapl_pmus_pkg is freed, right?

Hmm, I think this situation can't arise, as whenever the power PMU fails we
go directly to the failure path and don't set up the per-core PMU (which means
no one will be able to start the per-core PMU).
Please let me know if there is a scenario where this assumption can fail.

Thanks for all the helpful suggestions! Will incorporate them in v4.

Regards,
Dhananjay

> 
> thanks,
> rui
> 
> 
>>  }
>>  module_exit(intel_rapl_exit);
>
Zhang Rui June 27, 2024, 6:49 a.m. UTC | #3
Hi, Dhananjay
> 
> > 
> > [...]
> > 
> > > @@ -685,6 +774,13 @@ static void __init rapl_advertise(void)
> > >                                 rapl_pkg_domain_names[i], rapl_hw_unit[i]);
> > >                 }
> > >         }
> > > +
> > > +       for (i = 0; i < NR_RAPL_CORE_DOMAINS; i++) {
> > > +               if (rapl_core_cntr_mask & (1 << i)) {
> > > +                       pr_info("hw unit of domain %s 2^-%d Joules\n",
> > > +                               rapl_core_domain_names[i], rapl_hw_unit[i]);
> > 
> > rapl_hw_unit[] is for package pmu only and
> > rapl_hw_unit[0] is rapl_hw_unit[PERF_RAPL_PP0] rather than
> > rapl_hw_unit[PERF_RAPL_PER_CORE]
> > 
> > you cannot use rapl_hw_unit[i] to represent per-core rapl domain
> > unit.
> 
> Yes right, I saw that all the elements in the rapl_hw_unit array were
> actually 
> using the value from the same register "MSR_RAPL_POWER_UNIT" or
> "MSR_AMD_RAPL_POWER_UNIT".
> Except for the two quirks,
>  
>  737         case RAPL_UNIT_QUIRK_INTEL_HSW:
>  738                 rapl_hw_unit[PERF_RAPL_RAM] = 16;
>  739                 break;
>  740         /* SPR uses a fixed energy unit for Psys domain. */
>  741         case RAPL_UNIT_QUIRK_INTEL_SPR:
>  742                 rapl_hw_unit[PERF_RAPL_PSYS] = 0;
>  743                 break;
> 
> So, as for AMD systems the rapl_hw_unit[] elements will always have
> the same value, I ended 
> up using the rapl_hw_unit[PERF_RAPL_PP0] for
> rapl_hw_unit[PERF_RAPL_PER_CORE], but I do realize
> it is quite hacky. So, better to do it cleanly and add a separate
> array/variable for the core events.
> 
yeah, that is much better.
> 

> > 
> > >  
> > >  static struct rapl_model model_amd_hygon = {
> > >         .pkg_events     = BIT(PERF_RAPL_PKG),
> > > +       .core_events    = BIT(PERF_RAPL_PER_CORE),
> > >         .msr_power_unit = MSR_AMD_RAPL_POWER_UNIT,
> > > -       .rapl_msrs      = amd_rapl_pkg_msrs,
> > > +       .rapl_pkg_msrs  = amd_rapl_pkg_msrs,
> > > +       .rapl_core_msrs = amd_rapl_core_msrs,
> > >  };
> > >  
> > >  static const struct x86_cpu_id rapl_model_match[] __initconst = {
> > > @@ -858,6 +957,11 @@ static int __init rapl_pmu_init(void)
> > >  {
> > >         const struct x86_cpu_id *id;
> > >         int ret;
> > > +       int nr_rapl_pmu = topology_max_packages() * topology_max_dies_per_package();
> > > +       int nr_cores = topology_max_packages() * topology_num_cores_per_package();
> > 
> > I'd suggest either using two variables nr_pkgs/nr_cores, or reusing one
> > variable nr_rapl_pmu for both the pkg pmu and the per-core pmu.
> 
> I understand your point, but the problem with that is, there are
> actually three scopes needed here
> 
> Some Intel systems need a *die* scope for the rapl_pmus_pkg PMU
> Some Intel systems and all AMD systems need a *package* scope for the
> rapl_pmus_pkg PMU
> And AMD systems need a *core* scope for the rapl_pmus_per_core PMU
> 
> I think what we can do is three variables, nr_dies (for all Intel
> systems as before), 
> nr_pkgs(for AMD systems rapl_pmus_pkg PMU)

Not necessarily, we already use rapl_pmus_pkg for Intel systems,
right?

>  and nr_cores(for rapl_pmus_per_core PMU)
> 
> Sounds good?

what about just one variable "count" and reusing it for every case?

> 
> > 
> > > +
> > > +       if (rapl_pmu_is_pkg_scope())
> > > +               nr_rapl_pmu = topology_max_packages();
> > >  
> > >         id = x86_match_cpu(rapl_model_match);
> > >         if (!id)
> > > @@ -865,17 +969,34 @@ static int __init rapl_pmu_init(void)
> > >  
> > >         rapl_model = (struct rapl_model *) id->driver_data;
> > >  
> > > -       rapl_pkg_cntr_mask = perf_msr_probe(rapl_model->rapl_msrs, PERF_RAPL_PKG_EVENTS_MAX,
> > > +       rapl_pkg_cntr_mask = perf_msr_probe(rapl_model->rapl_pkg_msrs, PERF_RAPL_PKG_EVENTS_MAX,
> > >                                         false, (void *) &rapl_model->pkg_events);
> > >  
> > >         ret = rapl_check_hw_unit();
> > >         if (ret)
> > >                 return ret;
> > >  
> > > -       ret = init_rapl_pmus(&rapl_pmus_pkg);
> > > +       ret = init_rapl_pmus(&rapl_pmus_pkg, nr_rapl_pmu, rapl_attr_groups, rapl_attr_update);
> > >         if (ret)
> > >                 return ret;
> > >  
> > > +       if (rapl_model->core_events) {
> > > +               rapl_core_cntr_mask = perf_msr_probe(rapl_model->rapl_core_msrs,
> > > +                                                    PERF_RAPL_CORE_EVENTS_MAX, false,
> > > +                                                    (void *) &rapl_model->core_events);
> > > +
> > > +               ret = init_rapl_pmus(&rapl_pmus_core, nr_cores,
> > > +                                    rapl_per_core_attr_groups, rapl_per_core_attr_update);
> > > +               if (ret) {
> > > +                       /*
> > > +                        * If initialization of per_core PMU fails, reset per_core
> > > +                        * flag, and continue with power PMU initialization.
> > > +                        */
> > > +                       pr_warn("Per-core PMU initialization failed (%d)\n", ret);
> > > +                       rapl_model->core_events = 0UL;
> > > +               }
> > > +       }
> > > +
> > >         /*
> > >          * Install callbacks. Core will call them for each online cpu.
> > >          */
> > > @@ -889,6 +1010,20 @@ static int __init rapl_pmu_init(void)
> > >         if (ret)
> > >                 goto out1;
> > >  
> > > +       if (rapl_model->core_events) {
> > > +               ret = perf_pmu_register(&rapl_pmus_core->pmu, "power_per_core", -1);
> > > +               if (ret) {
> > > +                       /*
> > > +                        * If registration of per_core PMU fails, cleanup per_core PMU
> > > +                        * variables, reset the per_core flag and keep the power PMU untouched.
> > > +                        */
> > > +                       pr_warn("Per-core PMU registration failed (%d)\n", ret);
> > > +                       cleanup_rapl_pmus(rapl_pmus_core);
> > > +                       rapl_model->core_events = 0UL;
> > > +               }
> > > +       }
> > > +
> > >         rapl_advertise();
> > >         return 0;
> > >  
> > > @@ -906,5 +1041,9 @@ static void __exit intel_rapl_exit(void)
> > >         cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE);
> > >         perf_pmu_unregister(&rapl_pmus_pkg->pmu);
> > >         cleanup_rapl_pmus(rapl_pmus_pkg);
> > > +       if (rapl_model->core_events) {
> > > +               perf_pmu_unregister(&rapl_pmus_core->pmu);
> > > +               cleanup_rapl_pmus(rapl_pmus_core);
> > > +       }
> > 
> > we do check rapl_pmus_core before accessing it, but we never check
> > rapl_pmus_pkg because the previous code assumes it always exists.
> > 
> > so could there be a problem if someone starts the per-core pmu when
> > pkg pmu is unregistered and cleaned up?
> > 
> > say, in rapl_pmu_event_init(),
> > 
> > if (event->attr.type == rapl_pmus_pkg->pmu.type ||
> >    (rapl_pmus_core && event->attr.type == rapl_pmus_core->pmu.type))
> > 
> > this can break because rapl_pmus_pkg is freed, right?
> 
> Hmm, I think this situation can't arise, as whenever the power PMU fails we
> go directly to the failure path and don't set up the per-core PMU (which means
> no one will be able to start the per-core PMU).
> Please let me know if there is a scenario where this assumption can fail.

I mean, if we do a module unload and access the power_per_core pmu at the
same time, could there be a race?

why not just unregister and cleanup the per-core pmu before the pkg
pmu?
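
i.e. (sketch of the reordered exit path):

static void __exit intel_rapl_exit(void)
{
	cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE);
	/* per-core pmu goes first, while rapl_pmus_pkg is still valid */
	if (rapl_model->core_events) {
		perf_pmu_unregister(&rapl_pmus_core->pmu);
		cleanup_rapl_pmus(rapl_pmus_core);
	}
	perf_pmu_unregister(&rapl_pmus_pkg->pmu);
	cleanup_rapl_pmus(rapl_pmus_pkg);
}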
> 

thanks,
rui
Dhananjay Ugwekar June 27, 2024, 11:13 a.m. UTC | #4
Hello Rui,

On 6/27/2024 12:19 PM, Zhang, Rui wrote:
> Hi, Dhananjay
>>
>>>
>>> [...]
>>>
>>>> @@ -685,6 +774,13 @@ static void __init rapl_advertise(void)
>>>>                                 rapl_pkg_domain_names[i], rapl_hw_unit[i]);
>>>>                 }
>>>>         }
>>>> +
>>>> +       for (i = 0; i < NR_RAPL_CORE_DOMAINS; i++) {
>>>> +               if (rapl_core_cntr_mask & (1 << i)) {
>>>> +                       pr_info("hw unit of domain %s 2^-%d Joules\n",
>>>> +                               rapl_core_domain_names[i], rapl_hw_unit[i]);
>>>
>>> rapl_hw_unit[] is for package pmu only and
>>> rapl_hw_unit[0] is rapl_hw_unit[PERF_RAPL_PP0] rather than
>>> rapl_hw_unit[PERF_RAPL_PER_CORE]
>>>
>>> you cannot use rapl_hw_unit[i] to represent per-core rapl domain
>>> unit.
>>
>> Yes right, I saw that all the elements in the rapl_hw_unit array were
>> actually 
>> using the value from the same register "MSR_RAPL_POWER_UNIT" or
>> "MSR_AMD_RAPL_POWER_UNIT".
>> Except for the two quirks,
>>  
>>  737         case RAPL_UNIT_QUIRK_INTEL_HSW:
>>  738                 rapl_hw_unit[PERF_RAPL_RAM] = 16;
>>  739                 break;
>>  740         /* SPR uses a fixed energy unit for Psys domain. */
>>  741         case RAPL_UNIT_QUIRK_INTEL_SPR:
>>  742                 rapl_hw_unit[PERF_RAPL_PSYS] = 0;
>>  743                 break;
>>
>> So, as for AMD systems the rapl_hw_unit[] elements will always have
>> the same value, I ended 
>> up using the rapl_hw_unit[PERF_RAPL_PP0] for
>> rapl_hw_unit[PERF_RAPL_PER_CORE], but I do realize
>> it is quite hacky. So, better to do it cleanly and add a separate
>> array/variable for the core events.
>>
> yeah, that is much better.
>>
> 
>>>
>>>>  
>>>>  static struct rapl_model model_amd_hygon = {
>>>>         .pkg_events     = BIT(PERF_RAPL_PKG),
>>>> +       .core_events    = BIT(PERF_RAPL_PER_CORE),
>>>>         .msr_power_unit = MSR_AMD_RAPL_POWER_UNIT,
>>>> -       .rapl_msrs      = amd_rapl_pkg_msrs,
>>>> +       .rapl_pkg_msrs  = amd_rapl_pkg_msrs,
>>>> +       .rapl_core_msrs = amd_rapl_core_msrs,
>>>>  };
>>>>  
>>>>  static const struct x86_cpu_id rapl_model_match[] __initconst = {
>>>> @@ -858,6 +957,11 @@ static int __init rapl_pmu_init(void)
>>>>  {
>>>>         const struct x86_cpu_id *id;
>>>>         int ret;
>>>> +       int nr_rapl_pmu = topology_max_packages() * topology_max_dies_per_package();
>>>> +       int nr_cores = topology_max_packages() * topology_num_cores_per_package();
>>>
>>> I'd suggest either using two variables nr_pkgs/nr_cores, or reusing one
>>> variable nr_rapl_pmu for both the pkg pmu and the per-core pmu.
>>
>> I understand your point, but the problem with that is, there are
>> actually three scopes needed here
>>
>> Some Intel systems need a *die* scope for the rapl_pmus_pkg PMU
>> Some Intel systems and all AMD systems need a *package* scope for the
>> rapl_pmus_pkg PMU
>> And AMD systems need a *core* scope for the rapl_pmus_per_core PMU
>>
>> I think what we can do is three variables, nr_dies (for all Intel
>> systems as before), 
>> nr_pkgs(for AMD systems rapl_pmus_pkg PMU)
> 
> Not necessarily, we already use rapl_pmus_pkg for Intel systems,
> right?

Yes, but the scope of rapl_pmus_pkg is actually *die* for some Intel systems
(Xeon Cascade Lake-AP to be specific), right?

Whereas all AMD systems have pkg scope for the power PMU.

> 
>>  and nr_cores(for rapl_pmus_per_core PMU)
>>
>> Sounds good?
> 
> what about just one variable "count" and reusing it for every case?

Sure, that would be cleaner, albeit not as explicit; will make the change
in the next version.
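
e.g. (rough sketch of the reuse, error handling elided):

	int count = topology_max_packages() * topology_max_dies_per_package();

	if (rapl_pmu_is_pkg_scope())
		count = topology_max_packages();
	...
	ret = init_rapl_pmus(&rapl_pmus_pkg, count, rapl_attr_groups, rapl_attr_update);
	...
	if (rapl_model->core_events) {
		count = topology_max_packages() * topology_num_cores_per_package();
		ret = init_rapl_pmus(&rapl_pmus_core, count,
				     rapl_per_core_attr_groups, rapl_per_core_attr_update);
		...
	}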

> 
>>
>>>
>>>> +
>>>> +       if (rapl_pmu_is_pkg_scope())
>>>> +               nr_rapl_pmu = topology_max_packages();
>>>>  
>>>>         id = x86_match_cpu(rapl_model_match);
>>>>         if (!id)
>>>> @@ -865,17 +969,34 @@ static int __init rapl_pmu_init(void)
>>>>  
>>>>         rapl_model = (struct rapl_model *) id->driver_data;
>>>>  
>>>> -       rapl_pkg_cntr_mask = perf_msr_probe(rapl_model->rapl_msrs, PERF_RAPL_PKG_EVENTS_MAX,
>>>> +       rapl_pkg_cntr_mask = perf_msr_probe(rapl_model->rapl_pkg_msrs, PERF_RAPL_PKG_EVENTS_MAX,
>>>>                                         false, (void *) &rapl_model->pkg_events);
>>>>  
>>>>         ret = rapl_check_hw_unit();
>>>>         if (ret)
>>>>                 return ret;
>>>>  
>>>> -       ret = init_rapl_pmus(&rapl_pmus_pkg);
>>>> +       ret = init_rapl_pmus(&rapl_pmus_pkg, nr_rapl_pmu, rapl_attr_groups, rapl_attr_update);
>>>>         if (ret)
>>>>                 return ret;
>>>>  
>>>> +       if (rapl_model->core_events) {
>>>> +               rapl_core_cntr_mask = perf_msr_probe(rapl_model->rapl_core_msrs,
>>>> +                                                    PERF_RAPL_CORE_EVENTS_MAX, false,
>>>> +                                                    (void *) &rapl_model->core_events);
>>>> +
>>>> +               ret = init_rapl_pmus(&rapl_pmus_core, nr_cores,
>>>> +                                    rapl_per_core_attr_groups, rapl_per_core_attr_update);
>>>> +               if (ret) {
>>>> +                       /*
>>>> +                        * If initialization of per_core PMU fails, reset per_core
>>>> +                        * flag, and continue with power PMU initialization.
>>>> +                        */
>>>> +                       pr_warn("Per-core PMU initialization failed (%d)\n", ret);
>>>> +                       rapl_model->core_events = 0UL;
>>>> +               }
>>>> +       }
>>>> +
>>>>         /*
>>>>          * Install callbacks. Core will call them for each online cpu.
>>>>          */
>>>> @@ -889,6 +1010,20 @@ static int __init rapl_pmu_init(void)
>>>>         if (ret)
>>>>                 goto out1;
>>>>  
>>>> +       if (rapl_model->core_events) {
>>>> +               ret = perf_pmu_register(&rapl_pmus_core->pmu, "power_per_core", -1);
>>>> +               if (ret) {
>>>> +                       /*
>>>> +                        * If registration of per_core PMU fails, cleanup per_core PMU
>>>> +                        * variables, reset the per_core flag and keep the power PMU untouched.
>>>> +                        */
>>>> +                       pr_warn("Per-core PMU registration failed (%d)\n", ret);
>>>> +                       cleanup_rapl_pmus(rapl_pmus_core);
>>>> +                       rapl_model->core_events = 0UL;
>>>> +               }
>>>> +       }
>>>> +
>>>>         rapl_advertise();
>>>>         return 0;
>>>>  
>>>> @@ -906,5 +1041,9 @@ static void __exit intel_rapl_exit(void)
>>>>         cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE);
>>>>         perf_pmu_unregister(&rapl_pmus_pkg->pmu);
>>>>         cleanup_rapl_pmus(rapl_pmus_pkg);
>>>> +       if (rapl_model->core_events) {
>>>> +               perf_pmu_unregister(&rapl_pmus_core->pmu);
>>>> +               cleanup_rapl_pmus(rapl_pmus_core);
>>>> +       }
>>>
>>> we do check rapl_pmus_core before accessing it, but we never check
>>> rapl_pmus_pkg because the previous code assumes it always exists.
>>>
>>> so could there be a problem if someone starts the per-core pmu when
>>> pkg pmu is unregistered and cleaned up?
>>>
>>> say, in rapl_pmu_event_init(),
>>>
>>> if (event->attr.type == rapl_pmus_pkg->pmu.type ||
>>>    (rapl_pmus_core && event->attr.type == rapl_pmus_core->pmu.type))
>>>
>>> this can break because rapl_pmus_pkg is freed, right?
>>
>> Hmm, I think this situation can't arise, as whenever the power PMU fails we
>> go directly to the failure path and don't set up the per-core PMU (which means
>> no one will be able to start the per-core PMU).
>> Please let me know if there is a scenario where this assumption can fail.
> 
> I mean, if we do a module unload and access the power_per_core pmu at the
> same time, could there be a race?
> 
> why not just unregister and cleanup the per-core pmu before the pkg
> pmu?

Yes, better to be safe, will reorder the cleanup.

Thanks,
Dhananjay

>>
> 
> thanks,
> rui
>

Patch

diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
index e962209e9e4d..8206038a01ac 100644
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -39,6 +39,10 @@ 
  *	  event: rapl_energy_psys
  *    perf code: 0x5
  *
+ *  per_core counter: consumption of a single physical core
+ *	  event: rapl_energy_per_core (power_per_core PMU)
+ *    perf code: 0x1
+ *
  * We manage those counters as free running (read-only). They may be
  * use simultaneously by other tools, such as turbostat.
  *
@@ -81,6 +85,13 @@  enum perf_rapl_pkg_events {
 	NR_RAPL_PKG_DOMAINS = PERF_RAPL_PKG_EVENTS_MAX,
 };
 
+enum perf_rapl_core_events {
+	PERF_RAPL_PER_CORE = 0,		/* per-core */
+
+	PERF_RAPL_CORE_EVENTS_MAX,
+	NR_RAPL_CORE_DOMAINS = PERF_RAPL_CORE_EVENTS_MAX,
+};
+
 static const char *const rapl_pkg_domain_names[NR_RAPL_PKG_DOMAINS] __initconst = {
 	"pp0-core",
 	"package",
@@ -89,6 +100,10 @@  static const char *const rapl_pkg_domain_names[NR_RAPL_PKG_DOMAINS] __initconst
 	"psys",
 };
 
+static const char *const rapl_core_domain_names[NR_RAPL_CORE_DOMAINS] __initconst = {
+	"per-core",
+};
+
 /*
  * event code: LSB 8 bits, passed in attr->config
  * any other bit is reserved
@@ -131,8 +146,10 @@  enum rapl_unit_quirk {
 };
 
 struct rapl_model {
-	struct perf_msr *rapl_msrs;
+	struct perf_msr *rapl_pkg_msrs;
+	struct perf_msr *rapl_core_msrs;
 	unsigned long	pkg_events;
+	unsigned long	core_events;
 	unsigned int	msr_power_unit;
 	enum rapl_unit_quirk	unit_quirk;
 };
@@ -140,7 +157,9 @@  struct rapl_model {
  /* 1/2^hw_unit Joule */
 static int rapl_hw_unit[NR_RAPL_PKG_DOMAINS] __read_mostly;
 static struct rapl_pmus *rapl_pmus_pkg;
+static struct rapl_pmus *rapl_pmus_core;
 static unsigned int rapl_pkg_cntr_mask;
+static unsigned int rapl_core_cntr_mask;
 static u64 rapl_timer_ms;
 static struct rapl_model *rapl_model;
 
@@ -345,9 +364,13 @@  static int rapl_pmu_event_init(struct perf_event *event)
 	u64 cfg = event->attr.config & RAPL_EVENT_MASK;
 	int bit, ret = 0;
 	struct rapl_pmu *rapl_pmu;
+	struct rapl_pmus *curr_rapl_pmus;
 
 	/* only look at RAPL events */
-	if (event->attr.type != rapl_pmus_pkg->pmu.type)
+	if (event->attr.type == rapl_pmus_pkg->pmu.type ||
+		(rapl_pmus_core && event->attr.type == rapl_pmus_core->pmu.type))
+		curr_rapl_pmus = container_of(event->pmu, struct rapl_pmus, pmu);
+	else
 		return -ENOENT;
 
 	/* check only supported bits are set */
@@ -357,7 +380,8 @@  static int rapl_pmu_event_init(struct perf_event *event)
 	if (event->cpu < 0)
 		return -EINVAL;
 
-	event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
+	if (curr_rapl_pmus == rapl_pmus_pkg)
+		event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
 
 	if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1)
 		return -EINVAL;
@@ -366,7 +390,8 @@  static int rapl_pmu_event_init(struct perf_event *event)
 	bit = cfg - 1;
 
 	/* check event supported */
-	if (!(rapl_pkg_cntr_mask & (1 << bit)))
+	if (!(rapl_pkg_cntr_mask & (1 << bit)) &&
+	    !(rapl_core_cntr_mask & (1 << bit)))
 		return -EINVAL;
 
 	/* unsupported modes and filters */
@@ -374,12 +399,18 @@  static int rapl_pmu_event_init(struct perf_event *event)
 		return -EINVAL;
 
 	/* must be done before validate_group */
-	rapl_pmu = cpu_to_rapl_pmu(event->cpu);
+	if (curr_rapl_pmus == rapl_pmus_core) {
+		rapl_pmu = curr_rapl_pmus->rapl_pmu[topology_logical_core_id(event->cpu)];
+		event->hw.event_base = rapl_model->rapl_core_msrs[bit].msr;
+	} else {
+		rapl_pmu = curr_rapl_pmus->rapl_pmu[get_rapl_pmu_idx(event->cpu)];
+		event->hw.event_base = rapl_model->rapl_pkg_msrs[bit].msr;
+	}
+
 	if (!rapl_pmu)
 		return -EINVAL;
 	event->cpu = rapl_pmu->cpu;
 	event->pmu_private = rapl_pmu;
-	event->hw.event_base = rapl_model->rapl_msrs[bit].msr;
 	event->hw.config = cfg;
 	event->hw.idx = bit;
 
@@ -408,17 +439,38 @@  static struct attribute_group rapl_pmu_attr_group = {
 	.attrs = rapl_pmu_attrs,
 };
 
+static ssize_t rapl_get_attr_per_core_cpumask(struct device *dev,
+					     struct device_attribute *attr, char *buf)
+{
+	return cpumap_print_to_pagebuf(true, buf, &rapl_pmus_core->cpumask);
+}
+
+static struct device_attribute dev_attr_per_core_cpumask = __ATTR(cpumask, 0444,
+								 rapl_get_attr_per_core_cpumask,
+								 NULL);
+
+static struct attribute *rapl_pmu_per_core_attrs[] = {
+	&dev_attr_per_core_cpumask.attr,
+	NULL,
+};
+
+static struct attribute_group rapl_pmu_per_core_attr_group = {
+	.attrs = rapl_pmu_per_core_attrs,
+};
+
 RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
 RAPL_EVENT_ATTR_STR(energy-pkg  ,   rapl_pkg, "event=0x02");
 RAPL_EVENT_ATTR_STR(energy-ram  ,   rapl_ram, "event=0x03");
 RAPL_EVENT_ATTR_STR(energy-gpu  ,   rapl_gpu, "event=0x04");
 RAPL_EVENT_ATTR_STR(energy-psys,   rapl_psys, "event=0x05");
+RAPL_EVENT_ATTR_STR(energy-per-core,   rapl_per_core, "event=0x01");
 
 RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
 RAPL_EVENT_ATTR_STR(energy-pkg.unit  ,   rapl_pkg_unit, "Joules");
 RAPL_EVENT_ATTR_STR(energy-ram.unit  ,   rapl_ram_unit, "Joules");
 RAPL_EVENT_ATTR_STR(energy-gpu.unit  ,   rapl_gpu_unit, "Joules");
 RAPL_EVENT_ATTR_STR(energy-psys.unit,   rapl_psys_unit, "Joules");
+RAPL_EVENT_ATTR_STR(energy-per-core.unit,   rapl_per_core_unit, "Joules");
 
 /*
  * we compute in 0.23 nJ increments regardless of MSR
@@ -428,6 +480,7 @@  RAPL_EVENT_ATTR_STR(energy-pkg.scale,     rapl_pkg_scale, "2.3283064365386962890
 RAPL_EVENT_ATTR_STR(energy-ram.scale,     rapl_ram_scale, "2.3283064365386962890625e-10");
 RAPL_EVENT_ATTR_STR(energy-gpu.scale,     rapl_gpu_scale, "2.3283064365386962890625e-10");
 RAPL_EVENT_ATTR_STR(energy-psys.scale,   rapl_psys_scale, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-per-core.scale,   rapl_per_core_scale, "2.3283064365386962890625e-10");
 
 /*
  * There are no default events, but we need to create
@@ -461,6 +514,13 @@  static const struct attribute_group *rapl_attr_groups[] = {
 	NULL,
 };
 
+static const struct attribute_group *rapl_per_core_attr_groups[] = {
+	&rapl_pmu_per_core_attr_group,
+	&rapl_pmu_format_group,
+	&rapl_pmu_events_group,
+	NULL,
+};
+
 static struct attribute *rapl_events_cores[] = {
 	EVENT_PTR(rapl_cores),
 	EVENT_PTR(rapl_cores_unit),
@@ -521,6 +581,18 @@  static struct attribute_group rapl_events_psys_group = {
 	.attrs = rapl_events_psys,
 };
 
+static struct attribute *rapl_events_per_core[] = {
+	EVENT_PTR(rapl_per_core),
+	EVENT_PTR(rapl_per_core_unit),
+	EVENT_PTR(rapl_per_core_scale),
+	NULL,
+};
+
+static struct attribute_group rapl_events_per_core_group = {
+	.name  = "events",
+	.attrs = rapl_events_per_core,
+};
+
 static bool test_msr(int idx, void *data)
 {
 	return test_bit(idx, (unsigned long *) data);
@@ -558,6 +630,11 @@  static struct perf_msr amd_rapl_pkg_msrs[] = {
 	[PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group,  NULL, false, 0 },
 };
 
+static struct perf_msr amd_rapl_core_msrs[] = {
+	[PERF_RAPL_PER_CORE] = { MSR_AMD_CORE_ENERGY_STATUS, &rapl_events_per_core_group,
+				 test_msr, false, RAPL_MSR_MASK },
+};
+
 static int __rapl_cpu_offline(struct rapl_pmus *rapl_pmus, unsigned int rapl_pmu_idx,
 			      const struct cpumask *event_cpumask, unsigned int cpu)
 {
@@ -583,8 +660,14 @@  static int __rapl_cpu_offline(struct rapl_pmus *rapl_pmus, unsigned int rapl_pmu
 
 static int rapl_cpu_offline(unsigned int cpu)
 {
-	return __rapl_cpu_offline(rapl_pmus_pkg, get_rapl_pmu_idx(cpu),
+	int ret =  __rapl_cpu_offline(rapl_pmus_pkg, get_rapl_pmu_idx(cpu),
 				  get_rapl_pmu_cpumask(cpu), cpu);
+
+	if (ret == 0 && rapl_model->core_events)
+		ret = __rapl_cpu_offline(rapl_pmus_core, topology_logical_core_id(cpu),
+				   topology_sibling_cpumask(cpu), cpu);
+
+	return ret;
 }
 
 static int __rapl_cpu_online(struct rapl_pmus *rapl_pmus, unsigned int rapl_pmu_idx,
@@ -622,8 +705,14 @@  static int __rapl_cpu_online(struct rapl_pmus *rapl_pmus, unsigned int rapl_pmu_
 
 static int rapl_cpu_online(unsigned int cpu)
 {
-	return __rapl_cpu_online(rapl_pmus_pkg, get_rapl_pmu_idx(cpu),
+	int ret =  __rapl_cpu_online(rapl_pmus_pkg, get_rapl_pmu_idx(cpu),
 				 get_rapl_pmu_cpumask(cpu), cpu);
+
+	if (ret == 0 && rapl_model->core_events)
+		ret = __rapl_cpu_online(rapl_pmus_core, topology_logical_core_id(cpu),
+				   topology_sibling_cpumask(cpu), cpu);
+
+	return ret;
 }
 
 
@@ -677,7 +766,7 @@  static void __init rapl_advertise(void)
 	int i;
 
 	pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n",
-		hweight32(rapl_pkg_cntr_mask), rapl_timer_ms);
+		hweight32(rapl_pkg_cntr_mask) + hweight32(rapl_core_cntr_mask), rapl_timer_ms);
 
 	for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) {
 		if (rapl_pkg_cntr_mask & (1 << i)) {
@@ -685,6 +774,13 @@  static void __init rapl_advertise(void)
 				rapl_pkg_domain_names[i], rapl_hw_unit[i]);
 		}
 	}
+
+	for (i = 0; i < NR_RAPL_CORE_DOMAINS; i++) {
+		if (rapl_core_cntr_mask & (1 << i)) {
+			pr_info("hw unit of domain %s 2^-%d Joules\n",
+				rapl_core_domain_names[i], rapl_hw_unit[i]);
+		}
+	}
 }
 
 static void cleanup_rapl_pmus(struct rapl_pmus *rapl_pmus)
@@ -705,15 +801,16 @@  static const struct attribute_group *rapl_attr_update[] = {
 	NULL,
 };
 
-static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr)
+static const struct attribute_group *rapl_per_core_attr_update[] = {
+	&rapl_events_per_core_group,
+};
+
+static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr, int nr_rapl_pmu,
+				 const struct attribute_group **rapl_attr_groups,
+				 const struct attribute_group **rapl_attr_update)
 {
 	struct rapl_pmus *rapl_pmus;
 
-	int nr_rapl_pmu = topology_max_packages() * topology_max_dies_per_package();
-
-	if (rapl_pmu_is_pkg_scope())
-		nr_rapl_pmu = topology_max_packages();
-
 	rapl_pmus = kzalloc(struct_size(rapl_pmus, rapl_pmu, nr_rapl_pmu), GFP_KERNEL);
 	if (!rapl_pmus)
 		return -ENOMEM;
@@ -741,7 +838,7 @@  static struct rapl_model model_snb = {
 			  BIT(PERF_RAPL_PKG) |
 			  BIT(PERF_RAPL_PP1),
 	.msr_power_unit = MSR_RAPL_POWER_UNIT,
-	.rapl_msrs      = intel_rapl_msrs,
+	.rapl_pkg_msrs	= intel_rapl_msrs,
 };
 
 static struct rapl_model model_snbep = {
@@ -749,7 +846,7 @@  static struct rapl_model model_snbep = {
 			  BIT(PERF_RAPL_PKG) |
 			  BIT(PERF_RAPL_RAM),
 	.msr_power_unit = MSR_RAPL_POWER_UNIT,
-	.rapl_msrs      = intel_rapl_msrs,
+	.rapl_pkg_msrs	= intel_rapl_msrs,
 };
 
 static struct rapl_model model_hsw = {
@@ -758,7 +855,7 @@  static struct rapl_model model_hsw = {
 			  BIT(PERF_RAPL_RAM) |
 			  BIT(PERF_RAPL_PP1),
 	.msr_power_unit = MSR_RAPL_POWER_UNIT,
-	.rapl_msrs      = intel_rapl_msrs,
+	.rapl_pkg_msrs	= intel_rapl_msrs,
 };
 
 static struct rapl_model model_hsx = {
@@ -767,7 +864,7 @@  static struct rapl_model model_hsx = {
 			  BIT(PERF_RAPL_RAM),
 	.unit_quirk	= RAPL_UNIT_QUIRK_INTEL_HSW,
 	.msr_power_unit = MSR_RAPL_POWER_UNIT,
-	.rapl_msrs      = intel_rapl_msrs,
+	.rapl_pkg_msrs	= intel_rapl_msrs,
 };
 
 static struct rapl_model model_knl = {
@@ -775,7 +872,7 @@  static struct rapl_model model_knl = {
 			  BIT(PERF_RAPL_RAM),
 	.unit_quirk	= RAPL_UNIT_QUIRK_INTEL_HSW,
 	.msr_power_unit = MSR_RAPL_POWER_UNIT,
-	.rapl_msrs      = intel_rapl_msrs,
+	.rapl_pkg_msrs	= intel_rapl_msrs,
 };
 
 static struct rapl_model model_skl = {
@@ -785,7 +882,7 @@  static struct rapl_model model_skl = {
 			  BIT(PERF_RAPL_PP1) |
 			  BIT(PERF_RAPL_PSYS),
 	.msr_power_unit = MSR_RAPL_POWER_UNIT,
-	.rapl_msrs      = intel_rapl_msrs,
+	.rapl_pkg_msrs	= intel_rapl_msrs,
 };
 
 static struct rapl_model model_spr = {
@@ -795,13 +892,15 @@  static struct rapl_model model_spr = {
 			  BIT(PERF_RAPL_PSYS),
 	.unit_quirk	= RAPL_UNIT_QUIRK_INTEL_SPR,
 	.msr_power_unit = MSR_RAPL_POWER_UNIT,
-	.rapl_msrs      = intel_rapl_spr_msrs,
+	.rapl_pkg_msrs	= intel_rapl_spr_msrs,
 };
 
 static struct rapl_model model_amd_hygon = {
 	.pkg_events	= BIT(PERF_RAPL_PKG),
+	.core_events	= BIT(PERF_RAPL_PER_CORE),
 	.msr_power_unit = MSR_AMD_RAPL_POWER_UNIT,
-	.rapl_msrs      = amd_rapl_pkg_msrs,
+	.rapl_pkg_msrs	= amd_rapl_pkg_msrs,
+	.rapl_core_msrs	= amd_rapl_core_msrs,
 };
 
 static const struct x86_cpu_id rapl_model_match[] __initconst = {
@@ -858,6 +957,11 @@  static int __init rapl_pmu_init(void)
 {
 	const struct x86_cpu_id *id;
 	int ret;
+	int nr_rapl_pmu = topology_max_packages() * topology_max_dies_per_package();
+	int nr_cores = topology_max_packages() * topology_num_cores_per_package();
+
+	if (rapl_pmu_is_pkg_scope())
+		nr_rapl_pmu = topology_max_packages();
 
 	id = x86_match_cpu(rapl_model_match);
 	if (!id)
@@ -865,17 +969,34 @@  static int __init rapl_pmu_init(void)
 
 	rapl_model = (struct rapl_model *) id->driver_data;
 
-	rapl_pkg_cntr_mask = perf_msr_probe(rapl_model->rapl_msrs, PERF_RAPL_PKG_EVENTS_MAX,
+	rapl_pkg_cntr_mask = perf_msr_probe(rapl_model->rapl_pkg_msrs, PERF_RAPL_PKG_EVENTS_MAX,
 					false, (void *) &rapl_model->pkg_events);
 
 	ret = rapl_check_hw_unit();
 	if (ret)
 		return ret;
 
-	ret = init_rapl_pmus(&rapl_pmus_pkg);
+	ret = init_rapl_pmus(&rapl_pmus_pkg, nr_rapl_pmu, rapl_attr_groups, rapl_attr_update);
 	if (ret)
 		return ret;
 
+	if (rapl_model->core_events) {
+		rapl_core_cntr_mask = perf_msr_probe(rapl_model->rapl_core_msrs,
+						     PERF_RAPL_CORE_EVENTS_MAX, false,
+						     (void *) &rapl_model->core_events);
+
+		ret = init_rapl_pmus(&rapl_pmus_core, nr_cores,
+				     rapl_per_core_attr_groups, rapl_per_core_attr_update);
+		if (ret) {
+			/*
+			 * If initialization of per_core PMU fails, reset per_core
+			 * flag, and continue with power PMU initialization.
+			 */
+			pr_warn("Per-core PMU initialization failed (%d)\n", ret);
+			rapl_model->core_events = 0UL;
+		}
+	}
+
 	/*
 	 * Install callbacks. Core will call them for each online cpu.
 	 */
@@ -889,6 +1010,20 @@  static int __init rapl_pmu_init(void)
 	if (ret)
 		goto out1;
 
+	if (rapl_model->core_events) {
+		ret = perf_pmu_register(&rapl_pmus_core->pmu, "power_per_core", -1);
+		if (ret) {
+			/*
+			 * If registration of per_core PMU fails, cleanup per_core PMU
+			 * variables, reset the per_core flag and keep the
+			 * power PMU untouched.
+			 */
+			pr_warn("Per-core PMU registration failed (%d)\n", ret);
+			cleanup_rapl_pmus(rapl_pmus_core);
+			rapl_model->core_events = 0UL;
+		}
+	}
+
 	rapl_advertise();
 	return 0;
 
@@ -906,5 +1041,9 @@  static void __exit intel_rapl_exit(void)
 	cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE);
 	perf_pmu_unregister(&rapl_pmus_pkg->pmu);
 	cleanup_rapl_pmus(rapl_pmus_pkg);
+	if (rapl_model->core_events) {
+		perf_pmu_unregister(&rapl_pmus_core->pmu);
+		cleanup_rapl_pmus(rapl_pmus_core);
+	}
 }
 module_exit(intel_rapl_exit);