Message ID | 20200519155011.56184-1-nchatrad@amd.com (mailing list archive)
---|---
State | Accepted
Series | [1/3,v7] hwmon: Add amd_energy driver to report energy counters
On 5/19/20 8:50 AM, Naveen Krishna Chatradhi wrote: > This patch adds hwmon based amd_energy driver support for > family 17h processors from AMD. > > The driver provides following interface to the userspace > 1. Reports the per core consumption > * file: "energy%d_input", label: "Ecore%03d" > 2. Reports per socket energy consumption > * file: "energy%d_input", label: "Esocket%d" > 3. To, increase the wrap around time of the socket energy > counters, a 64bit accumultor is implemented. > 4. Reports scaled energy value in Joules. > > Cc: Guenter Roeck <linux@roeck-us.net> > Signed-off-by: Naveen Krishna Chatradhi <nchatrad@amd.com> > --- This version looks ok, and it looks like there are no more unexpected counter wraps either. Series applied to hwmon-next. Thanks, Guenter > Changes in v7: > 1. Move all static variables into the data structure > 2. Move the read-update code under the mutex lock > > drivers/hwmon/Kconfig | 10 + > drivers/hwmon/Makefile | 1 + > drivers/hwmon/amd_energy.c | 406 +++++++++++++++++++++++++++++++++++++ > 3 files changed, 417 insertions(+) > create mode 100644 drivers/hwmon/amd_energy.c > > diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig > index 4c62f900bf7e..e165e10c49ef 100644 > --- a/drivers/hwmon/Kconfig > +++ b/drivers/hwmon/Kconfig > @@ -324,6 +324,16 @@ config SENSORS_FAM15H_POWER > This driver can also be built as a module. If so, the module > will be called fam15h_power. > > +config SENSORS_AMD_ENERGY > + tristate "AMD RAPL MSR based Energy driver" > + depends on X86 > + help > + If you say yes here you get support for core and package energy > + sensors, based on RAPL MSR for AMD family 17h and above CPUs. > + > + This driver can also be built as a module. If so, the module > + will be called as amd_energy. > + > config SENSORS_APPLESMC > tristate "Apple SMC (Motion sensor, light sensor, keyboard backlight)" > depends on INPUT && X86 > diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile > index b0b9c8e57176..318f89dc7133 100644 > --- a/drivers/hwmon/Makefile > +++ b/drivers/hwmon/Makefile > @@ -45,6 +45,7 @@ obj-$(CONFIG_SENSORS_ADT7411) += adt7411.o > obj-$(CONFIG_SENSORS_ADT7462) += adt7462.o > obj-$(CONFIG_SENSORS_ADT7470) += adt7470.o > obj-$(CONFIG_SENSORS_ADT7475) += adt7475.o > +obj-$(CONFIG_SENSORS_AMD_ENERGY) += amd_energy.o > obj-$(CONFIG_SENSORS_APPLESMC) += applesmc.o > obj-$(CONFIG_SENSORS_ARM_SCMI) += scmi-hwmon.o > obj-$(CONFIG_SENSORS_ARM_SCPI) += scpi-hwmon.o > diff --git a/drivers/hwmon/amd_energy.c b/drivers/hwmon/amd_energy.c > new file mode 100644 > index 000000000000..bc8b643a37d5 > --- /dev/null > +++ b/drivers/hwmon/amd_energy.c > @@ -0,0 +1,406 @@ > +// SPDX-License-Identifier: GPL-2.0-only > + > +/* > + * Copyright (C) 2020 Advanced Micro Devices, Inc. 
> + */ > +#include <asm/cpu_device_id.h> > + > +#include <linux/bits.h> > +#include <linux/cpu.h> > +#include <linux/cpumask.h> > +#include <linux/delay.h> > +#include <linux/device.h> > +#include <linux/hwmon.h> > +#include <linux/kernel.h> > +#include <linux/kthread.h> > +#include <linux/list.h> > +#include <linux/module.h> > +#include <linux/mutex.h> > +#include <linux/processor.h> > +#include <linux/platform_device.h> > +#include <linux/sched.h> > +#include <linux/slab.h> > +#include <linux/topology.h> > +#include <linux/types.h> > + > +#define DRVNAME "amd_energy" > + > +#define ENERGY_PWR_UNIT_MSR 0xC0010299 > +#define ENERGY_CORE_MSR 0xC001029A > +#define ENERGY_PKG_MSR 0xC001029B > + > +#define AMD_ENERGY_UNIT_MASK 0x01F00 > +#define AMD_ENERGY_MASK 0xFFFFFFFF > + > +struct sensor_accumulator { > + u64 energy_ctr; > + u64 prev_value; > + char label[10]; > +}; > + > +struct amd_energy_data { > + struct hwmon_channel_info energy_info; > + const struct hwmon_channel_info *info[2]; > + struct hwmon_chip_info chip; > + struct task_struct *wrap_accumulate; > + /* Lock around the accumulator */ > + struct mutex lock; > + /* An accumulator for each core and socket */ > + struct sensor_accumulator *accums; > + /* Energy Status Units */ > + u64 energy_units; > + int nr_cpus; > + int nr_socks; > + int core_id; > +}; > + > +static int amd_energy_read_labels(struct device *dev, > + enum hwmon_sensor_types type, > + u32 attr, int channel, > + const char **str) > +{ > + struct amd_energy_data *data = dev_get_drvdata(dev); > + > + *str = data->accums[channel].label; > + return 0; > +} > + > +static void get_energy_units(struct amd_energy_data *data) > +{ > + u64 rapl_units; > + > + rdmsrl_safe(ENERGY_PWR_UNIT_MSR, &rapl_units); > + data->energy_units = (rapl_units & AMD_ENERGY_UNIT_MASK) >> 8; > +} > + > +static void accumulate_socket_delta(struct amd_energy_data *data, > + int sock, int cpu) > +{ > + struct sensor_accumulator *s_accum; > + u64 input; > + > + mutex_lock(&data->lock); > + rdmsrl_safe_on_cpu(cpu, ENERGY_PKG_MSR, &input); > + input &= AMD_ENERGY_MASK; > + > + s_accum = &data->accums[data->nr_cpus + sock]; > + if (input >= s_accum->prev_value) > + s_accum->energy_ctr += > + input - s_accum->prev_value; > + else > + s_accum->energy_ctr += UINT_MAX - > + s_accum->prev_value + input; > + > + s_accum->prev_value = input; > + mutex_unlock(&data->lock); > +} > + > +static void accumulate_core_delta(struct amd_energy_data *data) > +{ > + struct sensor_accumulator *c_accum; > + u64 input; > + int cpu; > + > + mutex_lock(&data->lock); > + if (data->core_id >= data->nr_cpus) > + data->core_id = 0; > + > + cpu = data->core_id; > + > + if (!cpu_online(cpu)) > + goto out; > + > + rdmsrl_safe_on_cpu(cpu, ENERGY_CORE_MSR, &input); > + input &= AMD_ENERGY_MASK; > + > + c_accum = &data->accums[cpu]; > + > + if (input >= c_accum->prev_value) > + c_accum->energy_ctr += > + input - c_accum->prev_value; > + else > + c_accum->energy_ctr += UINT_MAX - > + c_accum->prev_value + input; > + > + c_accum->prev_value = input; > + > +out: > + data->core_id++; > + mutex_unlock(&data->lock); > +} > + > +static void read_accumulate(struct amd_energy_data *data) > +{ > + int sock; > + > + for (sock = 0; sock < data->nr_socks; sock++) { > + int cpu; > + > + cpu = cpumask_first_and(cpu_online_mask, > + cpumask_of_node(sock)); > + > + accumulate_socket_delta(data, sock, cpu); > + } > + > + accumulate_core_delta(data); > +} > + > +static void amd_add_delta(struct amd_energy_data *data, int ch, > + int cpu, long *val, bool 
is_core) > +{ > + struct sensor_accumulator *s_accum, *c_accum; > + u64 input; > + > + mutex_lock(&data->lock); > + if (!is_core) { > + rdmsrl_safe_on_cpu(cpu, ENERGY_PKG_MSR, &input); > + input &= AMD_ENERGY_MASK; > + > + s_accum = &data->accums[ch]; > + if (input >= s_accum->prev_value) > + input += s_accum->energy_ctr - > + s_accum->prev_value; > + else > + input += UINT_MAX - s_accum->prev_value + > + s_accum->energy_ctr; > + } else { > + rdmsrl_safe_on_cpu(cpu, ENERGY_CORE_MSR, &input); > + input &= AMD_ENERGY_MASK; > + > + c_accum = &data->accums[ch]; > + if (input >= c_accum->prev_value) > + input += c_accum->energy_ctr - > + c_accum->prev_value; > + else > + input += UINT_MAX - c_accum->prev_value + > + c_accum->energy_ctr; > + } > + > + /* Energy consumed = (1/(2^ESU) * RAW * 1000000UL) μJoules */ > + *val = div64_ul(input * 1000000UL, BIT(data->energy_units)); > + > + mutex_unlock(&data->lock); > +} > + > +static int amd_energy_read(struct device *dev, > + enum hwmon_sensor_types type, > + u32 attr, int channel, long *val) > +{ > + struct amd_energy_data *data = dev_get_drvdata(dev); > + int cpu; > + > + if (channel >= data->nr_cpus) { > + cpu = cpumask_first_and(cpu_online_mask, > + cpumask_of_node > + (channel - data->nr_cpus)); > + amd_add_delta(data, channel, cpu, val, false); > + } else { > + cpu = channel; > + if (!cpu_online(cpu)) > + return -ENODEV; > + > + amd_add_delta(data, channel, cpu, val, true); > + } > + > + return 0; > +} > + > +static umode_t amd_energy_is_visible(const void *_data, > + enum hwmon_sensor_types type, > + u32 attr, int channel) > +{ > + return 0444; > +} > + > +static int energy_accumulator(void *p) > +{ > + struct amd_energy_data *data = (struct amd_energy_data *)p; > + > + while (!kthread_should_stop()) { > + /* > + * Ignoring the conditions such as > + * cpu being offline or rdmsr failure > + */ > + read_accumulate(data); > + > + set_current_state(TASK_INTERRUPTIBLE); > + if (kthread_should_stop()) > + break; > + > + /* > + * On a 240W system, with default resolution the > + * Socket Energy status register may wrap around in > + * 2^32*15.3 e-6/240 = 273.8041 secs (~4.5 mins) > + * > + * let us accumulate for every 100secs > + */ > + schedule_timeout(msecs_to_jiffies(100000)); > + } > + return 0; > +} > + > +static const struct hwmon_ops amd_energy_ops = { > + .is_visible = amd_energy_is_visible, > + .read = amd_energy_read, > + .read_string = amd_energy_read_labels, > +}; > + > +static int amd_create_sensor(struct device *dev, > + struct amd_energy_data *data, > + u8 type, u32 config) > +{ > + struct hwmon_channel_info *info = &data->energy_info; > + struct sensor_accumulator *accums; > + int i, num_siblings, cpus, sockets; > + u32 *s_config; > + > + /* Identify the number of siblings per core */ > + num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1; > + > + sockets = num_possible_nodes(); > + > + /* > + * Energy counter register is accessed at core level. > + * Hence, filterout the siblings. 
> + */ > + cpus = num_present_cpus() / num_siblings; > + > + s_config = devm_kcalloc(dev, cpus + sockets, > + sizeof(u32), GFP_KERNEL); > + if (!s_config) > + return -ENOMEM; > + > + accums = devm_kcalloc(dev, cpus + sockets, > + sizeof(struct sensor_accumulator), > + GFP_KERNEL); > + if (!accums) > + return -ENOMEM; > + > + info->type = type; > + info->config = s_config; > + > + data->nr_cpus = cpus; > + data->nr_socks = sockets; > + data->accums = accums; > + > + for (i = 0; i < cpus + sockets; i++) { > + s_config[i] = config; > + if (i < cpus) > + scnprintf(accums[i].label, 10, > + "Ecore%03u", i); > + else > + scnprintf(accums[i].label, 10, > + "Esocket%u", (i - cpus)); > + } > + > + return 0; > +} > + > +static int amd_energy_probe(struct platform_device *pdev) > +{ > + struct device *hwmon_dev; > + struct amd_energy_data *data; > + struct device *dev = &pdev->dev; > + > + data = devm_kzalloc(dev, > + sizeof(struct amd_energy_data), GFP_KERNEL); > + if (!data) > + return -ENOMEM; > + > + data->chip.ops = &amd_energy_ops; > + data->chip.info = data->info; > + > + dev_set_drvdata(dev, data); > + /* Populate per-core energy reporting */ > + data->info[0] = &data->energy_info; > + amd_create_sensor(dev, data, hwmon_energy, > + HWMON_E_INPUT | HWMON_E_LABEL); > + > + mutex_init(&data->lock); > + get_energy_units(data); > + > + hwmon_dev = devm_hwmon_device_register_with_info(dev, DRVNAME, > + data, > + &data->chip, > + NULL); > + if (IS_ERR(hwmon_dev)) > + return PTR_ERR(hwmon_dev); > + > + data->wrap_accumulate = kthread_run(energy_accumulator, data, > + "%s", dev_name(hwmon_dev)); > + if (IS_ERR(data->wrap_accumulate)) > + return PTR_ERR(data->wrap_accumulate); > + > + return PTR_ERR_OR_ZERO(data->wrap_accumulate); > +} > + > +static int amd_energy_remove(struct platform_device *pdev) > +{ > + struct amd_energy_data *data = dev_get_drvdata(&pdev->dev); > + > + if (data && data->wrap_accumulate) > + kthread_stop(data->wrap_accumulate); > + > + return 0; > +} > + > +static const struct platform_device_id amd_energy_ids[] = { > + { .name = DRVNAME, }, > + {} > +}; > +MODULE_DEVICE_TABLE(platform, amd_energy_ids); > + > +static struct platform_driver amd_energy_driver = { > + .probe = amd_energy_probe, > + .remove = amd_energy_remove, > + .id_table = amd_energy_ids, > + .driver = { > + .name = DRVNAME, > + }, > +}; > + > +static struct platform_device *amd_energy_platdev; > + > +static const struct x86_cpu_id cpu_ids[] __initconst = { > + X86_MATCH_VENDOR_FAM(AMD, 0x17, NULL), > + {} > +}; > +MODULE_DEVICE_TABLE(x86cpu, cpu_ids); > + > +static int __init amd_energy_init(void) > +{ > + int ret; > + > + if (!x86_match_cpu(cpu_ids)) > + return -ENODEV; > + > + ret = platform_driver_register(&amd_energy_driver); > + if (ret) > + return ret; > + > + amd_energy_platdev = platform_device_alloc(DRVNAME, 0); > + if (!amd_energy_platdev) > + return -ENOMEM; > + > + ret = platform_device_add(amd_energy_platdev); > + if (ret) { > + platform_device_put(amd_energy_platdev); > + platform_driver_unregister(&amd_energy_driver); > + return ret; > + } > + > + return ret; > +} > + > +static void __exit amd_energy_exit(void) > +{ > + platform_device_unregister(amd_energy_platdev); > + platform_driver_unregister(&amd_energy_driver); > +} > + > +module_init(amd_energy_init); > +module_exit(amd_energy_exit); > + > +MODULE_DESCRIPTION("Driver for AMD Energy reporting from RAPL MSR via HWMON interface"); > +MODULE_AUTHOR("Naveen Krishna Chatradhi <nchatrad@amd.com>"); > +MODULE_LICENSE("GPL"); >
Hi Guenter,

On Fri, 22 May 2020 at 18:56, Guenter Roeck <linux@roeck-us.net> wrote:
>
> On 5/19/20 8:50 AM, Naveen Krishna Chatradhi wrote:
> > This patch adds hwmon based amd_energy driver support for
> > family 17h processors from AMD.
> [snip]
>
> This version looks ok, and it looks like there are no more
> unexpected counter wraps either. Series applied to hwmon-next.

Thank you for your guidance and support.

> Thanks,
> Guenter
> [remainder of quoted patch snipped]
On Fri, 22 May 2020, Naveen Krishna Ch wrote:

> > This version looks ok, and it looks like there are no more
> > unexpected counter wraps either. Series applied to hwmon-next.
>
> Thank you for your guidance and support.

Thank you, looking forward to using this driver. Meanwhile, I have a couple
of questions about the AMD RAPL counters.

The documentation says the PKG_ENERGY_STAT MSR is instantiated per CCD and
L3 complex, implying that energy is accumulated for the 3 or 4 cores in an
L3 complex. However, your patch reads that MSR per socket, and from testing
on my Ryzen 4500U CPU I can confirm that the MSR gives the same value when
accessed from each core. Therefore I think your code is correct and the
documentation is wrong; can you confirm?

Next, I'm seeing paradoxical results when attempting to test the accuracy of
the counters. When running an AVX workload on 6 cores, I see the
CORE_ENERGY_STAT MSR reporting values just above 9 Watts per core, 55 Watts
in total, and yet PKG_ENERGY_STAT reporting only 30 Watts. This is clearly a
contradiction, since package energy must be at least the sum of the core
energy readings. Furthermore, this is happening on an 18 W CPU in a laptop,
which definitely cannot sustain 55 Watts, and even 30 Watts seems too high.

Can you clarify how the counters work and under what conditions they give
accurate readings?

Thanks.
Alexander
Hi Alexander,

Thanks for using the driver.

On Wed, 27 May 2020 at 04:07, Alexander Monakov <amonakov@ispras.ru> wrote:
>
> On Fri, 22 May 2020, Naveen Krishna Ch wrote:
>
> > > This version looks ok, and it looks like there are no more
> > > unexpected counter wraps either. Series applied to hwmon-next.
> >
> > Thank you for your guidance and support.
>
> Thank you, looking forward to using this driver. Meanwhile, I have a couple
> of questions about the AMD RAPL counters.
>
> The documentation says the PKG_ENERGY_STAT MSR is instantiated per CCD and
> L3 complex, implying that energy is accumulated for the 3 or 4 cores in an
> L3 complex. However, your patch reads that MSR per socket, and from testing
> on my Ryzen 4500U CPU I can confirm that the MSR gives the same value when
> accessed from each core. Therefore I think your code is correct and the
> documentation is wrong; can you confirm?

I confirmed this with the hardware folks: the definition of "package" has
changed with the chiplet design. Maybe the documentation needs an update.

> Next, I'm seeing paradoxical results when attempting to test the accuracy of
> the counters. When running an AVX workload on 6 cores, I see the
> CORE_ENERGY_STAT MSR reporting values just above 9 Watts per core, 55 Watts
> in total, and yet PKG_ENERGY_STAT reporting only 30 Watts. This is clearly a
> contradiction, since package energy must be at least the sum of the core
> energy readings. Furthermore, this is happening on an 18 W CPU in a laptop,
> which definitely cannot sustain 55 Watts, and even 30 Watts seems too high.
>
> Can you clarify how the counters work and under what conditions they give
> accurate readings?

These registers are 32-bit counters, so they may wrap around quite quickly
under heavy workloads. We therefore use a kernel thread to accumulate the
per-core and per-socket values into 64-bit values.

Depending on when the module is inserted, the initial values of the counters
can differ, and we have no way to know how many times the registers wrapped
around in the past. The difference in energy over time is the best way to use
this information. For example:

  at t1 = 0,        read the register before starting the job, say E1;
  at t2 = 300 secs, read the register after the job completes, say E2;
  power consumed = (E2 - E1) / (t2 - t1), in Watts.

In our evaluation, the sum of the energy consumption of the cores of a socket
was always less (actually far less) than the socket energy consumption.

One other possibility is that the enumeration of the cores of a package on
desktop/laptop systems needs different handling in the read path. I will
check this.

> Thanks.
> Alexander
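[Editor's note: for illustration only, here is a minimal userspace sketch of the
delta-over-time recipe described above. It samples one of the driver's
energy%d_input files twice and derives the average power; the hwmon device
index and channel number in the path are placeholders, not values from this
thread — the correct device is the one whose "name" attribute reads
"amd_energy", and the readings are in microjoules per the hwmon ABI.]

```c
/*
 * Sketch: average power from two reads of an amd_energy hwmon channel.
 * The sysfs path below is a placeholder; find the real device by checking
 * which /sys/class/hwmon/hwmon* has a "name" attribute of "amd_energy".
 */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

static unsigned long long read_uj(const char *path)
{
	FILE *f = fopen(path, "r");
	unsigned long long v;

	if (!f) {
		perror(path);
		exit(1);
	}
	if (fscanf(f, "%llu", &v) != 1) {
		fprintf(stderr, "failed to parse %s\n", path);
		exit(1);
	}
	fclose(f);
	return v;
}

int main(void)
{
	/* Placeholder device/channel; readings are in microjoules. */
	const char *path = "/sys/class/hwmon/hwmon3/energy1_input";
	unsigned int secs = 300;

	unsigned long long e1 = read_uj(path);	/* E1 at t1 */
	sleep(secs);
	unsigned long long e2 = read_uj(path);	/* E2 at t2 = t1 + secs */

	/* power = (E2 - E1) / (t2 - t1), converting microjoules to joules */
	printf("average power: %.6f W\n", (double)(e2 - e1) / 1e6 / secs);
	return 0;
}
```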
On Wed, 27 May 2020, Naveen Krishna Ch wrote:

> These registers are 32-bit counters, so they may wrap around quite quickly
> under heavy workloads. We therefore use a kernel thread to accumulate the
> per-core and per-socket values into 64-bit values.
>
> Depending on when the module is inserted, the initial values of the counters
> can differ, and we have no way to know how many times the registers wrapped
> around in the past.

I understand that. If you anticipate that the module may be inserted after a
wraparound, the driver should populate 'prev_value' with the actual counter
values instead of zeros. That way the driver will properly accumulate energy
over the time it has been inserted. As implemented, the driver counts energy
since boot time, minus an unknown amount lost to wraparounds if the driver was
loaded too late.

In my case I observed the contradictory readings over a period of several
seconds where no wraparound was possible.

> In our evaluation, the sum of the energy consumption of the cores of a socket
> was always less (actually far less) than the socket energy consumption.

Did you try on laptop CPUs (Renoir SoC, Ryzen 4x00U marketing name)? You also
might need specific workloads to observe the issue; I first found it with a
small hand-written test, and then found a bigger discrepancy with the AVX test
from https://github.com/travisdowns/avx-turbo

Alexander
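[Editor's note: a rough sketch of what this proposal could look like, reusing
the structures, constants and MSR helpers from the patch above. The function
name and the idea of calling it once from amd_energy_probe() before the
accumulating kthread starts are assumptions, not part of the applied driver.]

```c
/*
 * Sketch only: seed each accumulator's prev_value with the current MSR
 * reading at probe time, so the driver accumulates energy from module
 * load onward instead of implicitly counting from zero.
 */
static void amd_energy_seed_prev_values(struct amd_energy_data *data)
{
	u64 input;
	int i, cpu;

	/* Per-core counters: channel i corresponds to CPU i in this driver */
	for (i = 0; i < data->nr_cpus; i++) {
		if (!cpu_online(i))
			continue;
		rdmsrl_safe_on_cpu(i, ENERGY_CORE_MSR, &input);
		data->accums[i].prev_value = input & AMD_ENERGY_MASK;
	}

	/* Per-socket counters follow the core entries in data->accums */
	for (i = 0; i < data->nr_socks; i++) {
		cpu = cpumask_first_and(cpu_online_mask, cpumask_of_node(i));
		rdmsrl_safe_on_cpu(cpu, ENERGY_PKG_MSR, &input);
		data->accums[data->nr_cpus + i].prev_value =
						input & AMD_ENERGY_MASK;
	}
}
```

With this in place, unloading and reloading the module would measure energy
over the time the driver is loaded, as Alexander describes.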
Hi Alexander,

On Wed, 27 May 2020 at 12:29, Alexander Monakov <amonakov@ispras.ru> wrote:
> [snip]
>
> Did you try on laptop CPUs (Renoir SoC, Ryzen 4x00U marketing name)? You also
> might need specific workloads to observe the issue; I first found it with a
> small hand-written test, and then found a bigger discrepancy with the AVX test
> from https://github.com/travisdowns/avx-turbo

I tried on an octa-core machine, which must be Renoir:

vendor_id  : AuthenticAMD
cpu family : 23
model      : 96

On an idle system, over a period of 500 secs:

         | At t=500 sec | At t=0      | Diff of energy | In Joules  | Power in Watts
core 0   | 650186538    | 649712585   | 473953         | 0.473953   | 0.000947906
core 1   | 507792434    | 507131301   | 661133         | 0.661133   | 0.001322266
core 2   | 455706497    | 455163970   | 542527         | 0.542527   | 0.001085054
core 3   | 392240356    | 391740417   | 499939         | 0.499939   | 0.000999878
core 4   | 411461654    | 410687881   | 773773         | 0.773773   | 0.001547546
core 5   | 288821884    | 288071395   | 750489         | 0.750489   | 0.001500978
core 6   | 186975219    | 186250793   | 724426         | 0.724426   | 0.001448852
core 7   | 131509216    | 130458816   | 1050400        | 1.0504     | 0.0021008
Socket 0 | 31638431930  | 29370733505 | 2267698425     | 2267.698   | 4.53539685

Power consumed by socket: 4.53539685 W
Sum of power consumed by cores: 0.010953 W

On a system with the AVX test running, over a period of 500 secs:

         | At t=500 sec | At t=0      | Diff of energy | In Joules  | Power in Watts
core 0   | 649348495    | 413687530   | 235660965      | 235.660965 | 0.47132193
core 1   | 506880081    | 294882827   | 211997254      | 211.997254 | 0.423994508
core 2   | 454804046    | 271046127   | 183757919      | 183.757919 | 0.367515838
core 3   | 391508712    | 237531021   | 153977691      | 153.977691 | 0.307955382
core 4   | 410336868    | 284410079   | 125926789      | 125.926789 | 0.251853578
core 5   | 287569732    | 192306015   | 95263717       | 95.263717  | 0.190527434
core 6   | 185909622    | 120556152   | 65353470       | 65.35347   | 0.13070694
core 7   | 129932006    | 95385940    | 34546066       | 34.546066  | 0.069092132
Socket 0 | 28399099655  | 24799819931 | 3599279724     | 3599.27972 | 7.198559448

Power consumed by socket: 7.198559448 W
Sum of power consumed by cores: 2.212968 W

Can you confirm this?
On 5/27/20 3:35 AM, Naveen Krishna Ch wrote: > Hi Alexander > > On Wed, 27 May 2020 at 12:29, Alexander Monakov <amonakov@ispras.ru> wrote: >> >> On Wed, 27 May 2020, Naveen Krishna Ch wrote: >> >>> These registers are 32bit counters, they might wrap-around quite faster at >>> high work loads. So, we used a kernel thread to accumulate the values of >>> each core and socket to 64bit values. >>> >>> Depending on when the module is inserted in the system, the initial values >>> of the counters could be different and we do not have a way to know, how >>> many time the registers are wrapped around in the past. >> >> I understand that. If you anticipate that the module may be inserted after a >> wraparound, the driver should populate 'prev_value' with actual counter >> values instead of zeros. That way the driver will properly accumulate >> energy over time it's been inserted. As implemented, the driver counts >> energy since boot time, minus unknown amount lost to wraparounds if the >> driver was loaded too late. > No problem if this module is built into the kernel. > If this module is inserted at later point, unless the user keeps the > counters since > the boot and provide it as an input during the module insert (we can > implement this). I won't accept such a parameter. If you may recall, I did bring this up as reason why I abandoned the matching change for the coretemp driver, predicting that people would complain about it. Now they do. Not surprising. We (and you) will have to live with it. Guenter
On Wed, 27 May 2020, Guenter Roeck wrote: > >> I understand that. If you anticipate that the module may be inserted after a > >> wraparound, the driver should populate 'prev_value' with actual counter > >> values instead of zeros. That way the driver will properly accumulate > >> energy over time it's been inserted. As implemented, the driver counts > >> energy since boot time, minus unknown amount lost to wraparounds if the > >> driver was loaded too late. > > No problem if this module is built into the kernel. > > If this module is inserted at later point, unless the user keeps the > > counters since > > the boot and provide it as an input during the module insert (we can > > implement this). > > I won't accept such a parameter. If you may recall, I did bring this up as > reason why I abandoned the matching change for the coretemp driver, predicting > that people would complain about it. Now they do. Not surprising. We (and you) > will have to live with it. I'm not exactly complaining, I'm proposing a solution: at probe time, populate prev_value members with MSR values instead of zeros. That way, the module will correctly count energy over the time it's been loaded. It can be unloaded and reloaded freely, and doing so would allow to easily measure energy across large spans of time, which sounds like an improvement. I can try to cook a corresponding patch if that sounds alright. Cheers. Alexander
Hi Alexander,

On Wed, 27 May 2020 at 19:38, Alexander Monakov <amonakov@ispras.ru> wrote:
>
> On Wed, 27 May 2020, Guenter Roeck wrote:
>
> [snip]
>
> > I won't accept such a parameter. If you may recall, I did bring this up as
> > reason why I abandoned the matching change for the coretemp driver, predicting
> > that people would complain about it. Now they do. Not surprising. We (and you)
> > will have to live with it.

Yes, agreed: letting the user provide initial values is not appropriate. My bad.

> I'm not exactly complaining, I'm proposing a solution: at probe time, populate
> prev_value members with MSR values instead of zeros. That way, the module will
> correctly count energy over the time it's been loaded. It can be unloaded and
> reloaded freely, and doing so would allow to easily measure energy across large
> spans of time, which sounds like an improvement.

In the current driver, the accumulation thread starts during probe, and the
prev_value of the sockets and of core 0 is updated with the current MSR value.
It then takes (nr_cpus - 1) iterations of the thread to populate prev_value
for the other cores.

By populating prev_value for all the cores during probe, we can certainly save
some information (values within one wrap-around at best). If this information
is useful, I can submit a patch to do so. Guenter, could you advise us on this?

> I can try to cook a corresponding patch if that sounds alright.
>
> Cheers.
> Alexander
On Wed, May 27, 2020 at 05:08:55PM +0300, Alexander Monakov wrote: > On Wed, 27 May 2020, Guenter Roeck wrote: > > > >> I understand that. If you anticipate that the module may be inserted after a > > >> wraparound, the driver should populate 'prev_value' with actual counter > > >> values instead of zeros. That way the driver will properly accumulate > > >> energy over time it's been inserted. As implemented, the driver counts > > >> energy since boot time, minus unknown amount lost to wraparounds if the > > >> driver was loaded too late. > > > No problem if this module is built into the kernel. > > > If this module is inserted at later point, unless the user keeps the > > > counters since > > > the boot and provide it as an input during the module insert (we can > > > implement this). > > > > I won't accept such a parameter. If you may recall, I did bring this up as > > reason why I abandoned the matching change for the coretemp driver, predicting > > that people would complain about it. Now they do. Not surprising. We (and you) > > will have to live with it. > > I'm not exactly complaining, I'm proposing a solution: at probe time, populate > prev_value members with MSR values instead of zeros. That way, the module will > correctly count energy over the time it's been loaded. It can be unloaded and > reloaded freely, and doing so would allow to easily measure energy across large > spans of time, which sounds like an improvement. > That would ignore energy accumulated from before the driver was loaded, and would thus trigger another set of complaints. A slight improvement might be to add up core energy counters when loading the driver, compare it against the package counter, and pick the larger value for the initial package counter(s). This would at least ensure that the package counter is never less than the sum of the core counters. Guenter > I can try to cook a corresponding patch if that sounds alright. > > Cheers. > Alexander
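[Editor's note: a hedged sketch of the idea above, again borrowing the
structures from the patch and relying on the driver's existing assumption that
CPUs 0..nr_cpus-1 are the first threads of each core. How the max() result is
folded into energy_ctr/prev_value here is one possible interpretation, not an
applied change.]

```c
/*
 * Sketch only: at probe, start the exposed package energy at
 * max(package MSR, sum of the socket's core MSRs), so the package
 * reading can never start out below the sum of the core readings.
 */
static void amd_energy_seed_socket(struct amd_energy_data *data, int sock)
{
	u64 pkg, core, core_sum = 0;
	int cpu;

	for_each_cpu(cpu, cpumask_of_node(sock)) {
		if (cpu >= data->nr_cpus || !cpu_online(cpu))
			continue;	/* skip SMT siblings and offline CPUs */
		rdmsrl_safe_on_cpu(cpu, ENERGY_CORE_MSR, &core);
		core_sum += core & AMD_ENERGY_MASK;
	}

	cpu = cpumask_first_and(cpu_online_mask, cpumask_of_node(sock));
	rdmsrl_safe_on_cpu(cpu, ENERGY_PKG_MSR, &pkg);
	pkg &= AMD_ENERGY_MASK;

	/* amd_add_delta() reports input + energy_ctr - prev_value */
	data->accums[data->nr_cpus + sock].prev_value = pkg;
	data->accums[data->nr_cpus + sock].energy_ctr = max(core_sum, pkg);
}
```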
On Wed, 27 May 2020, Guenter Roeck wrote:

> > I'm not exactly complaining, I'm proposing a solution: at probe time, populate
> > prev_value members with MSR values instead of zeros. That way, the module will
> > correctly count energy over the time it's been loaded. It can be unloaded and
> > reloaded freely, and doing so would allow to easily measure energy across large
> > spans of time, which sounds like an improvement.
>
> That would ignore energy accumulated from before the driver was loaded, and
> would thus trigger another set of complaints.

That doesn't sound right. There's no way for the driver to be sure that the
counters did not wrap around before it was loaded. Here are a few scenarios in
which such a wraparound is possible:

- while the user was messing in the bootloader for a few minutes
- if the user kexec'd a new kernel
- if the counters were not reset during a warm reboot

Ignoring the initial values of the counters is generally the right thing to do.
In the specific circumstance where the user wants to measure energy used since
machine power-up, and they know the boot happened so quickly that the counters
did not wrap around, they can easily script that with e.g. the rdmsr tool. Or
perhaps the driver could pr_info the initial values at probe time.

Have such complaints already appeared in practice?

Also note that the documentation doesn't promise that the counters start from
zero at power-up time, although that's of course a natural assumption.

> A slight improvement might be to add up core energy counters when loading
> the driver, compare it against the package counter, and pick the larger
> value for the initial package counter(s). This would at least ensure that
> the package counter is never less than the sum of the core counters.

No, fudging the initial reading like this wouldn't help, because I was pointing
out how the core counters increment more quickly than the package counter;
i.e. even if the kernel fudged the initial values, they would still grow
contradictory quickly enough (on some workloads).

Alexander
On 5/27/20 7:42 AM, Naveen Krishna Ch wrote: > Hi Alexander > > On Wed, 27 May 2020 at 19:38, Alexander Monakov <amonakov@ispras.ru> wrote: >> >> On Wed, 27 May 2020, Guenter Roeck wrote: >> >>>>> I understand that. If you anticipate that the module may be inserted after a >>>>> wraparound, the driver should populate 'prev_value' with actual counter >>>>> values instead of zeros. That way the driver will properly accumulate >>>>> energy over time it's been inserted. As implemented, the driver counts >>>>> energy since boot time, minus unknown amount lost to wraparounds if the >>>>> driver was loaded too late. >>>> No problem if this module is built into the kernel. >>>> If this module is inserted at later point, unless the user keeps the >>>> counters since >>>> the boot and provide it as an input during the module insert (we can >>>> implement this). >>> >>> I won't accept such a parameter. If you may recall, I did bring this up as >>> reason why I abandoned the matching change for the coretemp driver, predicting >>> that people would complain about it. Now they do. Not surprising. We (and you) >>> will have to live with it. > Yes agree, letting user provide initial values is not appropriate. My bad. > >> >> I'm not exactly complaining, I'm proposing a solution: at probe time, populate >> prev_value members with MSR values instead of zeros. That way, the module will >> correctly count energy over the time it's been loaded. It can be unloaded and >> reloaded freely, and doing so would allow to easily measure energy across large >> spans of time, which sounds like an improvement. > > In the current driver, the accumulation thread starts during the probe and the > prev_value of sockets and the core0 is updated with the current MSR value. > It takes (nr_cpus - 1) iterations of the thread to populate the prev_value for > the other cores. > > By populating prev_value of all the cores during probe, we can > certainly save some > information (values with in one wrap-around at the best). If this > information is > useful, i can submit a patch to do so. Guenter, could you suggest us on this. > As long as you don't ignore counter values from before loading the driver, sure, go ahead. Guenter >> >> I can try to cook a corresponding patch if that sounds alright. >> >> Cheers. >> Alexander > > >
On Wed, 27 May 2020, Guenter Roeck wrote: > > In the current driver, the accumulation thread starts during the probe and the > > prev_value of sockets and the core0 is updated with the current MSR value. > > It takes (nr_cpus - 1) iterations of the thread to populate the prev_value for > > the other cores. > > > > By populating prev_value of all the cores during probe, we can > > certainly save some > > information (values with in one wrap-around at the best). If this > > information is > > useful, i can submit a patch to do so. Guenter, could you suggest us on this. > > > > As long as you don't ignore counter values from before loading the driver, > sure, go ahead. Hm? If I'm understanding correctly what Naveen is proposing, that would simply move reading the initial values ahead by a few hundred cycles (the latency to start the thread). It wouldn't change anything about the issue, and make the code more complicated :( Alexander
On 5/27/20 8:12 AM, Alexander Monakov wrote: > On Wed, 27 May 2020, Guenter Roeck wrote: > >>> I'm not exactly complaining, I'm proposing a solution: at probe time, populate >>> prev_value members with MSR values instead of zeros. That way, the module will >>> correctly count energy over the time it's been loaded. It can be unloaded and >>> reloaded freely, and doing so would allow to easily measure energy across large >>> spans of time, which sounds like an improvement. >>> >> That would ignore energy accumulated from before the driver was loaded, and >> would thus trigger another set of complaints. > > That doesn't sound right. There's no way for the driver to be sure that the > counters did not wrap around before it was loaded. Here's a few scenarios > how such wraparound is possible: > > - while the user was messing in the bootloader for a few minutes > - if the user kexec'd a new kernel > - if the counters were not reset during a warm reboot > > Ignoring initial values of the counters is generally the right thing to do. > In the specific circumstances when the user wants to measure energy used > since machine power-up, and they know the boot happened so quick the counters > did not wrap around, they can easily script that with e.g. the rdmsr tool. > Or perhaps the driver could pr_info the initial values at probe time. > > Have such complaints already appeared in practice? > The main argument in the coretemp case, if I recall correctly, was that the driver _doesn't_ provide valid data from the beginning of time, and would thus be worthless. So, yes, such complaints have already appeared in practice. > Also note that documentation doesn't promise that counters start from zero > at power-up time, although that's of course a natural assumption. > > >> A slight improvement might be to add up core energy counters when loading >> the driver, compare it against the package counter, and pick the larger >> value for the initial package counter(s). This would at least ensure that >> the package counter is never less than the sum of the core counters. > > No, fudging the initial reading like this wouldn't help, because I was > pointing out how core counters increment quicker than the package counter; > i.e. even if the kernel fudged the initial values, they would still grow > contradictory quick enough (on some workloads). > This exchange is exactly what I was concerned about when this driver was first submitted. I should have known better, and I should not have accepted it. Right now I seriously wonder if I should revert/drop it. Any arguments/thoughts why I _shouldn't_ do that ? Guenter
On 5/27/20 8:25 AM, Alexander Monakov wrote: > On Wed, 27 May 2020, Guenter Roeck wrote: > >>> In the current driver, the accumulation thread starts during the probe and the >>> prev_value of sockets and the core0 is updated with the current MSR value. >>> It takes (nr_cpus - 1) iterations of the thread to populate the prev_value for >>> the other cores. >>> >>> By populating prev_value of all the cores during probe, we can >>> certainly save some >>> information (values with in one wrap-around at the best). If this >>> information is >>> useful, i can submit a patch to do so. Guenter, could you suggest us on this. >>> >> >> As long as you don't ignore counter values from before loading the driver, >> sure, go ahead. > > Hm? If I'm understanding correctly what Naveen is proposing, that would simply > move reading the initial values ahead by a few hundred cycles (the latency to > start the thread). It wouldn't change anything about the issue, and make the > code more complicated :( > At this point the question is if I should drop this driver entirely. Guenter
On Wed, 27 May 2020, Guenter Roeck wrote: > This exchange is exactly what I was concerned about when this driver > was first submitted. I should have known better, and I should not > have accepted it. Right now I seriously wonder if I should revert/drop > it. Any arguments/thoughts why I _shouldn't_ do that ? Let me apologize and explain my perspective. These AMD MSRs have been previously wired up in the turbostat tool, and very recently another developer submitted a patch to wire up the package energy counter MSR for use with perf. Unlike the above, this driver is submitted by AMD. As I have noticed a substantial issue (sum of core counters contradicting the package counter), I have attempted to report it in this thread. Since AMD is submitting the code, I was hoping to get their attention to the issue, and ideally get some explanations about how the counters work and to what extent we can expect them to be accurate. I think most of the discussion about (not) ignoring initial counter values was in part caused by misunderstanding exactly what issue I was reporting. After all, it's not so important if the driver accurately captures boot-time energy use, if the counters are not trustworthy. I don't have an answer to your question (whether you should keep the driver). I hope you see where I'm coming from. I'm not quite aware of the history with coretemp driver, so if all this caused you extra headaches, I apologize for my part in the mess. Alexander
On 5/27/20 9:41 AM, Alexander Monakov wrote:
> On Wed, 27 May 2020, Guenter Roeck wrote:
>
>> This exchange is exactly what I was concerned about when this driver
>> was first submitted. I should have known better, and I should not
>> have accepted it. Right now I seriously wonder if I should revert/drop
>> it. Any arguments/thoughts why I _shouldn't_ do that ?
>
> [snip]
>
> I don't have an answer to your question (whether you should keep the
> driver). I hope you see where I'm coming from. I'm not quite aware of
> the history with coretemp driver, so if all this caused you extra
> headaches, I apologize for my part in the mess.
>
There are two opposing arguments:

- The driver cannot guarantee that there are no previous overflows, and thus
  it should always start counting from 0.
- The driver is, in all typical and most common situations, loaded when the
  system boots, when there have been no overflows. Therefore, it should
  include the startup counter values to provide accurate information for
  those most common use cases.

My prediction was that we would see endless arguments about this, with one set
of people arguing one way, another set of people arguing the other way, and
both being extremely passionate about it. You have already proven my point.

This is a perfect lose-lose situation, with me sitting in the middle. In such
situations my reaction tends to be to pull the plug.

Guenter
On Wed, 27 May 2020, Naveen Krishna Ch wrote:

> I tried on an octa-core machine, which must be Renoir:
>
> vendor_id  : AuthenticAMD
> cpu family : 23
> model      : 96

Yes, same here. The only difference is I'm on a 6-core 4500U.

> On an idle system, over a period of 500 secs:
[snip]
> Power consumed by socket: 4.53539685 W
> Sum of power consumed by cores: 0.010953 W

Yes, no problem for the idle readings. I'm getting 2 W/package on AC and close
to 0 W on battery.

> On a system with the AVX test running, over a period of 500 secs:
>
> core 0   | 649348495   | 413687530   | 235660965  | 235.660965 | 0.47132193
[snip]
> core 7   | 129932006   | 95385940    | 34546066   | 34.546066  | 0.069092132
> Socket 0 | 28399099655 | 24799819931 | 3599279724 | 3599.27972 | 7.198559448
>
> Power consumed by socket: 7.198559448 W
> Sum of power consumed by cores: 2.212968 W
>
> Can you confirm this?

No, this isn't right. The workload is not stressing all cores: you're reporting
235 J for core 0 and only 35 J for core 7.

If you are running the avx-turbo test, please run it like this:

  ./avx-turbo --iters 10000000 --spec avx256_fma_t/8

This will run AVX code on all 8 cores for about 1 second. To run it for longer,
increase the --iters argument.

Thanks.
Alexander
On Wed, 27 May 2020, Guenter Roeck wrote:

> My prediction was that we would see endless arguments about this,
> with one set of people arguing one way, another set of people
> arguing the other way, and both being extremely passionate
> about it. You have already proven my point.

Sorry :/

Personally I'd be okay with either approach. I even think it's possible
to find a middle ground by mentioning boot-time energy use with pr_info
or such at probe time, and then counting from zero. My main goal here is
to get answers about contradictory counters.

Cheers.
Alexander
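A minimal sketch of that middle-ground idea, reusing the accumulator layout
from the driver. The helper name and message text are made up for
illustration; this is not a posted patch:

/*
 * Hypothetical probe-time helper: report the boot-time reading once,
 * then start the accumulator from zero so the sysfs value reflects only
 * energy consumed since the driver loaded.
 */
static void amd_energy_seed_accum(struct sensor_accumulator *accum,
                                  u64 raw, u64 energy_units)
{
        u64 boot_uj = div64_ul(raw * 1000000UL, BIT(energy_units));

        pr_info(DRVNAME ": energy consumed before driver load: %llu uJ\n",
                boot_uj);

        accum->energy_ctr = 0;  /* count from zero from here on */
        accum->prev_value = raw;
}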
Hi Alexander,

On Wed, 27 May 2020 at 22:29, Alexander Monakov <amonakov@ispras.ru> wrote:
>
> On Wed, 27 May 2020, Naveen Krishna Ch wrote:
>
> > I tried on an octa core machine, must be Renoir
> > vendor_id  : AuthenticAMD
> > cpu family : 23
> > model      : 96
>
> Yes, same here. The only difference is I'm on a 6-core 4500U.
>
> > On an idle system over a period of 500 secs:
> [snip]
> > Power consumed by socket: 4.53539685 W
> > Sum of power consumed by cores: 0.010953 W
>
> Yes, no problem for idle readings. I'm getting 2 W/package on AC and
> close to 0 W on battery.
>
> > On a system with an AVX test running over a period of 500 secs:
> >
> >            At t=500 sec   At t=0         Diff of energy   in Joules     Power in Watts
> > core 0     649348495      413687530      235660965        235.660965    0.47132193
> > core 1     506880081      294882827      211997254        211.997254    0.423994508
> > core 2     454804046      271046127      183757919        183.757919    0.367515838
> > core 3     391508712      237531021      153977691        153.977691    0.307955382
> > core 4     410336868      284410079      125926789        125.926789    0.251853578
> > core 5     287569732      192306015      95263717         95.263717     0.190527434
> > core 6     185909622      120556152      65353470         65.35347      0.13070694
> > core 7     129932006      95385940       34546066         34.546066     0.069092132
> > Socket 0   28399099655    24799819931    3599279724       3599.279724   7.198559448
> >
> > Power consumed by socket: 7.198559448 W
> > Sum of power consumed by cores: 2.212968 W
> >
> > Can you confirm this.
>
> No, this isn't right. The workload is not stressing all cores: you're
> reporting 235 J for core 0 and only 35 J for core 7.
>
> If you are running the avx-turbo test, please run it like this:
>
>   ./avx-turbo --iters 10000000 --spec avx256_fma_t/8
>
> This will run AVX code on all 8 cores for about 1 second. To run it for
> longer, increase the --iters argument.

I ran with all cores loaded now and I see that the sum of the diffs of
energy consumed by all cores is greater than the diff of socket energy
consumed in 100 secs.

Kindly give me time. I will come back with some explanation or solution.

>
> Thanks.
> Alexander
On Thu, 28 May 2020, Naveen Krishna Ch wrote:

> > No, this isn't right. The workload is not stressing all cores: you're
> > reporting 235 J for core 0 and only 35 J for core 7.
> >
> > If you are running the avx-turbo test, please run it like this:
> >
> >   ./avx-turbo --iters 10000000 --spec avx256_fma_t/8
> >
> > This will run AVX code on all 8 cores for about 1 second. To run it for
> > longer, increase the --iters argument.
>
> I ran with all cores loaded now and I see that the sum of the diffs of
> energy consumed by all cores is greater than the diff of socket energy
> consumed in 100 secs.
>
> Kindly give me time. I will come back with some explanation or solution.

Hello. Is there any update on this issue?

Thanks.
Alexander
[AMD Public Use]

Hi Alexander,

The issue does not seem to be in the driver; it is in the register values
populated by the firmware. We have tested this on the server platforms and
it works fine. I've raised an issue internally to analyze and provide a
solution for client systems. I'm still waiting for their update.

Guenter,

This issue is seen on client machines with models ranging from 0x40 and
above. Should I submit a patch to support only the server-specific models
0x0 to 0x3f until we triage the issue for client systems?

Regards,
Naveen

-----Original Message-----
From: Alexander Monakov <amonakov@ispras.ru>
Sent: Thursday, June 11, 2020 1:52 AM
To: Naveen Krishna Ch <naveenkrishna.ch@gmail.com>
Cc: Guenter Roeck <linux@roeck-us.net>; Chatradhi, Naveen Krishna <NaveenKrishna.Chatradhi@amd.com>; linux-hwmon@vger.kernel.org
Subject: Re: [PATCH 1/3 v7] hwmon: Add amd_energy driver to report energy counters

[CAUTION: External Email]

On Thu, 28 May 2020, Naveen Krishna Ch wrote:

> > No, this isn't right. The workload is not stressing all cores,
> > you're reporting 235 J for core 0 and only 35 J for core 7.
> >
> > If you are running the avx-turbo test, please run it like this:
> >
> >   ./avx-turbo --iters 10000000 --spec avx256_fma_t/8
> >
> > This will run AVX code on all 8 cores for about 1 second. To run it
> > for longer, increase the --iters argument.
>
> I ran with all cores loaded now and I see that the sum of the diffs of
> energy consumed by all cores is greater than the diff of socket energy
> consumed in 100 secs.
>
> Kindly give me time. I will come back with some explanation or solution.

Hello. Is there any update on this issue?

Thanks.
Alexander
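A possible shape for such a model restriction, purely as a sketch; the
helper is hypothetical and is not the follow-up patch that was actually
submitted:

#include <asm/processor.h>

/*
 * Hypothetical check: accept only family 17h models 0x00-0x3f (the server
 * parts where the core counters have been validated) and reject the client
 * models, 0x40 and above, until the firmware issue is understood.
 */
static bool amd_energy_supported_model(void)
{
        struct cpuinfo_x86 *c = &boot_cpu_data;

        return c->x86 == 0x17 && c->x86_model <= 0x3f;
}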
On 6/16/20 7:46 AM, Chatradhi, Naveen Krishna wrote:
> [AMD Public Use]
>
> Hi Alexander,
>
> The issue does not seem to be in the driver; it is in the register values
> populated by the firmware. We have tested this on the server platforms and
> it works fine. I've raised an issue internally to analyze and provide a
> solution for client systems. I'm still waiting for their update.
>
> Guenter,
>
> This issue is seen on client machines with models ranging from 0x40 and
> above. Should I submit a patch to support only the server-specific models
> 0x0 to 0x3f until we triage the issue for client systems?
>

Yes, I think that would make sense.

Thanks,
Guenter
Hi Guenter,

On Tue, 16 Jun 2020 at 20:23, Guenter Roeck <linux@roeck-us.net> wrote:
>
> On 6/16/20 7:46 AM, Chatradhi, Naveen Krishna wrote:
> > [AMD Public Use]
> >
> > Hi Alexander,
> >
> > The issue does not seem to be in the driver; it is in the register values
> > populated by the firmware. We have tested this on the server platforms and
> > it works fine. I've raised an issue internally to analyze and provide a
> > solution for client systems. I'm still waiting for their update.
> >
> > Guenter,
> >
> > This issue is seen on client machines with models ranging from 0x40 and
> > above. Should I submit a patch to support only the server-specific models
> > 0x0 to 0x3f until we triage the issue for client systems?
> >
>
> Yes, I think that would make sense.

Sure, I will submit that. Thanks for the prompt response.

>
> Thanks,
> Guenter
diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig
index 4c62f900bf7e..e165e10c49ef 100644
--- a/drivers/hwmon/Kconfig
+++ b/drivers/hwmon/Kconfig
@@ -324,6 +324,16 @@ config SENSORS_FAM15H_POWER
 	  This driver can also be built as a module. If so, the module
 	  will be called fam15h_power.
 
+config SENSORS_AMD_ENERGY
+	tristate "AMD RAPL MSR based Energy driver"
+	depends on X86
+	help
+	  If you say yes here you get support for core and package energy
+	  sensors, based on RAPL MSR for AMD family 17h and above CPUs.
+
+	  This driver can also be built as a module. If so, the module
+	  will be called as amd_energy.
+
 config SENSORS_APPLESMC
 	tristate "Apple SMC (Motion sensor, light sensor, keyboard backlight)"
 	depends on INPUT && X86
diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile
index b0b9c8e57176..318f89dc7133 100644
--- a/drivers/hwmon/Makefile
+++ b/drivers/hwmon/Makefile
@@ -45,6 +45,7 @@ obj-$(CONFIG_SENSORS_ADT7411)	+= adt7411.o
 obj-$(CONFIG_SENSORS_ADT7462)	+= adt7462.o
 obj-$(CONFIG_SENSORS_ADT7470)	+= adt7470.o
 obj-$(CONFIG_SENSORS_ADT7475)	+= adt7475.o
+obj-$(CONFIG_SENSORS_AMD_ENERGY)	+= amd_energy.o
 obj-$(CONFIG_SENSORS_APPLESMC)	+= applesmc.o
 obj-$(CONFIG_SENSORS_ARM_SCMI)	+= scmi-hwmon.o
 obj-$(CONFIG_SENSORS_ARM_SCPI)	+= scpi-hwmon.o
diff --git a/drivers/hwmon/amd_energy.c b/drivers/hwmon/amd_energy.c
new file mode 100644
index 000000000000..bc8b643a37d5
--- /dev/null
+++ b/drivers/hwmon/amd_energy.c
@@ -0,0 +1,406 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * Copyright (C) 2020 Advanced Micro Devices, Inc.
+ */
+#include <asm/cpu_device_id.h>
+
+#include <linux/bits.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/hwmon.h>
+#include <linux/kernel.h>
+#include <linux/kthread.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/processor.h>
+#include <linux/platform_device.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/topology.h>
+#include <linux/types.h>
+
+#define DRVNAME			"amd_energy"
+
+#define ENERGY_PWR_UNIT_MSR	0xC0010299
+#define ENERGY_CORE_MSR		0xC001029A
+#define ENERGY_PKG_MSR		0xC001029B
+
+#define AMD_ENERGY_UNIT_MASK	0x01F00
+#define AMD_ENERGY_MASK		0xFFFFFFFF
+
+struct sensor_accumulator {
+	u64 energy_ctr;
+	u64 prev_value;
+	char label[10];
+};
+
+struct amd_energy_data {
+	struct hwmon_channel_info energy_info;
+	const struct hwmon_channel_info *info[2];
+	struct hwmon_chip_info chip;
+	struct task_struct *wrap_accumulate;
+	/* Lock around the accumulator */
+	struct mutex lock;
+	/* An accumulator for each core and socket */
+	struct sensor_accumulator *accums;
+	/* Energy Status Units */
+	u64 energy_units;
+	int nr_cpus;
+	int nr_socks;
+	int core_id;
+};
+
+static int amd_energy_read_labels(struct device *dev,
+				  enum hwmon_sensor_types type,
+				  u32 attr, int channel,
+				  const char **str)
+{
+	struct amd_energy_data *data = dev_get_drvdata(dev);
+
+	*str = data->accums[channel].label;
+	return 0;
+}
+
+static void get_energy_units(struct amd_energy_data *data)
+{
+	u64 rapl_units;
+
+	rdmsrl_safe(ENERGY_PWR_UNIT_MSR, &rapl_units);
+	data->energy_units = (rapl_units & AMD_ENERGY_UNIT_MASK) >> 8;
+}
+
+static void accumulate_socket_delta(struct amd_energy_data *data,
+				    int sock, int cpu)
+{
+	struct sensor_accumulator *s_accum;
+	u64 input;
+
+	mutex_lock(&data->lock);
+	rdmsrl_safe_on_cpu(cpu, ENERGY_PKG_MSR, &input);
+	input &= AMD_ENERGY_MASK;
+
+	s_accum = &data->accums[data->nr_cpus + sock];
+	if (input >= s_accum->prev_value)
+		s_accum->energy_ctr +=
+			input - s_accum->prev_value;
+	else
+		s_accum->energy_ctr += UINT_MAX -
+			s_accum->prev_value + input;
+
+	s_accum->prev_value = input;
+	mutex_unlock(&data->lock);
+}
+
+static void accumulate_core_delta(struct amd_energy_data *data)
+{
+	struct sensor_accumulator *c_accum;
+	u64 input;
+	int cpu;
+
+	mutex_lock(&data->lock);
+	if (data->core_id >= data->nr_cpus)
+		data->core_id = 0;
+
+	cpu = data->core_id;
+
+	if (!cpu_online(cpu))
+		goto out;
+
+	rdmsrl_safe_on_cpu(cpu, ENERGY_CORE_MSR, &input);
+	input &= AMD_ENERGY_MASK;
+
+	c_accum = &data->accums[cpu];
+
+	if (input >= c_accum->prev_value)
+		c_accum->energy_ctr +=
+			input - c_accum->prev_value;
+	else
+		c_accum->energy_ctr += UINT_MAX -
+			c_accum->prev_value + input;
+
+	c_accum->prev_value = input;
+
+out:
+	data->core_id++;
+	mutex_unlock(&data->lock);
+}
+
+static void read_accumulate(struct amd_energy_data *data)
+{
+	int sock;
+
+	for (sock = 0; sock < data->nr_socks; sock++) {
+		int cpu;
+
+		cpu = cpumask_first_and(cpu_online_mask,
+					cpumask_of_node(sock));
+
+		accumulate_socket_delta(data, sock, cpu);
+	}
+
+	accumulate_core_delta(data);
+}
+
+static void amd_add_delta(struct amd_energy_data *data, int ch,
+			  int cpu, long *val, bool is_core)
+{
+	struct sensor_accumulator *s_accum, *c_accum;
+	u64 input;
+
+	mutex_lock(&data->lock);
+	if (!is_core) {
+		rdmsrl_safe_on_cpu(cpu, ENERGY_PKG_MSR, &input);
+		input &= AMD_ENERGY_MASK;
+
+		s_accum = &data->accums[ch];
+		if (input >= s_accum->prev_value)
+			input += s_accum->energy_ctr -
+				 s_accum->prev_value;
+		else
+			input += UINT_MAX - s_accum->prev_value +
+				 s_accum->energy_ctr;
+	} else {
+		rdmsrl_safe_on_cpu(cpu, ENERGY_CORE_MSR, &input);
+		input &= AMD_ENERGY_MASK;
+
+		c_accum = &data->accums[ch];
+		if (input >= c_accum->prev_value)
+			input += c_accum->energy_ctr -
+				 c_accum->prev_value;
+		else
+			input += UINT_MAX - c_accum->prev_value +
+				 c_accum->energy_ctr;
+	}
+
+	/* Energy consumed = (1/(2^ESU) * RAW * 1000000UL) μJoules */
+	*val = div64_ul(input * 1000000UL, BIT(data->energy_units));
+
+	mutex_unlock(&data->lock);
+}
+
+static int amd_energy_read(struct device *dev,
+			   enum hwmon_sensor_types type,
+			   u32 attr, int channel, long *val)
+{
+	struct amd_energy_data *data = dev_get_drvdata(dev);
+	int cpu;
+
+	if (channel >= data->nr_cpus) {
+		cpu = cpumask_first_and(cpu_online_mask,
+					cpumask_of_node
+					(channel - data->nr_cpus));
+		amd_add_delta(data, channel, cpu, val, false);
+	} else {
+		cpu = channel;
+		if (!cpu_online(cpu))
+			return -ENODEV;
+
+		amd_add_delta(data, channel, cpu, val, true);
+	}
+
+	return 0;
+}
+
+static umode_t amd_energy_is_visible(const void *_data,
+				     enum hwmon_sensor_types type,
+				     u32 attr, int channel)
+{
+	return 0444;
+}
+
+static int energy_accumulator(void *p)
+{
+	struct amd_energy_data *data = (struct amd_energy_data *)p;
+
+	while (!kthread_should_stop()) {
+		/*
+		 * Ignoring the conditions such as
+		 * cpu being offline or rdmsr failure
+		 */
+		read_accumulate(data);
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (kthread_should_stop())
+			break;
+
+		/*
+		 * On a 240W system, with default resolution the
+		 * Socket Energy status register may wrap around in
+		 * 2^32*15.3 e-6/240 = 273.8041 secs (~4.5 mins)
+		 *
+		 * let us accumulate for every 100secs
+		 */
+		schedule_timeout(msecs_to_jiffies(100000));
+	}
+	return 0;
+}
+
+static const struct hwmon_ops amd_energy_ops = {
+	.is_visible = amd_energy_is_visible,
+	.read = amd_energy_read,
+	.read_string = amd_energy_read_labels,
+};
+
+static int amd_create_sensor(struct device *dev,
+			     struct amd_energy_data *data,
+			     u8 type, u32 config)
+{
+	struct hwmon_channel_info *info = &data->energy_info;
+	struct sensor_accumulator *accums;
+	int i, num_siblings, cpus, sockets;
+	u32 *s_config;
+
+	/* Identify the number of siblings per core */
+	num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1;
+
+	sockets = num_possible_nodes();
+
+	/*
+	 * Energy counter register is accessed at core level.
+	 * Hence, filterout the siblings.
+	 */
+	cpus = num_present_cpus() / num_siblings;
+
+	s_config = devm_kcalloc(dev, cpus + sockets,
+				sizeof(u32), GFP_KERNEL);
+	if (!s_config)
+		return -ENOMEM;
+
+	accums = devm_kcalloc(dev, cpus + sockets,
+			      sizeof(struct sensor_accumulator),
+			      GFP_KERNEL);
+	if (!accums)
+		return -ENOMEM;
+
+	info->type = type;
+	info->config = s_config;
+
+	data->nr_cpus = cpus;
+	data->nr_socks = sockets;
+	data->accums = accums;
+
+	for (i = 0; i < cpus + sockets; i++) {
+		s_config[i] = config;
+		if (i < cpus)
+			scnprintf(accums[i].label, 10,
+				  "Ecore%03u", i);
+		else
+			scnprintf(accums[i].label, 10,
+				  "Esocket%u", (i - cpus));
+	}
+
+	return 0;
+}
+
+static int amd_energy_probe(struct platform_device *pdev)
+{
+	struct device *hwmon_dev;
+	struct amd_energy_data *data;
+	struct device *dev = &pdev->dev;
+
+	data = devm_kzalloc(dev,
+			    sizeof(struct amd_energy_data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	data->chip.ops = &amd_energy_ops;
+	data->chip.info = data->info;
+
+	dev_set_drvdata(dev, data);
+	/* Populate per-core energy reporting */
+	data->info[0] = &data->energy_info;
+	amd_create_sensor(dev, data, hwmon_energy,
+			  HWMON_E_INPUT | HWMON_E_LABEL);
+
+	mutex_init(&data->lock);
+	get_energy_units(data);
+
+	hwmon_dev = devm_hwmon_device_register_with_info(dev, DRVNAME,
+							 data,
+							 &data->chip,
+							 NULL);
+	if (IS_ERR(hwmon_dev))
+		return PTR_ERR(hwmon_dev);
+
+	data->wrap_accumulate = kthread_run(energy_accumulator, data,
+					    "%s", dev_name(hwmon_dev));
+	if (IS_ERR(data->wrap_accumulate))
+		return PTR_ERR(data->wrap_accumulate);
+
+	return PTR_ERR_OR_ZERO(data->wrap_accumulate);
+}
+
+static int amd_energy_remove(struct platform_device *pdev)
+{
+	struct amd_energy_data *data = dev_get_drvdata(&pdev->dev);
+
+	if (data && data->wrap_accumulate)
+		kthread_stop(data->wrap_accumulate);
+
+	return 0;
+}
+
+static const struct platform_device_id amd_energy_ids[] = {
+	{ .name = DRVNAME, },
+	{}
+};
+MODULE_DEVICE_TABLE(platform, amd_energy_ids);
+
+static struct platform_driver amd_energy_driver = {
+	.probe = amd_energy_probe,
+	.remove	= amd_energy_remove,
+	.id_table = amd_energy_ids,
+	.driver = {
+		.name = DRVNAME,
+	},
+};
+
+static struct platform_device *amd_energy_platdev;
+
+static const struct x86_cpu_id cpu_ids[] __initconst = {
+	X86_MATCH_VENDOR_FAM(AMD, 0x17, NULL),
+	{}
+};
+MODULE_DEVICE_TABLE(x86cpu, cpu_ids);
+
+static int __init amd_energy_init(void)
+{
+	int ret;
+
+	if (!x86_match_cpu(cpu_ids))
+		return -ENODEV;
+
+	ret = platform_driver_register(&amd_energy_driver);
+	if (ret)
+		return ret;
+
+	amd_energy_platdev = platform_device_alloc(DRVNAME, 0);
+	if (!amd_energy_platdev)
+		return -ENOMEM;
+
+	ret = platform_device_add(amd_energy_platdev);
+	if (ret) {
+		platform_device_put(amd_energy_platdev);
+		platform_driver_unregister(&amd_energy_driver);
+		return ret;
+	}
+
+	return ret;
+}
+
+static void __exit amd_energy_exit(void)
+{
+	platform_device_unregister(amd_energy_platdev);
+	platform_driver_unregister(&amd_energy_driver);
+}
+
+module_init(amd_energy_init);
+module_exit(amd_energy_exit);
+
+MODULE_DESCRIPTION("Driver for AMD Energy reporting from RAPL MSR via HWMON interface");
+MODULE_AUTHOR("Naveen Krishna Chatradhi <nchatrad@amd.com>");
+MODULE_LICENSE("GPL");
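As a side note on the 100-second accumulation period in the kthread above:
with the default energy status unit of 2^-16 J (about 15.3 µJ per LSB), a
32-bit counter on a 240 W package wraps in a bit under five minutes, which
is what the sampling interval has to stay inside. A small stand-alone check
of that arithmetic (illustrative only, not part of the driver):

#include <stdio.h>

int main(void)
{
        const double lsb_joules = 1.0 / (1 << 16);      /* default ESU = 16 */
        const double counter_range = 4294967296.0;      /* 2^32 counts */
        const double package_watts = 240.0;

        /* Seconds until the 32-bit counter wraps at a sustained 240 W. */
        printf("wrap time: %.1f s\n",
               counter_range * lsb_joules / package_watts);
        return 0;
}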
This patch adds hwmon based amd_energy driver support for
family 17h processors from AMD.

The driver provides the following interface to the userspace:
1. Reports the per core energy consumption
   * file: "energy%d_input", label: "Ecore%03d"
2. Reports per socket energy consumption
   * file: "energy%d_input", label: "Esocket%d"
3. To increase the wrap-around time of the socket energy
   counters, a 64-bit accumulator is implemented.
4. Reports scaled energy value in Joules.

Cc: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Naveen Krishna Chatradhi <nchatrad@amd.com>
---
Changes in v7:
1. Move all static variables into the data structure
2. Move the read-update code under the mutex lock

 drivers/hwmon/Kconfig      |  10 +
 drivers/hwmon/Makefile     |   1 +
 drivers/hwmon/amd_energy.c | 406 +++++++++++++++++++++++++++++++++++++
 3 files changed, 417 insertions(+)
 create mode 100644 drivers/hwmon/amd_energy.c
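To illustrate the interface described above, a minimal userspace sketch
that samples one energy%d_input attribute twice and derives average power.
The hwmon index in the path is an assumption and varies per system; look up
the right hwmonN via its "name" attribute:

#include <stdio.h>
#include <unistd.h>

/* Read a microjoule value from an energyN_input attribute. */
static unsigned long long read_energy_uj(const char *path)
{
        unsigned long long uj = 0;
        FILE *f = fopen(path, "r");

        if (f) {
                if (fscanf(f, "%llu", &uj) != 1)
                        uj = 0;
                fclose(f);
        }
        return uj;
}

int main(void)
{
        /* Hypothetical path; the hwmon index differs between machines. */
        const char *path = "/sys/class/hwmon/hwmon3/energy1_input";
        unsigned long long e0 = read_energy_uj(path);

        sleep(10);

        unsigned long long e1 = read_energy_uj(path);
        printf("average power: %.3f W\n", (double)(e1 - e0) / 1e6 / 10.0);
        return 0;
}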