diff mbox series

[1/3,v7] hwmon: Add amd_energy driver to report energy counters

Message ID 20200519155011.56184-1-nchatrad@amd.com (mailing list archive)
State Accepted
Headers show
Series [1/3,v7] hwmon: Add amd_energy driver to report energy counters | expand

Commit Message

Naveen Krishna Chatradhi May 19, 2020, 3:50 p.m. UTC
This patch adds hwmon based amd_energy driver support for
family 17h processors from AMD.

The driver provides the following interface to the userspace
1. Reports the per core consumption
	* file: "energy%d_input", label: "Ecore%03d"
2. Reports per socket energy consumption
	* file: "energy%d_input", label: "Esocket%d"
3. To increase the wrap-around time of the socket energy
   counters, a 64-bit accumulator is implemented.
4. Reports scaled energy value in Joules.

Cc: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Naveen Krishna Chatradhi <nchatrad@amd.com>
---
Changes in v7:
1. Move all static variables into the data structure
2. Move the read-update code under the mutex lock

 drivers/hwmon/Kconfig      |  10 +
 drivers/hwmon/Makefile     |   1 +
 drivers/hwmon/amd_energy.c | 406 +++++++++++++++++++++++++++++++++++++
 3 files changed, 417 insertions(+)
 create mode 100644 drivers/hwmon/amd_energy.c

Comments

Guenter Roeck May 22, 2020, 1:26 p.m. UTC | #1
On 5/19/20 8:50 AM, Naveen Krishna Chatradhi wrote:
> This patch adds hwmon based amd_energy driver support for
> family 17h processors from AMD.
> 
> The driver provides following interface to the userspace
> 1. Reports the per core consumption
> 	* file: "energy%d_input", label: "Ecore%03d"
> 2. Reports per socket energy consumption
> 	* file: "energy%d_input", label: "Esocket%d"
> 3. To, increase the wrap around time of the socket energy
>    counters, a 64bit accumultor is implemented.
> 4. Reports scaled energy value in Joules.
> 
> Cc: Guenter Roeck <linux@roeck-us.net>
> Signed-off-by: Naveen Krishna Chatradhi <nchatrad@amd.com>
> ---

This version looks ok, and it looks like there are no more
unexpected counter wraps either. Series applied to hwmon-next.

Thanks,
Guenter

> Changes in v7:
> 1. Move all static variables into the data structure
> 2. Move the read-update code under the mutex lock
> 
>  drivers/hwmon/Kconfig      |  10 +
>  drivers/hwmon/Makefile     |   1 +
>  drivers/hwmon/amd_energy.c | 406 +++++++++++++++++++++++++++++++++++++
>  3 files changed, 417 insertions(+)
>  create mode 100644 drivers/hwmon/amd_energy.c
> 
> diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig
> index 4c62f900bf7e..e165e10c49ef 100644
> --- a/drivers/hwmon/Kconfig
> +++ b/drivers/hwmon/Kconfig
> @@ -324,6 +324,16 @@ config SENSORS_FAM15H_POWER
>  	  This driver can also be built as a module. If so, the module
>  	  will be called fam15h_power.
>  
> +config SENSORS_AMD_ENERGY
> +	tristate "AMD RAPL MSR based Energy driver"
> +	depends on X86
> +	help
> +	  If you say yes here you get support for core and package energy
> +	  sensors, based on RAPL MSR for AMD family 17h and above CPUs.
> +
> +	  This driver can also be built as a module. If so, the module
> +	  will be called as amd_energy.
> +
>  config SENSORS_APPLESMC
>  	tristate "Apple SMC (Motion sensor, light sensor, keyboard backlight)"
>  	depends on INPUT && X86
> diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile
> index b0b9c8e57176..318f89dc7133 100644
> --- a/drivers/hwmon/Makefile
> +++ b/drivers/hwmon/Makefile
> @@ -45,6 +45,7 @@ obj-$(CONFIG_SENSORS_ADT7411)	+= adt7411.o
>  obj-$(CONFIG_SENSORS_ADT7462)	+= adt7462.o
>  obj-$(CONFIG_SENSORS_ADT7470)	+= adt7470.o
>  obj-$(CONFIG_SENSORS_ADT7475)	+= adt7475.o
> +obj-$(CONFIG_SENSORS_AMD_ENERGY) += amd_energy.o
>  obj-$(CONFIG_SENSORS_APPLESMC)	+= applesmc.o
>  obj-$(CONFIG_SENSORS_ARM_SCMI)	+= scmi-hwmon.o
>  obj-$(CONFIG_SENSORS_ARM_SCPI)	+= scpi-hwmon.o
> diff --git a/drivers/hwmon/amd_energy.c b/drivers/hwmon/amd_energy.c
> new file mode 100644
> index 000000000000..bc8b643a37d5
> --- /dev/null
> +++ b/drivers/hwmon/amd_energy.c
> @@ -0,0 +1,406 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +
> +/*
> + * Copyright (C) 2020 Advanced Micro Devices, Inc.
> + */
> +#include <asm/cpu_device_id.h>
> +
> +#include <linux/bits.h>
> +#include <linux/cpu.h>
> +#include <linux/cpumask.h>
> +#include <linux/delay.h>
> +#include <linux/device.h>
> +#include <linux/hwmon.h>
> +#include <linux/kernel.h>
> +#include <linux/kthread.h>
> +#include <linux/list.h>
> +#include <linux/module.h>
> +#include <linux/mutex.h>
> +#include <linux/processor.h>
> +#include <linux/platform_device.h>
> +#include <linux/sched.h>
> +#include <linux/slab.h>
> +#include <linux/topology.h>
> +#include <linux/types.h>
> +
> +#define DRVNAME			"amd_energy"
> +
> +#define ENERGY_PWR_UNIT_MSR	0xC0010299
> +#define ENERGY_CORE_MSR		0xC001029A
> +#define ENERGY_PKG_MSR		0xC001029B
> +
> +#define AMD_ENERGY_UNIT_MASK	0x01F00
> +#define AMD_ENERGY_MASK		0xFFFFFFFF
> +
> +struct sensor_accumulator {
> +	u64 energy_ctr;
> +	u64 prev_value;
> +	char label[10];
> +};
> +
> +struct amd_energy_data {
> +	struct hwmon_channel_info energy_info;
> +	const struct hwmon_channel_info *info[2];
> +	struct hwmon_chip_info chip;
> +	struct task_struct *wrap_accumulate;
> +	/* Lock around the accumulator */
> +	struct mutex lock;
> +	/* An accumulator for each core and socket */
> +	struct sensor_accumulator *accums;
> +	/* Energy Status Units */
> +	u64 energy_units;
> +	int nr_cpus;
> +	int nr_socks;
> +	int core_id;
> +};
> +
> +static int amd_energy_read_labels(struct device *dev,
> +				  enum hwmon_sensor_types type,
> +				  u32 attr, int channel,
> +				  const char **str)
> +{
> +	struct amd_energy_data *data = dev_get_drvdata(dev);
> +
> +	*str = data->accums[channel].label;
> +	return 0;
> +}
> +
> +static void get_energy_units(struct amd_energy_data *data)
> +{
> +	u64 rapl_units;
> +
> +	rdmsrl_safe(ENERGY_PWR_UNIT_MSR, &rapl_units);
> +	data->energy_units = (rapl_units & AMD_ENERGY_UNIT_MASK) >> 8;
> +}
> +
> +static void accumulate_socket_delta(struct amd_energy_data *data,
> +				    int sock, int cpu)
> +{
> +	struct sensor_accumulator *s_accum;
> +	u64 input;
> +
> +	mutex_lock(&data->lock);
> +	rdmsrl_safe_on_cpu(cpu, ENERGY_PKG_MSR, &input);
> +	input &= AMD_ENERGY_MASK;
> +
> +	s_accum = &data->accums[data->nr_cpus + sock];
> +	if (input >= s_accum->prev_value)
> +		s_accum->energy_ctr +=
> +			input - s_accum->prev_value;
> +	else
> +		s_accum->energy_ctr += UINT_MAX -
> +			s_accum->prev_value + input;
> +
> +	s_accum->prev_value = input;
> +	mutex_unlock(&data->lock);
> +}
> +
> +static void accumulate_core_delta(struct amd_energy_data *data)
> +{
> +	struct sensor_accumulator *c_accum;
> +	u64 input;
> +	int cpu;
> +
> +	mutex_lock(&data->lock);
> +	if (data->core_id >= data->nr_cpus)
> +		data->core_id = 0;
> +
> +	cpu = data->core_id;
> +
> +	if (!cpu_online(cpu))
> +		goto out;
> +
> +	rdmsrl_safe_on_cpu(cpu, ENERGY_CORE_MSR, &input);
> +	input &= AMD_ENERGY_MASK;
> +
> +	c_accum = &data->accums[cpu];
> +
> +	if (input >= c_accum->prev_value)
> +		c_accum->energy_ctr +=
> +			input - c_accum->prev_value;
> +	else
> +		c_accum->energy_ctr += UINT_MAX -
> +			c_accum->prev_value + input;
> +
> +	c_accum->prev_value = input;
> +
> +out:
> +	data->core_id++;
> +	mutex_unlock(&data->lock);
> +}
> +
> +static void read_accumulate(struct amd_energy_data *data)
> +{
> +	int sock;
> +
> +	for (sock = 0; sock < data->nr_socks; sock++) {
> +		int cpu;
> +
> +		cpu = cpumask_first_and(cpu_online_mask,
> +					cpumask_of_node(sock));
> +
> +		accumulate_socket_delta(data, sock, cpu);
> +	}
> +
> +	accumulate_core_delta(data);
> +}
> +
> +static void amd_add_delta(struct amd_energy_data *data, int ch,
> +			  int cpu, long *val, bool is_core)
> +{
> +	struct sensor_accumulator *s_accum, *c_accum;
> +	u64 input;
> +
> +	mutex_lock(&data->lock);
> +	if (!is_core) {
> +		rdmsrl_safe_on_cpu(cpu, ENERGY_PKG_MSR, &input);
> +		input &= AMD_ENERGY_MASK;
> +
> +		s_accum = &data->accums[ch];
> +		if (input >= s_accum->prev_value)
> +			input += s_accum->energy_ctr -
> +				  s_accum->prev_value;
> +		else
> +			input += UINT_MAX - s_accum->prev_value +
> +				  s_accum->energy_ctr;
> +	} else {
> +		rdmsrl_safe_on_cpu(cpu, ENERGY_CORE_MSR, &input);
> +		input &= AMD_ENERGY_MASK;
> +
> +		c_accum = &data->accums[ch];
> +		if (input >= c_accum->prev_value)
> +			input += c_accum->energy_ctr -
> +				 c_accum->prev_value;
> +		else
> +			input += UINT_MAX - c_accum->prev_value +
> +				 c_accum->energy_ctr;
> +	}
> +
> +	/* Energy consumed = (1/(2^ESU) * RAW * 1000000UL) μJoules */
> +	*val = div64_ul(input * 1000000UL, BIT(data->energy_units));
> +
> +	mutex_unlock(&data->lock);
> +}
> +
> +static int amd_energy_read(struct device *dev,
> +			   enum hwmon_sensor_types type,
> +			   u32 attr, int channel, long *val)
> +{
> +	struct amd_energy_data *data = dev_get_drvdata(dev);
> +	int cpu;
> +
> +	if (channel >= data->nr_cpus) {
> +		cpu = cpumask_first_and(cpu_online_mask,
> +					cpumask_of_node
> +					(channel - data->nr_cpus));
> +		amd_add_delta(data, channel, cpu, val, false);
> +	} else {
> +		cpu = channel;
> +		if (!cpu_online(cpu))
> +			return -ENODEV;
> +
> +		amd_add_delta(data, channel, cpu, val, true);
> +	}
> +
> +	return 0;
> +}
> +
> +static umode_t amd_energy_is_visible(const void *_data,
> +				     enum hwmon_sensor_types type,
> +				     u32 attr, int channel)
> +{
> +	return 0444;
> +}
> +
> +static int energy_accumulator(void *p)
> +{
> +	struct amd_energy_data *data = (struct amd_energy_data *)p;
> +
> +	while (!kthread_should_stop()) {
> +		/*
> +		 * Ignoring the conditions such as
> +		 * cpu being offline or rdmsr failure
> +		 */
> +		read_accumulate(data);
> +
> +		set_current_state(TASK_INTERRUPTIBLE);
> +		if (kthread_should_stop())
> +			break;
> +
> +		/*
> +		 * On a 240W system, with default resolution the
> +		 * Socket Energy status register may wrap around in
> +		 * 2^32*15.3 e-6/240 = 273.8041 secs (~4.5 mins)
> +		 *
> +		 * let us accumulate for every 100secs
> +		 */
> +		schedule_timeout(msecs_to_jiffies(100000));
> +	}
> +	return 0;
> +}
> +
> +static const struct hwmon_ops amd_energy_ops = {
> +	.is_visible = amd_energy_is_visible,
> +	.read = amd_energy_read,
> +	.read_string = amd_energy_read_labels,
> +};
> +
> +static int amd_create_sensor(struct device *dev,
> +			     struct amd_energy_data *data,
> +			     u8 type, u32 config)
> +{
> +	struct hwmon_channel_info *info = &data->energy_info;
> +	struct sensor_accumulator *accums;
> +	int i, num_siblings, cpus, sockets;
> +	u32 *s_config;
> +
> +	/* Identify the number of siblings per core */
> +	num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1;
> +
> +	sockets = num_possible_nodes();
> +
> +	/*
> +	 * Energy counter register is accessed at core level.
> +	 * Hence, filterout the siblings.
> +	 */
> +	cpus = num_present_cpus() / num_siblings;
> +
> +	s_config = devm_kcalloc(dev, cpus + sockets,
> +				sizeof(u32), GFP_KERNEL);
> +	if (!s_config)
> +		return -ENOMEM;
> +
> +	accums = devm_kcalloc(dev, cpus + sockets,
> +			      sizeof(struct sensor_accumulator),
> +			      GFP_KERNEL);
> +	if (!accums)
> +		return -ENOMEM;
> +
> +	info->type = type;
> +	info->config = s_config;
> +
> +	data->nr_cpus = cpus;
> +	data->nr_socks = sockets;
> +	data->accums = accums;
> +
> +	for (i = 0; i < cpus + sockets; i++) {
> +		s_config[i] = config;
> +		if (i < cpus)
> +			scnprintf(accums[i].label, 10,
> +				  "Ecore%03u", i);
> +		else
> +			scnprintf(accums[i].label, 10,
> +				  "Esocket%u", (i - cpus));
> +	}
> +
> +	return 0;
> +}
> +
> +static int amd_energy_probe(struct platform_device *pdev)
> +{
> +	struct device *hwmon_dev;
> +	struct amd_energy_data *data;
> +	struct device *dev = &pdev->dev;
> +
> +	data = devm_kzalloc(dev,
> +			    sizeof(struct amd_energy_data), GFP_KERNEL);
> +	if (!data)
> +		return -ENOMEM;
> +
> +	data->chip.ops = &amd_energy_ops;
> +	data->chip.info = data->info;
> +
> +	dev_set_drvdata(dev, data);
> +	/* Populate per-core energy reporting */
> +	data->info[0] = &data->energy_info;
> +	amd_create_sensor(dev, data, hwmon_energy,
> +			  HWMON_E_INPUT | HWMON_E_LABEL);
> +
> +	mutex_init(&data->lock);
> +	get_energy_units(data);
> +
> +	hwmon_dev = devm_hwmon_device_register_with_info(dev, DRVNAME,
> +							 data,
> +							 &data->chip,
> +							 NULL);
> +	if (IS_ERR(hwmon_dev))
> +		return PTR_ERR(hwmon_dev);
> +
> +	data->wrap_accumulate = kthread_run(energy_accumulator, data,
> +					    "%s", dev_name(hwmon_dev));
> +	if (IS_ERR(data->wrap_accumulate))
> +		return PTR_ERR(data->wrap_accumulate);
> +
> +	return PTR_ERR_OR_ZERO(data->wrap_accumulate);
> +}
> +
> +static int amd_energy_remove(struct platform_device *pdev)
> +{
> +	struct amd_energy_data *data = dev_get_drvdata(&pdev->dev);
> +
> +	if (data && data->wrap_accumulate)
> +		kthread_stop(data->wrap_accumulate);
> +
> +	return 0;
> +}
> +
> +static const struct platform_device_id amd_energy_ids[] = {
> +	{ .name = DRVNAME, },
> +	{}
> +};
> +MODULE_DEVICE_TABLE(platform, amd_energy_ids);
> +
> +static struct platform_driver amd_energy_driver = {
> +	.probe = amd_energy_probe,
> +	.remove	= amd_energy_remove,
> +	.id_table = amd_energy_ids,
> +	.driver = {
> +		.name = DRVNAME,
> +	},
> +};
> +
> +static struct platform_device *amd_energy_platdev;
> +
> +static const struct x86_cpu_id cpu_ids[] __initconst = {
> +	X86_MATCH_VENDOR_FAM(AMD, 0x17, NULL),
> +	{}
> +};
> +MODULE_DEVICE_TABLE(x86cpu, cpu_ids);
> +
> +static int __init amd_energy_init(void)
> +{
> +	int ret;
> +
> +	if (!x86_match_cpu(cpu_ids))
> +		return -ENODEV;
> +
> +	ret = platform_driver_register(&amd_energy_driver);
> +	if (ret)
> +		return ret;
> +
> +	amd_energy_platdev = platform_device_alloc(DRVNAME, 0);
> +	if (!amd_energy_platdev)
> +		return -ENOMEM;
> +
> +	ret = platform_device_add(amd_energy_platdev);
> +	if (ret) {
> +		platform_device_put(amd_energy_platdev);
> +		platform_driver_unregister(&amd_energy_driver);
> +		return ret;
> +	}
> +
> +	return ret;
> +}
> +
> +static void __exit amd_energy_exit(void)
> +{
> +	platform_device_unregister(amd_energy_platdev);
> +	platform_driver_unregister(&amd_energy_driver);
> +}
> +
> +module_init(amd_energy_init);
> +module_exit(amd_energy_exit);
> +
> +MODULE_DESCRIPTION("Driver for AMD Energy reporting from RAPL MSR via HWMON interface");
> +MODULE_AUTHOR("Naveen Krishna Chatradhi <nchatrad@amd.com>");
> +MODULE_LICENSE("GPL");
>
Naveen Krishna Ch May 22, 2020, 2:19 p.m. UTC | #2
Hi Guenter,

On Fri, 22 May 2020 at 18:56, Guenter Roeck <linux@roeck-us.net> wrote:
>
> On 5/19/20 8:50 AM, Naveen Krishna Chatradhi wrote:
> > This patch adds hwmon based amd_energy driver support for
> > family 17h processors from AMD.
> >
> > The driver provides following interface to the userspace
> > 1. Reports the per core consumption
> >       * file: "energy%d_input", label: "Ecore%03d"
> > 2. Reports per socket energy consumption
> >       * file: "energy%d_input", label: "Esocket%d"
> > 3. To, increase the wrap around time of the socket energy
> >    counters, a 64bit accumultor is implemented.
> > 4. Reports scaled energy value in Joules.
> >
> > Cc: Guenter Roeck <linux@roeck-us.net>
> > Signed-off-by: Naveen Krishna Chatradhi <nchatrad@amd.com>
> > ---
>
> This version looks ok, and it looks like there are no more
> unexpected counter wraps either. Series applied to hwmon-next.

Thank you for your guidance and support.

>
> Thanks,
> Guenter
>
> > Changes in v7:
> > 1. Move all static variables into the data structure
> > 2. Move the read-update code under the mutex lock
> >
> >  drivers/hwmon/Kconfig      |  10 +
> >  drivers/hwmon/Makefile     |   1 +
> >  drivers/hwmon/amd_energy.c | 406 +++++++++++++++++++++++++++++++++++++
> >  3 files changed, 417 insertions(+)
> >  create mode 100644 drivers/hwmon/amd_energy.c
> >
> > diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig
> > index 4c62f900bf7e..e165e10c49ef 100644
> > --- a/drivers/hwmon/Kconfig
> > +++ b/drivers/hwmon/Kconfig
> > @@ -324,6 +324,16 @@ config SENSORS_FAM15H_POWER
> >         This driver can also be built as a module. If so, the module
> >         will be called fam15h_power.
> >
> > +config SENSORS_AMD_ENERGY
> > +     tristate "AMD RAPL MSR based Energy driver"
> > +     depends on X86
> > +     help
> > +       If you say yes here you get support for core and package energy
> > +       sensors, based on RAPL MSR for AMD family 17h and above CPUs.
> > +
> > +       This driver can also be built as a module. If so, the module
> > +       will be called as amd_energy.
> > +
> >  config SENSORS_APPLESMC
> >       tristate "Apple SMC (Motion sensor, light sensor, keyboard backlight)"
> >       depends on INPUT && X86
> > diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile
> > index b0b9c8e57176..318f89dc7133 100644
> > --- a/drivers/hwmon/Makefile
> > +++ b/drivers/hwmon/Makefile
> > @@ -45,6 +45,7 @@ obj-$(CONFIG_SENSORS_ADT7411)       += adt7411.o
> >  obj-$(CONFIG_SENSORS_ADT7462)        += adt7462.o
> >  obj-$(CONFIG_SENSORS_ADT7470)        += adt7470.o
> >  obj-$(CONFIG_SENSORS_ADT7475)        += adt7475.o
> > +obj-$(CONFIG_SENSORS_AMD_ENERGY) += amd_energy.o
> >  obj-$(CONFIG_SENSORS_APPLESMC)       += applesmc.o
> >  obj-$(CONFIG_SENSORS_ARM_SCMI)       += scmi-hwmon.o
> >  obj-$(CONFIG_SENSORS_ARM_SCPI)       += scpi-hwmon.o
> > diff --git a/drivers/hwmon/amd_energy.c b/drivers/hwmon/amd_energy.c
> > new file mode 100644
> > index 000000000000..bc8b643a37d5
> > --- /dev/null
> > +++ b/drivers/hwmon/amd_energy.c
> > @@ -0,0 +1,406 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +
> > +/*
> > + * Copyright (C) 2020 Advanced Micro Devices, Inc.
> > + */
> > +#include <asm/cpu_device_id.h>
> > +
> > +#include <linux/bits.h>
> > +#include <linux/cpu.h>
> > +#include <linux/cpumask.h>
> > +#include <linux/delay.h>
> > +#include <linux/device.h>
> > +#include <linux/hwmon.h>
> > +#include <linux/kernel.h>
> > +#include <linux/kthread.h>
> > +#include <linux/list.h>
> > +#include <linux/module.h>
> > +#include <linux/mutex.h>
> > +#include <linux/processor.h>
> > +#include <linux/platform_device.h>
> > +#include <linux/sched.h>
> > +#include <linux/slab.h>
> > +#include <linux/topology.h>
> > +#include <linux/types.h>
> > +
> > +#define DRVNAME                      "amd_energy"
> > +
> > +#define ENERGY_PWR_UNIT_MSR  0xC0010299
> > +#define ENERGY_CORE_MSR              0xC001029A
> > +#define ENERGY_PKG_MSR               0xC001029B
> > +
> > +#define AMD_ENERGY_UNIT_MASK 0x01F00
> > +#define AMD_ENERGY_MASK              0xFFFFFFFF
> > +
> > +struct sensor_accumulator {
> > +     u64 energy_ctr;
> > +     u64 prev_value;
> > +     char label[10];
> > +};
> > +
> > +struct amd_energy_data {
> > +     struct hwmon_channel_info energy_info;
> > +     const struct hwmon_channel_info *info[2];
> > +     struct hwmon_chip_info chip;
> > +     struct task_struct *wrap_accumulate;
> > +     /* Lock around the accumulator */
> > +     struct mutex lock;
> > +     /* An accumulator for each core and socket */
> > +     struct sensor_accumulator *accums;
> > +     /* Energy Status Units */
> > +     u64 energy_units;
> > +     int nr_cpus;
> > +     int nr_socks;
> > +     int core_id;
> > +};
> > +
> > +static int amd_energy_read_labels(struct device *dev,
> > +                               enum hwmon_sensor_types type,
> > +                               u32 attr, int channel,
> > +                               const char **str)
> > +{
> > +     struct amd_energy_data *data = dev_get_drvdata(dev);
> > +
> > +     *str = data->accums[channel].label;
> > +     return 0;
> > +}
> > +
> > +static void get_energy_units(struct amd_energy_data *data)
> > +{
> > +     u64 rapl_units;
> > +
> > +     rdmsrl_safe(ENERGY_PWR_UNIT_MSR, &rapl_units);
> > +     data->energy_units = (rapl_units & AMD_ENERGY_UNIT_MASK) >> 8;
> > +}
> > +
> > +static void accumulate_socket_delta(struct amd_energy_data *data,
> > +                                 int sock, int cpu)
> > +{
> > +     struct sensor_accumulator *s_accum;
> > +     u64 input;
> > +
> > +     mutex_lock(&data->lock);
> > +     rdmsrl_safe_on_cpu(cpu, ENERGY_PKG_MSR, &input);
> > +     input &= AMD_ENERGY_MASK;
> > +
> > +     s_accum = &data->accums[data->nr_cpus + sock];
> > +     if (input >= s_accum->prev_value)
> > +             s_accum->energy_ctr +=
> > +                     input - s_accum->prev_value;
> > +     else
> > +             s_accum->energy_ctr += UINT_MAX -
> > +                     s_accum->prev_value + input;
> > +
> > +     s_accum->prev_value = input;
> > +     mutex_unlock(&data->lock);
> > +}
> > +
> > +static void accumulate_core_delta(struct amd_energy_data *data)
> > +{
> > +     struct sensor_accumulator *c_accum;
> > +     u64 input;
> > +     int cpu;
> > +
> > +     mutex_lock(&data->lock);
> > +     if (data->core_id >= data->nr_cpus)
> > +             data->core_id = 0;
> > +
> > +     cpu = data->core_id;
> > +
> > +     if (!cpu_online(cpu))
> > +             goto out;
> > +
> > +     rdmsrl_safe_on_cpu(cpu, ENERGY_CORE_MSR, &input);
> > +     input &= AMD_ENERGY_MASK;
> > +
> > +     c_accum = &data->accums[cpu];
> > +
> > +     if (input >= c_accum->prev_value)
> > +             c_accum->energy_ctr +=
> > +                     input - c_accum->prev_value;
> > +     else
> > +             c_accum->energy_ctr += UINT_MAX -
> > +                     c_accum->prev_value + input;
> > +
> > +     c_accum->prev_value = input;
> > +
> > +out:
> > +     data->core_id++;
> > +     mutex_unlock(&data->lock);
> > +}
> > +
> > +static void read_accumulate(struct amd_energy_data *data)
> > +{
> > +     int sock;
> > +
> > +     for (sock = 0; sock < data->nr_socks; sock++) {
> > +             int cpu;
> > +
> > +             cpu = cpumask_first_and(cpu_online_mask,
> > +                                     cpumask_of_node(sock));
> > +
> > +             accumulate_socket_delta(data, sock, cpu);
> > +     }
> > +
> > +     accumulate_core_delta(data);
> > +}
> > +
> > +static void amd_add_delta(struct amd_energy_data *data, int ch,
> > +                       int cpu, long *val, bool is_core)
> > +{
> > +     struct sensor_accumulator *s_accum, *c_accum;
> > +     u64 input;
> > +
> > +     mutex_lock(&data->lock);
> > +     if (!is_core) {
> > +             rdmsrl_safe_on_cpu(cpu, ENERGY_PKG_MSR, &input);
> > +             input &= AMD_ENERGY_MASK;
> > +
> > +             s_accum = &data->accums[ch];
> > +             if (input >= s_accum->prev_value)
> > +                     input += s_accum->energy_ctr -
> > +                               s_accum->prev_value;
> > +             else
> > +                     input += UINT_MAX - s_accum->prev_value +
> > +                               s_accum->energy_ctr;
> > +     } else {
> > +             rdmsrl_safe_on_cpu(cpu, ENERGY_CORE_MSR, &input);
> > +             input &= AMD_ENERGY_MASK;
> > +
> > +             c_accum = &data->accums[ch];
> > +             if (input >= c_accum->prev_value)
> > +                     input += c_accum->energy_ctr -
> > +                              c_accum->prev_value;
> > +             else
> > +                     input += UINT_MAX - c_accum->prev_value +
> > +                              c_accum->energy_ctr;
> > +     }
> > +
> > +     /* Energy consumed = (1/(2^ESU) * RAW * 1000000UL) μJoules */
> > +     *val = div64_ul(input * 1000000UL, BIT(data->energy_units));
> > +
> > +     mutex_unlock(&data->lock);
> > +}
> > +
> > +static int amd_energy_read(struct device *dev,
> > +                        enum hwmon_sensor_types type,
> > +                        u32 attr, int channel, long *val)
> > +{
> > +     struct amd_energy_data *data = dev_get_drvdata(dev);
> > +     int cpu;
> > +
> > +     if (channel >= data->nr_cpus) {
> > +             cpu = cpumask_first_and(cpu_online_mask,
> > +                                     cpumask_of_node
> > +                                     (channel - data->nr_cpus));
> > +             amd_add_delta(data, channel, cpu, val, false);
> > +     } else {
> > +             cpu = channel;
> > +             if (!cpu_online(cpu))
> > +                     return -ENODEV;
> > +
> > +             amd_add_delta(data, channel, cpu, val, true);
> > +     }
> > +
> > +     return 0;
> > +}
> > +
> > +static umode_t amd_energy_is_visible(const void *_data,
> > +                                  enum hwmon_sensor_types type,
> > +                                  u32 attr, int channel)
> > +{
> > +     return 0444;
> > +}
> > +
> > +static int energy_accumulator(void *p)
> > +{
> > +     struct amd_energy_data *data = (struct amd_energy_data *)p;
> > +
> > +     while (!kthread_should_stop()) {
> > +             /*
> > +              * Ignoring the conditions such as
> > +              * cpu being offline or rdmsr failure
> > +              */
> > +             read_accumulate(data);
> > +
> > +             set_current_state(TASK_INTERRUPTIBLE);
> > +             if (kthread_should_stop())
> > +                     break;
> > +
> > +             /*
> > +              * On a 240W system, with default resolution the
> > +              * Socket Energy status register may wrap around in
> > +              * 2^32*15.3 e-6/240 = 273.8041 secs (~4.5 mins)
> > +              *
> > +              * let us accumulate for every 100secs
> > +              */
> > +             schedule_timeout(msecs_to_jiffies(100000));
> > +     }
> > +     return 0;
> > +}
> > +
> > +static const struct hwmon_ops amd_energy_ops = {
> > +     .is_visible = amd_energy_is_visible,
> > +     .read = amd_energy_read,
> > +     .read_string = amd_energy_read_labels,
> > +};
> > +
> > +static int amd_create_sensor(struct device *dev,
> > +                          struct amd_energy_data *data,
> > +                          u8 type, u32 config)
> > +{
> > +     struct hwmon_channel_info *info = &data->energy_info;
> > +     struct sensor_accumulator *accums;
> > +     int i, num_siblings, cpus, sockets;
> > +     u32 *s_config;
> > +
> > +     /* Identify the number of siblings per core */
> > +     num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1;
> > +
> > +     sockets = num_possible_nodes();
> > +
> > +     /*
> > +      * Energy counter register is accessed at core level.
> > +      * Hence, filterout the siblings.
> > +      */
> > +     cpus = num_present_cpus() / num_siblings;
> > +
> > +     s_config = devm_kcalloc(dev, cpus + sockets,
> > +                             sizeof(u32), GFP_KERNEL);
> > +     if (!s_config)
> > +             return -ENOMEM;
> > +
> > +     accums = devm_kcalloc(dev, cpus + sockets,
> > +                           sizeof(struct sensor_accumulator),
> > +                           GFP_KERNEL);
> > +     if (!accums)
> > +             return -ENOMEM;
> > +
> > +     info->type = type;
> > +     info->config = s_config;
> > +
> > +     data->nr_cpus = cpus;
> > +     data->nr_socks = sockets;
> > +     data->accums = accums;
> > +
> > +     for (i = 0; i < cpus + sockets; i++) {
> > +             s_config[i] = config;
> > +             if (i < cpus)
> > +                     scnprintf(accums[i].label, 10,
> > +                               "Ecore%03u", i);
> > +             else
> > +                     scnprintf(accums[i].label, 10,
> > +                               "Esocket%u", (i - cpus));
> > +     }
> > +
> > +     return 0;
> > +}
> > +
> > +static int amd_energy_probe(struct platform_device *pdev)
> > +{
> > +     struct device *hwmon_dev;
> > +     struct amd_energy_data *data;
> > +     struct device *dev = &pdev->dev;
> > +
> > +     data = devm_kzalloc(dev,
> > +                         sizeof(struct amd_energy_data), GFP_KERNEL);
> > +     if (!data)
> > +             return -ENOMEM;
> > +
> > +     data->chip.ops = &amd_energy_ops;
> > +     data->chip.info = data->info;
> > +
> > +     dev_set_drvdata(dev, data);
> > +     /* Populate per-core energy reporting */
> > +     data->info[0] = &data->energy_info;
> > +     amd_create_sensor(dev, data, hwmon_energy,
> > +                       HWMON_E_INPUT | HWMON_E_LABEL);
> > +
> > +     mutex_init(&data->lock);
> > +     get_energy_units(data);
> > +
> > +     hwmon_dev = devm_hwmon_device_register_with_info(dev, DRVNAME,
> > +                                                      data,
> > +                                                      &data->chip,
> > +                                                      NULL);
> > +     if (IS_ERR(hwmon_dev))
> > +             return PTR_ERR(hwmon_dev);
> > +
> > +     data->wrap_accumulate = kthread_run(energy_accumulator, data,
> > +                                         "%s", dev_name(hwmon_dev));
> > +     if (IS_ERR(data->wrap_accumulate))
> > +             return PTR_ERR(data->wrap_accumulate);
> > +
> > +     return PTR_ERR_OR_ZERO(data->wrap_accumulate);
> > +}
> > +
> > +static int amd_energy_remove(struct platform_device *pdev)
> > +{
> > +     struct amd_energy_data *data = dev_get_drvdata(&pdev->dev);
> > +
> > +     if (data && data->wrap_accumulate)
> > +             kthread_stop(data->wrap_accumulate);
> > +
> > +     return 0;
> > +}
> > +
> > +static const struct platform_device_id amd_energy_ids[] = {
> > +     { .name = DRVNAME, },
> > +     {}
> > +};
> > +MODULE_DEVICE_TABLE(platform, amd_energy_ids);
> > +
> > +static struct platform_driver amd_energy_driver = {
> > +     .probe = amd_energy_probe,
> > +     .remove = amd_energy_remove,
> > +     .id_table = amd_energy_ids,
> > +     .driver = {
> > +             .name = DRVNAME,
> > +     },
> > +};
> > +
> > +static struct platform_device *amd_energy_platdev;
> > +
> > +static const struct x86_cpu_id cpu_ids[] __initconst = {
> > +     X86_MATCH_VENDOR_FAM(AMD, 0x17, NULL),
> > +     {}
> > +};
> > +MODULE_DEVICE_TABLE(x86cpu, cpu_ids);
> > +
> > +static int __init amd_energy_init(void)
> > +{
> > +     int ret;
> > +
> > +     if (!x86_match_cpu(cpu_ids))
> > +             return -ENODEV;
> > +
> > +     ret = platform_driver_register(&amd_energy_driver);
> > +     if (ret)
> > +             return ret;
> > +
> > +     amd_energy_platdev = platform_device_alloc(DRVNAME, 0);
> > +     if (!amd_energy_platdev)
> > +             return -ENOMEM;
> > +
> > +     ret = platform_device_add(amd_energy_platdev);
> > +     if (ret) {
> > +             platform_device_put(amd_energy_platdev);
> > +             platform_driver_unregister(&amd_energy_driver);
> > +             return ret;
> > +     }
> > +
> > +     return ret;
> > +}
> > +
> > +static void __exit amd_energy_exit(void)
> > +{
> > +     platform_device_unregister(amd_energy_platdev);
> > +     platform_driver_unregister(&amd_energy_driver);
> > +}
> > +
> > +module_init(amd_energy_init);
> > +module_exit(amd_energy_exit);
> > +
> > +MODULE_DESCRIPTION("Driver for AMD Energy reporting from RAPL MSR via HWMON interface");
> > +MODULE_AUTHOR("Naveen Krishna Chatradhi <nchatrad@amd.com>");
> > +MODULE_LICENSE("GPL");
> >
>
Alexander Monakov May 26, 2020, 10:37 p.m. UTC | #3
On Fri, 22 May 2020, Naveen Krishna Ch wrote:

> > This version looks ok, and it looks like there are no more
> > unexpected counter wraps either. Series applied to hwmon-next.
> 
> Thank you for your guidance and support.

Thank you, looking forward to using this driver. Meanwhile, I have
a couple of questions about AMD RAPL counters.

The documentation says the PKG_ENERGY_STAT MSR is instantiated per CCD
and L3 complex, implying that energy is accumulated for 3 or 4 cores
in an L3 complex.

However your patch reads that MSR per socket, and from testing on my
Ryzen 4500U CPU I can confirm that the MSR gives the same value when
accessed from each core. Therefore I think your code is correct and
the documentation is wrong, can you confirm?

Next, I'm seeing paradoxical results when attempting to test accuracy
of the counters. When running an AVX workload on 6 cores, I see
CORE_ENERGY_STAT MSR reporting values just above 9 Watts per core, with
55 Watts in total, and yet PKG_ENERGY_STAT reporting only 30 Watts.
This is clearly a contradiction since package energy must be at least 
the sum of core energy readings. Furthermore, this is happening on a
18 W CPU in a laptop, which definitely cannot sustain 55 Watts, and
even 30 Watts seems too high.

Can you clarify how the counters work and under what conditions they
give accurate readings?

Thanks.
Alexander
Naveen Krishna Ch May 27, 2020, 3:03 a.m. UTC | #4
Hi Alexander,

Thanks for using the driver.

On Wed, 27 May 2020 at 04:07, Alexander Monakov <amonakov@ispras.ru> wrote:
>
> On Fri, 22 May 2020, Naveen Krishna Ch wrote:
>
> > > This version looks ok, and it looks like there are no more
> > > unexpected counter wraps either. Series applied to hwmon-next.
> >
> > Thank you for your guidance and support.
>
> Thank you, looking forward to using this driver. Meanwhile, I have
> a couple of questions about AMD RAPL counters.
>
> The documentation says the PKG_ENERGY_STAT MSR is instantiated per CCD
> and L3 complex, implying that energy is accumulated for 3 or 4 cores
> in an L3 complex.
>
> However your patch reads that MSR per socket, and from testing on my
> Ryzen 4500U CPU I can confirm that the MSR gives the same value when
> accessed from each core. Therefore I think your code is correct and
> the documentation is wrong, can you confirm?
I confirmed this with the hardware folks: the definition of a package has
changed with the chiplet design. Maybe the documentation needs an update.

>
> Next, I'm seeing paradoxical results when attempting to test accuracy
> of the counters. When running an AVX workload on 6 cores, I see
> CORE_ENERGY_STAT MSR reporting values just above 9 Watts per core, with
> 55 Watts in total, and yet PKG_ENERGY_STAT reporting only 30 Watts.
> This is clearly a contradiction since package energy must be at least
> the sum of core energy readings. Furthermore, this is happening on a
> 18 W CPU in a laptop, which definitely cannot sustain 55 Watts, and
> even 30 Watts seems too high.
>
> Can you clarify how the counters work and under what conditions they
> give accurate readings?

These registers are 32-bit counters, so they might wrap around quite fast
under high workloads. Therefore, we used a kernel thread to accumulate the
values of each core and socket into 64-bit values.

Depending on when the module is inserted in the system, the initial values
of the counters could be different, and we do not have a way to know how
many times the registers wrapped around in the past.

Difference of Energy over Time is the best way to use this information.
For example:
at t1 = 0, Read the register before starting the job, say E1.
at t2= 300secs, Read the register after the completion of the job, say E2.
power consumed = (E2-E1)/(t2 - t1) in Watts.

In our evaluation, the sum of the energy consumption of cores of a socket was
always less (actually far lesser) than the socket energy consumption.

One other possibility is that the enumeration of the cores of a package on
desktop/laptop systems might need different handling in the read. I will
check this.

>
> Thanks.
> Alexander
Alexander Monakov May 27, 2020, 6:59 a.m. UTC | #5
On Wed, 27 May 2020, Naveen Krishna Ch wrote:

> These registers are 32bit counters, they might wrap-around quite faster at
> high work loads. So, we used a kernel thread to accumulate the values of
> each core and socket to 64bit values.
> 
> Depending on when the module is inserted in the system, the initial values
> of the counters could be different and we do not have a way to know, how
> many time the registers are wrapped around in the past.

I understand that. If you anticipate that the module may be inserted after a
wraparound, the driver should populate 'prev_value' with actual counter
values instead of zeros. That way the driver will properly accumulate
energy over time it's been inserted. As implemented, the driver counts
energy since boot time, minus unknown amount lost to wraparounds if the
driver was loaded too late.

In my case I observed the contradictory readings over a period of several
seconds where no wraparound was possible.

> In our evaluation, the sum of the energy consumption of cores of a socket was
> always less (actually far lesser) than the socket energy consumption.

Did you try on laptop CPUs (Renoir SoC, Ryzen 4x00U marketing name)? You also
might need specific workloads to observe the issue, I first found it with a
small hand-written test, and then found a bigger discrepancy with AVX test
from https://github.com/travisdowns/avx-turbo

Alexander
Naveen Krishna Ch May 27, 2020, 10:35 a.m. UTC | #6
Hi Alexander

On Wed, 27 May 2020 at 12:29, Alexander Monakov <amonakov@ispras.ru> wrote:
>
> On Wed, 27 May 2020, Naveen Krishna Ch wrote:
>
> > These registers are 32bit counters, they might wrap-around quite faster at
> > high work loads. So, we used a kernel thread to accumulate the values of
> > each core and socket to 64bit values.
> >
> > Depending on when the module is inserted in the system, the initial values
> > of the counters could be different and we do not have a way to know, how
> > many time the registers are wrapped around in the past.
>
> I understand that. If you anticipate that the module may be inserted after a
> wraparound, the driver should populate 'prev_value' with actual counter
> values instead of zeros. That way the driver will properly accumulate
> energy over time it's been inserted. As implemented, the driver counts
> energy since boot time, minus unknown amount lost to wraparounds if the
> driver was loaded too late.
This is not a problem if the module is built into the kernel.
If the module is inserted at a later point, there is no other way to
provide the lost count, unless the user keeps track of the counters since
boot and provides them as an input when the module is inserted (we could
implement this).

> In my case I observed the contradictory readings over a period of several
> seconds where no wraparound was possible.
>
> > In our evaluation, the sum of the energy consumption of cores of a socket was
> > always less (actually far lesser) than the socket energy consumption.
>
> Did you try on laptop CPUs (Renoir SoC, Ryzen 4x00U marketing name)? You also
> might need specific workloads to observe the issue, I first found it with a
> small hand-written test, and then found a bigger discrepancy with AVX test
> from https://github.com/travisdowns/avx-turbo

I tried on an octa core machine, must be Renoir
vendor_id       : AuthenticAMD
cpu family      : 23
model           : 96

On an idle system over a period of 500 secs:

           | At t=500sec | At t=0      | Diff of energy | In Joules  | Power in Watts
core 0     | 650186538   | 649712585   | 473953         | 0.473953 J | 0.000947906 W
core 1     | 507792434   | 507131301   | 661133         | 0.661133 J | 0.001322266 W
core 2     | 455706497   | 455163970   | 542527         | 0.542527 J | 0.001085054 W
core 3     | 392240356   | 391740417   | 499939         | 0.499939 J | 0.000999878 W
core 4     | 411461654   | 410687881   | 773773         | 0.773773 J | 0.001547546 W
core 5     | 288821884   | 288071395   | 750489         | 0.750489 J | 0.001500978 W
core 6     | 186975219   | 186250793   | 724426         | 0.724426 J | 0.001448852 W
core 7     | 131509216   | 130458816   | 1050400        | 1.0504 J   | 0.0021008 W
Socket 0   | 31638431930 | 29370733505 | 2267698425     | 2267.698 J | 4.53539685 W
Power consumed by socket: 4.53539685 W
Sum of power consumed by cores: 0.010953 W

On a system with the AVX test running over a period of 500 secs:

           | At t=500sec | At t=0      | Diff of energy | In Joules    | Power in Watts
core 0     | 649348495   | 413687530   | 235660965      | 235.660965 J | 0.47132193 W
core 1     | 506880081   | 294882827   | 211997254      | 211.997254 J | 0.423994508 W
core 2     | 454804046   | 271046127   | 183757919      | 183.757919 J | 0.367515838 W
core 3     | 391508712   | 237531021   | 153977691      | 153.977691 J | 0.307955382 W
core 4     | 410336868   | 284410079   | 125926789      | 125.926789 J | 0.251853578 W
core 5     | 287569732   | 192306015   | 95263717       | 95.263717 J  | 0.190527434 W
core 6     | 185909622   | 120556152   | 65353470       | 65.35347 J   | 0.13070694 W
core 7     | 129932006   | 95385940    | 34546066       | 34.546066 J  | 0.069092132 W
Socket 0   | 28399099655 | 24799819931 | 3599279724     | 3599.27972 J | 7.198559448 W
Power consumed by socket: 7.198559448 W
Sum of power consumed by cores: 2.212968 W

Can you confirm this?
>
> Alexander
Guenter Roeck May 27, 2020, 1:28 p.m. UTC | #7
On 5/27/20 3:35 AM, Naveen Krishna Ch wrote:
> Hi Alexander
> 
> On Wed, 27 May 2020 at 12:29, Alexander Monakov <amonakov@ispras.ru> wrote:
>>
>> On Wed, 27 May 2020, Naveen Krishna Ch wrote:
>>
>>> These registers are 32bit counters, they might wrap-around quite faster at
>>> high work loads. So, we used a kernel thread to accumulate the values of
>>> each core and socket to 64bit values.
>>>
>>> Depending on when the module is inserted in the system, the initial values
>>> of the counters could be different and we do not have a way to know, how
>>> many time the registers are wrapped around in the past.
>>
>> I understand that. If you anticipate that the module may be inserted after a
>> wraparound, the driver should populate 'prev_value' with actual counter
>> values instead of zeros. That way the driver will properly accumulate
>> energy over time it's been inserted. As implemented, the driver counts
>> energy since boot time, minus unknown amount lost to wraparounds if the
>> driver was loaded too late.
> No problem if this module is built into the kernel.
> If this module is inserted at later point, unless the user keeps the
> counters since
> the boot and provide it as an input during the module insert (we can
> implement this).

I won't accept such a parameter. If you may recall, I did bring this up as
reason why I abandoned the matching change for the coretemp driver, predicting
that people would complain about it. Now they do. Not surprising. We (and you)
will have to live with it.

Guenter
Alexander Monakov May 27, 2020, 2:08 p.m. UTC | #8
On Wed, 27 May 2020, Guenter Roeck wrote:

> >> I understand that. If you anticipate that the module may be inserted after a
> >> wraparound, the driver should populate 'prev_value' with actual counter
> >> values instead of zeros. That way the driver will properly accumulate
> >> energy over time it's been inserted. As implemented, the driver counts
> >> energy since boot time, minus unknown amount lost to wraparounds if the
> >> driver was loaded too late.
> > No problem if this module is built into the kernel.
> > If this module is inserted at later point, unless the user keeps the
> > counters since
> > the boot and provide it as an input during the module insert (we can
> > implement this).
> 
> I won't accept such a parameter. If you may recall, I did bring this up as
> reason why I abandoned the matching change for the coretemp driver, predicting
> that people would complain about it. Now they do. Not surprising. We (and you)
> will have to live with it.

I'm not exactly complaining, I'm proposing a solution: at probe time, populate
prev_value members with MSR values instead of zeros. That way, the module will
correctly count energy over the time it's been loaded. It can be unloaded and
reloaded freely, and doing so would allow to easily measure energy across large
spans of time, which sounds like an improvement.

I can try to cook a corresponding patch if that sounds alright.

Cheers.
Alexander
Naveen Krishna Ch May 27, 2020, 2:42 p.m. UTC | #9
Hi Alexander

On Wed, 27 May 2020 at 19:38, Alexander Monakov <amonakov@ispras.ru> wrote:
>
> On Wed, 27 May 2020, Guenter Roeck wrote:
>
> > >> I understand that. If you anticipate that the module may be inserted after a
> > >> wraparound, the driver should populate 'prev_value' with actual counter
> > >> values instead of zeros. That way the driver will properly accumulate
> > >> energy over time it's been inserted. As implemented, the driver counts
> > >> energy since boot time, minus unknown amount lost to wraparounds if the
> > >> driver was loaded too late.
> > > No problem if this module is built into the kernel.
> > > If this module is inserted at later point, unless the user keeps the
> > > counters since
> > > the boot and provide it as an input during the module insert (we can
> > > implement this).
> >
> > I won't accept such a parameter. If you may recall, I did bring this up as
> > reason why I abandoned the matching change for the coretemp driver, predicting
> > that people would complain about it. Now they do. Not surprising. We (and you)
> > will have to live with it.
Yes, I agree — letting the user provide initial values is not appropriate. My bad.

>
> I'm not exactly complaining, I'm proposing a solution: at probe time, populate
> prev_value members with MSR values instead of zeros. That way, the module will
> correctly count energy over the time it's been loaded. It can be unloaded and
> reloaded freely, and doing so would allow to easily measure energy across large
> spans of time, which sounds like an improvement.

In the current driver, the accumulation thread starts during the probe and the
prev_value of sockets and the core0 is updated with the current MSR value.
It takes (nr_cpus - 1) iterations of the thread to populate the prev_value for
the other cores.

By populating prev_value for all the cores during probe, we can certainly
save some information (values within one wrap-around at best). If this
information is useful, I can submit a patch to do so. Guenter, could you
advise us on this?

>
> I can try to cook a corresponding patch if that sounds alright.
>
> Cheers.
> Alexander
Guenter Roeck May 27, 2020, 2:48 p.m. UTC | #10
On Wed, May 27, 2020 at 05:08:55PM +0300, Alexander Monakov wrote:
> On Wed, 27 May 2020, Guenter Roeck wrote:
> 
> > >> I understand that. If you anticipate that the module may be inserted after a
> > >> wraparound, the driver should populate 'prev_value' with actual counter
> > >> values instead of zeros. That way the driver will properly accumulate
> > >> energy over time it's been inserted. As implemented, the driver counts
> > >> energy since boot time, minus unknown amount lost to wraparounds if the
> > >> driver was loaded too late.
> > > No problem if this module is built into the kernel.
> > > If this module is inserted at later point, unless the user keeps the
> > > counters since
> > > the boot and provide it as an input during the module insert (we can
> > > implement this).
> > 
> > I won't accept such a parameter. If you may recall, I did bring this up as
> > reason why I abandoned the matching change for the coretemp driver, predicting
> > that people would complain about it. Now they do. Not surprising. We (and you)
> > will have to live with it.
> 
> I'm not exactly complaining, I'm proposing a solution: at probe time, populate
> prev_value members with MSR values instead of zeros. That way, the module will
> correctly count energy over the time it's been loaded. It can be unloaded and
> reloaded freely, and doing so would allow to easily measure energy across large
> spans of time, which sounds like an improvement.
> 
That would ignore energy accumulated from before the driver was loaded, and
would thus trigger another set of complaints.

A slight improvement might be to add up core energy counters when loading
the driver, compare it against the package counter, and pick the larger
value for the initial package counter(s). This would at least ensure that
the package counter is never less than the sum of the core counters.

Guenter

> I can try to cook a corresponding patch if that sounds alright.
> 
> Cheers.
> Alexander
Alexander Monakov May 27, 2020, 3:12 p.m. UTC | #11
On Wed, 27 May 2020, Guenter Roeck wrote:

> > I'm not exactly complaining, I'm proposing a solution: at probe time, populate
> > prev_value members with MSR values instead of zeros. That way, the module will
> > correctly count energy over the time it's been loaded. It can be unloaded and
> > reloaded freely, and doing so would allow to easily measure energy across large
> > spans of time, which sounds like an improvement.
> > 
> That would ignore energy accumulated from before the driver was loaded, and
> would thus trigger another set of complaints.

That doesn't sound right. There's no way for the driver to be sure that the
counters did not wrap around before it was loaded. Here's a few scenarios
how such wraparound is possible:

- while the user was messing in the bootloader for a few minutes
- if the user kexec'd a new kernel
- if the counters were not reset during a warm reboot

Ignoring initial values of the counters is generally the right thing to do.
In the specific circumstances when the user wants to measure energy used
since machine power-up, and they know the boot happened so quick the counters
did not wrap around, they can easily script that with e.g. the rdmsr tool.
Or perhaps the driver could pr_info the initial values at probe time.

Have such complaints already appeared in practice?

Also note that documentation doesn't promise that counters start from zero
at power-up time, although that's of course a natural assumption.


> A slight improvement might be to add up core energy counters when loading
> the driver, compare it against the package counter, and pick the larger
> value for the initial package counter(s). This would at least ensure that
> the package counter is never less than the sum of the core counters.

No, fudging the initial reading like this wouldn't help, because I was
pointing out how core counters increment quicker than the package counter;
i.e. even if the kernel fudged the initial values, they would still grow
contradictory quick enough (on some workloads).

Alexander
Guenter Roeck May 27, 2020, 3:15 p.m. UTC | #12
On 5/27/20 7:42 AM, Naveen Krishna Ch wrote:
> Hi Alexander
> 
> On Wed, 27 May 2020 at 19:38, Alexander Monakov <amonakov@ispras.ru> wrote:
>>
>> On Wed, 27 May 2020, Guenter Roeck wrote:
>>
>>>>> I understand that. If you anticipate that the module may be inserted after a
>>>>> wraparound, the driver should populate 'prev_value' with actual counter
>>>>> values instead of zeros. That way the driver will properly accumulate
>>>>> energy over time it's been inserted. As implemented, the driver counts
>>>>> energy since boot time, minus unknown amount lost to wraparounds if the
>>>>> driver was loaded too late.
>>>> No problem if this module is built into the kernel.
>>>> If this module is inserted at later point, unless the user keeps the
>>>> counters since
>>>> the boot and provide it as an input during the module insert (we can
>>>> implement this).
>>>
>>> I won't accept such a parameter. If you may recall, I did bring this up as
>>> reason why I abandoned the matching change for the coretemp driver, predicting
>>> that people would complain about it. Now they do. Not surprising. We (and you)
>>> will have to live with it.
> Yes agree, letting user provide initial values is not appropriate. My bad.
> 
>>
>> I'm not exactly complaining, I'm proposing a solution: at probe time, populate
>> prev_value members with MSR values instead of zeros. That way, the module will
>> correctly count energy over the time it's been loaded. It can be unloaded and
>> reloaded freely, and doing so would allow to easily measure energy across large
>> spans of time, which sounds like an improvement.
> 
> In the current driver, the accumulation thread starts during the probe and the
> prev_value of sockets and the core0 is updated with the current MSR value.
> It takes (nr_cpus - 1) iterations of the thread to populate the prev_value for
> the other cores.
> 
> By populating prev_value of all the cores during probe, we can
> certainly save some
> information (values with in one wrap-around at the best). If this
> information is
> useful, i can submit a patch to do so. Guenter, could you suggest us on this.
> 

As long as you don't ignore counter values from before loading the driver,
sure, go ahead.

Guenter

>>
>> I can try to cook a corresponding patch if that sounds alright.
>>
>> Cheers.
>> Alexander
> 
> 
>
Alexander Monakov May 27, 2020, 3:25 p.m. UTC | #13
On Wed, 27 May 2020, Guenter Roeck wrote:

> > In the current driver, the accumulation thread starts during the probe and the
> > prev_value of sockets and the core0 is updated with the current MSR value.
> > It takes (nr_cpus - 1) iterations of the thread to populate the prev_value for
> > the other cores.
> > 
> > By populating prev_value of all the cores during probe, we can
> > certainly save some
> > information (values with in one wrap-around at the best). If this
> > information is
> > useful, i can submit a patch to do so. Guenter, could you suggest us on this.
> > 
> 
> As long as you don't ignore counter values from before loading the driver,
> sure, go ahead.

Hm? If I'm understanding correctly what Naveen is proposing, that would simply
move reading the initial values ahead by a few hundred cycles (the latency to
start the thread). It wouldn't change anything about the issue, and make the 
code more complicated :(

Alexander
Guenter Roeck May 27, 2020, 3:33 p.m. UTC | #14
On 5/27/20 8:12 AM, Alexander Monakov wrote:
> On Wed, 27 May 2020, Guenter Roeck wrote:
> 
>>> I'm not exactly complaining, I'm proposing a solution: at probe time, populate
>>> prev_value members with MSR values instead of zeros. That way, the module will
>>> correctly count energy over the time it's been loaded. It can be unloaded and
>>> reloaded freely, and doing so would allow to easily measure energy across large
>>> spans of time, which sounds like an improvement.
>>>
>> That would ignore energy accumulated from before the driver was loaded, and
>> would thus trigger another set of complaints.
> 
> That doesn't sound right. There's no way for the driver to be sure that the
> counters did not wrap around before it was loaded. Here's a few scenarios
> how such wraparound is possible:
> 
> - while the user was messing in the bootloader for a few minutes
> - if the user kexec'd a new kernel
> - if the counters were not reset during a warm reboot
> 
> Ignoring initial values of the counters is generally the right thing to do.
> In the specific circumstances when the user wants to measure energy used
> since machine power-up, and they know the boot happened so quick the counters
> did not wrap around, they can easily script that with e.g. the rdmsr tool.
> Or perhaps the driver could pr_info the initial values at probe time.
> 
> Have such complaints already appeared in practice?
>
The main argument in the coretemp case, if I recall correctly, was that
the driver _doesn't_ provide valid data from the beginning of time, and
would thus be worthless. So, yes, such complaints have already appeared
in practice.

> Also note that documentation doesn't promise that counters start from zero
> at power-up time, although that's of course a natural assumption.
> 
> 
>> A slight improvement might be to add up core energy counters when loading
>> the driver, compare it against the package counter, and pick the larger
>> value for the initial package counter(s). This would at least ensure that
>> the package counter is never less than the sum of the core counters.
> 
> No, fudging the initial reading like this wouldn't help, because I was
> pointing out how core counters increment quicker than the package counter;
> i.e. even if the kernel fudged the initial values, they would still grow
> contradictory quick enough (on some workloads).
> 

This exchange is exactly what I was concerned about when this driver
was first submitted. I should have known better, and I should not
have accepted it. Right now I seriously wonder if I should revert/drop
it. Any arguments/thoughts why I _shouldn't_ do that ?

Guenter
Guenter Roeck May 27, 2020, 3:35 p.m. UTC | #15
On 5/27/20 8:25 AM, Alexander Monakov wrote:
> On Wed, 27 May 2020, Guenter Roeck wrote:
> 
>>> In the current driver, the accumulation thread starts during the probe and the
>>> prev_value of sockets and the core0 is updated with the current MSR value.
>>> It takes (nr_cpus - 1) iterations of the thread to populate the prev_value for
>>> the other cores.
>>>
>>> By populating prev_value of all the cores during probe, we can
>>> certainly save some
>>> information (values with in one wrap-around at the best). If this
>>> information is
>>> useful, i can submit a patch to do so. Guenter, could you suggest us on this.
>>>
>>
>> As long as you don't ignore counter values from before loading the driver,
>> sure, go ahead.
> 
> Hm? If I'm understanding correctly what Naveen is proposing, that would simply
> move reading the initial values ahead by a few hundred cycles (the latency to
> start the thread). It wouldn't change anything about the issue, and make the 
> code more complicated :(
> 

At this point the question is if I should drop this driver entirely.

Guenter
Alexander Monakov May 27, 2020, 4:41 p.m. UTC | #16
On Wed, 27 May 2020, Guenter Roeck wrote:

> This exchange is exactly what I was concerned about when this driver
> was first submitted. I should have known better, and I should not
> have accepted it. Right now I seriously wonder if I should revert/drop
> it. Any arguments/thoughts why I _shouldn't_ do that ?

Let me apologize and explain my perspective.

These AMD MSRs have been previously wired up in the turbostat tool, and
very recently another developer submitted a patch to wire up the package
energy counter MSR for use with perf.

Unlike the above, this driver is submitted by AMD. As I have noticed a
substantial issue (sum of core counters contradicting the package counter),
I have attempted to report it in this thread. Since AMD is submitting the
code, I was hoping to get their attention to the issue, and ideally get
some explanations about how the counters work and to what extent we can
expect them to be accurate.

I think most of the discussion about (not) ignoring initial counter
values was in part caused by misunderstanding exactly what issue I was
reporting. After all, it's not so important if the driver accurately
captures boot-time energy use, if the counters are not trustworthy.

I don't have an answer to your question (whether you should keep the
driver). I hope you see where I'm coming from. I'm not quite aware of
the history with coretemp driver, so if all this caused you extra
headaches, I apologize for my part in the mess.

Alexander
Guenter Roeck May 27, 2020, 4:57 p.m. UTC | #17
On 5/27/20 9:41 AM, Alexander Monakov wrote:
> On Wed, 27 May 2020, Guenter Roeck wrote:
> 
>> This exchange is exactly what I was concerned about when this driver
>> was first submitted. I should have known better, and I should not
>> have accepted it. Right now I seriously wonder if I should revert/drop
>> it. Any arguments/thoughts why I _shouldn't_ do that ?
> 
> Let me apologize and explain my perspective.
> 
> These AMD MSRs have been previously wired up in the turbostat tool, and
> very recently another developer submitted a patch to wire up the package
> energy counter MSR for use with perf.
> 
> Unlike the above, this driver is submitted by AMD. As I have noticed a
> substantial issue (sum of core counters contradicting the package counter),
> I have attempted to report it in this thread. Since AMD is submitting the
> code, I was hoping to get their attention to the issue, and ideally get
> some explanations about how the counters work and to what extent we can
> expect them to be accurate.
> 
> I think most of the discussion about (not) ignoring initial counter
> values was in part caused by misunderstanding exactly what issue I was
> reporting. After all, it's not so important if the driver accurately
> captures boot-time energy use, if the counters are not trustworthy.
> 
> I don't have an answer to your question (whether you should keep the
> driver). I hope you see where I'm coming from. I'm not quite aware of
> the history with coretemp driver, so if all this caused you extra
> headaches, I apologize for my part in the mess.
> 

There are two opposing arguments:

- The driver can not guarantee that there are no previous overflows,
  thus it should always start counting from 0.
- The driver is in all typical and most common situations loaded
  when the system boots, when there have been no overflows. Therefore,
  it should include startup counter values to provide accurate
  information for those most common use cases.

My prediction was that we would see endless arguments about this,
with one set of people arguing one way, another set of people
arguing the other way, and both being extremely passionate
about it. You have already proven my point.

This is a perfect lose-lose situation, with me sitting in the
middle. In such situations my reaction tends to be to pull
the plug.

Guenter
Alexander Monakov May 27, 2020, 4:59 p.m. UTC | #18
On Wed, 27 May 2020, Naveen Krishna Ch wrote:

> I tried on an octa core machine, must be Renoir
> vendor_id       : AuthenticAMD
> cpu family      : 23
> model           : 96

Yes, same here. The only difference I'm on a 6-core 4500U.
 
> On an idle system over a period of 500secs:
[snip]
> Power consumed by socket: 4.53539685 W
> Sum of power consumed by cores: 0.010953W

Yes, no problem for idle readings. I'm getting 2W/package on AC and
close to 0W on battery.


> On an system with AVX test running over a period of 500 secs:
> 
>                  At t=500sec    | At t= 0           | Diff of energy
>  | in Joules        | Power in Watts
> core 0     | 649348495     | 413687530    | 235660965       |
> 235.660965   | 0.47132193
> core 1     | 506880081     | 294882827    | 211997254       |
> 211.997254    | 0.423994508
> core 2     | 454804046     | 271046127    | 183757919       |
> 183.757919    | 0.367515838
> core 3     | 391508712     | 237531021    | 153977691       |
> 153.977691    | 0.307955382
> core 4     | 410336868     | 284410079    | 125926789       |
> 125.926789    | 0.251853578
> core 5     | 287569732     | 192306015    | 95263717         |
> 95.263717      | 0.190527434
> core 6     | 185909622     | 120556152    | 65353470         |
> 65.35347        | 0.13070694
> core 7     | 129932006     | 95385940      | 34546066         |
> 34.546066      | 0.069092132
> Socket 0 | 28399099655 |  24799819931 3599279724     | 3599.27972    |
> 7.198559448
> Power consumed by socket: 7.198559448 W
> Sum of power consumed by cores:  2.212968 W
> 
> Can you confirm this.

No, this isn't right. The workload is not stressing all cores, you're
reporting 235 J for core 0 and only 35 J for core 7.

If you are running the avx-turbo test, please run it like this:

  ./avx-turbo --iters 10000000 --spec avx256_fma_t/8

This will run AVX code on all 8 cores for about 1 second. To run it for longer,
increase the --iters argument.

Thanks.
Alexander
Alexander Monakov May 27, 2020, 5:12 p.m. UTC | #19
On Wed, 27 May 2020, Guenter Roeck wrote:

> My prediction was that we would see endless arguments about this,
> with one set of people arguing one way, another set of people
> arguing the other way, and both being extremely passionate
> about it. You have already proven my point.

Sorry :/  Personally I'd be okay with either approach. I even think
it's possible to find a middle ground by mentioning boot-time energy
use with pr_info or such at probe time, and then counting from zero.

My main goal here is to get answers about contradictory counters.

Cheers.
Alexander
Naveen Krishna Ch May 28, 2020, 4:11 a.m. UTC | #20
Hi Alexander

On Wed, 27 May 2020 at 22:29, Alexander Monakov <amonakov@ispras.ru> wrote:
>
> On Wed, 27 May 2020, Naveen Krishna Ch wrote:
>
> > I tried on an octa core machine, must be Renoir
> > vendor_id       : AuthenticAMD
> > cpu family      : 23
> > model           : 96
>
> Yes, same here. The only difference I'm on a 6-core 4500U.
>
> > On an idle system over a period of 500secs:
> [snip]
> > Power consumed by socket: 4.53539685 W
> > Sum of power consumed by cores: 0.010953W
>
> Yes, no problem for idle readings. I'm getting 2W/package on AC and
> close to 0W on battery.
>
>
> > On an system with AVX test running over a period of 500 secs:
> >
> >                  At t=500sec    | At t= 0           | Diff of energy
> >  | in Joules        | Power in Watts
> > core 0     | 649348495     | 413687530    | 235660965       |
> > 235.660965   | 0.47132193
> > core 1     | 506880081     | 294882827    | 211997254       |
> > 211.997254    | 0.423994508
> > core 2     | 454804046     | 271046127    | 183757919       |
> > 183.757919    | 0.367515838
> > core 3     | 391508712     | 237531021    | 153977691       |
> > 153.977691    | 0.307955382
> > core 4     | 410336868     | 284410079    | 125926789       |
> > 125.926789    | 0.251853578
> > core 5     | 287569732     | 192306015    | 95263717         |
> > 95.263717      | 0.190527434
> > core 6     | 185909622     | 120556152    | 65353470         |
> > 65.35347        | 0.13070694
> > core 7     | 129932006     | 95385940      | 34546066         |
> > 34.546066      | 0.069092132
> > Socket 0 | 28399099655 |  24799819931 3599279724     | 3599.27972    |
> > 7.198559448
> > Power consumed by socket: 7.198559448 W
> > Sum of power consumed by cores:  2.212968 W
> >
> > Can you confirm this.
>
> No, this isn't right. The workload is not stressing all cores, you're
> reporting 235 J for core 0 and only 35 J for core 7.
>
> If you are running the avx-turbo test, please run it like this:
>
>   ./avx-turbo --iters 10000000 --spec avx256_fma_t/8
>
> This will run AVX code on all 8 cores for about 1 second. To run it for longer,
> increase the --iters argument.
I ran with all cores loaded now, and I see that the sum of the energy
deltas of all cores is greater than the socket energy delta over
100 secs.

Kindly give me time; I will come back with an explanation or a solution.
>
> Thanks.
> Alexander
Alexander Monakov June 10, 2020, 8:21 p.m. UTC | #21
On Thu, 28 May 2020, Naveen Krishna Ch wrote:

> > No, this isn't right. The workload is not stressing all cores, you're
> > reporting 235 J for core 0 and only 35 J for core 7.
> >
> > If you are running the avx-turbo test, please run it like this:
> >
> >   ./avx-turbo --iters 10000000 --spec avx256_fma_t/8
> >
> > This will run AVX code on all 8 cores for about 1 second. To run it for longer,
> > increase the --iters argument.
> I ran with all cores loaded now and i see that
> sum of diff of energy consumed by all cores is greater than the diff of
> socket energy consumed in 100secs.
> 
> Kindly, give me time. I will come back with some explanation or solution.

Hello. Is there any update on this issue?

Thanks.
Alexander
Chatradhi, Naveen Krishna June 16, 2020, 2:46 p.m. UTC | #22
[AMD Public Use]

Hi Alexander,

The issue does not seem to be in the driver, it is in the register values populated by the firmware.
We have tested this in the server platforms and it works fine.
I've raised an issue internally to analyze and provide a solution for client systems.
I'm still waiting for their update. 

Guenter,
This issue is seen on client machines with models 0x40 and above. Should I submit a patch
to support only the server-specific models 0x0 to 0x3f until we triage the issue for client systems?

Regards,
Naveen
-----Original Message-----
From: Alexander Monakov <amonakov@ispras.ru> 
Sent: Thursday, June 11, 2020 1:52 AM
To: Naveen Krishna Ch <naveenkrishna.ch@gmail.com>
Cc: Guenter Roeck <linux@roeck-us.net>; Chatradhi, Naveen Krishna <NaveenKrishna.Chatradhi@amd.com>; linux-hwmon@vger.kernel.org
Subject: Re: [PATCH 1/3 v7] hwmon: Add amd_energy driver to report energy counters

[CAUTION: External Email]

On Thu, 28 May 2020, Naveen Krishna Ch wrote:

> > No, this isn't right. The workload is not stressing all cores, 
> > you're reporting 235 J for core 0 and only 35 J for core 7.
> >
> > If you are running the avx-turbo test, please run it like this:
> >
> >   ./avx-turbo --iters 10000000 --spec avx256_fma_t/8
> >
> > This will run AVX code on all 8 cores for about 1 second. To run it 
> > for longer, increase the --iters argument.
> I ran with all cores loaded now and i see that sum of diff of energy 
> consumed by all cores is greater than the diff of socket energy 
> consumed in 100secs.
>
> Kindly, give me time. I will come back with some explanation or solution.

Hello. Is there any update on this issue?

Thanks.
Alexander
Guenter Roeck June 16, 2020, 2:53 p.m. UTC | #23
On 6/16/20 7:46 AM, Chatradhi, Naveen Krishna wrote:
> [AMD Public Use]
> 
> Hi Alexander,
> 
> The issue does not seem to be in the driver, it is in the register values populated by the firmware.
> We have tested this in the server platforms and it works fine.
> I've raised an issue internally to analyze and provide a solution for client systems.
> I'm still waiting for their update. 
> 
> Guenter,
> This issue is seen on client machines model ranging from 0x40 and above. Should I submit a patch
> to support only server specific models 0x0 to 0x3f until we triage the issue for client systems.
> 

Yes, I think that would make sense.

Thanks,
Guenter
Naveen Krishna Ch June 16, 2020, 2:57 p.m. UTC | #24
Hi Guenter

On Tue, 16 Jun 2020 at 20:23, Guenter Roeck <linux@roeck-us.net> wrote:
>
> On 6/16/20 7:46 AM, Chatradhi, Naveen Krishna wrote:
> > [AMD Public Use]
> >
> > Hi Alexander,
> >
> > The issue does not seem to be in the driver, it is in the register values populated by the firmware.
> > We have tested this in the server platforms and it works fine.
> > I've raised an issue internally to analyze and provide a solution for client systems.
> > I'm still waiting for their update.
> >
> > Guenter,
> > This issue is seen on client machines model ranging from 0x40 and above. Should I submit a patch
> > to support only server specific models 0x0 to 0x3f until we triage the issue for client systems.
> >
>
> Yes, I think that would make sense.
Sure, I will submit that. Thanks for the prompt response
>
> Thanks,
> Guenter
diff mbox series

Patch

diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig
index 4c62f900bf7e..e165e10c49ef 100644
--- a/drivers/hwmon/Kconfig
+++ b/drivers/hwmon/Kconfig
@@ -324,6 +324,16 @@  config SENSORS_FAM15H_POWER
 	  This driver can also be built as a module. If so, the module
 	  will be called fam15h_power.
 
+config SENSORS_AMD_ENERGY
+	tristate "AMD RAPL MSR based Energy driver"
+	depends on X86
+	help
+	  If you say yes here you get support for core and package energy
+	  sensors, based on RAPL MSR for AMD family 17h and above CPUs.
+
+	  This driver can also be built as a module. If so, the module
+	  will be called as amd_energy.
+
 config SENSORS_APPLESMC
 	tristate "Apple SMC (Motion sensor, light sensor, keyboard backlight)"
 	depends on INPUT && X86
diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile
index b0b9c8e57176..318f89dc7133 100644
--- a/drivers/hwmon/Makefile
+++ b/drivers/hwmon/Makefile
@@ -45,6 +45,7 @@  obj-$(CONFIG_SENSORS_ADT7411)	+= adt7411.o
 obj-$(CONFIG_SENSORS_ADT7462)	+= adt7462.o
 obj-$(CONFIG_SENSORS_ADT7470)	+= adt7470.o
 obj-$(CONFIG_SENSORS_ADT7475)	+= adt7475.o
+obj-$(CONFIG_SENSORS_AMD_ENERGY) += amd_energy.o
 obj-$(CONFIG_SENSORS_APPLESMC)	+= applesmc.o
 obj-$(CONFIG_SENSORS_ARM_SCMI)	+= scmi-hwmon.o
 obj-$(CONFIG_SENSORS_ARM_SCPI)	+= scpi-hwmon.o
diff --git a/drivers/hwmon/amd_energy.c b/drivers/hwmon/amd_energy.c
new file mode 100644
index 000000000000..bc8b643a37d5
--- /dev/null
+++ b/drivers/hwmon/amd_energy.c
@@ -0,0 +1,406 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * Copyright (C) 2020 Advanced Micro Devices, Inc.
+ */
+#include <asm/cpu_device_id.h>
+
+#include <linux/bits.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/hwmon.h>
+#include <linux/kernel.h>
+#include <linux/kthread.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/processor.h>
+#include <linux/platform_device.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/topology.h>
+#include <linux/types.h>
+
+#define DRVNAME			"amd_energy"
+
+#define ENERGY_PWR_UNIT_MSR	0xC0010299
+#define ENERGY_CORE_MSR		0xC001029A
+#define ENERGY_PKG_MSR		0xC001029B
+
+#define AMD_ENERGY_UNIT_MASK	0x01F00
+#define AMD_ENERGY_MASK		0xFFFFFFFF
+
+struct sensor_accumulator {
+	u64 energy_ctr;
+	u64 prev_value;
+	char label[10];
+};
+
+struct amd_energy_data {
+	struct hwmon_channel_info energy_info;
+	const struct hwmon_channel_info *info[2];
+	struct hwmon_chip_info chip;
+	struct task_struct *wrap_accumulate;
+	/* Lock around the accumulator */
+	struct mutex lock;
+	/* An accumulator for each core and socket */
+	struct sensor_accumulator *accums;
+	/* Energy Status Units */
+	u64 energy_units;
+	int nr_cpus;
+	int nr_socks;
+	int core_id;
+};
+
+static int amd_energy_read_labels(struct device *dev,
+				  enum hwmon_sensor_types type,
+				  u32 attr, int channel,
+				  const char **str)
+{
+	struct amd_energy_data *data = dev_get_drvdata(dev);
+
+	*str = data->accums[channel].label;
+	return 0;
+}
+
+static void get_energy_units(struct amd_energy_data *data)
+{
+	u64 rapl_units;
+
+	rdmsrl_safe(ENERGY_PWR_UNIT_MSR, &rapl_units);
+	data->energy_units = (rapl_units & AMD_ENERGY_UNIT_MASK) >> 8;
+}
+
+static void accumulate_socket_delta(struct amd_energy_data *data,
+				    int sock, int cpu)
+{
+	struct sensor_accumulator *s_accum;
+	u64 input;
+
+	mutex_lock(&data->lock);
+	rdmsrl_safe_on_cpu(cpu, ENERGY_PKG_MSR, &input);
+	input &= AMD_ENERGY_MASK;
+
+	s_accum = &data->accums[data->nr_cpus + sock];
+	if (input >= s_accum->prev_value)
+		s_accum->energy_ctr +=
+			input - s_accum->prev_value;
+	else
+		s_accum->energy_ctr += UINT_MAX -
+			s_accum->prev_value + input;
+
+	s_accum->prev_value = input;
+	mutex_unlock(&data->lock);
+}
+
+static void accumulate_core_delta(struct amd_energy_data *data)
+{
+	struct sensor_accumulator *c_accum;
+	u64 input;
+	int cpu;
+
+	mutex_lock(&data->lock);
+	if (data->core_id >= data->nr_cpus)
+		data->core_id = 0;
+
+	cpu = data->core_id;
+
+	if (!cpu_online(cpu))
+		goto out;
+
+	rdmsrl_safe_on_cpu(cpu, ENERGY_CORE_MSR, &input);
+	input &= AMD_ENERGY_MASK;
+
+	c_accum = &data->accums[cpu];
+
+	if (input >= c_accum->prev_value)
+		c_accum->energy_ctr +=
+			input - c_accum->prev_value;
+	else
+		c_accum->energy_ctr += UINT_MAX -
+			c_accum->prev_value + input;
+
+	c_accum->prev_value = input;
+
+out:
+	data->core_id++;
+	mutex_unlock(&data->lock);
+}
+
+static void read_accumulate(struct amd_energy_data *data)
+{
+	int sock;
+
+	for (sock = 0; sock < data->nr_socks; sock++) {
+		int cpu;
+
+		cpu = cpumask_first_and(cpu_online_mask,
+					cpumask_of_node(sock));
+
+		accumulate_socket_delta(data, sock, cpu);
+	}
+
+	accumulate_core_delta(data);
+}
+
+static void amd_add_delta(struct amd_energy_data *data, int ch,
+			  int cpu, long *val, bool is_core)
+{
+	struct sensor_accumulator *s_accum, *c_accum;
+	u64 input;
+
+	mutex_lock(&data->lock);
+	if (!is_core) {
+		rdmsrl_safe_on_cpu(cpu, ENERGY_PKG_MSR, &input);
+		input &= AMD_ENERGY_MASK;
+
+		s_accum = &data->accums[ch];
+		if (input >= s_accum->prev_value)
+			input += s_accum->energy_ctr -
+				  s_accum->prev_value;
+		else
+			input += UINT_MAX - s_accum->prev_value +
+				  s_accum->energy_ctr;
+	} else {
+		rdmsrl_safe_on_cpu(cpu, ENERGY_CORE_MSR, &input);
+		input &= AMD_ENERGY_MASK;
+
+		c_accum = &data->accums[ch];
+		if (input >= c_accum->prev_value)
+			input += c_accum->energy_ctr -
+				 c_accum->prev_value;
+		else
+			input += UINT_MAX - c_accum->prev_value +
+				 c_accum->energy_ctr;
+	}
+
+	/* Energy consumed = (1/(2^ESU) * RAW * 1000000UL) μJoules */
+	*val = div64_ul(input * 1000000UL, BIT(data->energy_units));
+
+	mutex_unlock(&data->lock);
+}
+
+static int amd_energy_read(struct device *dev,
+			   enum hwmon_sensor_types type,
+			   u32 attr, int channel, long *val)
+{
+	struct amd_energy_data *data = dev_get_drvdata(dev);
+	int cpu;
+
+	if (channel >= data->nr_cpus) {
+		cpu = cpumask_first_and(cpu_online_mask,
+					cpumask_of_node
+					(channel - data->nr_cpus));
+		amd_add_delta(data, channel, cpu, val, false);
+	} else {
+		cpu = channel;
+		if (!cpu_online(cpu))
+			return -ENODEV;
+
+		amd_add_delta(data, channel, cpu, val, true);
+	}
+
+	return 0;
+}
+
+static umode_t amd_energy_is_visible(const void *_data,
+				     enum hwmon_sensor_types type,
+				     u32 attr, int channel)
+{
+	return 0444;
+}
+
+static int energy_accumulator(void *p)
+{
+	struct amd_energy_data *data = (struct amd_energy_data *)p;
+
+	while (!kthread_should_stop()) {
+		/*
+		 * Ignoring the conditions such as
+		 * cpu being offline or rdmsr failure
+		 */
+		read_accumulate(data);
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (kthread_should_stop())
+			break;
+
+		/*
+		 * On a 240W system, with default resolution the
+		 * Socket Energy status register may wrap around in
+		 * 2^32*15.3 e-6/240 = 273.8041 secs (~4.5 mins)
+		 *
+		 * let us accumulate for every 100secs
+		 */
+		schedule_timeout(msecs_to_jiffies(100000));
+	}
+	return 0;
+}
+
+static const struct hwmon_ops amd_energy_ops = {
+	.is_visible = amd_energy_is_visible,
+	.read = amd_energy_read,
+	.read_string = amd_energy_read_labels,
+};
+
+static int amd_create_sensor(struct device *dev,
+			     struct amd_energy_data *data,
+			     u8 type, u32 config)
+{
+	struct hwmon_channel_info *info = &data->energy_info;
+	struct sensor_accumulator *accums;
+	int i, num_siblings, cpus, sockets;
+	u32 *s_config;
+
+	/* Identify the number of siblings per core */
+	num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1;
+
+	sockets = num_possible_nodes();
+
+	/*
+	 * Energy counter register is accessed at core level.
+	 * Hence, filterout the siblings.
+	 */
+	cpus = num_present_cpus() / num_siblings;
+
+	s_config = devm_kcalloc(dev, cpus + sockets,
+				sizeof(u32), GFP_KERNEL);
+	if (!s_config)
+		return -ENOMEM;
+
+	accums = devm_kcalloc(dev, cpus + sockets,
+			      sizeof(struct sensor_accumulator),
+			      GFP_KERNEL);
+	if (!accums)
+		return -ENOMEM;
+
+	info->type = type;
+	info->config = s_config;
+
+	data->nr_cpus = cpus;
+	data->nr_socks = sockets;
+	data->accums = accums;
+
+	for (i = 0; i < cpus + sockets; i++) {
+		s_config[i] = config;
+		if (i < cpus)
+			scnprintf(accums[i].label, 10,
+				  "Ecore%03u", i);
+		else
+			scnprintf(accums[i].label, 10,
+				  "Esocket%u", (i - cpus));
+	}
+
+	return 0;
+}
+
+static int amd_energy_probe(struct platform_device *pdev)
+{
+	struct device *hwmon_dev;
+	struct amd_energy_data *data;
+	struct device *dev = &pdev->dev;
+
+	data = devm_kzalloc(dev,
+			    sizeof(struct amd_energy_data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	data->chip.ops = &amd_energy_ops;
+	data->chip.info = data->info;
+
+	dev_set_drvdata(dev, data);
+	/* Populate per-core energy reporting */
+	data->info[0] = &data->energy_info;
+	amd_create_sensor(dev, data, hwmon_energy,
+			  HWMON_E_INPUT | HWMON_E_LABEL);
+
+	mutex_init(&data->lock);
+	get_energy_units(data);
+
+	hwmon_dev = devm_hwmon_device_register_with_info(dev, DRVNAME,
+							 data,
+							 &data->chip,
+							 NULL);
+	if (IS_ERR(hwmon_dev))
+		return PTR_ERR(hwmon_dev);
+
+	data->wrap_accumulate = kthread_run(energy_accumulator, data,
+					    "%s", dev_name(hwmon_dev));
+	if (IS_ERR(data->wrap_accumulate))
+		return PTR_ERR(data->wrap_accumulate);
+
+	return PTR_ERR_OR_ZERO(data->wrap_accumulate);
+}
+
+static int amd_energy_remove(struct platform_device *pdev)
+{
+	struct amd_energy_data *data = dev_get_drvdata(&pdev->dev);
+
+	if (data && data->wrap_accumulate)
+		kthread_stop(data->wrap_accumulate);
+
+	return 0;
+}
+
+static const struct platform_device_id amd_energy_ids[] = {
+	{ .name = DRVNAME, },
+	{}
+};
+MODULE_DEVICE_TABLE(platform, amd_energy_ids);
+
+static struct platform_driver amd_energy_driver = {
+	.probe = amd_energy_probe,
+	.remove	= amd_energy_remove,
+	.id_table = amd_energy_ids,
+	.driver = {
+		.name = DRVNAME,
+	},
+};
+
+static struct platform_device *amd_energy_platdev;
+
+static const struct x86_cpu_id cpu_ids[] __initconst = {
+	X86_MATCH_VENDOR_FAM(AMD, 0x17, NULL),
+	{}
+};
+MODULE_DEVICE_TABLE(x86cpu, cpu_ids);
+
+static int __init amd_energy_init(void)
+{
+	int ret;
+
+	if (!x86_match_cpu(cpu_ids))
+		return -ENODEV;
+
+	ret = platform_driver_register(&amd_energy_driver);
+	if (ret)
+		return ret;
+
+	amd_energy_platdev = platform_device_alloc(DRVNAME, 0);
+	if (!amd_energy_platdev)
+		return -ENOMEM;
+
+	ret = platform_device_add(amd_energy_platdev);
+	if (ret) {
+		platform_device_put(amd_energy_platdev);
+		platform_driver_unregister(&amd_energy_driver);
+		return ret;
+	}
+
+	return ret;
+}
+
+static void __exit amd_energy_exit(void)
+{
+	platform_device_unregister(amd_energy_platdev);
+	platform_driver_unregister(&amd_energy_driver);
+}
+
+module_init(amd_energy_init);
+module_exit(amd_energy_exit);
+
+MODULE_DESCRIPTION("Driver for AMD Energy reporting from RAPL MSR via HWMON interface");
+MODULE_AUTHOR("Naveen Krishna Chatradhi <nchatrad@amd.com>");
+MODULE_LICENSE("GPL");