diff mbox series

[v6,1/5] drm/xe/hwmon: Expose power attributes

Message ID 20230925081842.3566834-2-badal.nilawar@intel.com (mailing list archive)
State Handled Elsewhere
Headers show
Series Add HWMON support for DGFX | expand

Commit Message

Nilawar, Badal Sept. 25, 2023, 8:18 a.m. UTC
Expose Card reactive sustained (pl1) power limit as power_max and
card default power limit (tdp) as power_rated_max.

v2:
  - Fix review comments (Riana)
v3:
  - Use drmm_mutex_init (Matt Brost)
  - Print error value (Matt Brost)
  - Convert enums to uppercase (Matt Brost)
  - Avoid extra reg read in hwmon_is_visible function (Riana)
  - Use xe_device_assert_mem_access when applicable (Matt Brost)
  - Add intel-xe@lists.freedesktop.org in Documentation (Matt Brost)
v4:
  - Use prefix xe_hwmon prefix for all functions (Matt Brost/Andi)
  - %s/hwmon_reg/xe_hwmon_reg (Andi)
  - Fix review comments (Guenter/Andi)
v5:
  - Fix review comments (Riana)
v6:
  - Use drm_warn in default case (Rodrigo)
  - s/ENODEV/EOPNOTSUPP (Andi)

Acked-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Reviewed-by: Riana Tauro <riana.tauro@intel.com>
Signed-off-by: Badal Nilawar <badal.nilawar@intel.com>
---
 .../ABI/testing/sysfs-driver-intel-xe-hwmon   |  22 ++
 drivers/gpu/drm/xe/Makefile                   |   3 +
 drivers/gpu/drm/xe/regs/xe_gt_regs.h          |   4 +
 drivers/gpu/drm/xe/regs/xe_mchbar_regs.h      |  33 ++
 drivers/gpu/drm/xe/xe_device.c                |   3 +
 drivers/gpu/drm/xe/xe_device_types.h          |   2 +
 drivers/gpu/drm/xe/xe_hwmon.c                 | 357 ++++++++++++++++++
 drivers/gpu/drm/xe/xe_hwmon.h                 |  19 +
 8 files changed, 443 insertions(+)
 create mode 100644 Documentation/ABI/testing/sysfs-driver-intel-xe-hwmon
 create mode 100644 drivers/gpu/drm/xe/regs/xe_mchbar_regs.h
 create mode 100644 drivers/gpu/drm/xe/xe_hwmon.c
 create mode 100644 drivers/gpu/drm/xe/xe_hwmon.h

Comments

Andi Shyti Sept. 25, 2023, 8:58 a.m. UTC | #1
Hi Badal,

On Mon, Sep 25, 2023 at 01:48:38PM +0530, Badal Nilawar wrote:
> Expose Card reactive sustained (pl1) power limit as power_max and
> card default power limit (tdp) as power_rated_max.
> 
> v2:
>   - Fix review comments (Riana)
> v3:
>   - Use drmm_mutex_init (Matt Brost)
>   - Print error value (Matt Brost)
>   - Convert enums to uppercase (Matt Brost)
>   - Avoid extra reg read in hwmon_is_visible function (Riana)
>   - Use xe_device_assert_mem_access when applicable (Matt Brost)
>   - Add intel-xe@lists.freedesktop.org in Documentation (Matt Brost)
> v4:
>   - Use prefix xe_hwmon prefix for all functions (Matt Brost/Andi)
>   - %s/hwmon_reg/xe_hwmon_reg (Andi)
>   - Fix review comments (Guenter/Andi)
> v5:
>   - Fix review comments (Riana)
> v6:
>   - Use drm_warn in default case (Rodrigo)
>   - s/ENODEV/EOPNOTSUPP (Andi)
> 
> Acked-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
> Reviewed-by: Riana Tauro <riana.tauro@intel.com>
> Signed-off-by: Badal Nilawar <badal.nilawar@intel.com>

looks good to me:

Reviewed-by: Andi Shyti <andi.shyti@linux.intel.com> 

Andi
Dixit, Ashutosh Sept. 27, 2023, 4:45 a.m. UTC | #2
On Mon, 25 Sep 2023 01:18:38 -0700, Badal Nilawar wrote:
>

Hi Badal,

> +static int xe_hwmon_process_reg(struct xe_hwmon *hwmon, enum xe_hwmon_reg hwmon_reg,

Maybe xe_hwmon_read_write_reg? process_reg sounds bad. Basically we don't
process a register, we read or write it.

> +				enum xe_hwmon_reg_operation operation, u32 *value,
> +				u32 clr, u32 set)
> +{
> +	struct xe_reg reg;
> +
> +	reg.raw = xe_hwmon_get_reg(hwmon, hwmon_reg);
> +
> +	if (!reg.raw)
> +		return -EOPNOTSUPP;
> +
> +	switch (operation) {
> +	case REG_READ:
> +		*value = xe_mmio_read32(hwmon->gt, reg);
> +		return 0;
> +	case REG_WRITE:
> +		xe_mmio_write32(hwmon->gt, reg, *value);
> +		return 0;
> +	case REG_RMW:
> +		*value = xe_mmio_rmw32(hwmon->gt, reg, clr, set);
> +		return 0;
> +	default:
> +		drm_warn(&gt_to_xe(hwmon->gt)->drm, "Invalid xe hwmon reg operation: %d\n",
> +			 operation);
> +		return -EOPNOTSUPP;
> +	}
> +}
> +
> +int xe_hwmon_process_reg_read64(struct xe_hwmon *hwmon, enum xe_hwmon_reg hwmon_reg, u64 *value)
> +{
> +	struct xe_reg reg;
> +
> +	reg.raw = xe_hwmon_get_reg(hwmon, hwmon_reg);
> +
> +	if (!reg.raw)
> +		return -EOPNOTSUPP;
> +
> +	*value = xe_mmio_read64_2x32(hwmon->gt, reg);
> +
> +	return 0;

We can't make read64 part of enum xe_hwmon_reg_operation?


> +}
> +
> +#define PL1_DISABLE 0
> +
> +/*
> + * HW allows arbitrary PL1 limits to be set but silently clamps these values to
> + * "typical but not guaranteed" min/max values in REG_PKG_POWER_SKU. Follow the
> + * same pattern for sysfs, allow arbitrary PL1 limits to be set but display
> + * clamped values when read.
> + */
> +static int xe_hwmon_power_max_read(struct xe_hwmon *hwmon, long *value)
> +{
> +	u32 reg_val;
> +	u64 reg_val64, min, max;
> +
> +	xe_hwmon_process_reg(hwmon, REG_PKG_RAPL_LIMIT, REG_READ, &reg_val, 0, 0);
> +	/* Check if PL1 limit is disabled */
> +	if (!(reg_val & PKG_PWR_LIM_1_EN)) {
> +		*value = PL1_DISABLE;
> +		return 0;
> +	}
> +
> +	reg_val = REG_FIELD_GET(PKG_PWR_LIM_1, reg_val);
> +	*value = mul_u64_u32_shr(reg_val, SF_POWER, hwmon->scl_shift_power);
> +
> +	xe_hwmon_process_reg_read64(hwmon, REG_PKG_POWER_SKU, &reg_val64);
> +	min = REG_FIELD_GET(PKG_MIN_PWR, reg_val64);
> +	min = mul_u64_u32_shr(min, SF_POWER, hwmon->scl_shift_power);
> +	max = REG_FIELD_GET(PKG_MAX_PWR, reg_val64);
> +	max = mul_u64_u32_shr(max, SF_POWER, hwmon->scl_shift_power);
> +
> +	if (min && max)
> +		*value = clamp_t(u64, *value, min, max);

Not exactly correct. Should be:

	if (min)
		clamp at min
	if (max)
		clamp at max

I was thinking of changing it for i915 but was lazy.


> +
> +	return 0;
> +}
> +
> +static int xe_hwmon_power_max_write(struct xe_hwmon *hwmon, long value)
> +{
> +	u32 reg_val;
> +
> +	/* Disable PL1 limit and verify, as limit cannot be disabled on all platforms */
> +	if (value == PL1_DISABLE) {
> +		xe_hwmon_process_reg(hwmon, REG_PKG_RAPL_LIMIT, REG_RMW, &reg_val,
> +				     PKG_PWR_LIM_1_EN, 0);
> +		xe_hwmon_process_reg(hwmon, REG_PKG_RAPL_LIMIT, REG_READ, &reg_val,

If we are not checking for return codes from these functions, why are they
not void?

Also, how about separate read/write/rmw functions as Andi was suggesting?
They would be clearer I think.

Thanks.
--
Ashutosh
Dixit, Ashutosh Sept. 27, 2023, 4:53 a.m. UTC | #3
On Mon, 25 Sep 2023 01:18:38 -0700, Badal Nilawar wrote:
>
> +static umode_t
> +xe_hwmon_is_visible(const void *drvdata, enum hwmon_sensor_types type,
> +		    u32 attr, int channel)
> +{
> +	struct xe_hwmon *hwmon = (struct xe_hwmon *)drvdata;
> +	int ret;
> +
> +	xe_device_mem_access_get(gt_to_xe(hwmon->gt));

Maybe we do xe_device_mem_access_get/put in xe_hwmon_process_reg where it
is needed? E.g. xe_hwmon_is_visible doesn't need to do this because it
doesn't read/write registers.

Also do we need to take forcewake? i915 had forcewake table so it would
take forcewake automatically but XE doesn't do that.

Thanks.
--
Ashutosh
Nilawar, Badal Sept. 27, 2023, 8:39 a.m. UTC | #4
On 27-09-2023 10:23, Dixit, Ashutosh wrote:
> On Mon, 25 Sep 2023 01:18:38 -0700, Badal Nilawar wrote:
>>
>> +static umode_t
>> +xe_hwmon_is_visible(const void *drvdata, enum hwmon_sensor_types type,
>> +		    u32 attr, int channel)
>> +{
>> +	struct xe_hwmon *hwmon = (struct xe_hwmon *)drvdata;
>> +	int ret;
>> +
>> +	xe_device_mem_access_get(gt_to_xe(hwmon->gt));
> 
> Maybe we do xe_device_mem_access_get/put in xe_hwmon_process_reg where it
> is needed? E.g. xe_hwmon_is_visible doesn't need to do this because it
> doesn't read/write registers.
Agreed, but the visible function is called only once while registering the hwmon 
interface, which happens during driver probe. During driver probe the device 
will be in a resumed state. So there is no harm in keeping 
xe_device_mem_access_get/put in the visible function.
> 
> Also do we need to take forcewake? i915 had forcewake table so it would
> take forcewake automatically but XE doesn't do that.
Hwmon regs don't fall under the GT domain, so they don't need forcewake.
> 
> Thanks.
> --
> Ashutosh
Nilawar, Badal Sept. 27, 2023, 10:28 a.m. UTC | #5
Hi Ashutosh,

On 27-09-2023 10:15, Dixit, Ashutosh wrote:
> On Mon, 25 Sep 2023 01:18:38 -0700, Badal Nilawar wrote:
>>
> 
> Hi Badal,
> 
>> +static int xe_hwmon_process_reg(struct xe_hwmon *hwmon, enum xe_hwmon_reg hwmon_reg,
> 
> Maybe xe_hwmon_read_write_reg? process_reg sounds bad. Basically we don't
> process a register, we read or write it.
I don't think it sounds that bad. When we say "process a register", what else 
would we be doing apart from read/write/rmw? I think let's not rename this 
function.
> 
>> +				enum xe_hwmon_reg_operation operation, u32 *value,
>> +				u32 clr, u32 set)
>> +{
>> +	struct xe_reg reg;
>> +
>> +	reg.raw = xe_hwmon_get_reg(hwmon, hwmon_reg);
>> +
>> +	if (!reg.raw)
>> +		return -EOPNOTSUPP;
>> +
>> +	switch (operation) {
>> +	case REG_READ:
>> +		*value = xe_mmio_read32(hwmon->gt, reg);
>> +		return 0;
>> +	case REG_WRITE:
>> +		xe_mmio_write32(hwmon->gt, reg, *value);
>> +		return 0;
>> +	case REG_RMW:
>> +		*value = xe_mmio_rmw32(hwmon->gt, reg, clr, set);
>> +		return 0;
>> +	default:
>> +		drm_warn(&gt_to_xe(hwmon->gt)->drm, "Invalid xe hwmon reg operation: %d\n",
>> +			 operation);
>> +		return -EOPNOTSUPP;
>> +	}
>> +}
>> +
>> +int xe_hwmon_process_reg_read64(struct xe_hwmon *hwmon, enum xe_hwmon_reg hwmon_reg, u64 *value)
>> +{
>> +	struct xe_reg reg;
>> +
>> +	reg.raw = xe_hwmon_get_reg(hwmon, hwmon_reg);
>> +
>> +	if (!reg.raw)
>> +		return -EOPNOTSUPP;
>> +
>> +	*value = xe_mmio_read64_2x32(hwmon->gt, reg);
>> +
>> +	return 0;
> 
> We can't make read64 part of enum xe_hwmon_reg_operation?
read64 takes the argument "u64 *value", so I kept it separate.
> 
> 
>> +}
>> +
>> +#define PL1_DISABLE 0
>> +
>> +/*
>> + * HW allows arbitrary PL1 limits to be set but silently clamps these values to
>> + * "typical but not guaranteed" min/max values in REG_PKG_POWER_SKU. Follow the
>> + * same pattern for sysfs, allow arbitrary PL1 limits to be set but display
>> + * clamped values when read.
>> + */
>> +static int xe_hwmon_power_max_read(struct xe_hwmon *hwmon, long *value)
>> +{
>> +	u32 reg_val;
>> +	u64 reg_val64, min, max;
>> +
>> +	xe_hwmon_process_reg(hwmon, REG_PKG_RAPL_LIMIT, REG_READ, &reg_val, 0, 0);
>> +	/* Check if PL1 limit is disabled */
>> +	if (!(reg_val & PKG_PWR_LIM_1_EN)) {
>> +		*value = PL1_DISABLE;
>> +		return 0;
>> +	}
>> +
>> +	reg_val = REG_FIELD_GET(PKG_PWR_LIM_1, reg_val);
>> +	*value = mul_u64_u32_shr(reg_val, SF_POWER, hwmon->scl_shift_power);
>> +
>> +	xe_hwmon_process_reg_read64(hwmon, REG_PKG_POWER_SKU, &reg_val64);
>> +	min = REG_FIELD_GET(PKG_MIN_PWR, reg_val64);
>> +	min = mul_u64_u32_shr(min, SF_POWER, hwmon->scl_shift_power);
>> +	max = REG_FIELD_GET(PKG_MAX_PWR, reg_val64);
>> +	max = mul_u64_u32_shr(max, SF_POWER, hwmon->scl_shift_power);
>> +
>> +	if (min && max)
>> +		*value = clamp_t(u64, *value, min, max);
> 
> Not exactly correct. Should be:
> 
> 	if (min)
> 		clamp at min
> 	if (max)
> 		clamp at max
> 
> I was thinking of changing it for i915 but was lazy.
Sure, thanks for pointing this.
> 
> 
>> +
>> +	return 0;
>> +}
>> +
>> +static int xe_hwmon_power_max_write(struct xe_hwmon *hwmon, long value)
>> +{
>> +	u32 reg_val;
>> +
>> +	/* Disable PL1 limit and verify, as limit cannot be disabled on all platforms */
>> +	if (value == PL1_DISABLE) {
>> +		xe_hwmon_process_reg(hwmon, REG_PKG_RAPL_LIMIT, REG_RMW, &reg_val,
>> +				     PKG_PWR_LIM_1_EN, 0);
>> +		xe_hwmon_process_reg(hwmon, REG_PKG_RAPL_LIMIT, REG_READ, &reg_val,
> 
> If we are not checking for return codes from these functions, why are they
> not void?
Top level functions expect a return value. The function xe_hwmon_power_max_write 
returns an error if disabling PL1 is not possible. The functions 
xe_hwmon_power_max_read/xe_hwmon_power_rated_max_read can be made void; 
then it will look like the following. What difference is it going to make? I feel 
the existing approach is more readable.

case hwmon_power_max:
          xe_hwmon_power_max_read(hwmon, val);
	 return 0;
case hwmon_power_rated_max:
          xe_hwmon_power_rated_max_read(hwmon, val);
	 return 0;
> 
> Also, how about separate read/write/rmw functions as Andi was suggesting?
> They would be clearer I think.
I would prefer not to add further abstraction; let's keep it as is. Going 
further, while adding new platforms, we will think about adding it.

Regards,
Badal
> 
> Thanks.
> --
> Ashutosh
Dixit, Ashutosh Sept. 28, 2023, 4:54 a.m. UTC | #6
On Wed, 27 Sep 2023 03:28:51 -0700, Nilawar, Badal wrote:
>
> Hi Ashutosh,
>
> On 27-09-2023 10:15, Dixit, Ashutosh wrote:
> > On Mon, 25 Sep 2023 01:18:38 -0700, Badal Nilawar wrote:
> >>
> >
> > Hi Badal,
> >
> >> +static int xe_hwmon_process_reg(struct xe_hwmon *hwmon, enum xe_hwmon_reg hwmon_reg,
> >
> > Maybe xe_hwmon_read_write_reg? process_reg sounds bad. Basically we don't
> > process a register, we read or write it.
> I don't think it sound that bad. When we say process register apart from
> read/write/rmw what else we will be doing. I think lets not rename this
> function.

OK, maybe leave as is (though another option is xe_hwmon_operate_reg since
we already have xe_hwmon_reg_op, or xe_hwmon_rw_reg).

> >
> >> +				enum xe_hwmon_reg_operation operation, u32 *value,
> >> +				u32 clr, u32 set)
> >> +{
> >> +	struct xe_reg reg;
> >> +
> >> +	reg.raw = xe_hwmon_get_reg(hwmon, hwmon_reg);
> >> +
> >> +	if (!reg.raw)
> >> +		return -EOPNOTSUPP;
> >> +
> >> +	switch (operation) {
> >> +	case REG_READ:
> >> +		*value = xe_mmio_read32(hwmon->gt, reg);
> >> +		return 0;
> >> +	case REG_WRITE:
> >> +		xe_mmio_write32(hwmon->gt, reg, *value);
> >> +		return 0;
> >> +	case REG_RMW:
> >> +		*value = xe_mmio_rmw32(hwmon->gt, reg, clr, set);
> >> +		return 0;
> >> +	default:
> >> +		drm_warn(&gt_to_xe(hwmon->gt)->drm, "Invalid xe hwmon reg operation: %d\n",
> >> +			 operation);
> >> +		return -EOPNOTSUPP;
> >> +	}
> >> +}
> >> +
> >> +int xe_hwmon_process_reg_read64(struct xe_hwmon *hwmon, enum xe_hwmon_reg hwmon_reg, u64 *value)
> >> +{
> >> +	struct xe_reg reg;
> >> +
> >> +	reg.raw = xe_hwmon_get_reg(hwmon, hwmon_reg);
> >> +
> >> +	if (!reg.raw)
> >> +		return -EOPNOTSUPP;
> >> +
> >> +	*value = xe_mmio_read64_2x32(hwmon->gt, reg);
> >> +
> >> +	return 0;
> >
> > We can't make read64 part of enum xe_hwmon_reg_operation?
> read64 takes argument "u64 *value" so kept it separate.

OK, makes sense.

> >
> >
> >> +}
> >> +
> >> +#define PL1_DISABLE 0
> >> +
> >> +/*
> >> + * HW allows arbitrary PL1 limits to be set but silently clamps these values to
> >> + * "typical but not guaranteed" min/max values in REG_PKG_POWER_SKU. Follow the
> >> + * same pattern for sysfs, allow arbitrary PL1 limits to be set but display
> >> + * clamped values when read.
> >> + */
> >> +static int xe_hwmon_power_max_read(struct xe_hwmon *hwmon, long *value)
> >> +{
> >> +	u32 reg_val;
> >> +	u64 reg_val64, min, max;
> >> +
> >> +	xe_hwmon_process_reg(hwmon, REG_PKG_RAPL_LIMIT, REG_READ, &reg_val, 0, 0);
> >> +	/* Check if PL1 limit is disabled */
> >> +	if (!(reg_val & PKG_PWR_LIM_1_EN)) {
> >> +		*value = PL1_DISABLE;
> >> +		return 0;
> >> +	}
> >> +
> >> +	reg_val = REG_FIELD_GET(PKG_PWR_LIM_1, reg_val);
> >> +	*value = mul_u64_u32_shr(reg_val, SF_POWER, hwmon->scl_shift_power);
> >> +
> >> +	xe_hwmon_process_reg_read64(hwmon, REG_PKG_POWER_SKU, &reg_val64);
> >> +	min = REG_FIELD_GET(PKG_MIN_PWR, reg_val64);
> >> +	min = mul_u64_u32_shr(min, SF_POWER, hwmon->scl_shift_power);
> >> +	max = REG_FIELD_GET(PKG_MAX_PWR, reg_val64);
> >> +	max = mul_u64_u32_shr(max, SF_POWER, hwmon->scl_shift_power);
> >> +
> >> +	if (min && max)
> >> +		*value = clamp_t(u64, *value, min, max);
> >
> > Not exactly correct. Should be:
> >
> >	if (min)
> >		clamp at min
> >	if (max)
> >		clamp at max
> >
> > I was thinking of changing it for i915 but was lazy.
> Sure, thanks for pointing this.
> >
> >
> >> +
> >> +	return 0;
> >> +}
> >> +
> >> +static int xe_hwmon_power_max_write(struct xe_hwmon *hwmon, long value)
> >> +{
> >> +	u32 reg_val;
> >> +
> >> +	/* Disable PL1 limit and verify, as limit cannot be disabled on all platforms */
> >> +	if (value == PL1_DISABLE) {
> >> +		xe_hwmon_process_reg(hwmon, REG_PKG_RAPL_LIMIT, REG_RMW, &reg_val,
> >> +				     PKG_PWR_LIM_1_EN, 0);
> >> +		xe_hwmon_process_reg(hwmon, REG_PKG_RAPL_LIMIT, REG_READ, &reg_val,
> >
> > If we are not checking for return codes from these functions, why are they
> > not void?
> Top level functions expect return. For function xe_hwmon_power_max_write
> returning error if PL1 disable not possible. The functions
> xe_hwmon_power_max_read/xe_hwmon_power_rated_max_read can be made void,
> then it will look like. What difference its going to make? I feel existing
> approach is much readable.

As I have pointed out in the other mail it is not. It raises more questions
about why the return code is not being checked, whether the function can
return an error. So it is better to be crisp as to what can actually happen.

>
> case hwmon_power_max:
>          xe_hwmon_power_max_read(hwmon, val);
>	 return 0;
> case hwmon_power_rated_max:
>          xe_hwmon_power_rated_max_read(hwmon, val);
>	 return 0;

This is fine.

> >
> > Also, how about separate read/write/rmw functions as Andi was suggesting?
> > They would be clearer I think.
> Would not prefer to add further abstraction, lets keep as is. Going further
> while adding new platforms will think about adding it.

OK, no need to add wrappers.

Thanks.
--
Ashutosh
Dixit, Ashutosh Sept. 28, 2023, 4:55 a.m. UTC | #7
On Wed, 27 Sep 2023 01:39:46 -0700, Nilawar, Badal wrote:
>

Hi Badal,

> On 27-09-2023 10:23, Dixit, Ashutosh wrote:
> > On Mon, 25 Sep 2023 01:18:38 -0700, Badal Nilawar wrote:
> >>
> >> +static umode_t
> >> +xe_hwmon_is_visible(const void *drvdata, enum hwmon_sensor_types type,
> >> +		    u32 attr, int channel)
> >> +{
> >> +	struct xe_hwmon *hwmon = (struct xe_hwmon *)drvdata;
> >> +	int ret;
> >> +
> >> +	xe_device_mem_access_get(gt_to_xe(hwmon->gt));
> >
> > Maybe we do xe_device_mem_access_get/put in xe_hwmon_process_reg where it
> > is needed? E.g. xe_hwmon_is_visible doesn't need to do this because it
> > doesn't read/write registers.
> Agreed, but visible function is called only once while registering hwmon
> interface, which happen during driver probe. During driver probe device
> will be in resumed state. So no harm in keeping
> xe_device_mem_access_get/put in visible function.

To me it doesn't make any sense to keep xe_device_mem_access_get/put
anywhere except in xe_hwmon_process_reg where the HW access actually
happens. We can eliminate xe_device_mem_access_get/put's all over the place
if we do it. Isn't it?

The only restriction I have heard of (though not sure why) is that
xe_device_mem_access_get/put should not be called under lock. Though I am
not sure it is for spinlock or also mutex. So as we were saying the locking
will also need to move to xe_hwmon_process_reg.

So:

xe_hwmon_process_reg()
{
	xe_device_mem_access_get
	mutex_lock
	...
	mutex_unlock
	xe_device_mem_access_put
}

So once again if this is not possible for some reason let's figure out why.

> >
> > Also do we need to take forcewake? i915 had forcewake table so it would
> > take forcewake automatically but XE doesn't do that.
> Hwmon regs doesn't fall under GT domain so doesn't need forcewake.

OK, great.

Thanks.
--
Ashutosh
Dixit, Ashutosh Sept. 28, 2023, 4:55 a.m. UTC | #8
On Mon, 25 Sep 2023 01:18:38 -0700, Badal Nilawar wrote:
>

Hi Badal,

Here's how I think this we should change this patch.

> diff --git a/drivers/gpu/drm/xe/xe_hwmon.c b/drivers/gpu/drm/xe/xe_hwmon.c
> new file mode 100644
> index 000000000000..44d814e111c6
> --- /dev/null
> +++ b/drivers/gpu/drm/xe/xe_hwmon.c
> @@ -0,0 +1,357 @@
> +// SPDX-License-Identifier: MIT
> +/*
> + * Copyright © 2023 Intel Corporation
> + */
> +
> +#include <linux/hwmon.h>
> +
> +#include <drm/drm_managed.h>
> +#include "regs/xe_gt_regs.h"
> +#include "regs/xe_mchbar_regs.h"
> +#include "xe_device.h"
> +#include "xe_gt.h"
> +#include "xe_hwmon.h"
> +#include "xe_mmio.h"
> +
> +enum xe_hwmon_reg {
> +	REG_PKG_RAPL_LIMIT,
> +	REG_PKG_POWER_SKU,
> +	REG_PKG_POWER_SKU_UNIT,
> +};
> +
> +enum xe_hwmon_reg_operation {

enum xe_hwmon_reg_op

> +	REG_READ,
> +	REG_WRITE,
> +	REG_RMW,
> +};
> +
> +/*
> + * SF_* - scale factors for particular quantities according to hwmon spec.
> + */
> +#define SF_POWER	1000000		/* microwatts */
> +
> +struct xe_hwmon {
> +	struct device *hwmon_dev;
> +	struct xe_gt *gt;
> +	struct mutex hwmon_lock; /* rmw operations*/
> +	int scl_shift_power;
> +};
> +
> +static u32 xe_hwmon_get_reg(struct xe_hwmon *hwmon, enum xe_hwmon_reg hwmon_reg)

Return 'struct xe_reg' from here. Caller does .raw if needed, so caller can
do xe_hwmon_get_reg(...).raw to check when needed.

So basically this function can return a NULL register if say a particular
register does not exist on a platorm.

Also this is the function which should be called from the is_visible()
functions (as has already been done) (so if this function returns NULL the
sysfs entries will not be visible). This allows for other functions to be
void.

> +{
> +	struct xe_device *xe = gt_to_xe(hwmon->gt);
> +	struct xe_reg reg = XE_REG(0);
> +
> +	switch (hwmon_reg) {
> +	case REG_PKG_RAPL_LIMIT:
> +		if (xe->info.platform == XE_DG2)
> +			reg = PCU_CR_PACKAGE_RAPL_LIMIT;
> +		else if (xe->info.platform == XE_PVC)
> +			reg = PVC_GT0_PACKAGE_RAPL_LIMIT;
> +		break;
> +	case REG_PKG_POWER_SKU:
> +		if (xe->info.platform == XE_DG2)
> +			reg = PCU_CR_PACKAGE_POWER_SKU;
> +		else if (xe->info.platform == XE_PVC)
> +			reg = PVC_GT0_PACKAGE_POWER_SKU;
> +		break;
> +	case REG_PKG_POWER_SKU_UNIT:
> +		if (xe->info.platform == XE_DG2)
> +			reg = PCU_CR_PACKAGE_POWER_SKU_UNIT;
> +		else if (xe->info.platform == XE_PVC)
> +			reg = PVC_GT0_PACKAGE_POWER_SKU_UNIT;
> +		break;
> +	default:
> +		drm_warn(&xe->drm, "Unknown xe hwmon reg id: %d\n", hwmon_reg);
> +		break;
> +	}
> +
> +	return reg.raw;
> +}
> +
> +static int xe_hwmon_process_reg(struct xe_hwmon *hwmon, enum xe_hwmon_reg hwmon_reg,

Should be void. As described above this should never get called if a
particular register does not exist because the sysfs entries will not be
visible.

> +				enum xe_hwmon_reg_operation operation, u32 *value,
> +				u32 clr, u32 set)

I would also make the 'op' the second argument, so it is a little bit
easier to see if we are reading or writing (as I said elsewhere we can skip
adding read/write wrappers).

> +{
> +	struct xe_reg reg;
> +
> +	reg.raw = xe_hwmon_get_reg(hwmon, hwmon_reg);
> +
> +	if (!reg.raw)
> +		return -EOPNOTSUPP;

If register doesn't exist is a WARN_ON.

> +
> +	switch (operation) {
> +	case REG_READ:
> +		*value = xe_mmio_read32(hwmon->gt, reg);
> +		return 0;
> +	case REG_WRITE:
> +		xe_mmio_write32(hwmon->gt, reg, *value);
> +		return 0;
> +	case REG_RMW:
> +		*value = xe_mmio_rmw32(hwmon->gt, reg, clr, set);
> +		return 0;
> +	default:
> +		drm_warn(&gt_to_xe(hwmon->gt)->drm, "Invalid xe hwmon reg operation: %d\n",
> +			 operation);
> +		return -EOPNOTSUPP;
> +	}
> +}
> +
> +int xe_hwmon_process_reg_read64(struct xe_hwmon *hwmon, enum xe_hwmon_reg hwmon_reg, u64 *value)

Why not just xe_hwmon_reg_read64?

> +{
> +	struct xe_reg reg;
> +
> +	reg.raw = xe_hwmon_get_reg(hwmon, hwmon_reg);
> +
> +	if (!reg.raw)
> +		return -EOPNOTSUPP;
> +
> +	*value = xe_mmio_read64_2x32(hwmon->gt, reg);
> +
> +	return 0;

Again should be void, for the same reason as xe_hwmon_process_reg.

> +}
> +
> +#define PL1_DISABLE 0
> +
> +/*
> + * HW allows arbitrary PL1 limits to be set but silently clamps these values to
> + * "typical but not guaranteed" min/max values in REG_PKG_POWER_SKU. Follow the
> + * same pattern for sysfs, allow arbitrary PL1 limits to be set but display
> + * clamped values when read.
> + */
> +static int xe_hwmon_power_max_read(struct xe_hwmon *hwmon, long *value)
> +{
> +	u32 reg_val;
> +	u64 reg_val64, min, max;
> +
> +	xe_hwmon_process_reg(hwmon, REG_PKG_RAPL_LIMIT, REG_READ, &reg_val, 0, 0);
> +	/* Check if PL1 limit is disabled */
> +	if (!(reg_val & PKG_PWR_LIM_1_EN)) {
> +		*value = PL1_DISABLE;
> +		return 0;
> +	}
> +
> +	reg_val = REG_FIELD_GET(PKG_PWR_LIM_1, reg_val);
> +	*value = mul_u64_u32_shr(reg_val, SF_POWER, hwmon->scl_shift_power);
> +
> +	xe_hwmon_process_reg_read64(hwmon, REG_PKG_POWER_SKU, &reg_val64);
> +	min = REG_FIELD_GET(PKG_MIN_PWR, reg_val64);
> +	min = mul_u64_u32_shr(min, SF_POWER, hwmon->scl_shift_power);
> +	max = REG_FIELD_GET(PKG_MAX_PWR, reg_val64);
> +	max = mul_u64_u32_shr(max, SF_POWER, hwmon->scl_shift_power);
> +
> +	if (min && max)
> +		*value = clamp_t(u64, *value, min, max);
> +
> +	return 0;
> +}

Should be void.

> +
> +static int xe_hwmon_power_max_write(struct xe_hwmon *hwmon, long value)
> +{
> +	u32 reg_val;
> +
> +	/* Disable PL1 limit and verify, as limit cannot be disabled on all platforms */
> +	if (value == PL1_DISABLE) {
> +		xe_hwmon_process_reg(hwmon, REG_PKG_RAPL_LIMIT, REG_RMW, &reg_val,
> +				     PKG_PWR_LIM_1_EN, 0);
> +		xe_hwmon_process_reg(hwmon, REG_PKG_RAPL_LIMIT, REG_READ, &reg_val,
> +				     PKG_PWR_LIM_1_EN, 0);
> +
> +		if (reg_val & PKG_PWR_LIM_1_EN)
> +			return -EOPNOTSUPP;

This function cannot be void since we return an error here, so it's fine.

> +	}
> +
> +	/* Computation in 64-bits to avoid overflow. Round to nearest. */
> +	reg_val = DIV_ROUND_CLOSEST_ULL((u64)value << hwmon->scl_shift_power, SF_POWER);
> +	reg_val = PKG_PWR_LIM_1_EN | REG_FIELD_PREP(PKG_PWR_LIM_1, reg_val);
> +
> +	xe_hwmon_process_reg(hwmon, REG_PKG_RAPL_LIMIT, REG_RMW, &reg_val,
> +			     PKG_PWR_LIM_1_EN | PKG_PWR_LIM_1, reg_val);
> +
> +	return 0;
> +}
> +
> +static int xe_hwmon_power_rated_max_read(struct xe_hwmon *hwmon, long *value)
> +{
> +	u32 reg_val;
> +
> +	xe_hwmon_process_reg(hwmon, REG_PKG_POWER_SKU, REG_READ, &reg_val, 0, 0);
> +	reg_val = REG_FIELD_GET(PKG_TDP, reg_val);
> +	*value = mul_u64_u32_shr(reg_val, SF_POWER, hwmon->scl_shift_power);
> +
> +	return 0;
> +}

Should be void.

> +
> +static const struct hwmon_channel_info *hwmon_info[] = {
> +	HWMON_CHANNEL_INFO(power, HWMON_P_MAX | HWMON_P_RATED_MAX),
> +	NULL
> +};
> +
> +static umode_t
> +xe_hwmon_power_is_visible(struct xe_hwmon *hwmon, u32 attr, int chan)
> +{
> +	switch (attr) {
> +	case hwmon_power_max:
> +		return xe_hwmon_get_reg(hwmon, REG_PKG_RAPL_LIMIT) ? 0664 : 0;
> +	case hwmon_power_rated_max:
> +		return xe_hwmon_get_reg(hwmon, REG_PKG_POWER_SKU) ? 0444 : 0;
> +	default:
> +		return 0;
> +	}
> +}

This is fine.

> +
> +static int
> +xe_hwmon_power_read(struct xe_hwmon *hwmon, u32 attr, int chan, long *val)
> +{
> +	switch (attr) {
> +	case hwmon_power_max:
> +		return xe_hwmon_power_max_read(hwmon, val);
> +	case hwmon_power_rated_max:
> +		return xe_hwmon_power_rated_max_read(hwmon, val);
> +	default:
> +		return -EOPNOTSUPP;
> +	}
> +}

Fine, just take care of void returns.

> +
> +static int
> +xe_hwmon_power_write(struct xe_hwmon *hwmon, u32 attr, int chan, long val)
> +{
> +	switch (attr) {
> +	case hwmon_power_max:
> +		return xe_hwmon_power_max_write(hwmon, val);
> +	default:
> +		return -EOPNOTSUPP;
> +	}
> +}
> +
> +static umode_t
> +xe_hwmon_is_visible(const void *drvdata, enum hwmon_sensor_types type,
> +		    u32 attr, int channel)
> +{
> +	struct xe_hwmon *hwmon = (struct xe_hwmon *)drvdata;
> +	int ret;
> +
> +	xe_device_mem_access_get(gt_to_xe(hwmon->gt));

Let's move xe_device_mem_access_get() to xe_hwmon_process_reg().

> +
> +	switch (type) {
> +	case hwmon_power:
> +		ret = xe_hwmon_power_is_visible(hwmon, attr, channel);
> +		break;
> +	default:
> +		ret = 0;
> +		break;

return 0;

> +	}
> +
> +	xe_device_mem_access_put(gt_to_xe(hwmon->gt));
> +
> +	return ret;
> +}
> +
> +static int
> +xe_hwmon_read(struct device *dev, enum hwmon_sensor_types type, u32 attr,
> +	      int channel, long *val)
> +{
> +	struct xe_hwmon *hwmon = dev_get_drvdata(dev);
> +	int ret;
> +
> +	xe_device_mem_access_get(gt_to_xe(hwmon->gt));

Move xe_device_mem_access_get() to xe_hwmon_process_reg().

> +
> +	switch (type) {
> +	case hwmon_power:
> +		ret = xe_hwmon_power_read(hwmon, attr, channel, val);
> +		break;
> +	default:
> +		ret = -EOPNOTSUPP;
> +		break;

return -EOPNOTSUPP;


> +	}
> +
> +	xe_device_mem_access_put(gt_to_xe(hwmon->gt));
> +
> +	return ret;
> +}
> +
> +static int
> +xe_hwmon_write(struct device *dev, enum hwmon_sensor_types type, u32 attr,
> +	       int channel, long val)
> +{
> +	struct xe_hwmon *hwmon = dev_get_drvdata(dev);
> +	int ret;
> +
> +	xe_device_mem_access_get(gt_to_xe(hwmon->gt));

Move xe_device_mem_access_get() to xe_hwmon_process_reg().

> +
> +	switch (type) {
> +	case hwmon_power:
> +		ret = xe_hwmon_power_write(hwmon, attr, channel, val);
> +		break;
> +	default:
> +		ret = -EOPNOTSUPP;
> +		break;

return -EOPNOTSUPP;

> +	}
> +
> +	xe_device_mem_access_put(gt_to_xe(hwmon->gt));
> +
> +	return ret;
> +}
> +
> +static const struct hwmon_ops hwmon_ops = {
> +	.is_visible = xe_hwmon_is_visible,
> +	.read = xe_hwmon_read,
> +	.write = xe_hwmon_write,
> +};
> +
> +static const struct hwmon_chip_info hwmon_chip_info = {
> +	.ops = &hwmon_ops,
> +	.info = hwmon_info,
> +};
> +
> +static void
> +xe_hwmon_get_preregistration_info(struct xe_device *xe)
> +{
> +	struct xe_hwmon *hwmon = xe->hwmon;
> +	u32 val_sku_unit = 0;
> +	int ret;
> +
> +	ret = xe_hwmon_process_reg(hwmon, REG_PKG_POWER_SKU_UNIT, REG_READ, &val_sku_unit, 0, 0);
> +	/*
> +	 * The contents of register PKG_POWER_SKU_UNIT do not change,
> +	 * so read it once and store the shift values.
> +	 */
> +	if (!ret)
> +		hwmon->scl_shift_power = REG_FIELD_GET(PKG_PWR_UNIT, val_sku_unit);

if xe_hwmon_is_visible(... hwmon_power ...) {
	xe_hwmon_process_reg();
	hwmon->scl_shift_power = REG_FIELD_GET(PKG_PWR_UNIT, val_sku_unit);
}

Please let me know if any of this is not possible. I will look at the other
patches after you respond, though it looks like they will also need almost
identical changes.

Thanks.
--
Ashutosh
Nilawar, Badal Sept. 29, 2023, 6:37 a.m. UTC | #9
On 28-09-2023 10:25, Dixit, Ashutosh wrote:
> On Wed, 27 Sep 2023 01:39:46 -0700, Nilawar, Badal wrote:
>>
> 
> Hi Badal,
> 
>> On 27-09-2023 10:23, Dixit, Ashutosh wrote:
>>> On Mon, 25 Sep 2023 01:18:38 -0700, Badal Nilawar wrote:
>>>>
>>>> +static umode_t
>>>> +xe_hwmon_is_visible(const void *drvdata, enum hwmon_sensor_types type,
>>>> +		    u32 attr, int channel)
>>>> +{
>>>> +	struct xe_hwmon *hwmon = (struct xe_hwmon *)drvdata;
>>>> +	int ret;
>>>> +
>>>> +	xe_device_mem_access_get(gt_to_xe(hwmon->gt));
>>>
>>> Maybe we do xe_device_mem_access_get/put in xe_hwmon_process_reg where it
>>> is needed? E.g. xe_hwmon_is_visible doesn't need to do this because it
>>> doesn't read/write registers.
>> Agreed, but visible function is called only once while registering hwmon
>> interface, which happen during driver probe. During driver probe device
>> will be in resumed state. So no harm in keeping
>> xe_device_mem_access_get/put in visible function.
> 
> To me it doesn't make any sense to keep xe_device_mem_access_get/put
> anywhere except in xe_hwmon_process_reg where the HW access actually
> happens. We can eliminate xe_device_mem_access_get/put's all over the place
> if we do it. Isn't it?
Agreed, the thought process here suggests taking the rpm wakeref at the lowest 
possible level. I already tried this in the rfc series and to some extent in 
rev2. There is a problem with this approach. See my comments below.
> 
> The only restriction I have heard of (though not sure why) is that
> xe_device_mem_access_get/put should not be called under lock. Though I am
> not sure it is for spinlock or also mutex. So as we were saying the locking
> will also need to move to xe_hwmon_process_reg.
Yes, from the rev2 comments it's dangerous to take the mutex before 
xe_device_mem_access_get/put. With the code for "PL1 disable/restore during 
resume" I saw a deadlock. The scenario was: power1_max write -> mutex lock -> 
rpm resume -> disable pl1 -> mutex lock (deadlock here).
> 
> So:
> 
> xe_hwmon_process_reg()
> {
> 	xe_device_mem_access_get
> 	mutex_lock
> 	...
> 	mutex_unlock
> 	xe_device_mem_access_put
> }
> 
> So once again if this is not possible for some reason let's figure out why.
There are two problems with this approach.

Problem 1: If you look at the implementation of xe_hwmon_power_max_write, the reg 
access happens 3 times, so there will be 3 rpm suspend/resume 
cycles. I was observing the same with the rfc implementation. So in the 
subsequent series xe_device_mem_access_put/get was moved to the top level 
functions, i.e. the hwmon hooks.

Problem 2: If locking is moved inside xe_hwmon_process_reg, then between two 
subsequent reg accesses it will open a small window during which a race can 
happen.
As Anshuman suggested in the other thread, reads are sequential and 
protected by the sysfs layer. So let's apply locking only for RW attributes.


+static int xe_hwmon_power_max_write(struct xe_hwmon *hwmon, long value)
+{
+	u32 reg_val;
+
+	/* Disable PL1 limit and verify, as limit cannot be disabled on all 
platforms */
+	if (value == PL1_DISABLE) {
+		xe_hwmon_process_reg(hwmon, REG_PKG_RAPL_LIMIT, REG_RMW, &reg_val,
+				     PKG_PWR_LIM_1_EN, 0);
+		xe_hwmon_process_reg(hwmon, REG_PKG_RAPL_LIMIT, REG_READ, &reg_val,
+				     PKG_PWR_LIM_1_EN, 0);
+
+		if (reg_val & PKG_PWR_LIM_1_EN)
+			return -EOPNOTSUPP;
+	}
+
+	/* Computation in 64-bits to avoid overflow. Round to nearest. */
+	reg_val = DIV_ROUND_CLOSEST_ULL((u64)value << hwmon->scl_shift_power, 
SF_POWER);
+	reg_val = PKG_PWR_LIM_1_EN | REG_FIELD_PREP(PKG_PWR_LIM_1, reg_val);
+
+	xe_hwmon_process_reg(hwmon, REG_PKG_RAPL_LIMIT, REG_RMW, &reg_val,
+			     PKG_PWR_LIM_1_EN | PKG_PWR_LIM_1, reg_val);
+
+	return 0;
+}

Regards,
Badal
> 
>>>
>>> Also do we need to take forcewake? i915 had forcewake table so it would
>>> take forcewake automatically but XE doesn't do that.
>> Hwmon regs doesn't fall under GT domain so doesn't need forcewake.
> 
> OK, great.
> 
> Thanks.
> --
> Ashutosh
Dixit, Ashutosh Sept. 29, 2023, 4:48 p.m. UTC | #10
On Thu, 28 Sep 2023 23:37:35 -0700, Nilawar, Badal wrote:
>

Hi Badal,

> On 28-09-2023 10:25, Dixit, Ashutosh wrote:
> > On Wed, 27 Sep 2023 01:39:46 -0700, Nilawar, Badal wrote:
> >
> >> On 27-09-2023 10:23, Dixit, Ashutosh wrote:
> >>> On Mon, 25 Sep 2023 01:18:38 -0700, Badal Nilawar wrote:
> >>>>
> >>>> +static umode_t
> >>>> +xe_hwmon_is_visible(const void *drvdata, enum hwmon_sensor_types type,
> >>>> +		    u32 attr, int channel)
> >>>> +{
> >>>> +	struct xe_hwmon *hwmon = (struct xe_hwmon *)drvdata;
> >>>> +	int ret;
> >>>> +
> >>>> +	xe_device_mem_access_get(gt_to_xe(hwmon->gt));
> >>>
> >>> Maybe we do xe_device_mem_access_get/put in xe_hwmon_process_reg where it
> >>> is needed? E.g. xe_hwmon_is_visible doesn't need to do this because it
> >>> doesn't read/write registers.
> >> Agreed, but visible function is called only once while registering hwmon
> >> interface, which happen during driver probe. During driver probe device
> >> will be in resumed state. So no harm in keeping
> >> xe_device_mem_access_get/put in visible function.
> >
> > To me it doesn't make any sense to keep xe_device_mem_access_get/put
> > anywhere except in xe_hwmon_process_reg where the HW access actually
> > happens. We can eliminate xe_device_mem_access_get/put's all over the place
> > if we do it. Isn't it?
> Agreed, thought process here suggest that take rpm wakeref at lowest
> possible level. I already tried this in rfc series and in some extent in
> rev2. There is problem with this approach. See my comments below.
> >
> > The only restriction I have heard of (though not sure why) is that
> > xe_device_mem_access_get/put should not be called under lock. Though I am
> > not sure it is for spinlock or also mutex. So as we were saying the locking
> > will also need to move to xe_hwmon_process_reg.
> Yes from rev2 comments its dangerous to take mutex before
> xe_device_mem_access_get/put. With code for "PL1 disable/restore during
> resume" I saw deadlock. Scenario was power1_max write -> mutex lock -> rpm
> resume -> disable pl1 -> mutex lock (dead lock here).

But this is already the wrong order as mentioned below. If we follow the
below order do we still see deadlock?

> >
> > So:
> >
> > xe_hwmon_process_reg()
> > {
> >	xe_device_mem_access_get
> >	mutex_lock
> >	...
> >	mutex_unlock
> >	xe_device_mem_access_put
> > }
> >
> > So once again if this is not possible for some reason let's figure out why.
> There are two problems with this approach.
>
> Problem 1: If you see implementation of xe_hwmon_power_max_write, reg
> access is happening 3 times, so there will be 3 rpm suspend/resume
> cycles. I was observing the same with rfc implementation. So in subsequent
> series xe_device_mem_access_put/get is moved to top level functions
> i.e. hwmon hooks.

This is not exactly correct because there is also a 1 second autosuspend
delay which will prevent such rpm suspend/resume cycles:

xe_pm_runtime_init:
	pm_runtime_set_autosuspend_delay(dev, 1000);


>
> Problem 2: If locking moved inside xe_hwmon_process_reg then between two
> subsequent reg accesses it will open small window during which race can
> happen.
> As Anshuman suggested in other thread for read are sequential and protected
> by sysfs layer. So lets apply locking only for RW attributes.

But what is the locking trying to protect? As far as I understand it is
just the registers which have to be atomically modified/read. So it seems
sufficient to just protect the register accesses with the lock.

So I am still not convinced.

Thanks.
--
Ashutosh


>
> +static int xe_hwmon_power_max_write(struct xe_hwmon *hwmon, long value)
> +{
> +	u32 reg_val;
> +
> +	/* Disable PL1 limit and verify, as limit cannot be disabled on all
> platforms */
> +	if (value == PL1_DISABLE) {
> +		xe_hwmon_process_reg(hwmon, REG_PKG_RAPL_LIMIT, REG_RMW, &reg_val,
> +				     PKG_PWR_LIM_1_EN, 0);
> +		xe_hwmon_process_reg(hwmon, REG_PKG_RAPL_LIMIT, REG_READ, &reg_val,
> +				     PKG_PWR_LIM_1_EN, 0);
> +
> +		if (reg_val & PKG_PWR_LIM_1_EN)
> +			return -EOPNOTSUPP;
> +	}
> +
> +	/* Computation in 64-bits to avoid overflow. Round to nearest. */
> +	reg_val = DIV_ROUND_CLOSEST_ULL((u64)value <<
> hwmon->scl_shift_power, SF_POWER);
> +	reg_val = PKG_PWR_LIM_1_EN | REG_FIELD_PREP(PKG_PWR_LIM_1, reg_val);
> +
> +	xe_hwmon_process_reg(hwmon, REG_PKG_RAPL_LIMIT, REG_RMW, &reg_val,
> +			     PKG_PWR_LIM_1_EN | PKG_PWR_LIM_1, reg_val);
> +
> +	return 0;
> +}
>
> Regards,
> Badal
> >
> >>>
> >>> Also do we need to take forcewake? i915 had forcewake table so it would
> >>> take forcewake automatically but XE doesn't do that.
> >> Hwmon regs doesn't fall under GT domain so doesn't need forcewake.
> >
> > OK, great.
> >
> > Thanks.
> > --
> > Ashutosh
Dixit, Ashutosh Sept. 29, 2023, 9:41 p.m. UTC | #11
On Fri, 29 Sep 2023 09:48:36 -0700, Dixit, Ashutosh wrote:
>

Hi Badal,

> On Thu, 28 Sep 2023 23:37:35 -0700, Nilawar, Badal wrote:
> >
> > On 28-09-2023 10:25, Dixit, Ashutosh wrote:
> > > On Wed, 27 Sep 2023 01:39:46 -0700, Nilawar, Badal wrote:
> > >
> > >> On 27-09-2023 10:23, Dixit, Ashutosh wrote:
> > >>> On Mon, 25 Sep 2023 01:18:38 -0700, Badal Nilawar wrote:
> > >>>>
> > >>>> +static umode_t
> > >>>> +xe_hwmon_is_visible(const void *drvdata, enum hwmon_sensor_types type,
> > >>>> +		    u32 attr, int channel)
> > >>>> +{
> > >>>> +	struct xe_hwmon *hwmon = (struct xe_hwmon *)drvdata;
> > >>>> +	int ret;
> > >>>> +
> > >>>> +	xe_device_mem_access_get(gt_to_xe(hwmon->gt));
> > >>>
> > >>> Maybe we do xe_device_mem_access_get/put in xe_hwmon_process_reg where it
> > >>> is needed? E.g. xe_hwmon_is_visible doesn't need to do this because it
> > >>> doesn't read/write registers.
> > >> Agreed, but visible function is called only once while registering hwmon
> > >> interface, which happen during driver probe. During driver probe device
> > >> will be in resumed state. So no harm in keeping
> > >> xe_device_mem_access_get/put in visible function.
> > >
> > > To me it doesn't make any sense to keep xe_device_mem_access_get/put
> > > anywhere except in xe_hwmon_process_reg where the HW access actually
> > > happens. We can eliminate xe_device_mem_access_get/put's all over the place
> > > if we do it. Isn't it?
> > Agreed, thought process here suggest that take rpm wakeref at lowest
> > possible level. I already tried this in rfc series and in some extent in
> > rev2. There is problem with this approach. See my comments below.
> > >
> > > The only restriction I have heard of (though not sure why) is that
> > > xe_device_mem_access_get/put should not be called under lock. Though I am
> > > not sure it is for spinlock or also mutex. So as we were saying the locking
> > > will also need to move to xe_hwmon_process_reg.
> > Yes from rev2 comments its dangerous to take mutex before
> > xe_device_mem_access_get/put. With code for "PL1 disable/restore during
> > resume" I saw deadlock. Scenario was power1_max write -> mutex lock -> rpm
> > resume -> disable pl1 -> mutex lock (dead lock here).
>
> But this is already the wrong order as mentioned below. If we follow the
> below order do we still see deadlock?
>
> > >
> > > So:
> > >
> > > xe_hwmon_process_reg()
> > > {
> > >	xe_device_mem_access_get
> > >	mutex_lock
> > >	...
> > >	mutex_unlock
> > >	xe_device_mem_access_put
> > > }
> > >
> > > So once again if this is not possible for some reason let's figure out why.
> > There are two problems with this approach.
> >
> > Problem 1: If you see implementation of xe_hwmon_power_max_write, reg
> > access is happening 3 times, so there will be 3 rpm suspend/resume
> > cycles. I was observing the same with rfc implementation. So in subsequent
> > series xe_device_mem_access_put/get is moved to top level functions
> > i.e. hwmon hooks.
>
> This is not exactly correct because there is also a 1 second autosuspend
> delay which will prevent such rpm suspend/resume cycles:
>
> xe_pm_runtime_init:
>	pm_runtime_set_autosuspend_delay(dev, 1000);
>
>
> >
> > Problem 2: If locking moved inside xe_hwmon_process_reg then between two
> > subsequent reg accesses it will open small window during which race can
> > happen.
> > As Anshuman suggested in other thread for read are sequential and protected
> > by sysfs layer. So lets apply locking only for RW attributes.
>
> But what is the locking trying to protect? As far as I understand it is
> just the registers which have to be atomically modified/read. So it seems
> sufficient to just protect the register accesses with the lock.
>
> So I am still not convinced.

Let's figure out the locking first depending on what needs to be protected
(just registers or other data too). And then we can see where to put the
xe_device_mem_access_get/put's (following the rule that
xe_device_mem_access_get/put's should not be called under lock).

Thanks.
--
Ashutosh
Dixit, Ashutosh Oct. 4, 2023, 12:52 a.m. UTC | #12
On Fri, 29 Sep 2023 14:41:22 -0700, Dixit, Ashutosh wrote:
>

Hi Badal,

Why did you merge the hwmon patches when there is still open discussion
below on the patches? According to upstream rules (I'm not sure if you know
about this) you should not merge patches, even if you have R-b's on the
patches, till all review comments are resolved.

Generally you are expected to either address the comments or reply to the
comments, or at least inform that you are merging, disregarding the
comments. IMO you should at least have done one of these before merging.

Cc: @Vivi, Rodrigo

Thanks.
--
Ashutosh


> On Fri, 29 Sep 2023 09:48:36 -0700, Dixit, Ashutosh wrote:
> > On Thu, 28 Sep 2023 23:37:35 -0700, Nilawar, Badal wrote:
> > >
> > > On 28-09-2023 10:25, Dixit, Ashutosh wrote:
> > > > On Wed, 27 Sep 2023 01:39:46 -0700, Nilawar, Badal wrote:
> > > >
> > > >> On 27-09-2023 10:23, Dixit, Ashutosh wrote:
> > > >>> On Mon, 25 Sep 2023 01:18:38 -0700, Badal Nilawar wrote:
> > > >>>>
> > > >>>> +static umode_t
> > > >>>> +xe_hwmon_is_visible(const void *drvdata, enum hwmon_sensor_types type,
> > > >>>> +		    u32 attr, int channel)
> > > >>>> +{
> > > >>>> +	struct xe_hwmon *hwmon = (struct xe_hwmon *)drvdata;
> > > >>>> +	int ret;
> > > >>>> +
> > > >>>> +	xe_device_mem_access_get(gt_to_xe(hwmon->gt));
> > > >>>
> > > >>> Maybe we do xe_device_mem_access_get/put in xe_hwmon_process_reg where it
> > > >>> is needed? E.g. xe_hwmon_is_visible doesn't need to do this because it
> > > >>> doesn't read/write registers.
> > > >> Agreed, but visible function is called only once while registering hwmon
> > > >> interface, which happen during driver probe. During driver probe device
> > > >> will be in resumed state. So no harm in keeping
> > > >> xe_device_mem_access_get/put in visible function.
> > > >
> > > > To me it doesn't make any sense to keep xe_device_mem_access_get/put
> > > > anywhere except in xe_hwmon_process_reg where the HW access actually
> > > > happens. We can eliminate xe_device_mem_access_get/put's all over the place
> > > > if we do it. Isn't it?
> > > Agreed, thought process here suggest that take rpm wakeref at lowest
> > > possible level. I already tried this in rfc series and in some extent in
> > > rev2. There is problem with this approach. See my comments below.
> > > >
> > > > The only restriction I have heard of (though not sure why) is that
> > > > xe_device_mem_access_get/put should not be called under lock. Though I am
> > > > not sure it is for spinlock or also mutex. So as we were saying the locking
> > > > will also need to move to xe_hwmon_process_reg.
> > > Yes from rev2 comments its dangerous to take mutex before
> > > xe_device_mem_access_get/put. With code for "PL1 disable/restore during
> > > resume" I saw deadlock. Scenario was power1_max write -> mutex lock -> rpm
> > > resume -> disable pl1 -> mutex lock (dead lock here).
> >
> > But this is already the wrong order as mentioned below. If we follow the
> > below order do we still see deadlock?
> >
> > > >
> > > > So:
> > > >
> > > > xe_hwmon_process_reg()
> > > > {
> > > >	xe_device_mem_access_get
> > > >	mutex_lock
> > > >	...
> > > >	mutex_unlock
> > > >	xe_device_mem_access_put
> > > > }
> > > >
> > > > So once again if this is not possible for some reason let's figure out why.
> > > There are two problems with this approach.
> > >
> > > Problem 1: If you see implementation of xe_hwmon_power_max_write, reg
> > > access is happening 3 times, so there will be 3 rpm suspend/resume
> > > cycles. I was observing the same with rfc implementation. So in subsequent
> > > series xe_device_mem_access_put/get is moved to top level functions
> > > i.e. hwmon hooks.
> >
> > This is not exactly correct because there is also a 1 second autosuspend
> > delay which will prevent such rpm suspend/resume cycles:
> >
> > xe_pm_runtime_init:
> >	pm_runtime_set_autosuspend_delay(dev, 1000);
> >
> >
> > >
> > > Problem 2: If locking moved inside xe_hwmon_process_reg then between two
> > > subsequent reg accesses it will open small window during which race can
> > > happen.
> > > As Anshuman suggested in other thread for read are sequential and protected
> > > by sysfs layer. So lets apply locking only for RW attributes.
> >
> > But what is the locking trying to protect? As far as I understand it is
> > just the registers which have to be atomically modified/read. So it seems
> > sufficient to just protect the register accesses with the lock.
> >
> > So I am still not convinced.
>
> Let's figure out the locking first depending on what needs to be protected
> (just registers or other data too). And then we can see where to put the
> xe_device_mem_access_get/put's (following the rule that
> xe_device_mem_access_get/put's should not be called under lock).
Nilawar, Badal Oct. 4, 2023, 6:43 a.m. UTC | #13
Hi Ashutosh,

On 04-10-2023 06:22, Dixit, Ashutosh wrote:
> On Fri, 29 Sep 2023 14:41:22 -0700, Dixit, Ashutosh wrote:
>>
> 
> Hi Badal,
> 
> Why did you merge the hwmon patches when there is still open discussion
> below on the patches? According to upstream rules (I'm not sure if you know
> about this) you should not merge patches, even if you have R-b's on the
> patches, till all review comments are resolved.
> 
> Generally you are expected to either address the comments or reply to the
> comments, or at least inform that you are merging, disregarding the
> comments. IMO you should at least have done one of these before merging.

I did selective merging. I haven't merged 5th patch yet as locking is 
still in discussion. I am working on addressing locking and thought I 
will address some of your comments with it.

Thanks,
Badal
> 
> Cc: @Vivi, Rodrigo
> 
> Thanks.
> --
> Ashutosh
> 
> 
>> On Fri, 29 Sep 2023 09:48:36 -0700, Dixit, Ashutosh wrote:
>>> On Thu, 28 Sep 2023 23:37:35 -0700, Nilawar, Badal wrote:
>>>>
>>>> On 28-09-2023 10:25, Dixit, Ashutosh wrote:
>>>>> On Wed, 27 Sep 2023 01:39:46 -0700, Nilawar, Badal wrote:
>>>>>
>>>>>> On 27-09-2023 10:23, Dixit, Ashutosh wrote:
>>>>>>> On Mon, 25 Sep 2023 01:18:38 -0700, Badal Nilawar wrote:
>>>>>>>>
>>>>>>>> +static umode_t
>>>>>>>> +xe_hwmon_is_visible(const void *drvdata, enum hwmon_sensor_types type,
>>>>>>>> +		    u32 attr, int channel)
>>>>>>>> +{
>>>>>>>> +	struct xe_hwmon *hwmon = (struct xe_hwmon *)drvdata;
>>>>>>>> +	int ret;
>>>>>>>> +
>>>>>>>> +	xe_device_mem_access_get(gt_to_xe(hwmon->gt));
>>>>>>>
>>>>>>> Maybe we do xe_device_mem_access_get/put in xe_hwmon_process_reg where it
>>>>>>> is needed? E.g. xe_hwmon_is_visible doesn't need to do this because it
>>>>>>> doesn't read/write registers.
>>>>>> Agreed, but visible function is called only once while registering hwmon
>>>>>> interface, which happen during driver probe. During driver probe device
>>>>>> will be in resumed state. So no harm in keeping
>>>>>> xe_device_mem_access_get/put in visible function.
>>>>>
>>>>> To me it doesn't make any sense to keep xe_device_mem_access_get/put
>>>>> anywhere except in xe_hwmon_process_reg where the HW access actually
>>>>> happens. We can eliminate xe_device_mem_access_get/put's all over the place
>>>>> if we do it. Isn't it?
>>>> Agreed, thought process here suggest that take rpm wakeref at lowest
>>>> possible level. I already tried this in rfc series and in some extent in
>>>> rev2. There is problem with this approach. See my comments below.
>>>>>
>>>>> The only restriction I have heard of (though not sure why) is that
>>>>> xe_device_mem_access_get/put should not be called under lock. Though I am
>>>>> not sure it is for spinlock or also mutex. So as we were saying the locking
>>>>> will also need to move to xe_hwmon_process_reg.
>>>> Yes from rev2 comments its dangerous to take mutex before
>>>> xe_device_mem_access_get/put. With code for "PL1 disable/restore during
>>>> resume" I saw deadlock. Scenario was power1_max write -> mutex lock -> rpm
>>>> resume -> disable pl1 -> mutex lock (dead lock here).
>>>
>>> But this is already the wrong order as mentioned below. If we follow the
>>> below order do we still see deadlock?
>>>
>>>>>
>>>>> So:
>>>>>
>>>>> xe_hwmon_process_reg()
>>>>> {
>>>>> 	xe_device_mem_access_get
>>>>> 	mutex_lock
>>>>> 	...
>>>>> 	mutex_unlock
>>>>> 	xe_device_mem_access_put
>>>>> }
>>>>>
>>>>> So once again if this is not possible for some reason let's figure out why.
>>>> There are two problems with this approach.
>>>>
>>>> Problem 1: If you see implementation of xe_hwmon_power_max_write, reg
>>>> access is happening 3 times, so there will be 3 rpm suspend/resume
>>>> cycles. I was observing the same with rfc implementation. So in subsequent
>>>> series xe_device_mem_access_put/get is moved to top level functions
>>>> i.e. hwmon hooks.
>>>
>>> This is not exactly correct because there is also a 1 second autosuspend
>>> delay which will prevent such rpm suspend/resume cycles:
>>>
>>> xe_pm_runtime_init:
>>> 	pm_runtime_set_autosuspend_delay(dev, 1000);
>>>
>>>
>>>>
>>>> Problem 2: If locking moved inside xe_hwmon_process_reg then between two
>>>> subsequent reg accesses it will open small window during which race can
>>>> happen.
>>>> As Anshuman suggested in other thread for read are sequential and protected
>>>> by sysfs layer. So lets apply locking only for RW attributes.
>>>
>>> But what is the locking trying to protect? As far as I understand it is
>>> just the registers which have to be atomically modified/read. So it seems
>>> sufficient to just protect the register accesses with the lock.
>>>
>>> So I am still not convinced.
>>
>> Let's figure out the locking first depending on what needs to be protected
>> (just registers or other data too). And then we can see where to put the
>> xe_device_mem_access_get/put's (following the rule that
>> xe_device_mem_access_get/put's should not be called under lock).
Nilawar, Badal Oct. 4, 2023, 10:18 a.m. UTC | #14
Hi Ashutosh,

On 30-09-2023 03:11, Dixit, Ashutosh wrote:
> On Fri, 29 Sep 2023 09:48:36 -0700, Dixit, Ashutosh wrote:
>>
> 
> Hi Badal,
> 
>> On Thu, 28 Sep 2023 23:37:35 -0700, Nilawar, Badal wrote:
>>>
>>> On 28-09-2023 10:25, Dixit, Ashutosh wrote:
>>>> On Wed, 27 Sep 2023 01:39:46 -0700, Nilawar, Badal wrote:
>>>>
>>>>> On 27-09-2023 10:23, Dixit, Ashutosh wrote:
>>>>>> On Mon, 25 Sep 2023 01:18:38 -0700, Badal Nilawar wrote:
>>>>>>>
>>>>>>> +static umode_t
>>>>>>> +xe_hwmon_is_visible(const void *drvdata, enum hwmon_sensor_types type,
>>>>>>> +		    u32 attr, int channel)
>>>>>>> +{
>>>>>>> +	struct xe_hwmon *hwmon = (struct xe_hwmon *)drvdata;
>>>>>>> +	int ret;
>>>>>>> +
>>>>>>> +	xe_device_mem_access_get(gt_to_xe(hwmon->gt));
>>>>>>
>>>>>> Maybe we do xe_device_mem_access_get/put in xe_hwmon_process_reg where it
>>>>>> is needed? E.g. xe_hwmon_is_visible doesn't need to do this because it
>>>>>> doesn't read/write registers.
>>>>> Agreed, but visible function is called only once while registering hwmon
>>>>> interface, which happen during driver probe. During driver probe device
>>>>> will be in resumed state. So no harm in keeping
>>>>> xe_device_mem_access_get/put in visible function.
>>>>
>>>> To me it doesn't make any sense to keep xe_device_mem_access_get/put
>>>> anywhere except in xe_hwmon_process_reg where the HW access actually
>>>> happens. We can eliminate xe_device_mem_access_get/put's all over the place
>>>> if we do it. Isn't it?
>>> Agreed, thought process here suggest that take rpm wakeref at lowest
>>> possible level. I already tried this in rfc series and in some extent in
>>> rev2. There is problem with this approach. See my comments below.
>>>>
>>>> The only restriction I have heard of (though not sure why) is that
>>>> xe_device_mem_access_get/put should not be called under lock. Though I am
>>>> not sure it is for spinlock or also mutex. So as we were saying the locking
>>>> will also need to move to xe_hwmon_process_reg.
>>> Yes from rev2 comments its dangerous to take mutex before
>>> xe_device_mem_access_get/put. With code for "PL1 disable/restore during
>>> resume" I saw deadlock. Scenario was power1_max write -> mutex lock -> rpm
>>> resume -> disable pl1 -> mutex lock (dead lock here).
>>
>> But this is already the wrong order as mentioned below. If we follow the
>> below order do we still see deadlock?
>>
>>>>
>>>> So:
>>>>
>>>> xe_hwmon_process_reg()
>>>> {
>>>> 	xe_device_mem_access_get
>>>> 	mutex_lock
>>>> 	...
>>>> 	mutex_unlock
>>>> 	xe_device_mem_access_put
>>>> }
>>>>
>>>> So once again if this is not possible for some reason let's figure out why.
>>> There are two problems with this approach.
>>>
>>> Problem 1: If you see implementation of xe_hwmon_power_max_write, reg
>>> access is happening 3 times, so there will be 3 rpm suspend/resume
>>> cycles. I was observing the same with rfc implementation. So in subsequent
>>> series xe_device_mem_access_put/get is moved to top level functions
>>> i.e. hwmon hooks.
>>
>> This is not exactly correct because there is also a 1 second autosuspend
>> delay which will prevent such rpm suspend/resume cycles:
>>
>> xe_pm_runtime_init:
>> 	pm_runtime_set_autosuspend_delay(dev, 1000);
>>
The rpm autosuspend delay can be 0 as well; IGT does set it to 0. In that 
case there will be an rpm cycle for every register access. So it is better 
to keep xe_device_mem_access_get/put at the attribute level, i.e. in the hwmon hooks.
>>
>>>
>>> Problem 2: If locking moved inside xe_hwmon_process_reg then between two
>>> subsequent reg accesses it will open small window during which race can
>>> happen.
>>> As Anshuman suggested in other thread for read are sequential and protected
>>> by sysfs layer. So lets apply locking only for RW attributes.
>>
>> But what is the locking trying to protect? As far as I understand it is
>> just the registers which have to be atomically modified/read. So it seems
>> sufficient to just protect the register accesses with the lock.
>>
>> So I am still not convinced.
In i915, initially rmw accesses were protected with a lock, but later, with 
the addition of the PL1 disable (during resume) logic, to some extent the 
locking got extended to the attribute level.

Another scenario where locking is needed is for rw attributes where 
write and read can happen from different threads.

For readonly attributes as per this 
(https://elixir.bootlin.com/linux/latest/source/fs/seq_file.c) locking 
is not needed.

I think lets apply locking at attribute level.
> 
> Let's figure out the locking first depending on what needs to be protected
> (just registers or other data too). And then we can see where to put the
> xe_device_mem_access_get/put's (following the rule that
> xe_device_mem_access_get/put's should not be called under lock).


Regards,
Badal
> 
> Thanks.
> --
> Ashutosh
Rodrigo Vivi Oct. 4, 2023, 3:56 p.m. UTC | #15
On Wed, Oct 04, 2023 at 12:13:06PM +0530, Nilawar, Badal wrote:
> Hi Ashutosh,
> 
> On 04-10-2023 06:22, Dixit, Ashutosh wrote:
> > On Fri, 29 Sep 2023 14:41:22 -0700, Dixit, Ashutosh wrote:
> > > 
> > 
> > Hi Badal,
> > 
> > Why did you merge the hwmon patches when there is still open discussion
> > below on the patches? According to upstream rules (I'm not sure if you know
> > about this) you should not merge patches, even if you have R-b's on the
> > patches, till all review comments are resolved.
> > 
> > Generally you are expected to either address the comments or reply to the
> > comments, or at least inform that you are merging, disregarding the
> > comments. IMO you should at least have done one of these before merging.
> 
> I did selective merging. I haven't merged 5th patch yet as locking is still
> in discussion. I am working on addressing locking and thought I will address
> some of your comments with it.

Just to ensure the split is clear to everyone and that we have CI running on
the exact chunk that is getting merged, next time, please split the series,
rebase and resend the ones that are ready. you might even use --subject-prefix=CI

and as always, let's not rush things in and be sure that all questions
and concerns are addressed.

Thanks,
Rodrigo.

> 
> Thanks,
> Badal
> > 
> > Cc: @Vivi, Rodrigo
> > 
> > Thanks.
> > --
> > Ashutosh
> > 
> > 
> > > On Fri, 29 Sep 2023 09:48:36 -0700, Dixit, Ashutosh wrote:
> > > > On Thu, 28 Sep 2023 23:37:35 -0700, Nilawar, Badal wrote:
> > > > > 
> > > > > On 28-09-2023 10:25, Dixit, Ashutosh wrote:
> > > > > > On Wed, 27 Sep 2023 01:39:46 -0700, Nilawar, Badal wrote:
> > > > > > 
> > > > > > > On 27-09-2023 10:23, Dixit, Ashutosh wrote:
> > > > > > > > On Mon, 25 Sep 2023 01:18:38 -0700, Badal Nilawar wrote:
> > > > > > > > > 
> > > > > > > > > +static umode_t
> > > > > > > > > +xe_hwmon_is_visible(const void *drvdata, enum hwmon_sensor_types type,
> > > > > > > > > +		    u32 attr, int channel)
> > > > > > > > > +{
> > > > > > > > > +	struct xe_hwmon *hwmon = (struct xe_hwmon *)drvdata;
> > > > > > > > > +	int ret;
> > > > > > > > > +
> > > > > > > > > +	xe_device_mem_access_get(gt_to_xe(hwmon->gt));
> > > > > > > > 
> > > > > > > > Maybe we do xe_device_mem_access_get/put in xe_hwmon_process_reg where it
> > > > > > > > is needed? E.g. xe_hwmon_is_visible doesn't need to do this because it
> > > > > > > > doesn't read/write registers.
> > > > > > > Agreed, but visible function is called only once while registering hwmon
> > > > > > > interface, which happen during driver probe. During driver probe device
> > > > > > > will be in resumed state. So no harm in keeping
> > > > > > > xe_device_mem_access_get/put in visible function.
> > > > > > 
> > > > > > To me it doesn't make any sense to keep xe_device_mem_access_get/put
> > > > > > anywhere except in xe_hwmon_process_reg where the HW access actually
> > > > > > happens. We can eliminate xe_device_mem_access_get/put's all over the place
> > > > > > if we do it. Isn't it?
> > > > > Agreed, thought process here suggest that take rpm wakeref at lowest
> > > > > possible level. I already tried this in rfc series and in some extent in
> > > > > rev2. There is problem with this approach. See my comments below.
> > > > > > 
> > > > > > The only restriction I have heard of (though not sure why) is that
> > > > > > xe_device_mem_access_get/put should not be called under lock. Though I am
> > > > > > not sure it is for spinlock or also mutex. So as we were saying the locking
> > > > > > will also need to move to xe_hwmon_process_reg.
> > > > > Yes from rev2 comments its dangerous to take mutex before
> > > > > xe_device_mem_access_get/put. With code for "PL1 disable/restore during
> > > > > resume" I saw deadlock. Scenario was power1_max write -> mutex lock -> rpm
> > > > > resume -> disable pl1 -> mutex lock (dead lock here).
> > > > 
> > > > But this is already the wrong order as mentioned below. If we follow the
> > > > below order do we still see deadlock?
> > > > 
> > > > > > 
> > > > > > So:
> > > > > > 
> > > > > > xe_hwmon_process_reg()
> > > > > > {
> > > > > > 	xe_device_mem_access_get
> > > > > > 	mutex_lock
> > > > > > 	...
> > > > > > 	mutex_unlock
> > > > > > 	xe_device_mem_access_put
> > > > > > }
> > > > > > 
> > > > > > So once again if this is not possible for some reason let's figure out why.
> > > > > There are two problems with this approach.
> > > > > 
> > > > > Problem 1: If you see implementation of xe_hwmon_power_max_write, reg
> > > > > access is happening 3 times, so there will be 3 rpm suspend/resume
> > > > > cycles. I was observing the same with rfc implementation. So in subsequent
> > > > > series xe_device_mem_access_put/get is moved to top level functions
> > > > > i.e. hwmon hooks.
> > > > 
> > > > This is not exactly correct because there is also a 1 second autosuspend
> > > > delay which will prevent such rpm suspend/resume cycles:
> > > > 
> > > > xe_pm_runtime_init:
> > > > 	pm_runtime_set_autosuspend_delay(dev, 1000);
> > > > 
> > > > 
> > > > > 
> > > > > Problem 2: If locking moved inside xe_hwmon_process_reg then between two
> > > > > subsequent reg accesses it will open small window during which race can
> > > > > happen.
> > > > > As Anshuman suggested in other thread for read are sequential and protected
> > > > > by sysfs layer. So lets apply locking only for RW attributes.
> > > > 
> > > > But what is the locking trying to protect? As far as I understand it is
> > > > just the registers which have to be atomically modified/read. So it seems
> > > > sufficient to just protect the register accesses with the lock.
> > > > 
> > > > So I am still not convinced.
> > > 
> > > Let's figure out the locking first depending on what needs to be protected
> > > (just registers or other data too). And then we can see where to put the
> > > xe_device_mem_access_get/put's (following the rule that
> > > xe_device_mem_access_get/put's should not be called under lock).
Rodrigo Vivi Oct. 4, 2023, 4:11 p.m. UTC | #16
On Wed, Oct 04, 2023 at 11:56:33AM -0400, Rodrigo Vivi wrote:
> On Wed, Oct 04, 2023 at 12:13:06PM +0530, Nilawar, Badal wrote:
> > Hi Ashutosh,
> > 
> > On 04-10-2023 06:22, Dixit, Ashutosh wrote:
> > > On Fri, 29 Sep 2023 14:41:22 -0700, Dixit, Ashutosh wrote:
> > > > 
> > > 
> > > Hi Badal,
> > > 
> > > Why did you merge the hwmon patches when there is still open discussion
> > > below on the patches? According to upstream rules (I'm not sure if you know
> > > about this) you should not merge patches, even if you have R-b's on the
> > > patches, till all review comments are resolved.
> > > 
> > > Generally you are expected to either address the comments or reply to the
> > > comments, or at least inform that you are merging, disregarding the
> > > comments. IMO you should at least have done one of these before merging.
> > 
> > I did selective merging. I haven't merged 5th patch yet as locking is still
> > in discussion. I am working on addressing locking and thought I will address
> > some of your comments with it.

There was still an open discussion going around the (merged) patch 1, regarding
the positioning of the mem_access get/put. So, next time hold a bit before
pushing. But the positioning of those mem_access get/put are safe although maybe
not ideal... (needed?!). Anyway that can be a follow up fix or improvement and
I'm okay with the way it currently is in the code.

> 
> Just to ensure the split is clear to everyone and that we have CI running on
> the exact chunk that is getting merged, next time, please split the series,
> rebase and resend the ones that are ready. you might even use --subject-prefix=CI
> 
> and as always, let's not rush things in and be sure that all questions
> and concerns are addressed.
> 
> Thanks,
> Rodrigo.
> 
> > 
> > Thanks,
> > Badal
> > > 
> > > Cc: @Vivi, Rodrigo
> > > 
> > > Thanks.
> > > --
> > > Ashutosh
> > > 
> > > 
> > > > On Fri, 29 Sep 2023 09:48:36 -0700, Dixit, Ashutosh wrote:
> > > > > On Thu, 28 Sep 2023 23:37:35 -0700, Nilawar, Badal wrote:
> > > > > > 
> > > > > > On 28-09-2023 10:25, Dixit, Ashutosh wrote:
> > > > > > > On Wed, 27 Sep 2023 01:39:46 -0700, Nilawar, Badal wrote:
> > > > > > > 
> > > > > > > > On 27-09-2023 10:23, Dixit, Ashutosh wrote:
> > > > > > > > > On Mon, 25 Sep 2023 01:18:38 -0700, Badal Nilawar wrote:
> > > > > > > > > > 
> > > > > > > > > > +static umode_t
> > > > > > > > > > +xe_hwmon_is_visible(const void *drvdata, enum hwmon_sensor_types type,
> > > > > > > > > > +		    u32 attr, int channel)
> > > > > > > > > > +{
> > > > > > > > > > +	struct xe_hwmon *hwmon = (struct xe_hwmon *)drvdata;
> > > > > > > > > > +	int ret;
> > > > > > > > > > +
> > > > > > > > > > +	xe_device_mem_access_get(gt_to_xe(hwmon->gt));
> > > > > > > > > 
> > > > > > > > > Maybe we do xe_device_mem_access_get/put in xe_hwmon_process_reg where it
> > > > > > > > > is needed? E.g. xe_hwmon_is_visible doesn't need to do this because it
> > > > > > > > > doesn't read/write registers.
> > > > > > > > Agreed, but visible function is called only once while registering hwmon
> > > > > > > > interface, which happen during driver probe. During driver probe device
> > > > > > > > will be in resumed state. So no harm in keeping
> > > > > > > > xe_device_mem_access_get/put in visible function.
> > > > > > > 
> > > > > > > To me it doesn't make any sense to keep xe_device_mem_access_get/put
> > > > > > > anywhere except in xe_hwmon_process_reg where the HW access actually
> > > > > > > happens. We can eliminate xe_device_mem_access_get/put's all over the place
> > > > > > > if we do it. Isn't it?
> > > > > > Agreed, thought process here suggest that take rpm wakeref at lowest
> > > > > > possible level. I already tried this in rfc series and in some extent in
> > > > > > rev2. There is problem with this approach. See my comments below.
> > > > > > > 
> > > > > > > The only restriction I have heard of (though not sure why) is that
> > > > > > > xe_device_mem_access_get/put should not be called under lock>. Though I am
> > > > > > > not sure it is for spinlock or also mutex. So as we were saying the locking
> > > > > > > will also need to move to xe_hwmon_process_reg.
> > > > > > Yes from rev2 comments its dangerous to take mutex before
> > > > > > xe_device_mem_access_get/put. With code for "PL1 disable/restore during
> > > > > > resume" I saw deadlock. Scenario was power1_max write -> mutex lock -> rpm
> > > > > > resume -> disable pl1 -> mutex lock (dead lock here).
> > > > > 
> > > > > But this is already the wrong order as mentioned below. If we follow the
> > > > > below order do we still see deadlock?
> > > > > 
> > > > > > > 
> > > > > > > So:
> > > > > > > 
> > > > > > > xe_hwmon_process_reg()
> > > > > > > {
> > > > > > > 	xe_device_mem_access_get
> > > > > > > 	mutex_lock
> > > > > > > 	...
> > > > > > > 	mutex_unlock
> > > > > > > 	xe_device_mem_access_put
> > > > > > > }
> > > > > > > 
> > > > > > > So once again if this is not possible for some reason let's figure out why.
> > > > > > There are two problems with this approach.
> > > > > > 
> > > > > > Problem 1: If you see implementation of xe_hwmon_power_max_write, reg
> > > > > > access is happening 3 times, so there will be 3 rpm suspend/resume
> > > > > > cycles. I was observing the same with rfc implementation. So in subsequent
> > > > > > series xe_device_mem_access_put/get is moved to top level functions
> > > > > > i.e. hwmon hooks.
> > > > > 
> > > > > This is not exactly correct because there is also a 1 second autosuspend
> > > > > delay which will prevent such rpm suspend/resume cycles:
> > > > > 
> > > > > xe_pm_runtime_init:
> > > > > 	pm_runtime_set_autosuspend_delay(dev, 1000);
> > > > > 
> > > > > 
> > > > > > 
> > > > > > Problem 2: If locking moved inside xe_hwmon_process_reg then between two
> > > > > > subsequent reg accesses it will open small window during which race can
> > > > > > happen.
> > > > > > As Anshuman suggested in other thread for read are sequential and protected
> > > > > > by sysfs layer. So lets apply locking only for RW attributes.
> > > > > 
> > > > > But what is the locking trying to protect? As far as I understand it is
> > > > > just the registers which have to be atomically modified/read. So it seems
> > > > > sufficient to just protect the register accesses with the lock.
> > > > > 
> > > > > So I am still not convinced.
> > > > 
> > > > Let's figure out the locking first depending on what needs to be protected
> > > > (just registers or other data too). And then we can see where to put the
> > > > xe_device_mem_access_get/put's (following the rule that
> > > > xe_device_mem_access_get/put's should not be called under lock).
diff mbox series

Patch

diff --git a/Documentation/ABI/testing/sysfs-driver-intel-xe-hwmon b/Documentation/ABI/testing/sysfs-driver-intel-xe-hwmon
new file mode 100644
index 000000000000..da0197a29fe4
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-driver-intel-xe-hwmon
@@ -0,0 +1,22 @@ 
+What:		/sys/devices/.../hwmon/hwmon<i>/power1_max
+Date:		September 2023
+KernelVersion:	6.5
+Contact:	intel-xe@lists.freedesktop.org
+Description:	RW. Card reactive sustained (PL1) power limit in microwatts.
+
+		The power controller will throttle the operating frequency
+		if the power averaged over a window (typically seconds)
+		exceeds this limit. A read value of 0 means that the PL1
+		power limit is disabled, writing 0 disables the
+		limit. Writing values > 0 and <= TDP will enable the power limit.
+
+		Only supported for particular Intel xe graphics platforms.
+
+What:		/sys/devices/.../hwmon/hwmon<i>/power1_rated_max
+Date:		September 2023
+KernelVersion:	6.5
+Contact:	intel-xe@lists.freedesktop.org
+Description:	RO. Card default power limit (default TDP setting).
+
+		Only supported for particular Intel xe graphics platforms.
+
diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
index b1681d1416eb..294fee78dd29 100644
--- a/drivers/gpu/drm/xe/Makefile
+++ b/drivers/gpu/drm/xe/Makefile
@@ -118,6 +118,9 @@  xe-y += xe_bb.o \
 	xe_wa.o \
 	xe_wopcm.o
 
+# graphics hardware monitoring (HWMON) support
+xe-$(CONFIG_HWMON) += xe_hwmon.o
+
 # i915 Display compat #defines and #includes
 subdir-ccflags-$(CONFIG_DRM_XE_DISPLAY) += \
 	-I$(srctree)/$(src)/display/ext \
diff --git a/drivers/gpu/drm/xe/regs/xe_gt_regs.h b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
index e13fbbdf6929..679cdba9f383 100644
--- a/drivers/gpu/drm/xe/regs/xe_gt_regs.h
+++ b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
@@ -410,4 +410,8 @@ 
 #define XEHPC_BCS5_BCS6_INTR_MASK		XE_REG(0x190118)
 #define XEHPC_BCS7_BCS8_INTR_MASK		XE_REG(0x19011c)
 
+#define PVC_GT0_PACKAGE_RAPL_LIMIT		XE_REG(0x281008)
+#define PVC_GT0_PACKAGE_POWER_SKU_UNIT		XE_REG(0x281068)
+#define PVC_GT0_PACKAGE_POWER_SKU		XE_REG(0x281080)
+
 #endif
diff --git a/drivers/gpu/drm/xe/regs/xe_mchbar_regs.h b/drivers/gpu/drm/xe/regs/xe_mchbar_regs.h
new file mode 100644
index 000000000000..27f1d42baf6d
--- /dev/null
+++ b/drivers/gpu/drm/xe/regs/xe_mchbar_regs.h
@@ -0,0 +1,33 @@ 
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2023 Intel Corporation
+ */
+
+#ifndef _XE_MCHBAR_REGS_H_
+#define _XE_MCHBAR_REGS_H_
+
+#include "regs/xe_reg_defs.h"
+
+/*
+ * MCHBAR mirror.
+ *
+ * This mirrors the MCHBAR MMIO space whose location is determined by
+ * device 0 function 0's pci config register 0x44 or 0x48 and matches it in
+ * every way.
+ */
+
+#define MCHBAR_MIRROR_BASE_SNB			0x140000
+
+/*
+ * Package power SKU: TDP and min/max power limits. The raw field values
+ * are presumably in hardware power units whose shift is published in
+ * PKG_PWR_UNIT below — confirm against the consumer (xe_hwmon).
+ */
+#define PCU_CR_PACKAGE_POWER_SKU		XE_REG(MCHBAR_MIRROR_BASE_SNB + 0x5930)
+#define   PKG_TDP				GENMASK_ULL(14, 0)
+#define   PKG_MIN_PWR				GENMASK_ULL(30, 16)
+#define   PKG_MAX_PWR				GENMASK_ULL(46, 32)
+
+/* Power unit exponent used to scale the SKU/limit fields above */
+#define PCU_CR_PACKAGE_POWER_SKU_UNIT		XE_REG(MCHBAR_MIRROR_BASE_SNB + 0x5938)
+#define   PKG_PWR_UNIT				REG_GENMASK(3, 0)
+
+/* RAPL limit: PL1 (sustained) power limit value and its enable bit */
+#define PCU_CR_PACKAGE_RAPL_LIMIT		XE_REG(MCHBAR_MIRROR_BASE_SNB + 0x59a0)
+#define   PKG_PWR_LIM_1				REG_GENMASK(14, 0)
+#define   PKG_PWR_LIM_1_EN			REG_BIT(15)
+
+#endif /* _XE_MCHBAR_REGS_H_ */
diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
index 687dc3d79a66..b9ff42a26ca3 100644
--- a/drivers/gpu/drm/xe/xe_device.c
+++ b/drivers/gpu/drm/xe/xe_device.c
@@ -35,6 +35,7 @@ 
 #include "xe_vm.h"
 #include "xe_vm_madvise.h"
 #include "xe_wait_user_fence.h"
+#include "xe_hwmon.h"
 
 #ifdef CONFIG_LOCKDEP
 struct lockdep_map xe_device_mem_access_lockdep_map = {
@@ -356,6 +357,8 @@  int xe_device_probe(struct xe_device *xe)
 
 	xe_pmu_register(&xe->pmu);
 
+	xe_hwmon_register(xe);
+
 	err = drmm_add_action_or_reset(&xe->drm, xe_device_sanitize, xe);
 	if (err)
 		return err;
diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
index 32ab0fea04ee..519aec01fb0b 100644
--- a/drivers/gpu/drm/xe/xe_device_types.h
+++ b/drivers/gpu/drm/xe/xe_device_types.h
@@ -365,6 +365,8 @@  struct xe_device {
 	/** @pmu: performance monitoring unit */
 	struct xe_pmu pmu;
 
+	struct xe_hwmon *hwmon;
+
 	/* private: */
 
 #if IS_ENABLED(CONFIG_DRM_XE_DISPLAY)
diff --git a/drivers/gpu/drm/xe/xe_hwmon.c b/drivers/gpu/drm/xe/xe_hwmon.c
new file mode 100644
index 000000000000..44d814e111c6
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_hwmon.c
@@ -0,0 +1,357 @@ 
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2023 Intel Corporation
+ */
+
+#include <linux/hwmon.h>
+
+#include <drm/drm_managed.h>
+#include "regs/xe_gt_regs.h"
+#include "regs/xe_mchbar_regs.h"
+#include "xe_device.h"
+#include "xe_gt.h"
+#include "xe_hwmon.h"
+#include "xe_mmio.h"
+
+/* Logical register ids, mapped to a platform MMIO register by xe_hwmon_get_reg() */
+enum xe_hwmon_reg {
+	REG_PKG_RAPL_LIMIT,
+	REG_PKG_POWER_SKU,
+	REG_PKG_POWER_SKU_UNIT,
+};
+
+/* Kind of access performed by xe_hwmon_process_reg() */
+enum xe_hwmon_reg_operation {
+	REG_READ,
+	REG_WRITE,
+	REG_RMW,
+};
+
+/*
+ * SF_* - scale factors for particular quantities according to hwmon spec.
+ */
+#define SF_POWER	1000000		/* microwatts */
+
+struct xe_hwmon {
+	/* device hwmon<i> registered with the hwmon core */
+	struct device *hwmon_dev;
+	/* primary GT, used for all device-level MMIO accesses */
+	struct xe_gt *gt;
+	struct mutex hwmon_lock; /* protects rmw operations */
+	/* power unit shift, cached from REG_PKG_POWER_SKU_UNIT at init */
+	int scl_shift_power;
+};
+
+/*
+ * Map a logical hwmon register id to the platform-specific MMIO register.
+ *
+ * Returns the raw register offset, or 0 (XE_REG(0)) when the register is
+ * not supported on this platform. Only DG2 and PVC are handled here.
+ */
+static u32 xe_hwmon_get_reg(struct xe_hwmon *hwmon, enum xe_hwmon_reg hwmon_reg)
+{
+	struct xe_device *xe = gt_to_xe(hwmon->gt);
+	struct xe_reg reg = XE_REG(0);
+
+	switch (hwmon_reg) {
+	case REG_PKG_RAPL_LIMIT:
+		if (xe->info.platform == XE_DG2)
+			reg = PCU_CR_PACKAGE_RAPL_LIMIT;
+		else if (xe->info.platform == XE_PVC)
+			reg = PVC_GT0_PACKAGE_RAPL_LIMIT;
+		break;
+	case REG_PKG_POWER_SKU:
+		if (xe->info.platform == XE_DG2)
+			reg = PCU_CR_PACKAGE_POWER_SKU;
+		else if (xe->info.platform == XE_PVC)
+			reg = PVC_GT0_PACKAGE_POWER_SKU;
+		break;
+	case REG_PKG_POWER_SKU_UNIT:
+		if (xe->info.platform == XE_DG2)
+			reg = PCU_CR_PACKAGE_POWER_SKU_UNIT;
+		else if (xe->info.platform == XE_PVC)
+			reg = PVC_GT0_PACKAGE_POWER_SKU_UNIT;
+		break;
+	default:
+		drm_warn(&xe->drm, "Unknown xe hwmon reg id: %d\n", hwmon_reg);
+		break;
+	}
+
+	/* 0 means "not supported"; callers must check before using */
+	return reg.raw;
+}
+
+/*
+ * Perform a 32-bit access on the MMIO register backing @hwmon_reg.
+ *
+ * @value receives the value read (REG_READ), supplies the value to write
+ * (REG_WRITE), or receives the return of xe_mmio_rmw32() (REG_RMW).
+ * @clr/@set are only used for REG_RMW.
+ *
+ * Returns 0 on success, -EOPNOTSUPP when the register is not present on
+ * this platform or the operation is unknown.
+ */
+static int xe_hwmon_process_reg(struct xe_hwmon *hwmon, enum xe_hwmon_reg hwmon_reg,
+				enum xe_hwmon_reg_operation operation, u32 *value,
+				u32 clr, u32 set)
+{
+	struct xe_reg reg;
+
+	reg.raw = xe_hwmon_get_reg(hwmon, hwmon_reg);
+
+	if (!reg.raw)
+		return -EOPNOTSUPP;
+
+	switch (operation) {
+	case REG_READ:
+		*value = xe_mmio_read32(hwmon->gt, reg);
+		return 0;
+	case REG_WRITE:
+		xe_mmio_write32(hwmon->gt, reg, *value);
+		return 0;
+	case REG_RMW:
+		*value = xe_mmio_rmw32(hwmon->gt, reg, clr, set);
+		return 0;
+	default:
+		drm_warn(&gt_to_xe(hwmon->gt)->drm, "Invalid xe hwmon reg operation: %d\n",
+			 operation);
+		return -EOPNOTSUPP;
+	}
+}
+
+/*
+ * Read the 64-bit MMIO register backing @hwmon_reg via xe_mmio_read64_2x32().
+ *
+ * Returns 0 on success and stores the value in @value, or -EOPNOTSUPP when
+ * the register is not present on this platform.
+ *
+ * Marked static: this helper is only used inside this file (it has no
+ * declaration in xe_hwmon.h) and every sibling helper here is static.
+ */
+static int xe_hwmon_process_reg_read64(struct xe_hwmon *hwmon,
+				       enum xe_hwmon_reg hwmon_reg, u64 *value)
+{
+	struct xe_reg reg;
+
+	reg.raw = xe_hwmon_get_reg(hwmon, hwmon_reg);
+
+	if (!reg.raw)
+		return -EOPNOTSUPP;
+
+	*value = xe_mmio_read64_2x32(hwmon->gt, reg);
+
+	return 0;
+}
+
+#define PL1_DISABLE 0
+
+/*
+ * HW allows arbitrary PL1 limits to be set but silently clamps these values to
+ * "typical but not guaranteed" min/max values in REG_PKG_POWER_SKU. Follow the
+ * same pattern for sysfs, allow arbitrary PL1 limits to be set but display
+ * clamped values when read.
+ */
+static int xe_hwmon_power_max_read(struct xe_hwmon *hwmon, long *value)
+{
+	u32 reg_val;
+	u64 reg_val64, min, max;
+
+	xe_hwmon_process_reg(hwmon, REG_PKG_RAPL_LIMIT, REG_READ, &reg_val, 0, 0);
+	/* Check if PL1 limit is disabled */
+	if (!(reg_val & PKG_PWR_LIM_1_EN)) {
+		*value = PL1_DISABLE;
+		return 0;
+	}
+
+	/* Scale the raw limit field to microwatts (hwmon power unit) */
+	reg_val = REG_FIELD_GET(PKG_PWR_LIM_1, reg_val);
+	*value = mul_u64_u32_shr(reg_val, SF_POWER, hwmon->scl_shift_power);
+
+	/* Clamp the reported value to the SKU min/max, when both are non-zero */
+	xe_hwmon_process_reg_read64(hwmon, REG_PKG_POWER_SKU, &reg_val64);
+	min = REG_FIELD_GET(PKG_MIN_PWR, reg_val64);
+	min = mul_u64_u32_shr(min, SF_POWER, hwmon->scl_shift_power);
+	max = REG_FIELD_GET(PKG_MAX_PWR, reg_val64);
+	max = mul_u64_u32_shr(max, SF_POWER, hwmon->scl_shift_power);
+
+	if (min && max)
+		*value = clamp_t(u64, *value, min, max);
+
+	return 0;
+}
+
+/*
+ * Set (or disable) the PL1 sustained power limit.
+ *
+ * Per the ABI doc, writing 0 (PL1_DISABLE) disables the limit; any other
+ * value is converted from microwatts to hw power units and programmed with
+ * the enable bit set.
+ *
+ * Returns 0 on success, -EOPNOTSUPP when the platform refuses to disable
+ * the limit (not all platforms allow disabling PL1).
+ */
+static int xe_hwmon_power_max_write(struct xe_hwmon *hwmon, long value)
+{
+	u32 reg_val;
+
+	/* Disable PL1 limit and verify, as limit cannot be disabled on all platforms */
+	if (value == PL1_DISABLE) {
+		xe_hwmon_process_reg(hwmon, REG_PKG_RAPL_LIMIT, REG_RMW, &reg_val,
+				     PKG_PWR_LIM_1_EN, 0);
+		xe_hwmon_process_reg(hwmon, REG_PKG_RAPL_LIMIT, REG_READ, &reg_val,
+				     0, 0);
+
+		if (reg_val & PKG_PWR_LIM_1_EN)
+			return -EOPNOTSUPP;
+
+		/*
+		 * Limit successfully disabled: return here. Falling through
+		 * (as the previous revision did) would re-enable PL1 with a
+		 * limit of 0 in the RMW below.
+		 */
+		return 0;
+	}
+
+	/* Computation in 64-bits to avoid overflow. Round to nearest. */
+	reg_val = DIV_ROUND_CLOSEST_ULL((u64)value << hwmon->scl_shift_power, SF_POWER);
+	reg_val = PKG_PWR_LIM_1_EN | REG_FIELD_PREP(PKG_PWR_LIM_1, reg_val);
+
+	xe_hwmon_process_reg(hwmon, REG_PKG_RAPL_LIMIT, REG_RMW, &reg_val,
+			     PKG_PWR_LIM_1_EN | PKG_PWR_LIM_1, reg_val);
+
+	return 0;
+}
+
+/* Report the card default power limit (TDP) in microwatts. Always succeeds. */
+static int xe_hwmon_power_rated_max_read(struct xe_hwmon *hwmon, long *value)
+{
+	u32 val;
+
+	xe_hwmon_process_reg(hwmon, REG_PKG_POWER_SKU, REG_READ, &val, 0, 0);
+
+	*value = mul_u64_u32_shr(REG_FIELD_GET(PKG_TDP, val), SF_POWER,
+				 hwmon->scl_shift_power);
+
+	return 0;
+}
+
+/* One power channel: power1_max and power1_rated_max (see ABI doc) */
+static const struct hwmon_channel_info *hwmon_info[] = {
+	HWMON_CHANNEL_INFO(power, HWMON_P_MAX | HWMON_P_RATED_MAX),
+	NULL
+};
+
+/*
+ * Permissions for the power attributes: power1_max is RW (0664) when the
+ * RAPL limit register exists on this platform, power1_rated_max is RO
+ * (0444) when the power SKU register exists; 0 hides the attribute.
+ */
+static umode_t
+xe_hwmon_power_is_visible(struct xe_hwmon *hwmon, u32 attr, int chan)
+{
+	if (attr == hwmon_power_max)
+		return xe_hwmon_get_reg(hwmon, REG_PKG_RAPL_LIMIT) ? 0664 : 0;
+
+	if (attr == hwmon_power_rated_max)
+		return xe_hwmon_get_reg(hwmon, REG_PKG_POWER_SKU) ? 0444 : 0;
+
+	return 0;
+}
+
+/* Dispatch hwmon power reads to the attribute-specific helper. */
+static int
+xe_hwmon_power_read(struct xe_hwmon *hwmon, u32 attr, int chan, long *val)
+{
+	switch (attr) {
+	case hwmon_power_max:
+		return xe_hwmon_power_max_read(hwmon, val);
+	case hwmon_power_rated_max:
+		return xe_hwmon_power_rated_max_read(hwmon, val);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+/* Dispatch hwmon power writes; only power1_max is writable. */
+static int
+xe_hwmon_power_write(struct xe_hwmon *hwmon, u32 attr, int chan, long val)
+{
+	if (attr != hwmon_power_max)
+		return -EOPNOTSUPP;
+
+	return xe_hwmon_power_max_write(hwmon, val);
+}
+
+/*
+ * hwmon ->is_visible callback.
+ *
+ * Holds a mem access (runtime pm) reference across the per-type helper.
+ * NOTE(review): the helpers called here only map register ids and do not
+ * touch hardware, so the get/put may not be strictly required — this is
+ * the open question discussed in the thread above.
+ */
+static umode_t
+xe_hwmon_is_visible(const void *drvdata, enum hwmon_sensor_types type,
+		    u32 attr, int channel)
+{
+	struct xe_hwmon *hwmon = (struct xe_hwmon *)drvdata;
+	int ret;
+
+	xe_device_mem_access_get(gt_to_xe(hwmon->gt));
+
+	switch (type) {
+	case hwmon_power:
+		ret = xe_hwmon_power_is_visible(hwmon, attr, channel);
+		break;
+	default:
+		ret = 0;
+		break;
+	}
+
+	xe_device_mem_access_put(gt_to_xe(hwmon->gt));
+
+	return ret;
+}
+
+/*
+ * hwmon ->read callback: take a mem access (runtime pm) reference around
+ * the register reads performed by the type-specific helpers.
+ */
+static int
+xe_hwmon_read(struct device *dev, enum hwmon_sensor_types type, u32 attr,
+	      int channel, long *val)
+{
+	struct xe_hwmon *hwmon = dev_get_drvdata(dev);
+	int ret;
+
+	xe_device_mem_access_get(gt_to_xe(hwmon->gt));
+
+	switch (type) {
+	case hwmon_power:
+		ret = xe_hwmon_power_read(hwmon, attr, channel, val);
+		break;
+	default:
+		ret = -EOPNOTSUPP;
+		break;
+	}
+
+	xe_device_mem_access_put(gt_to_xe(hwmon->gt));
+
+	return ret;
+}
+
+/*
+ * hwmon ->write callback: take a mem access (runtime pm) reference around
+ * the register writes performed by the type-specific helpers.
+ */
+static int
+xe_hwmon_write(struct device *dev, enum hwmon_sensor_types type, u32 attr,
+	       int channel, long val)
+{
+	struct xe_hwmon *hwmon = dev_get_drvdata(dev);
+	int ret;
+
+	xe_device_mem_access_get(gt_to_xe(hwmon->gt));
+
+	switch (type) {
+	case hwmon_power:
+		ret = xe_hwmon_power_write(hwmon, attr, channel, val);
+		break;
+	default:
+		ret = -EOPNOTSUPP;
+		break;
+	}
+
+	xe_device_mem_access_put(gt_to_xe(hwmon->gt));
+
+	return ret;
+}
+
+/* hwmon callbacks; xe_hwmon_is_visible decides which attributes exist */
+static const struct hwmon_ops hwmon_ops = {
+	.is_visible = xe_hwmon_is_visible,
+	.read = xe_hwmon_read,
+	.write = xe_hwmon_write,
+};
+
+/* Chip description handed to devm_hwmon_device_register_with_info() */
+static const struct hwmon_chip_info hwmon_chip_info = {
+	.ops = &hwmon_ops,
+	.info = hwmon_info,
+};
+
+/*
+ * Capture state needed before hwmon registration: the power unit shift
+ * from REG_PKG_POWER_SKU_UNIT, cached in hwmon->scl_shift_power. On
+ * platforms without that register (-EOPNOTSUPP) the shift stays 0.
+ */
+static void
+xe_hwmon_get_preregistration_info(struct xe_device *xe)
+{
+	struct xe_hwmon *hwmon = xe->hwmon;
+	u32 val_sku_unit = 0;
+	int ret;
+
+	ret = xe_hwmon_process_reg(hwmon, REG_PKG_POWER_SKU_UNIT, REG_READ, &val_sku_unit, 0, 0);
+	/*
+	 * The contents of register PKG_POWER_SKU_UNIT do not change,
+	 * so read it once and store the shift values.
+	 */
+	if (!ret)
+		hwmon->scl_shift_power = REG_FIELD_GET(PKG_PWR_UNIT, val_sku_unit);
+}
+
+/**
+ * xe_hwmon_register - register the hwmon interface for a xe device
+ * @xe: xe device instance
+ *
+ * Only discrete graphics expose hwmon. All failures are soft: the driver
+ * keeps probing and the device simply has no hwmon interface (xe->hwmon
+ * stays NULL).
+ */
+void xe_hwmon_register(struct xe_device *xe)
+{
+	struct device *dev = xe->drm.dev;
+	struct xe_hwmon *hwmon;
+
+	/* hwmon is available only for dGfx */
+	if (!IS_DGFX(xe))
+		return;
+
+	hwmon = devm_kzalloc(dev, sizeof(*hwmon), GFP_KERNEL);
+	if (!hwmon)
+		return;
+
+	xe->hwmon = hwmon;
+
+	/* drmm_mutex_init() can fail; don't register with an unusable lock */
+	if (drmm_mutex_init(&xe->drm, &hwmon->hwmon_lock)) {
+		xe->hwmon = NULL;
+		return;
+	}
+
+	/* primary GT to access device level properties */
+	hwmon->gt = xe->tiles[0].primary_gt;
+
+	xe_hwmon_get_preregistration_info(xe);
+
+	drm_dbg(&xe->drm, "Register xe hwmon interface\n");
+
+	/*  hwmon_dev points to device hwmon<i> */
+	hwmon->hwmon_dev = devm_hwmon_device_register_with_info(dev, "xe", hwmon,
+								&hwmon_chip_info,
+								NULL);
+	if (IS_ERR(hwmon->hwmon_dev)) {
+		drm_warn(&xe->drm, "Failed to register xe hwmon (%pe)\n", hwmon->hwmon_dev);
+		xe->hwmon = NULL;
+	}
+}
+
diff --git a/drivers/gpu/drm/xe/xe_hwmon.h b/drivers/gpu/drm/xe/xe_hwmon.h
new file mode 100644
index 000000000000..c42a1de2cd7a
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_hwmon.h
@@ -0,0 +1,19 @@ 
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2023 Intel Corporation
+ */
+
+#ifndef _XE_HWMON_H_
+#define _XE_HWMON_H_
+
+#include <linux/types.h>
+
+struct xe_device;
+
+#if IS_REACHABLE(CONFIG_HWMON)
+void xe_hwmon_register(struct xe_device *xe);
+#else
+/*
+ * No-op stub when HWMON is not reachable. Note: no trailing semicolon
+ * after the body — a stray file-scope ';' is not valid ISO C (pre-C23)
+ * and triggers -Wpedantic.
+ */
+static inline void xe_hwmon_register(struct xe_device *xe) { }
+#endif
+
+#endif /* _XE_HWMON_H_ */