[v3,2/3] nvme: add thermal zone devices
diff mbox series

Message ID 1558888143-5121-3-git-send-email-akinobu.mita@gmail.com
State Changes Requested
Delegated to: Eduardo Valentin
Headers show
Series
  • nvme: add thermal zone devices
Related show

Commit Message

Akinobu Mita May 26, 2019, 4:29 p.m. UTC
The NVMe controller reports up to nine temperature values in the SMART /
Health log page (the composite temperature and temperature sensor 1 through
temperature sensor 8).

This provides these temperatures via thermal zone devices.

Once the controller is identified, the thermal zone devices are created for
all implemented temperature sensors including the composite temperature.

/sys/class/thermal/thermal_zone[0-*]:
    |---type: 'nvme<instance>-temp<sensor>'
    |---temp: Temperature
    |---trip_point_0_temp: Over temperature threshold

The thermal_zone[0-*] contains a 'device' symlink to the corresponding nvme
device.

On the other hand, the following symlinks to the thermal zone devices are
created in the nvme device sysfs directory.

- temp0: Composite temperature
- temp1: Temperature sensor 1
...
- temp8: Temperature sensor 8

Cc: Zhang Rui <rui.zhang@intel.com>
Cc: Eduardo Valentin <edubezval@gmail.com>
Cc: Daniel Lezcano <daniel.lezcano@linaro.org>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Jens Axboe <axboe@fb.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Sagi Grimberg <sagi@grimberg.me>
Cc: Minwoo Im <minwoo.im.dev@gmail.com>
Cc: Kenneth Heitke <kenneth.heitke@intel.com>
Cc: Chaitanya Kulkarni <Chaitanya.Kulkarni@wdc.com>
Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
---
* v3
- Change the type name of thermal zone devices from 'nvme_temp<sensor>' to
  'nvme<instance>_temp<sensor>'
- Pass a NULL to the status argument of nvme_set_feature()
- Change the name of symbolic link from 'nvme_temp<sensor>' to 'temp<sensor>'
- Don't make it fatal error if the device provides a response
- Don't register thermal zone for composite temperature if smart log reports
  zero value
- Move the thermal zones registration and unregistration into the core module.

 drivers/nvme/host/core.c | 288 +++++++++++++++++++++++++++++++++++++++++++++++
 drivers/nvme/host/nvme.h |   9 ++
 include/linux/nvme.h     |   5 +
 3 files changed, 302 insertions(+)

Comments

Minwoo Im May 29, 2019, 3:15 p.m. UTC | #1
> +/**
> + * nvme_thermal_zones_register() - register nvme thermal zone devices
> + * @ctrl: controller instance
> + *
> + * This function creates up to nine thermal zone devices for all implemented
> + * temperature sensors including the composite temperature.
> + * Each thermal zone device provides a single trip point temperature that is
> + * associated with an over temperature threshold.
> + */
> +static int nvme_thermal_zones_register(struct nvme_ctrl *ctrl)
> +{
> +	struct nvme_smart_log *log;
> +	int ret;
> +	int i;
> +
> +	log = kzalloc(sizeof(*log), GFP_KERNEL);
> +	if (!log)
> +		return 0; /* non-fatal error */

Do we need to print something like warning here? If kzalloc() fails, it
would be good to be distinguished between the nvme failure and internal
failure like this.

> +
> +	ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_SMART, 0,
> +			   log, sizeof(*log), 0);
> +	if (ret) {
> +		dev_err(ctrl->device, "Failed to get SMART log: %d\n", ret);
> +		/* If the device provided a response, then it's non-fatal */
> +		if (ret > 0)
> +			ret = 0;

It seems like that nvme_init_identify() is just check the internal error
which is in negative value now as you have posted.  Why don't we just
return the value of "ret" itself without updating it to 0 ?

> +		goto free_log;
> +	}
> +
> +	for (i = 0; i < ARRAY_SIZE(ctrl->tzdev); i++) {
> +		struct thermal_zone_device *tzdev;
> +		int temp;
> +
> +		if (i)
> +			temp = le16_to_cpu(log->temp_sensor[i - 1]);
> +		else
> +			temp = get_unaligned_le16(log->temperature);
> +
> +		/*
> +		 * All implemented temperature sensors report a non-zero value
> +		 * in temperature sensor fields in the smart log page.
> +		 */
> +		if (!temp)
> +			continue;
> +		if (ctrl->tzdev[i])
> +			continue;
> +
> +		tzdev = nvme_thermal_zone_register(ctrl, i);
> +		if (!IS_ERR(tzdev))
> +			ctrl->tzdev[i] = tzdev;
> +	}
> +
> +free_log:
> +	kfree(log);
> +
> +	return ret;
> +}
> +
> +/**
> + * nvme_thermal_zones_unregister() - unregister nvme thermal zone devices
> + * @ctrl: controller instance
> + *
> + * This function removes the registered thermal zone devices and symlinks.
> + */
> +static void nvme_thermal_zones_unregister(struct nvme_ctrl *ctrl)
> +{
> +	int i;
> +
> +	for (i = 0; i < ARRAY_SIZE(ctrl->tzdev); i++) {
> +		struct thermal_zone_device *tzdev = ctrl->tzdev[i];
> +		char name[20];

Simple query here :)

If we are not going to allow the name of link exceed the length of its
own device name like nvmeX_tempY, then can we THERMAL_NAME_LENGTH macro
here?  If the name of link is not exactly about the device name itself,
then it's fine.  What do you think about it ?
Akinobu Mita May 29, 2019, 4:47 p.m. UTC | #2
2019年5月30日(木) 0:15 Minwoo Im <minwoo.im.dev@gmail.com>:
>
> > +/**
> > + * nvme_thermal_zones_register() - register nvme thermal zone devices
> > + * @ctrl: controller instance
> > + *
> > + * This function creates up to nine thermal zone devices for all implemented
> > + * temperature sensors including the composite temperature.
> > + * Each thermal zone device provides a single trip point temperature that is
> > + * associated with an over temperature threshold.
> > + */
> > +static int nvme_thermal_zones_register(struct nvme_ctrl *ctrl)
> > +{
> > +     struct nvme_smart_log *log;
> > +     int ret;
> > +     int i;
> > +
> > +     log = kzalloc(sizeof(*log), GFP_KERNEL);
> > +     if (!log)
> > +             return 0; /* non-fatal error */
>
> Do we need to print something like warning here? If kzalloc() fails, it
> would be good to be distinguished between the nvme failure and internal
> failure like this.

We usually remove the error message on kmalloc failure because it's
redundant as long as we don't set __GFP_NOWARN.

> > +
> > +     ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_SMART, 0,
> > +                        log, sizeof(*log), 0);
> > +     if (ret) {
> > +             dev_err(ctrl->device, "Failed to get SMART log: %d\n", ret);
> > +             /* If the device provided a response, then it's non-fatal */
> > +             if (ret > 0)
> > +                     ret = 0;
>
> It seems like that nvme_init_identify() is just check the internal error
> which is in negative value now as you have posted.  Why don't we just
> return the value of "ret" itself without updating it to 0 ?

Both ways work for me.

I personally prefer not to return (leak) the nvme status code from
foo_register() function.

> > +             goto free_log;
> > +     }
> > +
> > +     for (i = 0; i < ARRAY_SIZE(ctrl->tzdev); i++) {
> > +             struct thermal_zone_device *tzdev;
> > +             int temp;
> > +
> > +             if (i)
> > +                     temp = le16_to_cpu(log->temp_sensor[i - 1]);
> > +             else
> > +                     temp = get_unaligned_le16(log->temperature);
> > +
> > +             /*
> > +              * All implemented temperature sensors report a non-zero value
> > +              * in temperature sensor fields in the smart log page.
> > +              */
> > +             if (!temp)
> > +                     continue;
> > +             if (ctrl->tzdev[i])
> > +                     continue;
> > +
> > +             tzdev = nvme_thermal_zone_register(ctrl, i);
> > +             if (!IS_ERR(tzdev))
> > +                     ctrl->tzdev[i] = tzdev;
> > +     }
> > +
> > +free_log:
> > +     kfree(log);
> > +
> > +     return ret;
> > +}
> > +
> > +/**
> > + * nvme_thermal_zones_unregister() - unregister nvme thermal zone devices
> > + * @ctrl: controller instance
> > + *
> > + * This function removes the registered thermal zone devices and symlinks.
> > + */
> > +static void nvme_thermal_zones_unregister(struct nvme_ctrl *ctrl)
> > +{
> > +     int i;
> > +
> > +     for (i = 0; i < ARRAY_SIZE(ctrl->tzdev); i++) {
> > +             struct thermal_zone_device *tzdev = ctrl->tzdev[i];
> > +             char name[20];
>
> Simple query here :)
>
> If we are not going to allow the name of link exceed the length of its
> own device name like nvmeX_tempY, then can we THERMAL_NAME_LENGTH macro
> here?  If the name of link is not exactly about the device name itself,
> then it's fine.  What do you think about it ?

Of course we can use THERMAL_NAME_LENGTH here.  But this char array is
used only for the symbolic link name.
It's not used for thermal cooling device type, thermal zone device type,
or thermal governor name.  So I just used a random constant to avoid
confusion.
Minwoo Im May 30, 2019, 10:18 a.m. UTC | #3
On 19-05-30 01:47:08, Akinobu Mita wrote:
> 2019年5月30日(木) 0:15 Minwoo Im <minwoo.im.dev@gmail.com>:
> >
> > > +/**
> > > + * nvme_thermal_zones_register() - register nvme thermal zone devices
> > > + * @ctrl: controller instance
> > > + *
> > > + * This function creates up to nine thermal zone devices for all implemented
> > > + * temperature sensors including the composite temperature.
> > > + * Each thermal zone device provides a single trip point temperature that is
> > > + * associated with an over temperature threshold.
> > > + */
> > > +static int nvme_thermal_zones_register(struct nvme_ctrl *ctrl)
> > > +{
> > > +     struct nvme_smart_log *log;
> > > +     int ret;
> > > +     int i;
> > > +
> > > +     log = kzalloc(sizeof(*log), GFP_KERNEL);
> > > +     if (!log)
> > > +             return 0; /* non-fatal error */
> >
> > Do we need to print something like warning here? If kzalloc() fails, it
> > would be good to be distinguished between the nvme failure and internal
> > failure like this.
> 
> We usually remove the error message on kmalloc failure because it's
> redundant as long as we don't set __GFP_NOWARN.

I see what you point.

> 
> > > +
> > > +     ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_SMART, 0,
> > > +                        log, sizeof(*log), 0);
> > > +     if (ret) {
> > > +             dev_err(ctrl->device, "Failed to get SMART log: %d\n", ret);
> > > +             /* If the device provided a response, then it's non-fatal */
> > > +             if (ret > 0)
> > > +                     ret = 0;
> >
> > It seems like that nvme_init_identify() is just check the internal error
> > which is in negative value now as you have posted.  Why don't we just
> > return the value of "ret" itself without updating it to 0 ?
> 
> Both ways work for me.
> 
> I personally prefer not to return (leak) the nvme status code from
> foo_register() function.

Okay.  In the perspective of registration, the nvme status might not be
needed to be returned.  But I think if we return the nvme status here,
it would be great for the later time when we need to figure out if the nvme
command has failed or not.

But, amyway, I'm fine with this :)

Thanks for your comment on this trivial query.

> 
> > > +             goto free_log;
> > > +     }
> > > +
> > > +     for (i = 0; i < ARRAY_SIZE(ctrl->tzdev); i++) {
> > > +             struct thermal_zone_device *tzdev;
> > > +             int temp;
> > > +
> > > +             if (i)
> > > +                     temp = le16_to_cpu(log->temp_sensor[i - 1]);
> > > +             else
> > > +                     temp = get_unaligned_le16(log->temperature);
> > > +
> > > +             /*
> > > +              * All implemented temperature sensors report a non-zero value
> > > +              * in temperature sensor fields in the smart log page.
> > > +              */
> > > +             if (!temp)
> > > +                     continue;
> > > +             if (ctrl->tzdev[i])
> > > +                     continue;
> > > +
> > > +             tzdev = nvme_thermal_zone_register(ctrl, i);
> > > +             if (!IS_ERR(tzdev))
> > > +                     ctrl->tzdev[i] = tzdev;
> > > +     }
> > > +
> > > +free_log:
> > > +     kfree(log);
> > > +
> > > +     return ret;
> > > +}
> > > +
> > > +/**
> > > + * nvme_thermal_zones_unregister() - unregister nvme thermal zone devices
> > > + * @ctrl: controller instance
> > > + *
> > > + * This function removes the registered thermal zone devices and symlinks.
> > > + */
> > > +static void nvme_thermal_zones_unregister(struct nvme_ctrl *ctrl)
> > > +{
> > > +     int i;
> > > +
> > > +     for (i = 0; i < ARRAY_SIZE(ctrl->tzdev); i++) {
> > > +             struct thermal_zone_device *tzdev = ctrl->tzdev[i];
> > > +             char name[20];
> >
> > Simple query here :)
> >
> > If we are not going to allow the name of link exceed the length of its
> > own device name like nvmeX_tempY, then can we THERMAL_NAME_LENGTH macro
> > here?  If the name of link is not exactly about the device name itself,
> > then it's fine.  What do you think about it ?
> 
> Of course we can use THERMAL_NAME_LENGTH here.  But this char array is
> used only for the symbolic link name.
> It's not used for thermal cooling device type, thermal zone device type,
> or thermal governor name.  So I just used a random constant to avoid
> confusion.

Agreed.
Christoph Hellwig June 1, 2019, 9:02 a.m. UTC | #4
On Mon, May 27, 2019 at 01:29:02AM +0900, Akinobu Mita wrote:
> The NVMe controller reports up to nine temperature values in the SMART /
> Health log page (the composite temperature and temperature sensor 1 through
> temperature sensor 8).
> 
> This provides these temperatures via thermal zone devices.

Can you explain a bit more why we'd do this?  I shows up some sysfs
files, but we could easily do that with nvme-cli, too.  Is there some
greater benefit of this integration?
Akinobu Mita June 2, 2019, 1:19 p.m. UTC | #5
2019年6月1日(土) 18:03 Christoph Hellwig <hch@lst.de>:
>
> On Mon, May 27, 2019 at 01:29:02AM +0900, Akinobu Mita wrote:
> > The NVMe controller reports up to nine temperature values in the SMART /
> > Health log page (the composite temperature and temperature sensor 1 through
> > temperature sensor 8).
> >
> > This provides these temperatures via thermal zone devices.
>
> Can you explain a bit more why we'd do this?  I shows up some sysfs
> files, but we could easily do that with nvme-cli, too.  Is there some
> greater benefit of this integration?

As long as the user space thermal governor is used, there is nothing more
than that from a functional perspective.  And I suppose that this is used
with user_space governor (i.e. there is still some work to be able to bind
actual thermal cooling device).

The main purpose of this is to turn on a fan when overheated without
polling the device that could prevent the lower power state transitions.
But as you noted, we could do that with the existing AEN notifications
through uevent.

So frankly speaking, the benefit of this is providing generic thermal sysfs
interface and the tools like tmon (linux/tools/thermal/tmon) can show the
nvme temperatures.
Eduardo Valentin June 3, 2019, 2:36 a.m. UTC | #6
Hey Mita,

On Mon, May 27, 2019 at 01:29:02AM +0900, Akinobu Mita wrote:
> The NVMe controller reports up to nine temperature values in the SMART /
> Health log page (the composite temperature and temperature sensor 1 through
> temperature sensor 8).
> 
> This provides these temperatures via thermal zone devices.
> 
> Once the controller is identified, the thermal zone devices are created for
> all implemented temperature sensors including the composite temperature.
> 
> /sys/class/thermal/thermal_zone[0-*]:
>     |---type: 'nvme<instance>-temp<sensor>'
>     |---temp: Temperature
>     |---trip_point_0_temp: Over temperature threshold
> 
> The thermal_zone[0-*] contains a 'device' symlink to the corresponding nvme
> device.
> 
> On the other hand, the following symlinks to the thermal zone devices are
> created in the nvme device sysfs directory.
> 
> - temp0: Composite temperature
> - temp1: Temperature sensor 1
> ...
> - temp8: Temperature sensor 8
> 

These questions on V2 are still unanswered:
a. Can we get a more descriptive string into tz->type?
b. Can these APIs support DT probing too?

> Cc: Zhang Rui <rui.zhang@intel.com>
> Cc: Eduardo Valentin <edubezval@gmail.com>
> Cc: Daniel Lezcano <daniel.lezcano@linaro.org>
> Cc: Keith Busch <keith.busch@intel.com>
> Cc: Jens Axboe <axboe@fb.com>
> Cc: Christoph Hellwig <hch@lst.de>
> Cc: Sagi Grimberg <sagi@grimberg.me>
> Cc: Minwoo Im <minwoo.im.dev@gmail.com>
> Cc: Kenneth Heitke <kenneth.heitke@intel.com>
> Cc: Chaitanya Kulkarni <Chaitanya.Kulkarni@wdc.com>
> Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
> ---
> * v3
> - Change the type name of thermal zone devices from 'nvme_temp<sensor>' to
>   'nvme<instance>_temp<sensor>'
> - Pass a NULL to the status argument of nvme_set_feature()
> - Change the name of symbolic link from 'nvme_temp<sensor>' to 'temp<sensor>'
> - Don't make it fatal error if the device provides a response
> - Don't register thermal zone for composite temperature if smart log reports
>   zero value
> - Move the thermal zones registration and unregistration into the core module.
> 
>  drivers/nvme/host/core.c | 288 +++++++++++++++++++++++++++++++++++++++++++++++
>  drivers/nvme/host/nvme.h |   9 ++
>  include/linux/nvme.h     |   5 +
>  3 files changed, 302 insertions(+)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index c950916..4c8271a 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -2200,6 +2200,289 @@ static void nvme_set_latency_tolerance(struct device *dev, s32 val)
>  	}
>  }
>  
> +#ifdef CONFIG_THERMAL
> +
> +static int nvme_get_temp(struct nvme_ctrl *ctrl, unsigned int sensor, int *temp)
> +{
> +	struct nvme_smart_log *log;
> +	int ret;
> +
> +	BUILD_BUG_ON(ARRAY_SIZE(log->temp_sensor) + 1 !=
> +		     ARRAY_SIZE(ctrl->tzdev));
> +
> +	if (WARN_ON_ONCE(sensor > ARRAY_SIZE(log->temp_sensor)))
> +		return -EINVAL;
> +
> +	log = kzalloc(sizeof(*log), GFP_KERNEL);
> +	if (!log)
> +		return -ENOMEM;
> +
> +	ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_SMART, 0,
> +			   log, sizeof(*log), 0);
> +	if (ret) {
> +		ret = ret > 0 ? -EINVAL : ret;
> +		goto free_log;
> +	}
> +
> +	if (sensor)
> +		*temp = le16_to_cpu(log->temp_sensor[sensor - 1]);
> +	else
> +		*temp = get_unaligned_le16(log->temperature);
> +
> +free_log:
> +	kfree(log);
> +
> +	return ret;
> +}
> +
> +static unsigned int nvme_tz_type_to_sensor(const char *type)
> +{
> +	int instance;
> +	unsigned int sensor;
> +
> +	if (sscanf(type, "nvme%d_temp%u", &instance, &sensor) != 2)
> +		return UINT_MAX;
> +
> +	return sensor;
> +}
> +
> +#define KELVIN_TO_MILLICELSIUS(t) DECI_KELVIN_TO_MILLICELSIUS((t) * 10)
> +#define MILLICELSIUS_TO_KELVIN(t) ((MILLICELSIUS_TO_DECI_KELVIN(t) + 5) / 10)
> +
> +static int nvme_tz_get_temp(struct thermal_zone_device *tzdev, int *temp)
> +{
> +	unsigned int sensor = nvme_tz_type_to_sensor(tzdev->type);
> +	struct nvme_ctrl *ctrl = tzdev->devdata;
> +	int ret;
> +
> +	ret = nvme_get_temp(ctrl, sensor, temp);
> +	if (!ret)
> +		*temp = KELVIN_TO_MILLICELSIUS(*temp);
> +
> +	return ret;
> +}
> +
> +static int nvme_tz_get_trip_type(struct thermal_zone_device *tzdev,
> +				 int trip, enum thermal_trip_type *type)
> +{
> +	*type = THERMAL_TRIP_ACTIVE;
> +
> +	return 0;
> +}
> +
> +static int nvme_get_over_temp_thresh(struct nvme_ctrl *ctrl,
> +				     unsigned int sensor, int *temp)
> +{
> +	unsigned int threshold = sensor << NVME_TEMP_THRESH_SELECT_SHIFT;
> +	int status;
> +	int ret;
> +
> +	if (WARN_ON_ONCE(sensor >= ARRAY_SIZE(ctrl->tzdev)))
> +		return -EINVAL;
> +
> +	ret = nvme_get_features(ctrl, NVME_FEAT_TEMP_THRESH, threshold, NULL, 0,
> +				&status);
> +	if (!ret)
> +		*temp = status & NVME_TEMP_THRESH_MASK;
> +
> +	return ret > 0 ? -EINVAL : ret;
> +}
> +
> +static int nvme_set_over_temp_thresh(struct nvme_ctrl *ctrl,
> +				     unsigned int sensor, int temp)
> +{
> +	unsigned int threshold = sensor << NVME_TEMP_THRESH_SELECT_SHIFT;
> +	int ret;
> +
> +	if (WARN_ON_ONCE(sensor >= ARRAY_SIZE(ctrl->tzdev)))
> +		return -EINVAL;
> +
> +	if (temp > NVME_TEMP_THRESH_MASK)
> +		return -EINVAL;
> +
> +	threshold |= temp & NVME_TEMP_THRESH_MASK;
> +
> +	ret = nvme_set_features(ctrl, NVME_FEAT_TEMP_THRESH, threshold, NULL, 0,
> +				NULL);
> +
> +	return ret > 0 ? -EINVAL : ret;
> +}
> +
> +static int nvme_tz_get_trip_temp(struct thermal_zone_device *tzdev,
> +				 int trip, int *temp)
> +{
> +	unsigned int sensor = nvme_tz_type_to_sensor(tzdev->type);
> +	struct nvme_ctrl *ctrl = tzdev->devdata;
> +	int ret;
> +
> +	ret = nvme_get_over_temp_thresh(ctrl, sensor, temp);
> +	if (!ret)
> +		*temp = KELVIN_TO_MILLICELSIUS(*temp);
> +
> +	return ret;
> +}
> +
> +static int nvme_tz_set_trip_temp(struct thermal_zone_device *tzdev,
> +				 int trip, int temp)
> +{
> +	unsigned int sensor = nvme_tz_type_to_sensor(tzdev->type);
> +	struct nvme_ctrl *ctrl = tzdev->devdata;
> +
> +	temp = MILLICELSIUS_TO_KELVIN(temp);
> +
> +	return nvme_set_over_temp_thresh(ctrl, sensor, temp);
> +}
> +
> +static struct thermal_zone_device_ops nvme_tz_ops = {
> +	.get_temp = nvme_tz_get_temp,
> +	.get_trip_type = nvme_tz_get_trip_type,
> +	.get_trip_temp = nvme_tz_get_trip_temp,
> +	.set_trip_temp = nvme_tz_set_trip_temp,
> +};
> +
> +static struct thermal_zone_params nvme_tz_params = {
> +	.governor_name = "user_space",

Also,

Was there any particular reason why defaulting to user_space here?

> +	.no_hwmon = true,
> +};
> +
> +static struct thermal_zone_device *
> +nvme_thermal_zone_register(struct nvme_ctrl *ctrl, unsigned int sensor)
> +{
> +	struct thermal_zone_device *tzdev;
> +	char name[THERMAL_NAME_LENGTH];
> +	int ret;
> +
> +	snprintf(name, sizeof(name), "nvme%d_temp%u", ctrl->instance, sensor);
> +
> +	tzdev = thermal_zone_device_register(name, 1, 1, ctrl, &nvme_tz_ops,
> +					     &nvme_tz_params, 0, 0);
> +	if (IS_ERR(tzdev)) {
> +		dev_err(ctrl->device,
> +			"Failed to register thermal zone device: %ld\n",
> +			PTR_ERR(tzdev));
> +		return tzdev;
> +	}
> +
> +	snprintf(name, sizeof(name), "temp%d", sensor);
> +	ret = sysfs_create_link(&ctrl->ctrl_device.kobj, &tzdev->device.kobj,
> +				name);
> +	if (ret)
> +		goto device_unregister;
> +
> +	ret = sysfs_create_link(&tzdev->device.kobj,
> +				&ctrl->ctrl_device.kobj, "device");
> +	if (ret)
> +		goto remove_link;
> +
> +	return tzdev;
> +
> +remove_link:
> +	sysfs_remove_link(&ctrl->ctrl_device.kobj, name);
> +device_unregister:
> +	thermal_zone_device_unregister(tzdev);
> +
> +	return ERR_PTR(ret);
> +}
> +
> +/**
> + * nvme_thermal_zones_register() - register nvme thermal zone devices
> + * @ctrl: controller instance
> + *
> + * This function creates up to nine thermal zone devices for all implemented
> + * temperature sensors including the composite temperature.
> + * Each thermal zone device provides a single trip point temperature that is
> + * associated with an over temperature threshold.
> + */
> +static int nvme_thermal_zones_register(struct nvme_ctrl *ctrl)
> +{
> +	struct nvme_smart_log *log;
> +	int ret;
> +	int i;
> +
> +	log = kzalloc(sizeof(*log), GFP_KERNEL);
> +	if (!log)
> +		return 0; /* non-fatal error */

I am not sure about this API design here. I would leave the error
handling and judging if this is fatal or not to the caller. 
I mean, if I ask to register a nvme thermal zone and I get
a 0 as response, I would assume the thermal zone exists from
now on, right?

> +
> +	ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_SMART, 0,
> +			   log, sizeof(*log), 0);
> +	if (ret) {
> +		dev_err(ctrl->device, "Failed to get SMART log: %d\n", ret);
> +		/* If the device provided a response, then it's non-fatal */
> +		if (ret > 0)
> +			ret = 0;
> +		goto free_log;

Once again, hiding errors is never a strategy that scales..

> +	}
> +
> +	for (i = 0; i < ARRAY_SIZE(ctrl->tzdev); i++) {
> +		struct thermal_zone_device *tzdev;
> +		int temp;
> +
> +		if (i)
> +			temp = le16_to_cpu(log->temp_sensor[i - 1]);
> +		else
> +			temp = get_unaligned_le16(log->temperature);
> +
> +		/*
> +		 * All implemented temperature sensors report a non-zero value
> +		 * in temperature sensor fields in the smart log page.
> +		 */
> +		if (!temp)
> +			continue;
> +		if (ctrl->tzdev[i])
> +			continue;
> +
> +		tzdev = nvme_thermal_zone_register(ctrl, i);
> +		if (!IS_ERR(tzdev))
> +			ctrl->tzdev[i] = tzdev;

Here again, I would not hide errors, the API should propagate errors
if something goes wrong.

> +	}
> +
> +free_log:
> +	kfree(log);
> +
> +	return ret;
> +}
> +
> +/**
> + * nvme_thermal_zones_unregister() - unregister nvme thermal zone devices
> + * @ctrl: controller instance
> + *
> + * This function removes the registered thermal zone devices and symlinks.
> + */
> +static void nvme_thermal_zones_unregister(struct nvme_ctrl *ctrl)
> +{
> +	int i;
> +
> +	for (i = 0; i < ARRAY_SIZE(ctrl->tzdev); i++) {
> +		struct thermal_zone_device *tzdev = ctrl->tzdev[i];
> +		char name[20];
> +
> +		if (!tzdev)
> +			continue;
> +
> +		sysfs_remove_link(&tzdev->device.kobj, "device");
> +
> +		snprintf(name, sizeof(name), "temp%d", i);
> +		sysfs_remove_link(&ctrl->ctrl_device.kobj, name);
> +
> +		thermal_zone_device_unregister(tzdev);
> +
> +		ctrl->tzdev[i] = NULL;
> +	}
> +}
> +
> +#else
> +
> +static inline int nvme_thermal_zones_register(struct nvme_ctrl *ctrl)
> +{
> +	return 0;
> +}
> +
> +static inline void nvme_thermal_zones_unregister(struct nvme_ctrl *ctrl)
> +{
> +}
> +
> +#endif /* CONFIG_THERMAL */
> +
>  struct nvme_core_quirk_entry {
>  	/*
>  	 * NVMe model and firmware strings are padded with spaces.  For
> @@ -2754,6 +3037,10 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
>  	if (ret < 0)
>  		return ret;
>  
> +	ret = nvme_thermal_zones_register(ctrl);
> +	if (ret < 0)
> +		return ret;


I would definitely keep this code the way it is here in
nvme_init_identify(), but if I read this right, your
nvme_thermal_zones_register() will never return < 0, and therefore this
condition is never met.

I would suggest you simply propagate the errors and keep this piece
of code the way it is, propagating errors too.

> +
>  	ctrl->identified = true;
>  
>  	return 0;
> @@ -3756,6 +4043,7 @@ void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
>  {
>  	nvme_mpath_stop(ctrl);
>  	nvme_stop_keep_alive(ctrl);
> +	nvme_thermal_zones_unregister(ctrl);
>  	flush_work(&ctrl->async_event_work);
>  	cancel_work_sync(&ctrl->fw_act_work);
>  }
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index 802aa19..53f0b24 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -15,6 +15,7 @@
>  #include <linux/sed-opal.h>
>  #include <linux/fault-inject.h>
>  #include <linux/rcupdate.h>
> +#include <linux/thermal.h>
>  
>  extern unsigned int nvme_io_timeout;
>  #define NVME_IO_TIMEOUT	(nvme_io_timeout * HZ)
> @@ -248,6 +249,14 @@ struct nvme_ctrl {
>  
>  	struct page *discard_page;
>  	unsigned long discard_page_busy;
> +
> +#ifdef CONFIG_THERMAL
> +	/*
> +	 * tzdev[0]: composite temperature
> +	 * tzdev[1-8]: temperature sensor 1 through 8
> +	 */
> +	struct thermal_zone_device *tzdev[9];
> +#endif
>  };
>  
>  enum nvme_iopolicy {
> diff --git a/include/linux/nvme.h b/include/linux/nvme.h
> index 658ac75..54f0a13 100644
> --- a/include/linux/nvme.h
> +++ b/include/linux/nvme.h
> @@ -780,6 +780,11 @@ struct nvme_write_zeroes_cmd {
>  
>  /* Features */
>  
> +enum {
> +	NVME_TEMP_THRESH_MASK		= 0xffff,
> +	NVME_TEMP_THRESH_SELECT_SHIFT	= 16,
> +};
> +
>  struct nvme_feat_auto_pst {
>  	__le64 entries[32];
>  };
> -- 
> 2.7.4
>
Akinobu Mita June 3, 2019, 3:20 p.m. UTC | #7
2019年6月3日(月) 11:36 Eduardo Valentin <edubezval@gmail.com>:
>
> Hey Mita,
>
> On Mon, May 27, 2019 at 01:29:02AM +0900, Akinobu Mita wrote:
> > The NVMe controller reports up to nine temperature values in the SMART /
> > Health log page (the composite temperature and temperature sensor 1 through
> > temperature sensor 8).
> >
> > This provides these temperatures via thermal zone devices.
> >
> > Once the controller is identified, the thermal zone devices are created for
> > all implemented temperature sensors including the composite temperature.
> >
> > /sys/class/thermal/thermal_zone[0-*]:
> >     |---type: 'nvme<instance>-temp<sensor>'
> >     |---temp: Temperature
> >     |---trip_point_0_temp: Over temperature threshold
> >
> > The thermal_zone[0-*] contains a 'device' symlink to the corresponding nvme
> > device.
> >
> > On the other hand, the following symlinks to the thermal zone devices are
> > created in the nvme device sysfs directory.
> >
> > - temp0: Composite temperature
> > - temp1: Temperature sensor 1
> > ...
> > - temp8: Temperature sensor 8
> >
>
> These questions on V2 are still unanswered:
> a. Can we get a more descriptive string into tz->type?

As I said in the other thread, the NVMe spec only says a controller
reports the composite temperature and temperature sensor 1 through 8.
What temperature sensor N means is vendor specific.

> b. Can these APIs support DT probing too?

OK. I can try, but I don't have arm or arm64 boards with PCIe for
testing with of-thermal.  So it'll be untested.

> > +static struct thermal_zone_params nvme_tz_params = {
> > +     .governor_name = "user_space",
>
> Also,
>
> Was there any particular reason why defaulting to user_space here?

I only tested with the user_space governor.  There is no cooling device
to bind with this.

> > +     .no_hwmon = true,
> > +};
> > +
> > +static struct thermal_zone_device *
> > +nvme_thermal_zone_register(struct nvme_ctrl *ctrl, unsigned int sensor)
> > +{
> > +     struct thermal_zone_device *tzdev;
> > +     char name[THERMAL_NAME_LENGTH];
> > +     int ret;
> > +
> > +     snprintf(name, sizeof(name), "nvme%d_temp%u", ctrl->instance, sensor);
> > +
> > +     tzdev = thermal_zone_device_register(name, 1, 1, ctrl, &nvme_tz_ops,
> > +                                          &nvme_tz_params, 0, 0);
> > +     if (IS_ERR(tzdev)) {
> > +             dev_err(ctrl->device,
> > +                     "Failed to register thermal zone device: %ld\n",
> > +                     PTR_ERR(tzdev));
> > +             return tzdev;
> > +     }
> > +
> > +     snprintf(name, sizeof(name), "temp%d", sensor);
> > +     ret = sysfs_create_link(&ctrl->ctrl_device.kobj, &tzdev->device.kobj,
> > +                             name);
> > +     if (ret)
> > +             goto device_unregister;
> > +
> > +     ret = sysfs_create_link(&tzdev->device.kobj,
> > +                             &ctrl->ctrl_device.kobj, "device");
> > +     if (ret)
> > +             goto remove_link;
> > +
> > +     return tzdev;
> > +
> > +remove_link:
> > +     sysfs_remove_link(&ctrl->ctrl_device.kobj, name);
> > +device_unregister:
> > +     thermal_zone_device_unregister(tzdev);
> > +
> > +     return ERR_PTR(ret);
> > +}
> > +
> > +/**
> > + * nvme_thermal_zones_register() - register nvme thermal zone devices
> > + * @ctrl: controller instance
> > + *
> > + * This function creates up to nine thermal zone devices for all implemented
> > + * temperature sensors including the composite temperature.
> > + * Each thermal zone device provides a single trip point temperature that is
> > + * associated with an over temperature threshold.
> > + */
> > +static int nvme_thermal_zones_register(struct nvme_ctrl *ctrl)
> > +{
> > +     struct nvme_smart_log *log;
> > +     int ret;
> > +     int i;
> > +
> > +     log = kzalloc(sizeof(*log), GFP_KERNEL);
> > +     if (!log)
> > +             return 0; /* non-fatal error */
>
> I am not sure about this API design here. I would leave the error
> handling and judging if this is fatal or not to the caller.
> I mean, if I ask to register a nvme thermal zone and I get
> a 0 as response, I would assume the thermal zone exists from
> now on, right?

This routine is designed to return error code only when we're unable to
communicate with the device at all (i.e. nvme_submit_sync_cmd returns a
negative value).

We don't want to abandon device initialization just due to thermal zone
failures.  Because thermal zone device isn't mandatory to manage the
controllers, and the device like qemu doesn't provide smart log (according
to Keith).

> > +
> > +     ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_SMART, 0,
> > +                        log, sizeof(*log), 0);
> > +     if (ret) {
> > +             dev_err(ctrl->device, "Failed to get SMART log: %d\n", ret);
> > +             /* If the device provided a response, then it's non-fatal */
> > +             if (ret > 0)
> > +                     ret = 0;
> > +             goto free_log;
>
> Once again, hiding errors is never a strategy that scales..
>
> > +     }
> > +
> > +     for (i = 0; i < ARRAY_SIZE(ctrl->tzdev); i++) {
> > +             struct thermal_zone_device *tzdev;
> > +             int temp;
> > +
> > +             if (i)
> > +                     temp = le16_to_cpu(log->temp_sensor[i - 1]);
> > +             else
> > +                     temp = get_unaligned_le16(log->temperature);
> > +
> > +             /*
> > +              * All implemented temperature sensors report a non-zero value
> > +              * in temperature sensor fields in the smart log page.
> > +              */
> > +             if (!temp)
> > +                     continue;
> > +             if (ctrl->tzdev[i])
> > +                     continue;
> > +
> > +             tzdev = nvme_thermal_zone_register(ctrl, i);
> > +             if (!IS_ERR(tzdev))
> > +                     ctrl->tzdev[i] = tzdev;
>
> Here again, I would not hide errors, the API should propagate errors
> if something goes wrong.
>
> > +     }
> > +
> > +free_log:
> > +     kfree(log);
> > +
> > +     return ret;
> > +}
> > +
> > +/**
> > + * nvme_thermal_zones_unregister() - unregister nvme thermal zone devices
> > + * @ctrl: controller instance
> > + *
> > + * This function removes the registered thermal zone devices and symlinks.
> > + */
> > +static void nvme_thermal_zones_unregister(struct nvme_ctrl *ctrl)
> > +{
> > +     int i;
> > +
> > +     for (i = 0; i < ARRAY_SIZE(ctrl->tzdev); i++) {
> > +             struct thermal_zone_device *tzdev = ctrl->tzdev[i];
> > +             char name[20];
> > +
> > +             if (!tzdev)
> > +                     continue;
> > +
> > +             sysfs_remove_link(&tzdev->device.kobj, "device");
> > +
> > +             snprintf(name, sizeof(name), "temp%d", i);
> > +             sysfs_remove_link(&ctrl->ctrl_device.kobj, name);
> > +
> > +             thermal_zone_device_unregister(tzdev);
> > +
> > +             ctrl->tzdev[i] = NULL;
> > +     }
> > +}
> > +
> > +#else
> > +
> > +static inline int nvme_thermal_zones_register(struct nvme_ctrl *ctrl)
> > +{
> > +     return 0;
> > +}
> > +
> > +static inline void nvme_thermal_zones_unregister(struct nvme_ctrl *ctrl)
> > +{
> > +}
> > +
> > +#endif /* CONFIG_THERMAL */
> > +
> >  struct nvme_core_quirk_entry {
> >       /*
> >        * NVMe model and firmware strings are padded with spaces.  For
> > @@ -2754,6 +3037,10 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
> >       if (ret < 0)
> >               return ret;
> >
> > +     ret = nvme_thermal_zones_register(ctrl);
> > +     if (ret < 0)
> > +             return ret;
>
>
> I would definitely keep this code the way it is here in
> nvme_init_identify(), but if I read this right, your
> nvme_thermal_zones_register() will never return < 0, and therefore this
> condition is never met.

The nvme_get_log() can return a negative value.  Only in this case,
nvme_thermal_zones_register() returns error and abandon the device
initialization.
Christoph Hellwig June 4, 2019, 7:31 a.m. UTC | #8
On Sun, Jun 02, 2019 at 10:19:08PM +0900, Akinobu Mita wrote:
> As long as the user space thermal governor is used, there is nothing more
> than that from a functional perspective.  And I suppose that this is used
> with user_space governor (i.e. there is still some work to be able to bind
> actual thermal cooling device).
> 
> The main purpose of this is to turn on a fan when overheated without
> polling the device that could prevent the lower power state transitions.
> But as you noted, we could do that with the existing AEN notifications
> through uevent.
> 
> So frankly speaking, the benefit of this is providing generic thermal sysfs
> interface and the tools like tmon (linux/tools/thermal/tmon) can show the
> nvme temperatures.

I'm just a little worried about bloating the nvme driver with features
that look kinda nifty but don't buy us much.  I'd rather keep at least
the core and PCIe drivers as minimal.  Now the thermal device support
is pretty small and except for the smart uevents I can't find anything
actually bad, but I'm not exactly excited either.
Akinobu Mita June 5, 2019, 3:42 p.m. UTC | #9
2019年6月4日(火) 16:32 Christoph Hellwig <hch@lst.de>:
>
> On Sun, Jun 02, 2019 at 10:19:08PM +0900, Akinobu Mita wrote:
> > As long as the user space thermal governor is used, there is nothing more
> > than that from a functional perspective.  And I suppose that this is used
> > with user_space governor (i.e. there is still some work to be able to bind
> > actual thermal cooling device).
> >
> > The main purpose of this is to turn on a fan when overheated without
> > polling the device that could prevent the lower power state transitions.
> > But as you noted, we could do that with the existing AEN notifications
> > through uevent.
> >
> > So frankly speaking, the benefit of this is providing generic thermal sysfs
> > interface and the tools like tmon (linux/tools/thermal/tmon) can show the
> > nvme temperatures.
>
> I'm just a little worried about bloating the nvme driver with features
> that look kinda nifty but don't buy us much.  I'd rather keep at least
> the core and PCIe drivers as minimal.  Now the thermal device support
> is pretty small and except for the smart uevents I can't find anything
> actually bad, but I'm not exactly excited either.

I'll add thermal.c file in order to minimize the core driver changes and a
module parameter to disable the thermal zone support.

Device tree thermal zone will also be supported. It enables to use thermal
governor other than user_space governor.
Eduardo Valentin June 6, 2019, 4:05 a.m. UTC | #10
On Tue, Jun 04, 2019 at 12:20:19AM +0900, Akinobu Mita wrote:
> 2019年6月3日(月) 11:36 Eduardo Valentin <edubezval@gmail.com>:
> >
> > Hey Mita,
> >
> > On Mon, May 27, 2019 at 01:29:02AM +0900, Akinobu Mita wrote:
> > > The NVMe controller reports up to nine temperature values in the SMART /
> > > Health log page (the composite temperature and temperature sensor 1 through
> > > temperature sensor 8).
> > >
> > > This provides these temperatures via thermal zone devices.
> > >
> > > Once the controller is identified, the thermal zone devices are created for
> > > all implemented temperature sensors including the composite temperature.
> > >
> > > /sys/class/thermal/thermal_zone[0-*]:
> > >     |---type: 'nvme<instance>-temp<sensor>'
> > >     |---temp: Temperature
> > >     |---trip_point_0_temp: Over temperature threshold
> > >
> > > The thermal_zone[0-*] contains a 'device' symlink to the corresponding nvme
> > > device.
> > >
> > > On the other hand, the following symlinks to the thermal zone devices are
> > > created in the nvme device sysfs directory.
> > >
> > > - temp0: Composite temperature
> > > - temp1: Temperature sensor 1
> > > ...
> > > - temp8: Temperature sensor 8
> > >
> >
> > These questions on V2 are still unanswered:
> > a. Can we get a more descriptive string into tz->type?
> 
> As I said in the other thread, the NVMe spec only says a controller
> reports the composite temperature and temperature sensor 1 through 8.
> What temperature sensor N means is vendor specific.

I see..

> 
> > b. Can these APIs support DT probing too?
> 
> OK. I can try, but I don't have arm or arm64 boards with PCIe for
> testing with of-thermal.  So it'll be untested.
> 

OK.  Lets see what comes.

> > > +static struct thermal_zone_params nvme_tz_params = {
> > > +     .governor_name = "user_space",
> >
> > Also,
> >
> > Was there any particular reason why defaulting to user_space here?
> 
> I only tested with the user_space governor.  There is no cooling device
> to bind with this.
> 

hmmm.. I see. But was there any particular reason to pick the thermal
subsystem here if you do not have any cooling? Or is there any specific
cooling that can be written in future?

If no cooling is expected, why not a simple hwmon?

> > > +     .no_hwmon = true,
> > > +};
> > > +
> > > +static struct thermal_zone_device *
> > > +nvme_thermal_zone_register(struct nvme_ctrl *ctrl, unsigned int sensor)
> > > +{
> > > +     struct thermal_zone_device *tzdev;
> > > +     char name[THERMAL_NAME_LENGTH];
> > > +     int ret;
> > > +
> > > +     snprintf(name, sizeof(name), "nvme%d_temp%u", ctrl->instance, sensor);
> > > +
> > > +     tzdev = thermal_zone_device_register(name, 1, 1, ctrl, &nvme_tz_ops,
> > > +                                          &nvme_tz_params, 0, 0);
> > > +     if (IS_ERR(tzdev)) {
> > > +             dev_err(ctrl->device,
> > > +                     "Failed to register thermal zone device: %ld\n",
> > > +                     PTR_ERR(tzdev));
> > > +             return tzdev;
> > > +     }
> > > +
> > > +     snprintf(name, sizeof(name), "temp%d", sensor);
> > > +     ret = sysfs_create_link(&ctrl->ctrl_device.kobj, &tzdev->device.kobj,
> > > +                             name);
> > > +     if (ret)
> > > +             goto device_unregister;
> > > +
> > > +     ret = sysfs_create_link(&tzdev->device.kobj,
> > > +                             &ctrl->ctrl_device.kobj, "device");
> > > +     if (ret)
> > > +             goto remove_link;
> > > +
> > > +     return tzdev;
> > > +
> > > +remove_link:
> > > +     sysfs_remove_link(&ctrl->ctrl_device.kobj, name);
> > > +device_unregister:
> > > +     thermal_zone_device_unregister(tzdev);
> > > +
> > > +     return ERR_PTR(ret);
> > > +}
> > > +
> > > +/**
> > > + * nvme_thermal_zones_register() - register nvme thermal zone devices
> > > + * @ctrl: controller instance
> > > + *
> > > + * This function creates up to nine thermal zone devices for all implemented
> > > + * temperature sensors including the composite temperature.
> > > + * Each thermal zone device provides a single trip point temperature that is
> > > + * associated with an over temperature threshold.
> > > + */
> > > +static int nvme_thermal_zones_register(struct nvme_ctrl *ctrl)
> > > +{
> > > +     struct nvme_smart_log *log;
> > > +     int ret;
> > > +     int i;
> > > +
> > > +     log = kzalloc(sizeof(*log), GFP_KERNEL);
> > > +     if (!log)
> > > +             return 0; /* non-fatal error */
> >
> > I am not sure about this API design here. I would leave the error
> > handling and judging if this is fatal or not to the caller.
> > I mean, if I ask to register a nvme thermal zone and I get
> > a 0 as response, I would assume the thermal zone exists from
> > now on, right?
> 
> This routine is designed to return error code only when we're unable to
> communicate with the device at all (i.e. nvme_submit_sync_cmd returns a
> negative value).
> 
> We don't want to abandon device initialization just due to thermal zone
> failures.  Because thermal zone device isn't mandatory to manage the
> controllers, and the device like qemu doesn't provide smart log (according
> to Keith).

That is fair.. it is just really weird to continue your business when
the kernel cannot even allocate memory for you..

> 
> > > +
> > > +     ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_SMART, 0,
> > > +                        log, sizeof(*log), 0);
> > > +     if (ret) {
> > > +             dev_err(ctrl->device, "Failed to get SMART log: %d\n", ret);
> > > +             /* If the device provided a response, then it's non-fatal */
> > > +             if (ret > 0)
> > > +                     ret = 0;
> > > +             goto free_log;
> >
> > Once again, hiding errors is never a strategy that scales..
> >
> > > +     }
> > > +
> > > +     for (i = 0; i < ARRAY_SIZE(ctrl->tzdev); i++) {
> > > +             struct thermal_zone_device *tzdev;
> > > +             int temp;
> > > +
> > > +             if (i)
> > > +                     temp = le16_to_cpu(log->temp_sensor[i - 1]);
> > > +             else
> > > +                     temp = get_unaligned_le16(log->temperature);
> > > +
> > > +             /*
> > > +              * All implemented temperature sensors report a non-zero value
> > > +              * in temperature sensor fields in the smart log page.
> > > +              */
> > > +             if (!temp)
> > > +                     continue;
> > > +             if (ctrl->tzdev[i])
> > > +                     continue;
> > > +
> > > +             tzdev = nvme_thermal_zone_register(ctrl, i);
> > > +             if (!IS_ERR(tzdev))
> > > +                     ctrl->tzdev[i] = tzdev;
> >
> > Here again, I would not hide errors, the API should propagate errors
> > if something goes wrong.
> >
> > > +     }
> > > +
> > > +free_log:
> > > +     kfree(log);
> > > +
> > > +     return ret;
> > > +}
> > > +
> > > +/**
> > > + * nvme_thermal_zones_unregister() - unregister nvme thermal zone devices
> > > + * @ctrl: controller instance
> > > + *
> > > + * This function removes the registered thermal zone devices and symlinks.
> > > + */
> > > +static void nvme_thermal_zones_unregister(struct nvme_ctrl *ctrl)
> > > +{
> > > +     int i;
> > > +
> > > +     for (i = 0; i < ARRAY_SIZE(ctrl->tzdev); i++) {
> > > +             struct thermal_zone_device *tzdev = ctrl->tzdev[i];
> > > +             char name[20];
> > > +
> > > +             if (!tzdev)
> > > +                     continue;
> > > +
> > > +             sysfs_remove_link(&tzdev->device.kobj, "device");
> > > +
> > > +             snprintf(name, sizeof(name), "temp%d", i);
> > > +             sysfs_remove_link(&ctrl->ctrl_device.kobj, name);
> > > +
> > > +             thermal_zone_device_unregister(tzdev);
> > > +
> > > +             ctrl->tzdev[i] = NULL;
> > > +     }
> > > +}
> > > +
> > > +#else
> > > +
> > > +static inline int nvme_thermal_zones_register(struct nvme_ctrl *ctrl)
> > > +{
> > > +     return 0;
> > > +}
> > > +
> > > +static inline void nvme_thermal_zones_unregister(struct nvme_ctrl *ctrl)
> > > +{
> > > +}
> > > +
> > > +#endif /* CONFIG_THERMAL */
> > > +
> > >  struct nvme_core_quirk_entry {
> > >       /*
> > >        * NVMe model and firmware strings are padded with spaces.  For
> > > @@ -2754,6 +3037,10 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
> > >       if (ret < 0)
> > >               return ret;
> > >
> > > +     ret = nvme_thermal_zones_register(ctrl);
> > > +     if (ret < 0)
> > > +             return ret;
> >
> >
> > I would definitely keep this code the way it is here in
> > nvme_init_identify(), but if I read this right, your
> > nvme_thermal_zones_register() will never return < 0, and therefore this
> > condition is never met.
> 
> The nvme_get_log() can return a negative value.  Only in this case,
> nvme_thermal_zones_register() returns error and abandon the device
> initialization.
Akinobu Mita June 7, 2019, 3:21 p.m. UTC | #11
2019年6月6日(木) 13:05 Eduardo Valentin <edubezval@gmail.com>:

> > > > +static struct thermal_zone_params nvme_tz_params = {
> > > > +     .governor_name = "user_space",
> > >
> > > Also,
> > >
> > > Was there any particular reason why defaulting to user_space here?
> >
> > I only tested with the user_space governor.  There is no cooling device
> > to bind with this.
> >
>
> hmmm.. I see. But was there any particular reason to pick the thermal
> subsystem here if you do not have any cooling? Or is there any specific
> cooling that can be written in future?
>
> If no cooling is expected, why not a simple hwmon?

For example, we can use USB port powered cooling fan with USB hub that
supports per-port power switching.  It can be turned on and off by user
space tool.

> > > > +     .no_hwmon = true,
> > > > +};
> > > > +
> > > > +static struct thermal_zone_device *
> > > > +nvme_thermal_zone_register(struct nvme_ctrl *ctrl, unsigned int sensor)
> > > > +{
> > > > +     struct thermal_zone_device *tzdev;
> > > > +     char name[THERMAL_NAME_LENGTH];
> > > > +     int ret;
> > > > +
> > > > +     snprintf(name, sizeof(name), "nvme%d_temp%u", ctrl->instance, sensor);
> > > > +
> > > > +     tzdev = thermal_zone_device_register(name, 1, 1, ctrl, &nvme_tz_ops,
> > > > +                                          &nvme_tz_params, 0, 0);
> > > > +     if (IS_ERR(tzdev)) {
> > > > +             dev_err(ctrl->device,
> > > > +                     "Failed to register thermal zone device: %ld\n",
> > > > +                     PTR_ERR(tzdev));
> > > > +             return tzdev;
> > > > +     }
> > > > +
> > > > +     snprintf(name, sizeof(name), "temp%d", sensor);
> > > > +     ret = sysfs_create_link(&ctrl->ctrl_device.kobj, &tzdev->device.kobj,
> > > > +                             name);
> > > > +     if (ret)
> > > > +             goto device_unregister;
> > > > +
> > > > +     ret = sysfs_create_link(&tzdev->device.kobj,
> > > > +                             &ctrl->ctrl_device.kobj, "device");
> > > > +     if (ret)
> > > > +             goto remove_link;
> > > > +
> > > > +     return tzdev;
> > > > +
> > > > +remove_link:
> > > > +     sysfs_remove_link(&ctrl->ctrl_device.kobj, name);
> > > > +device_unregister:
> > > > +     thermal_zone_device_unregister(tzdev);
> > > > +
> > > > +     return ERR_PTR(ret);
> > > > +}
> > > > +
> > > > +/**
> > > > + * nvme_thermal_zones_register() - register nvme thermal zone devices
> > > > + * @ctrl: controller instance
> > > > + *
> > > > + * This function creates up to nine thermal zone devices for all implemented
> > > > + * temperature sensors including the composite temperature.
> > > > + * Each thermal zone device provides a single trip point temperature that is
> > > > + * associated with an over temperature threshold.
> > > > + */
> > > > +static int nvme_thermal_zones_register(struct nvme_ctrl *ctrl)
> > > > +{
> > > > +     struct nvme_smart_log *log;
> > > > +     int ret;
> > > > +     int i;
> > > > +
> > > > +     log = kzalloc(sizeof(*log), GFP_KERNEL);
> > > > +     if (!log)
> > > > +             return 0; /* non-fatal error */
> > >
> > > I am not sure about this API design here. I would leave the error
> > > handling and judging if this is fatal or not to the caller.
> > > I mean, if I ask to register a nvme thermal zone and I get
> > > a 0 as response, I would assume the thermal zone exists from
> > > now on, right?
> >
> > This routine is designed to return error code only when we're unable to
> > communicate with the device at all (i.e. nvme_submit_sync_cmd returns a
> > negative value).
> >
> > We don't want to abandon device initialization just due to thermal zone
> > failures.  Because thermal zone device isn't mandatory to manage the
> > controllers, and the device like qemu doesn't provide smart log (according
> > to Keith).
>
> That is fair.. it is just really weird to continue your business when
> the kernel cannot even allocate memory for you..

Right.  The system is not unlikely to continue working properly when under
page size allocation with GFP_KERNEL is failed, unless fault injection for
allocation is enabled.

Patch
diff mbox series

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index c950916..4c8271a 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2200,6 +2200,289 @@  static void nvme_set_latency_tolerance(struct device *dev, s32 val)
 	}
 }
 
+#ifdef CONFIG_THERMAL
+
+static int nvme_get_temp(struct nvme_ctrl *ctrl, unsigned int sensor, int *temp)
+{
+	struct nvme_smart_log *log;
+	int ret;
+
+	BUILD_BUG_ON(ARRAY_SIZE(log->temp_sensor) + 1 !=
+		     ARRAY_SIZE(ctrl->tzdev));
+
+	if (WARN_ON_ONCE(sensor > ARRAY_SIZE(log->temp_sensor)))
+		return -EINVAL;
+
+	log = kzalloc(sizeof(*log), GFP_KERNEL);
+	if (!log)
+		return -ENOMEM;
+
+	ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_SMART, 0,
+			   log, sizeof(*log), 0);
+	if (ret) {
+		ret = ret > 0 ? -EINVAL : ret;
+		goto free_log;
+	}
+
+	if (sensor)
+		*temp = le16_to_cpu(log->temp_sensor[sensor - 1]);
+	else
+		*temp = get_unaligned_le16(log->temperature);
+
+free_log:
+	kfree(log);
+
+	return ret;
+}
+
+static unsigned int nvme_tz_type_to_sensor(const char *type)
+{
+	int instance;
+	unsigned int sensor;
+
+	if (sscanf(type, "nvme%d_temp%u", &instance, &sensor) != 2)
+		return UINT_MAX;
+
+	return sensor;
+}
+
+#define KELVIN_TO_MILLICELSIUS(t) DECI_KELVIN_TO_MILLICELSIUS((t) * 10)
+#define MILLICELSIUS_TO_KELVIN(t) ((MILLICELSIUS_TO_DECI_KELVIN(t) + 5) / 10)
+
+static int nvme_tz_get_temp(struct thermal_zone_device *tzdev, int *temp)
+{
+	unsigned int sensor = nvme_tz_type_to_sensor(tzdev->type);
+	struct nvme_ctrl *ctrl = tzdev->devdata;
+	int ret;
+
+	ret = nvme_get_temp(ctrl, sensor, temp);
+	if (!ret)
+		*temp = KELVIN_TO_MILLICELSIUS(*temp);
+
+	return ret;
+}
+
+static int nvme_tz_get_trip_type(struct thermal_zone_device *tzdev,
+				 int trip, enum thermal_trip_type *type)
+{
+	*type = THERMAL_TRIP_ACTIVE;
+
+	return 0;
+}
+
+static int nvme_get_over_temp_thresh(struct nvme_ctrl *ctrl,
+				     unsigned int sensor, int *temp)
+{
+	unsigned int threshold = sensor << NVME_TEMP_THRESH_SELECT_SHIFT;
+	int status;
+	int ret;
+
+	if (WARN_ON_ONCE(sensor >= ARRAY_SIZE(ctrl->tzdev)))
+		return -EINVAL;
+
+	ret = nvme_get_features(ctrl, NVME_FEAT_TEMP_THRESH, threshold, NULL, 0,
+				&status);
+	if (!ret)
+		*temp = status & NVME_TEMP_THRESH_MASK;
+
+	return ret > 0 ? -EINVAL : ret;
+}
+
+static int nvme_set_over_temp_thresh(struct nvme_ctrl *ctrl,
+				     unsigned int sensor, int temp)
+{
+	unsigned int threshold = sensor << NVME_TEMP_THRESH_SELECT_SHIFT;
+	int ret;
+
+	if (WARN_ON_ONCE(sensor >= ARRAY_SIZE(ctrl->tzdev)))
+		return -EINVAL;
+
+	if (temp > NVME_TEMP_THRESH_MASK)
+		return -EINVAL;
+
+	threshold |= temp & NVME_TEMP_THRESH_MASK;
+
+	ret = nvme_set_features(ctrl, NVME_FEAT_TEMP_THRESH, threshold, NULL, 0,
+				NULL);
+
+	return ret > 0 ? -EINVAL : ret;
+}
+
+static int nvme_tz_get_trip_temp(struct thermal_zone_device *tzdev,
+				 int trip, int *temp)
+{
+	unsigned int sensor = nvme_tz_type_to_sensor(tzdev->type);
+	struct nvme_ctrl *ctrl = tzdev->devdata;
+	int ret;
+
+	ret = nvme_get_over_temp_thresh(ctrl, sensor, temp);
+	if (!ret)
+		*temp = KELVIN_TO_MILLICELSIUS(*temp);
+
+	return ret;
+}
+
+static int nvme_tz_set_trip_temp(struct thermal_zone_device *tzdev,
+				 int trip, int temp)
+{
+	unsigned int sensor = nvme_tz_type_to_sensor(tzdev->type);
+	struct nvme_ctrl *ctrl = tzdev->devdata;
+
+	temp = MILLICELSIUS_TO_KELVIN(temp);
+
+	return nvme_set_over_temp_thresh(ctrl, sensor, temp);
+}
+
+static struct thermal_zone_device_ops nvme_tz_ops = {
+	.get_temp = nvme_tz_get_temp,
+	.get_trip_type = nvme_tz_get_trip_type,
+	.get_trip_temp = nvme_tz_get_trip_temp,
+	.set_trip_temp = nvme_tz_set_trip_temp,
+};
+
+static struct thermal_zone_params nvme_tz_params = {
+	.governor_name = "user_space",
+	.no_hwmon = true,
+};
+
+static struct thermal_zone_device *
+nvme_thermal_zone_register(struct nvme_ctrl *ctrl, unsigned int sensor)
+{
+	struct thermal_zone_device *tzdev;
+	char name[THERMAL_NAME_LENGTH];
+	int ret;
+
+	snprintf(name, sizeof(name), "nvme%d_temp%u", ctrl->instance, sensor);
+
+	tzdev = thermal_zone_device_register(name, 1, 1, ctrl, &nvme_tz_ops,
+					     &nvme_tz_params, 0, 0);
+	if (IS_ERR(tzdev)) {
+		dev_err(ctrl->device,
+			"Failed to register thermal zone device: %ld\n",
+			PTR_ERR(tzdev));
+		return tzdev;
+	}
+
+	snprintf(name, sizeof(name), "temp%d", sensor);
+	ret = sysfs_create_link(&ctrl->ctrl_device.kobj, &tzdev->device.kobj,
+				name);
+	if (ret)
+		goto device_unregister;
+
+	ret = sysfs_create_link(&tzdev->device.kobj,
+				&ctrl->ctrl_device.kobj, "device");
+	if (ret)
+		goto remove_link;
+
+	return tzdev;
+
+remove_link:
+	sysfs_remove_link(&ctrl->ctrl_device.kobj, name);
+device_unregister:
+	thermal_zone_device_unregister(tzdev);
+
+	return ERR_PTR(ret);
+}
+
+/**
+ * nvme_thermal_zones_register() - register nvme thermal zone devices
+ * @ctrl: controller instance
+ *
+ * This function creates up to nine thermal zone devices for all implemented
+ * temperature sensors including the composite temperature.
+ * Each thermal zone device provides a single trip point temperature that is
+ * associated with an over temperature threshold.
+ */
+static int nvme_thermal_zones_register(struct nvme_ctrl *ctrl)
+{
+	struct nvme_smart_log *log;
+	int ret;
+	int i;
+
+	log = kzalloc(sizeof(*log), GFP_KERNEL);
+	if (!log)
+		return 0; /* non-fatal error */
+
+	ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_SMART, 0,
+			   log, sizeof(*log), 0);
+	if (ret) {
+		dev_err(ctrl->device, "Failed to get SMART log: %d\n", ret);
+		/* If the device provided a response, then it's non-fatal */
+		if (ret > 0)
+			ret = 0;
+		goto free_log;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(ctrl->tzdev); i++) {
+		struct thermal_zone_device *tzdev;
+		int temp;
+
+		if (i)
+			temp = le16_to_cpu(log->temp_sensor[i - 1]);
+		else
+			temp = get_unaligned_le16(log->temperature);
+
+		/*
+		 * All implemented temperature sensors report a non-zero value
+		 * in temperature sensor fields in the smart log page.
+		 */
+		if (!temp)
+			continue;
+		if (ctrl->tzdev[i])
+			continue;
+
+		tzdev = nvme_thermal_zone_register(ctrl, i);
+		if (!IS_ERR(tzdev))
+			ctrl->tzdev[i] = tzdev;
+	}
+
+free_log:
+	kfree(log);
+
+	return ret;
+}
+
+/**
+ * nvme_thermal_zones_unregister() - unregister nvme thermal zone devices
+ * @ctrl: controller instance
+ *
+ * This function removes the registered thermal zone devices and symlinks.
+ */
+static void nvme_thermal_zones_unregister(struct nvme_ctrl *ctrl)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(ctrl->tzdev); i++) {
+		struct thermal_zone_device *tzdev = ctrl->tzdev[i];
+		char name[20];
+
+		if (!tzdev)
+			continue;
+
+		sysfs_remove_link(&tzdev->device.kobj, "device");
+
+		snprintf(name, sizeof(name), "temp%d", i);
+		sysfs_remove_link(&ctrl->ctrl_device.kobj, name);
+
+		thermal_zone_device_unregister(tzdev);
+
+		ctrl->tzdev[i] = NULL;
+	}
+}
+
+#else
+
+static inline int nvme_thermal_zones_register(struct nvme_ctrl *ctrl)
+{
+	return 0;
+}
+
+static inline void nvme_thermal_zones_unregister(struct nvme_ctrl *ctrl)
+{
+}
+
+#endif /* CONFIG_THERMAL */
+
 struct nvme_core_quirk_entry {
 	/*
 	 * NVMe model and firmware strings are padded with spaces.  For
@@ -2754,6 +3037,10 @@  int nvme_init_identify(struct nvme_ctrl *ctrl)
 	if (ret < 0)
 		return ret;
 
+	ret = nvme_thermal_zones_register(ctrl);
+	if (ret < 0)
+		return ret;
+
 	ctrl->identified = true;
 
 	return 0;
@@ -3756,6 +4043,7 @@  void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
 {
 	nvme_mpath_stop(ctrl);
 	nvme_stop_keep_alive(ctrl);
+	nvme_thermal_zones_unregister(ctrl);
 	flush_work(&ctrl->async_event_work);
 	cancel_work_sync(&ctrl->fw_act_work);
 }
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 802aa19..53f0b24 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -15,6 +15,7 @@ 
 #include <linux/sed-opal.h>
 #include <linux/fault-inject.h>
 #include <linux/rcupdate.h>
+#include <linux/thermal.h>
 
 extern unsigned int nvme_io_timeout;
 #define NVME_IO_TIMEOUT	(nvme_io_timeout * HZ)
@@ -248,6 +249,14 @@  struct nvme_ctrl {
 
 	struct page *discard_page;
 	unsigned long discard_page_busy;
+
+#ifdef CONFIG_THERMAL
+	/*
+	 * tzdev[0]: composite temperature
+	 * tzdev[1-8]: temperature sensor 1 through 8
+	 */
+	struct thermal_zone_device *tzdev[9];
+#endif
 };
 
 enum nvme_iopolicy {
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 658ac75..54f0a13 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -780,6 +780,11 @@  struct nvme_write_zeroes_cmd {
 
 /* Features */
 
+enum {
+	NVME_TEMP_THRESH_MASK		= 0xffff,
+	NVME_TEMP_THRESH_SELECT_SHIFT	= 16,
+};
+
 struct nvme_feat_auto_pst {
 	__le64 entries[32];
 };