diff mbox series

[v2] nvme: Add hardware monitoring support

Message ID 20191029223214.18889-1-linux@roeck-us.net (mailing list archive)
State Superseded, archived
Headers show
Series [v2] nvme: Add hardware monitoring support | expand

Commit Message

Guenter Roeck Oct. 29, 2019, 10:32 p.m. UTC
nvme devices report temperature information in the controller information
(for limits) and in the smart log. Currently, the only means to retrieve
this information is the nvme command line interface, which requires
super-user privileges.

At the same time, it would be desirable to use NVME temperature information
for thermal control.

This patch adds support to read NVME temperatures from the kernel using the
hwmon API and adds temperature zones for NVME drives. The thermal subsystem
can use this information to set thermal policies, and userspace can access
it using libsensors and/or the "sensors" command.

Example output from the "sensors" command:

nvme0-pci-0100
Adapter: PCI adapter
Composite:    +39.0°C  (high = +85.0°C, crit = +85.0°C)
Sensor 1:     +39.0°C
Sensor 2:     +41.0°C

Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
v2: Use devm_kfree() to release memory in error path

 drivers/nvme/host/Kconfig      |  10 ++
 drivers/nvme/host/Makefile     |   1 +
 drivers/nvme/host/core.c       |   5 +
 drivers/nvme/host/nvme-hwmon.c | 163 +++++++++++++++++++++++++++++++++
 drivers/nvme/host/nvme.h       |   8 ++
 5 files changed, 187 insertions(+)
 create mode 100644 drivers/nvme/host/nvme-hwmon.c

Comments

Keith Busch Oct. 30, 2019, 12:53 a.m. UTC | #1
On Tue, Oct 29, 2019 at 03:32:14PM -0700, Guenter Roeck wrote:
> nvme devices report temperature information in the controller information
> (for limits) and in the smart log. Currently, the only means to retrieve
> this information is the nvme command line interface, which requires
> super-user privileges.
> 
> At the same time, it would be desirable to use NVME temperature information
> for thermal control.
> 
> This patch adds support to read NVME temperatures from the kernel using the
> hwmon API and adds temperature zones for NVME drives. The thermal subsystem
> can use this information to set thermal policies, and userspace can access
> it using libsensors and/or the "sensors" command.
> 
> Example output from the "sensors" command:
> 
> nvme0-pci-0100
> Adapter: PCI adapter
> Composite:    +39.0°C  (high = +85.0°C, crit = +85.0°C)
> Sensor 1:     +39.0°C
> Sensor 2:     +41.0°C
> 
> Signed-off-by: Guenter Roeck <linux@roeck-us.net>

This looks fine to me, but I'll wait a few more days to see if there are
any additional comments..
Akinobu Mita Oct. 30, 2019, 11:16 a.m. UTC | #2
2019年10月30日(水) 7:32 Guenter Roeck <linux@roeck-us.net>:
>
> nvme devices report temperature information in the controller information
> (for limits) and in the smart log. Currently, the only means to retrieve
> this information is the nvme command line interface, which requires
> super-user privileges.
>
> At the same time, it would be desirable to use NVME temperature information
> for thermal control.
>
> This patch adds support to read NVME temperatures from the kernel using the
> hwmon API and adds temperature zones for NVME drives. The thermal subsystem
> can use this information to set thermal policies, and userspace can access
> it using libsensors and/or the "sensors" command.
>
> Example output from the "sensors" command:
>
> nvme0-pci-0100
> Adapter: PCI adapter
> Composite:    +39.0°C  (high = +85.0°C, crit = +85.0°C)
> Sensor 1:     +39.0°C
> Sensor 2:     +41.0°C
>
> Signed-off-by: Guenter Roeck <linux@roeck-us.net>
> ---
> v2: Use devm_kfree() to release memory in error path
>
>  drivers/nvme/host/Kconfig      |  10 ++
>  drivers/nvme/host/Makefile     |   1 +
>  drivers/nvme/host/core.c       |   5 +
>  drivers/nvme/host/nvme-hwmon.c | 163 +++++++++++++++++++++++++++++++++
>  drivers/nvme/host/nvme.h       |   8 ++
>  5 files changed, 187 insertions(+)
>  create mode 100644 drivers/nvme/host/nvme-hwmon.c
>
> diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
> index 2b36f052bfb9..aeb49e16e386 100644
> --- a/drivers/nvme/host/Kconfig
> +++ b/drivers/nvme/host/Kconfig
> @@ -23,6 +23,16 @@ config NVME_MULTIPATH
>            /dev/nvmeXnY device will show up for each NVMe namespaces,
>            even if it is accessible through multiple controllers.
>
> +config NVME_HWMON
> +       bool "NVME hardware monitoring"
> +       depends on (NVME_CORE=y && HWMON=y) || (NVME_CORE=m && HWMON)
> +       help
> +         This provides support for NVME hardware monitoring. If enabled,
> +         a hardware monitoring device will be created for each NVME drive
> +         in the system.
> +
> +         If unsure, say N.
> +
>  config NVME_FABRICS
>         tristate
>
> diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
> index 8a4b671c5f0c..03de4797a877 100644
> --- a/drivers/nvme/host/Makefile
> +++ b/drivers/nvme/host/Makefile
> @@ -14,6 +14,7 @@ nvme-core-$(CONFIG_TRACING)           += trace.o
>  nvme-core-$(CONFIG_NVME_MULTIPATH)     += multipath.o
>  nvme-core-$(CONFIG_NVM)                        += lightnvm.o
>  nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS)   += fault_inject.o
> +nvme-core-$(CONFIG_NVME_HWMON)         += nvme-hwmon.o
>
>  nvme-y                                 += pci.o
>
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index fa7ba09dca77..fc1d4b146717 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -2796,6 +2796,9 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
>         ctrl->oncs = le16_to_cpu(id->oncs);
>         ctrl->mtfa = le16_to_cpu(id->mtfa);
>         ctrl->oaes = le32_to_cpu(id->oaes);
> +       ctrl->wctemp = le16_to_cpu(id->wctemp);
> +       ctrl->cctemp = le16_to_cpu(id->cctemp);
> +
>         atomic_set(&ctrl->abort_limit, id->acl + 1);
>         ctrl->vwc = id->vwc;
>         if (id->mdts)
> @@ -2897,6 +2900,8 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
>
>         ctrl->identified = true;
>
> +       nvme_hwmon_init(ctrl);
> +
>         return 0;
>
>  out_free:

The nvme_init_identify() can be called multiple time in nvme ctrl's
lifetime (e.g 'nvme reset /dev/nvme*' or suspend/resume paths), so
should we need to prevent nvme_hwmon_init() from registering hwmon
device more than twice?

In the nvme thermal zone patchset[1], thernal zone is registered in
nvme_init_identify and unregistered in nvme_stop_ctrl().

[1] https://lore.kernel.org/linux-devicetree/1561990354-4084-2-git-send-email-akinobu.mita@gmail.com/

> diff --git a/drivers/nvme/host/nvme-hwmon.c b/drivers/nvme/host/nvme-hwmon.c
> new file mode 100644
> index 000000000000..af5eda326ec6
> --- /dev/null
> +++ b/drivers/nvme/host/nvme-hwmon.c
> @@ -0,0 +1,163 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * NVM Express hardware monitoring support
> + * Copyright (c) 2019, Guenter Roeck
> + */
> +
> +#include <linux/hwmon.h>
> +
> +#include "nvme.h"
> +
> +struct nvme_hwmon_data {
> +       struct nvme_ctrl *ctrl;
> +       struct nvme_smart_log log;
> +};
> +
> +static int nvme_hwmon_get_smart_log(struct nvme_hwmon_data *data)
> +{
> +       return nvme_get_log(data->ctrl, NVME_NSID_ALL, NVME_LOG_SMART, 0,
> +                           &data->log, sizeof(data->log), 0);
> +}

The 'data->log' is allocated per nvme_ctrl, so are there any locks to
prevent multiple callers of nvme_hwmon_get_smart_log() from breaking
the log buffer?

> +
> +static int nvme_hwmon_read(struct device *dev, enum hwmon_sensor_types type,
> +                          u32 attr, int channel, long *val)
> +{
> +       struct nvme_hwmon_data *data = dev_get_drvdata(dev);
> +       struct nvme_smart_log *log = &data->log;
> +       int err;
> +       int temp;
> +
> +       err = nvme_hwmon_get_smart_log(data);
> +       if (err)
> +               return err < 0 ? err : -EPROTO;
> +
> +       switch (attr) {
> +       case hwmon_temp_max:
> +               *val = (data->ctrl->wctemp - 273) * 1000;
> +               break;
> +       case hwmon_temp_crit:
> +               *val = (data->ctrl->cctemp - 273) * 1000;
> +               break;

When this function is called with 'hwmon_temp_max' or 'hwmon_temp_crit',
we don't need to call nvme_hwmon_get_smart_log() at all, do we?

> +       case hwmon_temp_input:
> +               if (!channel)
> +                       temp = le16_to_cpup((__le16 *)log->temperature);
> +               else
> +                       temp = le16_to_cpu(log->temp_sensor[channel - 1]);
> +               *val = (temp - 273) * 1000;
> +               break;
> +       case hwmon_temp_crit_alarm:
> +               *val = !!(log->critical_warning & NVME_SMART_CRIT_TEMPERATURE);
> +               break;
> +       default:
> +               err = -EOPNOTSUPP;
> +               break;
> +       }
> +       return err;
> +}
Christoph Hellwig Oct. 30, 2019, 2:05 p.m. UTC | #3
On Wed, Oct 30, 2019 at 08:16:48PM +0900, Akinobu Mita wrote:
> The nvme_init_identify() can be called multiple time in nvme ctrl's
> lifetime (e.g 'nvme reset /dev/nvme*' or suspend/resume paths), so
> should we need to prevent nvme_hwmon_init() from registering hwmon
> device more than twice?
> 
> In the nvme thermal zone patchset[1], thernal zone is registered in
> nvme_init_identify and unregistered in nvme_stop_ctrl().

So Guenter said above the thermal subsystem could use the information
from hwmon as well.  Does this mean this patch would solve your needs
as well?
Christoph Hellwig Oct. 30, 2019, 2:12 p.m. UTC | #4
On Tue, Oct 29, 2019 at 03:32:14PM -0700, Guenter Roeck wrote:
> This patch adds support to read NVME temperatures from the kernel using the
> hwmon API and adds temperature zones for NVME drives. The thermal subsystem
> can use this information to set thermal policies, and userspace can access
> it using libsensors and/or the "sensors" command.

Except in all upper case or all lower case identifier the speling should
always be "NVMe".  Thi also happens a few more places like in the Kconfig
text.

> +static int nvme_hwmon_get_smart_log(struct nvme_hwmon_data *data)
> +{
> +	return nvme_get_log(data->ctrl, NVME_NSID_ALL, NVME_LOG_SMART, 0,
> +			    &data->log, sizeof(data->log), 0);
> +}
> +
> +static int nvme_hwmon_read(struct device *dev, enum hwmon_sensor_types type,
> +			   u32 attr, int channel, long *val)
> +{
> +	struct nvme_hwmon_data *data = dev_get_drvdata(dev);
> +	struct nvme_smart_log *log = &data->log;
> +	int err;
> +	int temp;
> +
> +	err = nvme_hwmon_get_smart_log(data);
> +	if (err)
> +		return err < 0 ? err : -EPROTO;

I think the handling of positive errnos fits better into
nvme_hwmon_get_smart_log.  Also EIO sounds like a better error for
generic NVMe level errors.

> +
> +	switch (attr) {
> +	case hwmon_temp_max:
> +		*val = (data->ctrl->wctemp - 273) * 1000;
> +		break;
> +	case hwmon_temp_crit:
> +		*val = (data->ctrl->cctemp - 273) * 1000;
> +		break;
> +	case hwmon_temp_input:
> +		if (!channel)
> +			temp = le16_to_cpup((__le16 *)log->temperature);

This needs to use get_unaligned_le16, otherwise you'll run into problems
on architectures that don't support unaligned loads.

> +static const struct hwmon_ops nvme_hwmon_ops = {
> +	.is_visible = nvme_hwmon_is_visible,
> +	.read = nvme_hwmon_read,
> +	.read_string = nvme_hwmon_read_string,
> +};
> +
> +static const struct hwmon_chip_info nvme_hwmon_chip_info = {
> +	.ops = &nvme_hwmon_ops,
> +	.info = nvme_hwmon_info,
> +};

Please use tabs to align all the = in an ops structure.

> +#if IS_ENABLED(CONFIG_NVME_HWMON)

No real need to use IS_ENABLED here, a plain ifdef should do it.
Chris Healy Oct. 30, 2019, 11:40 p.m. UTC | #5
On an ARM64 system with Toshiba SSD:

Tested-by: Chris Healy <cphealy@gmail.com>

On Tue, Oct 29, 2019 at 3:32 PM Guenter Roeck <linux@roeck-us.net> wrote:
>
> nvme devices report temperature information in the controller information
> (for limits) and in the smart log. Currently, the only means to retrieve
> this information is the nvme command line interface, which requires
> super-user privileges.
>
> At the same time, it would be desirable to use NVME temperature information
> for thermal control.
>
> This patch adds support to read NVME temperatures from the kernel using the
> hwmon API and adds temperature zones for NVME drives. The thermal subsystem
> can use this information to set thermal policies, and userspace can access
> it using libsensors and/or the "sensors" command.
>
> Example output from the "sensors" command:
>
> nvme0-pci-0100
> Adapter: PCI adapter
> Composite:    +39.0°C  (high = +85.0°C, crit = +85.0°C)
> Sensor 1:     +39.0°C
> Sensor 2:     +41.0°C
>
> Signed-off-by: Guenter Roeck <linux@roeck-us.net>
> ---
> v2: Use devm_kfree() to release memory in error path
>
>  drivers/nvme/host/Kconfig      |  10 ++
>  drivers/nvme/host/Makefile     |   1 +
>  drivers/nvme/host/core.c       |   5 +
>  drivers/nvme/host/nvme-hwmon.c | 163 +++++++++++++++++++++++++++++++++
>  drivers/nvme/host/nvme.h       |   8 ++
>  5 files changed, 187 insertions(+)
>  create mode 100644 drivers/nvme/host/nvme-hwmon.c
>
> diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
> index 2b36f052bfb9..aeb49e16e386 100644
> --- a/drivers/nvme/host/Kconfig
> +++ b/drivers/nvme/host/Kconfig
> @@ -23,6 +23,16 @@ config NVME_MULTIPATH
>            /dev/nvmeXnY device will show up for each NVMe namespaces,
>            even if it is accessible through multiple controllers.
>
> +config NVME_HWMON
> +       bool "NVME hardware monitoring"
> +       depends on (NVME_CORE=y && HWMON=y) || (NVME_CORE=m && HWMON)
> +       help
> +         This provides support for NVME hardware monitoring. If enabled,
> +         a hardware monitoring device will be created for each NVME drive
> +         in the system.
> +
> +         If unsure, say N.
> +
>  config NVME_FABRICS
>         tristate
>
> diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
> index 8a4b671c5f0c..03de4797a877 100644
> --- a/drivers/nvme/host/Makefile
> +++ b/drivers/nvme/host/Makefile
> @@ -14,6 +14,7 @@ nvme-core-$(CONFIG_TRACING)           += trace.o
>  nvme-core-$(CONFIG_NVME_MULTIPATH)     += multipath.o
>  nvme-core-$(CONFIG_NVM)                        += lightnvm.o
>  nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS)   += fault_inject.o
> +nvme-core-$(CONFIG_NVME_HWMON)         += nvme-hwmon.o
>
>  nvme-y                                 += pci.o
>
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index fa7ba09dca77..fc1d4b146717 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -2796,6 +2796,9 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
>         ctrl->oncs = le16_to_cpu(id->oncs);
>         ctrl->mtfa = le16_to_cpu(id->mtfa);
>         ctrl->oaes = le32_to_cpu(id->oaes);
> +       ctrl->wctemp = le16_to_cpu(id->wctemp);
> +       ctrl->cctemp = le16_to_cpu(id->cctemp);
> +
>         atomic_set(&ctrl->abort_limit, id->acl + 1);
>         ctrl->vwc = id->vwc;
>         if (id->mdts)
> @@ -2897,6 +2900,8 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
>
>         ctrl->identified = true;
>
> +       nvme_hwmon_init(ctrl);
> +
>         return 0;
>
>  out_free:
> diff --git a/drivers/nvme/host/nvme-hwmon.c b/drivers/nvme/host/nvme-hwmon.c
> new file mode 100644
> index 000000000000..af5eda326ec6
> --- /dev/null
> +++ b/drivers/nvme/host/nvme-hwmon.c
> @@ -0,0 +1,163 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * NVM Express hardware monitoring support
> + * Copyright (c) 2019, Guenter Roeck
> + */
> +
> +#include <linux/hwmon.h>
> +
> +#include "nvme.h"
> +
> +struct nvme_hwmon_data {
> +       struct nvme_ctrl *ctrl;
> +       struct nvme_smart_log log;
> +};
> +
> +static int nvme_hwmon_get_smart_log(struct nvme_hwmon_data *data)
> +{
> +       return nvme_get_log(data->ctrl, NVME_NSID_ALL, NVME_LOG_SMART, 0,
> +                           &data->log, sizeof(data->log), 0);
> +}
> +
> +static int nvme_hwmon_read(struct device *dev, enum hwmon_sensor_types type,
> +                          u32 attr, int channel, long *val)
> +{
> +       struct nvme_hwmon_data *data = dev_get_drvdata(dev);
> +       struct nvme_smart_log *log = &data->log;
> +       int err;
> +       int temp;
> +
> +       err = nvme_hwmon_get_smart_log(data);
> +       if (err)
> +               return err < 0 ? err : -EPROTO;
> +
> +       switch (attr) {
> +       case hwmon_temp_max:
> +               *val = (data->ctrl->wctemp - 273) * 1000;
> +               break;
> +       case hwmon_temp_crit:
> +               *val = (data->ctrl->cctemp - 273) * 1000;
> +               break;
> +       case hwmon_temp_input:
> +               if (!channel)
> +                       temp = le16_to_cpup((__le16 *)log->temperature);
> +               else
> +                       temp = le16_to_cpu(log->temp_sensor[channel - 1]);
> +               *val = (temp - 273) * 1000;
> +               break;
> +       case hwmon_temp_crit_alarm:
> +               *val = !!(log->critical_warning & NVME_SMART_CRIT_TEMPERATURE);
> +               break;
> +       default:
> +               err = -EOPNOTSUPP;
> +               break;
> +       }
> +       return err;
> +}
> +
> +static const char * const nvme_hwmon_sensor_names[] = {
> +       "Composite",
> +       "Sensor 1",
> +       "Sensor 2",
> +       "Sensor 3",
> +       "Sensor 4",
> +       "Sensor 5",
> +       "Sensor 6",
> +       "Sensor 7",
> +       "Sensor 8",
> +};
> +
> +static int nvme_hwmon_read_string(struct device *dev,
> +                                 enum hwmon_sensor_types type, u32 attr,
> +                                 int channel, const char **str)
> +{
> +       *str = nvme_hwmon_sensor_names[channel];
> +       return 0;
> +}
> +
> +static umode_t nvme_hwmon_is_visible(const void *_data,
> +                                    enum hwmon_sensor_types type,
> +                                    u32 attr, int channel)
> +{
> +       const struct nvme_hwmon_data *data = _data;
> +
> +       switch (attr) {
> +       case hwmon_temp_crit:
> +               if (!channel && data->ctrl->cctemp)
> +                       return 0444;
> +               break;
> +       case hwmon_temp_max:
> +               if (!channel && data->ctrl->wctemp)
> +                       return 0444;
> +               break;
> +       case hwmon_temp_crit_alarm:
> +               if (!channel)
> +                       return 0444;
> +               break;
> +       case hwmon_temp_input:
> +       case hwmon_temp_label:
> +               if (!channel || data->log.temp_sensor[channel - 1])
> +                       return 0444;
> +               break;
> +       default:
> +               break;
> +       }
> +       return 0;
> +}
> +
> +static const struct hwmon_channel_info *nvme_hwmon_info[] = {
> +       HWMON_CHANNEL_INFO(chip, HWMON_C_REGISTER_TZ),
> +       HWMON_CHANNEL_INFO(temp,
> +                          HWMON_T_INPUT | HWMON_T_MAX | HWMON_T_CRIT |
> +                               HWMON_T_LABEL | HWMON_T_CRIT_ALARM,
> +                          HWMON_T_INPUT | HWMON_T_LABEL,
> +                          HWMON_T_INPUT | HWMON_T_LABEL,
> +                          HWMON_T_INPUT | HWMON_T_LABEL,
> +                          HWMON_T_INPUT | HWMON_T_LABEL,
> +                          HWMON_T_INPUT | HWMON_T_LABEL,
> +                          HWMON_T_INPUT | HWMON_T_LABEL,
> +                          HWMON_T_INPUT | HWMON_T_LABEL,
> +                          HWMON_T_INPUT | HWMON_T_LABEL),
> +       NULL
> +};
> +
> +static const struct hwmon_ops nvme_hwmon_ops = {
> +       .is_visible = nvme_hwmon_is_visible,
> +       .read = nvme_hwmon_read,
> +       .read_string = nvme_hwmon_read_string,
> +};
> +
> +static const struct hwmon_chip_info nvme_hwmon_chip_info = {
> +       .ops = &nvme_hwmon_ops,
> +       .info = nvme_hwmon_info,
> +};
> +
> +void nvme_hwmon_init(struct nvme_ctrl *ctrl)
> +{
> +       struct device *dev = ctrl->device;
> +       struct nvme_hwmon_data *data;
> +       struct device *hwmon;
> +       int err;
> +
> +       data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL);
> +       if (!data)
> +               return;
> +
> +       data->ctrl = ctrl;
> +
> +       err = nvme_hwmon_get_smart_log(data);
> +       if (err) {
> +               dev_warn(dev, "Failed to read smart log (error %d)\n", err);
> +               devm_kfree(dev, data);
> +               return;
> +       }
> +
> +       hwmon = devm_hwmon_device_register_with_info(dev, dev_name(dev),
> +                                                    data,
> +                                                    &nvme_hwmon_chip_info,
> +                                                    NULL);
> +       if (IS_ERR(hwmon)) {
> +               dev_warn(dev, "Failed to instantiate hwmon device\n");
> +               devm_kfree(dev, data);
> +       }
> +}
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index 22e8401352c2..e6460c1216bc 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -231,6 +231,8 @@ struct nvme_ctrl {
>         u16 kas;
>         u8 npss;
>         u8 apsta;
> +       u16 wctemp;
> +       u16 cctemp;
>         u32 oaes;
>         u32 aen_result;
>         u32 ctratt;
> @@ -652,4 +654,10 @@ static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev)
>         return dev_to_disk(dev)->private_data;
>  }
>
> +#if IS_ENABLED(CONFIG_NVME_HWMON)
> +void nvme_hwmon_init(struct nvme_ctrl *ctrl);
> +#else
> +static inline void nvme_hwmon_init(struct nvme_ctrl *ctrl) { }
> +#endif
> +
>  #endif /* _NVME_H */
> --
> 2.17.1
>
Guenter Roeck Oct. 31, 2019, 2:20 a.m. UTC | #6
On 10/30/19 4:16 AM, Akinobu Mita wrote:
> 2019年10月30日(水) 7:32 Guenter Roeck <linux@roeck-us.net>:
>>
>> nvme devices report temperature information in the controller information
>> (for limits) and in the smart log. Currently, the only means to retrieve
>> this information is the nvme command line interface, which requires
>> super-user privileges.
>>
>> At the same time, it would be desirable to use NVME temperature information
>> for thermal control.
>>
>> This patch adds support to read NVME temperatures from the kernel using the
>> hwmon API and adds temperature zones for NVME drives. The thermal subsystem
>> can use this information to set thermal policies, and userspace can access
>> it using libsensors and/or the "sensors" command.
>>
>> Example output from the "sensors" command:
>>
>> nvme0-pci-0100
>> Adapter: PCI adapter
>> Composite:    +39.0°C  (high = +85.0°C, crit = +85.0°C)
>> Sensor 1:     +39.0°C
>> Sensor 2:     +41.0°C
>>
>> Signed-off-by: Guenter Roeck <linux@roeck-us.net>
>> ---
>> v2: Use devm_kfree() to release memory in error path
>>
>>   drivers/nvme/host/Kconfig      |  10 ++
>>   drivers/nvme/host/Makefile     |   1 +
>>   drivers/nvme/host/core.c       |   5 +
>>   drivers/nvme/host/nvme-hwmon.c | 163 +++++++++++++++++++++++++++++++++
>>   drivers/nvme/host/nvme.h       |   8 ++
>>   5 files changed, 187 insertions(+)
>>   create mode 100644 drivers/nvme/host/nvme-hwmon.c
>>
>> diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
>> index 2b36f052bfb9..aeb49e16e386 100644
>> --- a/drivers/nvme/host/Kconfig
>> +++ b/drivers/nvme/host/Kconfig
>> @@ -23,6 +23,16 @@ config NVME_MULTIPATH
>>             /dev/nvmeXnY device will show up for each NVMe namespaces,
>>             even if it is accessible through multiple controllers.
>>
>> +config NVME_HWMON
>> +       bool "NVME hardware monitoring"
>> +       depends on (NVME_CORE=y && HWMON=y) || (NVME_CORE=m && HWMON)
>> +       help
>> +         This provides support for NVME hardware monitoring. If enabled,
>> +         a hardware monitoring device will be created for each NVME drive
>> +         in the system.
>> +
>> +         If unsure, say N.
>> +
>>   config NVME_FABRICS
>>          tristate
>>
>> diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
>> index 8a4b671c5f0c..03de4797a877 100644
>> --- a/drivers/nvme/host/Makefile
>> +++ b/drivers/nvme/host/Makefile
>> @@ -14,6 +14,7 @@ nvme-core-$(CONFIG_TRACING)           += trace.o
>>   nvme-core-$(CONFIG_NVME_MULTIPATH)     += multipath.o
>>   nvme-core-$(CONFIG_NVM)                        += lightnvm.o
>>   nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS)   += fault_inject.o
>> +nvme-core-$(CONFIG_NVME_HWMON)         += nvme-hwmon.o
>>
>>   nvme-y                                 += pci.o
>>
>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
>> index fa7ba09dca77..fc1d4b146717 100644
>> --- a/drivers/nvme/host/core.c
>> +++ b/drivers/nvme/host/core.c
>> @@ -2796,6 +2796,9 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
>>          ctrl->oncs = le16_to_cpu(id->oncs);
>>          ctrl->mtfa = le16_to_cpu(id->mtfa);
>>          ctrl->oaes = le32_to_cpu(id->oaes);
>> +       ctrl->wctemp = le16_to_cpu(id->wctemp);
>> +       ctrl->cctemp = le16_to_cpu(id->cctemp);
>> +
>>          atomic_set(&ctrl->abort_limit, id->acl + 1);
>>          ctrl->vwc = id->vwc;
>>          if (id->mdts)
>> @@ -2897,6 +2900,8 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
>>
>>          ctrl->identified = true;
>>
>> +       nvme_hwmon_init(ctrl);
>> +
>>          return 0;
>>
>>   out_free:
> 
> The nvme_init_identify() can be called multiple time in nvme ctrl's
> lifetime (e.g 'nvme reset /dev/nvme*' or suspend/resume paths), so
> should we need to prevent nvme_hwmon_init() from registering hwmon
> device more than twice?
> 
> In the nvme thermal zone patchset[1], thernal zone is registered in
> nvme_init_identify and unregistered in nvme_stop_ctrl().
> 

Doesn't that mean that the initialization should happen in nvme_start_ctrl()
and not here ?

Either case, good point. Reason for calling the init function from here
is that I wanted to ensure that it is called after controller
identification. But then it is really undesirable to re-instantiate
the driver on each device reset. I'll have to think about that.

> [1] https://lore.kernel.org/linux-devicetree/1561990354-4084-2-git-send-email-akinobu.mita@gmail.com/
> 
>> diff --git a/drivers/nvme/host/nvme-hwmon.c b/drivers/nvme/host/nvme-hwmon.c
>> new file mode 100644
>> index 000000000000..af5eda326ec6
>> --- /dev/null
>> +++ b/drivers/nvme/host/nvme-hwmon.c
>> @@ -0,0 +1,163 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + * NVM Express hardware monitoring support
>> + * Copyright (c) 2019, Guenter Roeck
>> + */
>> + >> +#include <linux/hwmon.h>
>> +
>> +#include "nvme.h"
>> +
>> +struct nvme_hwmon_data {
>> +       struct nvme_ctrl *ctrl;
>> +       struct nvme_smart_log log;
>> +};
>> +
>> +static int nvme_hwmon_get_smart_log(struct nvme_hwmon_data *data)
>> +{
>> +       return nvme_get_log(data->ctrl, NVME_NSID_ALL, NVME_LOG_SMART, 0,
>> +                           &data->log, sizeof(data->log), 0);
>> +}
> 
> The 'data->log' is allocated per nvme_ctrl, so are there any locks to
> prevent multiple callers of nvme_hwmon_get_smart_log() from breaking
> the log buffer?
> 
Good point. This needs either local memory like in your patch, or
I'll need a lock. I prefer a lock, though I am open to suggestions.

>> +
>> +static int nvme_hwmon_read(struct device *dev, enum hwmon_sensor_types type,
>> +                          u32 attr, int channel, long *val)
>> +{
>> +       struct nvme_hwmon_data *data = dev_get_drvdata(dev);
>> +       struct nvme_smart_log *log = &data->log;
>> +       int err;
>> +       int temp;
>> +
>> +       err = nvme_hwmon_get_smart_log(data);
>> +       if (err)
>> +               return err < 0 ? err : -EPROTO;
>> +
>> +       switch (attr) {
>> +       case hwmon_temp_max:
>> +               *val = (data->ctrl->wctemp - 273) * 1000;
>> +               break;
>> +       case hwmon_temp_crit:
>> +               *val = (data->ctrl->cctemp - 273) * 1000;
>> +               break;
> 
> When this function is called with 'hwmon_temp_max' or 'hwmon_temp_crit',
> we don't need to call nvme_hwmon_get_smart_log() at all, do we?
> 

Another good point.

Thanks,
Guenter

>> +       case hwmon_temp_input:
>> +               if (!channel)
>> +                       temp = le16_to_cpup((__le16 *)log->temperature);
>> +               else
>> +                       temp = le16_to_cpu(log->temp_sensor[channel - 1]);
>> +               *val = (temp - 273) * 1000;
>> +               break;
>> +       case hwmon_temp_crit_alarm:
>> +               *val = !!(log->critical_warning & NVME_SMART_CRIT_TEMPERATURE);
>> +               break;
>> +       default:
>> +               err = -EOPNOTSUPP;
>> +               break;
>> +       }
>> +       return err;
>> +}
>
Guenter Roeck Oct. 31, 2019, 2:54 a.m. UTC | #7
On 10/30/19 7:05 AM, Christoph Hellwig wrote:
> On Wed, Oct 30, 2019 at 08:16:48PM +0900, Akinobu Mita wrote:
>> The nvme_init_identify() can be called multiple time in nvme ctrl's
>> lifetime (e.g 'nvme reset /dev/nvme*' or suspend/resume paths), so
>> should we need to prevent nvme_hwmon_init() from registering hwmon
>> device more than twice?
>>
>> In the nvme thermal zone patchset[1], thernal zone is registered in
>> nvme_init_identify and unregistered in nvme_stop_ctrl().
> 
> So Guenter said above the thermal subsystem could use the information
> from hwmon as well.  Does this mean this patch would solve your needs
> as well?
> 
Depends on the requirements. Unlike hwmon/iio, we don't have clear
guidelines describing when thermal vs. hwmon would be a better choice.
There is some interconnect between thermal and hwmon, but quite often
it is a one-way street (hwmon devices can easily register thermal
zones, for thermal zone devices it is a bit more difficult to register
associated hwmon devices).

For the most part, peripherals (memory, network devices, video
controllers, real time clocks, etc) are today handled by the hardware
monitoring subsystem. The one notable exception is the ath10k wireless
controller, but even that registers both a thermal device and a hardware
monitoring device. Sometimes peripheral devices tell the hardware
monitoring subsystem that it should also register thermal zones (I
would guess that ath10k doesn't do that because the mechanism didn't
exist back in 2014). On the other side, SoCs typically register
thermal zones and rarely register as hardware monitoring device.

Guenter
Akinobu Mita Oct. 31, 2019, 1:44 p.m. UTC | #8
2019年10月31日(木) 11:20 Guenter Roeck <linux@roeck-us.net>:
>
> On 10/30/19 4:16 AM, Akinobu Mita wrote:
> > 2019年10月30日(水) 7:32 Guenter Roeck <linux@roeck-us.net>:
> >>
> >> nvme devices report temperature information in the controller information
> >> (for limits) and in the smart log. Currently, the only means to retrieve
> >> this information is the nvme command line interface, which requires
> >> super-user privileges.
> >>
> >> At the same time, it would be desirable to use NVME temperature information
> >> for thermal control.
> >>
> >> This patch adds support to read NVME temperatures from the kernel using the
> >> hwmon API and adds temperature zones for NVME drives. The thermal subsystem
> >> can use this information to set thermal policies, and userspace can access
> >> it using libsensors and/or the "sensors" command.
> >>
> >> Example output from the "sensors" command:
> >>
> >> nvme0-pci-0100
> >> Adapter: PCI adapter
> >> Composite:    +39.0°C  (high = +85.0°C, crit = +85.0°C)
> >> Sensor 1:     +39.0°C
> >> Sensor 2:     +41.0°C
> >>
> >> Signed-off-by: Guenter Roeck <linux@roeck-us.net>
> >> ---
> >> v2: Use devm_kfree() to release memory in error path
> >>
> >>   drivers/nvme/host/Kconfig      |  10 ++
> >>   drivers/nvme/host/Makefile     |   1 +
> >>   drivers/nvme/host/core.c       |   5 +
> >>   drivers/nvme/host/nvme-hwmon.c | 163 +++++++++++++++++++++++++++++++++
> >>   drivers/nvme/host/nvme.h       |   8 ++
> >>   5 files changed, 187 insertions(+)
> >>   create mode 100644 drivers/nvme/host/nvme-hwmon.c
> >>
> >> diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
> >> index 2b36f052bfb9..aeb49e16e386 100644
> >> --- a/drivers/nvme/host/Kconfig
> >> +++ b/drivers/nvme/host/Kconfig
> >> @@ -23,6 +23,16 @@ config NVME_MULTIPATH
> >>             /dev/nvmeXnY device will show up for each NVMe namespaces,
> >>             even if it is accessible through multiple controllers.
> >>
> >> +config NVME_HWMON
> >> +       bool "NVME hardware monitoring"
> >> +       depends on (NVME_CORE=y && HWMON=y) || (NVME_CORE=m && HWMON)
> >> +       help
> >> +         This provides support for NVME hardware monitoring. If enabled,
> >> +         a hardware monitoring device will be created for each NVME drive
> >> +         in the system.
> >> +
> >> +         If unsure, say N.
> >> +
> >>   config NVME_FABRICS
> >>          tristate
> >>
> >> diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
> >> index 8a4b671c5f0c..03de4797a877 100644
> >> --- a/drivers/nvme/host/Makefile
> >> +++ b/drivers/nvme/host/Makefile
> >> @@ -14,6 +14,7 @@ nvme-core-$(CONFIG_TRACING)           += trace.o
> >>   nvme-core-$(CONFIG_NVME_MULTIPATH)     += multipath.o
> >>   nvme-core-$(CONFIG_NVM)                        += lightnvm.o
> >>   nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS)   += fault_inject.o
> >> +nvme-core-$(CONFIG_NVME_HWMON)         += nvme-hwmon.o
> >>
> >>   nvme-y                                 += pci.o
> >>
> >> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> >> index fa7ba09dca77..fc1d4b146717 100644
> >> --- a/drivers/nvme/host/core.c
> >> +++ b/drivers/nvme/host/core.c
> >> @@ -2796,6 +2796,9 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
> >>          ctrl->oncs = le16_to_cpu(id->oncs);
> >>          ctrl->mtfa = le16_to_cpu(id->mtfa);
> >>          ctrl->oaes = le32_to_cpu(id->oaes);
> >> +       ctrl->wctemp = le16_to_cpu(id->wctemp);
> >> +       ctrl->cctemp = le16_to_cpu(id->cctemp);
> >> +
> >>          atomic_set(&ctrl->abort_limit, id->acl + 1);
> >>          ctrl->vwc = id->vwc;
> >>          if (id->mdts)
> >> @@ -2897,6 +2900,8 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
> >>
> >>          ctrl->identified = true;
> >>
> >> +       nvme_hwmon_init(ctrl);
> >> +
> >>          return 0;
> >>
> >>   out_free:
> >
> > The nvme_init_identify() can be called multiple time in nvme ctrl's
> > lifetime (e.g 'nvme reset /dev/nvme*' or suspend/resume paths), so
> > should we need to prevent nvme_hwmon_init() from registering hwmon
> > device more than twice?
> >
> > In the nvme thermal zone patchset[1], thernal zone is registered in
> > nvme_init_identify and unregistered in nvme_stop_ctrl().
> >
>
> Doesn't that mean that the initialization should happen in nvme_start_ctrl()
> and not here ?

Seems possible.  But I would like to ask maintainers' opinion.
Christoph Hellwig Oct. 31, 2019, 1:45 p.m. UTC | #9
On Wed, Oct 30, 2019 at 07:20:37PM -0700, Guenter Roeck wrote:
>> The nvme_init_identify() can be called multiple time in nvme ctrl's
>> lifetime (e.g 'nvme reset /dev/nvme*' or suspend/resume paths), so
>> should we need to prevent nvme_hwmon_init() from registering hwmon
>> device more than twice?
>>
>> In the nvme thermal zone patchset[1], thernal zone is registered in
>> nvme_init_identify and unregistered in nvme_stop_ctrl().
>>
>
> Doesn't that mean that the initialization should happen in nvme_start_ctrl()
> and not here ?

I think calling it from nvme_init_identify is fine, it just needs to
be in the "if (!ctrl->identified)" section of that function.
Christoph Hellwig Oct. 31, 2019, 1:46 p.m. UTC | #10
On Wed, Oct 30, 2019 at 07:54:47PM -0700, Guenter Roeck wrote:
> On 10/30/19 7:05 AM, Christoph Hellwig wrote:
>> On Wed, Oct 30, 2019 at 08:16:48PM +0900, Akinobu Mita wrote:
>>> The nvme_init_identify() can be called multiple time in nvme ctrl's
>>> lifetime (e.g 'nvme reset /dev/nvme*' or suspend/resume paths), so
>>> should we need to prevent nvme_hwmon_init() from registering hwmon
>>> device more than twice?
>>>
>>> In the nvme thermal zone patchset[1], thernal zone is registered in
>>> nvme_init_identify and unregistered in nvme_stop_ctrl().
>>
>> So Guenter said above the thermal subsystem could use the information
>> from hwmon as well.  Does this mean this patch would solve your needs
>> as well?
>>
> Depends on the requirements. Unlike hwmon/iio, we don't have clear
> guidelines describing when thermal vs. hwmon would be a better choice.
> There is some interconnect between thermal and hwmon, but quite often
> it is a one-way street (hwmon devices can easily register thermal
> zones, for thermal zone devices it is a bit more difficult to register
> associated hwmon devices).

I'd like to hear from Akinobu-san if this also solves the thermal zone
use case.  If so I'd be much happier as we at least solve two use cases
with one patch.
Akinobu Mita Oct. 31, 2019, 1:46 p.m. UTC | #11
2019年10月30日(水) 23:05 Christoph Hellwig <hch@lst.de>:
>
> On Wed, Oct 30, 2019 at 08:16:48PM +0900, Akinobu Mita wrote:
> > The nvme_init_identify() can be called multiple time in nvme ctrl's
> > lifetime (e.g 'nvme reset /dev/nvme*' or suspend/resume paths), so
> > should we need to prevent nvme_hwmon_init() from registering hwmon
> > device more than twice?
> >
> > In the nvme thermal zone patchset[1], thernal zone is registered in
> > nvme_init_identify and unregistered in nvme_stop_ctrl().
>
> So Guenter said above the thermal subsystem could use the information
> from hwmon as well.  Does this mean this patch would solve your needs
> as well?

The main reason I chose thermal framework was to utilize the temperature
threshold feature for notification on crossing a trip point temperature
without polling for smart log.

But the device I used for testing doesn't seem to report asynchronous
event immediately, so I'm not fully sure that's useful for now.

I have no problem with this nvme hwmon patch.  Maybe we can integrate
the temperature threshold feature into the nvme hwmon afterward.
Guenter Roeck Oct. 31, 2019, 5:54 p.m. UTC | #12
On Thu, Oct 31, 2019 at 02:45:49PM +0100, Christoph Hellwig wrote:
> On Wed, Oct 30, 2019 at 07:20:37PM -0700, Guenter Roeck wrote:
> >> The nvme_init_identify() can be called multiple time in nvme ctrl's
> >> lifetime (e.g 'nvme reset /dev/nvme*' or suspend/resume paths), so
> >> should we need to prevent nvme_hwmon_init() from registering hwmon
> >> device more than twice?
> >>
> >> In the nvme thermal zone patchset[1], thernal zone is registered in
> >> nvme_init_identify and unregistered in nvme_stop_ctrl().
> >>
> >
> > Doesn't that mean that the initialization should happen in nvme_start_ctrl()
> > and not here ?
> 
> I think calling it from nvme_init_identify is fine, it just needs to
> be in the "if (!ctrl->identified)" section of that function.

Excellent, I'll do that. Thanks a lot for the hint!

Guenter
Pavel Machek Nov. 6, 2019, 9:29 p.m. UTC | #13
Hi!

> > nvme devices report temperature information in the controller information
> > (for limits) and in the smart log. Currently, the only means to retrieve
> > this information is the nvme command line interface, which requires
> > super-user privileges.
> > 
> > At the same time, it would be desirable to use NVME temperature information
> > for thermal control.
> > 
> > This patch adds support to read NVME temperatures from the kernel using the
> > hwmon API and adds temperature zones for NVME drives. The thermal subsystem
> > can use this information to set thermal policies, and userspace can access
> > it using libsensors and/or the "sensors" command.
> > 
> > Example output from the "sensors" command:
> > 
> > nvme0-pci-0100
> > Adapter: PCI adapter
> > Composite:    +39.0°C  (high = +85.0°C, crit = +85.0°C)
> > Sensor 1:     +39.0°C
> > Sensor 2:     +41.0°C
> > 
> > Signed-off-by: Guenter Roeck <linux@roeck-us.net>
> 
> This looks fine to me, but I'll wait a few more days to see if there are
> any additional comments..

User wants to know temperature of /dev/sda... and we already have an
userspace tools knowing about smart, etc...

pavel@amd:/data/film$ sudo hddtemp /dev/sda
/dev/sda: ST1000LM014-1EJ164: 48°C

I see we also have sensors framework but it does _not_ handle
harddrive temperatures.

Does it need some kind of unification? Should NVMe devices expose
"SMART" information in the same way other SSDs do?

Best regards,
								Pavel
Guenter Roeck Nov. 6, 2019, 10:30 p.m. UTC | #14
On Wed, Nov 06, 2019 at 10:29:21PM +0100, Pavel Machek wrote:
> Hi!
> 
> > > nvme devices report temperature information in the controller information
> > > (for limits) and in the smart log. Currently, the only means to retrieve
> > > this information is the nvme command line interface, which requires
> > > super-user privileges.
> > > 
> > > At the same time, it would be desirable to use NVME temperature information
> > > for thermal control.
> > > 
> > > This patch adds support to read NVME temperatures from the kernel using the
> > > hwmon API and adds temperature zones for NVME drives. The thermal subsystem
> > > can use this information to set thermal policies, and userspace can access
> > > it using libsensors and/or the "sensors" command.
> > > 
> > > Example output from the "sensors" command:
> > > 
> > > nvme0-pci-0100
> > > Adapter: PCI adapter
> > > Composite:    +39.0°C  (high = +85.0°C, crit = +85.0°C)
> > > Sensor 1:     +39.0°C
> > > Sensor 2:     +41.0°C
> > > 
> > > Signed-off-by: Guenter Roeck <linux@roeck-us.net>
> > 
> > This looks fine to me, but I'll wait a few more days to see if there are
> > any additional comments..
> 
> User wants to know temperature of /dev/sda... and we already have an
> userspace tools knowing about smart, etc...
> 
> pavel@amd:/data/film$ sudo hddtemp /dev/sda
> /dev/sda: ST1000LM014-1EJ164: 48°C
> 
> I see we also have sensors framework but it does _not_ handle
> harddrive temperatures.
> 
> Does it need some kind of unification? Should NVMe devices expose
> "SMART" information in the same way other SSDs do?
> 

The unification to report hardware monitoring information to userspace
is called the sensors framework. Also, users in general prefer to not
have to run "sudo" to get such information.

Guenter
Chris Healy Nov. 6, 2019, 11:58 p.m. UTC | #15
> > > Signed-off-by: Guenter Roeck <linux@roeck-us.net>
> >
> > This looks fine to me, but I'll wait a few more days to see if there are
> > any additional comments..
>
> User wants to know temperature of /dev/sda... and we already have an
> userspace tools knowing about smart, etc...

At least for my use cases, being able to use the thermal subsystem of
the kernel to cool things down when the SSD gets too hot is important.
Is there a way to do this with userspace tools feeding back to the
kernel's thermal subsystem?

>
> pavel@amd:/data/film$ sudo hddtemp /dev/sda
> /dev/sda: ST1000LM014-1EJ164: 48°C
>
> I see we also have sensors framework but it does _not_ handle
> harddrive temperatures.
>
> Does it need some kind of unification? Should NVMe devices expose
> "SMART" information in the same way other SSDs do?
>
> Best regards,
>                                                                 Pavel
> --
> (english) http://www.livejournal.com/~pavelmachek
> (cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
diff mbox series

Patch

diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index 2b36f052bfb9..aeb49e16e386 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -23,6 +23,16 @@  config NVME_MULTIPATH
 	   /dev/nvmeXnY device will show up for each NVMe namespaces,
 	   even if it is accessible through multiple controllers.
 
+config NVME_HWMON
+	bool "NVME hardware monitoring"
+	depends on (NVME_CORE=y && HWMON=y) || (NVME_CORE=m && HWMON)
+	help
+	  This provides support for NVME hardware monitoring. If enabled,
+	  a hardware monitoring device will be created for each NVME drive
+	  in the system.
+
+	  If unsure, say N.
+
 config NVME_FABRICS
 	tristate
 
diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
index 8a4b671c5f0c..03de4797a877 100644
--- a/drivers/nvme/host/Makefile
+++ b/drivers/nvme/host/Makefile
@@ -14,6 +14,7 @@  nvme-core-$(CONFIG_TRACING)		+= trace.o
 nvme-core-$(CONFIG_NVME_MULTIPATH)	+= multipath.o
 nvme-core-$(CONFIG_NVM)			+= lightnvm.o
 nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS)	+= fault_inject.o
+nvme-core-$(CONFIG_NVME_HWMON)		+= nvme-hwmon.o
 
 nvme-y					+= pci.o
 
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index fa7ba09dca77..fc1d4b146717 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2796,6 +2796,9 @@  int nvme_init_identify(struct nvme_ctrl *ctrl)
 	ctrl->oncs = le16_to_cpu(id->oncs);
 	ctrl->mtfa = le16_to_cpu(id->mtfa);
 	ctrl->oaes = le32_to_cpu(id->oaes);
+	ctrl->wctemp = le16_to_cpu(id->wctemp);
+	ctrl->cctemp = le16_to_cpu(id->cctemp);
+
 	atomic_set(&ctrl->abort_limit, id->acl + 1);
 	ctrl->vwc = id->vwc;
 	if (id->mdts)
@@ -2897,6 +2900,8 @@  int nvme_init_identify(struct nvme_ctrl *ctrl)
 
 	ctrl->identified = true;
 
+	nvme_hwmon_init(ctrl);
+
 	return 0;
 
 out_free:
diff --git a/drivers/nvme/host/nvme-hwmon.c b/drivers/nvme/host/nvme-hwmon.c
new file mode 100644
index 000000000000..af5eda326ec6
--- /dev/null
+++ b/drivers/nvme/host/nvme-hwmon.c
@@ -0,0 +1,163 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NVM Express hardware monitoring support
+ * Copyright (c) 2019, Guenter Roeck
+ */
+
+#include <linux/hwmon.h>
+
+#include "nvme.h"
+
+struct nvme_hwmon_data {
+	struct nvme_ctrl *ctrl;
+	struct nvme_smart_log log;
+};
+
+static int nvme_hwmon_get_smart_log(struct nvme_hwmon_data *data)
+{
+	return nvme_get_log(data->ctrl, NVME_NSID_ALL, NVME_LOG_SMART, 0,
+			    &data->log, sizeof(data->log), 0);
+}
+
+static int nvme_hwmon_read(struct device *dev, enum hwmon_sensor_types type,
+			   u32 attr, int channel, long *val)
+{
+	struct nvme_hwmon_data *data = dev_get_drvdata(dev);
+	struct nvme_smart_log *log = &data->log;
+	int err;
+	int temp;
+
+	err = nvme_hwmon_get_smart_log(data);
+	if (err)
+		return err < 0 ? err : -EPROTO;
+
+	switch (attr) {
+	case hwmon_temp_max:
+		*val = (data->ctrl->wctemp - 273) * 1000;
+		break;
+	case hwmon_temp_crit:
+		*val = (data->ctrl->cctemp - 273) * 1000;
+		break;
+	case hwmon_temp_input:
+		if (!channel)
+			temp = le16_to_cpup((__le16 *)log->temperature);
+		else
+			temp = le16_to_cpu(log->temp_sensor[channel - 1]);
+		*val = (temp - 273) * 1000;
+		break;
+	case hwmon_temp_crit_alarm:
+		*val = !!(log->critical_warning & NVME_SMART_CRIT_TEMPERATURE);
+		break;
+	default:
+		err = -EOPNOTSUPP;
+		break;
+	}
+	return err;
+}
+
+static const char * const nvme_hwmon_sensor_names[] = {
+	"Composite",
+	"Sensor 1",
+	"Sensor 2",
+	"Sensor 3",
+	"Sensor 4",
+	"Sensor 5",
+	"Sensor 6",
+	"Sensor 7",
+	"Sensor 8",
+};
+
+static int nvme_hwmon_read_string(struct device *dev,
+				  enum hwmon_sensor_types type, u32 attr,
+				  int channel, const char **str)
+{
+	*str = nvme_hwmon_sensor_names[channel];
+	return 0;
+}
+
+static umode_t nvme_hwmon_is_visible(const void *_data,
+				     enum hwmon_sensor_types type,
+				     u32 attr, int channel)
+{
+	const struct nvme_hwmon_data *data = _data;
+
+	switch (attr) {
+	case hwmon_temp_crit:
+		if (!channel && data->ctrl->cctemp)
+			return 0444;
+		break;
+	case hwmon_temp_max:
+		if (!channel && data->ctrl->wctemp)
+			return 0444;
+		break;
+	case hwmon_temp_crit_alarm:
+		if (!channel)
+			return 0444;
+		break;
+	case hwmon_temp_input:
+	case hwmon_temp_label:
+		if (!channel || data->log.temp_sensor[channel - 1])
+			return 0444;
+		break;
+	default:
+		break;
+	}
+	return 0;
+}
+
+static const struct hwmon_channel_info *nvme_hwmon_info[] = {
+	HWMON_CHANNEL_INFO(chip, HWMON_C_REGISTER_TZ),
+	HWMON_CHANNEL_INFO(temp,
+			   HWMON_T_INPUT | HWMON_T_MAX | HWMON_T_CRIT |
+				HWMON_T_LABEL | HWMON_T_CRIT_ALARM,
+			   HWMON_T_INPUT | HWMON_T_LABEL,
+			   HWMON_T_INPUT | HWMON_T_LABEL,
+			   HWMON_T_INPUT | HWMON_T_LABEL,
+			   HWMON_T_INPUT | HWMON_T_LABEL,
+			   HWMON_T_INPUT | HWMON_T_LABEL,
+			   HWMON_T_INPUT | HWMON_T_LABEL,
+			   HWMON_T_INPUT | HWMON_T_LABEL,
+			   HWMON_T_INPUT | HWMON_T_LABEL),
+	NULL
+};
+
+static const struct hwmon_ops nvme_hwmon_ops = {
+	.is_visible = nvme_hwmon_is_visible,
+	.read = nvme_hwmon_read,
+	.read_string = nvme_hwmon_read_string,
+};
+
+static const struct hwmon_chip_info nvme_hwmon_chip_info = {
+	.ops = &nvme_hwmon_ops,
+	.info = nvme_hwmon_info,
+};
+
+void nvme_hwmon_init(struct nvme_ctrl *ctrl)
+{
+	struct device *dev = ctrl->device;
+	struct nvme_hwmon_data *data;
+	struct device *hwmon;
+	int err;
+
+	data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return;
+
+	data->ctrl = ctrl;
+
+	err = nvme_hwmon_get_smart_log(data);
+	if (err) {
+		dev_warn(dev, "Failed to read smart log (error %d)\n", err);
+		devm_kfree(dev, data);
+		return;
+	}
+
+	hwmon = devm_hwmon_device_register_with_info(dev, dev_name(dev),
+						     data,
+						     &nvme_hwmon_chip_info,
+						     NULL);
+	if (IS_ERR(hwmon)) {
+		dev_warn(dev, "Failed to instantiate hwmon device\n");
+		devm_kfree(dev, data);
+	}
+}
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 22e8401352c2..e6460c1216bc 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -231,6 +231,8 @@  struct nvme_ctrl {
 	u16 kas;
 	u8 npss;
 	u8 apsta;
+	u16 wctemp;
+	u16 cctemp;
 	u32 oaes;
 	u32 aen_result;
 	u32 ctratt;
@@ -652,4 +654,10 @@  static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev)
 	return dev_to_disk(dev)->private_data;
 }
 
+#if IS_ENABLED(CONFIG_NVME_HWMON)
+void nvme_hwmon_init(struct nvme_ctrl *ctrl);
+#else
+static inline void nvme_hwmon_init(struct nvme_ctrl *ctrl) { }
+#endif
+
 #endif /* _NVME_H */