[v3,3/3] nvme: notify thermal framework when temperature threshold events occur
diff mbox series

Message ID 1558888143-5121-4-git-send-email-akinobu.mita@gmail.com
State Changes Requested
Delegated to: Eduardo Valentin
Headers show
Series
  • nvme: add thermal zone devices
Related show

Commit Message

Akinobu Mita May 26, 2019, 4:29 p.m. UTC
The NVMe controller supports the temperature threshold feature (Feature
Identifier 04h) that enables to configure the asynchronous event request
command to complete when the temperature is crossed its corresponding
temperature threshold.

This enables the reporting of asynchronous events from the controller when
the temperature reached or exceeded a temperature threshold.

In the case of the temperature threshold conditions, this notifies the
thermal framework.

Cc: Zhang Rui <rui.zhang@intel.com>
Cc: Eduardo Valentin <edubezval@gmail.com>
Cc: Daniel Lezcano <daniel.lezcano@linaro.org>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Jens Axboe <axboe@fb.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Sagi Grimberg <sagi@grimberg.me>
Cc: Minwoo Im <minwoo.im.dev@gmail.com>
Cc: Kenneth Heitke <kenneth.heitke@intel.com>
Cc: Chaitanya Kulkarni <Chaitanya.Kulkarni@wdc.com>
Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
---
* v3
- No changes since v2

 drivers/nvme/host/core.c | 28 ++++++++++++++++++++++++++++
 include/linux/nvme.h     |  7 +++++++
 2 files changed, 35 insertions(+)

Comments

Christoph Hellwig June 1, 2019, 9:03 a.m. UTC | #1
So before we allowed userspace to get smart AEN notifications through
uevent, and would expect userspace to clear the AEN.  I think this
could at least potentially break existing userspace by now doing that
in kernel space.
Akinobu Mita June 2, 2019, 1:46 p.m. UTC | #2
2019年6月1日(土) 18:04 Christoph Hellwig <hch@lst.de>:
>
> So before we allowed userspace to get smart AEN notifications through
> uevent, and would expect userspace to clear the AEN.  I think this
> could at least potentially break existing userspace by now doing that
> in kernel space.

This change unconditionally sets NVME_SMART_CRIT_TEMPERATURE flag in
nvme_enable_aen(), it could be problematic for existing userspace.
So it's better to provide a knob to enable/disable the event notification
and we can utilize get_mode()/set_mode() in the thermal_zone_device_ops.

Other than that, this change doesn't remove ctrl->aen_result, so existing
userspace still receives NVME_AEN=* uevents.
Christoph Hellwig June 4, 2019, 7:32 a.m. UTC | #3
On Sun, Jun 02, 2019 at 10:46:12PM +0900, Akinobu Mita wrote:
> 2019年6月1日(土) 18:04 Christoph Hellwig <hch@lst.de>:
> >
> > So before we allowed userspace to get smart AEN notifications through
> > uevent, and would expect userspace to clear the AEN.  I think this
> > could at least potentially break existing userspace by now doing that
> > in kernel space.
> 
> This change unconditionally sets NVME_SMART_CRIT_TEMPERATURE flag in
> nvme_enable_aen(), it could be problematic for existing userspace.
> So it's better to provide a knob to enable/disable the event notification
> and we can utilize get_mode()/set_mode() in the thermal_zone_device_ops.
> 
> Other than that, this change doesn't remove ctrl->aen_result, so existing
> userspace still receives NVME_AEN=* uevents.

And that is the problem.  Because for notifications we expect userspace to
read the log page and clear the event, while we are not doing it in kernel
space.

Patch
diff mbox series

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 4c8271a..26c8b59 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1186,6 +1186,9 @@  static void nvme_enable_aen(struct nvme_ctrl *ctrl)
 	u32 result, supported_aens = ctrl->oaes & NVME_AEN_SUPPORTED;
 	int status;
 
+	if (IS_ENABLED(CONFIG_THERMAL))
+		supported_aens |= NVME_SMART_CRIT_TEMPERATURE;
+
 	if (!supported_aens)
 		return;
 
@@ -2470,6 +2473,16 @@  static void nvme_thermal_zones_unregister(struct nvme_ctrl *ctrl)
 	}
 }
 
+static void nvme_thermal_notify_framework(struct nvme_ctrl *ctrl)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(ctrl->tzdev); i++) {
+		if (ctrl->tzdev[i])
+			thermal_notify_framework(ctrl->tzdev[i], 0);
+	}
+}
+
 #else
 
 static inline int nvme_thermal_zones_register(struct nvme_ctrl *ctrl)
@@ -2481,6 +2494,10 @@  static inline void nvme_thermal_zones_unregister(struct nvme_ctrl *ctrl)
 {
 }
 
+static void nvme_thermal_notify_framework(struct nvme_ctrl *ctrl)
+{
+}
+
 #endif /* CONFIG_THERMAL */
 
 struct nvme_core_quirk_entry {
@@ -3901,6 +3918,16 @@  void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
 }
 EXPORT_SYMBOL_GPL(nvme_remove_namespaces);
 
+static void nvme_handle_aen_smart(struct nvme_ctrl *ctrl, u32 result)
+{
+	u32 aer_type = result & NVME_AER_TYPE_MASK;
+	u32 aer_info = (result >> NVME_AER_INFO_SHIFT) & NVME_AER_INFO_MASK;
+
+	if (aer_type == NVME_AER_SMART &&
+	    aer_info == NVME_AER_SMART_TEMP_THRESH)
+		nvme_thermal_notify_framework(ctrl);
+}
+
 static void nvme_aen_uevent(struct nvme_ctrl *ctrl)
 {
 	char *envp[2] = { NULL, NULL };
@@ -3922,6 +3949,7 @@  static void nvme_async_event_work(struct work_struct *work)
 	struct nvme_ctrl *ctrl =
 		container_of(work, struct nvme_ctrl, async_event_work);
 
+	nvme_handle_aen_smart(ctrl, ctrl->aen_result);
 	nvme_aen_uevent(ctrl);
 	ctrl->ops->submit_async_event(ctrl);
 }
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 54f0a13..8e7d599 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -507,6 +507,7 @@  enum {
 };
 
 enum {
+	NVME_AER_TYPE_MASK		= 0x7,
 	NVME_AER_ERROR			= 0,
 	NVME_AER_SMART			= 1,
 	NVME_AER_NOTICE			= 2,
@@ -515,6 +516,12 @@  enum {
 };
 
 enum {
+	NVME_AER_INFO_SHIFT		= 8,
+	NVME_AER_INFO_MASK		= 0xff,
+	NVME_AER_SMART_TEMP_THRESH	= 0x01,
+};
+
+enum {
 	NVME_AER_NOTICE_NS_CHANGED	= 0x00,
 	NVME_AER_NOTICE_FW_ACT_STARTING = 0x01,
 	NVME_AER_NOTICE_ANA		= 0x03,