diff mbox series

[3/3] thermal/drivers/cpu_cooling: Introduce the cpu idle cooling driver

Message ID 20191113084042.5707-3-daniel.lezcano@linaro.org (mailing list archive)
State New, archived
Delegated to: Eduardo Valentin
Headers show
Series [1/3] thermal/drivers/Kconfig: Convert the CPU cooling device to a choice | expand

Commit Message

Daniel Lezcano Nov. 13, 2019, 8:40 a.m. UTC
The cpu idle cooling device offers a new method to cool down a CPU by
injecting idle cycles at runtime.

It has some similarities with the intel power clamp driver but it is
actually designed to be more generic and relying on the idle injection
powercap framework.

The idle injection cycle is fixed while the running cycle is variable. That
allows to have control on the device reactivity for the user experience.

An idle state powering down the CPU or the cluster will allow to drop
the static leakage, thus restoring the heat capacity of the SoC. It
can be set with a trip point between the hot and the critical points,
giving the opportunity to prevent a hard reset of the system when the
cpufreq cooling fails to cool down the CPU.

With more sophisticated boards having a per core sensor, the idle
cooling device allows to cool down a single core without throttling
the compute capacity of several cpus belonging to the same clock line,
so it could be used in collaboration with the cpufreq cooling device.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/Kconfig           |   7 +
 drivers/thermal/Makefile          |   1 +
 drivers/thermal/cpuidle_cooling.c | 233 ++++++++++++++++++++++++++++++
 include/linux/cpu_cooling.h       |  22 +++
 4 files changed, 263 insertions(+)
 create mode 100644 drivers/thermal/cpuidle_cooling.c

Comments

Viresh Kumar Nov. 26, 2019, 8:03 a.m. UTC | #1
On 13-11-19, 09:40, Daniel Lezcano wrote:
> The cpu idle cooling device offers a new method to cool down a CPU by
> injecting idle cycles at runtime.
> 
> It has some similarities with the intel power clamp driver but it is
> actually designed to be more generic and relying on the idle injection
> powercap framework.
> 
> The idle injection cycle is fixed while the running cycle is variable. That
> allows to have control on the device reactivity for the user experience.
> 
> An idle state powering down the CPU or the cluster will allow to drop
> the static leakage, thus restoring the heat capacity of the SoC. It
> can be set with a trip point between the hot and the critical points,
> giving the opportunity to prevent a hard reset of the system when the
> cpufreq cooling fails to cool down the CPU.
> 
> With more sophisticated boards having a per core sensor, the idle
> cooling device allows to cool down a single core without throttling
> the compute capacity of several cpus belonging to the same clock line,
> so it could be used in collaboration with the cpufreq cooling device.
> 
> Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
> ---
>  drivers/thermal/Kconfig           |   7 +
>  drivers/thermal/Makefile          |   1 +
>  drivers/thermal/cpuidle_cooling.c | 233 ++++++++++++++++++++++++++++++
>  include/linux/cpu_cooling.h       |  22 +++
>  4 files changed, 263 insertions(+)
>  create mode 100644 drivers/thermal/cpuidle_cooling.c
> 
> diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig
> index 2b82c4861091..00d69906c508 100644
> --- a/drivers/thermal/Kconfig
> +++ b/drivers/thermal/Kconfig
> @@ -168,6 +168,13 @@ config CPU_FREQ_THERMAL
>  	  This will be useful for platforms using the generic thermal interface
>  	  and not the ACPI interface.
>  
> +config CPU_IDLE_THERMAL
> +	bool "CPU idle cooling device"
> +	depends on IDLE_INJECT
> +	help
> +	  This implements the CPU cooling mechanism through
> +	  idle injection. This will throttle the CPU by injecting
> +	  idle cycle.
>  endif
>  
>  config CLOCK_THERMAL
> diff --git a/drivers/thermal/Makefile b/drivers/thermal/Makefile
> index d3b01cc96981..9c8aa2d4bd28 100644
> --- a/drivers/thermal/Makefile
> +++ b/drivers/thermal/Makefile
> @@ -20,6 +20,7 @@ thermal_sys-$(CONFIG_THERMAL_GOV_POWER_ALLOCATOR)	+= power_allocator.o
>  
>  # cpufreq cooling
>  thermal_sys-$(CONFIG_CPU_FREQ_THERMAL)	+= cpu_cooling.o

We should really rename this as cpufreq_cooling now :)

> +thermal_sys-$(CONFIG_CPU_IDLE_THERMAL)	+= cpuidle_cooling.o
>  
>  # clock cooling
>  thermal_sys-$(CONFIG_CLOCK_THERMAL)	+= clock_cooling.o
> diff --git a/drivers/thermal/cpuidle_cooling.c b/drivers/thermal/cpuidle_cooling.c
> new file mode 100644
> index 000000000000..6e911fa87c47
> --- /dev/null
> +++ b/drivers/thermal/cpuidle_cooling.c
> @@ -0,0 +1,233 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + *  Copyright (C) 2019 Linaro Limited.
> + *
> + *  Author: Daniel Lezcano <daniel.lezcano@linaro.org>
> + *
> + */
> +#include <linux/cpu_cooling.h>
> +#include <linux/cpuidle.h>
> +#include <linux/err.h>
> +#include <linux/idle_inject.h>
> +#include <linux/idr.h>
> +#include <linux/slab.h>
> +#include <linux/thermal.h>
> +
> +/**
> + * struct cpuidle_cooling_device - data for the idle cooling device
> + * @ii_dev: an atomic to keep track of the last task exiting the idle cycle
> + * @idle_duration_us: an integer defining the duration of the idle injection
> + * @state: an normalized integer giving the state of the cooling device
> + */
> +struct cpuidle_cooling_device {
> +	struct idle_inject_device *ii_dev;
> +	unsigned int idle_duration_us;

This field is set with TICK_USEC and nothing else. Why not just use TICK_USEC
instead at all the places and remove this field?

> +	unsigned long state;
> +};
> +
> +static DEFINE_IDA(cpuidle_ida);
> +
> +/**
> + * cpuidle_cooling_runtime - Running time computation
> + * @idle_duration_us: the idle cooling device
> + * @state: a percentile based number
> + *
> + * The running duration is computed from the idle injection duration
> + * which is fixed. If we reach 100% of idle injection ratio, that
> + * means the running duration is zero. If we have a 50% ratio
> + * injection, that means we have equal duration for idle and for
> + * running duration.
> + *
> + * The formula is deduced as the following:
> + *
> + *  running = idle x ((100 / ratio) - 1)
> + *
> + * For precision purpose for integer math, we use the following:
> + *
> + *  running = (idle x 100) / ratio - idle
> + *
> + * For example, if we have an injected duration of 50%, then we end up
> + * with 10ms of idle injection and 10ms of running duration.
> + *
> + * Returns an unsigned int for an usec based runtime duration.
> + */
> +static unsigned int cpuidle_cooling_runtime(unsigned int idle_duration_us,
> +					    unsigned long state)
> +{
> +	if (!state)
> +		return 0;
> +
> +	return ((idle_duration_us * 100) / state) - idle_duration_us;
> +}
> +
> +/**
> + * cpuidle_cooling_get_max_state - Get the maximum state
> + * @cdev  : the thermal cooling device
> + * @state : a pointer to the state variable to be filled
> + *
> + * The function always gives 100 as the injection ratio is percentile
> + * based for consistency accros different platforms.
> + *
> + * The function can not fail, it always returns zero.
> + */
> +static int cpuidle_cooling_get_max_state(struct thermal_cooling_device *cdev,
> +					 unsigned long *state)
> +{
> +	/*
> +	 * Depending on the configuration or the hardware, the running
> +	 * cycle and the idle cycle could be different. We want unify
> +	 * that to an 0..100 interval, so the set state interface will
> +	 * be the same whatever the platform is.
> +	 *
> +	 * The state 100% will make the cluster 100% ... idle. A 0%
> +	 * injection ratio means no idle injection at all and 50%
> +	 * means for 10ms of idle injection, we have 10ms of running
> +	 * time.
> +	 */
> +	*state = 100;
> +
> +	return 0;
> +}
> +
> +/**
> + * cpuidle_cooling_get_cur_state - Get the current cooling state
> + * @cdev: the thermal cooling device
> + * @state: a pointer to the state
> + *
> + * The function just copy the state value from the private thermal
> + * cooling device structure, the mapping is 1 <-> 1.
> + *
> + * The function can not fail, it always returns zero.
> + */
> +static int cpuidle_cooling_get_cur_state(struct thermal_cooling_device *cdev,
> +					 unsigned long *state)
> +{
> +	struct cpuidle_cooling_device *idle_cdev = cdev->devdata;
> +
> +	*state = idle_cdev->state;
> +
> +	return 0;
> +}
> +
> +/**
> + * cpuidle_cooling_set_cur_state - Set the current cooling state
> + * @cdev: the thermal cooling device
> + * @state: the target state
> + *
> + * The function checks first if we are initiating the mitigation which
> + * in turn wakes up all the idle injection tasks belonging to the idle
> + * cooling device. In any case, it updates the internal state for the
> + * cooling device.
> + *
> + * The function can not fail, it always returns zero.
> + */
> +static int cpuidle_cooling_set_cur_state(struct thermal_cooling_device *cdev,
> +					 unsigned long state)
> +{
> +	struct cpuidle_cooling_device *idle_cdev = cdev->devdata;
> +	struct idle_inject_device *ii_dev = idle_cdev->ii_dev;
> +	unsigned long current_state = idle_cdev->state;
> +	unsigned int runtime_us;
> +
> +	idle_cdev->state = state;
> +
> +	runtime_us = cpuidle_cooling_runtime(idle_cdev->idle_duration_us,
> +					     state);
> +
> +	idle_inject_set_duration(ii_dev, runtime_us,
> +				 idle_cdev->idle_duration_us);
> +
> +	if (current_state == 0 && state > 0) {
> +		idle_inject_start(ii_dev);
> +	} else if (current_state > 0 && !state)  {
> +		idle_inject_stop(ii_dev);
> +	}
> +
> +	return 0;
> +}
> +
> +/**
> + * cpuidle_cooling_ops - thermal cooling device ops
> + */
> +static struct thermal_cooling_device_ops cpuidle_cooling_ops = {
> +	.get_max_state = cpuidle_cooling_get_max_state,
> +	.get_cur_state = cpuidle_cooling_get_cur_state,
> +	.set_cur_state = cpuidle_cooling_set_cur_state,
> +};
> +
> +/**
> + * cpuidle_of_cooling_register - Idle cooling device initialization function
> + * @drv: a cpuidle driver structure pointer
> + *
> + * This function is in charge of creating a cooling device per cpuidle
> + * driver and register it to thermal framework.
> + *
> + * Returns a valid pointer to a thermal cooling device, a PTR_ERR
> + * corresponding to the error detected in the underlying subsystems.
> + */
> +struct thermal_cooling_device *
> +__init cpuidle_of_cooling_register(struct device_node *np,
> +				   struct cpuidle_driver *drv)
> +{
> +	struct idle_inject_device *ii_dev;
> +	struct cpuidle_cooling_device *idle_cdev;
> +	struct thermal_cooling_device *cdev;
> +	char dev_name[THERMAL_NAME_LENGTH];
> +	int id, ret;
> +
> +	ii_dev = idle_inject_register(drv->cpumask);
> +	if (IS_ERR(ii_dev)) {
> +		ret = PTR_ERR(ii_dev);
> +		goto out;
> +	}
> +

I am not sure what's the best way of doing this, but I would have done this
after allocating idle_cdev and id.

> +	idle_cdev = kzalloc(sizeof(*idle_cdev), GFP_KERNEL);
> +	if (!idle_cdev) {
> +		ret = -ENOMEM;
> +		goto out_unregister;
> +	}
> +
> +	id = ida_simple_get(&cpuidle_ida, 0, 0, GFP_KERNEL);
> +	if (id < 0) {
> +		ret = id;
> +		goto out_kfree;
> +	}
> +
> +	idle_cdev->ii_dev = ii_dev;
> +	idle_cdev->idle_duration_us = TICK_USEC;
> +
> +	snprintf(dev_name, sizeof(dev_name), "thermal-idle-%d", id);
> +
> +	cdev = thermal_of_cooling_device_register(np, dev_name, idle_cdev,
> +						  &cpuidle_cooling_ops);
> +	if (IS_ERR(cdev)) {
> +		ret = PTR_ERR(cdev);
> +		goto out_id;
> +	}
> +
> +	return cdev;
> +out_id:
> +	ida_simple_remove(&cpuidle_ida, id);
> +out_kfree:
> +	kfree(idle_cdev);
> +out_unregister:
> +	idle_inject_unregister(ii_dev);
> +out:
> +	return ERR_PTR(ret);
> +}
> +
> +/**
> + * cpuidle_cooling_register - Idle cooling device initialization function
> + * @drv: a cpuidle driver structure pointer
> + *
> + * This function is in charge of creating a cooling device per cpuidle
> + * driver and register it to thermal framework.
> + *
> + * Returns a valid pointer to a thermal cooling device, a PTR_ERR
> + * corresponding to the error detected in the underlying subsystems.
> + */
> +struct thermal_cooling_device *
> +__init cpuidle_cooling_register(struct cpuidle_driver *drv)
> +{
> +	return cpuidle_of_cooling_register(NULL, drv);
> +}
> diff --git a/include/linux/cpu_cooling.h b/include/linux/cpu_cooling.h
> index 3cdd85f987d7..7873ac2f740b 100644
> --- a/include/linux/cpu_cooling.h
> +++ b/include/linux/cpu_cooling.h
> @@ -60,4 +60,26 @@ of_cpufreq_cooling_register(struct cpufreq_policy *policy)
>  }
>  #endif /* CONFIG_CPU_FREQ_THERMAL */
>  
> +struct cpuidle_driver;
> +
> +#ifdef CONFIG_CPU_IDLE_THERMAL
> +extern struct thermal_cooling_device *
> +__init cpuidle_cooling_register(struct cpuidle_driver *drv);
> +extern struct thermal_cooling_device *
> +__init cpuidle_of_cooling_register(struct device_node *np,
> +				   struct cpuidle_driver *drv);
> +#else /* CONFIG_CPU_IDLE_THERMAL */
> +static inline struct thermal_cooling_device *
> +__init cpuidle_cooling_register(struct cpuidle_driver *drv)
> +{
> +	return ERR_PTR(-EINVAL);
> +}
> +extern struct thermal_cooling_device *
> +__init cpuidle_of_cooling_register(struct device_node *np,
> +				   struct cpuidle_driver *drv)
> +{
> +	return ERR_PTR(-EINVAL);
> +}
> +#endif /* CONFIG_CPU_IDLE_THERMAL */
> +
>  #endif /* __CPU_COOLING_H__ */
> -- 
> 2.17.1
Daniel Lezcano Dec. 2, 2019, 12:34 p.m. UTC | #2
Hi Viresh,

thanks for the review

On 26/11/2019 09:03, Viresh Kumar wrote:
> On 13-11-19, 09:40, Daniel Lezcano wrote:
>> The cpu idle cooling device offers a new method to cool down a CPU by
>> injecting idle cycles at runtime.
>>
>> It has some similarities with the intel power clamp driver but it is
>> actually designed to be more generic and relying on the idle injection
>> powercap framework.
>>
>> The idle injection cycle is fixed while the running cycle is variable. That
>> allows to have control on the device reactivity for the user experience.
>>
>> An idle state powering down the CPU or the cluster will allow to drop
>> the static leakage, thus restoring the heat capacity of the SoC. It
>> can be set with a trip point between the hot and the critical points,
>> giving the opportunity to prevent a hard reset of the system when the
>> cpufreq cooling fails to cool down the CPU.
>>
>> With more sophisticated boards having a per core sensor, the idle
>> cooling device allows to cool down a single core without throttling
>> the compute capacity of several cpus belonging to the same clock line,
>> so it could be used in collaboration with the cpufreq cooling device.
>>
>> Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
>> ---
>>  drivers/thermal/Kconfig           |   7 +
>>  drivers/thermal/Makefile          |   1 +
>>  drivers/thermal/cpuidle_cooling.c | 233 ++++++++++++++++++++++++++++++
>>  include/linux/cpu_cooling.h       |  22 +++
>>  4 files changed, 263 insertions(+)
>>  create mode 100644 drivers/thermal/cpuidle_cooling.c
>>
>> diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig
>> index 2b82c4861091..00d69906c508 100644
>> --- a/drivers/thermal/Kconfig
>> +++ b/drivers/thermal/Kconfig
>> @@ -168,6 +168,13 @@ config CPU_FREQ_THERMAL
>>  	  This will be useful for platforms using the generic thermal interface
>>  	  and not the ACPI interface.
>>  
>> +config CPU_IDLE_THERMAL
>> +	bool "CPU idle cooling device"
>> +	depends on IDLE_INJECT
>> +	help
>> +	  This implements the CPU cooling mechanism through
>> +	  idle injection. This will throttle the CPU by injecting
>> +	  idle cycle.
>>  endif
>>  
>>  config CLOCK_THERMAL
>> diff --git a/drivers/thermal/Makefile b/drivers/thermal/Makefile
>> index d3b01cc96981..9c8aa2d4bd28 100644
>> --- a/drivers/thermal/Makefile
>> +++ b/drivers/thermal/Makefile
>> @@ -20,6 +20,7 @@ thermal_sys-$(CONFIG_THERMAL_GOV_POWER_ALLOCATOR)	+= power_allocator.o
>>  
>>  # cpufreq cooling
>>  thermal_sys-$(CONFIG_CPU_FREQ_THERMAL)	+= cpu_cooling.o
> 
> We should really rename this as cpufreq_cooling now :)

Ok, will do on top of this series.

>> +thermal_sys-$(CONFIG_CPU_IDLE_THERMAL)	+= cpuidle_cooling.o
>>  
>>  # clock cooling
>>  thermal_sys-$(CONFIG_CLOCK_THERMAL)	+= clock_cooling.o
>> diff --git a/drivers/thermal/cpuidle_cooling.c b/drivers/thermal/cpuidle_cooling.c
>> new file mode 100644
>> index 000000000000..6e911fa87c47
>> --- /dev/null
>> +++ b/drivers/thermal/cpuidle_cooling.c
>> @@ -0,0 +1,233 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + *  Copyright (C) 2019 Linaro Limited.
>> + *
>> + *  Author: Daniel Lezcano <daniel.lezcano@linaro.org>
>> + *
>> + */
>> +#include <linux/cpu_cooling.h>
>> +#include <linux/cpuidle.h>
>> +#include <linux/err.h>
>> +#include <linux/idle_inject.h>
>> +#include <linux/idr.h>
>> +#include <linux/slab.h>
>> +#include <linux/thermal.h>
>> +
>> +/**
>> + * struct cpuidle_cooling_device - data for the idle cooling device
>> + * @ii_dev: an atomic to keep track of the last task exiting the idle cycle
>> + * @idle_duration_us: an integer defining the duration of the idle injection
>> + * @state: an normalized integer giving the state of the cooling device
>> + */
>> +struct cpuidle_cooling_device {
>> +	struct idle_inject_device *ii_dev;
>> +	unsigned int idle_duration_us;
> 
> This field is set with TICK_USEC and nothing else. Why not just use TICK_USEC
> instead at all the places and remove this field?

Actually the idle duration value is platform dependent and even it is
now TICK_USEC, we want to be able to change it (at boot or runtime).

I'll add a function to change the value.

[ ... ]
diff mbox series

Patch

diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig
index 2b82c4861091..00d69906c508 100644
--- a/drivers/thermal/Kconfig
+++ b/drivers/thermal/Kconfig
@@ -168,6 +168,13 @@  config CPU_FREQ_THERMAL
 	  This will be useful for platforms using the generic thermal interface
 	  and not the ACPI interface.
 
+config CPU_IDLE_THERMAL
+	bool "CPU idle cooling device"
+	depends on IDLE_INJECT
+	help
+	  This implements the CPU cooling mechanism through
+	  idle injection. This will throttle the CPU by injecting
+	  idle cycle.
 endif
 
 config CLOCK_THERMAL
diff --git a/drivers/thermal/Makefile b/drivers/thermal/Makefile
index d3b01cc96981..9c8aa2d4bd28 100644
--- a/drivers/thermal/Makefile
+++ b/drivers/thermal/Makefile
@@ -20,6 +20,7 @@  thermal_sys-$(CONFIG_THERMAL_GOV_POWER_ALLOCATOR)	+= power_allocator.o
 
 # cpufreq cooling
 thermal_sys-$(CONFIG_CPU_FREQ_THERMAL)	+= cpu_cooling.o
+thermal_sys-$(CONFIG_CPU_IDLE_THERMAL)	+= cpuidle_cooling.o
 
 # clock cooling
 thermal_sys-$(CONFIG_CLOCK_THERMAL)	+= clock_cooling.o
diff --git a/drivers/thermal/cpuidle_cooling.c b/drivers/thermal/cpuidle_cooling.c
new file mode 100644
index 000000000000..6e911fa87c47
--- /dev/null
+++ b/drivers/thermal/cpuidle_cooling.c
@@ -0,0 +1,233 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *  Copyright (C) 2019 Linaro Limited.
+ *
+ *  Author: Daniel Lezcano <daniel.lezcano@linaro.org>
+ *
+ */
+#include <linux/cpu_cooling.h>
+#include <linux/cpuidle.h>
+#include <linux/err.h>
+#include <linux/idle_inject.h>
+#include <linux/idr.h>
+#include <linux/slab.h>
+#include <linux/thermal.h>
+
+/**
+ * struct cpuidle_cooling_device - data for the idle cooling device
+ * @ii_dev: an atomic to keep track of the last task exiting the idle cycle
+ * @idle_duration_us: an integer defining the duration of the idle injection
+ * @state: an normalized integer giving the state of the cooling device
+ */
+struct cpuidle_cooling_device {
+	struct idle_inject_device *ii_dev;
+	unsigned int idle_duration_us;
+	unsigned long state;
+};
+
+static DEFINE_IDA(cpuidle_ida);
+
+/**
+ * cpuidle_cooling_runtime - Running time computation
+ * @idle_duration_us: the idle cooling device
+ * @state: a percentile based number
+ *
+ * The running duration is computed from the idle injection duration
+ * which is fixed. If we reach 100% of idle injection ratio, that
+ * means the running duration is zero. If we have a 50% ratio
+ * injection, that means we have equal duration for idle and for
+ * running duration.
+ *
+ * The formula is deduced as the following:
+ *
+ *  running = idle x ((100 / ratio) - 1)
+ *
+ * For precision purpose for integer math, we use the following:
+ *
+ *  running = (idle x 100) / ratio - idle
+ *
+ * For example, if we have an injected duration of 50%, then we end up
+ * with 10ms of idle injection and 10ms of running duration.
+ *
+ * Returns an unsigned int for an usec based runtime duration.
+ */
+static unsigned int cpuidle_cooling_runtime(unsigned int idle_duration_us,
+					    unsigned long state)
+{
+	if (!state)
+		return 0;
+
+	return ((idle_duration_us * 100) / state) - idle_duration_us;
+}
+
+/**
+ * cpuidle_cooling_get_max_state - Get the maximum state
+ * @cdev  : the thermal cooling device
+ * @state : a pointer to the state variable to be filled
+ *
+ * The function always gives 100 as the injection ratio is percentile
+ * based for consistency accros different platforms.
+ *
+ * The function can not fail, it always returns zero.
+ */
+static int cpuidle_cooling_get_max_state(struct thermal_cooling_device *cdev,
+					 unsigned long *state)
+{
+	/*
+	 * Depending on the configuration or the hardware, the running
+	 * cycle and the idle cycle could be different. We want unify
+	 * that to an 0..100 interval, so the set state interface will
+	 * be the same whatever the platform is.
+	 *
+	 * The state 100% will make the cluster 100% ... idle. A 0%
+	 * injection ratio means no idle injection at all and 50%
+	 * means for 10ms of idle injection, we have 10ms of running
+	 * time.
+	 */
+	*state = 100;
+
+	return 0;
+}
+
+/**
+ * cpuidle_cooling_get_cur_state - Get the current cooling state
+ * @cdev: the thermal cooling device
+ * @state: a pointer to the state
+ *
+ * The function just copy the state value from the private thermal
+ * cooling device structure, the mapping is 1 <-> 1.
+ *
+ * The function can not fail, it always returns zero.
+ */
+static int cpuidle_cooling_get_cur_state(struct thermal_cooling_device *cdev,
+					 unsigned long *state)
+{
+	struct cpuidle_cooling_device *idle_cdev = cdev->devdata;
+
+	*state = idle_cdev->state;
+
+	return 0;
+}
+
+/**
+ * cpuidle_cooling_set_cur_state - Set the current cooling state
+ * @cdev: the thermal cooling device
+ * @state: the target state
+ *
+ * The function checks first if we are initiating the mitigation which
+ * in turn wakes up all the idle injection tasks belonging to the idle
+ * cooling device. In any case, it updates the internal state for the
+ * cooling device.
+ *
+ * The function can not fail, it always returns zero.
+ */
+static int cpuidle_cooling_set_cur_state(struct thermal_cooling_device *cdev,
+					 unsigned long state)
+{
+	struct cpuidle_cooling_device *idle_cdev = cdev->devdata;
+	struct idle_inject_device *ii_dev = idle_cdev->ii_dev;
+	unsigned long current_state = idle_cdev->state;
+	unsigned int runtime_us;
+
+	idle_cdev->state = state;
+
+	runtime_us = cpuidle_cooling_runtime(idle_cdev->idle_duration_us,
+					     state);
+
+	idle_inject_set_duration(ii_dev, runtime_us,
+				 idle_cdev->idle_duration_us);
+
+	if (current_state == 0 && state > 0) {
+		idle_inject_start(ii_dev);
+	} else if (current_state > 0 && !state)  {
+		idle_inject_stop(ii_dev);
+	}
+
+	return 0;
+}
+
+/**
+ * cpuidle_cooling_ops - thermal cooling device ops
+ */
+static struct thermal_cooling_device_ops cpuidle_cooling_ops = {
+	.get_max_state = cpuidle_cooling_get_max_state,
+	.get_cur_state = cpuidle_cooling_get_cur_state,
+	.set_cur_state = cpuidle_cooling_set_cur_state,
+};
+
+/**
+ * cpuidle_of_cooling_register - Idle cooling device initialization function
+ * @drv: a cpuidle driver structure pointer
+ *
+ * This function is in charge of creating a cooling device per cpuidle
+ * driver and register it to thermal framework.
+ *
+ * Returns a valid pointer to a thermal cooling device, a PTR_ERR
+ * corresponding to the error detected in the underlying subsystems.
+ */
+struct thermal_cooling_device *
+__init cpuidle_of_cooling_register(struct device_node *np,
+				   struct cpuidle_driver *drv)
+{
+	struct idle_inject_device *ii_dev;
+	struct cpuidle_cooling_device *idle_cdev;
+	struct thermal_cooling_device *cdev;
+	char dev_name[THERMAL_NAME_LENGTH];
+	int id, ret;
+
+	ii_dev = idle_inject_register(drv->cpumask);
+	if (IS_ERR(ii_dev)) {
+		ret = PTR_ERR(ii_dev);
+		goto out;
+	}
+
+	idle_cdev = kzalloc(sizeof(*idle_cdev), GFP_KERNEL);
+	if (!idle_cdev) {
+		ret = -ENOMEM;
+		goto out_unregister;
+	}
+
+	id = ida_simple_get(&cpuidle_ida, 0, 0, GFP_KERNEL);
+	if (id < 0) {
+		ret = id;
+		goto out_kfree;
+	}
+
+	idle_cdev->ii_dev = ii_dev;
+	idle_cdev->idle_duration_us = TICK_USEC;
+
+	snprintf(dev_name, sizeof(dev_name), "thermal-idle-%d", id);
+
+	cdev = thermal_of_cooling_device_register(np, dev_name, idle_cdev,
+						  &cpuidle_cooling_ops);
+	if (IS_ERR(cdev)) {
+		ret = PTR_ERR(cdev);
+		goto out_id;
+	}
+
+	return cdev;
+out_id:
+	ida_simple_remove(&cpuidle_ida, id);
+out_kfree:
+	kfree(idle_cdev);
+out_unregister:
+	idle_inject_unregister(ii_dev);
+out:
+	return ERR_PTR(ret);
+}
+
+/**
+ * cpuidle_cooling_register - Idle cooling device initialization function
+ * @drv: a cpuidle driver structure pointer
+ *
+ * This function is in charge of creating a cooling device per cpuidle
+ * driver and register it to thermal framework.
+ *
+ * Returns a valid pointer to a thermal cooling device, a PTR_ERR
+ * corresponding to the error detected in the underlying subsystems.
+ */
+struct thermal_cooling_device *
+__init cpuidle_cooling_register(struct cpuidle_driver *drv)
+{
+	return cpuidle_of_cooling_register(NULL, drv);
+}
diff --git a/include/linux/cpu_cooling.h b/include/linux/cpu_cooling.h
index 3cdd85f987d7..7873ac2f740b 100644
--- a/include/linux/cpu_cooling.h
+++ b/include/linux/cpu_cooling.h
@@ -60,4 +60,26 @@  of_cpufreq_cooling_register(struct cpufreq_policy *policy)
 }
 #endif /* CONFIG_CPU_FREQ_THERMAL */
 
+struct cpuidle_driver;
+
+#ifdef CONFIG_CPU_IDLE_THERMAL
+extern struct thermal_cooling_device *
+__init cpuidle_cooling_register(struct cpuidle_driver *drv);
+extern struct thermal_cooling_device *
+__init cpuidle_of_cooling_register(struct device_node *np,
+				   struct cpuidle_driver *drv);
+#else /* CONFIG_CPU_IDLE_THERMAL */
+static inline struct thermal_cooling_device *
+__init cpuidle_cooling_register(struct cpuidle_driver *drv)
+{
+	return ERR_PTR(-EINVAL);
+}
+extern struct thermal_cooling_device *
+__init cpuidle_of_cooling_register(struct device_node *np,
+				   struct cpuidle_driver *drv)
+{
+	return ERR_PTR(-EINVAL);
+}
+#endif /* CONFIG_CPU_IDLE_THERMAL */
+
 #endif /* __CPU_COOLING_H__ */