diff mbox series

[v5,14/23] PM: EM: Support late CPUs booting and capacity adjustment

Message ID 20231129110853.94344-15-lukasz.luba@arm.com (mailing list archive)
State Changes Requested, archived
Headers show
Series Introduce runtime modifiable Energy Model | expand

Commit Message

Lukasz Luba Nov. 29, 2023, 11:08 a.m. UTC
The patch adds needed infrastructure to handle the late CPUs boot, which
might change the previous CPUs capacity values. With this changes the new
CPUs which try to register EM will trigger the needed re-calculations for
other CPUs EMs. Thanks to that the em_per_state::performance values will
be aligned with the CPU capacity information after all CPUs finish the
boot and EM registrations.

Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
---
 kernel/power/energy_model.c | 121 ++++++++++++++++++++++++++++++++++++
 1 file changed, 121 insertions(+)

Comments

Dietmar Eggemann Dec. 12, 2023, 6:50 p.m. UTC | #1
On 29/11/2023 12:08, Lukasz Luba wrote:
> The patch adds needed infrastructure to handle the late CPUs boot, which
> might change the previous CPUs capacity values. With this changes the new
> CPUs which try to register EM will trigger the needed re-calculations for
> other CPUs EMs. Thanks to that the em_per_state::performance values will
> be aligned with the CPU capacity information after all CPUs finish the
> boot and EM registrations.

IMHO, it's worth mentioning here that this added functionality is the 1.
use case of the modifiable EM.

[...]

> + * Adjustment of CPU performance values after boot, when all CPUs capacites
> + * are correctly calculated.
> + */
> +static void em_adjust_new_capacity(struct device *dev,
> +				   struct em_perf_domain *pd,
> +				   u64 max_cap)
> +{

[...]

> +	/*
> +	 * This is one-time-update, so give up the ownership in this updater.
> +	 * The EM fwk will keep the reference and free the memory when needed.

s/fwk/framework ?

> +	 */
> +	em_free_table(runtime_table);
> +}
> +
> +static void em_check_capacity_update(void)
> +{
> +	cpumask_var_t cpu_done_mask;
> +	struct em_perf_state *table;
> +	struct em_perf_domain *pd;
> +	unsigned long cpu_capacity;
> +	int cpu;
> +
> +	if (!zalloc_cpumask_var(&cpu_done_mask, GFP_KERNEL)) {
> +		pr_warn("no free memory\n");
> +		return;
> +	}
> +
> +	/* Check if CPUs capacity has changed than update EM */

s/than/then ?

Maybe this comment is not needed since there is (1) further down?


> +	for_each_possible_cpu(cpu) {
> +		struct cpufreq_policy *policy;
> +		unsigned long em_max_perf;
> +		struct device *dev;
> +		int nr_states;
> +
> +		if (cpumask_test_cpu(cpu, cpu_done_mask))
> +			continue;
> +
> +		policy = cpufreq_cpu_get(cpu);
> +		if (!policy) {
> +			pr_debug("Accessing cpu%d policy failed\n", cpu);
> +			schedule_delayed_work(&em_update_work,
> +					      msecs_to_jiffies(1000));
> +			break;
> +		}
> +		cpufreq_cpu_put(policy);
> +
> +		pd = em_cpu_get(cpu);
> +		if (!pd || em_is_artificial(pd))
> +			continue;
> +
> +		cpumask_or(cpu_done_mask, cpu_done_mask,
> +			   em_span_cpus(pd));
> +
> +		nr_states = pd->nr_perf_states;
> +		cpu_capacity = arch_scale_cpu_capacity(cpu);
> +
> +		table = em_get_table(pd);
> +		em_max_perf = table[pd->nr_perf_states - 1].performance;
> +		em_put_table();
> +
> +		/*
> +		 * Check if the CPU capacity has been adjusted during boot
> +		 * and trigger the update for new performance values.
> +		 */

(1)

[...]
Qais Yousef Dec. 17, 2023, 6 p.m. UTC | #2
On 11/29/23 11:08, Lukasz Luba wrote:
> The patch adds needed infrastructure to handle the late CPUs boot, which
> might change the previous CPUs capacity values. With this changes the new
> CPUs which try to register EM will trigger the needed re-calculations for
> other CPUs EMs. Thanks to that the em_per_state::performance values will
> be aligned with the CPU capacity information after all CPUs finish the
> boot and EM registrations.
> 
> Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
> ---
>  kernel/power/energy_model.c | 121 ++++++++++++++++++++++++++++++++++++
>  1 file changed, 121 insertions(+)
> 
> diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
> index b5016afe6a19..d3fa5a77de80 100644
> --- a/kernel/power/energy_model.c
> +++ b/kernel/power/energy_model.c
> @@ -25,6 +25,9 @@ static DEFINE_MUTEX(em_pd_mutex);
>  
>  static void em_cpufreq_update_efficiencies(struct device *dev,
>  					   struct em_perf_state *table);
> +static void em_check_capacity_update(void);
> +static void em_update_workfn(struct work_struct *work);
> +static DECLARE_DELAYED_WORK(em_update_work, em_update_workfn);
>  
>  static bool _is_cpu_device(struct device *dev)
>  {
> @@ -596,6 +599,10 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
>  
>  unlock:
>  	mutex_unlock(&em_pd_mutex);
> +
> +	if (_is_cpu_device(dev))
> +		em_check_capacity_update();
> +
>  	return ret;
>  }
>  EXPORT_SYMBOL_GPL(em_dev_register_perf_domain);
> @@ -631,3 +638,117 @@ void em_dev_unregister_perf_domain(struct device *dev)
>  	mutex_unlock(&em_pd_mutex);
>  }
>  EXPORT_SYMBOL_GPL(em_dev_unregister_perf_domain);
> +
> +/*
> + * Adjustment of CPU performance values after boot, when all CPUs capacites
> + * are correctly calculated.
> + */
> +static void em_adjust_new_capacity(struct device *dev,
> +				   struct em_perf_domain *pd,
> +				   u64 max_cap)
> +{
> +	struct em_perf_table __rcu *runtime_table;
> +	struct em_perf_state *table, *new_table;
> +	int ret, table_size;
> +
> +	runtime_table = em_allocate_table(pd);
> +	if (!runtime_table) {
> +		dev_warn(dev, "EM: allocation failed\n");
> +		return;
> +	}
> +
> +	new_table = runtime_table->state;
> +
> +	table = em_get_table(pd);
> +	/* Initialize data based on older runtime table */
> +	table_size = sizeof(struct em_perf_state) * pd->nr_perf_states;
> +	memcpy(new_table, table, table_size);
> +
> +	em_put_table();
> +
> +	em_init_performance(dev, pd, new_table, pd->nr_perf_states);
> +	ret = em_compute_costs(dev, new_table, NULL, pd->nr_perf_states,
> +			       pd->flags);
> +	if (ret) {
> +		em_free_table(runtime_table);
> +		return;
> +	}
> +
> +	ret = em_dev_update_perf_domain(dev, runtime_table);
> +	if (ret)
> +		dev_warn(dev, "EM: update failed %d\n", ret);
> +
> +	/*
> +	 * This is one-time-update, so give up the ownership in this updater.
> +	 * The EM fwk will keep the reference and free the memory when needed.
> +	 */
> +	em_free_table(runtime_table);
> +}
> +
> +static void em_check_capacity_update(void)
> +{
> +	cpumask_var_t cpu_done_mask;
> +	struct em_perf_state *table;
> +	struct em_perf_domain *pd;
> +	unsigned long cpu_capacity;
> +	int cpu;
> +
> +	if (!zalloc_cpumask_var(&cpu_done_mask, GFP_KERNEL)) {
> +		pr_warn("no free memory\n");
> +		return;
> +	}
> +
> +	/* Check if CPUs capacity has changed than update EM */
> +	for_each_possible_cpu(cpu) {

Can't we instead hook into cpufreq_online/offline() to check if we need to
do any em related update for this policy?


Cheers

--
Qais Yousef

> +		struct cpufreq_policy *policy;
> +		unsigned long em_max_perf;
> +		struct device *dev;
> +		int nr_states;
> +
> +		if (cpumask_test_cpu(cpu, cpu_done_mask))
> +			continue;
> +
> +		policy = cpufreq_cpu_get(cpu);
> +		if (!policy) {
> +			pr_debug("Accessing cpu%d policy failed\n", cpu);
> +			schedule_delayed_work(&em_update_work,
> +					      msecs_to_jiffies(1000));
> +			break;
> +		}
> +		cpufreq_cpu_put(policy);
> +
> +		pd = em_cpu_get(cpu);
> +		if (!pd || em_is_artificial(pd))
> +			continue;
> +
> +		cpumask_or(cpu_done_mask, cpu_done_mask,
> +			   em_span_cpus(pd));
> +
> +		nr_states = pd->nr_perf_states;
> +		cpu_capacity = arch_scale_cpu_capacity(cpu);
> +
> +		table = em_get_table(pd);
> +		em_max_perf = table[pd->nr_perf_states - 1].performance;
> +		em_put_table();
> +
> +		/*
> +		 * Check if the CPU capacity has been adjusted during boot
> +		 * and trigger the update for new performance values.
> +		 */
> +		if (em_max_perf == cpu_capacity)
> +			continue;
> +
> +		pr_debug("updating cpu%d cpu_cap=%lu old capacity=%lu\n",
> +			 cpu, cpu_capacity, em_max_perf);
> +
> +		dev = get_cpu_device(cpu);
> +		em_adjust_new_capacity(dev, pd, cpu_capacity);
> +	}
> +
> +	free_cpumask_var(cpu_done_mask);
> +}
> +
> +static void em_update_workfn(struct work_struct *work)
> +{
> +	em_check_capacity_update();
> +}
> -- 
> 2.25.1
>
Lukasz Luba Dec. 20, 2023, 8:23 a.m. UTC | #3
On 12/12/23 18:50, Dietmar Eggemann wrote:
> On 29/11/2023 12:08, Lukasz Luba wrote:
>> The patch adds needed infrastructure to handle the late CPUs boot, which
>> might change the previous CPUs capacity values. With this changes the new
>> CPUs which try to register EM will trigger the needed re-calculations for
>> other CPUs EMs. Thanks to that the em_per_state::performance values will
>> be aligned with the CPU capacity information after all CPUs finish the
>> boot and EM registrations.
> 
> IMHO, it's worth mentioning here that this added functionality is the 1.
> use case of the modifiable EM.

Make sense. I will add that. It's quite important information, since
it also justifies the EM update feature.

> 
> [...]
> 
>> + * Adjustment of CPU performance values after boot, when all CPUs capacites
>> + * are correctly calculated.
>> + */
>> +static void em_adjust_new_capacity(struct device *dev,
>> +				   struct em_perf_domain *pd,
>> +				   u64 max_cap)
>> +{
> 
> [...]
> 
>> +	/*
>> +	 * This is one-time-update, so give up the ownership in this updater.
>> +	 * The EM fwk will keep the reference and free the memory when needed.
> 
> s/fwk/framework ?

OK

> 
>> +	 */
>> +	em_free_table(runtime_table);
>> +}
>> +
>> +static void em_check_capacity_update(void)
>> +{
>> +	cpumask_var_t cpu_done_mask;
>> +	struct em_perf_state *table;
>> +	struct em_perf_domain *pd;
>> +	unsigned long cpu_capacity;
>> +	int cpu;
>> +
>> +	if (!zalloc_cpumask_var(&cpu_done_mask, GFP_KERNEL)) {
>> +		pr_warn("no free memory\n");
>> +		return;
>> +	}
>> +
>> +	/* Check if CPUs capacity has changed than update EM */
> 
> s/than/then ?
> 
> Maybe this comment is not needed since there is (1) further down?

Yes, I'll remove that.

> 
> 
>> +	for_each_possible_cpu(cpu) {
>> +		struct cpufreq_policy *policy;
>> +		unsigned long em_max_perf;
>> +		struct device *dev;
>> +		int nr_states;
>> +
>> +		if (cpumask_test_cpu(cpu, cpu_done_mask))
>> +			continue;
>> +
>> +		policy = cpufreq_cpu_get(cpu);
>> +		if (!policy) {
>> +			pr_debug("Accessing cpu%d policy failed\n", cpu);
>> +			schedule_delayed_work(&em_update_work,
>> +					      msecs_to_jiffies(1000));
>> +			break;
>> +		}
>> +		cpufreq_cpu_put(policy);
>> +
>> +		pd = em_cpu_get(cpu);
>> +		if (!pd || em_is_artificial(pd))
>> +			continue;
>> +
>> +		cpumask_or(cpu_done_mask, cpu_done_mask,
>> +			   em_span_cpus(pd));
>> +
>> +		nr_states = pd->nr_perf_states;
>> +		cpu_capacity = arch_scale_cpu_capacity(cpu);
>> +
>> +		table = em_get_table(pd);
>> +		em_max_perf = table[pd->nr_perf_states - 1].performance;
>> +		em_put_table();
>> +
>> +		/*
>> +		 * Check if the CPU capacity has been adjusted during boot
>> +		 * and trigger the update for new performance values.
>> +		 */
> 
> (1)
> 
> [...]
Lukasz Luba Jan. 2, 2024, 11:39 a.m. UTC | #4
On 12/17/23 18:00, Qais Yousef wrote:
> On 11/29/23 11:08, Lukasz Luba wrote:
>> The patch adds needed infrastructure to handle the late CPUs boot, which
>> might change the previous CPUs capacity values. With this changes the new
>> CPUs which try to register EM will trigger the needed re-calculations for
>> other CPUs EMs. Thanks to that the em_per_state::performance values will
>> be aligned with the CPU capacity information after all CPUs finish the
>> boot and EM registrations.
>>
>> Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
>> ---
>>   kernel/power/energy_model.c | 121 ++++++++++++++++++++++++++++++++++++
>>   1 file changed, 121 insertions(+)
>>
>> diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
>> index b5016afe6a19..d3fa5a77de80 100644
>> --- a/kernel/power/energy_model.c
>> +++ b/kernel/power/energy_model.c
>> @@ -25,6 +25,9 @@ static DEFINE_MUTEX(em_pd_mutex);
>>   
>>   static void em_cpufreq_update_efficiencies(struct device *dev,
>>   					   struct em_perf_state *table);
>> +static void em_check_capacity_update(void);
>> +static void em_update_workfn(struct work_struct *work);
>> +static DECLARE_DELAYED_WORK(em_update_work, em_update_workfn);
>>   
>>   static bool _is_cpu_device(struct device *dev)
>>   {
>> @@ -596,6 +599,10 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
>>   
>>   unlock:
>>   	mutex_unlock(&em_pd_mutex);
>> +
>> +	if (_is_cpu_device(dev))
>> +		em_check_capacity_update();
>> +
>>   	return ret;
>>   }
>>   EXPORT_SYMBOL_GPL(em_dev_register_perf_domain);
>> @@ -631,3 +638,117 @@ void em_dev_unregister_perf_domain(struct device *dev)
>>   	mutex_unlock(&em_pd_mutex);
>>   }
>>   EXPORT_SYMBOL_GPL(em_dev_unregister_perf_domain);
>> +
>> +/*
>> + * Adjustment of CPU performance values after boot, when all CPUs capacites
>> + * are correctly calculated.
>> + */
>> +static void em_adjust_new_capacity(struct device *dev,
>> +				   struct em_perf_domain *pd,
>> +				   u64 max_cap)
>> +{
>> +	struct em_perf_table __rcu *runtime_table;
>> +	struct em_perf_state *table, *new_table;
>> +	int ret, table_size;
>> +
>> +	runtime_table = em_allocate_table(pd);
>> +	if (!runtime_table) {
>> +		dev_warn(dev, "EM: allocation failed\n");
>> +		return;
>> +	}
>> +
>> +	new_table = runtime_table->state;
>> +
>> +	table = em_get_table(pd);
>> +	/* Initialize data based on older runtime table */
>> +	table_size = sizeof(struct em_perf_state) * pd->nr_perf_states;
>> +	memcpy(new_table, table, table_size);
>> +
>> +	em_put_table();
>> +
>> +	em_init_performance(dev, pd, new_table, pd->nr_perf_states);
>> +	ret = em_compute_costs(dev, new_table, NULL, pd->nr_perf_states,
>> +			       pd->flags);
>> +	if (ret) {
>> +		em_free_table(runtime_table);
>> +		return;
>> +	}
>> +
>> +	ret = em_dev_update_perf_domain(dev, runtime_table);
>> +	if (ret)
>> +		dev_warn(dev, "EM: update failed %d\n", ret);
>> +
>> +	/*
>> +	 * This is one-time-update, so give up the ownership in this updater.
>> +	 * The EM fwk will keep the reference and free the memory when needed.
>> +	 */
>> +	em_free_table(runtime_table);
>> +}
>> +
>> +static void em_check_capacity_update(void)
>> +{
>> +	cpumask_var_t cpu_done_mask;
>> +	struct em_perf_state *table;
>> +	struct em_perf_domain *pd;
>> +	unsigned long cpu_capacity;
>> +	int cpu;
>> +
>> +	if (!zalloc_cpumask_var(&cpu_done_mask, GFP_KERNEL)) {
>> +		pr_warn("no free memory\n");
>> +		return;
>> +	}
>> +
>> +	/* Check if CPUs capacity has changed than update EM */
>> +	for_each_possible_cpu(cpu) {
> 
> Can't we instead hook into cpufreq_online/offline() to check if we need to
> do any em related update for this policy?
> 

I think it would be a bit over-engineering. We know the moment when
there is a need for this check - it's when new EM is registered.
Also, for the cpu hotplug, not always the capacity would change,
which would confuse in such code. Not mentioning, that it will create
an extra everhead for that hotplug notification chain, for not good
reason.
diff mbox series

Patch

diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index b5016afe6a19..d3fa5a77de80 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -25,6 +25,9 @@  static DEFINE_MUTEX(em_pd_mutex);
 
 static void em_cpufreq_update_efficiencies(struct device *dev,
 					   struct em_perf_state *table);
+static void em_check_capacity_update(void);
+static void em_update_workfn(struct work_struct *work);
+static DECLARE_DELAYED_WORK(em_update_work, em_update_workfn);
 
 static bool _is_cpu_device(struct device *dev)
 {
@@ -596,6 +599,10 @@  int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
 
 unlock:
 	mutex_unlock(&em_pd_mutex);
+
+	if (_is_cpu_device(dev))
+		em_check_capacity_update();
+
 	return ret;
 }
 EXPORT_SYMBOL_GPL(em_dev_register_perf_domain);
@@ -631,3 +638,117 @@  void em_dev_unregister_perf_domain(struct device *dev)
 	mutex_unlock(&em_pd_mutex);
 }
 EXPORT_SYMBOL_GPL(em_dev_unregister_perf_domain);
+
+/*
+ * Adjustment of CPU performance values after boot, when all CPUs capacites
+ * are correctly calculated.
+ */
+static void em_adjust_new_capacity(struct device *dev,
+				   struct em_perf_domain *pd,
+				   u64 max_cap)
+{
+	struct em_perf_table __rcu *runtime_table;
+	struct em_perf_state *table, *new_table;
+	int ret, table_size;
+
+	runtime_table = em_allocate_table(pd);
+	if (!runtime_table) {
+		dev_warn(dev, "EM: allocation failed\n");
+		return;
+	}
+
+	new_table = runtime_table->state;
+
+	table = em_get_table(pd);
+	/* Initialize data based on older runtime table */
+	table_size = sizeof(struct em_perf_state) * pd->nr_perf_states;
+	memcpy(new_table, table, table_size);
+
+	em_put_table();
+
+	em_init_performance(dev, pd, new_table, pd->nr_perf_states);
+	ret = em_compute_costs(dev, new_table, NULL, pd->nr_perf_states,
+			       pd->flags);
+	if (ret) {
+		em_free_table(runtime_table);
+		return;
+	}
+
+	ret = em_dev_update_perf_domain(dev, runtime_table);
+	if (ret)
+		dev_warn(dev, "EM: update failed %d\n", ret);
+
+	/*
+	 * This is one-time-update, so give up the ownership in this updater.
+	 * The EM fwk will keep the reference and free the memory when needed.
+	 */
+	em_free_table(runtime_table);
+}
+
+static void em_check_capacity_update(void)
+{
+	cpumask_var_t cpu_done_mask;
+	struct em_perf_state *table;
+	struct em_perf_domain *pd;
+	unsigned long cpu_capacity;
+	int cpu;
+
+	if (!zalloc_cpumask_var(&cpu_done_mask, GFP_KERNEL)) {
+		pr_warn("no free memory\n");
+		return;
+	}
+
+	/* Check if CPUs capacity has changed than update EM */
+	for_each_possible_cpu(cpu) {
+		struct cpufreq_policy *policy;
+		unsigned long em_max_perf;
+		struct device *dev;
+		int nr_states;
+
+		if (cpumask_test_cpu(cpu, cpu_done_mask))
+			continue;
+
+		policy = cpufreq_cpu_get(cpu);
+		if (!policy) {
+			pr_debug("Accessing cpu%d policy failed\n", cpu);
+			schedule_delayed_work(&em_update_work,
+					      msecs_to_jiffies(1000));
+			break;
+		}
+		cpufreq_cpu_put(policy);
+
+		pd = em_cpu_get(cpu);
+		if (!pd || em_is_artificial(pd))
+			continue;
+
+		cpumask_or(cpu_done_mask, cpu_done_mask,
+			   em_span_cpus(pd));
+
+		nr_states = pd->nr_perf_states;
+		cpu_capacity = arch_scale_cpu_capacity(cpu);
+
+		table = em_get_table(pd);
+		em_max_perf = table[pd->nr_perf_states - 1].performance;
+		em_put_table();
+
+		/*
+		 * Check if the CPU capacity has been adjusted during boot
+		 * and trigger the update for new performance values.
+		 */
+		if (em_max_perf == cpu_capacity)
+			continue;
+
+		pr_debug("updating cpu%d cpu_cap=%lu old capacity=%lu\n",
+			 cpu, cpu_capacity, em_max_perf);
+
+		dev = get_cpu_device(cpu);
+		em_adjust_new_capacity(dev, pd, cpu_capacity);
+	}
+
+	free_cpumask_var(cpu_done_mask);
+}
+
+static void em_update_workfn(struct work_struct *work)
+{
+	em_check_capacity_update();
+}