
[v3,2/3] arm64: Provide an AMU-based version of arch_freq_get_on_cpu

Message ID 20240312083431.3239989-3-beata.michalska@arm.com
State New, archived
Series Add support for AArch64 AMUv1-based arch_freq_get_on_cpu

Commit Message

Beata Michalska March 12, 2024, 8:34 a.m. UTC
With the Frequency Invariance Engine (FIE) being already wired up with
sched tick and making use of relevant (core counter and constant
counter) AMU counters, getting the current frequency for a given CPU
on supported platforms can be achieved by utilizing the frequency scale
factor which reflects an average CPU frequency for the last tick period
length.

The solution is partially based on APERF/MPERF implementation of
arch_freq_get_on_cpu.

Suggested-by: Ionela Voinescu <ionela.voinescu@arm.com>
Signed-off-by: Beata Michalska <beata.michalska@arm.com>
---
 arch/arm64/kernel/topology.c | 103 +++++++++++++++++++++++++++++++----
 1 file changed, 92 insertions(+), 11 deletions(-)

Comments

Vanshidhar Konda March 13, 2024, 2:12 a.m. UTC | #1
On Tue, Mar 12, 2024 at 08:34:30AM +0000, Beata Michalska wrote:
>With the Frequency Invariance Engine (FIE) being already wired up with
>sched tick and making use of relevant (core counter and constant
>counter) AMU counters, getting the current frequency for a given CPU
>on supported platforms can be achieved by utilizing the frequency scale
>factor which reflects an average CPU frequency for the last tick period
>length.
>
>The solution is partially based on APERF/MPERF implementation of
>arch_freq_get_on_cpu.
>
>Suggested-by: Ionela Voinescu <ionela.voinescu@arm.com>
>Signed-off-by: Beata Michalska <beata.michalska@arm.com>
>---
> arch/arm64/kernel/topology.c | 103 +++++++++++++++++++++++++++++++----
> 1 file changed, 92 insertions(+), 11 deletions(-)
>
>diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
>index 1a2c72f3e7f8..42cb19c31719 100644
>--- a/arch/arm64/kernel/topology.c
>+++ b/arch/arm64/kernel/topology.c
>@@ -17,6 +17,8 @@
> #include <linux/cpufreq.h>
> #include <linux/init.h>
> #include <linux/percpu.h>
>+#include <linux/sched/isolation.h>
>+#include <linux/seqlock_types.h>
>
> #include <asm/cpu.h>
> #include <asm/cputype.h>
>@@ -88,18 +90,31 @@ int __init parse_acpi_topology(void)
>  * initialized.
>  */
> static DEFINE_PER_CPU_READ_MOSTLY(unsigned long, arch_max_freq_scale) =  1UL << (2 * SCHED_CAPACITY_SHIFT);
>-static DEFINE_PER_CPU(u64, arch_const_cycles_prev);
>-static DEFINE_PER_CPU(u64, arch_core_cycles_prev);
> static cpumask_var_t amu_fie_cpus;
>
>+struct amu_cntr_sample {
>+	u64		arch_const_cycles_prev;
>+	u64		arch_core_cycles_prev;
>+	unsigned long	last_update;
>+	seqcount_t	seq;
>+};
>+
>+static DEFINE_PER_CPU_SHARED_ALIGNED(struct amu_cntr_sample, cpu_amu_samples) = {
>+	.seq = SEQCNT_ZERO(cpu_amu_samples.seq)
>+};
>+
> void update_freq_counters_refs(void)
> {
>-	this_cpu_write(arch_core_cycles_prev, read_corecnt());
>-	this_cpu_write(arch_const_cycles_prev, read_constcnt());
>+	struct amu_cntr_sample *amu_sample = this_cpu_ptr(&cpu_amu_samples);
>+
>+	amu_sample->arch_core_cycles_prev = read_corecnt();
>+	amu_sample->arch_const_cycles_prev = read_constcnt();
> }
>
> static inline bool freq_counters_valid(int cpu)
> {
>+	struct amu_cntr_sample *amu_sample = per_cpu_ptr(&cpu_amu_samples, cpu);
>+
> 	if ((cpu >= nr_cpu_ids) || !cpumask_test_cpu(cpu, cpu_present_mask))
> 		return false;
>
>@@ -108,8 +123,8 @@ static inline bool freq_counters_valid(int cpu)
> 		return false;
> 	}
>
>-	if (unlikely(!per_cpu(arch_const_cycles_prev, cpu) ||
>-		     !per_cpu(arch_core_cycles_prev, cpu))) {
>+	if (unlikely(!amu_sample->arch_const_cycles_prev ||
>+		     !amu_sample->arch_core_cycles_prev)) {
> 		pr_debug("CPU%d: cycle counters are not enabled.\n", cpu);
> 		return false;
> 	}
>@@ -152,20 +167,27 @@ void freq_inv_set_max_ratio(int cpu, u64 max_rate)
>
> static void amu_scale_freq_tick(void)
> {
>+	struct amu_cntr_sample *amu_sample = this_cpu_ptr(&cpu_amu_samples);
> 	u64 prev_core_cnt, prev_const_cnt;
> 	u64 core_cnt, const_cnt, scale;
>
>-	prev_const_cnt = this_cpu_read(arch_const_cycles_prev);
>-	prev_core_cnt = this_cpu_read(arch_core_cycles_prev);
>+	prev_const_cnt = amu_sample->arch_const_cycles_prev;
>+	prev_core_cnt = amu_sample->arch_core_cycles_prev;
>+
>+	write_seqcount_begin(&amu_sample->seq);
>
> 	update_freq_counters_refs();
>
>-	const_cnt = this_cpu_read(arch_const_cycles_prev);
>-	core_cnt = this_cpu_read(arch_core_cycles_prev);
>+	const_cnt = amu_sample->arch_const_cycles_prev;
>+	core_cnt = amu_sample->arch_core_cycles_prev;
>
>+	/*
>+	 * This should not happen unless the AMUs have been reset and the
>+	 * counter values have not been resroted - unlikely

/resroted/restored

>+	 */
> 	if (unlikely(core_cnt <= prev_core_cnt ||
> 		     const_cnt <= prev_const_cnt))
>-		return;
>+		goto leave;
>
> 	/*
> 	 *	    /\core    arch_max_freq_scale
>@@ -182,6 +204,10 @@ static void amu_scale_freq_tick(void)
>
> 	scale = min_t(unsigned long, scale, SCHED_CAPACITY_SCALE);
> 	this_cpu_write(arch_freq_scale, (unsigned long)scale);
>+
>+	amu_sample->last_update = jiffies;
>+leave:
>+	write_seqcount_end(&amu_sample->seq);
> }
>
> static struct scale_freq_data amu_sfd = {
>@@ -189,6 +215,61 @@ static struct scale_freq_data amu_sfd = {
> 	.set_freq_scale = amu_scale_freq_tick,
> };
>
>+#define AMU_SAMPLE_EXP_MS	20
>+
>+unsigned int arch_freq_get_on_cpu(int cpu)
>+{
>+	struct amu_cntr_sample *amu_sample;
>+	unsigned long last_update;
>+	unsigned int seq;
>+	unsigned int freq;
>+	u64 scale;
>+
>+	if (!cpumask_test_cpu(cpu, amu_fie_cpus) || !arch_scale_freq_ref(cpu))
>+		return 0;
>+
>+retry:
>+	amu_sample = per_cpu_ptr(&cpu_amu_samples, cpu);
>+
>+	do {
>+		seq = raw_read_seqcount_begin(&amu_sample->seq);
>+		last_update = amu_sample->last_update;
>+	} while (read_seqcount_retry(&amu_sample->seq, seq));
>+
>+	/*
>+	 * For those CPUs that are in full dynticks mode,
>+	 * and those that have not seen tick for a while
>+	 * try an alternative source for the counters (and thus freq scale),
>+	 * if available for given policy
>+	 */
>+	if (time_is_before_jiffies(last_update + msecs_to_jiffies(AMU_SAMPLE_EXP_MS))) {
>+		struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
>+		int ref_cpu = nr_cpu_ids;
>+
>+		if (cpumask_intersects(housekeeping_cpumask(HK_TYPE_TICK),
>+				       policy->cpus))
>+			ref_cpu = cpumask_nth_and(cpu, policy->cpus,
>+						  housekeeping_cpumask(HK_TYPE_TICK));
>+

Could you help me understand why getting the frequency from another
housekeeping cpu would be better than returning 0? Wouldn't different
CPUs in the HK_TYPE_TICK domain be running at independent frequencies?
Maybe adding this explanation to the patch commit message would help
people who look at this in the future?

Thanks,
Vanshi

>+		cpufreq_cpu_put(policy);
>+		if (ref_cpu >= nr_cpu_ids || ref_cpu == cpu)
>+			/* No alternative to pull info from */
>+			return 0;
>+		cpu = ref_cpu;
>+		goto retry;
>+	}
>+	/*
>+	 * Reversed computation to the one used to determine
>+	 * the arch_freq_scale value
>+	 * (see amu_scale_freq_tick for details)
>+	 */
>+	scale = arch_scale_freq_capacity(cpu);
>+	freq = scale * arch_scale_freq_ref(cpu);
>+	freq >>= SCHED_CAPACITY_SHIFT;
>+
>+	return freq;
>+}
>+
> static void amu_fie_setup(const struct cpumask *cpus)
> {
> 	int cpu;
>-- 
>2.25.1
>
Ionela Voinescu March 13, 2024, 12:20 p.m. UTC | #2
Hi Beata,

Thank you for the patches!

On Tuesday 12 Mar 2024 at 08:34:30 (+0000), Beata Michalska wrote:
> With the Frequency Invariance Engine (FIE) being already wired up with
> sched tick and making use of relevant (core counter and constant
> counter) AMU counters, getting the current frequency for a given CPU
> on supported platforms can be achieved by utilizing the frequency scale
> factor which reflects an average CPU frequency for the last tick period
> length.
> 
> The solution is partially based on APERF/MPERF implementation of
> arch_freq_get_on_cpu.
> 
> Suggested-by: Ionela Voinescu <ionela.voinescu@arm.com>
> Signed-off-by: Beata Michalska <beata.michalska@arm.com>
> ---
>  arch/arm64/kernel/topology.c | 103 +++++++++++++++++++++++++++++++----
>  1 file changed, 92 insertions(+), 11 deletions(-)
> 
> diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
> index 1a2c72f3e7f8..42cb19c31719 100644
> --- a/arch/arm64/kernel/topology.c
> +++ b/arch/arm64/kernel/topology.c
> @@ -17,6 +17,8 @@
>  #include <linux/cpufreq.h>
>  #include <linux/init.h>
>  #include <linux/percpu.h>
> +#include <linux/sched/isolation.h>
> +#include <linux/seqlock_types.h>
>  
>  #include <asm/cpu.h>
>  #include <asm/cputype.h>
> @@ -88,18 +90,31 @@ int __init parse_acpi_topology(void)
>   * initialized.
>   */
>  static DEFINE_PER_CPU_READ_MOSTLY(unsigned long, arch_max_freq_scale) =  1UL << (2 * SCHED_CAPACITY_SHIFT);
> -static DEFINE_PER_CPU(u64, arch_const_cycles_prev);
> -static DEFINE_PER_CPU(u64, arch_core_cycles_prev);
>  static cpumask_var_t amu_fie_cpus;
>  
> +struct amu_cntr_sample {
> +	u64		arch_const_cycles_prev;
> +	u64		arch_core_cycles_prev;
> +	unsigned long	last_update;
> +	seqcount_t	seq;
> +};
> +
> +static DEFINE_PER_CPU_SHARED_ALIGNED(struct amu_cntr_sample, cpu_amu_samples) = {
> +	.seq = SEQCNT_ZERO(cpu_amu_samples.seq)
> +};
> +
>  void update_freq_counters_refs(void)
>  {
> -	this_cpu_write(arch_core_cycles_prev, read_corecnt());
> -	this_cpu_write(arch_const_cycles_prev, read_constcnt());
> +	struct amu_cntr_sample *amu_sample = this_cpu_ptr(&cpu_amu_samples);
> +
> +	amu_sample->arch_core_cycles_prev = read_corecnt();
> +	amu_sample->arch_const_cycles_prev = read_constcnt();
>  }
>  
>  static inline bool freq_counters_valid(int cpu)
>  {
> +	struct amu_cntr_sample *amu_sample = per_cpu_ptr(&cpu_amu_samples, cpu);
> +
>  	if ((cpu >= nr_cpu_ids) || !cpumask_test_cpu(cpu, cpu_present_mask))
>  		return false;
>  
> @@ -108,8 +123,8 @@ static inline bool freq_counters_valid(int cpu)
>  		return false;
>  	}
>  
> -	if (unlikely(!per_cpu(arch_const_cycles_prev, cpu) ||
> -		     !per_cpu(arch_core_cycles_prev, cpu))) {
> +	if (unlikely(!amu_sample->arch_const_cycles_prev ||
> +		     !amu_sample->arch_core_cycles_prev)) {
>  		pr_debug("CPU%d: cycle counters are not enabled.\n", cpu);
>  		return false;
>  	}
> @@ -152,20 +167,27 @@ void freq_inv_set_max_ratio(int cpu, u64 max_rate)
>  
>  static void amu_scale_freq_tick(void)
>  {
> +	struct amu_cntr_sample *amu_sample = this_cpu_ptr(&cpu_amu_samples);
>  	u64 prev_core_cnt, prev_const_cnt;
>  	u64 core_cnt, const_cnt, scale;
>  
> -	prev_const_cnt = this_cpu_read(arch_const_cycles_prev);
> -	prev_core_cnt = this_cpu_read(arch_core_cycles_prev);
> +	prev_const_cnt = amu_sample->arch_const_cycles_prev;
> +	prev_core_cnt = amu_sample->arch_core_cycles_prev;
> +
> +	write_seqcount_begin(&amu_sample->seq);

The critical section here does not need to be this extensive, right?

The arch_freq_get_on_cpu() function only uses the frequency scale factor
and the last_update value, so this need only be placed above
"this_cpu_write(arch_freq_scale,..", if I'm not missing anything.

>  
>  	update_freq_counters_refs();
>  
> -	const_cnt = this_cpu_read(arch_const_cycles_prev);
> -	core_cnt = this_cpu_read(arch_core_cycles_prev);
> +	const_cnt = amu_sample->arch_const_cycles_prev;
> +	core_cnt = amu_sample->arch_core_cycles_prev;
>  
> +	/*
> +	 * This should not happen unless the AMUs have been reset and the
> +	 * counter values have not been resroted - unlikely
> +	 */
>  	if (unlikely(core_cnt <= prev_core_cnt ||
>  		     const_cnt <= prev_const_cnt))
> -		return;
> +		goto leave;
>  
>  	/*
>  	 *	    /\core    arch_max_freq_scale
> @@ -182,6 +204,10 @@ static void amu_scale_freq_tick(void)
>  
>  	scale = min_t(unsigned long, scale, SCHED_CAPACITY_SCALE);
>  	this_cpu_write(arch_freq_scale, (unsigned long)scale);
> +
> +	amu_sample->last_update = jiffies;
> +leave:
> +	write_seqcount_end(&amu_sample->seq);
>  }
>  
>  static struct scale_freq_data amu_sfd = {
> @@ -189,6 +215,61 @@ static struct scale_freq_data amu_sfd = {
>  	.set_freq_scale = amu_scale_freq_tick,
>  };
>  
> +#define AMU_SAMPLE_EXP_MS	20
> +
> +unsigned int arch_freq_get_on_cpu(int cpu)
> +{
> +	struct amu_cntr_sample *amu_sample;
> +	unsigned long last_update;
> +	unsigned int seq;
> +	unsigned int freq;
> +	u64 scale;
> +
> +	if (!cpumask_test_cpu(cpu, amu_fie_cpus) || !arch_scale_freq_ref(cpu))
> +		return 0;
> +
> +retry:
> +	amu_sample = per_cpu_ptr(&cpu_amu_samples, cpu);
> +
> +	do {
> +		seq = raw_read_seqcount_begin(&amu_sample->seq);
> +		last_update = amu_sample->last_update;
> +	} while (read_seqcount_retry(&amu_sample->seq, seq));

Related to the point above, this retry loop should also contain
"scale = arch_scale_freq_capacity(cpu)", otherwise there's no much point
for synchronisation, as far as I can tell.

For x86, arch_freq_get_on_cpu() uses the counter deltas and it would be
bad if values from different ticks would be used. But here the only
benefit of synchronisation is to make sure that we're using the scale
factor computed at the last update time. For us, even skipping on the
synchronisation logic would still be acceptable, as we'd be ensuring that
there was a tick in the past 20ms and we'd always use the most recent
value of the frequency scale factor.

Hope it helps,
Ionela.

> +
> +	/*
> +	 * For those CPUs that are in full dynticks mode,
> +	 * and those that have not seen tick for a while
> +	 * try an alternative source for the counters (and thus freq scale),
> +	 * if available for given policy
> +	 */
> +	if (time_is_before_jiffies(last_update + msecs_to_jiffies(AMU_SAMPLE_EXP_MS))) {
> +		struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
> +		int ref_cpu = nr_cpu_ids;
> +
> +		if (cpumask_intersects(housekeeping_cpumask(HK_TYPE_TICK),
> +				       policy->cpus))
> +			ref_cpu = cpumask_nth_and(cpu, policy->cpus,
> +						  housekeeping_cpumask(HK_TYPE_TICK));
> +
> +		cpufreq_cpu_put(policy);
> +		if (ref_cpu >= nr_cpu_ids || ref_cpu == cpu)
> +			/* No alternative to pull info from */
> +			return 0;
> +		cpu = ref_cpu;
> +		goto retry;
> +	}
> +	/*
> +	 * Reversed computation to the one used to determine
> +	 * the arch_freq_scale value
> +	 * (see amu_scale_freq_tick for details)
> +	 */
> +	scale = arch_scale_freq_capacity(cpu);
> +	freq = scale * arch_scale_freq_ref(cpu);
> +	freq >>= SCHED_CAPACITY_SHIFT;
> +
> +	return freq;
> +}
> +
>  static void amu_fie_setup(const struct cpumask *cpus)
>  {
>  	int cpu;
> -- 
> 2.25.1
>
Beata Michalska March 13, 2024, 9:47 p.m. UTC | #3
On Tue, Mar 12, 2024 at 07:12:37PM -0700, Vanshidhar Konda wrote:
> On Tue, Mar 12, 2024 at 08:34:30AM +0000, Beata Michalska wrote:
> > With the Frequency Invariance Engine (FIE) being already wired up with
> > sched tick and making use of relevant (core counter and constant
> > counter) AMU counters, getting the current frequency for a given CPU
> > on supported platforms can be achieved by utilizing the frequency scale
> > factor which reflects an average CPU frequency for the last tick period
> > length.
> > 
> > The solution is partially based on APERF/MPERF implementation of
> > arch_freq_get_on_cpu.
> > 
> > Suggested-by: Ionela Voinescu <ionela.voinescu@arm.com>
> > Signed-off-by: Beata Michalska <beata.michalska@arm.com>
> > ---
> > arch/arm64/kernel/topology.c | 103 +++++++++++++++++++++++++++++++----
> > 1 file changed, 92 insertions(+), 11 deletions(-)
> > 
> > diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
> > index 1a2c72f3e7f8..42cb19c31719 100644
> > --- a/arch/arm64/kernel/topology.c
> > +++ b/arch/arm64/kernel/topology.c
> > @@ -17,6 +17,8 @@
> > #include <linux/cpufreq.h>
> > #include <linux/init.h>
> > #include <linux/percpu.h>
> > +#include <linux/sched/isolation.h>
> > +#include <linux/seqlock_types.h>
> > 
> > #include <asm/cpu.h>
> > #include <asm/cputype.h>
> > @@ -88,18 +90,31 @@ int __init parse_acpi_topology(void)
> >  * initialized.
> >  */
> > static DEFINE_PER_CPU_READ_MOSTLY(unsigned long, arch_max_freq_scale) =  1UL << (2 * SCHED_CAPACITY_SHIFT);
> > -static DEFINE_PER_CPU(u64, arch_const_cycles_prev);
> > -static DEFINE_PER_CPU(u64, arch_core_cycles_prev);
> > static cpumask_var_t amu_fie_cpus;
> > 
> > +struct amu_cntr_sample {
> > +	u64		arch_const_cycles_prev;
> > +	u64		arch_core_cycles_prev;
> > +	unsigned long	last_update;
> > +	seqcount_t	seq;
> > +};
> > +
> > +static DEFINE_PER_CPU_SHARED_ALIGNED(struct amu_cntr_sample, cpu_amu_samples) = {
> > +	.seq = SEQCNT_ZERO(cpu_amu_samples.seq)
> > +};
> > +
> > void update_freq_counters_refs(void)
> > {
> > -	this_cpu_write(arch_core_cycles_prev, read_corecnt());
> > -	this_cpu_write(arch_const_cycles_prev, read_constcnt());
> > +	struct amu_cntr_sample *amu_sample = this_cpu_ptr(&cpu_amu_samples);
> > +
> > +	amu_sample->arch_core_cycles_prev = read_corecnt();
> > +	amu_sample->arch_const_cycles_prev = read_constcnt();
> > }
> > 
> > static inline bool freq_counters_valid(int cpu)
> > {
> > +	struct amu_cntr_sample *amu_sample = per_cpu_ptr(&cpu_amu_samples, cpu);
> > +
> > 	if ((cpu >= nr_cpu_ids) || !cpumask_test_cpu(cpu, cpu_present_mask))
> > 		return false;
> > 
> > @@ -108,8 +123,8 @@ static inline bool freq_counters_valid(int cpu)
> > 		return false;
> > 	}
> > 
> > -	if (unlikely(!per_cpu(arch_const_cycles_prev, cpu) ||
> > -		     !per_cpu(arch_core_cycles_prev, cpu))) {
> > +	if (unlikely(!amu_sample->arch_const_cycles_prev ||
> > +		     !amu_sample->arch_core_cycles_prev)) {
> > 		pr_debug("CPU%d: cycle counters are not enabled.\n", cpu);
> > 		return false;
> > 	}
> > @@ -152,20 +167,27 @@ void freq_inv_set_max_ratio(int cpu, u64 max_rate)
> > 
> > static void amu_scale_freq_tick(void)
> > {
> > +	struct amu_cntr_sample *amu_sample = this_cpu_ptr(&cpu_amu_samples);
> > 	u64 prev_core_cnt, prev_const_cnt;
> > 	u64 core_cnt, const_cnt, scale;
> > 
> > -	prev_const_cnt = this_cpu_read(arch_const_cycles_prev);
> > -	prev_core_cnt = this_cpu_read(arch_core_cycles_prev);
> > +	prev_const_cnt = amu_sample->arch_const_cycles_prev;
> > +	prev_core_cnt = amu_sample->arch_core_cycles_prev;
> > +
> > +	write_seqcount_begin(&amu_sample->seq);
> > 
> > 	update_freq_counters_refs();
> > 
> > -	const_cnt = this_cpu_read(arch_const_cycles_prev);
> > -	core_cnt = this_cpu_read(arch_core_cycles_prev);
> > +	const_cnt = amu_sample->arch_const_cycles_prev;
> > +	core_cnt = amu_sample->arch_core_cycles_prev;
> > 
> > +	/*
> > +	 * This should not happen unless the AMUs have been reset and the
> > +	 * counter values have not been resroted - unlikely
> 
> /resroted/restored
> 
> > +	 */
> > 	if (unlikely(core_cnt <= prev_core_cnt ||
> > 		     const_cnt <= prev_const_cnt))
> > -		return;
> > +		goto leave;
> > 
> > 	/*
> > 	 *	    /\core    arch_max_freq_scale
> > @@ -182,6 +204,10 @@ static void amu_scale_freq_tick(void)
> > 
> > 	scale = min_t(unsigned long, scale, SCHED_CAPACITY_SCALE);
> > 	this_cpu_write(arch_freq_scale, (unsigned long)scale);
> > +
> > +	amu_sample->last_update = jiffies;
> > +leave:
> > +	write_seqcount_end(&amu_sample->seq);
> > }
> > 
> > static struct scale_freq_data amu_sfd = {
> > @@ -189,6 +215,61 @@ static struct scale_freq_data amu_sfd = {
> > 	.set_freq_scale = amu_scale_freq_tick,
> > };
> > 
> > +#define AMU_SAMPLE_EXP_MS	20
> > +
> > +unsigned int arch_freq_get_on_cpu(int cpu)
> > +{
> > +	struct amu_cntr_sample *amu_sample;
> > +	unsigned long last_update;
> > +	unsigned int seq;
> > +	unsigned int freq;
> > +	u64 scale;
> > +
> > +	if (!cpumask_test_cpu(cpu, amu_fie_cpus) || !arch_scale_freq_ref(cpu))
> > +		return 0;
> > +
> > +retry:
> > +	amu_sample = per_cpu_ptr(&cpu_amu_samples, cpu);
> > +
> > +	do {
> > +		seq = raw_read_seqcount_begin(&amu_sample->seq);
> > +		last_update = amu_sample->last_update;
> > +	} while (read_seqcount_retry(&amu_sample->seq, seq));
> > +
> > +	/*
> > +	 * For those CPUs that are in full dynticks mode,
> > +	 * and those that have not seen tick for a while
> > +	 * try an alternative source for the counters (and thus freq scale),
> > +	 * if available for given policy
> > +	 */
> > +	if (time_is_before_jiffies(last_update + msecs_to_jiffies(AMU_SAMPLE_EXP_MS))) {
> > +		struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
> > +		int ref_cpu = nr_cpu_ids;
> > +
> > +		if (cpumask_intersects(housekeeping_cpumask(HK_TYPE_TICK),
> > +				       policy->cpus))
> > +			ref_cpu = cpumask_nth_and(cpu, policy->cpus,
> > +						  housekeeping_cpumask(HK_TYPE_TICK));
> > +
> 
> Could you help me understand why getting the frequency from another
> housekeeping cpu would be better than returning 0? Wouldn't different
> CPUs in the HK_TYPE_TICK domain be running at independent frequencies?
> Maybe adding this explanation to the patch commit message would help
> people who look at this in the future?

If the last AMU sample taken lost its assumed validity, we try another cpu within
the same frequency domain, choosing a housekeeping cpu as one that might have seen
the tick within the last, assumed, 20ms. We stick to the cpus within the same
policy, and thus the same frequency domain, which means those cpus do operate at
the same frequency. Now, in case of per-core dvfs this will bail out with '0' as
policy->cpus will contain a single CPU. Having said that, this code is bogus as it
does not handle the needed wrapping in case the currently considered cpu is the
last one in this policy's cpus mask - will send an update soon.

I will also try to make the comment above more readable.
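
For the record, the fix I have in mind goes roughly along these lines (untested
sketch, reusing the names from the patch as-is; the NULL check on the policy is
an extra I would fold in as well):

	if (time_is_before_jiffies(last_update + msecs_to_jiffies(AMU_SAMPLE_EXP_MS))) {
		struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
		int ref_cpu;

		if (!policy)
			return 0;

		/* Next housekeeping cpu within the same policy, past @cpu ... */
		ref_cpu = cpumask_next_and(cpu, policy->cpus,
					   housekeeping_cpumask(HK_TYPE_TICK));
		/* ... wrapping around to the beginning of the mask if needed */
		if (ref_cpu >= nr_cpu_ids)
			ref_cpu = cpumask_first_and(policy->cpus,
						    housekeeping_cpumask(HK_TYPE_TICK));

		cpufreq_cpu_put(policy);
		if (ref_cpu >= nr_cpu_ids || ref_cpu == cpu)
			/* No alternative to pull info from */
			return 0;
		cpu = ref_cpu;
		goto retry;
	}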

---
BR
Beata
> 
> Thanks,
> Vanshi
> 
> > +		cpufreq_cpu_put(policy);
> > +		if (ref_cpu >= nr_cpu_ids || ref_cpu == cpu)
> > +			/* No alternative to pull info from */
> > +			return 0;
> > +		cpu = ref_cpu;
> > +		goto retry;
> > +	}
> > +	/*
> > +	 * Reversed computation to the one used to determine
> > +	 * the arch_freq_scale value
> > +	 * (see amu_scale_freq_tick for details)
> > +	 */
> > +	scale = arch_scale_freq_capacity(cpu);
> > +	freq = scale * arch_scale_freq_ref(cpu);
> > +	freq >>= SCHED_CAPACITY_SHIFT;
> > +
> > +	return freq;
> > +}
> > +
> > static void amu_fie_setup(const struct cpumask *cpus)
> > {
> > 	int cpu;
> > -- 
> > 2.25.1
> >
Beata Michalska March 13, 2024, 11:46 p.m. UTC | #4
On Wed, Mar 13, 2024 at 12:20:16PM +0000, Ionela Voinescu wrote:
> Hi Beata,
> 
> Thank you for the patches!
>
High time for those!

> On Tuesday 12 Mar 2024 at 08:34:30 (+0000), Beata Michalska wrote:
> > With the Frequency Invariance Engine (FIE) being already wired up with
> > sched tick and making use of relevant (core counter and constant
> > counter) AMU counters, getting the current frequency for a given CPU
> > on supported platforms can be achieved by utilizing the frequency scale
> > factor which reflects an average CPU frequency for the last tick period
> > length.
> > 
> > The solution is partially based on APERF/MPERF implementation of
> > arch_freq_get_on_cpu.
> > 
> > Suggested-by: Ionela Voinescu <ionela.voinescu@arm.com>
> > Signed-off-by: Beata Michalska <beata.michalska@arm.com>
> > ---
> >  arch/arm64/kernel/topology.c | 103 +++++++++++++++++++++++++++++++----
> >  1 file changed, 92 insertions(+), 11 deletions(-)
> > 
> > diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
> > index 1a2c72f3e7f8..42cb19c31719 100644
> > --- a/arch/arm64/kernel/topology.c
> > +++ b/arch/arm64/kernel/topology.c
> > @@ -17,6 +17,8 @@
> >  #include <linux/cpufreq.h>
> >  #include <linux/init.h>
> >  #include <linux/percpu.h>
> > +#include <linux/sched/isolation.h>
> > +#include <linux/seqlock_types.h>
> >  
> >  #include <asm/cpu.h>
> >  #include <asm/cputype.h>
> > @@ -88,18 +90,31 @@ int __init parse_acpi_topology(void)
> >   * initialized.
> >   */
> >  static DEFINE_PER_CPU_READ_MOSTLY(unsigned long, arch_max_freq_scale) =  1UL << (2 * SCHED_CAPACITY_SHIFT);
> > -static DEFINE_PER_CPU(u64, arch_const_cycles_prev);
> > -static DEFINE_PER_CPU(u64, arch_core_cycles_prev);
> >  static cpumask_var_t amu_fie_cpus;
> >  
> > +struct amu_cntr_sample {
> > +	u64		arch_const_cycles_prev;
> > +	u64		arch_core_cycles_prev;
> > +	unsigned long	last_update;
> > +	seqcount_t	seq;
> > +};
> > +
> > +static DEFINE_PER_CPU_SHARED_ALIGNED(struct amu_cntr_sample, cpu_amu_samples) = {
> > +	.seq = SEQCNT_ZERO(cpu_amu_samples.seq)
> > +};
> > +
> >  void update_freq_counters_refs(void)
> >  {
> > -	this_cpu_write(arch_core_cycles_prev, read_corecnt());
> > -	this_cpu_write(arch_const_cycles_prev, read_constcnt());
> > +	struct amu_cntr_sample *amu_sample = this_cpu_ptr(&cpu_amu_samples);
> > +
> > +	amu_sample->arch_core_cycles_prev = read_corecnt();
> > +	amu_sample->arch_const_cycles_prev = read_constcnt();
> >  }
> >  
> >  static inline bool freq_counters_valid(int cpu)
> >  {
> > +	struct amu_cntr_sample *amu_sample = per_cpu_ptr(&cpu_amu_samples, cpu);
> > +
> >  	if ((cpu >= nr_cpu_ids) || !cpumask_test_cpu(cpu, cpu_present_mask))
> >  		return false;
> >  
> > @@ -108,8 +123,8 @@ static inline bool freq_counters_valid(int cpu)
> >  		return false;
> >  	}
> >  
> > -	if (unlikely(!per_cpu(arch_const_cycles_prev, cpu) ||
> > -		     !per_cpu(arch_core_cycles_prev, cpu))) {
> > +	if (unlikely(!amu_sample->arch_const_cycles_prev ||
> > +		     !amu_sample->arch_core_cycles_prev)) {
> >  		pr_debug("CPU%d: cycle counters are not enabled.\n", cpu);
> >  		return false;
> >  	}
> > @@ -152,20 +167,27 @@ void freq_inv_set_max_ratio(int cpu, u64 max_rate)
> >  
> >  static void amu_scale_freq_tick(void)
> >  {
> > +	struct amu_cntr_sample *amu_sample = this_cpu_ptr(&cpu_amu_samples);
> >  	u64 prev_core_cnt, prev_const_cnt;
> >  	u64 core_cnt, const_cnt, scale;
> >  
> > -	prev_const_cnt = this_cpu_read(arch_const_cycles_prev);
> > -	prev_core_cnt = this_cpu_read(arch_core_cycles_prev);
> > +	prev_const_cnt = amu_sample->arch_const_cycles_prev;
> > +	prev_core_cnt = amu_sample->arch_core_cycles_prev;
> > +
> > +	write_seqcount_begin(&amu_sample->seq);
> 
> The critical section here does not need to be this extensive, right?
> 
> The arch_freq_get_on_cpu() function only uses the frequency scale factor
> and the last_update value, so this need only be placed above
> "this_cpu_write(arch_freq_scale,..", if I'm not missing anything.

You're not missing anything. The write side critical section could span only
those two, but having it extended gives the readers a chance to get in on
the update, and as those are not really performance sensitive I thought it might
be a good option, especially if we can save the cycles on not needing to poke
the cpufreq driver. Furthermore, if the critical section is to span only the two,
then it does not really change much and can be dropped.
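
Just to make sure we are looking at the same thing, the narrowed-down variant
being discussed would boil down to something like (sketch only):

	/*
	 * Counters read and scale computed as before; the early bail-out on
	 * non-monotonic counters would then simply return.
	 */
	scale = min_t(unsigned long, scale, SCHED_CAPACITY_SCALE);

	/* Minimal write side: just the scale factor and the timestamp */
	write_seqcount_begin(&amu_sample->seq);
	this_cpu_write(arch_freq_scale, (unsigned long)scale);
	amu_sample->last_update = jiffies;
	write_seqcount_end(&amu_sample->seq);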

> 
> >  
> >  	update_freq_counters_refs();
> >  
> > -	const_cnt = this_cpu_read(arch_const_cycles_prev);
> > -	core_cnt = this_cpu_read(arch_core_cycles_prev);
> > +	const_cnt = amu_sample->arch_const_cycles_prev;
> > +	core_cnt = amu_sample->arch_core_cycles_prev;
> >  
> > +	/*
> > +	 * This should not happen unless the AMUs have been reset and the
> > +	 * counter values have not been resroted - unlikely
> > +	 */
> >  	if (unlikely(core_cnt <= prev_core_cnt ||
> >  		     const_cnt <= prev_const_cnt))
> > -		return;
> > +		goto leave;
> >  
> >  	/*
> >  	 *	    /\core    arch_max_freq_scale
> > @@ -182,6 +204,10 @@ static void amu_scale_freq_tick(void)
> >  
> >  	scale = min_t(unsigned long, scale, SCHED_CAPACITY_SCALE);
> >  	this_cpu_write(arch_freq_scale, (unsigned long)scale);
> > +
> > +	amu_sample->last_update = jiffies;
> > +leave:
> > +	write_seqcount_end(&amu_sample->seq);
> >  }
> >  
> >  static struct scale_freq_data amu_sfd = {
> > @@ -189,6 +215,61 @@ static struct scale_freq_data amu_sfd = {
> >  	.set_freq_scale = amu_scale_freq_tick,
> >  };
> >  
> > +#define AMU_SAMPLE_EXP_MS	20
> > +
> > +unsigned int arch_freq_get_on_cpu(int cpu)
> > +{
> > +	struct amu_cntr_sample *amu_sample;
> > +	unsigned long last_update;
> > +	unsigned int seq;
> > +	unsigned int freq;
> > +	u64 scale;
> > +
> > +	if (!cpumask_test_cpu(cpu, amu_fie_cpus) || !arch_scale_freq_ref(cpu))
> > +		return 0;
> > +
> > +retry:
> > +	amu_sample = per_cpu_ptr(&cpu_amu_samples, cpu);
> > +
> > +	do {
> > +		seq = raw_read_seqcount_begin(&amu_sample->seq);
> > +		last_update = amu_sample->last_update;
> > +	} while (read_seqcount_retry(&amu_sample->seq, seq));
> 
> Related to the point above, this retry loop should also contain
> "scale = arch_scale_freq_capacity(cpu)", otherwise there's no much point
> for synchronisation, as far as I can tell.
I'm not entirely sure why we would need to include the scale factor within
the read critical section. The aim here is to make sure we see the update if
one is ongoing and that the update to the timestamp is observed along with
the one to the scale factor, which is what the write_seqcount_end will guarantee
(although the latter is not a hard sell as the update happens with interrupts
disabled). If later on we fetch a newer scale factor, that's perfectly fine;
we do not want to see the stale one. Again, I can drop the seqcount (which is
slightly abused in this case, I must admit) at the cost of potentially missing
some updates.
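
If dropped, the read side would be down to something like the below (sketch;
declarations as in the current version, with the tick side then pairing
last_update with a WRITE_ONCE):

	amu_sample = per_cpu_ptr(&cpu_amu_samples, cpu);
	last_update = READ_ONCE(amu_sample->last_update);

	if (!time_is_before_jiffies(last_update +
				    msecs_to_jiffies(AMU_SAMPLE_EXP_MS))) {
		/* Tick seen recently enough - use the latest scale factor */
		scale = arch_scale_freq_capacity(cpu);
		freq = scale * arch_scale_freq_ref(cpu);
		freq >>= SCHED_CAPACITY_SHIFT;
		return freq;
	}
	/* otherwise try another housekeeping cpu, as in the current version */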
> 
> For x86, arch_freq_get_on_cpu() uses the counter deltas and it would be
> bad if values from different ticks would be used. But here the only
> benefit of synchronisation is to make sure that we're using the scale
> factor computed at the last update time. For us, even skipping on the
> synchronisation logic would still be acceptable, as we'd be ensuring that
> there was a tick in the past 20ms and we'd always use the most recent
> value of the frequency scale factor.
How would we ensure there was a tick in the last 20ms?
> 
> Hope it helps,
It does, thank you.

--
BR
Beata
> Ionela.
> 
> > +
> > +	/*
> > +	 * For those CPUs that are in full dynticks mode,
> > +	 * and those that have not seen tick for a while
> > +	 * try an alternative source for the counters (and thus freq scale),
> > +	 * if available for given policy
> > +	 */
> > +	if (time_is_before_jiffies(last_update + msecs_to_jiffies(AMU_SAMPLE_EXP_MS))) {
> > +		struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
> > +		int ref_cpu = nr_cpu_ids;
> > +
> > +		if (cpumask_intersects(housekeeping_cpumask(HK_TYPE_TICK),
> > +				       policy->cpus))
> > +			ref_cpu = cpumask_nth_and(cpu, policy->cpus,
> > +						  housekeeping_cpumask(HK_TYPE_TICK));
> > +
> > +		cpufreq_cpu_put(policy);
> > +		if (ref_cpu >= nr_cpu_ids || ref_cpu == cpu)
> > +			/* No alternative to pull info from */
> > +			return 0;
> > +		cpu = ref_cpu;
> > +		goto retry;
> > +	}
> > +	/*
> > +	 * Reversed computation to the one used to determine
> > +	 * the arch_freq_scale value
> > +	 * (see amu_scale_freq_tick for details)
> > +	 */
> > +	scale = arch_scale_freq_capacity(cpu);
> > +	freq = scale * arch_scale_freq_ref(cpu);
> > +	freq >>= SCHED_CAPACITY_SHIFT;
> > +
> > +	return freq;
> > +}
> > +
> >  static void amu_fie_setup(const struct cpumask *cpus)
> >  {
> >  	int cpu;
> > -- 
> > 2.25.1
> >
Ionela Voinescu March 18, 2024, 3:01 p.m. UTC | #5
Hey,

On Thursday 14 Mar 2024 at 00:46:19 (+0100), Beata Michalska wrote:
[..]
> > >  static void amu_scale_freq_tick(void)
> > >  {
> > > +	struct amu_cntr_sample *amu_sample = this_cpu_ptr(&cpu_amu_samples);
> > >  	u64 prev_core_cnt, prev_const_cnt;
> > >  	u64 core_cnt, const_cnt, scale;
> > >  
> > > -	prev_const_cnt = this_cpu_read(arch_const_cycles_prev);
> > > -	prev_core_cnt = this_cpu_read(arch_core_cycles_prev);
> > > +	prev_const_cnt = amu_sample->arch_const_cycles_prev;
> > > +	prev_core_cnt = amu_sample->arch_core_cycles_prev;
> > > +
> > > +	write_seqcount_begin(&amu_sample->seq);
> > 
> > The critical section here does not need to be this extensive, right?
> > 
> > The arch_freq_get_on_cpu() function only uses the frequency scale factor
> > and the last_update value, so this need only be placed above
> > "this_cpu_write(arch_freq_scale,..", if I'm not missing anything.
> 
> You're not missing anything. The write side critical section could span only
> those two, but having it extended gives a chance for the readers to get in on
> the update and as those are not really performance sensitive I though it might
> be a good option, especially if we can save the cycles on not needing to poke
> the cpufeq driver. Furthermore, if the critical section is to span only the two,
> then it does not really change much and can be dropped.
> 
> > 
> > >  
> > >  	update_freq_counters_refs();
> > >  
> > > -	const_cnt = this_cpu_read(arch_const_cycles_prev);
> > > -	core_cnt = this_cpu_read(arch_core_cycles_prev);
> > > +	const_cnt = amu_sample->arch_const_cycles_prev;
> > > +	core_cnt = amu_sample->arch_core_cycles_prev;
> > >  
> > > +	/*
> > > +	 * This should not happen unless the AMUs have been reset and the
> > > +	 * counter values have not been resroted - unlikely
> > > +	 */
> > >  	if (unlikely(core_cnt <= prev_core_cnt ||
> > >  		     const_cnt <= prev_const_cnt))
> > > -		return;
> > > +		goto leave;
> > >  
> > >  	/*
> > >  	 *	    /\core    arch_max_freq_scale
> > > @@ -182,6 +204,10 @@ static void amu_scale_freq_tick(void)
> > >  
> > >  	scale = min_t(unsigned long, scale, SCHED_CAPACITY_SCALE);
> > >  	this_cpu_write(arch_freq_scale, (unsigned long)scale);
> > > +
> > > +	amu_sample->last_update = jiffies;
> > > +leave:
> > > +	write_seqcount_end(&amu_sample->seq);
> > >  }
> > >  
> > >  static struct scale_freq_data amu_sfd = {
> > > @@ -189,6 +215,61 @@ static struct scale_freq_data amu_sfd = {
> > >  	.set_freq_scale = amu_scale_freq_tick,
> > >  };
> > >  
> > > +#define AMU_SAMPLE_EXP_MS	20
> > > +
> > > +unsigned int arch_freq_get_on_cpu(int cpu)
> > > +{
> > > +	struct amu_cntr_sample *amu_sample;
> > > +	unsigned long last_update;
> > > +	unsigned int seq;
> > > +	unsigned int freq;
> > > +	u64 scale;
> > > +
> > > +	if (!cpumask_test_cpu(cpu, amu_fie_cpus) || !arch_scale_freq_ref(cpu))
> > > +		return 0;
> > > +
> > > +retry:
> > > +	amu_sample = per_cpu_ptr(&cpu_amu_samples, cpu);
> > > +
> > > +	do {
> > > +		seq = raw_read_seqcount_begin(&amu_sample->seq);
> > > +		last_update = amu_sample->last_update;
> > > +	} while (read_seqcount_retry(&amu_sample->seq, seq));
> > 
> > Related to the point above, this retry loop should also contain
> > "scale = arch_scale_freq_capacity(cpu)", otherwise there's no much point
> > for synchronisation, as far as I can tell.
> I'm not entirely sure why we would need to include the scale factor within
> the read critical section. The aim here is to make sure we see the update if
> one is ongoing and that the update to the timestamp is observed along with
> one to the scale factor, which is what the write_seqcount_end will guarantee
> (although the latter is not a hard sell as the update happens under interrupts
> being disabled). If later on we fetch newer scale factor that's perfectly fine,
> we do not want to see the stale one. Again, I can drop the seqcount (which is
> slightly abused in this case I must admit) at a cost of potentially missing some
> updates.

Replying here for both comments, as they are related.

I fully agree, but I would be more inclined to drop the seqcount. It
would be a game of chance if there was an update in the last few ns of
the 20ms deadline which we might hit or miss due to the presence of an
extended write critical section or the lack of one.

> > 
> > For x86, arch_freq_get_on_cpu() uses the counter deltas and it would be
> > bad if values from different ticks would be used. But here the only
> > benefit of synchronisation is to make sure that we're using the scale
> > factor computed at the last update time. For us, even skipping on the
> > synchronisation logic would still be acceptable, as we'd be ensuring that
> > there was a tick in the past 20ms and we'd always use the most recent
> > value of the frequency scale factor.
> How would we ensure there was a tick in last 20ms ?

I just meant that we'd observe the presence of a tick in the last 20ms
(if there was one) and we don't necessarily need to guarantee that we'd
use the scale factor obtained at that time. We could use the latest, as
you mentioned above as well.

Thanks,
Ionela.
Sumit Gupta March 20, 2024, 4:43 p.m. UTC | #6
On 12/03/24 14:04, Beata Michalska wrote:
> 
> 
> With the Frequency Invariance Engine (FIE) being already wired up with
> sched tick and making use of relevant (core counter and constant
> counter) AMU counters, getting the current frequency for a given CPU
> on supported platforms can be achieved by utilizing the frequency scale
> factor which reflects an average CPU frequency for the last tick period
> length.
> 
> The solution is partially based on APERF/MPERF implementation of
> arch_freq_get_on_cpu.
> 
> Suggested-by: Ionela Voinescu <ionela.voinescu@arm.com>
> Signed-off-by: Beata Michalska <beata.michalska@arm.com>
> ---
>   arch/arm64/kernel/topology.c | 103 +++++++++++++++++++++++++++++++----
>   1 file changed, 92 insertions(+), 11 deletions(-)
> 
> diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
> index 1a2c72f3e7f8..42cb19c31719 100644
> --- a/arch/arm64/kernel/topology.c
> +++ b/arch/arm64/kernel/topology.c
> @@ -17,6 +17,8 @@
>   #include <linux/cpufreq.h>
>   #include <linux/init.h>
>   #include <linux/percpu.h>
> +#include <linux/sched/isolation.h>
> +#include <linux/seqlock_types.h>
> 
>   #include <asm/cpu.h>
>   #include <asm/cputype.h>
> @@ -88,18 +90,31 @@ int __init parse_acpi_topology(void)
>    * initialized.
>    */
>   static DEFINE_PER_CPU_READ_MOSTLY(unsigned long, arch_max_freq_scale) =  1UL << (2 * SCHED_CAPACITY_SHIFT);
> -static DEFINE_PER_CPU(u64, arch_const_cycles_prev);
> -static DEFINE_PER_CPU(u64, arch_core_cycles_prev);
>   static cpumask_var_t amu_fie_cpus;
> 
> +struct amu_cntr_sample {
> +       u64             arch_const_cycles_prev;
> +       u64             arch_core_cycles_prev;
> +       unsigned long   last_update;
> +       seqcount_t      seq;
> +};
> +
> +static DEFINE_PER_CPU_SHARED_ALIGNED(struct amu_cntr_sample, cpu_amu_samples) = {
> +       .seq = SEQCNT_ZERO(cpu_amu_samples.seq)
> +};
> +
>   void update_freq_counters_refs(void)
>   {
> -       this_cpu_write(arch_core_cycles_prev, read_corecnt());
> -       this_cpu_write(arch_const_cycles_prev, read_constcnt());
> +       struct amu_cntr_sample *amu_sample = this_cpu_ptr(&cpu_amu_samples);
> +
> +       amu_sample->arch_core_cycles_prev = read_corecnt();
> +       amu_sample->arch_const_cycles_prev = read_constcnt();
>   }
> 
>   static inline bool freq_counters_valid(int cpu)
>   {
> +       struct amu_cntr_sample *amu_sample = per_cpu_ptr(&cpu_amu_samples, cpu);
> +
>          if ((cpu >= nr_cpu_ids) || !cpumask_test_cpu(cpu, cpu_present_mask))
>                  return false;
> 
> @@ -108,8 +123,8 @@ static inline bool freq_counters_valid(int cpu)
>                  return false;
>          }
> 
> -       if (unlikely(!per_cpu(arch_const_cycles_prev, cpu) ||
> -                    !per_cpu(arch_core_cycles_prev, cpu))) {
> +       if (unlikely(!amu_sample->arch_const_cycles_prev ||
> +                    !amu_sample->arch_core_cycles_prev)) {
>                  pr_debug("CPU%d: cycle counters are not enabled.\n", cpu);
>                  return false;
>          }
> @@ -152,20 +167,27 @@ void freq_inv_set_max_ratio(int cpu, u64 max_rate)
> 
>   static void amu_scale_freq_tick(void)
>   {
> +       struct amu_cntr_sample *amu_sample = this_cpu_ptr(&cpu_amu_samples);
>          u64 prev_core_cnt, prev_const_cnt;
>          u64 core_cnt, const_cnt, scale;
> 
> -       prev_const_cnt = this_cpu_read(arch_const_cycles_prev);
> -       prev_core_cnt = this_cpu_read(arch_core_cycles_prev);
> +       prev_const_cnt = amu_sample->arch_const_cycles_prev;
> +       prev_core_cnt = amu_sample->arch_core_cycles_prev;
> +
> +       write_seqcount_begin(&amu_sample->seq);
> 
>          update_freq_counters_refs();
> 
> -       const_cnt = this_cpu_read(arch_const_cycles_prev);
> -       core_cnt = this_cpu_read(arch_core_cycles_prev);
> +       const_cnt = amu_sample->arch_const_cycles_prev;
> +       core_cnt = amu_sample->arch_core_cycles_prev;
> 
> +       /*
> +        * This should not happen unless the AMUs have been reset and the
> +        * counter values have not been resroted - unlikely
> +        */
>          if (unlikely(core_cnt <= prev_core_cnt ||
>                       const_cnt <= prev_const_cnt))
> -               return;
> +               goto leave;
> 
>          /*
>           *          /\core    arch_max_freq_scale
> @@ -182,6 +204,10 @@ static void amu_scale_freq_tick(void)
> 
>          scale = min_t(unsigned long, scale, SCHED_CAPACITY_SCALE);
>          this_cpu_write(arch_freq_scale, (unsigned long)scale);
> +
> +       amu_sample->last_update = jiffies;
> +leave:
> +       write_seqcount_end(&amu_sample->seq);
>   }
> 
>   static struct scale_freq_data amu_sfd = {
> @@ -189,6 +215,61 @@ static struct scale_freq_data amu_sfd = {
>          .set_freq_scale = amu_scale_freq_tick,
>   };
> 
> +#define AMU_SAMPLE_EXP_MS      20
> +
> +unsigned int arch_freq_get_on_cpu(int cpu)
> +{
> +       struct amu_cntr_sample *amu_sample;
> +       unsigned long last_update;
> +       unsigned int seq;
> +       unsigned int freq;
> +       u64 scale;
> +
> +       if (!cpumask_test_cpu(cpu, amu_fie_cpus) || !arch_scale_freq_ref(cpu))
> +               return 0;
> +
> +retry:
> +       amu_sample = per_cpu_ptr(&cpu_amu_samples, cpu);
> +
> +       do {
> +               seq = raw_read_seqcount_begin(&amu_sample->seq);
> +               last_update = amu_sample->last_update;
> +       } while (read_seqcount_retry(&amu_sample->seq, seq));
> +
> +       /*
> +        * For those CPUs that are in full dynticks mode,
> +        * and those that have not seen tick for a while
> +        * try an alternative source for the counters (and thus freq scale),
> +        * if available for given policy
> +        */
> +       if (time_is_before_jiffies(last_update + msecs_to_jiffies(AMU_SAMPLE_EXP_MS))) {
> +               struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
> +               int ref_cpu = nr_cpu_ids;
> +
> +               if (cpumask_intersects(housekeeping_cpumask(HK_TYPE_TICK),
> +                                      policy->cpus))
> +                       ref_cpu = cpumask_nth_and(cpu, policy->cpus,
> +                                                 housekeeping_cpumask(HK_TYPE_TICK));
> +

This is looking for any other HK CPU within the same policy for counters.
AFAIU, cpumask_nth_and() will return small_cpumask_bits/nr_cpu_ids
if the number of bits in both masks is different. Could you check
again if the current change is fine or needs something like below.
BTW, we have one CPU per policy.

   cpumask_and(&mask, policy->cpus, housekeeping_cpumask(HK_TYPE_TICK));
   retry:
	....
	cpumask_andnot(&mask, &mask, cpumask_of(cpu));
	ref_cpu = cpumask_any(&mask);

Thank you,
Sumit Gupta

> +               cpufreq_cpu_put(policy);
> +               if (ref_cpu >= nr_cpu_ids || ref_cpu == cpu)
> +                       /* No alternative to pull info from */
> +                       return 0;
> +               cpu = ref_cpu;
> +               goto retry;
> +       }
> +       /*
> +        * Reversed computation to the one used to determine
> +        * the arch_freq_scale value
> +        * (see amu_scale_freq_tick for details)
> +        */
> +       scale = arch_scale_freq_capacity(cpu);
> +       freq = scale * arch_scale_freq_ref(cpu);
> +       freq >>= SCHED_CAPACITY_SHIFT;
> +
> +       return freq;
> +}
> +
>   static void amu_fie_setup(const struct cpumask *cpus)
>   {
>          int cpu;
> --
> 2.25.1
>
Beata Michalska April 3, 2024, 9:28 p.m. UTC | #7
On Wed, Mar 20, 2024 at 10:13:18PM +0530, Sumit Gupta wrote:
> 
> 
> On 12/03/24 14:04, Beata Michalska wrote:
> > 
> > 
> > With the Frequency Invariance Engine (FIE) being already wired up with
> > sched tick and making use of relevant (core counter and constant
> > counter) AMU counters, getting the current frequency for a given CPU
> > on supported platforms can be achieved by utilizing the frequency scale
> > factor which reflects an average CPU frequency for the last tick period
> > length.
> > 
> > The solution is partially based on APERF/MPERF implementation of
> > arch_freq_get_on_cpu.
> > 
> > Suggested-by: Ionela Voinescu <ionela.voinescu@arm.com>
> > Signed-off-by: Beata Michalska <beata.michalska@arm.com>
> > ---
> >   arch/arm64/kernel/topology.c | 103 +++++++++++++++++++++++++++++++----
> >   1 file changed, 92 insertions(+), 11 deletions(-)
> > 
> > diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
> > index 1a2c72f3e7f8..42cb19c31719 100644
> > --- a/arch/arm64/kernel/topology.c
> > +++ b/arch/arm64/kernel/topology.c
> > @@ -17,6 +17,8 @@
> >   #include <linux/cpufreq.h>
> >   #include <linux/init.h>
> >   #include <linux/percpu.h>
> > +#include <linux/sched/isolation.h>
> > +#include <linux/seqlock_types.h>
> > 
> >   #include <asm/cpu.h>
> >   #include <asm/cputype.h>
> > @@ -88,18 +90,31 @@ int __init parse_acpi_topology(void)
> >    * initialized.
> >    */
> >   static DEFINE_PER_CPU_READ_MOSTLY(unsigned long, arch_max_freq_scale) =  1UL << (2 * SCHED_CAPACITY_SHIFT);
> > -static DEFINE_PER_CPU(u64, arch_const_cycles_prev);
> > -static DEFINE_PER_CPU(u64, arch_core_cycles_prev);
> >   static cpumask_var_t amu_fie_cpus;
> > 
> > +struct amu_cntr_sample {
> > +       u64             arch_const_cycles_prev;
> > +       u64             arch_core_cycles_prev;
> > +       unsigned long   last_update;
> > +       seqcount_t      seq;
> > +};
> > +
> > +static DEFINE_PER_CPU_SHARED_ALIGNED(struct amu_cntr_sample, cpu_amu_samples) = {
> > +       .seq = SEQCNT_ZERO(cpu_amu_samples.seq)
> > +};
> > +
> >   void update_freq_counters_refs(void)
> >   {
> > -       this_cpu_write(arch_core_cycles_prev, read_corecnt());
> > -       this_cpu_write(arch_const_cycles_prev, read_constcnt());
> > +       struct amu_cntr_sample *amu_sample = this_cpu_ptr(&cpu_amu_samples);
> > +
> > +       amu_sample->arch_core_cycles_prev = read_corecnt();
> > +       amu_sample->arch_const_cycles_prev = read_constcnt();
> >   }
> > 
> >   static inline bool freq_counters_valid(int cpu)
> >   {
> > +       struct amu_cntr_sample *amu_sample = per_cpu_ptr(&cpu_amu_samples, cpu);
> > +
> >          if ((cpu >= nr_cpu_ids) || !cpumask_test_cpu(cpu, cpu_present_mask))
> >                  return false;
> > 
> > @@ -108,8 +123,8 @@ static inline bool freq_counters_valid(int cpu)
> >                  return false;
> >          }
> > 
> > -       if (unlikely(!per_cpu(arch_const_cycles_prev, cpu) ||
> > -                    !per_cpu(arch_core_cycles_prev, cpu))) {
> > +       if (unlikely(!amu_sample->arch_const_cycles_prev ||
> > +                    !amu_sample->arch_core_cycles_prev)) {
> >                  pr_debug("CPU%d: cycle counters are not enabled.\n", cpu);
> >                  return false;
> >          }
> > @@ -152,20 +167,27 @@ void freq_inv_set_max_ratio(int cpu, u64 max_rate)
> > 
> >   static void amu_scale_freq_tick(void)
> >   {
> > +       struct amu_cntr_sample *amu_sample = this_cpu_ptr(&cpu_amu_samples);
> >          u64 prev_core_cnt, prev_const_cnt;
> >          u64 core_cnt, const_cnt, scale;
> > 
> > -       prev_const_cnt = this_cpu_read(arch_const_cycles_prev);
> > -       prev_core_cnt = this_cpu_read(arch_core_cycles_prev);
> > +       prev_const_cnt = amu_sample->arch_const_cycles_prev;
> > +       prev_core_cnt = amu_sample->arch_core_cycles_prev;
> > +
> > +       write_seqcount_begin(&amu_sample->seq);
> > 
> >          update_freq_counters_refs();
> > 
> > -       const_cnt = this_cpu_read(arch_const_cycles_prev);
> > -       core_cnt = this_cpu_read(arch_core_cycles_prev);
> > +       const_cnt = amu_sample->arch_const_cycles_prev;
> > +       core_cnt = amu_sample->arch_core_cycles_prev;
> > 
> > +       /*
> > +        * This should not happen unless the AMUs have been reset and the
> > +        * counter values have not been resroted - unlikely
> > +        */
> >          if (unlikely(core_cnt <= prev_core_cnt ||
> >                       const_cnt <= prev_const_cnt))
> > -               return;
> > +               goto leave;
> > 
> >          /*
> >           *          /\core    arch_max_freq_scale
> > @@ -182,6 +204,10 @@ static void amu_scale_freq_tick(void)
> > 
> >          scale = min_t(unsigned long, scale, SCHED_CAPACITY_SCALE);
> >          this_cpu_write(arch_freq_scale, (unsigned long)scale);
> > +
> > +       amu_sample->last_update = jiffies;
> > +leave:
> > +       write_seqcount_end(&amu_sample->seq);
> >   }
> > 
> >   static struct scale_freq_data amu_sfd = {
> > @@ -189,6 +215,61 @@ static struct scale_freq_data amu_sfd = {
> >          .set_freq_scale = amu_scale_freq_tick,
> >   };
> > 
> > +#define AMU_SAMPLE_EXP_MS      20
> > +
> > +unsigned int arch_freq_get_on_cpu(int cpu)
> > +{
> > +       struct amu_cntr_sample *amu_sample;
> > +       unsigned long last_update;
> > +       unsigned int seq;
> > +       unsigned int freq;
> > +       u64 scale;
> > +
> > +       if (!cpumask_test_cpu(cpu, amu_fie_cpus) || !arch_scale_freq_ref(cpu))
> > +               return 0;
> > +
> > +retry:
> > +       amu_sample = per_cpu_ptr(&cpu_amu_samples, cpu);
> > +
> > +       do {
> > +               seq = raw_read_seqcount_begin(&amu_sample->seq);
> > +               last_update = amu_sample->last_update;
> > +       } while (read_seqcount_retry(&amu_sample->seq, seq));
> > +
> > +       /*
> > +        * For those CPUs that are in full dynticks mode,
> > +        * and those that have not seen tick for a while
> > +        * try an alternative source for the counters (and thus freq scale),
> > +        * if available for given policy
> > +        */
> > +       if (time_is_before_jiffies(last_update + msecs_to_jiffies(AMU_SAMPLE_EXP_MS))) {
> > +               struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
> > +               int ref_cpu = nr_cpu_ids;
> > +
> > +               if (cpumask_intersects(housekeeping_cpumask(HK_TYPE_TICK),
> > +                                      policy->cpus))
> > +                       ref_cpu = cpumask_nth_and(cpu, policy->cpus,
> > +                                                 housekeeping_cpumask(HK_TYPE_TICK));
> > +
> 
> This is looking for any other HK CPU within same policy for counters.
> AFAIU, cpumask_nth_and() will return small_cpumask_bits/nr_cpu_ids
> if the number of bits in both masks is different. Could you check
> again if the current change is fine or needs something like below.
> BTW, we have one CPU per policy.
> 
>   cpumask_and(&mask, policy->cpus, housekeeping_cpumask(HK_TYPE_TICK));
>   retry:
> 	....
> 	cpumask_andnot(&mask, &mask, cpumask_of(cpu));
> 	ref_cpu = cpumask_any(&mask);
>
At this point this is indeed bogus, though for a different reason.
I've rewritten that part a bit; still, this will bail out for single-cpu
policies.

---
BR
Beata


> Thank you,
> Sumit Gupta
> 
> > +               cpufreq_cpu_put(policy);
> > +               if (ref_cpu >= nr_cpu_ids || ref_cpu == cpu)
> > +                       /* No alternative to pull info from */
> > +                       return 0;
> > +               cpu = ref_cpu;
> > +               goto retry;
> > +       }
> > +       /*
> > +        * Reversed computation to the one used to determine
> > +        * the arch_freq_scale value
> > +        * (see amu_scale_freq_tick for details)
> > +        */
> > +       scale = arch_scale_freq_capacity(cpu);
> > +       freq = scale * arch_scale_freq_ref(cpu);
> > +       freq >>= SCHED_CAPACITY_SHIFT;
> > +
> > +       return freq;
> > +}
> > +
> >   static void amu_fie_setup(const struct cpumask *cpus)
> >   {
> >          int cpu;
> > --
> > 2.25.1
> >

Patch

diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index 1a2c72f3e7f8..42cb19c31719 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -17,6 +17,8 @@ 
 #include <linux/cpufreq.h>
 #include <linux/init.h>
 #include <linux/percpu.h>
+#include <linux/sched/isolation.h>
+#include <linux/seqlock_types.h>
 
 #include <asm/cpu.h>
 #include <asm/cputype.h>
@@ -88,18 +90,31 @@  int __init parse_acpi_topology(void)
  * initialized.
  */
 static DEFINE_PER_CPU_READ_MOSTLY(unsigned long, arch_max_freq_scale) =  1UL << (2 * SCHED_CAPACITY_SHIFT);
-static DEFINE_PER_CPU(u64, arch_const_cycles_prev);
-static DEFINE_PER_CPU(u64, arch_core_cycles_prev);
 static cpumask_var_t amu_fie_cpus;
 
+struct amu_cntr_sample {
+	u64		arch_const_cycles_prev;
+	u64		arch_core_cycles_prev;
+	unsigned long	last_update;
+	seqcount_t	seq;
+};
+
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct amu_cntr_sample, cpu_amu_samples) = {
+	.seq = SEQCNT_ZERO(cpu_amu_samples.seq)
+};
+
 void update_freq_counters_refs(void)
 {
-	this_cpu_write(arch_core_cycles_prev, read_corecnt());
-	this_cpu_write(arch_const_cycles_prev, read_constcnt());
+	struct amu_cntr_sample *amu_sample = this_cpu_ptr(&cpu_amu_samples);
+
+	amu_sample->arch_core_cycles_prev = read_corecnt();
+	amu_sample->arch_const_cycles_prev = read_constcnt();
 }
 
 static inline bool freq_counters_valid(int cpu)
 {
+	struct amu_cntr_sample *amu_sample = per_cpu_ptr(&cpu_amu_samples, cpu);
+
 	if ((cpu >= nr_cpu_ids) || !cpumask_test_cpu(cpu, cpu_present_mask))
 		return false;
 
@@ -108,8 +123,8 @@  static inline bool freq_counters_valid(int cpu)
 		return false;
 	}
 
-	if (unlikely(!per_cpu(arch_const_cycles_prev, cpu) ||
-		     !per_cpu(arch_core_cycles_prev, cpu))) {
+	if (unlikely(!amu_sample->arch_const_cycles_prev ||
+		     !amu_sample->arch_core_cycles_prev)) {
 		pr_debug("CPU%d: cycle counters are not enabled.\n", cpu);
 		return false;
 	}
@@ -152,20 +167,27 @@  void freq_inv_set_max_ratio(int cpu, u64 max_rate)
 
 static void amu_scale_freq_tick(void)
 {
+	struct amu_cntr_sample *amu_sample = this_cpu_ptr(&cpu_amu_samples);
 	u64 prev_core_cnt, prev_const_cnt;
 	u64 core_cnt, const_cnt, scale;
 
-	prev_const_cnt = this_cpu_read(arch_const_cycles_prev);
-	prev_core_cnt = this_cpu_read(arch_core_cycles_prev);
+	prev_const_cnt = amu_sample->arch_const_cycles_prev;
+	prev_core_cnt = amu_sample->arch_core_cycles_prev;
+
+	write_seqcount_begin(&amu_sample->seq);
 
 	update_freq_counters_refs();
 
-	const_cnt = this_cpu_read(arch_const_cycles_prev);
-	core_cnt = this_cpu_read(arch_core_cycles_prev);
+	const_cnt = amu_sample->arch_const_cycles_prev;
+	core_cnt = amu_sample->arch_core_cycles_prev;
 
+	/*
+	 * This should not happen unless the AMUs have been reset and the
+	 * counter values have not been resroted - unlikely
+	 */
 	if (unlikely(core_cnt <= prev_core_cnt ||
 		     const_cnt <= prev_const_cnt))
-		return;
+		goto leave;
 
 	/*
 	 *	    /\core    arch_max_freq_scale
@@ -182,6 +204,10 @@  static void amu_scale_freq_tick(void)
 
 	scale = min_t(unsigned long, scale, SCHED_CAPACITY_SCALE);
 	this_cpu_write(arch_freq_scale, (unsigned long)scale);
+
+	amu_sample->last_update = jiffies;
+leave:
+	write_seqcount_end(&amu_sample->seq);
 }
 
 static struct scale_freq_data amu_sfd = {
@@ -189,6 +215,61 @@  static struct scale_freq_data amu_sfd = {
 	.set_freq_scale = amu_scale_freq_tick,
 };
 
+#define AMU_SAMPLE_EXP_MS	20
+
+unsigned int arch_freq_get_on_cpu(int cpu)
+{
+	struct amu_cntr_sample *amu_sample;
+	unsigned long last_update;
+	unsigned int seq;
+	unsigned int freq;
+	u64 scale;
+
+	if (!cpumask_test_cpu(cpu, amu_fie_cpus) || !arch_scale_freq_ref(cpu))
+		return 0;
+
+retry:
+	amu_sample = per_cpu_ptr(&cpu_amu_samples, cpu);
+
+	do {
+		seq = raw_read_seqcount_begin(&amu_sample->seq);
+		last_update = amu_sample->last_update;
+	} while (read_seqcount_retry(&amu_sample->seq, seq));
+
+	/*
+	 * For those CPUs that are in full dynticks mode,
+	 * and those that have not seen tick for a while
+	 * try an alternative source for the counters (and thus freq scale),
+	 * if available for given policy
+	 */
+	if (time_is_before_jiffies(last_update + msecs_to_jiffies(AMU_SAMPLE_EXP_MS))) {
+		struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
+		int ref_cpu = nr_cpu_ids;
+
+		if (cpumask_intersects(housekeeping_cpumask(HK_TYPE_TICK),
+				       policy->cpus))
+			ref_cpu = cpumask_nth_and(cpu, policy->cpus,
+						  housekeeping_cpumask(HK_TYPE_TICK));
+
+		cpufreq_cpu_put(policy);
+		if (ref_cpu >= nr_cpu_ids || ref_cpu == cpu)
+			/* No alternative to pull info from */
+			return 0;
+		cpu = ref_cpu;
+		goto retry;
+	}
+	/*
+	 * Reversed computation to the one used to determine
+	 * the arch_freq_scale value
+	 * (see amu_scale_freq_tick for details)
+	 */
+	scale = arch_scale_freq_capacity(cpu);
+	freq = scale * arch_scale_freq_ref(cpu);
+	freq >>= SCHED_CAPACITY_SHIFT;
+
+	return freq;
+}
+
 static void amu_fie_setup(const struct cpumask *cpus)
 {
 	int cpu;