diff mbox series

[V3,3/3] arm64: topology: Make AMUs work with modular cpufreq drivers

Message ID 8f0fe23d1c9effed71d5660c939472d43726a61b.1608010334.git.viresh.kumar@linaro.org (mailing list archive)
State New, archived
Headers show
Series [V3,1/3] arm64: topology: Avoid the have_policy check | expand

Commit Message

Viresh Kumar Dec. 15, 2020, 5:34 a.m. UTC
The AMU counters won't get used today if the cpufreq driver is built as
a module as the amu core requires everything to be ready by late init.

Fix that properly by registering for cpufreq policy notifier. Note that
the amu core doesn't have any cpufreq dependency after the first time
CPUFREQ_CREATE_POLICY notifier is called for all the CPUs. And so we
don't need to do anything on the CPUFREQ_REMOVE_POLICY notifier. And for
the same reason we check if the CPUs are already parsed in the beginning
of amu_fie_setup() and skip if that is true. Alternatively we can shoot
a work from there to unregister the notifier instead, but that seemed
too much instead of this simple check.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
V3:
- This is a new patch.

Ionela,

I don't have a way to test the AMU stuff, will it be possible for you to
give it a try ?

 arch/arm64/kernel/topology.c | 88 +++++++++++++++++++-----------------
 1 file changed, 46 insertions(+), 42 deletions(-)

Comments

Ionela Voinescu Dec. 15, 2020, 11:56 a.m. UTC | #1
Hi,

On Tuesday 15 Dec 2020 at 11:04:16 (+0530), Viresh Kumar wrote:
> The AMU counters won't get used today if the cpufreq driver is built as
> a module as the amu core requires everything to be ready by late init.
> 
> Fix that properly by registering for cpufreq policy notifier. Note that
> the amu core don't have any cpufreq dependency after the first time
> CPUFREQ_CREATE_POLICY notifier is called for all the CPUs. And so we
> don't need to do anything on the CPUFREQ_REMOVE_POLICY notifier. And for
> the same reason we check if the CPUs are already parsed in the beginning
> of amu_fie_setup() and skip if that is true. Alternatively we can shoot
> a work from there to unregister the notifier instead, but that seemed
> too much instead of this simple check.
> 
> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
> ---
> V3:
> - This is a new patch.
> 
> Ionela,
> 
> I don't have a way to test the AMU stuff, will it be possible for you to
> give it a try ?
> 

I'll review this new patch and then I'll give the full set a try.

Thanks for the interest in improving this,
Ionela.
Ionela Voinescu Dec. 16, 2020, 12:03 a.m. UTC | #2
Hi,

On Tuesday 15 Dec 2020 at 11:04:16 (+0530), Viresh Kumar wrote:
> The AMU counters won't get used today if the cpufreq driver is built as
> a module as the amu core requires everything to be ready by late init.
> 
> Fix that properly by registering for cpufreq policy notifier. Note that
> the amu core don't have any cpufreq dependency after the first time
> CPUFREQ_CREATE_POLICY notifier is called for all the CPUs. And so we
> don't need to do anything on the CPUFREQ_REMOVE_POLICY notifier. And for
> the same reason we check if the CPUs are already parsed in the beginning
> of amu_fie_setup() and skip if that is true. Alternatively we can shoot
> a work from there to unregister the notifier instead, but that seemed
> too much instead of this simple check.
> 
> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
> ---
> V3:
> - This is a new patch.
> 
> Ionela,
> 
> I don't have a way to test the AMU stuff, will it be possible for you to
> give it a try ?
> 

My best AMU test setup is a hacked Juno :). A few runs with different
"AMU settings" showed everything works nicely. I'll continue reviewing
and testing tomorrow as I want to test with CPPC and on some models as
well.

>  arch/arm64/kernel/topology.c | 88 +++++++++++++++++++-----------------
>  1 file changed, 46 insertions(+), 42 deletions(-)
> 
> diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
> index 57267d694495..0fc2b32ddb45 100644
> --- a/arch/arm64/kernel/topology.c
> +++ b/arch/arm64/kernel/topology.c
> @@ -199,64 +199,33 @@ static int freq_inv_set_max_ratio(int cpu, u64 max_rate, u64 ref_rate)
>  	return 0;
>  }
>  
> -static inline void
> -enable_policy_freq_counters(int cpu, cpumask_var_t valid_cpus)
> -{
> -	struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
> -
> -	if (!policy) {
> -		pr_debug("CPU%d: No cpufreq policy found.\n", cpu);
> -		return;
> -	}
> -
> -	if (cpumask_subset(policy->related_cpus, valid_cpus))
> -		cpumask_or(amu_fie_cpus, policy->related_cpus,
> -			   amu_fie_cpus);
> -
> -	cpufreq_cpu_put(policy);
> -}
> -
>  static DEFINE_STATIC_KEY_FALSE(amu_fie_key);
>  #define amu_freq_invariant() static_branch_unlikely(&amu_fie_key)
>  
> -static int __init init_amu_fie(void)
> +static void amu_fie_setup(const struct cpumask *cpus)
>  {
> -	cpumask_var_t valid_cpus;
>  	bool invariant;
> -	int ret = 0;
>  	int cpu;
>  
> -	if (!zalloc_cpumask_var(&valid_cpus, GFP_KERNEL))
> -		return -ENOMEM;
> -
> -	if (!zalloc_cpumask_var(&amu_fie_cpus, GFP_KERNEL)) {
> -		ret = -ENOMEM;
> -		goto free_valid_mask;
> -	}
> +	/* We are already set since the last insmod of cpufreq driver */
> +	if (unlikely(cpumask_subset(cpus, amu_fie_cpus)))
> +		return;
>  
> -	for_each_present_cpu(cpu) {
> +	for_each_cpu(cpu, cpus) {
>  		if (!freq_counters_valid(cpu) ||
>  		    freq_inv_set_max_ratio(cpu,
>  					   cpufreq_get_hw_max_freq(cpu) * 1000,
>  					   arch_timer_get_rate()))
> -			continue;
> -
> -		cpumask_set_cpu(cpu, valid_cpus);
> -		enable_policy_freq_counters(cpu, valid_cpus);
> +			return;
>  	}
>  
> -	/* Overwrite amu_fie_cpus if all CPUs support AMU */
> -	if (cpumask_equal(valid_cpus, cpu_present_mask))
> -		cpumask_copy(amu_fie_cpus, cpu_present_mask);
> -
> -	if (cpumask_empty(amu_fie_cpus))
> -		goto free_valid_mask;
> +	cpumask_or(amu_fie_cpus, amu_fie_cpus, cpus);
>  
>  	invariant = topology_scale_freq_invariant();
>  
>  	/* We aren't fully invariant yet */
>  	if (!invariant && !cpumask_equal(amu_fie_cpus, cpu_present_mask))
> -		goto free_valid_mask;
> +		return;
>  
>  	static_branch_enable(&amu_fie_key);
>  

If we are cpufreq invariant, we'll reach this point and the following
pr_info, even if not all CPUs have been checked, which (at this late
hour) I think it's functionally fine.

But we get prints like:

[    2.665918] AMU: CPUs[0-3]: counters will be used for FIE.
[    2.666293] AMU: CPUs[0-5]: counters will be used for FIE.

For two policies this is fine (although confusing) but if we had more
CPUs and more policies, it would be too many lines.

I'm not sure if there's a better way of fixing this other than keeping
track of all visited CPUs and printing this line when all online CPUs
have been visited.

> @@ -271,13 +240,48 @@ static int __init init_amu_fie(void)
>  	 */
>  	if (!invariant)
>  		rebuild_sched_domains_energy();
> +}
> +
> +static int init_amu_fie_callback(struct notifier_block *nb, unsigned long val,
> +				 void *data)
> +{
> +	struct cpufreq_policy *policy = data;
> +
> +	if (val == CPUFREQ_CREATE_POLICY)
> +		amu_fie_setup(policy->related_cpus);
> +

Is it guaranteed that cpuinfo.max_freq is always set at this point?

I have a vague recollection of a scenario where cpuinfo.max_freq could
be populated later in case the driver needs to talk to firmware to
obtain this value.

The setup above will fail if the CPU's max frequency cannot be obtained.

Thanks,
Ionela.

> +	/*
> +	 * We don't need to handle CPUFREQ_REMOVE_POLICY event as the AMU
> +	 * counters don't have any dependency on cpufreq driver once we have
> +	 * initialized AMU support and enabled invariance. The AMU counters will
> +	 * keep on working just fine in the absence of the cpufreq driver, and
> +	 * for the CPUs for which there are no counters availalbe, the last set
> +	 * value of freq_scale will remain valid as that is the frequency those
> +	 * CPUs are running at.
> +	 */
> +
> +	return 0;
> +}
> +
> +static struct notifier_block init_amu_fie_notifier = {
> +	.notifier_call = init_amu_fie_callback,
> +};
> +
> +static int __init init_amu_fie(void)
> +{
> +	int ret;
> +
> +	if (!zalloc_cpumask_var(&amu_fie_cpus, GFP_KERNEL))
> +		return -ENOMEM;
>  
> -free_valid_mask:
> -	free_cpumask_var(valid_cpus);
> +	ret = cpufreq_register_notifier(&init_amu_fie_notifier,
> +					CPUFREQ_POLICY_NOTIFIER);
> +	if (ret)
> +		free_cpumask_var(amu_fie_cpus);
>  
>  	return ret;
>  }
> -late_initcall_sync(init_amu_fie);
> +core_initcall(init_amu_fie);
>  
>  bool arch_freq_counters_available(const struct cpumask *cpus)
>  {
> -- 
> 2.25.0.rc1.19.g042ed3e048af
>
Viresh Kumar Dec. 16, 2020, 4:38 a.m. UTC | #3
On 16-12-20, 00:03, Ionela Voinescu wrote:
> Hi,
> 
> On Tuesday 15 Dec 2020 at 11:04:16 (+0530), Viresh Kumar wrote:
> > The AMU counters won't get used today if the cpufreq driver is built as
> > a module as the amu core requires everything to be ready by late init.
> > 
> > Fix that properly by registering for cpufreq policy notifier. Note that
> > the amu core don't have any cpufreq dependency after the first time
> > CPUFREQ_CREATE_POLICY notifier is called for all the CPUs. And so we
> > don't need to do anything on the CPUFREQ_REMOVE_POLICY notifier. And for
> > the same reason we check if the CPUs are already parsed in the beginning
> > of amu_fie_setup() and skip if that is true. Alternatively we can shoot
> > a work from there to unregister the notifier instead, but that seemed
> > too much instead of this simple check.
> > 
> > Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
> > ---
> > V3:
> > - This is a new patch.
> > 
> > Ionela,
> > 
> > I don't have a way to test the AMU stuff, will it be possible for you to
> > give it a try ?
> > 
> 
> My best AMU test setup is a hacked Juno :).

Everyone is hacking around :)

> A few runs with different
> "AMU settings" showed everything works nicely. I'll continue reviewing
> and testing tomorrow as I want to test with CPPC and on some models as
> well.
> 
> >  arch/arm64/kernel/topology.c | 88 +++++++++++++++++++-----------------
> >  1 file changed, 46 insertions(+), 42 deletions(-)
> > 
> > diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
> > index 57267d694495..0fc2b32ddb45 100644
> > --- a/arch/arm64/kernel/topology.c
> > +++ b/arch/arm64/kernel/topology.c
> > @@ -199,64 +199,33 @@ static int freq_inv_set_max_ratio(int cpu, u64 max_rate, u64 ref_rate)
> >  	return 0;
> >  }
> >  
> > -static inline void
> > -enable_policy_freq_counters(int cpu, cpumask_var_t valid_cpus)
> > -{
> > -	struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
> > -
> > -	if (!policy) {
> > -		pr_debug("CPU%d: No cpufreq policy found.\n", cpu);
> > -		return;
> > -	}
> > -
> > -	if (cpumask_subset(policy->related_cpus, valid_cpus))
> > -		cpumask_or(amu_fie_cpus, policy->related_cpus,
> > -			   amu_fie_cpus);
> > -
> > -	cpufreq_cpu_put(policy);
> > -}
> > -
> >  static DEFINE_STATIC_KEY_FALSE(amu_fie_key);
> >  #define amu_freq_invariant() static_branch_unlikely(&amu_fie_key)
> >  
> > -static int __init init_amu_fie(void)
> > +static void amu_fie_setup(const struct cpumask *cpus)
> >  {
> > -	cpumask_var_t valid_cpus;
> >  	bool invariant;
> > -	int ret = 0;
> >  	int cpu;
> >  
> > -	if (!zalloc_cpumask_var(&valid_cpus, GFP_KERNEL))
> > -		return -ENOMEM;
> > -
> > -	if (!zalloc_cpumask_var(&amu_fie_cpus, GFP_KERNEL)) {
> > -		ret = -ENOMEM;
> > -		goto free_valid_mask;
> > -	}
> > +	/* We are already set since the last insmod of cpufreq driver */
> > +	if (unlikely(cpumask_subset(cpus, amu_fie_cpus)))
> > +		return;
> >  
> > -	for_each_present_cpu(cpu) {
> > +	for_each_cpu(cpu, cpus) {
> >  		if (!freq_counters_valid(cpu) ||
> >  		    freq_inv_set_max_ratio(cpu,
> >  					   cpufreq_get_hw_max_freq(cpu) * 1000,
> >  					   arch_timer_get_rate()))
> > -			continue;
> > -
> > -		cpumask_set_cpu(cpu, valid_cpus);
> > -		enable_policy_freq_counters(cpu, valid_cpus);
> > +			return;
> >  	}
> >  
> > -	/* Overwrite amu_fie_cpus if all CPUs support AMU */
> > -	if (cpumask_equal(valid_cpus, cpu_present_mask))
> > -		cpumask_copy(amu_fie_cpus, cpu_present_mask);
> > -
> > -	if (cpumask_empty(amu_fie_cpus))
> > -		goto free_valid_mask;
> > +	cpumask_or(amu_fie_cpus, amu_fie_cpus, cpus);
> >  
> >  	invariant = topology_scale_freq_invariant();
> >  
> >  	/* We aren't fully invariant yet */
> >  	if (!invariant && !cpumask_equal(amu_fie_cpus, cpu_present_mask))
> > -		goto free_valid_mask;
> > +		return;
> >  
> >  	static_branch_enable(&amu_fie_key);
> >  
> 
> If we are cpufreq invariant, we'll reach this point and the following
> pr_info, even if not all CPUs have been checked, which (at this late
> hour) I think it's functionally fine.

Even then I think we should print cpus here instead of amu_fie_cpus,
just to not repeat things..

> But we get prints like:
> 
> [    2.665918] AMU: CPUs[0-3]: counters will be used for FIE.
> [    2.666293] AMU: CPUs[0-5]: counters will be used for FIE.
> 
> For two policies this is fine (although confusing) but if we had more
> CPUs and more policies, it would be too many lines.
> 
> I'm not sure if there's a better way of fixing this other than keeping
> track of all visited CPUs and printing this line when all online CPUs
> have been visited.

This is at best a debug message and maybe we should just convert it to
that and then it should be fine ?

And any logic added to print a better message isn't worth it I
believe.

> > @@ -271,13 +240,48 @@ static int __init init_amu_fie(void)
> >  	 */
> >  	if (!invariant)
> >  		rebuild_sched_domains_energy();
> > +}
> > +
> > +static int init_amu_fie_callback(struct notifier_block *nb, unsigned long val,
> > +				 void *data)
> > +{
> > +	struct cpufreq_policy *policy = data;
> > +
> > +	if (val == CPUFREQ_CREATE_POLICY)
> > +		amu_fie_setup(policy->related_cpus);
> > +
> 
> Is is guaranteed that cpuinfo.max_freq is always set at this point?

Yes, we call this after the policy is initialized and all basic setup
is done.

> I have a vague recollection of a scenario where cpuinfo.max_freq could
> be populated later in case the driver needs to talk to firmware to
> obtain this value.

I don't remember anything like that for sure, we can see what to do if
we ever come across such a scenario.

> The setup above will fail if the CPU's max frequency cannot be obtained.

Yeah, I agree. Shouldn't happen though.
Ionela Voinescu Dec. 16, 2020, 7:37 p.m. UTC | #4
Hi,

On Wednesday 16 Dec 2020 at 10:08:05 (+0530), Viresh Kumar wrote:
[..]
> > > +static void amu_fie_setup(const struct cpumask *cpus)
> > >  {
> > > -	cpumask_var_t valid_cpus;
> > >  	bool invariant;
> > > -	int ret = 0;
> > >  	int cpu;
> > >  
> > > -	if (!zalloc_cpumask_var(&valid_cpus, GFP_KERNEL))
> > > -		return -ENOMEM;
> > > -
> > > -	if (!zalloc_cpumask_var(&amu_fie_cpus, GFP_KERNEL)) {
> > > -		ret = -ENOMEM;
> > > -		goto free_valid_mask;
> > > -	}
> > > +	/* We are already set since the last insmod of cpufreq driver */
> > > +	if (unlikely(cpumask_subset(cpus, amu_fie_cpus)))
> > > +		return;
> > >  
> > > -	for_each_present_cpu(cpu) {
> > > +	for_each_cpu(cpu, cpus) {
> > >  		if (!freq_counters_valid(cpu) ||
> > >  		    freq_inv_set_max_ratio(cpu,
> > >  					   cpufreq_get_hw_max_freq(cpu) * 1000,
> > >  					   arch_timer_get_rate()))
> > > -			continue;
> > > -
> > > -		cpumask_set_cpu(cpu, valid_cpus);
> > > -		enable_policy_freq_counters(cpu, valid_cpus);
> > > +			return;
> > >  	}
> > >  
> > > -	/* Overwrite amu_fie_cpus if all CPUs support AMU */
> > > -	if (cpumask_equal(valid_cpus, cpu_present_mask))
> > > -		cpumask_copy(amu_fie_cpus, cpu_present_mask);
> > > -
> > > -	if (cpumask_empty(amu_fie_cpus))
> > > -		goto free_valid_mask;
> > > +	cpumask_or(amu_fie_cpus, amu_fie_cpus, cpus);
> > >  
> > >  	invariant = topology_scale_freq_invariant();
> > >  
> > >  	/* We aren't fully invariant yet */
> > >  	if (!invariant && !cpumask_equal(amu_fie_cpus, cpu_present_mask))
> > > -		goto free_valid_mask;
> > > +		return;
> > >  
> > >  	static_branch_enable(&amu_fie_key);
> > >  
> > 
> > If we are cpufreq invariant, we'll reach this point and the following
> > pr_info, even if not all CPUs have been checked, which (at this late
> > hour) I think it's functionally fine.
> 
> Even then I think we should print cpus here instead of amu_fie_cpus,
> just to not repeat things..
> 

Agreed!

> > But we get prints like:
> > 
> > [    2.665918] AMU: CPUs[0-3]: counters will be used for FIE.
> > [    2.666293] AMU: CPUs[0-5]: counters will be used for FIE.
> > 
> > For two policies this is fine (although confusing) but if we had more
> > CPUs and more policies, it would be too many lines.
> > 
> > I'm not sure if there's a better way of fixing this other than keeping
> > track of all visited CPUs and printing this line when all online CPUs
> > have been visited.
> 
> This is at best a debug message and maybe we should just convert it to
> that and then it should be fine ?
> 
> And any logic added to print a better message isn't worth it I
> believe.
> 

That's true. The reason I would have wanted a single line to mention the
use of counters for FIE is because it's difficult to notice frequency
invariance issues. Everything might seem to work fine but without
or with broken frequency invariance the scheduler would not behave
optimally.

But we did the same compromise for EAS so making this a debug message is
fine I think.

> > > @@ -271,13 +240,48 @@ static int __init init_amu_fie(void)
> > >  	 */
> > >  	if (!invariant)
> > >  		rebuild_sched_domains_energy();
> > > +}
> > > +
> > > +static int init_amu_fie_callback(struct notifier_block *nb, unsigned long val,
> > > +				 void *data)
> > > +{
> > > +	struct cpufreq_policy *policy = data;
> > > +
> > > +	if (val == CPUFREQ_CREATE_POLICY)
> > > +		amu_fie_setup(policy->related_cpus);
> > > +
> > 
> > Is is guaranteed that cpuinfo.max_freq is always set at this point?
> 
> Yes, we call this after the policy is initialized and all basic setup
> is done.
> 
> > I have a vague recollection of a scenario where cpuinfo.max_freq could
> > be populated later in case the driver needs to talk to firmware to
> > obtain this value.
> 
> I don't remember anything like that for sure, we can see what to do if
> we ever come across such a scenario.
> 

Makes sense!

> > The setup above will fail if the CPU's max frequency cannot be obtained.
> 
> Yeah, I agree. Shouldn't happen though.
> 

> > > +	/*
> > > +	 * We don't need to handle CPUFREQ_REMOVE_POLICY event as the AMU
> > > +	 * counters don't have any dependency on cpufreq driver once we have
> > > +	 * initialized AMU support and enabled invariance. The AMU counters will
> > > +	 * keep on working just fine in the absence of the cpufreq driver, and
> > > +	 * for the CPUs for which there are no counters availalbe, the last set
                                                        ^^^^^^^^^
							available

> > > +	 * value of freq_scale will remain valid as that is the frequency those
> > > +	 * CPUs are running at.
> +	 */

I did not yet test this, but reading this comment made me wonder..

arch_scale_freq_invariant() (or topology_scale_freq_invariant()) is also
called from schedutil when obtaining the next frequency.

So if we had a system that only partly supports AMUs but had at some
point a cpufreq driver that provided FIE for the other CPUs, when we
unregister the driver, the cpufreq_freq_invariance static key is
disabled. Therefore, none of the conditions for system invariance is
now accomplished and arch_scale_freq_invariant() will return false.
This will be broken as utilization is still scaled, but the algorithm
for computing the next frequency in schedutil will not take this into
account.

[..]
> > > +	ret = cpufreq_register_notifier(&init_amu_fie_notifier,
> > > +					CPUFREQ_POLICY_NOTIFIER);

The above makes the use of AMUs for FIE tightly coupled with cpufreq.

Initially I made cpufreq_get_hw_max_freq(cpu) a weak function for the
possible platforms that might not use a cpufreq driver and might want to
provide this function to still benefit from the use of counters for
frequency invariance.

But I'm starting to believe that supporting all these corner-cases in
advance just introduces messiness.

So feel free to remove the 'weak' state of cpufreq_get_hw_max_freq() as
well, so we don't keep wondering why we had that in the first place.
It would not make any sense keeping that in light of these changes.

P.S. I will be on holiday starting tomorrow until beginning of January.
Were you intending this for 5.11, or can I take more time to review
future versions and continue testing?

Thanks,
Ionela.

> -- 
> viresh
Viresh Kumar Dec. 17, 2020, 10:50 a.m. UTC | #5
On 16-12-20, 19:37, Ionela Voinescu wrote:
> I did not yet test this, but reading this comment made me wonder..
> 
> arch_scale_freq_invariant() (or topology_scale_freq_invariant()) is also
> called from schedutil when obtaining the next frequency.
> 
> So if we had a system that only partly supports AMUs but had at some
> point a cpufreq driver that provided FIE for the other CPUs, when we
> unregister the driver, the cpufreq_freq_invariance static key is
> disabled. Therefore, none of the conditions for system invariance is
> now accomplished and arch_scale_freq_invariant() will return false.
> This will be broken as utilization is still scaled, but the algorithm
> for computing the next frequency in schedutil will not take this into
> account.

I think the best and the easiest solution for this is:

bool arch_freq_counters_available(const struct cpumask *cpus)
{
        return amu_freq_invariant();
}

But we probably need to rename it to something like arch_is_fie().

> 
> [..]
> > > > +	ret = cpufreq_register_notifier(&init_amu_fie_notifier,
> > > > +					CPUFREQ_POLICY_NOTIFIER);
> 
> The above makes the use of AMUs for FIE tightly coupled with cpufreq.
> 
> Initially I made cpufreq_get_hw_max_freq(cpu) a weak function for the
> possible platforms that might not use a cpufreq driver and might want to
> provide this function to still benefit from the use of counters for
> frequency invariance.
> 
> But I'm starting to believe that supporting all these corner-cases in
> advance just introduces messiness.
> 
> So feel free to remove the 'weak' state of cpufreq_get_hw_max_freq() as
> well, so we don't keep wondering why we had that in the first place.
> It would not make any sense keeping that in light of these changes.

Will do it in a separate patch then.

> P.S. I will be on holiday starting tomorrow until beginning of January.
> Were you intending this for 5.11, or can I take more time to review
> future versions and continue testing?

I wanted to  :)
Ionela Voinescu Jan. 8, 2021, 9:44 a.m. UTC | #6
On Thursday 17 Dec 2020 at 16:20:49 (+0530), Viresh Kumar wrote:
> On 16-12-20, 19:37, Ionela Voinescu wrote:
> > I did not yet test this, but reading this comment made me wonder..
> > 
> > arch_scale_freq_invariant() (or topology_scale_freq_invariant()) is also
> > called from schedutil when obtaining the next frequency.
> > 
> > So if we had a system that only partly supports AMUs but had at some
> > point a cpufreq driver that provided FIE for the other CPUs, when we
> > unregister the driver, the cpufreq_freq_invariance static key is
> > disabled. Therefore, none of the conditions for system invariance is
> > now accomplished and arch_scale_freq_invariant() will return false.
> > This will be broken as utilization is still scaled, but the algorithm
> > for computing the next frequency in schedutil will not take this into
> > account.
> 
> I think the best and the easiest solution for this is:
> 
> bool arch_freq_counters_available(const struct cpumask *cpus)
> {
>         return amu_freq_invariant();
> }
> 
> But we probably need to rename it to something like arch_is_fie().
> 

Now that I think of it again (after spending 30 minutes trying to come
up with a more clear solution) I realised this is not actually a
problem :).

The only location that checks the invariance status is schedutil, but
what a cpufreq governor does becomes irrelevant if you remove the
cpufreq driver. The only potential problem is if one then inmods a
cpufreq driver that's not invariant. But I think that might be one "if"
too many to consider. What do you think?

Thanks,
Ionela.
Ionela Voinescu Jan. 8, 2021, 10:42 a.m. UTC | #7
On Friday 08 Jan 2021 at 09:44:16 (+0000), Ionela Voinescu wrote:
> On Thursday 17 Dec 2020 at 16:20:49 (+0530), Viresh Kumar wrote:
> > On 16-12-20, 19:37, Ionela Voinescu wrote:
> > > I did not yet test this, but reading this comment made me wonder..
> > > 
> > > arch_scale_freq_invariant() (or topology_scale_freq_invariant()) is also
> > > called from schedutil when obtaining the next frequency.
> > > 
> > > So if we had a system that only partly supports AMUs but had at some
> > > point a cpufreq driver that provided FIE for the other CPUs, when we
> > > unregister the driver, the cpufreq_freq_invariance static key is
> > > disabled. Therefore, none of the conditions for system invariance is
> > > now accomplished and arch_scale_freq_invariant() will return false.
> > > This will be broken as utilization is still scaled, but the algorithm
> > > for computing the next frequency in schedutil will not take this into
> > > account.
> > 
> > I think the best and the easiest solution for this is:
> > 
> > bool arch_freq_counters_available(const struct cpumask *cpus)
> > {
> >         return amu_freq_invariant();
> > }
> > 
> > But we probably need to rename it to something like arch_is_fie().
> > 

Forgot to answer this one:

arch_freq_counters_available() is also used in arch_set_freq_scale() to
tell us not only if the arch is FI, but also to tell us if the AMUs are
used for FI for some particular CPUs. So we couldn't easily rewrite this
one, or do it in a way that would be worth it.

Ionela.

> 
> Now that I think of it again (after spending 30 minutes trying to come
> up with a more clear solution) I realised this is not actually a
> problem :).
> 
> The only location that checks the invariance status is schedutil, but
> what a cpufreq governor does becomes irrelevant if you remove the
> cpufreq driver. The only potential problem is if one then inmods a
> cpufreq driver that's not invariant. But I think that might be on "if"
> too many to consider. What do you think?
> 
> Thanks,
> Ionela.
Viresh Kumar Jan. 8, 2021, 11:03 a.m. UTC | #8
On 08-01-21, 09:44, Ionela Voinescu wrote:
> Now that I think of it again (after spending 30 minutes trying to come
> up with a more clear solution) I realised this is not actually a
> problem :).
> 
> The only location that checks the invariance status is schedutil, but
> what a cpufreq governor does becomes irrelevant if you remove the
> cpufreq driver.

Good catch :)

> The only potential problem is if one then inmods a
> cpufreq driver that's not invariant. But I think that might be on "if"
> too many to consider. What do you think?

Yeah, there is no need to worry about this then I think.

I will resend the patches soon.
diff mbox series

Patch

diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index 57267d694495..0fc2b32ddb45 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -199,64 +199,33 @@  static int freq_inv_set_max_ratio(int cpu, u64 max_rate, u64 ref_rate)
 	return 0;
 }
 
-static inline void
-enable_policy_freq_counters(int cpu, cpumask_var_t valid_cpus)
-{
-	struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
-
-	if (!policy) {
-		pr_debug("CPU%d: No cpufreq policy found.\n", cpu);
-		return;
-	}
-
-	if (cpumask_subset(policy->related_cpus, valid_cpus))
-		cpumask_or(amu_fie_cpus, policy->related_cpus,
-			   amu_fie_cpus);
-
-	cpufreq_cpu_put(policy);
-}
-
 static DEFINE_STATIC_KEY_FALSE(amu_fie_key);
 #define amu_freq_invariant() static_branch_unlikely(&amu_fie_key)
 
-static int __init init_amu_fie(void)
+static void amu_fie_setup(const struct cpumask *cpus)
 {
-	cpumask_var_t valid_cpus;
 	bool invariant;
-	int ret = 0;
 	int cpu;
 
-	if (!zalloc_cpumask_var(&valid_cpus, GFP_KERNEL))
-		return -ENOMEM;
-
-	if (!zalloc_cpumask_var(&amu_fie_cpus, GFP_KERNEL)) {
-		ret = -ENOMEM;
-		goto free_valid_mask;
-	}
+	/* We are already set since the last insmod of cpufreq driver */
+	if (unlikely(cpumask_subset(cpus, amu_fie_cpus)))
+		return;
 
-	for_each_present_cpu(cpu) {
+	for_each_cpu(cpu, cpus) {
 		if (!freq_counters_valid(cpu) ||
 		    freq_inv_set_max_ratio(cpu,
 					   cpufreq_get_hw_max_freq(cpu) * 1000,
 					   arch_timer_get_rate()))
-			continue;
-
-		cpumask_set_cpu(cpu, valid_cpus);
-		enable_policy_freq_counters(cpu, valid_cpus);
+			return;
 	}
 
-	/* Overwrite amu_fie_cpus if all CPUs support AMU */
-	if (cpumask_equal(valid_cpus, cpu_present_mask))
-		cpumask_copy(amu_fie_cpus, cpu_present_mask);
-
-	if (cpumask_empty(amu_fie_cpus))
-		goto free_valid_mask;
+	cpumask_or(amu_fie_cpus, amu_fie_cpus, cpus);
 
 	invariant = topology_scale_freq_invariant();
 
 	/* We aren't fully invariant yet */
 	if (!invariant && !cpumask_equal(amu_fie_cpus, cpu_present_mask))
-		goto free_valid_mask;
+		return;
 
 	static_branch_enable(&amu_fie_key);
 
@@ -271,13 +240,48 @@  static int __init init_amu_fie(void)
 	 */
 	if (!invariant)
 		rebuild_sched_domains_energy();
+}
+
+static int init_amu_fie_callback(struct notifier_block *nb, unsigned long val,
+				 void *data)
+{
+	struct cpufreq_policy *policy = data;
+
+	if (val == CPUFREQ_CREATE_POLICY)
+		amu_fie_setup(policy->related_cpus);
+
+	/*
+	 * We don't need to handle CPUFREQ_REMOVE_POLICY event as the AMU
+	 * counters don't have any dependency on cpufreq driver once we have
+	 * initialized AMU support and enabled invariance. The AMU counters will
+	 * keep on working just fine in the absence of the cpufreq driver, and
+	 * for the CPUs for which there are no counters available, the last set
+	 * value of freq_scale will remain valid as that is the frequency those
+	 * CPUs are running at.
+	 */
+
+	return 0;
+}
+
+static struct notifier_block init_amu_fie_notifier = {
+	.notifier_call = init_amu_fie_callback,
+};
+
+static int __init init_amu_fie(void)
+{
+	int ret;
+
+	if (!zalloc_cpumask_var(&amu_fie_cpus, GFP_KERNEL))
+		return -ENOMEM;
 
-free_valid_mask:
-	free_cpumask_var(valid_cpus);
+	ret = cpufreq_register_notifier(&init_amu_fie_notifier,
+					CPUFREQ_POLICY_NOTIFIER);
+	if (ret)
+		free_cpumask_var(amu_fie_cpus);
 
 	return ret;
 }
-late_initcall_sync(init_amu_fie);
+core_initcall(init_amu_fie);
 
 bool arch_freq_counters_available(const struct cpumask *cpus)
 {