diff mbox series

[V2] cpufreq: Call transition notifier only once for each policy

Message ID e2b56be446eeb67f1905d2ac6f8d82dd4389d0c5.1552640968.git.viresh.kumar@linaro.org (mailing list archive)
State New, archived
Headers show
Series [V2] cpufreq: Call transition notifier only once for each policy | expand

Commit Message

Viresh Kumar March 15, 2019, 9:13 a.m. UTC
Currently we call these notifiers once for each CPU of the policy->cpus
cpumask. It would be more optimal if the notifier can be called only
once and all the relevant information be provided to it. Out of the 24
drivers that register for the transition notifiers today, only 5 of them
do per-cpu updates and the callback for the rest can be called only once
for the policy without any impact.

This would also avoid multiple function calls to the notifier callbacks
and reduce multiple iterations of notifier core's code (which does
locking as well).

This patch adds pointer to the cpufreq policy to the struct
cpufreq_freqs, so the notifier callback has all the information
available to it with a single call. The five drivers which perform
per-cpu updates are updated to use the cpufreq policy. The freqs->cpu
field is redundant now and is removed.

Acked-by: David S. Miller <davem@davemloft.net> (sparc)
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
V1->V2:
- Add cpufreq policy instead of cpus in struct cpufreq_freqs.
- Use policy->cpus instead of related_cpus everywhere in order not to
  change the existing behavior.
- Merged all 7 patches into a single patch.
- Updated changlog and included Ack from David.

 arch/arm/kernel/smp.c       | 24 +++++++++++++++---------
 arch/arm/kernel/smp_twd.c   |  9 ++++++---
 arch/sparc/kernel/time_64.c | 28 ++++++++++++++++------------
 arch/x86/kernel/tsc.c       | 32 +++++++++++++++++++++-----------
 arch/x86/kvm/x86.c          | 31 ++++++++++++++++++++-----------
 drivers/cpufreq/cpufreq.c   | 19 ++++++++++---------
 include/linux/cpufreq.h     | 14 +++++++-------
 7 files changed, 95 insertions(+), 62 deletions(-)

Comments

Peter Zijlstra March 15, 2019, 12:29 p.m. UTC | #1
On Fri, Mar 15, 2019 at 02:43:07PM +0530, Viresh Kumar wrote:
> diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> index 3fae23834069..cff8779fc0d2 100644
> --- a/arch/x86/kernel/tsc.c
> +++ b/arch/x86/kernel/tsc.c
> @@ -956,28 +956,38 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
>  				void *data)
>  {
>  	struct cpufreq_freqs *freq = data;
> -	unsigned long *lpj;
> -
> -	lpj = &boot_cpu_data.loops_per_jiffy;
> -#ifdef CONFIG_SMP
> -	if (!(freq->flags & CPUFREQ_CONST_LOOPS))
> -		lpj = &cpu_data(freq->cpu).loops_per_jiffy;
> -#endif
> +	struct cpumask *cpus = freq->policy->cpus;
> +	bool boot_cpu = !IS_ENABLED(CONFIG_SMP) || freq->flags & CPUFREQ_CONST_LOOPS;
> +	unsigned long lpj;
> +	int cpu;
>  
>  	if (!ref_freq) {
>  		ref_freq = freq->old;
> -		loops_per_jiffy_ref = *lpj;
>  		tsc_khz_ref = tsc_khz;
> +
> +		if (boot_cpu)
> +			loops_per_jiffy_ref = boot_cpu_data.loops_per_jiffy;
> +		else
> +			loops_per_jiffy_ref = cpu_data(cpumask_first(cpus)).loops_per_jiffy;
>  	}
> +
>  	if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
>  			(val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {
> -		*lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
> -
> +		lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
>  		tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
> +
>  		if (!(freq->flags & CPUFREQ_CONST_LOOPS))
>  			mark_tsc_unstable("cpufreq changes");
>  
> -		set_cyc2ns_scale(tsc_khz, freq->cpu, rdtsc());
> +		if (boot_cpu) {
> +			boot_cpu_data.loops_per_jiffy = lpj;
> +		} else {
> +			for_each_cpu(cpu, cpus)
> +				cpu_data(cpu).loops_per_jiffy = lpj;
> +		}
> +
> +		for_each_cpu(cpu, cpus)
> +			set_cyc2ns_scale(tsc_khz, cpu, rdtsc());

This code doesn't make sense, the rdtsc() _must_ be called on the CPU in
question. That's part of the whole problem here, TSC isn't sync'ed when
it's subject to CPUFREQ.

>  	}
>  
>  	return 0;
Viresh Kumar March 18, 2019, 2:35 a.m. UTC | #2
On 15-03-19, 13:29, Peter Zijlstra wrote:
> On Fri, Mar 15, 2019 at 02:43:07PM +0530, Viresh Kumar wrote:
> > diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> > index 3fae23834069..cff8779fc0d2 100644
> > --- a/arch/x86/kernel/tsc.c
> > +++ b/arch/x86/kernel/tsc.c
> > @@ -956,28 +956,38 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
> >  				void *data)
> >  {
> >  	struct cpufreq_freqs *freq = data;
> > -	unsigned long *lpj;
> > -
> > -	lpj = &boot_cpu_data.loops_per_jiffy;
> > -#ifdef CONFIG_SMP
> > -	if (!(freq->flags & CPUFREQ_CONST_LOOPS))
> > -		lpj = &cpu_data(freq->cpu).loops_per_jiffy;
> > -#endif
> > +	struct cpumask *cpus = freq->policy->cpus;
> > +	bool boot_cpu = !IS_ENABLED(CONFIG_SMP) || freq->flags & CPUFREQ_CONST_LOOPS;
> > +	unsigned long lpj;
> > +	int cpu;
> >  
> >  	if (!ref_freq) {
> >  		ref_freq = freq->old;
> > -		loops_per_jiffy_ref = *lpj;
> >  		tsc_khz_ref = tsc_khz;
> > +
> > +		if (boot_cpu)
> > +			loops_per_jiffy_ref = boot_cpu_data.loops_per_jiffy;
> > +		else
> > +			loops_per_jiffy_ref = cpu_data(cpumask_first(cpus)).loops_per_jiffy;
> >  	}
> > +
> >  	if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
> >  			(val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {
> > -		*lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
> > -
> > +		lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
> >  		tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
> > +
> >  		if (!(freq->flags & CPUFREQ_CONST_LOOPS))
> >  			mark_tsc_unstable("cpufreq changes");
> >  
> > -		set_cyc2ns_scale(tsc_khz, freq->cpu, rdtsc());
> > +		if (boot_cpu) {
> > +			boot_cpu_data.loops_per_jiffy = lpj;
> > +		} else {
> > +			for_each_cpu(cpu, cpus)
> > +				cpu_data(cpu).loops_per_jiffy = lpj;
> > +		}
> > +
> > +		for_each_cpu(cpu, cpus)
> > +			set_cyc2ns_scale(tsc_khz, cpu, rdtsc());
> 
> This code doesn't make sense, the rdtsc() _must_ be called on the CPU in
> question.

You mean rdtsc() must be locally on that CPU? The cpufreq core never guaranteed
that and it was left for the notifier to do. This patch doesn't change the
behavior at all, just that it moves the for-loop to the notifier instead of the
cpufreq core.

> That's part of the whole problem here, TSC isn't sync'ed when
> it's subject to CPUFREQ.
> 
> >  	}
> >  
> >  	return 0;
Rafael J. Wysocki March 18, 2019, 10:45 a.m. UTC | #3
On Fri, Mar 15, 2019 at 1:30 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Fri, Mar 15, 2019 at 02:43:07PM +0530, Viresh Kumar wrote:
> > diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> > index 3fae23834069..cff8779fc0d2 100644
> > --- a/arch/x86/kernel/tsc.c
> > +++ b/arch/x86/kernel/tsc.c
> > @@ -956,28 +956,38 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
> >                               void *data)
> >  {
> >       struct cpufreq_freqs *freq = data;
> > -     unsigned long *lpj;
> > -
> > -     lpj = &boot_cpu_data.loops_per_jiffy;
> > -#ifdef CONFIG_SMP
> > -     if (!(freq->flags & CPUFREQ_CONST_LOOPS))
> > -             lpj = &cpu_data(freq->cpu).loops_per_jiffy;
> > -#endif
> > +     struct cpumask *cpus = freq->policy->cpus;
> > +     bool boot_cpu = !IS_ENABLED(CONFIG_SMP) || freq->flags & CPUFREQ_CONST_LOOPS;
> > +     unsigned long lpj;
> > +     int cpu;
> >
> >       if (!ref_freq) {
> >               ref_freq = freq->old;
> > -             loops_per_jiffy_ref = *lpj;
> >               tsc_khz_ref = tsc_khz;
> > +
> > +             if (boot_cpu)
> > +                     loops_per_jiffy_ref = boot_cpu_data.loops_per_jiffy;
> > +             else
> > +                     loops_per_jiffy_ref = cpu_data(cpumask_first(cpus)).loops_per_jiffy;
> >       }
> > +
> >       if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
> >                       (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {
> > -             *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
> > -
> > +             lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
> >               tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
> > +
> >               if (!(freq->flags & CPUFREQ_CONST_LOOPS))
> >                       mark_tsc_unstable("cpufreq changes");
> >
> > -             set_cyc2ns_scale(tsc_khz, freq->cpu, rdtsc());
> > +             if (boot_cpu) {
> > +                     boot_cpu_data.loops_per_jiffy = lpj;
> > +             } else {
> > +                     for_each_cpu(cpu, cpus)
> > +                             cpu_data(cpu).loops_per_jiffy = lpj;
> > +             }
> > +
> > +             for_each_cpu(cpu, cpus)
> > +                     set_cyc2ns_scale(tsc_khz, cpu, rdtsc());
>
> This code doesn't make sense, the rdtsc() _must_ be called on the CPU in
> question.

Well, strictly speaking the TSC value here comes from the CPU running the code.

The original code has this problem too, though (as Viresh said), so
the patch really doesn't make it worse in that respect. :-)

I'm not going to defend the original code (I ldidn't invent it
anyway), but it clearly assumes that different CPUs cannot run at
different frequencies and that kind of explains what happens in it.

> That's part of the whole problem here, TSC isn't sync'ed when
> it's subject to CPUFREQ.

So what would you recommend us to do here?

Obviously, this won't run on any new hardware.  Frankly, I'm not even
sure what the most recent HW where this hack would make a difference
is (the comment talking about Opterons suggests early 2000s), so this
clearly falls into the "legacy" bucket to me.

Does it make sense to try to preserve it, or can we simply make
cpufreq init fail on the systems where the TSC rate depends on the
frequency?
Peter Zijlstra March 18, 2019, 10:53 a.m. UTC | #4
On Mon, Mar 18, 2019 at 08:05:14AM +0530, Viresh Kumar wrote:
> On 15-03-19, 13:29, Peter Zijlstra wrote:
> > On Fri, Mar 15, 2019 at 02:43:07PM +0530, Viresh Kumar wrote:
> > > diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> > > index 3fae23834069..cff8779fc0d2 100644
> > > --- a/arch/x86/kernel/tsc.c
> > > +++ b/arch/x86/kernel/tsc.c
> > > @@ -956,28 +956,38 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
> > >  				void *data)
> > >  {
> > >  	struct cpufreq_freqs *freq = data;
> > > -	unsigned long *lpj;
> > > -
> > > -	lpj = &boot_cpu_data.loops_per_jiffy;
> > > -#ifdef CONFIG_SMP
> > > -	if (!(freq->flags & CPUFREQ_CONST_LOOPS))
> > > -		lpj = &cpu_data(freq->cpu).loops_per_jiffy;
> > > -#endif
> > > +	struct cpumask *cpus = freq->policy->cpus;
> > > +	bool boot_cpu = !IS_ENABLED(CONFIG_SMP) || freq->flags & CPUFREQ_CONST_LOOPS;
> > > +	unsigned long lpj;
> > > +	int cpu;
> > >  
> > >  	if (!ref_freq) {
> > >  		ref_freq = freq->old;
> > > -		loops_per_jiffy_ref = *lpj;
> > >  		tsc_khz_ref = tsc_khz;
> > > +
> > > +		if (boot_cpu)
> > > +			loops_per_jiffy_ref = boot_cpu_data.loops_per_jiffy;
> > > +		else
> > > +			loops_per_jiffy_ref = cpu_data(cpumask_first(cpus)).loops_per_jiffy;
> > >  	}
> > > +
> > >  	if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
> > >  			(val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {
> > > -		*lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
> > > -
> > > +		lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
> > >  		tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
> > > +
> > >  		if (!(freq->flags & CPUFREQ_CONST_LOOPS))
> > >  			mark_tsc_unstable("cpufreq changes");
> > >  
> > > -		set_cyc2ns_scale(tsc_khz, freq->cpu, rdtsc());
> > > +		if (boot_cpu) {
> > > +			boot_cpu_data.loops_per_jiffy = lpj;
> > > +		} else {
> > > +			for_each_cpu(cpu, cpus)
> > > +				cpu_data(cpu).loops_per_jiffy = lpj;
> > > +		}
> > > +
> > > +		for_each_cpu(cpu, cpus)
> > > +			set_cyc2ns_scale(tsc_khz, cpu, rdtsc());
> > 
> > This code doesn't make sense, the rdtsc() _must_ be called on the CPU in
> > question.
> 
> You mean rdtsc() must be locally on that CPU? The cpufreq core never guaranteed
> that and it was left for the notifier to do. This patch doesn't change the
> behavior at all, just that it moves the for-loop to the notifier instead of the
> cpufreq core.

Yuck..

Rafael; how does this work in practise? Earlier you said that on x86 the
policies typically have a single cpu in them anyway. Is the freq change
also notified from _that_ cpu?

I don't think I have old enough hardware around anymore to test any of
this. This was truly ancient p6 era stuff IIRC.

Because in that case, I'm all for not doing the changes to this notifier
Viresh is proposing but simply adding something like:


	WARN_ON_ONCE(cpumask_weight(cpuc) != 1);
	WARN_ON_ONCE(cpumask_first(cpuc) != smp_processor_id());

And leave it at that.
Peter Zijlstra March 18, 2019, 11:01 a.m. UTC | #5
On Mon, Mar 18, 2019 at 11:45:00AM +0100, Rafael J. Wysocki wrote:
> On Fri, Mar 15, 2019 at 1:30 PM Peter Zijlstra <peterz@infradead.org> wrote:
> > On Fri, Mar 15, 2019 at 02:43:07PM +0530, Viresh Kumar wrote:

> > > +             for_each_cpu(cpu, cpus)
> > > +                     set_cyc2ns_scale(tsc_khz, cpu, rdtsc());
> >
> > This code doesn't make sense, the rdtsc() _must_ be called on the CPU in
> > question.
> 
> Well, strictly speaking the TSC value here comes from the CPU running the code.
> 
> The original code has this problem too, though (as Viresh said), so
> the patch really doesn't make it worse in that respect. :-)
> 
> I'm not going to defend the original code (I ldidn't invent it
> anyway), but it clearly assumes that different CPUs cannot run at
> different frequencies and that kind of explains what happens in it.

The assumption was always that if CPUs ran at different frequencies, the
notifier would run on the affected CPU. After all, only that CPU would
know its frequency changed.

> > That's part of the whole problem here, TSC isn't sync'ed when
> > it's subject to CPUFREQ.
> 
> So what would you recommend us to do here?
> 
> Obviously, this won't run on any new hardware.  Frankly, I'm not even
> sure what the most recent HW where this hack would make a difference
> is (the comment talking about Opterons suggests early 2000s), so this
> clearly falls into the "legacy" bucket to me.
> 
> Does it make sense to try to preserve it, or can we simply make
> cpufreq init fail on the systems where the TSC rate depends on the
> frequency?

I'm all for deleting this and basically dropping support for anything
that needs this, but I suspect some people digging their legacy systems
(*cough* Pavel *cough*) might object to that.

Heck, I'm even ok with just calling panic() when TSC goes wobbly :-) I'm
fed up with all that broken crap. And yes, I know, that's systems sold
today :-(
Rafael J. Wysocki March 18, 2019, 11:09 a.m. UTC | #6
On Mon, Mar 18, 2019 at 11:54 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, Mar 18, 2019 at 08:05:14AM +0530, Viresh Kumar wrote:
> > On 15-03-19, 13:29, Peter Zijlstra wrote:
> > > On Fri, Mar 15, 2019 at 02:43:07PM +0530, Viresh Kumar wrote:
> > > > diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> > > > index 3fae23834069..cff8779fc0d2 100644
> > > > --- a/arch/x86/kernel/tsc.c
> > > > +++ b/arch/x86/kernel/tsc.c
> > > > @@ -956,28 +956,38 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
> > > >                           void *data)
> > > >  {
> > > >   struct cpufreq_freqs *freq = data;
> > > > - unsigned long *lpj;
> > > > -
> > > > - lpj = &boot_cpu_data.loops_per_jiffy;
> > > > -#ifdef CONFIG_SMP
> > > > - if (!(freq->flags & CPUFREQ_CONST_LOOPS))
> > > > -         lpj = &cpu_data(freq->cpu).loops_per_jiffy;
> > > > -#endif
> > > > + struct cpumask *cpus = freq->policy->cpus;
> > > > + bool boot_cpu = !IS_ENABLED(CONFIG_SMP) || freq->flags & CPUFREQ_CONST_LOOPS;
> > > > + unsigned long lpj;
> > > > + int cpu;
> > > >
> > > >   if (!ref_freq) {
> > > >           ref_freq = freq->old;
> > > > -         loops_per_jiffy_ref = *lpj;
> > > >           tsc_khz_ref = tsc_khz;
> > > > +
> > > > +         if (boot_cpu)
> > > > +                 loops_per_jiffy_ref = boot_cpu_data.loops_per_jiffy;
> > > > +         else
> > > > +                 loops_per_jiffy_ref = cpu_data(cpumask_first(cpus)).loops_per_jiffy;
> > > >   }
> > > > +
> > > >   if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
> > > >                   (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {
> > > > -         *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
> > > > -
> > > > +         lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
> > > >           tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
> > > > +
> > > >           if (!(freq->flags & CPUFREQ_CONST_LOOPS))
> > > >                   mark_tsc_unstable("cpufreq changes");
> > > >
> > > > -         set_cyc2ns_scale(tsc_khz, freq->cpu, rdtsc());
> > > > +         if (boot_cpu) {
> > > > +                 boot_cpu_data.loops_per_jiffy = lpj;
> > > > +         } else {
> > > > +                 for_each_cpu(cpu, cpus)
> > > > +                         cpu_data(cpu).loops_per_jiffy = lpj;
> > > > +         }
> > > > +
> > > > +         for_each_cpu(cpu, cpus)
> > > > +                 set_cyc2ns_scale(tsc_khz, cpu, rdtsc());
> > >
> > > This code doesn't make sense, the rdtsc() _must_ be called on the CPU in
> > > question.
> >
> > You mean rdtsc() must be locally on that CPU? The cpufreq core never guaranteed
> > that and it was left for the notifier to do. This patch doesn't change the
> > behavior at all, just that it moves the for-loop to the notifier instead of the
> > cpufreq core.
>
> Yuck..
>
> Rafael; how does this work in practise? Earlier you said that on x86 the
> policies typically have a single cpu in them anyway.

Yes.

> Is the freq change also notified from _that_ cpu?

May not be, depending on what CPU runs the work item/thread changing
the freq.  It generally is not guaranteed to always be the same as the
target CPU.

> I don't think I have old enough hardware around anymore to test any of
> this. This was truly ancient p6 era stuff IIRC.
>
> Because in that case, I'm all for not doing the changes to this notifier
> Viresh is proposing but simply adding something like:
>
>
>         WARN_ON_ONCE(cpumask_weight(cpuc) != 1);
>         WARN_ON_ONCE(cpumask_first(cpuc) != smp_processor_id());
>
> And leave it at that.

That may not work I'm afraid.
Rafael J. Wysocki March 18, 2019, 11:20 a.m. UTC | #7
On Mon, Mar 18, 2019 at 12:09 PM Rafael J. Wysocki <rafael@kernel.org> wrote:
>
> On Mon, Mar 18, 2019 at 11:54 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Mon, Mar 18, 2019 at 08:05:14AM +0530, Viresh Kumar wrote:
> > > On 15-03-19, 13:29, Peter Zijlstra wrote:
> > > > On Fri, Mar 15, 2019 at 02:43:07PM +0530, Viresh Kumar wrote:
> > > > > diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> > > > > index 3fae23834069..cff8779fc0d2 100644
> > > > > --- a/arch/x86/kernel/tsc.c
> > > > > +++ b/arch/x86/kernel/tsc.c
> > > > > @@ -956,28 +956,38 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
> > > > >                           void *data)
> > > > >  {
> > > > >   struct cpufreq_freqs *freq = data;
> > > > > - unsigned long *lpj;
> > > > > -
> > > > > - lpj = &boot_cpu_data.loops_per_jiffy;
> > > > > -#ifdef CONFIG_SMP
> > > > > - if (!(freq->flags & CPUFREQ_CONST_LOOPS))
> > > > > -         lpj = &cpu_data(freq->cpu).loops_per_jiffy;
> > > > > -#endif
> > > > > + struct cpumask *cpus = freq->policy->cpus;
> > > > > + bool boot_cpu = !IS_ENABLED(CONFIG_SMP) || freq->flags & CPUFREQ_CONST_LOOPS;
> > > > > + unsigned long lpj;
> > > > > + int cpu;
> > > > >
> > > > >   if (!ref_freq) {
> > > > >           ref_freq = freq->old;
> > > > > -         loops_per_jiffy_ref = *lpj;
> > > > >           tsc_khz_ref = tsc_khz;
> > > > > +
> > > > > +         if (boot_cpu)
> > > > > +                 loops_per_jiffy_ref = boot_cpu_data.loops_per_jiffy;
> > > > > +         else
> > > > > +                 loops_per_jiffy_ref = cpu_data(cpumask_first(cpus)).loops_per_jiffy;
> > > > >   }
> > > > > +
> > > > >   if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
> > > > >                   (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {
> > > > > -         *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
> > > > > -
> > > > > +         lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
> > > > >           tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
> > > > > +
> > > > >           if (!(freq->flags & CPUFREQ_CONST_LOOPS))
> > > > >                   mark_tsc_unstable("cpufreq changes");
> > > > >
> > > > > -         set_cyc2ns_scale(tsc_khz, freq->cpu, rdtsc());
> > > > > +         if (boot_cpu) {
> > > > > +                 boot_cpu_data.loops_per_jiffy = lpj;
> > > > > +         } else {
> > > > > +                 for_each_cpu(cpu, cpus)
> > > > > +                         cpu_data(cpu).loops_per_jiffy = lpj;
> > > > > +         }
> > > > > +
> > > > > +         for_each_cpu(cpu, cpus)
> > > > > +                 set_cyc2ns_scale(tsc_khz, cpu, rdtsc());
> > > >
> > > > This code doesn't make sense, the rdtsc() _must_ be called on the CPU in
> > > > question.
> > >
> > > You mean rdtsc() must be locally on that CPU? The cpufreq core never guaranteed
> > > that and it was left for the notifier to do. This patch doesn't change the
> > > behavior at all, just that it moves the for-loop to the notifier instead of the
> > > cpufreq core.
> >
> > Yuck..
> >
> > Rafael; how does this work in practise? Earlier you said that on x86 the
> > policies typically have a single cpu in them anyway.
>
> Yes.
>
> > Is the freq change also notified from _that_ cpu?
>
> May not be, depending on what CPU runs the work item/thread changing
> the freq.  It generally is not guaranteed to always be the same as the
> target CPU.

Actually, scratch that.

On x86, with one CPU per cpufreq policy, that will always be the target CPU.

> > I don't think I have old enough hardware around anymore to test any of
> > this. This was truly ancient p6 era stuff IIRC.
> >
> > Because in that case, I'm all for not doing the changes to this notifier
> > Viresh is proposing but simply adding something like:
> >
> >
> >         WARN_ON_ONCE(cpumask_weight(cpuc) != 1);
> >         WARN_ON_ONCE(cpumask_first(cpuc) != smp_processor_id());
> >
> > And leave it at that.
>
> That may not work I'm afraid.

So something like that could work.
Rafael J. Wysocki March 18, 2019, 11:49 a.m. UTC | #8
On Fri, Mar 15, 2019 at 10:13 AM Viresh Kumar <viresh.kumar@linaro.org> wrote:
>
> Currently we call these notifiers once for each CPU of the policy->cpus
> cpumask. It would be more optimal if the notifier can be called only
> once and all the relevant information be provided to it. Out of the 24
> drivers that register for the transition notifiers today, only 5 of them
> do per-cpu updates and the callback for the rest can be called only once
> for the policy without any impact.
>
> This would also avoid multiple function calls to the notifier callbacks
> and reduce multiple iterations of notifier core's code (which does
> locking as well).
>
> This patch adds pointer to the cpufreq policy to the struct
> cpufreq_freqs, so the notifier callback has all the information
> available to it with a single call. The five drivers which perform
> per-cpu updates are updated to use the cpufreq policy. The freqs->cpu
> field is redundant now and is removed.
>
> Acked-by: David S. Miller <davem@davemloft.net> (sparc)
> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
> ---
> V1->V2:
> - Add cpufreq policy instead of cpus in struct cpufreq_freqs.
> - Use policy->cpus instead of related_cpus everywhere in order not to
>   change the existing behavior.
> - Merged all 7 patches into a single patch.
> - Updated changlog and included Ack from David.
>
>  arch/arm/kernel/smp.c       | 24 +++++++++++++++---------
>  arch/arm/kernel/smp_twd.c   |  9 ++++++---
>  arch/sparc/kernel/time_64.c | 28 ++++++++++++++++------------
>  arch/x86/kernel/tsc.c       | 32 +++++++++++++++++++++-----------
>  arch/x86/kvm/x86.c          | 31 ++++++++++++++++++++-----------
>  drivers/cpufreq/cpufreq.c   | 19 ++++++++++---------
>  include/linux/cpufreq.h     | 14 +++++++-------
>  7 files changed, 95 insertions(+), 62 deletions(-)
>
> diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
> index 1d6f5ea522f4..6f6b981fecda 100644
> --- a/arch/arm/kernel/smp.c
> +++ b/arch/arm/kernel/smp.c
> @@ -758,15 +758,20 @@ static int cpufreq_callback(struct notifier_block *nb,
>                                         unsigned long val, void *data)
>  {
>         struct cpufreq_freqs *freq = data;
> -       int cpu = freq->cpu;
> +       struct cpumask *cpus = freq->policy->cpus;
> +       int cpu, first = cpumask_first(cpus);
> +       unsigned int lpj;
>
>         if (freq->flags & CPUFREQ_CONST_LOOPS)
>                 return NOTIFY_OK;
>
> -       if (!per_cpu(l_p_j_ref, cpu)) {
> -               per_cpu(l_p_j_ref, cpu) =
> -                       per_cpu(cpu_data, cpu).loops_per_jiffy;
> -               per_cpu(l_p_j_ref_freq, cpu) = freq->old;
> +       if (!per_cpu(l_p_j_ref, first)) {
> +               for_each_cpu(cpu, cpus) {
> +                       per_cpu(l_p_j_ref, cpu) =
> +                               per_cpu(cpu_data, cpu).loops_per_jiffy;
> +                       per_cpu(l_p_j_ref_freq, cpu) = freq->old;
> +               }
> +
>                 if (!global_l_p_j_ref) {
>                         global_l_p_j_ref = loops_per_jiffy;
>                         global_l_p_j_ref_freq = freq->old;
> @@ -778,10 +783,11 @@ static int cpufreq_callback(struct notifier_block *nb,
>                 loops_per_jiffy = cpufreq_scale(global_l_p_j_ref,
>                                                 global_l_p_j_ref_freq,
>                                                 freq->new);
> -               per_cpu(cpu_data, cpu).loops_per_jiffy =
> -                       cpufreq_scale(per_cpu(l_p_j_ref, cpu),
> -                                       per_cpu(l_p_j_ref_freq, cpu),
> -                                       freq->new);
> +
> +               lpj = cpufreq_scale(per_cpu(l_p_j_ref, first),
> +                                   per_cpu(l_p_j_ref_freq, first), freq->new);
> +               for_each_cpu(cpu, cpus)
> +                       per_cpu(cpu_data, cpu).loops_per_jiffy = lpj;
>         }
>         return NOTIFY_OK;
>  }
> diff --git a/arch/arm/kernel/smp_twd.c b/arch/arm/kernel/smp_twd.c
> index b30eafeef096..495cc7282096 100644
> --- a/arch/arm/kernel/smp_twd.c
> +++ b/arch/arm/kernel/smp_twd.c
> @@ -162,15 +162,18 @@ static int twd_cpufreq_transition(struct notifier_block *nb,
>         unsigned long state, void *data)
>  {
>         struct cpufreq_freqs *freqs = data;
> +       int cpu;
>
>         /*
>          * The twd clock events must be reprogrammed to account for the new
>          * frequency.  The timer is local to a cpu, so cross-call to the
>          * changing cpu.
>          */
> -       if (state == CPUFREQ_POSTCHANGE)
> -               smp_call_function_single(freqs->cpu, twd_update_frequency,
> -                       NULL, 1);
> +       if (state != CPUFREQ_POSTCHANGE)
> +               return NOTIFY_OK;
> +
> +       for_each_cpu(cpu, freqs->policy->cpus)
> +               smp_call_function_single(cpu, twd_update_frequency, NULL, 1);
>
>         return NOTIFY_OK;
>  }
> diff --git a/arch/sparc/kernel/time_64.c b/arch/sparc/kernel/time_64.c
> index 3eb77943ce12..89fb05f90609 100644
> --- a/arch/sparc/kernel/time_64.c
> +++ b/arch/sparc/kernel/time_64.c
> @@ -653,19 +653,23 @@ static int sparc64_cpufreq_notifier(struct notifier_block *nb, unsigned long val
>                                     void *data)
>  {
>         struct cpufreq_freqs *freq = data;
> -       unsigned int cpu = freq->cpu;
> -       struct freq_table *ft = &per_cpu(sparc64_freq_table, cpu);
> +       unsigned int cpu;
> +       struct freq_table *ft;
>
> -       if (!ft->ref_freq) {
> -               ft->ref_freq = freq->old;
> -               ft->clock_tick_ref = cpu_data(cpu).clock_tick;
> -       }
> -       if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
> -           (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {
> -               cpu_data(cpu).clock_tick =
> -                       cpufreq_scale(ft->clock_tick_ref,
> -                                     ft->ref_freq,
> -                                     freq->new);
> +       for_each_cpu(cpu, freq->policy->cpus) {
> +               ft = &per_cpu(sparc64_freq_table, cpu);
> +
> +               if (!ft->ref_freq) {
> +                       ft->ref_freq = freq->old;
> +                       ft->clock_tick_ref = cpu_data(cpu).clock_tick;
> +               }
> +
> +               if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
> +                   (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {
> +                       cpu_data(cpu).clock_tick =
> +                               cpufreq_scale(ft->clock_tick_ref, ft->ref_freq,
> +                                             freq->new);
> +               }
>         }
>
>         return 0;
> diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> index 3fae23834069..cff8779fc0d2 100644
> --- a/arch/x86/kernel/tsc.c
> +++ b/arch/x86/kernel/tsc.c
> @@ -956,28 +956,38 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
>                                 void *data)
>  {
>         struct cpufreq_freqs *freq = data;
> -       unsigned long *lpj;
> -
> -       lpj = &boot_cpu_data.loops_per_jiffy;
> -#ifdef CONFIG_SMP
> -       if (!(freq->flags & CPUFREQ_CONST_LOOPS))
> -               lpj = &cpu_data(freq->cpu).loops_per_jiffy;
> -#endif
> +       struct cpumask *cpus = freq->policy->cpus;
> +       bool boot_cpu = !IS_ENABLED(CONFIG_SMP) || freq->flags & CPUFREQ_CONST_LOOPS;
> +       unsigned long lpj;
> +       int cpu;
>
>         if (!ref_freq) {
>                 ref_freq = freq->old;
> -               loops_per_jiffy_ref = *lpj;
>                 tsc_khz_ref = tsc_khz;
> +
> +               if (boot_cpu)
> +                       loops_per_jiffy_ref = boot_cpu_data.loops_per_jiffy;
> +               else
> +                       loops_per_jiffy_ref = cpu_data(cpumask_first(cpus)).loops_per_jiffy;
>         }
> +
>         if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
>                         (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {
> -               *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
> -
> +               lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
>                 tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
> +
>                 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
>                         mark_tsc_unstable("cpufreq changes");
>
> -               set_cyc2ns_scale(tsc_khz, freq->cpu, rdtsc());
> +               if (boot_cpu) {
> +                       boot_cpu_data.loops_per_jiffy = lpj;
> +               } else {
> +                       for_each_cpu(cpu, cpus)
> +                               cpu_data(cpu).loops_per_jiffy = lpj;
> +               }
> +
> +               for_each_cpu(cpu, cpus)
> +                       set_cyc2ns_scale(tsc_khz, cpu, rdtsc());

To summarize, I think that it would be sufficient to do this just for
policy->cpu and, as Peter said, warn once if there are more CPUs in
the policy or policy->cpu is not the CPU running this code.  And mark
the TSC as unstable in both of these cases.
Viresh Kumar March 19, 2019, 5:50 a.m. UTC | #9
On 18-03-19, 12:49, Rafael J. Wysocki wrote:
> To summarize, I think that it would be sufficient to do this just for
> policy->cpu and, as Peter said, warn once if there are more CPUs in
> the policy or policy->cpu is not the CPU running this code.  And mark
> the TSC as unstable in both of these cases.

How about this ?

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 3fae23834069..4d3681cfb6e0 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -958,10 +958,13 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
        struct cpufreq_freqs *freq = data;
        unsigned long *lpj;
 
+       if (WARN_ON_ONCE(cpumask_weight(freq->policy->related_cpus) != 1))
+               mark_tsc_unstable("cpufreq policy has more than CPU");
+
        lpj = &boot_cpu_data.loops_per_jiffy;
 #ifdef CONFIG_SMP
        if (!(freq->flags & CPUFREQ_CONST_LOOPS))
-               lpj = &cpu_data(freq->cpu).loops_per_jiffy;
+               lpj = &cpu_data(freq->policy->cpu).loops_per_jiffy;
 #endif
 
        if (!ref_freq) {
@@ -977,7 +980,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
                if (!(freq->flags & CPUFREQ_CONST_LOOPS))
                        mark_tsc_unstable("cpufreq changes");
 
-               set_cyc2ns_scale(tsc_khz, freq->cpu, rdtsc());
+               set_cyc2ns_scale(tsc_khz, freq->policy->cpu, rdtsc());
        }
 
        return 0;
Rafael J. Wysocki March 19, 2019, 9:41 a.m. UTC | #10
On Tue, Mar 19, 2019 at 6:50 AM Viresh Kumar <viresh.kumar@linaro.org> wrote:
>
> On 18-03-19, 12:49, Rafael J. Wysocki wrote:
> > To summarize, I think that it would be sufficient to do this just for
> > policy->cpu and, as Peter said, warn once if there are more CPUs in
> > the policy or policy->cpu is not the CPU running this code.  And mark
> > the TSC as unstable in both of these cases.
>
> How about this ?

We guarantee that this will always run on policy->cpu IIRC, so it LGTM
overall, but the new message is missing "one".

> diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> index 3fae23834069..4d3681cfb6e0 100644
> --- a/arch/x86/kernel/tsc.c
> +++ b/arch/x86/kernel/tsc.c
> @@ -958,10 +958,13 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
>         struct cpufreq_freqs *freq = data;
>         unsigned long *lpj;
>
> +       if (WARN_ON_ONCE(cpumask_weight(freq->policy->related_cpus) != 1))
> +               mark_tsc_unstable("cpufreq policy has more than CPU");

Also I would check policy->cpus here.  After all, we don't care about
CPUs that are never online.

And the message could be something like "cpufreq changes: related CPUs
affected".

> +
>         lpj = &boot_cpu_data.loops_per_jiffy;
>  #ifdef CONFIG_SMP
>         if (!(freq->flags & CPUFREQ_CONST_LOOPS))
> -               lpj = &cpu_data(freq->cpu).loops_per_jiffy;
> +               lpj = &cpu_data(freq->policy->cpu).loops_per_jiffy;
>  #endif
>
>         if (!ref_freq) {
> @@ -977,7 +980,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
>                 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
>                         mark_tsc_unstable("cpufreq changes");
>
> -               set_cyc2ns_scale(tsc_khz, freq->cpu, rdtsc());
> +               set_cyc2ns_scale(tsc_khz, freq->policy->cpu, rdtsc());
>         }
>
>         return 0;
Viresh Kumar March 19, 2019, 10:49 a.m. UTC | #11
On 19-03-19, 10:41, Rafael J. Wysocki wrote:
> On Tue, Mar 19, 2019 at 6:50 AM Viresh Kumar <viresh.kumar@linaro.org> wrote:
> >
> > On 18-03-19, 12:49, Rafael J. Wysocki wrote:
> > > To summarize, I think that it would be sufficient to do this just for
> > > policy->cpu and, as Peter said, warn once if there are more CPUs in
> > > the policy or policy->cpu is not the CPU running this code.  And mark
> > > the TSC as unstable in both of these cases.
> >
> > How about this ?
> 
> We guarantee that this will always run on policy->cpu IIRC, so it LGTM

Yeah, the governor guarantees that unless dvfs_possible_from_any_cpu is set for
the policy. But there are few direct invocations to cpufreq_driver_target() and
__cpufreq_driver_target() which don't take that into account. First one is done
from cpufreq_online(), which can get called on any CPU I believe. Other one is
from cpufreq_generic_suspend(). But I think none of them gets called for x86 and
so below code should be safe.

> overall, but the new message is missing "one".

Talking about print message ?

> > diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> > index 3fae23834069..4d3681cfb6e0 100644
> > --- a/arch/x86/kernel/tsc.c
> > +++ b/arch/x86/kernel/tsc.c
> > @@ -958,10 +958,13 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
> >         struct cpufreq_freqs *freq = data;
> >         unsigned long *lpj;
> >
> > +       if (WARN_ON_ONCE(cpumask_weight(freq->policy->related_cpus) != 1))
> > +               mark_tsc_unstable("cpufreq policy has more than CPU");
> 
> Also I would check policy->cpus here.  After all, we don't care about
> CPUs that are never online.

Because the CPU isn't in the policy->cpus mask, we can't say it was *never*
online. Just that it isn't online at that moment of time. I used related_cpus as
the code here should be robust against any corner cases and shouldn't have
different behavior based on value of policy->cpus.

If the cpufreq driver is probed after all but one CPUs of a policy are offlined,
then you won't see the warning if policy->cpus is used. But the warning will
appear if any other CPU is onlined. For me that is wrong, we should have got the
warning earlier as well as it was just wrong to not warn earlier.

> And the message could be something like "cpufreq changes: related CPUs
> affected".

Sure.

I also forgot to add a "return" statement here. We shouldn't continue in this
case, right ?

> > +
> >         lpj = &boot_cpu_data.loops_per_jiffy;
> >  #ifdef CONFIG_SMP
> >         if (!(freq->flags & CPUFREQ_CONST_LOOPS))
> > -               lpj = &cpu_data(freq->cpu).loops_per_jiffy;
> > +               lpj = &cpu_data(freq->policy->cpu).loops_per_jiffy;
> >  #endif
> >
> >         if (!ref_freq) {
> > @@ -977,7 +980,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
> >                 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
> >                         mark_tsc_unstable("cpufreq changes");
> >
> > -               set_cyc2ns_scale(tsc_khz, freq->cpu, rdtsc());
> > +               set_cyc2ns_scale(tsc_khz, freq->policy->cpu, rdtsc());
> >         }
> >
> >         return 0;
Rafael J. Wysocki March 19, 2019, 3:44 p.m. UTC | #12
On Tue, Mar 19, 2019 at 11:49 AM Viresh Kumar <viresh.kumar@linaro.org> wrote:
>
> On 19-03-19, 10:41, Rafael J. Wysocki wrote:
> > On Tue, Mar 19, 2019 at 6:50 AM Viresh Kumar <viresh.kumar@linaro.org> wrote:
> > >
> > > On 18-03-19, 12:49, Rafael J. Wysocki wrote:
> > > > To summarize, I think that it would be sufficient to do this just for
> > > > policy->cpu and, as Peter said, warn once if there are more CPUs in
> > > > the policy or policy->cpu is not the CPU running this code.  And mark
> > > > the TSC as unstable in both of these cases.
> > >
> > > How about this ?
> >
> > We guarantee that this will always run on policy->cpu IIRC, so it LGTM
>
> Yeah, the governor guarantees that unless dvfs_possible_from_any_cpu is set for
> the policy. But there are few direct invocations to cpufreq_driver_target() and
> __cpufreq_driver_target() which don't take that into account. First one is done
> from cpufreq_online(), which can get called on any CPU I believe. Other one is
> from cpufreq_generic_suspend(). But I think none of them gets called for x86 and
> so below code should be safe.

I meant on x86, sorry.

> > overall, but the new message is missing "one".
>
> Talking about print message ?

Yes.

> > > diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> > > index 3fae23834069..4d3681cfb6e0 100644
> > > --- a/arch/x86/kernel/tsc.c
> > > +++ b/arch/x86/kernel/tsc.c
> > > @@ -958,10 +958,13 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
> > >         struct cpufreq_freqs *freq = data;
> > >         unsigned long *lpj;
> > >
> > > +       if (WARN_ON_ONCE(cpumask_weight(freq->policy->related_cpus) != 1))
> > > +               mark_tsc_unstable("cpufreq policy has more than CPU");
> >
> > Also I would check policy->cpus here.  After all, we don't care about
> > CPUs that are never online.
>
> Because the CPU isn't in the policy->cpus mask, we can't say it was *never*
> online. Just that it isn't online at that moment of time. I used related_cpus as
> the code here should be robust against any corner cases and shouldn't have
> different behavior based on value of policy->cpus.
>
> If the cpufreq driver is probed after all but one CPUs of a policy are offlined,
> then you won't see the warning if policy->cpus is used. But the warning will
> appear if any other CPU is onlined. For me that is wrong, we should have got the
> warning earlier as well as it was just wrong to not warn earlier.

Fair enough.

> > And the message could be something like "cpufreq changes: related CPUs
> > affected".
>
> Sure.
>
> I also forgot to add a "return" statement here. We shouldn't continue in this
> case, right ?

It makes a little sense to continue then, so it's better to return
immediately in that case IMO.

> > > +
> > >         lpj = &boot_cpu_data.loops_per_jiffy;
> > >  #ifdef CONFIG_SMP
> > >         if (!(freq->flags & CPUFREQ_CONST_LOOPS))
> > > -               lpj = &cpu_data(freq->cpu).loops_per_jiffy;
> > > +               lpj = &cpu_data(freq->policy->cpu).loops_per_jiffy;
> > >  #endif
> > >
> > >         if (!ref_freq) {
> > > @@ -977,7 +980,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
> > >                 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
> > >                         mark_tsc_unstable("cpufreq changes");
> > >
> > > -               set_cyc2ns_scale(tsc_khz, freq->cpu, rdtsc());
> > > +               set_cyc2ns_scale(tsc_khz, freq->policy->cpu, rdtsc());
> > >         }
> > >
> > >         return 0;
diff mbox series

Patch

diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index 1d6f5ea522f4..6f6b981fecda 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -758,15 +758,20 @@  static int cpufreq_callback(struct notifier_block *nb,
 					unsigned long val, void *data)
 {
 	struct cpufreq_freqs *freq = data;
-	int cpu = freq->cpu;
+	struct cpumask *cpus = freq->policy->cpus;
+	int cpu, first = cpumask_first(cpus);
+	unsigned int lpj;
 
 	if (freq->flags & CPUFREQ_CONST_LOOPS)
 		return NOTIFY_OK;
 
-	if (!per_cpu(l_p_j_ref, cpu)) {
-		per_cpu(l_p_j_ref, cpu) =
-			per_cpu(cpu_data, cpu).loops_per_jiffy;
-		per_cpu(l_p_j_ref_freq, cpu) = freq->old;
+	if (!per_cpu(l_p_j_ref, first)) {
+		for_each_cpu(cpu, cpus) {
+			per_cpu(l_p_j_ref, cpu) =
+				per_cpu(cpu_data, cpu).loops_per_jiffy;
+			per_cpu(l_p_j_ref_freq, cpu) = freq->old;
+		}
+
 		if (!global_l_p_j_ref) {
 			global_l_p_j_ref = loops_per_jiffy;
 			global_l_p_j_ref_freq = freq->old;
@@ -778,10 +783,11 @@  static int cpufreq_callback(struct notifier_block *nb,
 		loops_per_jiffy = cpufreq_scale(global_l_p_j_ref,
 						global_l_p_j_ref_freq,
 						freq->new);
-		per_cpu(cpu_data, cpu).loops_per_jiffy =
-			cpufreq_scale(per_cpu(l_p_j_ref, cpu),
-					per_cpu(l_p_j_ref_freq, cpu),
-					freq->new);
+
+		lpj = cpufreq_scale(per_cpu(l_p_j_ref, first),
+				    per_cpu(l_p_j_ref_freq, first), freq->new);
+		for_each_cpu(cpu, cpus)
+			per_cpu(cpu_data, cpu).loops_per_jiffy = lpj;
 	}
 	return NOTIFY_OK;
 }
diff --git a/arch/arm/kernel/smp_twd.c b/arch/arm/kernel/smp_twd.c
index b30eafeef096..495cc7282096 100644
--- a/arch/arm/kernel/smp_twd.c
+++ b/arch/arm/kernel/smp_twd.c
@@ -162,15 +162,18 @@  static int twd_cpufreq_transition(struct notifier_block *nb,
 	unsigned long state, void *data)
 {
 	struct cpufreq_freqs *freqs = data;
+	int cpu;
 
 	/*
 	 * The twd clock events must be reprogrammed to account for the new
 	 * frequency.  The timer is local to a cpu, so cross-call to the
 	 * changing cpu.
 	 */
-	if (state == CPUFREQ_POSTCHANGE)
-		smp_call_function_single(freqs->cpu, twd_update_frequency,
-			NULL, 1);
+	if (state != CPUFREQ_POSTCHANGE)
+		return NOTIFY_OK;
+
+	for_each_cpu(cpu, freqs->policy->cpus)
+		smp_call_function_single(cpu, twd_update_frequency, NULL, 1);
 
 	return NOTIFY_OK;
 }
diff --git a/arch/sparc/kernel/time_64.c b/arch/sparc/kernel/time_64.c
index 3eb77943ce12..89fb05f90609 100644
--- a/arch/sparc/kernel/time_64.c
+++ b/arch/sparc/kernel/time_64.c
@@ -653,19 +653,23 @@  static int sparc64_cpufreq_notifier(struct notifier_block *nb, unsigned long val
 				    void *data)
 {
 	struct cpufreq_freqs *freq = data;
-	unsigned int cpu = freq->cpu;
-	struct freq_table *ft = &per_cpu(sparc64_freq_table, cpu);
+	unsigned int cpu;
+	struct freq_table *ft;
 
-	if (!ft->ref_freq) {
-		ft->ref_freq = freq->old;
-		ft->clock_tick_ref = cpu_data(cpu).clock_tick;
-	}
-	if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
-	    (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {
-		cpu_data(cpu).clock_tick =
-			cpufreq_scale(ft->clock_tick_ref,
-				      ft->ref_freq,
-				      freq->new);
+	for_each_cpu(cpu, freq->policy->cpus) {
+		ft = &per_cpu(sparc64_freq_table, cpu);
+
+		if (!ft->ref_freq) {
+			ft->ref_freq = freq->old;
+			ft->clock_tick_ref = cpu_data(cpu).clock_tick;
+		}
+
+		if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
+		    (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {
+			cpu_data(cpu).clock_tick =
+				cpufreq_scale(ft->clock_tick_ref, ft->ref_freq,
+					      freq->new);
+		}
 	}
 
 	return 0;
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 3fae23834069..cff8779fc0d2 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -956,28 +956,38 @@  static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
 				void *data)
 {
 	struct cpufreq_freqs *freq = data;
-	unsigned long *lpj;
-
-	lpj = &boot_cpu_data.loops_per_jiffy;
-#ifdef CONFIG_SMP
-	if (!(freq->flags & CPUFREQ_CONST_LOOPS))
-		lpj = &cpu_data(freq->cpu).loops_per_jiffy;
-#endif
+	struct cpumask *cpus = freq->policy->cpus;
+	bool boot_cpu = !IS_ENABLED(CONFIG_SMP) || freq->flags & CPUFREQ_CONST_LOOPS;
+	unsigned long lpj;
+	int cpu;
 
 	if (!ref_freq) {
 		ref_freq = freq->old;
-		loops_per_jiffy_ref = *lpj;
 		tsc_khz_ref = tsc_khz;
+
+		if (boot_cpu)
+			loops_per_jiffy_ref = boot_cpu_data.loops_per_jiffy;
+		else
+			loops_per_jiffy_ref = cpu_data(cpumask_first(cpus)).loops_per_jiffy;
 	}
+
 	if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
 			(val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {
-		*lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
-
+		lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
 		tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
+
 		if (!(freq->flags & CPUFREQ_CONST_LOOPS))
 			mark_tsc_unstable("cpufreq changes");
 
-		set_cyc2ns_scale(tsc_khz, freq->cpu, rdtsc());
+		if (boot_cpu) {
+			boot_cpu_data.loops_per_jiffy = lpj;
+		} else {
+			for_each_cpu(cpu, cpus)
+				cpu_data(cpu).loops_per_jiffy = lpj;
+		}
+
+		for_each_cpu(cpu, cpus)
+			set_cyc2ns_scale(tsc_khz, cpu, rdtsc());
 	}
 
 	return 0;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 941f932373d0..653c7da11647 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6648,10 +6648,8 @@  static void kvm_hyperv_tsc_notifier(void)
 }
 #endif
 
-static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
-				     void *data)
+static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu)
 {
-	struct cpufreq_freqs *freq = data;
 	struct kvm *kvm;
 	struct kvm_vcpu *vcpu;
 	int i, send_ipi = 0;
@@ -6695,17 +6693,12 @@  static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
 	 *
 	 */
 
-	if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
-		return 0;
-	if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
-		return 0;
-
-	smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
+	smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
 
 	spin_lock(&kvm_lock);
 	list_for_each_entry(kvm, &vm_list, vm_list) {
 		kvm_for_each_vcpu(i, vcpu, kvm) {
-			if (vcpu->cpu != freq->cpu)
+			if (vcpu->cpu != cpu)
 				continue;
 			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 			if (vcpu->cpu != smp_processor_id())
@@ -6727,8 +6720,24 @@  static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
 		 * guest context is entered kvmclock will be updated,
 		 * so the guest will not see stale values.
 		 */
-		smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
+		smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
 	}
+}
+
+static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
+				     void *data)
+{
+	struct cpufreq_freqs *freq = data;
+	int cpu;
+
+	if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
+		return 0;
+	if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
+		return 0;
+
+	for_each_cpu(cpu, freq->policy->cpus)
+		__kvmclock_cpufreq_notifier(freq, cpu);
+
 	return 0;
 }
 
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index e10922709d13..fba38bf27d26 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -300,11 +300,14 @@  static void cpufreq_notify_transition(struct cpufreq_policy *policy,
 				      struct cpufreq_freqs *freqs,
 				      unsigned int state)
 {
+	int cpu;
+
 	BUG_ON(irqs_disabled());
 
 	if (cpufreq_disabled())
 		return;
 
+	freqs->policy = policy;
 	freqs->flags = cpufreq_driver->flags;
 	pr_debug("notification %u of frequency transition to %u kHz\n",
 		 state, freqs->new);
@@ -324,10 +327,8 @@  static void cpufreq_notify_transition(struct cpufreq_policy *policy,
 			}
 		}
 
-		for_each_cpu(freqs->cpu, policy->cpus) {
-			srcu_notifier_call_chain(&cpufreq_transition_notifier_list,
-						 CPUFREQ_PRECHANGE, freqs);
-		}
+		srcu_notifier_call_chain(&cpufreq_transition_notifier_list,
+					 CPUFREQ_PRECHANGE, freqs);
 
 		adjust_jiffies(CPUFREQ_PRECHANGE, freqs);
 		break;
@@ -337,11 +338,11 @@  static void cpufreq_notify_transition(struct cpufreq_policy *policy,
 		pr_debug("FREQ: %u - CPUs: %*pbl\n", freqs->new,
 			 cpumask_pr_args(policy->cpus));
 
-		for_each_cpu(freqs->cpu, policy->cpus) {
-			trace_cpu_frequency(freqs->new, freqs->cpu);
-			srcu_notifier_call_chain(&cpufreq_transition_notifier_list,
-						 CPUFREQ_POSTCHANGE, freqs);
-		}
+		for_each_cpu(cpu, policy->cpus)
+			trace_cpu_frequency(freqs->new, cpu);
+
+		srcu_notifier_call_chain(&cpufreq_transition_notifier_list,
+					 CPUFREQ_POSTCHANGE, freqs);
 
 		cpufreq_stats_record_transition(policy, freqs->new);
 		policy->cur = freqs->new;
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index b160e98076e3..e327523ddff2 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -42,13 +42,6 @@  enum cpufreq_table_sorting {
 	CPUFREQ_TABLE_SORTED_DESCENDING
 };
 
-struct cpufreq_freqs {
-	unsigned int cpu;	/* cpu nr */
-	unsigned int old;
-	unsigned int new;
-	u8 flags;		/* flags of cpufreq_driver, see below. */
-};
-
 struct cpufreq_cpuinfo {
 	unsigned int		max_freq;
 	unsigned int		min_freq;
@@ -156,6 +149,13 @@  struct cpufreq_policy {
 	struct thermal_cooling_device *cdev;
 };
 
+struct cpufreq_freqs {
+	struct cpufreq_policy *policy;
+	unsigned int old;
+	unsigned int new;
+	u8 flags;		/* flags of cpufreq_driver, see below. */
+};
+
 /* Only for ACPI */
 #define CPUFREQ_SHARED_TYPE_NONE (0) /* None */
 #define CPUFREQ_SHARED_TYPE_HW	 (1) /* HW does needed coordination */