diff mbox

x86: Calculate MHz using APERF/MPERF for cpuinfo and scaling_cur_freq

Message ID 52f711be59539723358bea1aa3c368910a68b46d.1459485198.git.len.brown@intel.com (mailing list archive)
State Not Applicable, archived
Headers show

Commit Message

Len Brown April 1, 2016, 4:37 a.m. UTC
From: Len Brown <len.brown@intel.com>

For x86 processors with APERF/MPERF and TSC,
return meaningful and consistent MHz in
/proc/cpuinfo and
/sys/devices/system/cpu/cpu*/cpufreq/scaling_cur_freq

MHz is computed like so:

MHz = base_MHz * delta_APERF / delta_MPERF

MHz is the average frequency of the busy processor
over a measurement interval.  The interval is
defined to be the time between successive reads
of the frequency on that processor, whether from
/proc/cpuinfo or from sysfs cpufreq/scaling_cur_freq.
As with previous methods of calculating MHz,
idle time is excluded.

base_MHz above is from TSC calibration global "cpu_khz".

This x86 native method to calculate MHz returns a meaningful result
no matter if P-states are controlled by hardware or firmware
and/or the Linux cpufreq sub-system is/is-not installed.

Note that frequent or concurrent reads of /proc/cpuinfo
or sysfs cpufreq/scaling_cur_freq will shorten the
measurement interval seen by each reader.  The code
mitigates that issue by caching results for 100ms.

Discerning users are encouraged to take advantage of
the turbostat(8) utility, which can gracefully handle
concurrent measurement intervals of arbitrary length.

Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/kernel/cpu/Makefile     |  1 +
 arch/x86/kernel/cpu/aperfmperf.c | 76 ++++++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/proc.c       |  4 ++-
 drivers/cpufreq/cpufreq.c        |  7 +++-
 include/linux/cpufreq.h          | 13 +++++++
 5 files changed, 99 insertions(+), 2 deletions(-)
 create mode 100644 arch/x86/kernel/cpu/aperfmperf.c

Comments

Thomas Gleixner April 1, 2016, 7:56 a.m. UTC | #1
On Fri, 1 Apr 2016, Len Brown wrote:
> +/*
> + * aperfmperf_snapshot_khz()
> + * On the current CPU, snapshot APERF, MPERF, and jiffies
> + * unless we already did it within 100ms
> + * calculate kHz, save snapshot
> + */
> +static void aperfmperf_snapshot_khz(void *dummy)
> +{
> +	unsigned long long aperf, aperf_delta;
> +	unsigned long long mperf, mperf_delta;
> +	unsigned long long numerator;
> +	struct aperfmperf_sample *s = &get_cpu_var(samples);

this_cpu_ptr is sufficient. That's a smp function call ...

> +
> +	/* Cache KHz for 100 ms */
> +	if (time_before(jiffies, s->jiffies + HZ/10))
> +		goto out;
> +
> +	rdmsrl(MSR_IA32_APERF, aperf);
> +	rdmsrl(MSR_IA32_MPERF, mperf);
> +
> +	aperf_delta = aperf - s->aperf;
> +	mperf_delta = mperf - s->mperf;
> +
> +	/*
> +	 * There is no architectural guarantee that MPERF
> +	 * increments faster than we can read it.
> +	 */
> +	if (mperf_delta == 0)
> +		goto out;
> +
> +	numerator = cpu_khz * aperf_delta;
> +	s->khz = div64_u64(numerator, mperf_delta);
> +	s->jiffies = jiffies;
> +	s->aperf = aperf;
> +	s->mperf = mperf;
> +
> +out:
> +	put_cpu_var(samples);
> +}
> +
> +unsigned int aperfmperf_khz_on_cpu(int cpu)
> +{
> +	if (!cpu_khz)
> +		return 0;
> +
> +	if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
> +		return 0;
> +
> +	smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1);

You can avoid the function call if you check s->jiffies here.

Thanks,

	tglx
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Peter Zijlstra April 1, 2016, 8:03 a.m. UTC | #2
On Fri, Apr 01, 2016 at 12:37:00AM -0400, Len Brown wrote:
> diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
> new file mode 100644
> index 0000000..9380102
> --- /dev/null
> +++ b/arch/x86/kernel/cpu/aperfmperf.c
> @@ -0,0 +1,76 @@
> +/*
> + * x86 APERF/MPERF KHz calculation
> + * Used by /proc/cpuinfo and /sys/.../cpufreq/scaling_cur_freq
> + *
> + * Copyright (C) 2015 Intel Corp.
> + * Author: Len Brown <len.brown@intel.com>
> + *
> + * This file is licensed under GPLv2.
> + */
> +
> +#include <linux/jiffies.h>
> +#include <linux/math64.h>
> +#include <linux/percpu.h>
> +#include <linux/smp.h>
> +
> +struct aperfmperf_sample {
> +	unsigned int khz;
> +	unsigned long jiffies;
> +	unsigned long long aperf;
> +	unsigned long long mperf;
> +};
> +
> +static DEFINE_PER_CPU(struct aperfmperf_sample, samples);
> +
> +/*
> + * aperfmperf_snapshot_khz()
> + * On the current CPU, snapshot APERF, MPERF, and jiffies
> + * unless we already did it within 100ms
> + * calculate kHz, save snapshot
> + */
> +static void aperfmperf_snapshot_khz(void *dummy)
> +{
> +	unsigned long long aperf, aperf_delta;
> +	unsigned long long mperf, mperf_delta;
> +	unsigned long long numerator;

	u64 is less typing ;-)

> +	struct aperfmperf_sample *s = &get_cpu_var(samples);
> +
> +	/* Cache KHz for 100 ms */
> +	if (time_before(jiffies, s->jiffies + HZ/10))
> +		goto out;

This puts in a lower bound, but afaict there is no upper bound. Both
users appear to be userspace controlled.

That is; if userspace doesn't request a freq reading we can go without
reading this for a very long time.

> +
> +	rdmsrl(MSR_IA32_APERF, aperf);
> +	rdmsrl(MSR_IA32_MPERF, mperf);
> +
> +	aperf_delta = aperf - s->aperf;
> +	mperf_delta = mperf - s->mperf;

That means these delta's can be arbitrarily large, in fact the MSRs can
have wrapped however many times.

> +
> +	/*
> +	 * There is no architectural guarantee that MPERF
> +	 * increments faster than we can read it.
> +	 */
> +	if (mperf_delta == 0)
> +		goto out;
> +
> +	numerator = cpu_khz * aperf_delta;

And since delta can be any 64bit value as per the msr range, this
multiplication can overflow.

> +	s->khz = div64_u64(numerator, mperf_delta);
> +	s->jiffies = jiffies;
> +	s->aperf = aperf;
> +	s->mperf = mperf;
> +
> +out:
> +	put_cpu_var(samples);
> +}
> +
> +unsigned int aperfmperf_khz_on_cpu(int cpu)
> +{
> +	if (!cpu_khz)
> +		return 0;
> +
> +	if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
> +		return 0;

You could do the jiffy compare here; avoiding the IPI.

> +
> +	smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1);
> +
> +	return per_cpu(samples.khz, cpu);
> +}
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Peter Zijlstra April 1, 2016, 8:16 a.m. UTC | #3
On Fri, Apr 01, 2016 at 12:37:00AM -0400, Len Brown wrote:
> From: Len Brown <len.brown@intel.com>
> 
> For x86 processors with APERF/MPERF and TSC,
> return meaningful and consistent MHz in
> /proc/cpuinfo and
> /sys/devices/system/cpu/cpu*/cpufreq/scaling_cur_freq
> 
> MHz is computed like so:
> 
> MHz = base_MHz * delta_APERF / delta_MPERF
> 
> MHz is the average frequency of the busy processor
> over a measurement interval.  The interval is
> defined to be the time between successive reads
> of the frequency on that processor, whether from
> /proc/cpuinfo or from sysfs cpufreq/scaling_cur_freq.
> As with previous methods of calculating MHz,
> idle time is excluded.

Is this really a semantic you want to pin down?

Since we're looking at doing something like:

lkml.kernel.org/r/20160303162829.GB6375@twins.programming.kicks-ass.net

We could also just return cpu_khz * whatever fraction we store there,
knowing it is something recent.

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Stephane Gasparini April 1, 2016, 8:16 a.m. UTC | #4
—
Steph




> On Apr 1, 2016, at 10:03 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> On Fri, Apr 01, 2016 at 12:37:00AM -0400, Len Brown wrote:
>> diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
>> new file mode 100644
>> index 0000000..9380102
>> --- /dev/null
>> +++ b/arch/x86/kernel/cpu/aperfmperf.c
>> @@ -0,0 +1,76 @@
>> +/*
>> + * x86 APERF/MPERF KHz calculation
>> + * Used by /proc/cpuinfo and /sys/.../cpufreq/scaling_cur_freq
>> + *
>> + * Copyright (C) 2015 Intel Corp.
>> + * Author: Len Brown <len.brown@intel.com>
>> + *
>> + * This file is licensed under GPLv2.
>> + */
>> +
>> +#include <linux/jiffies.h>
>> +#include <linux/math64.h>
>> +#include <linux/percpu.h>
>> +#include <linux/smp.h>
>> +
>> +struct aperfmperf_sample {
>> +	unsigned int khz;
>> +	unsigned long jiffies;
>> +	unsigned long long aperf;
>> +	unsigned long long mperf;
>> +};
>> +
>> +static DEFINE_PER_CPU(struct aperfmperf_sample, samples);
>> +
>> +/*
>> + * aperfmperf_snapshot_khz()
>> + * On the current CPU, snapshot APERF, MPERF, and jiffies
>> + * unless we already did it within 100ms
>> + * calculate kHz, save snapshot
>> + */
>> +static void aperfmperf_snapshot_khz(void *dummy)
>> +{
>> +	unsigned long long aperf, aperf_delta;
>> +	unsigned long long mperf, mperf_delta;
>> +	unsigned long long numerator;
> 
> 	u64 is less typing ;-)
> 
>> +	struct aperfmperf_sample *s = &get_cpu_var(samples);
>> +
>> +	/* Cache KHz for 100 ms */
>> +	if (time_before(jiffies, s->jiffies + HZ/10))
>> +		goto out;
> 
> This puts in a lower bound, but afaict there is no upper bound. Both
> users appear to be userspace controlled.
> 
> That is; if userspace doesn't request a freq reading we can go without
> reading this for a very long time.
> 
>> +
>> +	rdmsrl(MSR_IA32_APERF, aperf);
>> +	rdmsrl(MSR_IA32_MPERF, mperf);
>> +
>> +	aperf_delta = aperf - s->aperf;
>> +	mperf_delta = mperf - s->mperf;
> 
> That means these delta's can be arbitrarily large, in fact the MSRs can
> have wrapped however many times.

64 bits is 18 446 744 073 709 551 615

so even assuming a 10 GHz frequency if my math are good this is more than
58 years before the MSR wrap around, assuming the device ran always at max
freq.



> 
>> +
>> +	/*
>> +	 * There is no architectural guarantee that MPERF
>> +	 * increments faster than we can read it.
>> +	 */
>> +	if (mperf_delta == 0)
>> +		goto out;
>> +
>> +	numerator = cpu_khz * aperf_delta;
> 
> And since delta can be any 64bit value as per the msr range, this
> multiplication can overflow.
> 
>> +	s->khz = div64_u64(numerator, mperf_delta);
>> +	s->jiffies = jiffies;
>> +	s->aperf = aperf;
>> +	s->mperf = mperf;
>> +
>> +out:
>> +	put_cpu_var(samples);
>> +}
>> +
>> +unsigned int aperfmperf_khz_on_cpu(int cpu)
>> +{
>> +	if (!cpu_khz)
>> +		return 0;
>> +
>> +	if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
>> +		return 0;
> 
> You could do the jiffy compare here; avoiding the IPI.
> 
>> +
>> +	smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1);
>> +
>> +	return per_cpu(samples.khz, cpu);
>> +}
> --
> To unsubscribe from this list: send the line "unsubscribe linux-pm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Peter Zijlstra April 1, 2016, 8:23 a.m. UTC | #5
Trim your emails

On Fri, Apr 01, 2016 at 10:16:42AM +0200, Stephane Gasparini wrote:

> > That means these delta's can be arbitrarily large, in fact the MSRs can
> > have wrapped however many times.
> 
> 64 bits is 18 446 744 073 709 551 615
> 
> so even assuming a 10 GHz frequency if my math are good this is more than
> 58 years before the MSR wrap around, assuming the device ran always at max
> freq.

fair enough.. but going with 10Ghz, cpu_khz would be 10e6 ~ 33 bits,
which effectively reduces the wrap/overflow time to just 31 bits, which
per that frequency is just ~1/4th of a second.


--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Peter Zijlstra April 1, 2016, 8:29 a.m. UTC | #6
On Fri, Apr 01, 2016 at 10:23:23AM +0200, Peter Zijlstra wrote:
> 
> Trim your emails
> 
> On Fri, Apr 01, 2016 at 10:16:42AM +0200, Stephane Gasparini wrote:
> 
> > > That means these delta's can be arbitrarily large, in fact the MSRs can
> > > have wrapped however many times.
> > 
> > 64 bits is 18 446 744 073 709 551 615
> > 
> > so even assuming a 10 GHz frequency if my math are good this is more than
> > 58 years before the MSR wrap around, assuming the device ran always at max
> > freq.
> 
> fair enough.. but going with 10Ghz, cpu_khz would be 10e6 ~ 33 bits,

I can't do maths this morning; 23 bits

> which effectively reduces the wrap/overflow time to just 31 bits, which
> per that frequency is just ~1/4th of a second.

41 giving lots more, but a reasonable time to wrap/overflow.
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Stephane Gasparini April 1, 2016, 9:30 a.m. UTC | #7
my comment was about your comment that MSR have wrapped however many times



> On Apr 1, 2016, at 10:03 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> That is; if userspace doesn't request a freq reading we can go without
> reading this for a very long time.
> 
>> +
>> +	rdmsrl(MSR_IA32_APERF, aperf);
>> +	rdmsrl(MSR_IA32_MPERF, mperf);
>> +
>> +	aperf_delta = aperf - s->aperf;
>> +	mperf_delta = mperf - s->mperf;
> 
> That means these delta's can be arbitrarily large, in fact the MSRs can
> have wrapped however many times.

The MSRs will not wrap that often.

—
Steph




> On Apr 1, 2016, at 10:29 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
> On Fri, Apr 01, 2016 at 10:23:23AM +0200, Peter Zijlstra wrote:
>> 
>> Trim your emails
>> 
>> On Fri, Apr 01, 2016 at 10:16:42AM +0200, Stephane Gasparini wrote:
>> 
>>>> That means these delta's can be arbitrarily large, in fact the MSRs can
>>>> have wrapped however many times.
>>> 
>>> 64 bits is 18 446 744 073 709 551 615
>>> 
>>> so even assuming a 10 GHz frequency if my math are good this is more than
>>> 58 years before the MSR wrap around, assuming the device ran always at max
>>> freq.
>> 
>> fair enough.. but going with 10Ghz, cpu_khz would be 10e6 ~ 33 bits,
> 
> I can't do maths this morning; 23 bits
> 
>> which effectively reduces the wrap/overflow time to just 31 bits, which
>> per that frequency is just ~1/4th of a second.
> 
> 41 giving lots more, but a reasonable time to wrap/overflow.
> --
> To unsubscribe from this list: send the line "unsubscribe linux-pm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Peter Zijlstra April 1, 2016, 9:38 a.m. UTC | #8
On Fri, Apr 01, 2016 at 11:30:48AM +0200, Stephane Gasparini wrote:
> my comment was about your comment that MSR have wrapped however many times
> 

Yes, and don't top post.
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Borislav Petkov April 1, 2016, 9:50 a.m. UTC | #9
On Fri, Apr 01, 2016 at 11:30:48AM +0200, Stephane Gasparini wrote:
> The MSRs will not wrap that often.

Unless some yahoo goes and does WRMSR APERF <big_value_close_to_wrap_around>.

I think we should handle that gracefully too, regardless of how "smart"
that move might be.
Len Brown April 2, 2016, 5:22 a.m. UTC | #10
Thanks for the comments.

Re: is this a useful semantic?

Yes, average MHz over an interval is significantly more useful than
a snapshot of the recent instantaneous frequency.
It is possible to convert the former into the latter,
but it is not possible to reliably and efficiently convert the latter
into the former.

Indeed, we stopped using MSR_PERF_STATUS for this very reason --
a snapshot of instantaneous frequency can be very misleading.

Further, the mechanism in this patch will still work even when Linux
has no concept of frequency control,
including firmware control and CONFIG_CPU_FREQ=n

Of course, when there is 1 reader, this mechanism works the best --
as they get to select whatever interval they like.
For multi-user, the interval would shorten -- possibly
degrading to the 100ms limit set here.  My reasoning on the
100ms limit is that anything more frequent is abuse,
and the users should be using user-space tools like turbostat in that case.

Re: 64-bit math.

Stephane is correct, APERF and  MPERF will not overflow in the uptime
of the machine.
They are both 64-bit registers, and they tick at TSC rate or slower.
(Indeed, they tick at 0 when idle)

Boris is right, this works as long as somebody doesn't scribble on these MSRs.
Linux used to do that in 2.6.23, but we learned our lesson and we leave them
free running since then.  I'm not going to worry about a yahoo
scribbling on MSRs
behind the kernel's back.  More than this will break if that happens.

Peter is right, in the expression "numerator = cpu_khz * aperf_delta",
the capacity of the 64-bit numerator is reduced as cpu_khz
and aperf_delta grow.

For example, if this patch runs on a busy system having a 4GHz CPU,
then APERF ticks at 2^32 Hz.
cpu_khz = 2^22
so max aperf_delta without overflow is 2^64/2^22 = 2^42 cycles

2^42 cycles / 2^32 cycles/sec = 2^10 sec = 1024 seconds = 17 minutes.

Though we could improve this range by 1024x by simply operating on
cpu_mhz instead of cpu_khz, yielding 12 days.

Or we could simply detect potential overflow:

2^64 < cpu_khz * delta_aperf
so
if (2^64/cpu_khz < delta_aperf) then overflow

and since delta_aperf and delta_mperf are much larger than cpu_khz
in this case, we can calculate this way:

khz = cpu_khz * (delta_aperf) / (delta_mperf)
khz = cpu_khz * (delta_aperf/cpu_khz) / (delta_mperf/cpu_khz)
khz = delta_aperf / (delta_mperf/cpu_khz)

no calculation here can overflow 64-bits in the uptime of the machine.

I'll send an updated patch.

thanks,
-Len
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Pavel Machek April 24, 2016, 4:38 p.m. UTC | #11
Hi!
> +++ b/arch/x86/kernel/cpu/aperfmperf.c
> @@ -0,0 +1,76 @@
> +/*
> + * x86 APERF/MPERF KHz calculation
> + * Used by /proc/cpuinfo and /sys/.../cpufreq/scaling_cur_freq

Could we use some shorter filename here? cpu_mhz.c? mhz.c?

> +/*
> + * aperfmperf_snapshot_khz()
diff mbox

Patch

diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 4a8697f..821e31a 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -20,6 +20,7 @@  obj-y			:= intel_cacheinfo.o scattered.o topology.o
 obj-y			+= common.o
 obj-y			+= rdrand.o
 obj-y			+= match.o
+obj-y			+= aperfmperf.o
 
 obj-$(CONFIG_PROC_FS)	+= proc.o
 obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
new file mode 100644
index 0000000..9380102
--- /dev/null
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -0,0 +1,76 @@ 
+/*
+ * x86 APERF/MPERF KHz calculation
+ * Used by /proc/cpuinfo and /sys/.../cpufreq/scaling_cur_freq
+ *
+ * Copyright (C) 2015 Intel Corp.
+ * Author: Len Brown <len.brown@intel.com>
+ *
+ * This file is licensed under GPLv2.
+ */
+
+#include <linux/jiffies.h>
+#include <linux/math64.h>
+#include <linux/percpu.h>
+#include <linux/smp.h>
+
+struct aperfmperf_sample {
+	unsigned int khz;
+	unsigned long jiffies;
+	unsigned long long aperf;
+	unsigned long long mperf;
+};
+
+static DEFINE_PER_CPU(struct aperfmperf_sample, samples);
+
+/*
+ * aperfmperf_snapshot_khz()
+ * On the current CPU, snapshot APERF, MPERF, and jiffies
+ * unless we already did it within 100ms
+ * calculate kHz, save snapshot
+ */
+static void aperfmperf_snapshot_khz(void *dummy)
+{
+	unsigned long long aperf, aperf_delta;
+	unsigned long long mperf, mperf_delta;
+	unsigned long long numerator;
+	struct aperfmperf_sample *s = &get_cpu_var(samples);
+
+	/* Cache KHz for 100 ms */
+	if (time_before(jiffies, s->jiffies + HZ/10))
+		goto out;
+
+	rdmsrl(MSR_IA32_APERF, aperf);
+	rdmsrl(MSR_IA32_MPERF, mperf);
+
+	aperf_delta = aperf - s->aperf;
+	mperf_delta = mperf - s->mperf;
+
+	/*
+	 * There is no architectural guarantee that MPERF
+	 * increments faster than we can read it.
+	 */
+	if (mperf_delta == 0)
+		goto out;
+
+	numerator = cpu_khz * aperf_delta;
+	s->khz = div64_u64(numerator, mperf_delta);
+	s->jiffies = jiffies;
+	s->aperf = aperf;
+	s->mperf = mperf;
+
+out:
+	put_cpu_var(samples);
+}
+
+unsigned int aperfmperf_khz_on_cpu(int cpu)
+{
+	if (!cpu_khz)
+		return 0;
+
+	if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
+		return 0;
+
+	smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1);
+
+	return per_cpu(samples.khz, cpu);
+}
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 18ca99f..44507c0 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -78,9 +78,11 @@  static int show_cpuinfo(struct seq_file *m, void *v)
 		seq_printf(m, "microcode\t: 0x%x\n", c->microcode);
 
 	if (cpu_has(c, X86_FEATURE_TSC)) {
-		unsigned int freq = cpufreq_quick_get(cpu);
+		unsigned int freq = aperfmperf_khz_on_cpu(cpu);
 
 		if (!freq)
+			freq = cpufreq_quick_get(cpu);
+		if (!freq)
 			freq = cpu_khz;
 		seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
 			   freq / 1000, (freq % 1000));
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index b87596b..7fcd090 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -541,8 +541,13 @@  show_one(scaling_max_freq, max);
 static ssize_t show_scaling_cur_freq(struct cpufreq_policy *policy, char *buf)
 {
 	ssize_t ret;
+	unsigned int freq;
 
-	if (cpufreq_driver && cpufreq_driver->setpolicy && cpufreq_driver->get)
+	freq = arch_freq_get_on_cpu(policy->cpu);
+	if (freq)
+		ret = sprintf(buf, "%u\n", freq);
+	else if (cpufreq_driver && cpufreq_driver->setpolicy &&
+			cpufreq_driver->get)
 		ret = sprintf(buf, "%u\n", cpufreq_driver->get(policy->cpu));
 	else
 		ret = sprintf(buf, "%u\n", policy->cur);
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 718e872..a9b8ec6 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -566,6 +566,19 @@  static inline bool policy_has_boost_freq(struct cpufreq_policy *policy)
 /* the following funtion is for cpufreq core use only */
 struct cpufreq_frequency_table *cpufreq_frequency_get_table(unsigned int cpu);
 
+#ifdef CONFIG_X86
+extern unsigned int aperfmperf_khz_on_cpu(int cpu);
+static inline unsigned int arch_freq_get_on_cpu(int cpu)
+{
+	return aperfmperf_khz_on_cpu(cpu);
+}
+#else
+static inline unsigned int arch_freq_get_on_cpu(int cpu)
+{
+	return 0;
+}
+#endif
+
 /* the following are really really optional */
 extern struct freq_attr cpufreq_freq_attr_scaling_available_freqs;
 extern struct freq_attr cpufreq_freq_attr_scaling_boost_freqs;