
[XEN,v2,1/2] x86/cpufreq: move ACPI cpufreq driver into separate file

Message ID d8a13eb8c53d8cde99d7fa1d8e4fce2a597f02fd.1719832871.git.Sergiy_Kibrik@epam.com
State New
Series x86: separate powernow/hwp/acpi cpufreq code

Commit Message

Sergiy Kibrik July 1, 2024, 12:03 p.m. UTC
Separate the ACPI cpufreq driver from the generic cpufreq initialization
code. This way acpi-cpufreq can become optional in the future and be
disabled in non-Intel builds.

Other than adding the acpi_register_driver() helper and cleaning up the
list of included headers, no changes to the code were introduced.

Signed-off-by: Sergiy Kibrik <Sergiy_Kibrik@epam.com>
---
 xen/arch/x86/acpi/cpufreq/Makefile  |   1 +
 xen/arch/x86/acpi/cpufreq/acpi.c    | 622 ++++++++++++++++++++++++++++
 xen/arch/x86/acpi/cpufreq/cpufreq.c | 592 +-------------------------
 xen/include/acpi/cpufreq/cpufreq.h  |   1 +
 4 files changed, 625 insertions(+), 591 deletions(-)
 create mode 100644 xen/arch/x86/acpi/cpufreq/acpi.c
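
As context for the "optional in the future" goal above, a minimal sketch of how the now-separate driver could later be compiled out. The CONFIG_ACPI_CPUFREQ symbol and the stub's return value are hypothetical illustrations, not part of this patch:

    # xen/arch/x86/acpi/cpufreq/Makefile (hypothetical follow-up)
    obj-$(CONFIG_ACPI_CPUFREQ) += acpi.o

    /* xen/include/acpi/cpufreq/cpufreq.h (hypothetical follow-up) */
    #ifdef CONFIG_ACPI_CPUFREQ
    int acpi_register_driver(void);
    #else
    static inline int acpi_register_driver(void)
    {
        /* Driver compiled out; report failure to the caller. */
        return -EOPNOTSUPP;
    }
    #endif

With such a stub, the CPUFREQ_xen case in cpufreq_driver_init() would fail gracefully on builds that disable the driver.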

Patch

diff --git a/xen/arch/x86/acpi/cpufreq/Makefile b/xen/arch/x86/acpi/cpufreq/Makefile
index db83aa6b14..44d4c0b497 100644
--- a/xen/arch/x86/acpi/cpufreq/Makefile
+++ b/xen/arch/x86/acpi/cpufreq/Makefile
@@ -1,3 +1,4 @@ 
+obj-y += acpi.o
 obj-y += cpufreq.o
 obj-y += hwp.o
 obj-y += powernow.o
diff --git a/xen/arch/x86/acpi/cpufreq/acpi.c b/xen/arch/x86/acpi/cpufreq/acpi.c
new file mode 100644
index 0000000000..bf4e964377
--- /dev/null
+++ b/xen/arch/x86/acpi/cpufreq/acpi.c
@@ -0,0 +1,622 @@ 
+/*
+ *  acpi.c - ACPI Processor P-States Driver ($Revision: 1.4 $)
+ *
+ *  Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com>
+ *  Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
+ *  Copyright (C) 2002 - 2004 Dominik Brodowski <linux@brodo.de>
+ *  Copyright (C) 2006        Denis Sadykov <denis.m.sadykov@intel.com>
+ *
+ *  Feb 2008 - Liu Jinsong <jinsong.liu@intel.com>
+ *      porting acpi-cpufreq.c from Linux 2.6.23 to Xen hypervisor
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or (at
+ *  your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; If not, see <http://www.gnu.org/licenses/>.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+
+#include <xen/types.h>
+#include <xen/errno.h>
+#include <xen/delay.h>
+#include <xen/param.h>
+#include <acpi/acpi.h>
+#include <acpi/cpufreq/cpufreq.h>
+
+
+enum {
+    UNDEFINED_CAPABLE = 0,
+    SYSTEM_INTEL_MSR_CAPABLE,
+    SYSTEM_IO_CAPABLE,
+};
+
+#define INTEL_MSR_RANGE         (0xffffull)
+
+static bool __read_mostly acpi_pstate_strict;
+boolean_param("acpi_pstate_strict", acpi_pstate_strict);
+
+static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data)
+{
+    struct processor_performance *perf;
+    int i;
+
+    perf = data->acpi_data;
+
+    for (i=0; i<perf->state_count; i++) {
+        if (value == perf->states[i].status)
+            return data->freq_table[i].frequency;
+    }
+    return 0;
+}
+
+static unsigned extract_msr(u32 msr, struct acpi_cpufreq_data *data)
+{
+    int i;
+    struct processor_performance *perf;
+
+    msr &= INTEL_MSR_RANGE;
+    perf = data->acpi_data;
+
+    for (i=0; data->freq_table[i].frequency != CPUFREQ_TABLE_END; i++) {
+        if (msr == perf->states[data->freq_table[i].index].status)
+            return data->freq_table[i].frequency;
+    }
+    return data->freq_table[0].frequency;
+}
+
+static unsigned extract_freq(u32 val, struct acpi_cpufreq_data *data)
+{
+    switch (data->arch_cpu_flags) {
+    case SYSTEM_INTEL_MSR_CAPABLE:
+        return extract_msr(val, data);
+    case SYSTEM_IO_CAPABLE:
+        return extract_io(val, data);
+    default:
+        return 0;
+    }
+}
+
+struct msr_addr {
+    u32 reg;
+};
+
+struct io_addr {
+    u16 port;
+    u8 bit_width;
+};
+
+typedef union {
+    struct msr_addr msr;
+    struct io_addr io;
+} drv_addr_union;
+
+struct drv_cmd {
+    unsigned int type;
+    const cpumask_t *mask;
+    drv_addr_union addr;
+    u32 val;
+};
+
+static void cf_check do_drv_read(void *drvcmd)
+{
+    struct drv_cmd *cmd;
+
+    cmd = (struct drv_cmd *)drvcmd;
+
+    switch (cmd->type) {
+    case SYSTEM_INTEL_MSR_CAPABLE:
+        rdmsrl(cmd->addr.msr.reg, cmd->val);
+        break;
+    case SYSTEM_IO_CAPABLE:
+        acpi_os_read_port((acpi_io_address)cmd->addr.io.port,
+            &cmd->val, (u32)cmd->addr.io.bit_width);
+        break;
+    default:
+        break;
+    }
+}
+
+static void cf_check do_drv_write(void *drvcmd)
+{
+    struct drv_cmd *cmd;
+    uint64_t msr_content;
+
+    cmd = (struct drv_cmd *)drvcmd;
+
+    switch (cmd->type) {
+    case SYSTEM_INTEL_MSR_CAPABLE:
+        rdmsrl(cmd->addr.msr.reg, msr_content);
+        msr_content = (msr_content & ~INTEL_MSR_RANGE)
+            | (cmd->val & INTEL_MSR_RANGE);
+        wrmsrl(cmd->addr.msr.reg, msr_content);
+        break;
+    case SYSTEM_IO_CAPABLE:
+        acpi_os_write_port((acpi_io_address)cmd->addr.io.port,
+            cmd->val, (u32)cmd->addr.io.bit_width);
+        break;
+    default:
+        break;
+    }
+}
+
+static void drv_read(struct drv_cmd *cmd)
+{
+    cmd->val = 0;
+
+    ASSERT(cpumask_weight(cmd->mask) == 1);
+
+    /* to reduce IPI for the sake of performance */
+    if (likely(cpumask_test_cpu(smp_processor_id(), cmd->mask)))
+        do_drv_read((void *)cmd);
+    else
+        on_selected_cpus(cmd->mask, do_drv_read, cmd, 1);
+}
+
+static void drv_write(struct drv_cmd *cmd)
+{
+    if (cpumask_equal(cmd->mask, cpumask_of(smp_processor_id())))
+        do_drv_write((void *)cmd);
+    else
+        on_selected_cpus(cmd->mask, do_drv_write, cmd, 1);
+}
+
+static u32 get_cur_val(const cpumask_t *mask)
+{
+    struct cpufreq_policy *policy;
+    struct processor_performance *perf;
+    struct drv_cmd cmd;
+    unsigned int cpu = smp_processor_id();
+
+    if (unlikely(cpumask_empty(mask)))
+        return 0;
+
+    if (!cpumask_test_cpu(cpu, mask))
+        cpu = cpumask_first(mask);
+    if (cpu >= nr_cpu_ids || !cpu_online(cpu))
+        return 0;
+
+    policy = per_cpu(cpufreq_cpu_policy, cpu);
+    if (!policy || !cpufreq_drv_data[policy->cpu])
+        return 0;
+
+    switch (cpufreq_drv_data[policy->cpu]->arch_cpu_flags) {
+    case SYSTEM_INTEL_MSR_CAPABLE:
+        cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
+        cmd.addr.msr.reg = MSR_IA32_PERF_STATUS;
+        break;
+    case SYSTEM_IO_CAPABLE:
+        cmd.type = SYSTEM_IO_CAPABLE;
+        perf = cpufreq_drv_data[policy->cpu]->acpi_data;
+        cmd.addr.io.port = perf->control_register.address;
+        cmd.addr.io.bit_width = perf->control_register.bit_width;
+        break;
+    default:
+        return 0;
+    }
+
+    cmd.mask = cpumask_of(cpu);
+
+    drv_read(&cmd);
+    return cmd.val;
+}
+
+struct perf_pair {
+    union {
+        struct {
+            uint32_t lo;
+            uint32_t hi;
+        } split;
+        uint64_t whole;
+    } aperf, mperf;
+};
+static DEFINE_PER_CPU(struct perf_pair, gov_perf_pair);
+static DEFINE_PER_CPU(struct perf_pair, usr_perf_pair);
+
+static void cf_check read_measured_perf_ctrs(void *_readin)
+{
+    struct perf_pair *readin = _readin;
+
+    rdmsrl(MSR_IA32_APERF, readin->aperf.whole);
+    rdmsrl(MSR_IA32_MPERF, readin->mperf.whole);
+}
+
+/*
+ * Return the measured active (C0) frequency on this CPU since last call
+ * to this function.
+ * Input: cpu number
+ * Return: Average CPU frequency in terms of max frequency (zero on error)
+ *
+ * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
+ * over a period of time, while CPU is in C0 state.
+ * IA32_MPERF counts at the rate of max advertised frequency
+ * IA32_APERF counts at the rate of actual CPU frequency
+ * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
+ * no meaning should be associated with absolute values of these MSRs.
+ */
+unsigned int get_measured_perf(unsigned int cpu, unsigned int flag)
+{
+    struct cpufreq_policy *policy;
+    struct perf_pair readin, cur, *saved;
+    unsigned int perf_percent;
+
+    if (!cpu_online(cpu))
+        return 0;
+
+    policy = per_cpu(cpufreq_cpu_policy, cpu);
+    if ( !policy || !cpu_has_aperfmperf )
+        return 0;
+
+    switch (flag)
+    {
+    case GOV_GETAVG:
+    {
+        saved = &per_cpu(gov_perf_pair, cpu);
+        break;
+    }
+    case USR_GETAVG:
+    {
+        saved = &per_cpu(usr_perf_pair, cpu);
+        break;
+    }
+    default:
+        return 0;
+    }
+
+    if (cpu == smp_processor_id()) {
+        read_measured_perf_ctrs((void *)&readin);
+    } else {
+        on_selected_cpus(cpumask_of(cpu), read_measured_perf_ctrs,
+                        &readin, 1);
+    }
+
+    cur.aperf.whole = readin.aperf.whole - saved->aperf.whole;
+    cur.mperf.whole = readin.mperf.whole - saved->mperf.whole;
+    saved->aperf.whole = readin.aperf.whole;
+    saved->mperf.whole = readin.mperf.whole;
+
+    if (unlikely(((unsigned long)(-1) / 100) < cur.aperf.whole)) {
+        int shift_count = 7;
+        cur.aperf.whole >>= shift_count;
+        cur.mperf.whole >>= shift_count;
+    }
+
+    if (cur.aperf.whole && cur.mperf.whole)
+        perf_percent = (cur.aperf.whole * 100) / cur.mperf.whole;
+    else
+        perf_percent = 0;
+
+    return policy->cpuinfo.perf_freq * perf_percent / 100;
+}
+
+static unsigned int cf_check get_cur_freq_on_cpu(unsigned int cpu)
+{
+    struct cpufreq_policy *policy;
+    struct acpi_cpufreq_data *data;
+
+    if (!cpu_online(cpu))
+        return 0;
+
+    policy = per_cpu(cpufreq_cpu_policy, cpu);
+    if (!policy)
+        return 0;
+
+    data = cpufreq_drv_data[policy->cpu];
+    if (unlikely(data == NULL ||
+        data->acpi_data == NULL || data->freq_table == NULL))
+        return 0;
+
+    return extract_freq(get_cur_val(cpumask_of(cpu)), data);
+}
+
+void intel_feature_detect(struct cpufreq_policy *policy)
+{
+    unsigned int eax;
+
+    eax = cpuid_eax(6);
+    if (eax & 0x2) {
+        policy->turbo = CPUFREQ_TURBO_ENABLED;
+        if (cpufreq_verbose)
+            printk(XENLOG_INFO "CPU%u: Turbo Mode detected and enabled\n",
+                   smp_processor_id());
+    }
+}
+
+static void cf_check feature_detect(void *info)
+{
+    intel_feature_detect(info);
+}
+
+static unsigned int check_freqs(const cpumask_t *mask, unsigned int freq,
+                                struct acpi_cpufreq_data *data)
+{
+    unsigned int cur_freq;
+    unsigned int i;
+
+    for (i=0; i<100; i++) {
+        cur_freq = extract_freq(get_cur_val(mask), data);
+        if (cur_freq == freq)
+            return 1;
+        udelay(10);
+    }
+    return 0;
+}
+
+static int cf_check acpi_cpufreq_target(
+    struct cpufreq_policy *policy,
+    unsigned int target_freq, unsigned int relation)
+{
+    struct acpi_cpufreq_data *data = cpufreq_drv_data[policy->cpu];
+    struct processor_performance *perf;
+    struct cpufreq_freqs freqs;
+    cpumask_t online_policy_cpus;
+    struct drv_cmd cmd;
+    unsigned int next_state = 0; /* Index into freq_table */
+    unsigned int next_perf_state = 0; /* Index into perf table */
+    unsigned int j;
+    int result = 0;
+
+    if (unlikely(data == NULL ||
+        data->acpi_data == NULL || data->freq_table == NULL)) {
+        return -ENODEV;
+    }
+
+    if (policy->turbo == CPUFREQ_TURBO_DISABLED)
+        if (target_freq > policy->cpuinfo.second_max_freq)
+            target_freq = policy->cpuinfo.second_max_freq;
+
+    perf = data->acpi_data;
+    result = cpufreq_frequency_table_target(policy,
+                                            data->freq_table,
+                                            target_freq,
+                                            relation, &next_state);
+    if (unlikely(result))
+        return -ENODEV;
+
+    cpumask_and(&online_policy_cpus, &cpu_online_map, policy->cpus);
+
+    next_perf_state = data->freq_table[next_state].index;
+    if (perf->state == next_perf_state) {
+        if (unlikely(policy->resume))
+            policy->resume = 0;
+        else
+            return 0;
+    }
+
+    switch (data->arch_cpu_flags) {
+    case SYSTEM_INTEL_MSR_CAPABLE:
+        cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
+        cmd.addr.msr.reg = MSR_IA32_PERF_CTL;
+        cmd.val = (u32) perf->states[next_perf_state].control;
+        break;
+    case SYSTEM_IO_CAPABLE:
+        cmd.type = SYSTEM_IO_CAPABLE;
+        cmd.addr.io.port = perf->control_register.address;
+        cmd.addr.io.bit_width = perf->control_register.bit_width;
+        cmd.val = (u32) perf->states[next_perf_state].control;
+        break;
+    default:
+        return -ENODEV;
+    }
+
+    if (policy->shared_type != CPUFREQ_SHARED_TYPE_ANY)
+        cmd.mask = &online_policy_cpus;
+    else
+        cmd.mask = cpumask_of(policy->cpu);
+
+    freqs.old = perf->states[perf->state].core_frequency * 1000;
+    freqs.new = data->freq_table[next_state].frequency;
+
+    drv_write(&cmd);
+
+    if (acpi_pstate_strict && !check_freqs(cmd.mask, freqs.new, data)) {
+        printk(KERN_WARNING "Fail transfer to new freq %d\n", freqs.new);
+        return -EAGAIN;
+    }
+
+    for_each_cpu(j, &online_policy_cpus)
+        cpufreq_statistic_update(j, perf->state, next_perf_state);
+
+    perf->state = next_perf_state;
+    policy->cur = freqs.new;
+
+    return result;
+}
+
+static int cf_check acpi_cpufreq_verify(struct cpufreq_policy *policy)
+{
+    struct acpi_cpufreq_data *data;
+    struct processor_performance *perf;
+
+    if (!policy || !(data = cpufreq_drv_data[policy->cpu]) ||
+        !processor_pminfo[policy->cpu])
+        return -EINVAL;
+
+    perf = &processor_pminfo[policy->cpu]->perf;
+
+    cpufreq_verify_within_limits(policy, 0,
+        perf->states[perf->platform_limit].core_frequency * 1000);
+
+    return cpufreq_frequency_table_verify(policy, data->freq_table);
+}
+
+static unsigned long
+acpi_cpufreq_guess_freq(struct acpi_cpufreq_data *data, unsigned int cpu)
+{
+    struct processor_performance *perf = data->acpi_data;
+
+    if (cpu_khz) {
+        /* search the closest match to cpu_khz */
+        unsigned int i;
+        unsigned long freq;
+        unsigned long freqn = perf->states[0].core_frequency * 1000;
+
+        for (i=0; i<(perf->state_count-1); i++) {
+            freq = freqn;
+            freqn = perf->states[i+1].core_frequency * 1000;
+            if ((2 * cpu_khz) > (freqn + freq)) {
+                perf->state = i;
+                return freq;
+            }
+        }
+        perf->state = perf->state_count-1;
+        return freqn;
+    } else {
+        /* assume CPU is at P0... */
+        perf->state = 0;
+        return perf->states[0].core_frequency * 1000;
+    }
+}
+
+static int cf_check acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
+{
+    unsigned int i;
+    unsigned int valid_states = 0;
+    unsigned int cpu = policy->cpu;
+    struct acpi_cpufreq_data *data;
+    unsigned int result = 0;
+    struct cpuinfo_x86 *c = &cpu_data[policy->cpu];
+    struct processor_performance *perf;
+
+    data = xzalloc(struct acpi_cpufreq_data);
+    if (!data)
+        return -ENOMEM;
+
+    cpufreq_drv_data[cpu] = data;
+
+    data->acpi_data = &processor_pminfo[cpu]->perf;
+
+    perf = data->acpi_data;
+    policy->shared_type = perf->shared_type;
+
+    switch (perf->control_register.space_id) {
+    case ACPI_ADR_SPACE_SYSTEM_IO:
+        if (cpufreq_verbose)
+            printk("xen_pminfo: @acpi_cpufreq_cpu_init,"
+                   "SYSTEM IO addr space\n");
+        data->arch_cpu_flags = SYSTEM_IO_CAPABLE;
+        break;
+    case ACPI_ADR_SPACE_FIXED_HARDWARE:
+        if (cpufreq_verbose)
+            printk("xen_pminfo: @acpi_cpufreq_cpu_init,"
+                   "HARDWARE addr space\n");
+        if (!cpu_has(c, X86_FEATURE_EIST)) {
+            result = -ENODEV;
+            goto err_unreg;
+        }
+        data->arch_cpu_flags = SYSTEM_INTEL_MSR_CAPABLE;
+        break;
+    default:
+        result = -ENODEV;
+        goto err_unreg;
+    }
+
+    data->freq_table = xmalloc_array(struct cpufreq_frequency_table,
+                                    (perf->state_count+1));
+    if (!data->freq_table) {
+        result = -ENOMEM;
+        goto err_unreg;
+    }
+
+    /* detect transition latency */
+    policy->cpuinfo.transition_latency = 0;
+    for (i=0; i<perf->state_count; i++) {
+        if ((perf->states[i].transition_latency * 1000) >
+            policy->cpuinfo.transition_latency)
+            policy->cpuinfo.transition_latency =
+                perf->states[i].transition_latency * 1000;
+    }
+
+    policy->governor = cpufreq_opt_governor ? : CPUFREQ_DEFAULT_GOVERNOR;
+
+    /* table init */
+    for (i=0; i<perf->state_count; i++) {
+        if (i>0 && perf->states[i].core_frequency >=
+            data->freq_table[valid_states-1].frequency / 1000)
+            continue;
+
+        data->freq_table[valid_states].index = i;
+        data->freq_table[valid_states].frequency =
+            perf->states[i].core_frequency * 1000;
+        valid_states++;
+    }
+    data->freq_table[valid_states].frequency = CPUFREQ_TABLE_END;
+    perf->state = 0;
+
+    result = cpufreq_frequency_table_cpuinfo(policy, data->freq_table);
+    if (result)
+        goto err_freqfree;
+
+    switch (perf->control_register.space_id) {
+    case ACPI_ADR_SPACE_SYSTEM_IO:
+        /* Current speed is unknown and not detectable by IO port */
+        policy->cur = acpi_cpufreq_guess_freq(data, policy->cpu);
+        break;
+    case ACPI_ADR_SPACE_FIXED_HARDWARE:
+        cpufreq_driver.get = get_cur_freq_on_cpu;
+        policy->cur = get_cur_freq_on_cpu(cpu);
+        break;
+    default:
+        break;
+    }
+
+    /* Check for APERF/MPERF support in hardware
+     * also check for boost support */
+    if (c->x86_vendor == X86_VENDOR_INTEL && c->cpuid_level >= 6)
+        on_selected_cpus(cpumask_of(cpu), feature_detect, policy, 1);
+
+    /*
+     * the first call to ->target() should result in us actually
+     * writing something to the appropriate registers.
+     */
+    policy->resume = 1;
+
+    return result;
+
+err_freqfree:
+    xfree(data->freq_table);
+err_unreg:
+    xfree(data);
+    cpufreq_drv_data[cpu] = NULL;
+
+    return result;
+}
+
+static int cf_check acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy)
+{
+    struct acpi_cpufreq_data *data = cpufreq_drv_data[policy->cpu];
+
+    if (data) {
+        cpufreq_drv_data[policy->cpu] = NULL;
+        xfree(data->freq_table);
+        xfree(data);
+    }
+
+    return 0;
+}
+
+static const struct cpufreq_driver __initconst_cf_clobber
+acpi_cpufreq_driver = {
+    .name   = "acpi-cpufreq",
+    .verify = acpi_cpufreq_verify,
+    .target = acpi_cpufreq_target,
+    .init   = acpi_cpufreq_cpu_init,
+    .exit   = acpi_cpufreq_cpu_exit,
+    .get    = get_cur_freq_on_cpu,
+};
+
+
+int __init acpi_register_driver(void)
+{
+    return cpufreq_register_driver(&acpi_cpufreq_driver);
+}
diff --git a/xen/arch/x86/acpi/cpufreq/cpufreq.c b/xen/arch/x86/acpi/cpufreq/cpufreq.c
index a341f2f020..c1a842e959 100644
--- a/xen/arch/x86/acpi/cpufreq/cpufreq.c
+++ b/xen/arch/x86/acpi/cpufreq/cpufreq.c
@@ -26,604 +26,14 @@ 
  *
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  */
-
 #include <xen/types.h>
 #include <xen/errno.h>
-#include <xen/delay.h>
-#include <xen/cpumask.h>
 #include <xen/param.h>
 #include <xen/sched.h>
-#include <xen/timer.h>
-#include <xen/xmalloc.h>
-#include <asm/msr.h>
-#include <asm/io.h>
-#include <asm/processor.h>
-#include <asm/cpufeature.h>
-#include <acpi/acpi.h>
 #include <acpi/cpufreq/cpufreq.h>
 
-enum {
-    UNDEFINED_CAPABLE = 0,
-    SYSTEM_INTEL_MSR_CAPABLE,
-    SYSTEM_IO_CAPABLE,
-};
-
-#define INTEL_MSR_RANGE         (0xffffull)
-
 struct acpi_cpufreq_data *cpufreq_drv_data[NR_CPUS];
 
-static bool __read_mostly acpi_pstate_strict;
-boolean_param("acpi_pstate_strict", acpi_pstate_strict);
-
-static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data)
-{
-    struct processor_performance *perf;
-    int i;
-
-    perf = data->acpi_data;
-
-    for (i=0; i<perf->state_count; i++) {
-        if (value == perf->states[i].status)
-            return data->freq_table[i].frequency;
-    }
-    return 0;
-}
-
-static unsigned extract_msr(u32 msr, struct acpi_cpufreq_data *data)
-{
-    int i;
-    struct processor_performance *perf;
-
-    msr &= INTEL_MSR_RANGE;
-    perf = data->acpi_data;
-
-    for (i=0; data->freq_table[i].frequency != CPUFREQ_TABLE_END; i++) {
-        if (msr == perf->states[data->freq_table[i].index].status)
-            return data->freq_table[i].frequency;
-    }
-    return data->freq_table[0].frequency;
-}
-
-static unsigned extract_freq(u32 val, struct acpi_cpufreq_data *data)
-{
-    switch (data->arch_cpu_flags) {
-    case SYSTEM_INTEL_MSR_CAPABLE:
-        return extract_msr(val, data);
-    case SYSTEM_IO_CAPABLE:
-        return extract_io(val, data);
-    default:
-        return 0;
-    }
-}
-
-struct msr_addr {
-    u32 reg;
-};
-
-struct io_addr {
-    u16 port;
-    u8 bit_width;
-};
-
-typedef union {
-    struct msr_addr msr;
-    struct io_addr io;
-} drv_addr_union;
-
-struct drv_cmd {
-    unsigned int type;
-    const cpumask_t *mask;
-    drv_addr_union addr;
-    u32 val;
-};
-
-static void cf_check do_drv_read(void *drvcmd)
-{
-    struct drv_cmd *cmd;
-
-    cmd = (struct drv_cmd *)drvcmd;
-
-    switch (cmd->type) {
-    case SYSTEM_INTEL_MSR_CAPABLE:
-        rdmsrl(cmd->addr.msr.reg, cmd->val);
-        break;
-    case SYSTEM_IO_CAPABLE:
-        acpi_os_read_port((acpi_io_address)cmd->addr.io.port,
-            &cmd->val, (u32)cmd->addr.io.bit_width);
-        break;
-    default:
-        break;
-    }
-}
-
-static void cf_check do_drv_write(void *drvcmd)
-{
-    struct drv_cmd *cmd;
-    uint64_t msr_content;
-
-    cmd = (struct drv_cmd *)drvcmd;
-
-    switch (cmd->type) {
-    case SYSTEM_INTEL_MSR_CAPABLE:
-        rdmsrl(cmd->addr.msr.reg, msr_content);
-        msr_content = (msr_content & ~INTEL_MSR_RANGE)
-            | (cmd->val & INTEL_MSR_RANGE);
-        wrmsrl(cmd->addr.msr.reg, msr_content);
-        break;
-    case SYSTEM_IO_CAPABLE:
-        acpi_os_write_port((acpi_io_address)cmd->addr.io.port,
-            cmd->val, (u32)cmd->addr.io.bit_width);
-        break;
-    default:
-        break;
-    }
-}
-
-static void drv_read(struct drv_cmd *cmd)
-{
-    cmd->val = 0;
-
-    ASSERT(cpumask_weight(cmd->mask) == 1);
-
-    /* to reduce IPI for the sake of performance */
-    if (likely(cpumask_test_cpu(smp_processor_id(), cmd->mask)))
-        do_drv_read((void *)cmd);
-    else
-        on_selected_cpus(cmd->mask, do_drv_read, cmd, 1);
-}
-
-static void drv_write(struct drv_cmd *cmd)
-{
-    if (cpumask_equal(cmd->mask, cpumask_of(smp_processor_id())))
-        do_drv_write((void *)cmd);
-    else
-        on_selected_cpus(cmd->mask, do_drv_write, cmd, 1);
-}
-
-static u32 get_cur_val(const cpumask_t *mask)
-{
-    struct cpufreq_policy *policy;
-    struct processor_performance *perf;
-    struct drv_cmd cmd;
-    unsigned int cpu = smp_processor_id();
-
-    if (unlikely(cpumask_empty(mask)))
-        return 0;
-
-    if (!cpumask_test_cpu(cpu, mask))
-        cpu = cpumask_first(mask);
-    if (cpu >= nr_cpu_ids || !cpu_online(cpu))
-        return 0;
-
-    policy = per_cpu(cpufreq_cpu_policy, cpu);
-    if (!policy || !cpufreq_drv_data[policy->cpu])
-        return 0;    
-
-    switch (cpufreq_drv_data[policy->cpu]->arch_cpu_flags) {
-    case SYSTEM_INTEL_MSR_CAPABLE:
-        cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
-        cmd.addr.msr.reg = MSR_IA32_PERF_STATUS;
-        break;
-    case SYSTEM_IO_CAPABLE:
-        cmd.type = SYSTEM_IO_CAPABLE;
-        perf = cpufreq_drv_data[policy->cpu]->acpi_data;
-        cmd.addr.io.port = perf->control_register.address;
-        cmd.addr.io.bit_width = perf->control_register.bit_width;
-        break;
-    default:
-        return 0;
-    }
-
-    cmd.mask = cpumask_of(cpu);
-
-    drv_read(&cmd);
-    return cmd.val;
-}
-
-struct perf_pair {
-    union {
-        struct {
-            uint32_t lo;
-            uint32_t hi;
-        } split;
-        uint64_t whole;
-    } aperf, mperf;
-};
-static DEFINE_PER_CPU(struct perf_pair, gov_perf_pair);
-static DEFINE_PER_CPU(struct perf_pair, usr_perf_pair);
-
-static void cf_check read_measured_perf_ctrs(void *_readin)
-{
-    struct perf_pair *readin = _readin;
-
-    rdmsrl(MSR_IA32_APERF, readin->aperf.whole);
-    rdmsrl(MSR_IA32_MPERF, readin->mperf.whole);
-}
-
-/*
- * Return the measured active (C0) frequency on this CPU since last call
- * to this function.
- * Input: cpu number
- * Return: Average CPU frequency in terms of max frequency (zero on error)
- *
- * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
- * over a period of time, while CPU is in C0 state.
- * IA32_MPERF counts at the rate of max advertised frequency
- * IA32_APERF counts at the rate of actual CPU frequency
- * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
- * no meaning should be associated with absolute values of these MSRs.
- */
-unsigned int get_measured_perf(unsigned int cpu, unsigned int flag)
-{
-    struct cpufreq_policy *policy;    
-    struct perf_pair readin, cur, *saved;
-    unsigned int perf_percent;
-
-    if (!cpu_online(cpu))
-        return 0;
-
-    policy = per_cpu(cpufreq_cpu_policy, cpu);
-    if ( !policy || !cpu_has_aperfmperf )
-        return 0;
-
-    switch (flag)
-    {
-    case GOV_GETAVG:
-    {
-        saved = &per_cpu(gov_perf_pair, cpu);
-        break;
-    }
-    case USR_GETAVG:
-    {
-        saved = &per_cpu(usr_perf_pair, cpu);
-        break;
-    }
-    default:
-        return 0;
-    }
-
-    if (cpu == smp_processor_id()) {
-        read_measured_perf_ctrs((void *)&readin);
-    } else {
-        on_selected_cpus(cpumask_of(cpu), read_measured_perf_ctrs,
-                        &readin, 1);
-    }
-
-    cur.aperf.whole = readin.aperf.whole - saved->aperf.whole;
-    cur.mperf.whole = readin.mperf.whole - saved->mperf.whole;
-    saved->aperf.whole = readin.aperf.whole;
-    saved->mperf.whole = readin.mperf.whole;
-
-    if (unlikely(((unsigned long)(-1) / 100) < cur.aperf.whole)) {
-        int shift_count = 7;
-        cur.aperf.whole >>= shift_count;
-        cur.mperf.whole >>= shift_count;
-    }
-
-    if (cur.aperf.whole && cur.mperf.whole)
-        perf_percent = (cur.aperf.whole * 100) / cur.mperf.whole;
-    else
-        perf_percent = 0;
-
-    return policy->cpuinfo.perf_freq * perf_percent / 100;
-}
-
-static unsigned int cf_check get_cur_freq_on_cpu(unsigned int cpu)
-{
-    struct cpufreq_policy *policy;
-    struct acpi_cpufreq_data *data;
-
-    if (!cpu_online(cpu))
-        return 0;
-
-    policy = per_cpu(cpufreq_cpu_policy, cpu);
-    if (!policy)
-        return 0;
-
-    data = cpufreq_drv_data[policy->cpu];
-    if (unlikely(data == NULL ||
-        data->acpi_data == NULL || data->freq_table == NULL))
-        return 0;
-
-    return extract_freq(get_cur_val(cpumask_of(cpu)), data);
-}
-
-void intel_feature_detect(struct cpufreq_policy *policy)
-{
-    unsigned int eax;
-
-    eax = cpuid_eax(6);
-    if (eax & 0x2) {
-        policy->turbo = CPUFREQ_TURBO_ENABLED;
-        if (cpufreq_verbose)
-            printk(XENLOG_INFO "CPU%u: Turbo Mode detected and enabled\n",
-                   smp_processor_id());
-    }
-}
-
-static void cf_check feature_detect(void *info)
-{
-    intel_feature_detect(info);
-}
-
-static unsigned int check_freqs(const cpumask_t *mask, unsigned int freq,
-                                struct acpi_cpufreq_data *data)
-{
-    unsigned int cur_freq;
-    unsigned int i;
-
-    for (i=0; i<100; i++) {
-        cur_freq = extract_freq(get_cur_val(mask), data);
-        if (cur_freq == freq)
-            return 1;
-        udelay(10);
-    }
-    return 0;
-}
-
-static int cf_check acpi_cpufreq_target(
-    struct cpufreq_policy *policy,
-    unsigned int target_freq, unsigned int relation)
-{
-    struct acpi_cpufreq_data *data = cpufreq_drv_data[policy->cpu];
-    struct processor_performance *perf;
-    struct cpufreq_freqs freqs;
-    cpumask_t online_policy_cpus;
-    struct drv_cmd cmd;
-    unsigned int next_state = 0; /* Index into freq_table */
-    unsigned int next_perf_state = 0; /* Index into perf table */
-    unsigned int j;
-    int result = 0;
-
-    if (unlikely(data == NULL ||
-        data->acpi_data == NULL || data->freq_table == NULL)) {
-        return -ENODEV;
-    }
-
-    if (policy->turbo == CPUFREQ_TURBO_DISABLED)
-        if (target_freq > policy->cpuinfo.second_max_freq)
-            target_freq = policy->cpuinfo.second_max_freq;
-
-    perf = data->acpi_data;
-    result = cpufreq_frequency_table_target(policy,
-                                            data->freq_table,
-                                            target_freq,
-                                            relation, &next_state);
-    if (unlikely(result))
-        return -ENODEV;
-
-    cpumask_and(&online_policy_cpus, &cpu_online_map, policy->cpus);
-
-    next_perf_state = data->freq_table[next_state].index;
-    if (perf->state == next_perf_state) {
-        if (unlikely(policy->resume))
-            policy->resume = 0;
-        else
-            return 0;
-    }
-
-    switch (data->arch_cpu_flags) {
-    case SYSTEM_INTEL_MSR_CAPABLE:
-        cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
-        cmd.addr.msr.reg = MSR_IA32_PERF_CTL;
-        cmd.val = (u32) perf->states[next_perf_state].control;
-        break;
-    case SYSTEM_IO_CAPABLE:
-        cmd.type = SYSTEM_IO_CAPABLE;
-        cmd.addr.io.port = perf->control_register.address;
-        cmd.addr.io.bit_width = perf->control_register.bit_width;
-        cmd.val = (u32) perf->states[next_perf_state].control;
-        break;
-    default:
-        return -ENODEV;
-    }
-
-    if (policy->shared_type != CPUFREQ_SHARED_TYPE_ANY)
-        cmd.mask = &online_policy_cpus;
-    else
-        cmd.mask = cpumask_of(policy->cpu);
-
-    freqs.old = perf->states[perf->state].core_frequency * 1000;
-    freqs.new = data->freq_table[next_state].frequency;
-
-    drv_write(&cmd);
-
-    if (acpi_pstate_strict && !check_freqs(cmd.mask, freqs.new, data)) {
-        printk(KERN_WARNING "Fail transfer to new freq %d\n", freqs.new);
-        return -EAGAIN;
-    }
-
-    for_each_cpu(j, &online_policy_cpus)
-        cpufreq_statistic_update(j, perf->state, next_perf_state);
-
-    perf->state = next_perf_state;
-    policy->cur = freqs.new;
-
-    return result;
-}
-
-static int cf_check acpi_cpufreq_verify(struct cpufreq_policy *policy)
-{
-    struct acpi_cpufreq_data *data;
-    struct processor_performance *perf;
-
-    if (!policy || !(data = cpufreq_drv_data[policy->cpu]) ||
-        !processor_pminfo[policy->cpu])
-        return -EINVAL;
-
-    perf = &processor_pminfo[policy->cpu]->perf;
-
-    cpufreq_verify_within_limits(policy, 0, 
-        perf->states[perf->platform_limit].core_frequency * 1000);
-
-    return cpufreq_frequency_table_verify(policy, data->freq_table);
-}
-
-static unsigned long
-acpi_cpufreq_guess_freq(struct acpi_cpufreq_data *data, unsigned int cpu)
-{
-    struct processor_performance *perf = data->acpi_data;
-
-    if (cpu_khz) {
-        /* search the closest match to cpu_khz */
-        unsigned int i;
-        unsigned long freq;
-        unsigned long freqn = perf->states[0].core_frequency * 1000;
-
-        for (i=0; i<(perf->state_count-1); i++) {
-            freq = freqn;
-            freqn = perf->states[i+1].core_frequency * 1000;
-            if ((2 * cpu_khz) > (freqn + freq)) {
-                perf->state = i;
-                return freq;
-            }
-        }
-        perf->state = perf->state_count-1;
-        return freqn;
-    } else {
-        /* assume CPU is at P0... */
-        perf->state = 0;
-        return perf->states[0].core_frequency * 1000;
-    }
-}
-
-static int cf_check acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
-{
-    unsigned int i;
-    unsigned int valid_states = 0;
-    unsigned int cpu = policy->cpu;
-    struct acpi_cpufreq_data *data;
-    unsigned int result = 0;
-    struct cpuinfo_x86 *c = &cpu_data[policy->cpu];
-    struct processor_performance *perf;
-
-    data = xzalloc(struct acpi_cpufreq_data);
-    if (!data)
-        return -ENOMEM;
-
-    cpufreq_drv_data[cpu] = data;
-
-    data->acpi_data = &processor_pminfo[cpu]->perf;
-
-    perf = data->acpi_data;
-    policy->shared_type = perf->shared_type;
-
-    switch (perf->control_register.space_id) {
-    case ACPI_ADR_SPACE_SYSTEM_IO:
-        if (cpufreq_verbose)
-            printk("xen_pminfo: @acpi_cpufreq_cpu_init,"
-                   "SYSTEM IO addr space\n");
-        data->arch_cpu_flags = SYSTEM_IO_CAPABLE;
-        break;
-    case ACPI_ADR_SPACE_FIXED_HARDWARE:
-        if (cpufreq_verbose)
-            printk("xen_pminfo: @acpi_cpufreq_cpu_init,"
-                   "HARDWARE addr space\n");
-        if (!cpu_has(c, X86_FEATURE_EIST)) {
-            result = -ENODEV;
-            goto err_unreg;
-        }
-        data->arch_cpu_flags = SYSTEM_INTEL_MSR_CAPABLE;
-        break;
-    default:
-        result = -ENODEV;
-        goto err_unreg;
-    }
-
-    data->freq_table = xmalloc_array(struct cpufreq_frequency_table, 
-                                    (perf->state_count+1));
-    if (!data->freq_table) {
-        result = -ENOMEM;
-        goto err_unreg;
-    }
-
-    /* detect transition latency */
-    policy->cpuinfo.transition_latency = 0;
-    for (i=0; i<perf->state_count; i++) {
-        if ((perf->states[i].transition_latency * 1000) >
-            policy->cpuinfo.transition_latency)
-            policy->cpuinfo.transition_latency =
-                perf->states[i].transition_latency * 1000;
-    }
-
-    policy->governor = cpufreq_opt_governor ? : CPUFREQ_DEFAULT_GOVERNOR;
-
-    /* table init */
-    for (i=0; i<perf->state_count; i++) {
-        if (i>0 && perf->states[i].core_frequency >=
-            data->freq_table[valid_states-1].frequency / 1000)
-            continue;
-
-        data->freq_table[valid_states].index = i;
-        data->freq_table[valid_states].frequency =
-            perf->states[i].core_frequency * 1000;
-        valid_states++;
-    }
-    data->freq_table[valid_states].frequency = CPUFREQ_TABLE_END;
-    perf->state = 0;
-
-    result = cpufreq_frequency_table_cpuinfo(policy, data->freq_table);
-    if (result)
-        goto err_freqfree;
-
-    switch (perf->control_register.space_id) {
-    case ACPI_ADR_SPACE_SYSTEM_IO:
-        /* Current speed is unknown and not detectable by IO port */
-        policy->cur = acpi_cpufreq_guess_freq(data, policy->cpu);
-        break;
-    case ACPI_ADR_SPACE_FIXED_HARDWARE:
-        cpufreq_driver.get = get_cur_freq_on_cpu;
-        policy->cur = get_cur_freq_on_cpu(cpu);
-        break;
-    default:
-        break;
-    }
-
-    /* Check for APERF/MPERF support in hardware
-     * also check for boost support */
-    if (c->x86_vendor == X86_VENDOR_INTEL && c->cpuid_level >= 6)
-        on_selected_cpus(cpumask_of(cpu), feature_detect, policy, 1);
-
-    /*
-     * the first call to ->target() should result in us actually
-     * writing something to the appropriate registers.
-     */
-    policy->resume = 1;
-
-    return result;
-
-err_freqfree:
-    xfree(data->freq_table);
-err_unreg:
-    xfree(data);
-    cpufreq_drv_data[cpu] = NULL;
-
-    return result;
-}
-
-static int cf_check acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy)
-{
-    struct acpi_cpufreq_data *data = cpufreq_drv_data[policy->cpu];
-
-    if (data) {
-        cpufreq_drv_data[policy->cpu] = NULL;
-        xfree(data->freq_table);
-        xfree(data);
-    }
-
-    return 0;
-}
-
-static const struct cpufreq_driver __initconst_cf_clobber
-acpi_cpufreq_driver = {
-    .name   = "acpi-cpufreq",
-    .verify = acpi_cpufreq_verify,
-    .target = acpi_cpufreq_target,
-    .init   = acpi_cpufreq_cpu_init,
-    .exit   = acpi_cpufreq_cpu_exit,
-    .get    = get_cur_freq_on_cpu,
-};
-
 static int __init cf_check cpufreq_driver_init(void)
 {
     int ret = 0;
@@ -640,7 +50,7 @@  static int __init cf_check cpufreq_driver_init(void)
                 switch ( cpufreq_xen_opts[i] )
                 {
                 case CPUFREQ_xen:
-                    ret = cpufreq_register_driver(&acpi_cpufreq_driver);
+                    ret = acpi_register_driver();
                     break;
                 case CPUFREQ_hwp:
                     ret = hwp_register_driver();
diff --git a/xen/include/acpi/cpufreq/cpufreq.h b/xen/include/acpi/cpufreq/cpufreq.h
index 443427153b..1a8ba3756c 100644
--- a/xen/include/acpi/cpufreq/cpufreq.h
+++ b/xen/include/acpi/cpufreq/cpufreq.h
@@ -260,4 +260,5 @@  int get_hwp_para(unsigned int cpu,
 int set_hwp_para(struct cpufreq_policy *policy,
                  struct xen_set_cppc_para *set_cppc);
 
+int acpi_register_driver(void);
 #endif /* __XEN_CPUFREQ_PM_H__ */