
[v1,05/11] xen/x86: introduce a new amd pstate driver for cpufreq scaling

Message ID 20241203081111.463400-6-Penny.Zheng@amd.com (mailing list archive)
State New
Series amd-pstate CPU Performance Scaling Driver

Commit Message

Penny Zheng Dec. 3, 2024, 8:11 a.m. UTC
amd-pstate is the AMD CPU performance scaling driver that introduces a
new CPU frequency control mechanism on AMD Zen based CPU series.
The new mechanism is based on Collaborative Processor Performance
Control (CPPC), which provides finer-grained frequency management
than legacy ACPI hardware P-States.
Current AMD CPU platforms use the ACPI P-states driver to manage CPU
frequency and clocks, switching among only three P-states.
The new amd-pstate offers a more flexible, low-latency interface for Xen
to communicate performance hints directly to the hardware.
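
For illustration, a performance hint ultimately amounts to a single
per-CPU register write packing the minimum, desired and maximum
performance levels (all names as introduced by this patch), roughly:

    data->req.min_perf = data->hw.lowest_nonlinear_perf;
    data->req.max_perf = data->hw.highest_perf;
    data->req.des_perf = amd_pstate_khz_to_perf(data, target_freq);
    wrmsr_safe(MSR_AMD_CPPC_REQ, data->amd_req);  /* on the target CPU */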

The first version of amd-pstate leverages common governors such as
*ondemand*, *performance*, etc., to manage the performance hints. In the
future, we will introduce an advanced active mode to enable autonomous
performance level selection.

Signed-off-by: Penny Zheng <Penny.Zheng@amd.com>
---
 xen/arch/x86/acpi/cpufreq/amd-pstate.c | 336 +++++++++++++++++++++++++
 xen/arch/x86/include/asm/msr-index.h   |   5 +
 2 files changed, 341 insertions(+)

Patch

diff --git a/xen/arch/x86/acpi/cpufreq/amd-pstate.c b/xen/arch/x86/acpi/cpufreq/amd-pstate.c
index bfad96ae3d..5dfa35581a 100644
--- a/xen/arch/x86/acpi/cpufreq/amd-pstate.c
+++ b/xen/arch/x86/acpi/cpufreq/amd-pstate.c
@@ -15,6 +15,53 @@ 
 #include <xen/init.h>
 #include <xen/param.h>
 #include <acpi/cpufreq/cpufreq.h>
+#include <asm/msr.h>
+
+#define amd_pstate_err(cpu, fmt, args...) \
+    printk(XENLOG_ERR "AMD_PSTATE: CPU%u error: " fmt, cpu, ## args)
+#define amd_pstate_verbose(fmt, args...)                         \
+({                                                               \
+    if ( cpufreq_verbose )                                       \
+        printk(XENLOG_DEBUG "AMD_PSTATE: " fmt, ## args);        \
+})
+#define amd_pstate_warn(cpu, fmt, args...) \
+    printk(XENLOG_WARNING "AMD_PSTATE: CPU%u warning: " fmt, cpu, ## args)
+
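+/*
+ * Per-CPU driver data: the xen_processor_cppc information taken from
+ * processor_pminfo[cpu], the capabilities read from MSR_AMD_CPPC_CAP1, the
+ * last request written to MSR_AMD_CPPC_REQ, and the frequency bounds
+ * derived from them.
+ */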
+struct amd_pstate_drv_data
+{
+    struct xen_processor_cppc *cppc_data;
+    union
+    {
+        uint64_t amd_caps;
+        struct
+        {
+            unsigned int lowest_perf:8;
+            unsigned int lowest_nonlinear_perf:8;
+            unsigned int nominal_perf:8;
+            unsigned int highest_perf:8;
+            unsigned int :32;
+        } hw;
+    };
+    union
+    {
+        uint64_t amd_req;
+        struct
+        {
+            unsigned int max_perf:8;
+            unsigned int min_perf:8;
+            unsigned int des_perf:8;
+            unsigned int epp:8;
+            unsigned int :32;
+        } req;
+    };
+    int err;
+
+    uint32_t max_freq;
+    uint32_t min_freq;
+    uint32_t nominal_freq;
+};
+
+static DEFINE_PER_CPU_READ_MOSTLY(struct amd_pstate_drv_data *, amd_pstate_drv_data);
 
 uint16_t __read_mostly dmi_max_speed_mhz;
 
@@ -52,9 +99,298 @@  int __init amd_pstate_cmdline_parse(const char *s, const char *e)
     return 0;
 }
 
+/*
+ * If the CPPC lowest_freq and nominal_freq registers are exposed then we can
+ * use them to convert perf to freq and vice versa. The conversion is
+ * extrapolated as an affine function passing through the two points:
+ *  - (lowest perf, lowest freq)
+ *  - (nominal perf, nominal freq)
+ */
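+/*
+ * Worked example (illustrative numbers only): with (lowest_freq, lowest_perf)
+ * = (400 MHz, 20) and (nominal_freq, nominal_perf) = (2000 MHz, 100), the
+ * slope is (100 - 20) / (2000 - 400) = 0.05 perf/MHz, so a 1200000 kHz
+ * request maps to a desired performance level of 100 + 0.05 * (1200 - 2000)
+ * = 60.
+ */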
+static unsigned int amd_pstate_khz_to_perf(struct amd_pstate_drv_data *data,
+                                           unsigned int freq)
+{
+    struct xen_processor_cppc *cppc_data = data->cppc_data;
+    uint64_t mul, div, offset = 0;
+
+    if ( freq == (cppc_data->nominal_freq * 1000) )
+        return data->hw.nominal_perf;
+
+    if ( freq == (cppc_data->lowest_freq * 1000) )
+        return data->hw.lowest_perf;
+
+    if ( cppc_data->lowest_freq && cppc_data->nominal_freq )
+    {
+        mul = data->hw.nominal_perf - data->hw.lowest_perf;
+        div = cppc_data->nominal_freq - cppc_data->lowest_freq;
+        /*
+         * We don't need to convert to kHz for computing offset and can
+         * directly use nominal_freq and lowest_freq as the division
+         * will remove the frequency unit.
+         */
+        div = div ?: 1;
+        offset = data->hw.nominal_perf - (mul * cppc_data->nominal_freq) / div;
+    }
+    else
+    {
+        /* Read Processor Max Speed (MHz) from the DMI table as anchor point */
+        mul = data->hw.highest_perf;
+        div = dmi_max_speed_mhz;
+    }
+
+    return (unsigned int)(offset + (mul * freq) / (div * 1000));
+}
+
+static unsigned int amd_get_min_freq(struct amd_pstate_drv_data *data)
+{
+    struct xen_processor_cppc *cppc_data = data->cppc_data;
+    uint64_t mul, div;
+
+    if ( cppc_data->lowest_freq )
+        /* Switch to kHz */
+        return cppc_data->lowest_freq * 1000;
+
+    /* Read Processor Max Speed (MHz) from the DMI table as anchor point */
+    mul = dmi_max_speed_mhz;
+    div = data->hw.highest_perf;
+
+    /* Multiply before dividing to avoid losing precision to truncation */
+    return (unsigned int)((mul * data->hw.lowest_perf * 1000) / div);
+}
+
+static unsigned int amd_get_nominal_freq(struct amd_pstate_drv_data *data)
+{
+    struct xen_processor_cppc *cppc_data = data->cppc_data;
+    uint64_t mul, div;
+
+    if ( cppc_data->nominal_freq )
+        /* Switch to kHz */
+        return cppc_data->nominal_freq * 1000;
+
+    /* Read Processor Max Speed (MHz) from the DMI table as anchor point */
+    mul = dmi_max_speed_mhz;
+    div = data->hw.highest_perf;
+
+    /* Multiply before dividing to avoid losing precision to truncation */
+    return (unsigned int)((mul * data->hw.nominal_perf * 1000) / div);
+}
+
+static unsigned int amd_get_max_freq(struct amd_pstate_drv_data *data)
+{
+    uint64_t max_perf, max_freq, nominal_freq, nominal_perf;
+
+    nominal_freq = amd_get_nominal_freq(data);
+    nominal_perf = data->hw.nominal_perf;
+    max_perf = data->hw.highest_perf;
+
+    /*
+     * Scale nominal_freq by the boost ratio (highest_perf / nominal_perf);
+     * multiply before dividing so the ratio is not truncated to an integer.
+     */
+    max_freq = nominal_freq * max_perf / nominal_perf;
+
+    return max_freq;
+}
+
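+/* Clamp the policy to the frequency range derived from the CPPC capabilities */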
+static int cf_check amd_pstate_cpufreq_verify(struct cpufreq_policy *policy)
+{
+    struct amd_pstate_drv_data *data = per_cpu(amd_pstate_drv_data, policy->cpu);
+
+    cpufreq_verify_within_limits(policy, data->min_freq, data->max_freq);
+
+    return 0;
+}
+
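+/*
+ * MSR_AMD_CPPC_REQ is a per-CPU register: the updated request is written on
+ * the target CPU itself (via on_selected_cpus() in the caller) and the
+ * outcome is reported back through data->err.
+ */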
+static void cf_check amd_pstate_write_request_msrs(void *info)
+{
+    struct amd_pstate_drv_data *data = info;
+
+    if ( wrmsr_safe(MSR_AMD_CPPC_REQ, data->amd_req) )
+    {
+        amd_pstate_verbose("Failed to wrmsr_safe(MSR_AMD_CPPC_REQ, %lx)\n",
+                           data->amd_req);
+        data->err = -EINVAL;
+        return;
+    }
+    data->err = 0;
+}
+
+static int amd_pstate_write_request(unsigned int cpu, uint8_t min_perf,
+                                    uint8_t des_perf, uint8_t max_perf)
+{
+    struct amd_pstate_drv_data *data = per_cpu(amd_pstate_drv_data, cpu);
+    uint64_t prev = data->amd_req;
+
+    data->req.min_perf = min_perf;
+    data->req.max_perf = max_perf;
+    data->req.des_perf = des_perf;
+
+    if ( prev == data->amd_req )
+        return 0;
+
+    on_selected_cpus(cpumask_of(cpu), amd_pstate_write_request_msrs, data, 1);
+
+    return data->err;
+}
+
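+/*
+ * Governor callback: translate the requested frequency (kHz) into a desired
+ * performance level and submit it along with the fixed min/max bounds.
+ */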
+static int cf_check amd_pstate_cpufreq_target(struct cpufreq_policy *policy,
+                                              unsigned int target_freq,
+                                              unsigned int relation)
+{
+    unsigned int cpu = policy->cpu;
+    struct amd_pstate_drv_data *data = per_cpu(amd_pstate_drv_data, cpu);
+    uint64_t max_perf, min_perf, des_perf;
+
+    if ( unlikely(!target_freq) )
+    {
+        amd_pstate_warn(cpu, "Not setting target frequency to zero\n");
+        return 0;
+    }
+    max_perf = data->hw.highest_perf;
+    min_perf = data->hw.lowest_nonlinear_perf;
+    des_perf = amd_pstate_khz_to_perf(data, target_freq);
+
+    return amd_pstate_write_request(cpu, min_perf, des_perf, max_perf);
+}
+
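+/*
+ * Runs on the target CPU: enable CPPC through the package-level
+ * MSR_AMD_CPPC_ENABLE, read the performance capabilities from
+ * MSR_AMD_CPPC_CAP1, and derive the min/nominal/max frequencies used to
+ * populate the cpufreq policy. On failure CPPC is disabled again and
+ * data->err is set.
+ */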
+static void cf_check amd_pstate_init_msrs(void *info)
+{
+    struct cpufreq_policy *policy = info;
+    struct amd_pstate_drv_data *data = this_cpu(amd_pstate_drv_data);
+    uint64_t val;
+    unsigned int min_freq, nominal_freq, max_freq;
+
+    /* Package level MSR */
+    if ( rdmsr_safe(MSR_AMD_CPPC_ENABLE, val) )
+    {
+        amd_pstate_err(policy->cpu, "rdmsr_safe(MSR_AMD_CPPC_ENABLE)\n");
+        data->err = -EINVAL;
+        return;
+    }
+
+    if ( !(val & AMD_CPPC_ENABLE) )
+    {
+        val |= AMD_CPPC_ENABLE;
+        if ( wrmsr_safe(MSR_AMD_CPPC_ENABLE, val) )
+        {
+            amd_pstate_err(policy->cpu, "wrmsr_safe(MSR_AMD_CPPC_ENABLE, %lx)\n", val);
+            data->err = -EINVAL;
+            return;
+        }
+    }
+
+    if ( rdmsr_safe(MSR_AMD_CPPC_CAP1, data->amd_caps) )
+    {
+        amd_pstate_err(policy->cpu, "rdmsr_safe(MSR_AMD_CPPC_CAP1)\n");
+        goto error;
+    }
+
+    if ( data->hw.highest_perf == 0 || data->hw.lowest_perf == 0 ||
+         data->hw.nominal_perf == 0 || data->hw.lowest_nonlinear_perf == 0 )
+    {
+        amd_pstate_err(policy->cpu, "Platform malfunction, read CPPC highest_perf: %u, lowest_perf: %u, nominal_perf: %u, lowest_nonlinear_perf: %u zero value\n",
+                       data->hw.highest_perf, data->hw.lowest_perf,
+                       data->hw.nominal_perf, data->hw.lowest_nonlinear_perf);
+        goto error;
+    }
+
+    min_freq = amd_get_min_freq(data);
+    nominal_freq = amd_get_nominal_freq(data);
+    max_freq = amd_get_max_freq(data);
+    if ( min_freq > max_freq )
+    {
+        amd_pstate_err(policy->cpu, "min_freq(%u) or max_freq(%u) value is incorrect\n",
+                       min_freq, max_freq);
+        goto error;
+    }
+
+    policy->min = min_freq;
+    policy->max = max_freq;
+
+    policy->cpuinfo.min_freq = min_freq;
+    policy->cpuinfo.max_freq = max_freq;
+    policy->cpuinfo.perf_freq = nominal_freq;
+    policy->cur = nominal_freq;
+
+    /* Initial processor data capability frequencies */
+    data->min_freq = min_freq;
+    data->nominal_freq = nominal_freq;
+    data->max_freq = max_freq;
+
+    data->err = 0;
+    return;
+
+ error:
+    data->err = -EINVAL;
+    val &= ~AMD_CPPC_ENABLE;
+    if ( wrmsr_safe(MSR_AMD_CPPC_ENABLE, val) )
+        amd_pstate_err(policy->cpu, "wrmsr_safe(MSR_AMD_CPPC_ENABLE, %lx)\n", val);
+}
+
+/*
+ * Unlike the legacy ACPI hardware P-States, the new AMD P-States driver
+ * exposes a finer-grained frequency range between the highest and lowest
+ * frequency. The boost frequency is the frequency mapped to the highest
+ * performance ratio, whereas the legacy P0 frequency maps to the nominal
+ * performance ratio.
+ */
+static void amd_pstate_boost_init(struct cpufreq_policy *policy,
+                                  const struct amd_pstate_drv_data *data)
+{
+    uint32_t highest_perf, nominal_perf;
+
+    highest_perf = data->hw.highest_perf;
+    nominal_perf = data->hw.nominal_perf;
+
+    if ( highest_perf <= nominal_perf )
+        return;
+
+    policy->turbo = CPUFREQ_TURBO_ENABLED;
+}
+
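+/*
+ * Per-CPU init: allocate the driver data, run the MSR initialisation on the
+ * target CPU, and undo everything if that initialisation fails.
+ */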
+static int cf_check amd_pstate_cpufreq_cpu_init(struct cpufreq_policy *policy)
+{
+    unsigned int cpu = policy->cpu;
+    struct amd_pstate_drv_data *data;
+
+    data = xzalloc(struct amd_pstate_drv_data);
+    if ( !data )
+        return -ENOMEM;
+
+    data->cppc_data = &processor_pminfo[cpu]->cppc_data;
+
+    policy->governor = cpufreq_opt_governor ? : CPUFREQ_DEFAULT_GOVERNOR;
+
+    per_cpu(amd_pstate_drv_data, cpu) = data;
+
+    on_selected_cpus(cpumask_of(cpu), amd_pstate_init_msrs, policy, 1);
+
+    if ( data->err )
+    {
+        amd_pstate_err(cpu, "Could not initialize AMD CPPC MSR properly\n");
+        per_cpu(amd_pstate_drv_data, cpu) = NULL;
+        xfree(data);
+        return -ENODEV;
+    }
+
+    amd_pstate_boost_init(policy, data);
+    return 0;
+}
+
+static int cf_check amd_pstate_cpufreq_cpu_exit(struct cpufreq_policy *policy)
+{
+    struct amd_pstate_drv_data *data = per_cpu(amd_pstate_drv_data, policy->cpu);
+
+    per_cpu(amd_pstate_drv_data, policy->cpu) = NULL;
+    xfree(data);
+
+    return 0;
+}
+
 static const struct cpufreq_driver __initconstrel amd_pstate_cpufreq_driver =
 {
     .name   = XEN_AMD_PSTATE_DRIVER_NAME,
+    .verify = amd_pstate_cpufreq_verify,
+    .target = amd_pstate_cpufreq_target,
+    .init   = amd_pstate_cpufreq_cpu_init,
+    .exit   = amd_pstate_cpufreq_cpu_exit,
 };
 
 int __init amd_pstate_register_driver(void)
diff --git a/xen/arch/x86/include/asm/msr-index.h b/xen/arch/x86/include/asm/msr-index.h
index 9cdb5b2625..14eaddcaad 100644
--- a/xen/arch/x86/include/asm/msr-index.h
+++ b/xen/arch/x86/include/asm/msr-index.h
@@ -455,6 +455,11 @@ 
 #define MSR_AMD_PPIN_CTL                0xc00102f0U
 #define MSR_AMD_PPIN                    0xc00102f1U
 
+#define MSR_AMD_CPPC_CAP1               0xc00102b0U
+#define MSR_AMD_CPPC_ENABLE             0xc00102b1U
+#define MSR_AMD_CPPC_REQ                0xc00102b3U
+#define AMD_CPPC_ENABLE                 BIT(0, ULL)
+
 /* VIA Cyrix defined MSRs*/
 #define MSR_VIA_FCR			0x00001107
 #define MSR_VIA_RNG			0x0000110b