@@ -15,6 +15,53 @@
#include <xen/init.h>
#include <xen/param.h>
#include <acpi/cpufreq/cpufreq.h>
+#include <asm/msr.h>
+
+#define amd_pstate_err(cpu, fmt, args...) \
+    printk(XENLOG_ERR "AMD_PSTATE: CPU%u error: " fmt, cpu, ## args)
+#define amd_pstate_verbose(fmt, args...) \
+({ \
+    if ( cpufreq_verbose ) \
+        printk(XENLOG_DEBUG "AMD_PSTATE: " fmt, ## args); \
+})
+#define amd_pstate_warn(cpu, fmt, args...) \
+    printk(XENLOG_WARNING "AMD_PSTATE: CPU%u warning: " fmt, cpu, ## args)
+
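+/*
+ * Per-CPU driver state.  The two anonymous unions mirror the bit layout
+ * of MSR_AMD_CPPC_CAP1 (hardware capabilities) and MSR_AMD_CPPC_REQ
+ * (performance request) respectively, so the raw 64-bit values can be
+ * passed to rdmsr/wrmsr directly.
+ */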
+struct amd_pstate_drv_data
+{
+    struct xen_processor_cppc *cppc_data;
+    union
+    {
+        uint64_t amd_caps;
+        struct
+        {
+            unsigned int lowest_perf:8;
+            unsigned int lowest_nonlinear_perf:8;
+            unsigned int nominal_perf:8;
+            unsigned int highest_perf:8;
+            unsigned int :32;
+        } hw;
+    };
+    union
+    {
+        uint64_t amd_req;
+        struct
+        {
+            unsigned int max_perf:8;
+            unsigned int min_perf:8;
+            unsigned int des_perf:8;
+            unsigned int epp:8;
+            unsigned int :32;
+        } req;
+    };
+    int err;
+
+    uint32_t max_freq;
+    uint32_t min_freq;
+    uint32_t nominal_freq;
+};
+
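+/* Per-CPU driver data, allocated in ->init() and freed in ->exit(). */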
+static DEFINE_PER_CPU_READ_MOSTLY(struct amd_pstate_drv_data *,
+                                  amd_pstate_drv_data);
uint16_t __read_mostly dmi_max_speed_mhz;
@@ -52,9 +99,298 @@ int __init amd_pstate_cmdline_parse(const char *s, const char *e)
return 0;
}
+/*
+ * If the CPPC lowest_freq and nominal_freq registers are exposed, use
+ * them to convert perf to freq and vice versa.  The conversion is
+ * extrapolated as an affine function passing through the two points:
+ * - (lowest perf, lowest freq)
+ * - (nominal perf, nominal freq)
+ */
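+/*
+ * Worked example (illustrative numbers, not taken from any datasheet):
+ * with lowest_perf = 20 at lowest_freq = 400 MHz and nominal_perf = 160
+ * at nominal_freq = 3200 MHz, mul = 140 and div = 2800, so
+ * offset = 160 - (140 * 3200) / 2800 = 0.  A target of 1600000 kHz then
+ * maps to perf = 0 + (140 * 1600000) / (2800 * 1000) = 80.
+ */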
+static unsigned int amd_pstate_khz_to_perf(struct amd_pstate_drv_data *data,
+                                           unsigned int freq)
+{
+    struct xen_processor_cppc *cppc_data = data->cppc_data;
+    uint64_t mul, div, offset = 0;
+
+    if ( freq == (cppc_data->nominal_freq * 1000) )
+        return data->hw.nominal_perf;
+
+    if ( freq == (cppc_data->lowest_freq * 1000) )
+        return data->hw.lowest_perf;
+
+    if ( cppc_data->lowest_freq && cppc_data->nominal_freq )
+    {
+        mul = data->hw.nominal_perf - data->hw.lowest_perf;
+        div = cppc_data->nominal_freq - cppc_data->lowest_freq;
+        /*
+         * We don't need to convert to kHz when computing the offset and
+         * can use nominal_freq and lowest_freq directly, as the division
+         * removes the frequency unit.
+         */
+        div = div ?: 1;
+        offset = data->hw.nominal_perf - (mul * cppc_data->nominal_freq) / div;
+    }
+    else
+    {
+        /* Read Processor Max Speed (MHz) from the DMI table as anchor point. */
+        mul = data->hw.highest_perf;
+        div = dmi_max_speed_mhz;
+    }
+
+    return (unsigned int)(offset + (mul * freq) / (div * 1000));
+}
+
+static unsigned int amd_get_min_freq(struct amd_pstate_drv_data *data)
+{
+    struct xen_processor_cppc *cppc_data = data->cppc_data;
+    uint64_t mul, div;
+
+    if ( cppc_data->lowest_freq )
+        /* Switch to kHz */
+        return cppc_data->lowest_freq * 1000;
+
+    /*
+     * Read Processor Max Speed (MHz) from the DMI table as anchor point.
+     * Multiply before dividing to avoid truncating the scale factor.
+     */
+    mul = (uint64_t)dmi_max_speed_mhz * data->hw.lowest_perf;
+    div = data->hw.highest_perf;
+
+    return (unsigned int)(mul / div) * 1000;
+}
+
+static unsigned int amd_get_nominal_freq(struct amd_pstate_drv_data *data)
+{
+    struct xen_processor_cppc *cppc_data = data->cppc_data;
+    uint64_t mul, div;
+
+    if ( cppc_data->nominal_freq )
+        /* Switch to kHz */
+        return cppc_data->nominal_freq * 1000;
+
+    /*
+     * Read Processor Max Speed (MHz) from the DMI table as anchor point.
+     * Multiply before dividing to avoid truncating the scale factor.
+     */
+    mul = (uint64_t)dmi_max_speed_mhz * data->hw.nominal_perf;
+    div = data->hw.highest_perf;
+
+    return (unsigned int)(mul / div) * 1000;
+}
+
+static unsigned int amd_get_max_freq(struct amd_pstate_drv_data *data)
+{
+    uint64_t nominal_freq = amd_get_nominal_freq(data);
+    uint64_t nominal_perf = data->hw.nominal_perf;
+    uint64_t max_perf = data->hw.highest_perf;
+
+    /*
+     * Scale nominal_freq by the boost ratio (highest_perf / nominal_perf),
+     * multiplying before dividing so the ratio is not truncated to an
+     * integer.
+     */
+    return (unsigned int)(nominal_freq * max_perf / nominal_perf);
+}
+
+static int cf_check amd_pstate_cpufreq_verify(struct cpufreq_policy *policy)
+{
+    struct amd_pstate_drv_data *data = per_cpu(amd_pstate_drv_data,
+                                               policy->cpu);
+
+    cpufreq_verify_within_limits(policy, data->min_freq, data->max_freq);
+
+    return 0;
+}
+
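+/* IPI callback: runs on the target CPU so the MSR write hits that core. */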
+static void cf_check amd_pstate_write_request_msrs(void *info)
+{
+    struct amd_pstate_drv_data *data = info;
+
+    if ( wrmsr_safe(MSR_AMD_CPPC_REQ, data->amd_req) )
+    {
+        amd_pstate_verbose("Failed to wrmsr_safe(MSR_AMD_CPPC_REQ, %lx)\n",
+                           data->amd_req);
+        data->err = -EINVAL;
+        return;
+    }
+    data->err = 0;
+}
+
+static int amd_pstate_write_request(unsigned int cpu, uint8_t min_perf,
+                                    uint8_t des_perf, uint8_t max_perf)
+{
+    struct amd_pstate_drv_data *data = per_cpu(amd_pstate_drv_data, cpu);
+    uint64_t prev = data->amd_req;
+
+    data->req.min_perf = min_perf;
+    data->req.max_perf = max_perf;
+    data->req.des_perf = des_perf;
+
+    /* Skip the IPI and MSR write when the request is unchanged. */
+    if ( prev == data->amd_req )
+        return 0;
+
+    on_selected_cpus(cpumask_of(cpu), amd_pstate_write_request_msrs, data, 1);
+
+    return data->err;
+}
+
+static int cf_check amd_pstate_cpufreq_target(struct cpufreq_policy *policy,
+                                              unsigned int target_freq,
+                                              unsigned int relation)
+{
+    unsigned int cpu = policy->cpu;
+    struct amd_pstate_drv_data *data = per_cpu(amd_pstate_drv_data, cpu);
+    uint64_t max_perf, min_perf, des_perf;
+
+    if ( unlikely(!target_freq) )
+    {
+        amd_pstate_warn(cpu, "Not setting target frequency to zero\n");
+        return 0;
+    }
+
+    max_perf = data->hw.highest_perf;
+    min_perf = data->hw.lowest_nonlinear_perf;
+    des_perf = amd_pstate_khz_to_perf(data, target_freq);
+
+    return amd_pstate_write_request(cpu, min_perf, des_perf, max_perf);
+}
+
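+/*
+ * Runs on the target CPU via on_selected_cpus(): enable CPPC, read the
+ * capability MSR, and derive the frequency limits for this CPU.
+ */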
+static void cf_check amd_pstate_init_msrs(void *info)
+{
+    struct cpufreq_policy *policy = info;
+    struct amd_pstate_drv_data *data = this_cpu(amd_pstate_drv_data);
+    uint64_t val;
+    unsigned int min_freq, nominal_freq, max_freq;
+
+    /* Package level MSR */
+    if ( rdmsr_safe(MSR_AMD_CPPC_ENABLE, val) )
+    {
+        amd_pstate_err(policy->cpu, "rdmsr_safe(MSR_AMD_CPPC_ENABLE)\n");
+        data->err = -EINVAL;
+        return;
+    }
+
+    if ( !(val & AMD_CPPC_ENABLE) )
+    {
+        val |= AMD_CPPC_ENABLE;
+        if ( wrmsr_safe(MSR_AMD_CPPC_ENABLE, val) )
+        {
+            amd_pstate_err(policy->cpu,
+                           "wrmsr_safe(MSR_AMD_CPPC_ENABLE, %lx)\n", val);
+            data->err = -EINVAL;
+            return;
+        }
+    }
+
+    if ( rdmsr_safe(MSR_AMD_CPPC_CAP1, data->amd_caps) )
+    {
+        amd_pstate_err(policy->cpu, "rdmsr_safe(MSR_AMD_CPPC_CAP1)\n");
+        goto error;
+    }
+
+    if ( data->hw.highest_perf == 0 || data->hw.lowest_perf == 0 ||
+         data->hw.nominal_perf == 0 || data->hw.lowest_nonlinear_perf == 0 )
+    {
+        amd_pstate_err(policy->cpu,
+                       "Invalid CPPC capabilities (highest_perf: %u, lowest_perf: %u, nominal_perf: %u, lowest_nonlinear_perf: %u): none may be zero\n",
+                       data->hw.highest_perf, data->hw.lowest_perf,
+                       data->hw.nominal_perf, data->hw.lowest_nonlinear_perf);
+        goto error;
+    }
+
+    min_freq = amd_get_min_freq(data);
+    nominal_freq = amd_get_nominal_freq(data);
+    max_freq = amd_get_max_freq(data);
+    if ( min_freq > max_freq )
+    {
+        amd_pstate_err(policy->cpu,
+                       "min_freq(%u) is higher than max_freq(%u)\n",
+                       min_freq, max_freq);
+        goto error;
+    }
+
+    policy->min = min_freq;
+    policy->max = max_freq;
+
+    policy->cpuinfo.min_freq = min_freq;
+    policy->cpuinfo.max_freq = max_freq;
+    policy->cpuinfo.perf_freq = nominal_freq;
+    policy->cur = nominal_freq;
+
+    /* Initial processor data capability frequencies */
+    data->min_freq = min_freq;
+    data->nominal_freq = nominal_freq;
+    data->max_freq = max_freq;
+
+    data->err = 0;
+    return;
+
+ error:
+    data->err = -EINVAL;
+    /* Disable CPPC again on failure. */
+    val &= ~AMD_CPPC_ENABLE;
+    if ( wrmsr_safe(MSR_AMD_CPPC_ENABLE, val) )
+        amd_pstate_err(policy->cpu,
+                       "wrmsr_safe(MSR_AMD_CPPC_ENABLE, %lx)\n", val);
+}
+
+/*
+ * Unlike the legacy ACPI hardware P-States, the new AMD P-States driver
+ * exposes a finer-grained frequency range between the highest and lowest
+ * frequencies.  The boost frequency maps onto the highest performance
+ * ratio, whereas the legacy P0 frequency maps onto the nominal
+ * performance ratio.
+ */
+static void amd_pstate_boost_init(struct cpufreq_policy *policy,
+                                  struct amd_pstate_drv_data *data)
+{
+    uint32_t highest_perf = data->hw.highest_perf;
+    uint32_t nominal_perf = data->hw.nominal_perf;
+
+    if ( highest_perf <= nominal_perf )
+        return;
+
+    policy->turbo = CPUFREQ_TURBO_ENABLED;
+}
+
+static int cf_check amd_pstate_cpufreq_cpu_init(struct cpufreq_policy *policy)
+{
+    unsigned int cpu = policy->cpu;
+    struct amd_pstate_drv_data *data;
+
+    data = xzalloc(struct amd_pstate_drv_data);
+    if ( !data )
+        return -ENOMEM;
+
+    data->cppc_data = &processor_pminfo[cpu]->cppc_data;
+
+    policy->governor = cpufreq_opt_governor ?: CPUFREQ_DEFAULT_GOVERNOR;
+
+    per_cpu(amd_pstate_drv_data, cpu) = data;
+
+    on_selected_cpus(cpumask_of(cpu), amd_pstate_init_msrs, policy, 1);
+
+    if ( data->err )
+    {
+        amd_pstate_err(cpu, "Could not initialize AMD CPPC MSRs properly\n");
+        per_cpu(amd_pstate_drv_data, cpu) = NULL;
+        xfree(data);
+        return -ENODEV;
+    }
+
+    amd_pstate_boost_init(policy, data);
+
+    return 0;
+}
+
+static int cf_check amd_pstate_cpufreq_cpu_exit(struct cpufreq_policy *policy)
+{
+    struct amd_pstate_drv_data *data = per_cpu(amd_pstate_drv_data,
+                                               policy->cpu);
+
+    per_cpu(amd_pstate_drv_data, policy->cpu) = NULL;
+    xfree(data);
+
+    return 0;
+}
+
static const struct cpufreq_driver __initconstrel amd_pstate_cpufreq_driver =
{
.name = XEN_AMD_PSTATE_DRIVER_NAME,
+    .verify = amd_pstate_cpufreq_verify,
+    .target = amd_pstate_cpufreq_target,
+    .init   = amd_pstate_cpufreq_cpu_init,
+    .exit   = amd_pstate_cpufreq_cpu_exit,
};
int __init amd_pstate_register_driver(void)
@@ -455,6 +455,11 @@
#define MSR_AMD_PPIN_CTL 0xc00102f0U
#define MSR_AMD_PPIN 0xc00102f1U
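+/* AMD Collaborative Processor Performance Control (CPPC) MSRs */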
+#define MSR_AMD_CPPC_CAP1 0xc00102b0U
+#define MSR_AMD_CPPC_ENABLE 0xc00102b1U
+#define MSR_AMD_CPPC_REQ 0xc00102b3U
+#define AMD_CPPC_ENABLE BIT(0, ULL)
+
+
/* VIA Cyrix defined MSRs*/
#define MSR_VIA_FCR 0x00001107
#define MSR_VIA_RNG 0x0000110b
amd-pstate is the AMD CPU performance scaling driver that introduces a new
CPU frequency control mechanism on AMD Zen based CPU series. The new
mechanism is based on Collaborative Processor Performance Control (CPPC),
which provides finer-grained frequency management than legacy ACPI hardware
P-States. Current AMD CPU platforms use the ACPI P-states driver to manage
CPU frequency and clocks, switching among only three P-states. amd-pstate
offers a more flexible, low-latency interface for Xen to communicate
performance hints directly to the hardware.

This first version of amd-pstate can leverage common governors such as
*ondemand*, *performance*, etc. to manage the performance hints. In the
future, we will introduce an advanced active mode to enable autonomous
performance level selection.

Signed-off-by: Penny Zheng <Penny.Zheng@amd.com>
---
 xen/arch/x86/acpi/cpufreq/amd-pstate.c | 336 +++++++++++++++++++++++++
 xen/arch/x86/include/asm/msr-index.h   |   5 +
 2 files changed, 341 insertions(+)