diff mbox series

[v2,05/11] xen/x86: introduce a new amd cppc driver for cpufreq scaling

Message ID 20250206083255.1296363-6-Penny.Zheng@amd.com (mailing list archive)
State New
Headers show
Series amd-cppc CPU Performance Scaling Driver | expand

Commit Message

Penny, Zheng Feb. 6, 2025, 8:32 a.m. UTC
amd-cppc is the AMD CPU performance scaling driver that introduces a
new CPU frequency control mechanism, first available on AMD Zen based CPU series.
The new mechanism is based on Collaborative Processor Performance
Control (CPPC) which is a finer grain frequency management
than legacy ACPI hardware P-States.
Current AMD CPU platforms are using the ACPI P-states driver to
manage CPU frequency and clocks, switching among only 3 P-states.
The new amd-cppc allows a more flexible, low-latency interface for Xen
to directly communicate the performance hints to hardware.

The first version "amd-cppc" could leverage common governors such as
*ondemand*, *performance*, etc., to manage the performance hints. In the
future, we will introduce an advanced active mode to enable autonomous
performance level selection.

Signed-off-by: Penny Zheng <Penny.Zheng@amd.com>
---
v1 -> v2:
- re-construct union caps and req to have anonymous struct instead
- avoid "else" when the earlier if() ends in an unconditional control flow statement
- Add check to avoid chopping off set bits from cast
- make pointers pointer-to-const wherever possible
- remove noisy log
- exclude families before 0x17 before CPPC-feature MSR op
- remove useless variable helpers
- use xvzalloc and XVFREE
- refactor error handling as ENABLE bit can only be cleared by reset
---
 xen/arch/x86/acpi/cpufreq/amd-cppc.c | 388 +++++++++++++++++++++++++++
 1 file changed, 388 insertions(+)

Comments

Jan Beulich Feb. 11, 2025, 4:46 p.m. UTC | #1
On 06.02.2025 09:32, Penny Zheng wrote:
> --- a/xen/arch/x86/acpi/cpufreq/amd-cppc.c
> +++ b/xen/arch/x86/acpi/cpufreq/amd-cppc.c
> @@ -13,7 +13,61 @@
>  
>  #include <xen/init.h>
>  #include <xen/param.h>
> +#include <xen/percpu.h>
> +#include <xen/xvmalloc.h>
>  #include <acpi/cpufreq/cpufreq.h>
> +#include <asm/amd.h>
> +
> +#define MSR_AMD_CPPC_CAP1               0xc00102b0
> +#define MSR_AMD_CPPC_ENABLE             0xc00102b1
> +#define AMD_CPPC_ENABLE                 BIT(0, ULL)
> +#define MSR_AMD_CPPC_REQ                0xc00102b3

Why not in msr-index.h?

> +#define amd_cppc_err(cpu, fmt, args...)                     \
> +    printk(XENLOG_ERR "AMD_CPPC: CPU%u error: " fmt, cpu, ## args)
> +#define amd_cppc_verbose(fmt, args...)                      \
> +({                                                          \
> +    if ( cpufreq_verbose )                                  \
> +        printk(XENLOG_DEBUG "AMD_CPPC: " fmt, ## args);     \
> +})
> +#define amd_cppc_warn(fmt, args...)                         \
> +    printk(XENLOG_WARNING "AMD_CPPC: CPU%u warning: " fmt, cpu, ## args)

Perhaps move warn up between err and verbose?

> +struct amd_cppc_drv_data
> +{
> +    struct xen_processor_cppc *cppc_data;

Pointer-to-const?

> +    union
> +    {
> +        uint64_t raw;
> +        struct
> +        {

While generally we want opening figure braces on their own lines, for
struct/union this isn't the case. See e.g. include/xen/lib/x86/cpu-policy.h.

> +            unsigned int lowest_perf:8;
> +            unsigned int lowest_nonlinear_perf:8;
> +            unsigned int nominal_perf:8;
> +            unsigned int highest_perf:8;
> +            unsigned int :32;
> +        };
> +    } caps;
> +    union
> +    {
> +        uint64_t raw;
> +        struct
> +        {
> +            unsigned int max_perf:8;
> +            unsigned int min_perf:8;
> +            unsigned int des_perf:8;
> +            unsigned int epp:8;
> +            unsigned int :32;
> +        };
> +    } req;
> +    int err;

There wants to be a blank line between the union and this one.

> +    uint32_t max_freq;
> +    uint32_t min_freq;
> +    uint32_t nominal_freq;

Are fixed-width types really needed here (see ./CODING_STYLE)?

> @@ -50,9 +104,343 @@ int __init amd_cppc_cmdline_parse(const char *s, const char *e)
>      return 0;
>  }
>  
> +/*
> + * If CPPC lowest_freq and nominal_freq registers are exposed then we can
> + * use them to convert perf to freq and vice versa. The conversion is
> + * extrapolated as an affine function passing by the 2 points:

Having studied maths, "affine function" isn't a term I know. There are affine
transformations, but don't you simply mean "linear function" here? If so, is
it said anywhere in the spec that perf values grow linearly with freq ones?

> + *  - (Low perf, Low freq)
> + *  - (Nominal perf, Nominal freq)
> + */
> +static int amd_cppc_khz_to_perf(const struct amd_cppc_drv_data *data, unsigned int freq, uint8_t *perf)

Overlong line again. Please sort throughout the series.

> +{
> +    const struct xen_processor_cppc *cppc_data = data->cppc_data;
> +    uint64_t mul, div, offset = 0, res;
> +
> +    if ( freq == (cppc_data->nominal_freq * 1000) )

There's no comment anywhere what the units of the values are. Therefore the
multiplication by 1000 here leaves me wondering why consistent units aren't
used in the first place. (From the name of the function I might guess that
"freq" is in kHz, and then perhaps ->{min,max,nominal}_freq are in MHz.
Then for the foreseeable future we're hopefully safe here wrt overflow.)

> +    {
> +        *perf = data->caps.nominal_perf;
> +        return 0;
> +    }
> +
> +    if ( freq == (cppc_data->lowest_freq * 1000) )
> +    {
> +        *perf = data->caps.lowest_perf;
> +        return 0;
> +    }
> +
> +    if ( (cppc_data->lowest_freq) && (cppc_data->nominal_freq) )

Why the inner parentheses?

> +    {
> +        mul = data->caps.nominal_perf - data->caps.lowest_perf;
> +        div = cppc_data->nominal_freq - cppc_data->lowest_freq;
> +        /*
> +         * We don't need to convert to kHz for computing offset and can
> +         * directly use nominal_freq and lowest_freq as the division
> +         * will remove the frequency unit.
> +         */
> +        div = div ?: 1;
> +        offset = data->caps.nominal_perf - (mul * cppc_data->nominal_freq) / div;

I fear I can't convince myself that the subtraction can't ever underflow.
With

O = offset
Pn = data->caps.nominal_perf
Pl = data->caps.lowest_perf
Fn = cppc_data->nominal_freq
Fl = cppc_data->lowest_freq

the above becomes

O = Pn - ((Pn - Pl) * Fn) / (Fn - Fl)

and your assumption is O >= 0 (and for inputs: Fn >= Fl and Pn >= Pl). That
for me transforms to

(Pn - Pl) * Fn <= Pn * (Fn - Fl)

and further

-(Pl * Fn) <= -(Pn * Fl)

or

Pn * Fl <= Pl * Fn

and I don't see why this would always hold. Yet if there can be underflow,
I wonder whether the calculation is actually correct. Or, ...

> +    }
> +    else
> +    {
> +        /* Read Processor Max Speed(mhz) as anchor point */
> +        mul = data->caps.highest_perf;
> +        div = this_cpu(max_freq_mhz);
> +        if ( !div )
> +            return -EINVAL;
> +    }
> +
> +    res = offset + (mul * freq) / (div * 1000);

... considering that a negative offset here isn't really an issue, as long
as the rhs of the addition is large enough, is offset perhaps meant to be
a signed quantity (and considering it's in principle an [abstract] perf
value, it doesn't even need to be a 64-bit one, i.e. perhaps one of the
cases where plain int is appropriate to use)?

> +    if ( res > UINT8_MAX )
> +    {
> +        printk(XENLOG_ERR "Perf value exceeds maximum value 255: %lu\n", res);

If this was to ever trigger, it would then likely trigger often. Imo such
log messages want to be printk_once(), if they're needed at all.

> +        return -EINVAL;

Can't we instead clip to 0xff?

> +static int amd_get_min_freq(const struct amd_cppc_drv_data *data, unsigned int *min_freq)
> +{
> +    const struct xen_processor_cppc *cppc_data = data->cppc_data;
> +    uint64_t mul, div, res;
> +
> +    if ( cppc_data->lowest_freq )
> +    {
> +        /* Switch to khz */
> +        *min_freq = cppc_data->lowest_freq * 1000;
> +        return 0;
> +    }
> +
> +    /* Read Processor Max Speed(mhz) as anchor point */
> +    mul = this_cpu(max_freq_mhz);
> +    div = data->caps.highest_perf;
> +    res = (mul * data->caps.lowest_perf * 1000) / div;
> +    if ( res > UINT_MAX )
> +    {
> +        printk(XENLOG_ERR "Min freq exceeds maximum value UINT_MAX: %lu\n", res);
> +        return -EINVAL;
> +    }
> +
> +    *min_freq = (unsigned int)res;

I.e. 0 when max_freq_mhz isn't set. Does returning back 0 make sense?

> +    return 0;

Nit: Blank line please before the main return statement of a function.

> +}
> +
> +static int amd_get_nominal_freq(const struct amd_cppc_drv_data *data, unsigned int *nom_freq)
> +{
> +    const struct xen_processor_cppc *cppc_data = data->cppc_data;
> +    uint64_t mul, div, res;
> +
> +    if ( cppc_data->nominal_freq )
> +    {
> +        /* Switch to khz */
> +        *nom_freq = cppc_data->nominal_freq * 1000;
> +        return 0;
> +    }
> +
> +    /* Read Processor Max Speed(mhz) as anchor point */
> +    mul = this_cpu(max_freq_mhz);
> +    div = data->caps.highest_perf;
> +    res = (mul * data->caps.nominal_perf * 1000) / div;
> +    if ( res > UINT_MAX )
> +    {
> +        printk(XENLOG_ERR "Nominal freq exceeds maximum value UINT_MAX: %lu\n", res);
> +        return -EINVAL;
> +    }
> +
> +    *nom_freq = (unsigned int)res;
> +    return 0;
> +}

This is an almost identical copy of the earlier function. I wonder if the
redundancy wouldn't better be reduced.

> +static void amd_cppc_write_request_msrs(void *info)
> +{
> +    struct amd_cppc_drv_data *data = info;
> +
> +    if ( wrmsr_safe(MSR_AMD_CPPC_REQ, data->req.raw) )
> +    {
> +        data->err = -EINVAL;
> +        return;
> +    }
> +    data->err = 0;

Cache bouncing wise I think it would be better if the clearing was done ...

> +}
> +
> +static int cf_check amd_cppc_write_request(int cpu, uint8_t min_perf,
> +                                           uint8_t des_perf, uint8_t max_perf)
> +{
> +    struct amd_cppc_drv_data *data = per_cpu(amd_cppc_drv_data, cpu);
> +    uint64_t prev = data->req.raw;
> +
> +    data->req.min_perf = min_perf;
> +    data->req.max_perf = max_perf;
> +    data->req.des_perf = des_perf;
> +
> +    if ( prev == data->req.raw )
> +        return 0;
> +
> +    on_selected_cpus(cpumask_of(cpu), amd_cppc_write_request_msrs, data, 1);
> +
> +    return data->err;
> +}

... in this function. Then the field would be written to (and the cacheline
change ownership) only in the error case.

As to the function's parameters - why's there a plain int?

> +static int cf_check amd_cppc_cpufreq_target(struct cpufreq_policy *policy,
> +                                            unsigned int target_freq,
> +                                            unsigned int relation)
> +{
> +    unsigned int cpu = policy->cpu;
> +    const struct amd_cppc_drv_data *data = per_cpu(amd_cppc_drv_data, cpu);
> +    uint8_t des_perf;
> +    int res;
> +
> +    if ( unlikely(!target_freq) )
> +        return 0;
> +
> +    res = amd_cppc_khz_to_perf(data, target_freq, &des_perf);
> +    if ( res )
> +        return res;
> +
> +    return amd_cppc_write_request(policy->cpu, data->caps.lowest_nonlinear_perf,
> +                                  des_perf, data->caps.highest_perf);

Up to here caps.lowest_perf was used. Why caps.lowest_nonlinear_perf here?

Also why do you not use the local variable "cpu" that you have made yourself?

> +static void cf_check amd_cppc_init_msrs(void *info)
> +{
> +    struct cpufreq_policy *policy = info;
> +    struct amd_cppc_drv_data *data = this_cpu(amd_cppc_drv_data);
> +    uint64_t val;
> +    unsigned int min_freq, nominal_freq, max_freq;
> +    const struct cpuinfo_x86 *c = cpu_data + policy->cpu;
> +
> +    /* Feature CPPC is firstly introduiced on Zen2 */

Nit: introduced

> +    if ( c->x86 < 0x17 )
> +    {
> +        amd_cppc_err(policy->cpu, "Unsupported cpu family: %x\n", c->x86);
> +        data->err = -EOPNOTSUPP;
> +        return;
> +    }

This could be checked ahead of issuing the IPI to invoke this function.
It probably would also be nice if this log message appeared only once;
maybe it does, and I merely don't see why.

> +    /* Package level MSR */
> +    if ( rdmsr_safe(MSR_AMD_CPPC_ENABLE, val) )
> +    {
> +        amd_cppc_err(policy->cpu, "rdmsr_safe(MSR_AMD_CPPC_ENABLE)\n");
> +        goto err;
> +    }
> +
> +    /*
> +     * Only when Enable bit is on, the hardware will calculate the processor’s
> +     * performance capabilities and initialize the performance level fields in
> +     * the CPPC capability registers.
> +     */
> +    if ( !(val & AMD_CPPC_ENABLE) )
> +    {
> +        val |= AMD_CPPC_ENABLE;
> +        if ( wrmsr_safe(MSR_AMD_CPPC_ENABLE, val) )
> +        {
> +            amd_cppc_err(policy->cpu, "wrmsr_safe(MSR_AMD_CPPC_ENABLE, %lx)\n", val);
> +            goto err;
> +        }
> +    }
> +
> +    if ( rdmsr_safe(MSR_AMD_CPPC_CAP1, data->caps.raw) )
> +    {
> +        amd_cppc_err(policy->cpu, "rdmsr_safe(MSR_AMD_CPPC_CAP1)\n");
> +        goto err;
> +    }
> +
> +    if ( data->caps.highest_perf == 0 || data->caps.lowest_perf == 0 ||
> +         data->caps.nominal_perf == 0 || data->caps.lowest_nonlinear_perf == 0 )
> +    {
> +        amd_cppc_err(policy->cpu,
> +                     "Platform malfunction, read CPPC highest_perf: %u, lowest_perf: %u, nominal_perf: %u, lowest_nonlinear_perf: %u zero value\n",
> +                     data->caps.highest_perf, data->caps.lowest_perf,
> +                     data->caps.nominal_perf, data->caps.lowest_nonlinear_perf);
> +        goto err;
> +    }
> +
> +    data->err = amd_get_min_freq(data, &min_freq);
> +    if ( data->err )
> +        return;
> +
> +    data->err = amd_get_nominal_freq(data, &nominal_freq);
> +    if ( data->err )
> +        return;
> +
> +    data->err = amd_get_max_freq(data, &max_freq);
> +    if ( data->err )
> +        return;
> +
> +    if ( min_freq > max_freq )
> +    {
> +        amd_cppc_err(policy->cpu, "min_freq(%u) or max_freq(%u) value is incorrect\n",
> +                     min_freq, max_freq);
> +        goto err;
> +    }

And nominal freq not being between the two is okay?

> +    policy->min = min_freq;
> +    policy->max = max_freq;
> +
> +    policy->cpuinfo.min_freq = min_freq;
> +    policy->cpuinfo.max_freq = max_freq;
> +    policy->cpuinfo.perf_freq = nominal_freq;
> +    policy->cur = nominal_freq;

How do you know this is correct? Or does it simply not matter at this point?

> +    /* Initial processor data capability frequencies */
> +    data->min_freq = min_freq;
> +    data->nominal_freq = nominal_freq;
> +    data->max_freq = max_freq;

Is this data duplication actually needed? Can't data, if necessary, simply
have a back pointer to the policy for the CPU?

Actually, aren't two of the three values already accessible through the
cppc_data pointer hanging off of data? Which in turn makes me wonder why
you go through the effort of calculating those values.

> + err:
> +    data->err = -EINVAL;
> +}

At this point you may have set the enable bit already, which you can't undo.
What effect will this have on the system when the error path is taken here?
Especially if we then try to fall back to another driver?

> +static int cf_check amd_cppc_cpufreq_cpu_init(struct cpufreq_policy *policy)
> +{
> +    unsigned int cpu = policy->cpu;
> +    struct amd_cppc_drv_data *data;
> +
> +    data = xvzalloc(struct amd_cppc_drv_data);
> +    if ( !data )
> +        return -ENOMEM;
> +
> +    data->cppc_data = &processor_pminfo[cpu]->cppc_data;
> +
> +    per_cpu(amd_cppc_drv_data, cpu) = data;
> +
> +    on_selected_cpus(cpumask_of(cpu), amd_cppc_init_msrs, policy, 1);
> +
> +    if ( data->err )
> +    {
> +        amd_cppc_err(cpu, "Could not initialize AMD CPPC MSR properly\n");
> +        per_cpu(amd_cppc_drv_data, cpu) = NULL;
> +        XVFREE(data);

May be slightly tidier to invoke amd_cppc_cpufreq_cpu_exit() here.

> +        return -ENODEV;
> +    }
> +
> +    policy->governor = cpufreq_opt_governor ? : CPUFREQ_DEFAULT_GOVERNOR;
> +
> +    amd_cppc_boost_init(policy, data);
> +
> +    amd_cppc_verbose("CPU %u initialized with amd-cppc passive mode\n", policy->cpu);
> +    return 0;
> +}
> +
> +static int cf_check amd_cppc_cpufreq_cpu_exit(struct cpufreq_policy *policy)
> +{
> +    struct amd_cppc_drv_data *data = per_cpu(amd_cppc_drv_data, policy->cpu);
> +
> +    per_cpu(amd_cppc_drv_data, policy->cpu) = NULL;
> +    XVFREE(data);

This makes little sense, as "data" is about to go out of scope. Why not
simply

    XVFREE(per_cpu(amd_cppc_drv_data, policy->cpu));

(which effectively you're open-coding)?

Jan
Penny, Zheng Feb. 18, 2025, 7:40 a.m. UTC | #2
[AMD Official Use Only - AMD Internal Distribution Only]

Hi

> -----Original Message-----
> From: Jan Beulich <jbeulich@suse.com>
> Sent: Wednesday, February 12, 2025 12:46 AM
> To: Penny, Zheng <penny.zheng@amd.com>
> Cc: Huang, Ray <Ray.Huang@amd.com>; Andryuk, Jason
> <Jason.Andryuk@amd.com>; Andrew Cooper <andrew.cooper3@citrix.com>;
> Roger Pau Monné <roger.pau@citrix.com>; xen-devel@lists.xenproject.org
> Subject: Re: [PATCH v2 05/11] xen/x86: introduce a new amd cppc driver for
> cpufreq scaling
>
> On 06.02.2025 09:32, Penny Zheng wrote:
> > --- a/xen/arch/x86/acpi/cpufreq/amd-cppc.c
> > +++ b/xen/arch/x86/acpi/cpufreq/amd-cppc.c
> >
> > +/*
> > + * If CPPC lowest_freq and nominal_freq registers are exposed then we
> > +can
> > + * use them to convert perf to freq and vice versa. The conversion is
> > + * extrapolated as an affine function passing by the 2 points:
>
> Having studied maths, "affine function" isn't a term I know. There are affine
> transformations, but don't you simply mean "linear function" here? If so, is it said
> anywhere in the spec that perf values grow linearly with freq ones?
>

Yes, "linear mapping" is better. And the spec reference is as follows:
```
The OS should use Lowest Frequency/Performance and Nominal Frequency/Performance
as anchor points to create a linear mapping of CPPC abstract performance to CPU frequency
```
See 8.4.6.1.1.7. Lowest Frequency and Nominal Frequency
 (https://uefi.org/specs/ACPI/6.5/08_Processor_Configuration_and_Control.html?highlight=cppc#lowest-frequency-and-nominal-frequency )

> > + *  - (Low perf, Low freq)
> > + *  - (Nominal perf, Nominal freq)
> > + */
> > +static int amd_cppc_khz_to_perf(const struct amd_cppc_drv_data *data,
> > +unsigned int freq, uint8_t *perf)
>
> Overlong line again. Please sort throughout the series.
>
> > +{
> > +    const struct xen_processor_cppc *cppc_data = data->cppc_data;
> > +    uint64_t mul, div, offset = 0, res;
> > +
> > +    if ( freq == (cppc_data->nominal_freq * 1000) )
>
> There's no comment anywhere what the units of the values are. Therefore the
> multiplication by 1000 here leaves me wondering why consistent units aren't used in
> the first place. (From the name of the function I might guess that "freq" is in kHz,
> and then perhaps ->{min,max,nominal}_freq are in MHz.
> Then for the foreseeable future we're hopefully safe here wrt overflow.)
>

These conversion functions are designed in the first place for the *ondemand* governor, which
reports performance as CPU frequencies. In the generic governor->target() functions, we always
take freq in kHz, but in the CPPC ACPI spec, the frequency is read in MHz from the register...

> > +    {
> > +        *perf = data->caps.nominal_perf;
> > +        return 0;
> > +    }
> > +
> > +    if ( freq == (cppc_data->lowest_freq * 1000) )
> > +    {
> > +        *perf = data->caps.lowest_perf;
> > +        return 0;
> > +    }
> > +
> > +    if ( (cppc_data->lowest_freq) && (cppc_data->nominal_freq) )
>
> Why the inner parentheses?
>
> > +    {
> > +        mul = data->caps.nominal_perf - data->caps.lowest_perf;
> > +        div = cppc_data->nominal_freq - cppc_data->lowest_freq;
> > +        /*
> > +         * We don't need to convert to kHz for computing offset and can
> > +         * directly use nominal_freq and lowest_freq as the division
> > +         * will remove the frequency unit.
> > +         */
> > +        div = div ?: 1;
> > +        offset = data->caps.nominal_perf - (mul *
> > + cppc_data->nominal_freq) / div;
>
> I fear I can't convince myself that the subtraction can't ever underflow.
> With
>
> O = offset
> Pn = data->caps.nominal_perf
> Pl = data->caps.lowest_perf
> Fn = cppc_data->nominal_freq
> Fl = cppc_data->lowest_freq
>
> the above becomes
>
> O = Pn - ((Pn - Pl) * Fn) / (Fn - Fl)
>
> and your assumption is O >= 0 (and for inputs: Fn >= Fl and Pn >= Pl). That for me
> transforms to
>
> (Pn - Pl) * Fn <= Pn * (Fn - Fl)
>
> and further
>
> -(Pl * Fn) <= -(Pn * Fl)
>
> or
>
> Pn * Fl <= Pl * Fn
>
> and I don't see why this would always hold. Yet if there can be underflow, I wonder
> whether the calculation is actually correct. Or, ...
>

Because we are assuming that in normal circumstances, when F==0, P is the offset value, and
it shall be a non-negative value; tbh, ==0 is more logical fwiw.
So if it underflows, I might think the hardware itself is malfunctioning.

> > +    }
> > +    else
> > +    {
> > +        /* Read Processor Max Speed(mhz) as anchor point */
> > +        mul = data->caps.highest_perf;
> > +        div = this_cpu(max_freq_mhz);
> > +        if ( !div )
> > +            return -EINVAL;
> > +    }
> > +
> > +    res = offset + (mul * freq) / (div * 1000);
>
> ... considering that a negative offset here isn't really an issue, as long as the rhs of
> the addition is large enough, is offset perhaps meant to be a signed quantity (and
> considering it's in principle an [abstract] perf value, it doesn't even need to be a 64-
> bit one, i.e. perhaps one of the cases where plain int is appropriate to use)?
>
> > +    if ( res > UINT8_MAX )
> > +    {
> > +        printk(XENLOG_ERR "Perf value exceeds maximum value 255:
> > + %lu\n", res);
>
> If this was to ever trigger, it would then likely trigger often. Imo such log messages
> want to be printk_once(), if they're needed at all.
>
> > +        return -EINVAL;
>
> Can't we instead clip to 0xff?
>

True, clip it to 0xff maybe better

>
> Jan
Jan Beulich Feb. 18, 2025, 3:04 p.m. UTC | #3
On 18.02.2025 08:40, Penny, Zheng wrote:
>> -----Original Message-----
>> From: Jan Beulich <jbeulich@suse.com>
>> Sent: Wednesday, February 12, 2025 12:46 AM
>> To: Penny, Zheng <penny.zheng@amd.com>
>>
>> On 06.02.2025 09:32, Penny Zheng wrote:
>>> +static int amd_cppc_khz_to_perf(const struct amd_cppc_drv_data *data,
>>> +unsigned int freq, uint8_t *perf)
>>
>> Overlong line again. Please sort throughout the series.
>>
>>> +{
>>> +    const struct xen_processor_cppc *cppc_data = data->cppc_data;
>>> +    uint64_t mul, div, offset = 0, res;
>>> +
>>> +    if ( freq == (cppc_data->nominal_freq * 1000) )
>>
>> There's no comment anywhere what the units of the values are. Therefore the
>> multiplication by 1000 here leaves me wondering why consistent units aren't used in
>> the first place. (From the name of the function I might guess that "freq" is in kHz,
>> and then perhaps ->{min,max,nominal}_freq are in MHz.
>> Then for the foreseeable future we're hopefully safe here wrt overflow.)
> 
> These conversion functions are designed in the first place for *ondemand* governor, which
> reports performance as CPU frequencies. In generic governor->target() functions, we are always
> take freq in KHz, but in CPPC ACPI spec, the frequency is read in Mhz from register...

That's all fine, but it wants reflecting in our sources somehow. Perhaps
simply by either naming the variables/fields accordingly (see how we e.g.
have a cpu_khz global variable, rather than it being named e.g. cpu_freq)
or by at least adding brief comments to their declarations.

>>> +    {
>>> +        *perf = data->caps.nominal_perf;
>>> +        return 0;
>>> +    }
>>> +
>>> +    if ( freq == (cppc_data->lowest_freq * 1000) )
>>> +    {
>>> +        *perf = data->caps.lowest_perf;
>>> +        return 0;
>>> +    }
>>> +
>>> +    if ( (cppc_data->lowest_freq) && (cppc_data->nominal_freq) )
>>
>> Why the inner parentheses?
>>
>>> +    {
>>> +        mul = data->caps.nominal_perf - data->caps.lowest_perf;
>>> +        div = cppc_data->nominal_freq - cppc_data->lowest_freq;
>>> +        /*
>>> +         * We don't need to convert to kHz for computing offset and can
>>> +         * directly use nominal_freq and lowest_freq as the division
>>> +         * will remove the frequency unit.
>>> +         */
>>> +        div = div ?: 1;
>>> +        offset = data->caps.nominal_perf - (mul *
>>> + cppc_data->nominal_freq) / div;
>>
>> I fear I can't convince myself that the subtraction can't ever underflow.
>> With
>>
>> O = offset
>> Pn = data->caps.nominal_perf
>> Pl = data->caps.lowest_perf
>> Fn = cppc_data->nominal_freq
>> Fl = cppc_data->lowest_freq
>>
>> the above becomes
>>
>> O = Pn - ((Pn - Pl) * Fn) / (Fn - Fl)
>>
>> and your assumption is O >= 0 (and for inputs: Fn >= Fl and Pn >= Pl). That for me
>> transforms to
>>
>> (Pn - Pl) * Fn <= Pn * (Fn - Fl)
>>
>> and further
>>
>> -(Pl * Fn) <= -(Pn * Fl)
>>
>> or
>>
>> Pn * Fl <= Pl * Fn
>>
>> and I don't see why this would always hold. Yet if there can be underflow, I wonder
>> whether the calculation is actually correct. Or, ...
> 
> Because we are assuming that in normal circumstances, when F==0, P is the offset value, and
> It shall be an non-smaller-than-zero value, tbh, ==0 is more logical fwit
> So if it is underflow, I might think the hardware itself is malfunctional.

Why so? The more that I continued ...

>>> +    }
>>> +    else
>>> +    {
>>> +        /* Read Processor Max Speed(mhz) as anchor point */
>>> +        mul = data->caps.highest_perf;
>>> +        div = this_cpu(max_freq_mhz);
>>> +        if ( !div )
>>> +            return -EINVAL;
>>> +    }
>>> +
>>> +    res = offset + (mul * freq) / (div * 1000);
>>
>> ... considering that a negative offset here isn't really an issue, as long as the rhs of
>> the addition is large enough, is offset perhaps meant to be a signed quantity (and
>> considering it's in principle an [abstract] perf value, it doesn't even need to be a 64-
>> bit one, i.e. perhaps one of the cases where plain int is appropriate to use)?

... my explanation here, including the outline of an approach to deal with
this.

Jan
diff mbox series

Patch

diff --git a/xen/arch/x86/acpi/cpufreq/amd-cppc.c b/xen/arch/x86/acpi/cpufreq/amd-cppc.c
index 2dca4a00f3..f14e7a6638 100644
--- a/xen/arch/x86/acpi/cpufreq/amd-cppc.c
+++ b/xen/arch/x86/acpi/cpufreq/amd-cppc.c
@@ -13,7 +13,61 @@ 
 
 #include <xen/init.h>
 #include <xen/param.h>
+#include <xen/percpu.h>
+#include <xen/xvmalloc.h>
 #include <acpi/cpufreq/cpufreq.h>
+#include <asm/amd.h>
+
+#define MSR_AMD_CPPC_CAP1               0xc00102b0
+#define MSR_AMD_CPPC_ENABLE             0xc00102b1
+#define AMD_CPPC_ENABLE                 BIT(0, ULL)
+#define MSR_AMD_CPPC_REQ                0xc00102b3
+
+#define amd_cppc_err(cpu, fmt, args...)                     \
+    printk(XENLOG_ERR "AMD_CPPC: CPU%u error: " fmt, cpu, ## args)
+#define amd_cppc_verbose(fmt, args...)                      \
+({                                                          \
+    if ( cpufreq_verbose )                                  \
+        printk(XENLOG_DEBUG "AMD_CPPC: " fmt, ## args);     \
+})
+#define amd_cppc_warn(fmt, args...)                         \
+    printk(XENLOG_WARNING "AMD_CPPC: CPU%u warning: " fmt, cpu, ## args)
+
+struct amd_cppc_drv_data
+{
+    struct xen_processor_cppc *cppc_data;
+    union
+    {
+        uint64_t raw;
+        struct
+        {
+            unsigned int lowest_perf:8;
+            unsigned int lowest_nonlinear_perf:8;
+            unsigned int nominal_perf:8;
+            unsigned int highest_perf:8;
+            unsigned int :32;
+        };
+    } caps;
+    union
+    {
+        uint64_t raw;
+        struct
+        {
+            unsigned int max_perf:8;
+            unsigned int min_perf:8;
+            unsigned int des_perf:8;
+            unsigned int epp:8;
+            unsigned int :32;
+        };
+    } req;
+    int err;
+
+    uint32_t max_freq;
+    uint32_t min_freq;
+    uint32_t nominal_freq;
+};
+
+static DEFINE_PER_CPU_READ_MOSTLY(struct amd_cppc_drv_data *, amd_cppc_drv_data);
 
 static bool __init amd_cppc_handle_option(const char *s, const char *end)
 {
@@ -50,9 +104,343 @@  int __init amd_cppc_cmdline_parse(const char *s, const char *e)
     return 0;
 }
 
+/*
+ * If CPPC lowest_freq and nominal_freq registers are exposed then we can
+ * use them to convert perf to freq and vice versa. The conversion is
+ * extrapolated as an affine function passing by the 2 points:
+ *  - (Low perf, Low freq)
+ *  - (Nominal perf, Nominal freq)
+ */
+static int amd_cppc_khz_to_perf(const struct amd_cppc_drv_data *data, unsigned int freq, uint8_t *perf)
+{
+    const struct xen_processor_cppc *cppc_data = data->cppc_data;
+    uint64_t mul, div, offset = 0, res;
+
+    if ( freq == (cppc_data->nominal_freq * 1000) )
+    {
+        *perf = data->caps.nominal_perf;
+        return 0;
+    }
+
+    if ( freq == (cppc_data->lowest_freq * 1000) )
+    {
+        *perf = data->caps.lowest_perf;
+        return 0;
+    }
+
+    if ( (cppc_data->lowest_freq) && (cppc_data->nominal_freq) )
+    {
+        mul = data->caps.nominal_perf - data->caps.lowest_perf;
+        div = cppc_data->nominal_freq - cppc_data->lowest_freq;
+        /*
+         * We don't need to convert to kHz for computing offset and can
+         * directly use nominal_freq and lowest_freq as the division
+         * will remove the frequency unit.
+         */
+        div = div ?: 1;
+        offset = data->caps.nominal_perf - (mul * cppc_data->nominal_freq) / div;
+    }
+    else
+    {
+        /* Read Processor Max Speed (MHz) as the anchor point */
+        mul = data->caps.highest_perf;
+        div = this_cpu(max_freq_mhz);
+        if ( !div )
+            return -EINVAL;
+    }
+
+    res = offset + (mul * freq) / (div * 1000);
+    if ( res > UINT8_MAX )
+    {
+        printk(XENLOG_ERR "Perf value exceeds maximum value 255: %lu\n", res);
+        return -EINVAL;
+    }
+    *perf = (uint8_t)res;
+
+    return 0;
+}
+
+static int amd_get_min_freq(const struct amd_cppc_drv_data *data, unsigned int *min_freq)
+{
+    const struct xen_processor_cppc *cppc_data = data->cppc_data;
+    uint64_t mul, div, res;
+
+    if ( cppc_data->lowest_freq )
+    {
+        /* Switch to khz */
+        *min_freq = cppc_data->lowest_freq * 1000;
+        return 0;
+    }
+
+    /* Read Processor Max Speed (MHz) as the anchor point */
+    mul = this_cpu(max_freq_mhz);
+    div = data->caps.highest_perf;
+    res = (mul * data->caps.lowest_perf * 1000) / div;
+    if ( res > UINT_MAX )
+    {
+        printk(XENLOG_ERR "Min freq exceeds maximum value UINT_MAX: %lu\n", res);
+        return -EINVAL;
+    }
+
+    *min_freq = (unsigned int)res;
+    return 0;
+}
+
+static int amd_get_nominal_freq(const struct amd_cppc_drv_data *data, unsigned int *nom_freq)
+{
+    const struct xen_processor_cppc *cppc_data = data->cppc_data;
+    uint64_t mul, div, res;
+
+    if ( cppc_data->nominal_freq )
+    {
+        /* Switch to khz */
+        *nom_freq = cppc_data->nominal_freq * 1000;
+        return 0;
+    }
+
+    /* Read Processor Max Speed (MHz) as the anchor point */
+    mul = this_cpu(max_freq_mhz);
+    div = data->caps.highest_perf;
+    res = (mul * data->caps.nominal_perf * 1000) / div;
+    if ( res > UINT_MAX )
+    {
+        printk(XENLOG_ERR "Nominal freq exceeds maximum value UINT_MAX: %lu\n", res);
+        return -EINVAL;
+    }
+
+    *nom_freq = (unsigned int)res;
+    return 0;
+}
+
+static int amd_get_max_freq(const struct amd_cppc_drv_data *data, unsigned int *max_freq)
+{
+    unsigned int nom_freq;
+    int res;
+
+    res = amd_get_nominal_freq(data, &nom_freq);
+    if ( res )
+        return res;
+
+    /* Scale in a single widened expression; a pre-computed integer
+     * highest/nominal ratio would truncate (e.g. 228/166 -> 1). */
+    *max_freq = (uint64_t)nom_freq * data->caps.highest_perf / data->caps.nominal_perf;
+
+    return 0;
+}
+
+static int cf_check amd_cppc_cpufreq_verify(struct cpufreq_policy *policy)
+{
+    const struct amd_cppc_drv_data *data = per_cpu(amd_cppc_drv_data, policy->cpu);
+
+    cpufreq_verify_within_limits(policy, data->min_freq, data->max_freq);
+
+    return 0;
+}
+
+static void amd_cppc_write_request_msrs(void *info)
+{
+    struct amd_cppc_drv_data *data = info;
+
+    if ( wrmsr_safe(MSR_AMD_CPPC_REQ, data->req.raw) )
+    {
+        data->err = -EINVAL;
+        return;
+    }
+    data->err = 0;
+}
+
+static int cf_check amd_cppc_write_request(int cpu, uint8_t min_perf,
+                                           uint8_t des_perf, uint8_t max_perf)
+{
+    struct amd_cppc_drv_data *data = per_cpu(amd_cppc_drv_data, cpu);
+    uint64_t prev = data->req.raw;
+
+    data->req.min_perf = min_perf;
+    data->req.max_perf = max_perf;
+    data->req.des_perf = des_perf;
+
+    if ( prev == data->req.raw )
+        return 0;
+
+    on_selected_cpus(cpumask_of(cpu), amd_cppc_write_request_msrs, data, 1);
+
+    return data->err;
+}
+
+static int cf_check amd_cppc_cpufreq_target(struct cpufreq_policy *policy,
+                                            unsigned int target_freq,
+                                            unsigned int relation)
+{
+    unsigned int cpu = policy->cpu;
+    const struct amd_cppc_drv_data *data = per_cpu(amd_cppc_drv_data, cpu);
+    uint8_t des_perf;
+    int res;
+
+    if ( unlikely(!target_freq) )
+        return 0;
+
+    res = amd_cppc_khz_to_perf(data, target_freq, &des_perf);
+    if ( res )
+        return res;
+
+    return amd_cppc_write_request(policy->cpu, data->caps.lowest_nonlinear_perf,
+                                  des_perf, data->caps.highest_perf);
+}
+
+static void cf_check amd_cppc_init_msrs(void *info)
+{
+    struct cpufreq_policy *policy = info;
+    struct amd_cppc_drv_data *data = this_cpu(amd_cppc_drv_data);
+    uint64_t val;
+    unsigned int min_freq, nominal_freq, max_freq;
+    const struct cpuinfo_x86 *c = cpu_data + policy->cpu;
+
+    /* The CPPC feature was first introduced on Zen2 (family 0x17) */
+    if ( c->x86 < 0x17 )
+    {
+        amd_cppc_err(policy->cpu, "Unsupported cpu family: %x\n", c->x86);
+        data->err = -EOPNOTSUPP;
+        return;
+    }
+
+    /* Package level MSR */
+    if ( rdmsr_safe(MSR_AMD_CPPC_ENABLE, val) )
+    {
+        amd_cppc_err(policy->cpu, "rdmsr_safe(MSR_AMD_CPPC_ENABLE)\n");
+        goto err;
+    }
+
+    /*
+     * Only when the Enable bit is set will the hardware calculate the processor's
+     * performance capabilities and initialize the performance level fields in
+     * the CPPC capability registers.
+     */
+    if ( !(val & AMD_CPPC_ENABLE) )
+    {
+        val |= AMD_CPPC_ENABLE;
+        if ( wrmsr_safe(MSR_AMD_CPPC_ENABLE, val) )
+        {
+            amd_cppc_err(policy->cpu, "wrmsr_safe(MSR_AMD_CPPC_ENABLE, %lx)\n", val);
+            goto err;
+        }
+    }
+
+    if ( rdmsr_safe(MSR_AMD_CPPC_CAP1, data->caps.raw) )
+    {
+        amd_cppc_err(policy->cpu, "rdmsr_safe(MSR_AMD_CPPC_CAP1)\n");
+        goto err;
+    }
+
+    if ( data->caps.highest_perf == 0 || data->caps.lowest_perf == 0 ||
+         data->caps.nominal_perf == 0 || data->caps.lowest_nonlinear_perf == 0 )
+    {
+        amd_cppc_err(policy->cpu,
+                     "Platform malfunction, read CPPC highest_perf: %u, lowest_perf: %u, nominal_perf: %u, lowest_nonlinear_perf: %u zero value\n",
+                     data->caps.highest_perf, data->caps.lowest_perf,
+                     data->caps.nominal_perf, data->caps.lowest_nonlinear_perf);
+        goto err;
+    }
+
+    data->err = amd_get_min_freq(data, &min_freq);
+    if ( data->err )
+        return;
+
+    data->err = amd_get_nominal_freq(data, &nominal_freq);
+    if ( data->err )
+        return;
+
+    data->err = amd_get_max_freq(data, &max_freq);
+    if ( data->err )
+        return;
+
+    if ( min_freq > max_freq )
+    {
+        amd_cppc_err(policy->cpu, "min_freq(%u) or max_freq(%u) value is incorrect\n",
+                     min_freq, max_freq);
+        goto err;
+    }
+
+    policy->min = min_freq;
+    policy->max = max_freq;
+
+    policy->cpuinfo.min_freq = min_freq;
+    policy->cpuinfo.max_freq = max_freq;
+    policy->cpuinfo.perf_freq = nominal_freq;
+    policy->cur = nominal_freq;
+
+    /* Initial processor data capability frequencies */
+    data->min_freq = min_freq;
+    data->nominal_freq = nominal_freq;
+    data->max_freq = max_freq;
+
+    return;
+
+ err:
+    data->err = -EINVAL;
+}
+
+/*
+ * The new AMD CPPC driver is different than legacy ACPI hardware P-State,
+ * which has a finer grain frequency range between the highest and lowest
+ * frequency. And boost frequency is actually the frequency which is mapped on
+ * highest performance ratio. The legacy P0 frequency is actually mapped on
+ * nominal performance ratio.
+ */
+static void amd_cppc_boost_init(struct cpufreq_policy *policy, const struct amd_cppc_drv_data *data)
+{
+    if ( data->caps.highest_perf <= data->caps.nominal_perf )
+        return;
+
+    policy->turbo = CPUFREQ_TURBO_ENABLED;
+}
+
+static int cf_check amd_cppc_cpufreq_cpu_init(struct cpufreq_policy *policy)
+{
+    unsigned int cpu = policy->cpu;
+    struct amd_cppc_drv_data *data;
+
+    data = xvzalloc(struct amd_cppc_drv_data);
+    if ( !data )
+        return -ENOMEM;
+
+    data->cppc_data = &processor_pminfo[cpu]->cppc_data;
+
+    per_cpu(amd_cppc_drv_data, cpu) = data;
+
+    on_selected_cpus(cpumask_of(cpu), amd_cppc_init_msrs, policy, 1);
+
+    if ( data->err )
+    {
+        amd_cppc_err(cpu, "Could not initialize AMD CPPC MSR properly\n");
+        per_cpu(amd_cppc_drv_data, cpu) = NULL;
+        XVFREE(data);
+        return -ENODEV;
+    }
+
+    policy->governor = cpufreq_opt_governor ? : CPUFREQ_DEFAULT_GOVERNOR;
+
+    amd_cppc_boost_init(policy, data);
+
+    amd_cppc_verbose("CPU %u initialized with amd-cppc passive mode\n", policy->cpu);
+    return 0;
+}
+
+static int cf_check amd_cppc_cpufreq_cpu_exit(struct cpufreq_policy *policy)
+{
+    struct amd_cppc_drv_data *data = per_cpu(amd_cppc_drv_data, policy->cpu);
+
+    per_cpu(amd_cppc_drv_data, policy->cpu) = NULL;
+    XVFREE(data);
+
+    return 0;
+}
+
 static const struct cpufreq_driver __initconst_cf_clobber amd_cppc_cpufreq_driver =
 {
     .name   = XEN_AMD_CPPC_DRIVER_NAME,
+    .verify = amd_cppc_cpufreq_verify,
+    .target = amd_cppc_cpufreq_target,
+    .init   = amd_cppc_cpufreq_cpu_init,
+    .exit   = amd_cppc_cpufreq_cpu_exit,
 };
 
 int __init amd_cppc_register_driver(void)