diff mbox series

[v6,2/2] cpufreq: add virtual-cpufreq driver

Message ID 20240521043102.2786284-3-davidai@google.com (mailing list archive)
State New
Headers show
Series Improve VM CPUfreq and task placement behavior | expand

Commit Message

David Dai May 21, 2024, 4:30 a.m. UTC
Introduce a virtualized cpufreq driver for guest kernels to improve
performance and power of workloads within VMs.

This driver does two main things:

1. Sends the frequency of vCPUs as a hint to the host. The host uses the
hint to schedule the vCPU threads and decide physical CPU frequency.

2. If a VM does not support a virtualized FIE(like AMUs), it queries the
host CPU frequency by reading a MMIO region of a virtual cpufreq device
to update the guest's frequency scaling factor periodically. This enables
accurate Per-Entity Load Tracking for tasks running in the guest.

Co-developed-by: Saravana Kannan <saravanak@google.com>
Signed-off-by: Saravana Kannan <saravanak@google.com>
Signed-off-by: David Dai <davidai@google.com>
---
 drivers/cpufreq/Kconfig           |  14 ++
 drivers/cpufreq/Makefile          |   1 +
 drivers/cpufreq/virtual-cpufreq.c | 335 ++++++++++++++++++++++++++++++
 include/linux/arch_topology.h     |   1 +
 4 files changed, 351 insertions(+)
 create mode 100644 drivers/cpufreq/virtual-cpufreq.c

Comments

David Dai June 27, 2024, 9:22 p.m. UTC | #1
Hi folks,

Gentle nudge on this patch to see if there's any remaining concerns?

Thanks,
David

On Mon, May 20, 2024 at 10:32 PM David Dai <davidai@google.com> wrote:
>
> Introduce a virtualized cpufreq driver for guest kernels to improve
> performance and power of workloads within VMs.
>
> This driver does two main things:
>
> 1. Sends the frequency of vCPUs as a hint to the host. The host uses the
> hint to schedule the vCPU threads and decide physical CPU frequency.
>
> 2. If a VM does not support a virtualized FIE(like AMUs), it queries the
> host CPU frequency by reading a MMIO region of a virtual cpufreq device
> to update the guest's frequency scaling factor periodically. This enables
> accurate Per-Entity Load Tracking for tasks running in the guest.
>
> Co-developed-by: Saravana Kannan <saravanak@google.com>
> Signed-off-by: Saravana Kannan <saravanak@google.com>
> Signed-off-by: David Dai <davidai@google.com>
> ---
>  drivers/cpufreq/Kconfig           |  14 ++
>  drivers/cpufreq/Makefile          |   1 +
>  drivers/cpufreq/virtual-cpufreq.c | 335 ++++++++++++++++++++++++++++++
>  include/linux/arch_topology.h     |   1 +
>  4 files changed, 351 insertions(+)
>  create mode 100644 drivers/cpufreq/virtual-cpufreq.c
>
> diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
> index 94e55c40970a..9aa86bec5793 100644
> --- a/drivers/cpufreq/Kconfig
> +++ b/drivers/cpufreq/Kconfig
> @@ -217,6 +217,20 @@ config CPUFREQ_DT
>
>           If in doubt, say N.
>
> +config CPUFREQ_VIRT
> +       tristate "Virtual cpufreq driver"
> +       depends on OF && GENERIC_ARCH_TOPOLOGY
> +       help
> +         This adds a virtualized cpufreq driver for guest kernels that
> +         read/writes to a MMIO region for a virtualized cpufreq device to
> +         communicate with the host. It sends performance requests to the host
> +         which gets used as a hint to schedule vCPU threads and select CPU
> +         frequency. If a VM does not support a virtualized FIE such as AMUs,
> +         it updates the frequency scaling factor by polling host CPU frequency
> +         to enable accurate Per-Entity Load Tracking for tasks running in the guest.
> +
> +         If in doubt, say N.
> +
>  config CPUFREQ_DT_PLATDEV
>         tristate "Generic DT based cpufreq platdev driver"
>         depends on OF
> diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile
> index 8d141c71b016..eb72ecdc24db 100644
> --- a/drivers/cpufreq/Makefile
> +++ b/drivers/cpufreq/Makefile
> @@ -16,6 +16,7 @@ obj-$(CONFIG_CPU_FREQ_GOV_ATTR_SET)   += cpufreq_governor_attr_set.o
>
>  obj-$(CONFIG_CPUFREQ_DT)               += cpufreq-dt.o
>  obj-$(CONFIG_CPUFREQ_DT_PLATDEV)       += cpufreq-dt-platdev.o
> +obj-$(CONFIG_CPUFREQ_VIRT)             += virtual-cpufreq.o
>
>  # Traces
>  CFLAGS_amd-pstate-trace.o               := -I$(src)
> diff --git a/drivers/cpufreq/virtual-cpufreq.c b/drivers/cpufreq/virtual-cpufreq.c
> new file mode 100644
> index 000000000000..59ce2bda3913
> --- /dev/null
> +++ b/drivers/cpufreq/virtual-cpufreq.c
> @@ -0,0 +1,335 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (C) 2024 Google LLC
> + */
> +
> +#include <linux/arch_topology.h>
> +#include <linux/cpufreq.h>
> +#include <linux/init.h>
> +#include <linux/sched.h>
> +#include <linux/kernel.h>
> +#include <linux/module.h>
> +#include <linux/of_address.h>
> +#include <linux/of_platform.h>
> +#include <linux/platform_device.h>
> +#include <linux/slab.h>
> +
> +/*
> + * CPU0..CPUn
> + * +-------------+-------------------------------+--------+-------+
> + * | Register    | Description                   | Offset |   Len |
> + * +-------------+-------------------------------+--------+-------+
> + * | cur_perf    | read this register to get     |    0x0 |   0x4 |
> + * |             | the current perf (integer val |        |       |
> + * |             | representing perf relative to |        |       |
> + * |             | max performance)              |        |       |
> + * |             | that vCPU is running at       |        |       |
> + * +-------------+-------------------------------+--------+-------+
> + * | set_perf    | write to this register to set |    0x4 |   0x4 |
> + * |             | perf value of the vCPU        |        |       |
> + * +-------------+-------------------------------+--------+-------+
> + * | perftbl_len | number of entries in perf     |    0x8 |   0x4 |
> + * |             | table. A single entry in the  |        |       |
> + * |             | perf table denotes no table   |        |       |
> + * |             | and the entry contains        |        |       |
> + * |             | the maximum perf value        |        |       |
> + * |             | that this vCPU supports.      |        |       |
> + * |             | The guest can request any     |        |       |
> + * |             | value between 1 and max perf  |        |       |
> + * |             | when perftbls are not used.   |        |       |
> + * +---------------------------------------------+--------+-------+
> + * | perftbl_sel | write to this register to     |    0xc |   0x4 |
> + * |             | select perf table entry to    |        |       |
> + * |             | read from                     |        |       |
> + * +---------------------------------------------+--------+-------+
> + * | perftbl_rd  | read this register to get     |   0x10 |   0x4 |
> + * |             | perf value of the selected    |        |       |
> + * |             | entry based on perftbl_sel    |        |       |
> + * +---------------------------------------------+--------+-------+
> + * | perf_domain | performance domain number     |   0x14 |   0x4 |
> + * |             | that this vCPU belongs to.    |        |       |
> + * |             | vCPUs sharing the same perf   |        |       |
> + * |             | domain number are part of the |        |       |
> + * |             | same performance domain.      |        |       |
> + * +-------------+-------------------------------+--------+-------+
> + */
> +
> +#define REG_CUR_PERF_STATE_OFFSET 0x0
> +#define REG_SET_PERF_STATE_OFFSET 0x4
> +#define REG_PERFTBL_LEN_OFFSET 0x8
> +#define REG_PERFTBL_SEL_OFFSET 0xc
> +#define REG_PERFTBL_RD_OFFSET 0x10
> +#define REG_PERF_DOMAIN_OFFSET 0x14
> +#define PER_CPU_OFFSET 0x1000
> +
> +#define PERFTBL_MAX_ENTRIES 64U
> +
> +static void __iomem *base;
> +static DEFINE_PER_CPU(u32, perftbl_num_entries);
> +
> +static void virt_scale_freq_tick(void)
> +{
> +       int cpu = smp_processor_id();
> +       u32 max_freq = (u32)cpufreq_get_hw_max_freq(cpu);
> +       u64 cur_freq;
> +       unsigned long scale;
> +
> +       cur_freq = (u64)readl_relaxed(base + cpu * PER_CPU_OFFSET
> +                       + REG_CUR_PERF_STATE_OFFSET);
> +
> +       cur_freq <<= SCHED_CAPACITY_SHIFT;
> +       scale = (unsigned long)div_u64(cur_freq, max_freq);
> +       scale = min(scale, SCHED_CAPACITY_SCALE);
> +
> +       this_cpu_write(arch_freq_scale, scale);
> +}
> +
> +static struct scale_freq_data virt_sfd = {
> +       .source = SCALE_FREQ_SOURCE_VIRT,
> +       .set_freq_scale = virt_scale_freq_tick,
> +};
> +
> +static unsigned int virt_cpufreq_set_perf(struct cpufreq_policy *policy,
> +                                         unsigned int target_freq)
> +{
> +       writel_relaxed(target_freq,
> +                      base + policy->cpu * PER_CPU_OFFSET + REG_SET_PERF_STATE_OFFSET);
> +       return 0;
> +}
> +
> +static unsigned int virt_cpufreq_fast_switch(struct cpufreq_policy *policy,
> +                                            unsigned int target_freq)
> +{
> +       virt_cpufreq_set_perf(policy, target_freq);
> +       return target_freq;
> +}
> +
> +static u32 virt_cpufreq_get_perftbl_entry(int cpu, u32 idx)
> +{
> +       writel_relaxed(idx, base + cpu * PER_CPU_OFFSET +
> +                      REG_PERFTBL_SEL_OFFSET);
> +       return readl_relaxed(base + cpu * PER_CPU_OFFSET +
> +                            REG_PERFTBL_RD_OFFSET);
> +}
> +
> +static int virt_cpufreq_target(struct cpufreq_policy *policy,
> +                              unsigned int target_freq,
> +                              unsigned int relation)
> +{
> +       struct cpufreq_freqs freqs;
> +       int ret = 0;
> +
> +       freqs.old = policy->cur;
> +       freqs.new = target_freq;
> +
> +       cpufreq_freq_transition_begin(policy, &freqs);
> +       ret = virt_cpufreq_set_perf(policy, target_freq);
> +       cpufreq_freq_transition_end(policy, &freqs, ret != 0);
> +
> +       return ret;
> +}
> +
> +static int virt_cpufreq_get_sharing_cpus(struct cpufreq_policy *policy)
> +{
> +       u32 cur_perf_domain, perf_domain;
> +       struct device *cpu_dev;
> +       int cpu;
> +
> +       cur_perf_domain = readl_relaxed(base + policy->cpu *
> +                                       PER_CPU_OFFSET + REG_PERF_DOMAIN_OFFSET);
> +
> +       for_each_possible_cpu(cpu) {
> +               cpu_dev = get_cpu_device(cpu);
> +               if (!cpu_dev)
> +                       continue;
> +
> +               perf_domain = readl_relaxed(base + cpu *
> +                                           PER_CPU_OFFSET + REG_PERF_DOMAIN_OFFSET);
> +
> +               if (perf_domain == cur_perf_domain)
> +                       cpumask_set_cpu(cpu, policy->cpus);
> +       }
> +
> +       return 0;
> +}
> +
> +static int virt_cpufreq_get_freq_info(struct cpufreq_policy *policy)
> +{
> +       struct cpufreq_frequency_table *table;
> +       u32 num_perftbl_entries, idx;
> +
> +       num_perftbl_entries = per_cpu(perftbl_num_entries, policy->cpu);
> +
> +       if (num_perftbl_entries == 1) {
> +               policy->cpuinfo.min_freq = 1;
> +               policy->cpuinfo.max_freq = virt_cpufreq_get_perftbl_entry(policy->cpu, 0);
> +
> +               policy->min = policy->cpuinfo.min_freq;
> +               policy->max = policy->cpuinfo.max_freq;
> +
> +               policy->cur = policy->max;
> +               return 0;
> +       }
> +
> +       table = kcalloc(num_perftbl_entries + 1, sizeof(*table), GFP_KERNEL);
> +       if (!table)
> +               return -ENOMEM;
> +
> +       for (idx = 0; idx < num_perftbl_entries; idx++)
> +               table[idx].frequency = virt_cpufreq_get_perftbl_entry(policy->cpu, idx);
> +
> +       table[idx].frequency = CPUFREQ_TABLE_END;
> +       policy->freq_table = table;
> +
> +       return 0;
> +}
> +
> +static int virt_cpufreq_cpu_init(struct cpufreq_policy *policy)
> +{
> +       struct device *cpu_dev;
> +       int ret;
> +
> +       cpu_dev = get_cpu_device(policy->cpu);
> +       if (!cpu_dev)
> +               return -ENODEV;
> +
> +       ret = virt_cpufreq_get_freq_info(policy);
> +       if (ret) {
> +               dev_warn(cpu_dev, "failed to get cpufreq info\n");
> +               return ret;
> +       }
> +
> +       ret = virt_cpufreq_get_sharing_cpus(policy);
> +       if (ret) {
> +               dev_warn(cpu_dev, "failed to get sharing cpumask\n");
> +               return ret;
> +       }
> +
> +       /*
> +        * To simplify and improve latency of handling frequency requests on
> +        * the host side, this ensures that the vCPU thread triggering the MMIO
> +        * abort is the same thread whose performance constraints (Ex. uclamp
> +        * settings) need to be updated. This simplifies the VMM (Virtual
> +        * Machine Manager) having to find the correct vCPU thread and/or
> +        * facing permission issues when configuring other threads.
> +        */
> +       policy->dvfs_possible_from_any_cpu = false;
> +       policy->fast_switch_possible = true;
> +
> +       /*
> +        * Using the default SCALE_FREQ_SOURCE_CPUFREQ is insufficient since
> +        * the actual physical CPU frequency may not match requested frequency
> +        * from the vCPU thread due to frequency update latencies or other
> +        * inputs to the physical CPU frequency selection. This additional FIE
> +        * source allows for more accurate freq_scale updates and only takes
> +        * effect if another FIE source such as AMUs have not been registered.
> +        */
> +       topology_set_scale_freq_source(&virt_sfd, policy->cpus);
> +
> +       return 0;
> +}
> +
> +static int virt_cpufreq_cpu_exit(struct cpufreq_policy *policy)
> +{
> +       topology_clear_scale_freq_source(SCALE_FREQ_SOURCE_VIRT, policy->related_cpus);
> +       kfree(policy->freq_table);
> +       return 0;
> +}
> +
> +static int virt_cpufreq_online(struct cpufreq_policy *policy)
> +{
> +       /* Nothing to restore. */
> +       return 0;
> +}
> +
> +static int virt_cpufreq_offline(struct cpufreq_policy *policy)
> +{
> +       /* Dummy offline() to avoid exit() being called and freeing resources. */
> +       return 0;
> +}
> +
> +static int virt_cpufreq_verify_policy(struct cpufreq_policy_data *policy)
> +{
> +       if (policy->freq_table)
> +               return cpufreq_frequency_table_verify(policy, policy->freq_table);
> +
> +       cpufreq_verify_within_cpu_limits(policy);
> +       return 0;
> +}
> +
> +static struct cpufreq_driver cpufreq_virt_driver = {
> +       .name           = "virt-cpufreq",
> +       .init           = virt_cpufreq_cpu_init,
> +       .exit           = virt_cpufreq_cpu_exit,
> +       .online         = virt_cpufreq_online,
> +       .offline        = virt_cpufreq_offline,
> +       .verify         = virt_cpufreq_verify_policy,
> +       .target         = virt_cpufreq_target,
> +       .fast_switch    = virt_cpufreq_fast_switch,
> +       .attr           = cpufreq_generic_attr,
> +};
> +
> +static int virt_cpufreq_driver_probe(struct platform_device *pdev)
> +{
> +       u32 num_perftbl_entries;
> +       int ret, cpu;
> +
> +       base = devm_platform_ioremap_resource(pdev, 0);
> +       if (IS_ERR(base))
> +               return PTR_ERR(base);
> +
> +       for_each_possible_cpu(cpu) {
> +               num_perftbl_entries = readl_relaxed(base + cpu * PER_CPU_OFFSET +
> +                                                   REG_PERFTBL_LEN_OFFSET);
> +
> +               if (!num_perftbl_entries || num_perftbl_entries > PERFTBL_MAX_ENTRIES)
> +                       return -ENODEV;
> +
> +               per_cpu(perftbl_num_entries, cpu) = num_perftbl_entries;
> +       }
> +
> +       ret = cpufreq_register_driver(&cpufreq_virt_driver);
> +       if (ret) {
> +               dev_err(&pdev->dev, "Virtual CPUFreq driver failed to register: %d\n", ret);
> +               return ret;
> +       }
> +
> +       dev_dbg(&pdev->dev, "Virtual CPUFreq driver initialized\n");
> +       return 0;
> +}
> +
> +static int virt_cpufreq_driver_remove(struct platform_device *pdev)
> +{
> +       cpufreq_unregister_driver(&cpufreq_virt_driver);
> +       return 0;
> +}
> +
> +static const struct of_device_id virt_cpufreq_match[] = {
> +       { .compatible = "qemu,virtual-cpufreq", .data = NULL},
> +       {}
> +};
> +MODULE_DEVICE_TABLE(of, virt_cpufreq_match);
> +
> +static struct platform_driver virt_cpufreq_driver = {
> +       .probe = virt_cpufreq_driver_probe,
> +       .remove = virt_cpufreq_driver_remove,
> +       .driver = {
> +               .name = "virt-cpufreq",
> +               .of_match_table = virt_cpufreq_match,
> +       },
> +};
> +
> +static int __init virt_cpufreq_init(void)
> +{
> +       return platform_driver_register(&virt_cpufreq_driver);
> +}
> +postcore_initcall(virt_cpufreq_init);
> +
> +static void __exit virt_cpufreq_exit(void)
> +{
> +       platform_driver_unregister(&virt_cpufreq_driver);
> +}
> +module_exit(virt_cpufreq_exit);
> +
> +MODULE_DESCRIPTION("Virtual cpufreq driver");
> +MODULE_LICENSE("GPL");
> diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
> index b721f360d759..d5d848849408 100644
> --- a/include/linux/arch_topology.h
> +++ b/include/linux/arch_topology.h
> @@ -49,6 +49,7 @@ enum scale_freq_source {
>         SCALE_FREQ_SOURCE_CPUFREQ = 0,
>         SCALE_FREQ_SOURCE_ARCH,
>         SCALE_FREQ_SOURCE_CPPC,
> +       SCALE_FREQ_SOURCE_VIRT,
>  };
>
>  struct scale_freq_data {
> --
> 2.45.0.215.g3402c0e53f-goog
>
Rafael J. Wysocki June 28, 2024, 12:01 p.m. UTC | #2
Hi,

On Thu, Jun 27, 2024 at 11:22 PM David Dai <davidai@google.com> wrote:
>
> Hi folks,
>
> Gentle nudge on this patch to see if there's any remaining concerns?

Yes, there are.

The dependency of OF is pretty much a no-go from my perspective.

Thanks!


> On Mon, May 20, 2024 at 10:32 PM David Dai <davidai@google.com> wrote:
> >
> > Introduce a virtualized cpufreq driver for guest kernels to improve
> > performance and power of workloads within VMs.
> >
> > This driver does two main things:
> >
> > 1. Sends the frequency of vCPUs as a hint to the host. The host uses the
> > hint to schedule the vCPU threads and decide physical CPU frequency.
> >
> > 2. If a VM does not support a virtualized FIE(like AMUs), it queries the
> > host CPU frequency by reading a MMIO region of a virtual cpufreq device
> > to update the guest's frequency scaling factor periodically. This enables
> > accurate Per-Entity Load Tracking for tasks running in the guest.
> >
> > Co-developed-by: Saravana Kannan <saravanak@google.com>
> > Signed-off-by: Saravana Kannan <saravanak@google.com>
> > Signed-off-by: David Dai <davidai@google.com>
> > ---
> >  drivers/cpufreq/Kconfig           |  14 ++
> >  drivers/cpufreq/Makefile          |   1 +
> >  drivers/cpufreq/virtual-cpufreq.c | 335 ++++++++++++++++++++++++++++++
> >  include/linux/arch_topology.h     |   1 +
> >  4 files changed, 351 insertions(+)
> >  create mode 100644 drivers/cpufreq/virtual-cpufreq.c
> >
> > diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
> > index 94e55c40970a..9aa86bec5793 100644
> > --- a/drivers/cpufreq/Kconfig
> > +++ b/drivers/cpufreq/Kconfig
> > @@ -217,6 +217,20 @@ config CPUFREQ_DT
> >
> >           If in doubt, say N.
> >
> > +config CPUFREQ_VIRT
> > +       tristate "Virtual cpufreq driver"
> > +       depends on OF && GENERIC_ARCH_TOPOLOGY
> > +       help
> > +         This adds a virtualized cpufreq driver for guest kernels that
> > +         read/writes to a MMIO region for a virtualized cpufreq device to
> > +         communicate with the host. It sends performance requests to the host
> > +         which gets used as a hint to schedule vCPU threads and select CPU
> > +         frequency. If a VM does not support a virtualized FIE such as AMUs,
> > +         it updates the frequency scaling factor by polling host CPU frequency
> > +         to enable accurate Per-Entity Load Tracking for tasks running in the guest.
> > +
> > +         If in doubt, say N.
> > +
> >  config CPUFREQ_DT_PLATDEV
> >         tristate "Generic DT based cpufreq platdev driver"
> >         depends on OF
> > diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile
> > index 8d141c71b016..eb72ecdc24db 100644
> > --- a/drivers/cpufreq/Makefile
> > +++ b/drivers/cpufreq/Makefile
> > @@ -16,6 +16,7 @@ obj-$(CONFIG_CPU_FREQ_GOV_ATTR_SET)   += cpufreq_governor_attr_set.o
> >
> >  obj-$(CONFIG_CPUFREQ_DT)               += cpufreq-dt.o
> >  obj-$(CONFIG_CPUFREQ_DT_PLATDEV)       += cpufreq-dt-platdev.o
> > +obj-$(CONFIG_CPUFREQ_VIRT)             += virtual-cpufreq.o
> >
> >  # Traces
> >  CFLAGS_amd-pstate-trace.o               := -I$(src)
> > diff --git a/drivers/cpufreq/virtual-cpufreq.c b/drivers/cpufreq/virtual-cpufreq.c
> > new file mode 100644
> > index 000000000000..59ce2bda3913
> > --- /dev/null
> > +++ b/drivers/cpufreq/virtual-cpufreq.c
> > @@ -0,0 +1,335 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/*
> > + * Copyright (C) 2024 Google LLC
> > + */
> > +
> > +#include <linux/arch_topology.h>
> > +#include <linux/cpufreq.h>
> > +#include <linux/init.h>
> > +#include <linux/sched.h>
> > +#include <linux/kernel.h>
> > +#include <linux/module.h>
> > +#include <linux/of_address.h>
> > +#include <linux/of_platform.h>
> > +#include <linux/platform_device.h>
> > +#include <linux/slab.h>
> > +
> > +/*
> > + * CPU0..CPUn
> > + * +-------------+-------------------------------+--------+-------+
> > + * | Register    | Description                   | Offset |   Len |
> > + * +-------------+-------------------------------+--------+-------+
> > + * | cur_perf    | read this register to get     |    0x0 |   0x4 |
> > + * |             | the current perf (integer val |        |       |
> > + * |             | representing perf relative to |        |       |
> > + * |             | max performance)              |        |       |
> > + * |             | that vCPU is running at       |        |       |
> > + * +-------------+-------------------------------+--------+-------+
> > + * | set_perf    | write to this register to set |    0x4 |   0x4 |
> > + * |             | perf value of the vCPU        |        |       |
> > + * +-------------+-------------------------------+--------+-------+
> > + * | perftbl_len | number of entries in perf     |    0x8 |   0x4 |
> > + * |             | table. A single entry in the  |        |       |
> > + * |             | perf table denotes no table   |        |       |
> > + * |             | and the entry contains        |        |       |
> > + * |             | the maximum perf value        |        |       |
> > + * |             | that this vCPU supports.      |        |       |
> > + * |             | The guest can request any     |        |       |
> > + * |             | value between 1 and max perf  |        |       |
> > + * |             | when perftbls are not used.   |        |       |
> > + * +---------------------------------------------+--------+-------+
> > + * | perftbl_sel | write to this register to     |    0xc |   0x4 |
> > + * |             | select perf table entry to    |        |       |
> > + * |             | read from                     |        |       |
> > + * +---------------------------------------------+--------+-------+
> > + * | perftbl_rd  | read this register to get     |   0x10 |   0x4 |
> > + * |             | perf value of the selected    |        |       |
> > + * |             | entry based on perftbl_sel    |        |       |
> > + * +---------------------------------------------+--------+-------+
> > + * | perf_domain | performance domain number     |   0x14 |   0x4 |
> > + * |             | that this vCPU belongs to.    |        |       |
> > + * |             | vCPUs sharing the same perf   |        |       |
> > + * |             | domain number are part of the |        |       |
> > + * |             | same performance domain.      |        |       |
> > + * +-------------+-------------------------------+--------+-------+
> > + */
> > +
> > +#define REG_CUR_PERF_STATE_OFFSET 0x0
> > +#define REG_SET_PERF_STATE_OFFSET 0x4
> > +#define REG_PERFTBL_LEN_OFFSET 0x8
> > +#define REG_PERFTBL_SEL_OFFSET 0xc
> > +#define REG_PERFTBL_RD_OFFSET 0x10
> > +#define REG_PERF_DOMAIN_OFFSET 0x14
> > +#define PER_CPU_OFFSET 0x1000
> > +
> > +#define PERFTBL_MAX_ENTRIES 64U
> > +
> > +static void __iomem *base;
> > +static DEFINE_PER_CPU(u32, perftbl_num_entries);
> > +
> > +static void virt_scale_freq_tick(void)
> > +{
> > +       int cpu = smp_processor_id();
> > +       u32 max_freq = (u32)cpufreq_get_hw_max_freq(cpu);
> > +       u64 cur_freq;
> > +       unsigned long scale;
> > +
> > +       cur_freq = (u64)readl_relaxed(base + cpu * PER_CPU_OFFSET
> > +                       + REG_CUR_PERF_STATE_OFFSET);
> > +
> > +       cur_freq <<= SCHED_CAPACITY_SHIFT;
> > +       scale = (unsigned long)div_u64(cur_freq, max_freq);
> > +       scale = min(scale, SCHED_CAPACITY_SCALE);
> > +
> > +       this_cpu_write(arch_freq_scale, scale);
> > +}
> > +
> > +static struct scale_freq_data virt_sfd = {
> > +       .source = SCALE_FREQ_SOURCE_VIRT,
> > +       .set_freq_scale = virt_scale_freq_tick,
> > +};
> > +
> > +static unsigned int virt_cpufreq_set_perf(struct cpufreq_policy *policy,
> > +                                         unsigned int target_freq)
> > +{
> > +       writel_relaxed(target_freq,
> > +                      base + policy->cpu * PER_CPU_OFFSET + REG_SET_PERF_STATE_OFFSET);
> > +       return 0;
> > +}
> > +
> > +static unsigned int virt_cpufreq_fast_switch(struct cpufreq_policy *policy,
> > +                                            unsigned int target_freq)
> > +{
> > +       virt_cpufreq_set_perf(policy, target_freq);
> > +       return target_freq;
> > +}
> > +
> > +static u32 virt_cpufreq_get_perftbl_entry(int cpu, u32 idx)
> > +{
> > +       writel_relaxed(idx, base + cpu * PER_CPU_OFFSET +
> > +                      REG_PERFTBL_SEL_OFFSET);
> > +       return readl_relaxed(base + cpu * PER_CPU_OFFSET +
> > +                            REG_PERFTBL_RD_OFFSET);
> > +}
> > +
> > +static int virt_cpufreq_target(struct cpufreq_policy *policy,
> > +                              unsigned int target_freq,
> > +                              unsigned int relation)
> > +{
> > +       struct cpufreq_freqs freqs;
> > +       int ret = 0;
> > +
> > +       freqs.old = policy->cur;
> > +       freqs.new = target_freq;
> > +
> > +       cpufreq_freq_transition_begin(policy, &freqs);
> > +       ret = virt_cpufreq_set_perf(policy, target_freq);
> > +       cpufreq_freq_transition_end(policy, &freqs, ret != 0);
> > +
> > +       return ret;
> > +}
> > +
> > +static int virt_cpufreq_get_sharing_cpus(struct cpufreq_policy *policy)
> > +{
> > +       u32 cur_perf_domain, perf_domain;
> > +       struct device *cpu_dev;
> > +       int cpu;
> > +
> > +       cur_perf_domain = readl_relaxed(base + policy->cpu *
> > +                                       PER_CPU_OFFSET + REG_PERF_DOMAIN_OFFSET);
> > +
> > +       for_each_possible_cpu(cpu) {
> > +               cpu_dev = get_cpu_device(cpu);
> > +               if (!cpu_dev)
> > +                       continue;
> > +
> > +               perf_domain = readl_relaxed(base + cpu *
> > +                                           PER_CPU_OFFSET + REG_PERF_DOMAIN_OFFSET);
> > +
> > +               if (perf_domain == cur_perf_domain)
> > +                       cpumask_set_cpu(cpu, policy->cpus);
> > +       }
> > +
> > +       return 0;
> > +}
> > +
> > +static int virt_cpufreq_get_freq_info(struct cpufreq_policy *policy)
> > +{
> > +       struct cpufreq_frequency_table *table;
> > +       u32 num_perftbl_entries, idx;
> > +
> > +       num_perftbl_entries = per_cpu(perftbl_num_entries, policy->cpu);
> > +
> > +       if (num_perftbl_entries == 1) {
> > +               policy->cpuinfo.min_freq = 1;
> > +               policy->cpuinfo.max_freq = virt_cpufreq_get_perftbl_entry(policy->cpu, 0);
> > +
> > +               policy->min = policy->cpuinfo.min_freq;
> > +               policy->max = policy->cpuinfo.max_freq;
> > +
> > +               policy->cur = policy->max;
> > +               return 0;
> > +       }
> > +
> > +       table = kcalloc(num_perftbl_entries + 1, sizeof(*table), GFP_KERNEL);
> > +       if (!table)
> > +               return -ENOMEM;
> > +
> > +       for (idx = 0; idx < num_perftbl_entries; idx++)
> > +               table[idx].frequency = virt_cpufreq_get_perftbl_entry(policy->cpu, idx);
> > +
> > +       table[idx].frequency = CPUFREQ_TABLE_END;
> > +       policy->freq_table = table;
> > +
> > +       return 0;
> > +}
> > +
> > +static int virt_cpufreq_cpu_init(struct cpufreq_policy *policy)
> > +{
> > +       struct device *cpu_dev;
> > +       int ret;
> > +
> > +       cpu_dev = get_cpu_device(policy->cpu);
> > +       if (!cpu_dev)
> > +               return -ENODEV;
> > +
> > +       ret = virt_cpufreq_get_freq_info(policy);
> > +       if (ret) {
> > +               dev_warn(cpu_dev, "failed to get cpufreq info\n");
> > +               return ret;
> > +       }
> > +
> > +       ret = virt_cpufreq_get_sharing_cpus(policy);
> > +       if (ret) {
> > +               dev_warn(cpu_dev, "failed to get sharing cpumask\n");
> > +               return ret;
> > +       }
> > +
> > +       /*
> > +        * To simplify and improve latency of handling frequency requests on
> > +        * the host side, this ensures that the vCPU thread triggering the MMIO
> > +        * abort is the same thread whose performance constraints (Ex. uclamp
> > +        * settings) need to be updated. This simplifies the VMM (Virtual
> > +        * Machine Manager) having to find the correct vCPU thread and/or
> > +        * facing permission issues when configuring other threads.
> > +        */
> > +       policy->dvfs_possible_from_any_cpu = false;
> > +       policy->fast_switch_possible = true;
> > +
> > +       /*
> > +        * Using the default SCALE_FREQ_SOURCE_CPUFREQ is insufficient since
> > +        * the actual physical CPU frequency may not match requested frequency
> > +        * from the vCPU thread due to frequency update latencies or other
> > +        * inputs to the physical CPU frequency selection. This additional FIE
> > +        * source allows for more accurate freq_scale updates and only takes
> > +        * effect if another FIE source such as AMUs have not been registered.
> > +        */
> > +       topology_set_scale_freq_source(&virt_sfd, policy->cpus);
> > +
> > +       return 0;
> > +}
> > +
> > +static int virt_cpufreq_cpu_exit(struct cpufreq_policy *policy)
> > +{
> > +       topology_clear_scale_freq_source(SCALE_FREQ_SOURCE_VIRT, policy->related_cpus);
> > +       kfree(policy->freq_table);
> > +       return 0;
> > +}
> > +
> > +static int virt_cpufreq_online(struct cpufreq_policy *policy)
> > +{
> > +       /* Nothing to restore. */
> > +       return 0;
> > +}
> > +
> > +static int virt_cpufreq_offline(struct cpufreq_policy *policy)
> > +{
> > +       /* Dummy offline() to avoid exit() being called and freeing resources. */
> > +       return 0;
> > +}
> > +
> > +static int virt_cpufreq_verify_policy(struct cpufreq_policy_data *policy)
> > +{
> > +       if (policy->freq_table)
> > +               return cpufreq_frequency_table_verify(policy, policy->freq_table);
> > +
> > +       cpufreq_verify_within_cpu_limits(policy);
> > +       return 0;
> > +}
> > +
> > +static struct cpufreq_driver cpufreq_virt_driver = {
> > +       .name           = "virt-cpufreq",
> > +       .init           = virt_cpufreq_cpu_init,
> > +       .exit           = virt_cpufreq_cpu_exit,
> > +       .online         = virt_cpufreq_online,
> > +       .offline        = virt_cpufreq_offline,
> > +       .verify         = virt_cpufreq_verify_policy,
> > +       .target         = virt_cpufreq_target,
> > +       .fast_switch    = virt_cpufreq_fast_switch,
> > +       .attr           = cpufreq_generic_attr,
> > +};
> > +
> > +static int virt_cpufreq_driver_probe(struct platform_device *pdev)
> > +{
> > +       u32 num_perftbl_entries;
> > +       int ret, cpu;
> > +
> > +       base = devm_platform_ioremap_resource(pdev, 0);
> > +       if (IS_ERR(base))
> > +               return PTR_ERR(base);
> > +
> > +       for_each_possible_cpu(cpu) {
> > +               num_perftbl_entries = readl_relaxed(base + cpu * PER_CPU_OFFSET +
> > +                                                   REG_PERFTBL_LEN_OFFSET);
> > +
> > +               if (!num_perftbl_entries || num_perftbl_entries > PERFTBL_MAX_ENTRIES)
> > +                       return -ENODEV;
> > +
> > +               per_cpu(perftbl_num_entries, cpu) = num_perftbl_entries;
> > +       }
> > +
> > +       ret = cpufreq_register_driver(&cpufreq_virt_driver);
> > +       if (ret) {
> > +               dev_err(&pdev->dev, "Virtual CPUFreq driver failed to register: %d\n", ret);
> > +               return ret;
> > +       }
> > +
> > +       dev_dbg(&pdev->dev, "Virtual CPUFreq driver initialized\n");
> > +       return 0;
> > +}
> > +
> > +static int virt_cpufreq_driver_remove(struct platform_device *pdev)
> > +{
> > +       cpufreq_unregister_driver(&cpufreq_virt_driver);
> > +       return 0;
> > +}
> > +
> > +static const struct of_device_id virt_cpufreq_match[] = {
> > +       { .compatible = "qemu,virtual-cpufreq", .data = NULL},
> > +       {}
> > +};
> > +MODULE_DEVICE_TABLE(of, virt_cpufreq_match);
> > +
> > +static struct platform_driver virt_cpufreq_driver = {
> > +       .probe = virt_cpufreq_driver_probe,
> > +       .remove = virt_cpufreq_driver_remove,
> > +       .driver = {
> > +               .name = "virt-cpufreq",
> > +               .of_match_table = virt_cpufreq_match,
> > +       },
> > +};
> > +
> > +static int __init virt_cpufreq_init(void)
> > +{
> > +       return platform_driver_register(&virt_cpufreq_driver);
> > +}
> > +postcore_initcall(virt_cpufreq_init);
> > +
> > +static void __exit virt_cpufreq_exit(void)
> > +{
> > +       platform_driver_unregister(&virt_cpufreq_driver);
> > +}
> > +module_exit(virt_cpufreq_exit);
> > +
> > +MODULE_DESCRIPTION("Virtual cpufreq driver");
> > +MODULE_LICENSE("GPL");
> > diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
> > index b721f360d759..d5d848849408 100644
> > --- a/include/linux/arch_topology.h
> > +++ b/include/linux/arch_topology.h
> > @@ -49,6 +49,7 @@ enum scale_freq_source {
> >         SCALE_FREQ_SOURCE_CPUFREQ = 0,
> >         SCALE_FREQ_SOURCE_ARCH,
> >         SCALE_FREQ_SOURCE_CPPC,
> > +       SCALE_FREQ_SOURCE_VIRT,
> >  };
> >
> >  struct scale_freq_data {
> > --
> > 2.45.0.215.g3402c0e53f-goog
> >
>
Sudeep Holla June 28, 2024, 12:42 p.m. UTC | #3
On Fri, Jun 28, 2024 at 02:01:16PM +0200, Rafael J. Wysocki wrote:
> Hi,
> 
> On Thu, Jun 27, 2024 at 11:22 PM David Dai <davidai@google.com> wrote:
> >
> > Hi folks,
> >
> > Gentle nudge on this patch to see if there's any remaining concerns?
> 
> Yes, there are.
> 
> The dependency of OF is pretty much a no-go from my perspective.
> 

I agree and I don't think it is needed as well, can be removed easily IMO.

--
Regards,
Sudeep
Sudeep Holla June 28, 2024, 12:51 p.m. UTC | #4
On Mon, May 20, 2024 at 09:30:52PM -0700, David Dai wrote:
> Introduce a virtualized cpufreq driver for guest kernels to improve
> performance and power of workloads within VMs.
>
> This driver does two main things:
>
> 1. Sends the frequency of vCPUs as a hint to the host. The host uses the
> hint to schedule the vCPU threads and decide physical CPU frequency.
>
> 2. If a VM does not support a virtualized FIE(like AMUs), it queries the
> host CPU frequency by reading a MMIO region of a virtual cpufreq device
> to update the guest's frequency scaling factor periodically. This enables
> accurate Per-Entity Load Tracking for tasks running in the guest.
>
> +
> +/*
> + * CPU0..CPUn
> + * +-------------+-------------------------------+--------+-------+
> + * | Register    | Description                   | Offset |   Len |
> + * +-------------+-------------------------------+--------+-------+
> + * | cur_perf    | read this register to get     |    0x0 |   0x4 |
> + * |             | the current perf (integer val |        |       |
> + * |             | representing perf relative to |        |       |
> + * |             | max performance)              |        |       |
> + * |             | that vCPU is running at       |        |       |
> + * +-------------+-------------------------------+--------+-------+
> + * | set_perf    | write to this register to set |    0x4 |   0x4 |
> + * |             | perf value of the vCPU        |        |       |
> + * +-------------+-------------------------------+--------+-------+
> + * | perftbl_len | number of entries in perf     |    0x8 |   0x4 |
> + * |             | table. A single entry in the  |        |       |
> + * |             | perf table denotes no table   |        |       |
> + * |             | and the entry contains        |        |       |
> + * |             | the maximum perf value        |        |       |
> + * |             | that this vCPU supports.      |        |       |
> + * |             | The guest can request any     |        |       |
> + * |             | value between 1 and max perf  |        |       |
> + * |             | when perftbls are not used.   |        |       |
> + * +---------------------------------------------+--------+-------+
> + * | perftbl_sel | write to this register to     |    0xc |   0x4 |
> + * |             | select perf table entry to    |        |       |
> + * |             | read from                     |        |       |
> + * +---------------------------------------------+--------+-------+
> + * | perftbl_rd  | read this register to get     |   0x10 |   0x4 |
> + * |             | perf value of the selected    |        |       |
> + * |             | entry based on perftbl_sel    |        |       |
> + * +---------------------------------------------+--------+-------+
> + * | perf_domain | performance domain number     |   0x14 |   0x4 |
> + * |             | that this vCPU belongs to.    |        |       |
> + * |             | vCPUs sharing the same perf   |        |       |
> + * |             | domain number are part of the |        |       |
> + * |             | same performance domain.      |        |       |
> + * +-------------+-------------------------------+--------+-------+
> + */

I think it is good idea to version this table, so that it gives flexibility
to update the entries. It is a must if we are getting away with DT. I didn't
give complete information in my previous response where I agreed with Rafael.

I am not sure how much feasible it is, but can it be queried via KVM IOCTLs
to VMM. Just a thought, I am exploring how to make this work even on ACPI
systems. It is simpler if we neednot rely on DT or ACPI.

--
Regards,
Sudeep
Saravana Kannan July 10, 2024, 12:07 a.m. UTC | #5
On Fri, Jun 28, 2024 at 5:01 AM Rafael J. Wysocki <rafael@kernel.org> wrote:
>
> Hi,
>
> On Thu, Jun 27, 2024 at 11:22 PM David Dai <davidai@google.com> wrote:
> >
> > Hi folks,
> >
> > Gentle nudge on this patch to see if there's any remaining concerns?
>
> Yes, there are.
>
> The dependency of OF is pretty much a no-go from my perspective.

If you are talking about the "depends on OF" I think that can and
should be dropped.

There's nothing really OF specific about this driver except the probe
function being for a platform device/driver. We can easily write an
ACPI driver variant of the probe function and have the same driver
work for both OF and ACPI versions of the device. We don't have any
ACPI experience or a test set up to test the ACPI variant. That's why
we haven't added it. We are more than happy to accept tested patches
on top of this that'll enable this for ACPI variants too.

If we drop the "depends on OF", will you accept this patch? If not,
can you give some guidance on what you are looking for?

Thanks,
Saravana




>
> Thanks!
>
>
> > On Mon, May 20, 2024 at 10:32 PM David Dai <davidai@google.com> wrote:
> > >
> > > Introduce a virtualized cpufreq driver for guest kernels to improve
> > > performance and power of workloads within VMs.
> > >
> > > This driver does two main things:
> > >
> > > 1. Sends the frequency of vCPUs as a hint to the host. The host uses the
> > > hint to schedule the vCPU threads and decide physical CPU frequency.
> > >
> > > 2. If a VM does not support a virtualized FIE(like AMUs), it queries the
> > > host CPU frequency by reading a MMIO region of a virtual cpufreq device
> > > to update the guest's frequency scaling factor periodically. This enables
> > > accurate Per-Entity Load Tracking for tasks running in the guest.
> > >
> > > Co-developed-by: Saravana Kannan <saravanak@google.com>
> > > Signed-off-by: Saravana Kannan <saravanak@google.com>
> > > Signed-off-by: David Dai <davidai@google.com>
> > > ---
> > >  drivers/cpufreq/Kconfig           |  14 ++
> > >  drivers/cpufreq/Makefile          |   1 +
> > >  drivers/cpufreq/virtual-cpufreq.c | 335 ++++++++++++++++++++++++++++++
> > >  include/linux/arch_topology.h     |   1 +
> > >  4 files changed, 351 insertions(+)
> > >  create mode 100644 drivers/cpufreq/virtual-cpufreq.c
> > >
> > > diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
> > > index 94e55c40970a..9aa86bec5793 100644
> > > --- a/drivers/cpufreq/Kconfig
> > > +++ b/drivers/cpufreq/Kconfig
> > > @@ -217,6 +217,20 @@ config CPUFREQ_DT
> > >
> > >           If in doubt, say N.
> > >
> > > +config CPUFREQ_VIRT
> > > +       tristate "Virtual cpufreq driver"
> > > +       depends on OF && GENERIC_ARCH_TOPOLOGY
> > > +       help
> > > +         This adds a virtualized cpufreq driver for guest kernels that
> > > +         read/writes to a MMIO region for a virtualized cpufreq device to
> > > +         communicate with the host. It sends performance requests to the host
> > > +         which gets used as a hint to schedule vCPU threads and select CPU
> > > +         frequency. If a VM does not support a virtualized FIE such as AMUs,
> > > +         it updates the frequency scaling factor by polling host CPU frequency
> > > +         to enable accurate Per-Entity Load Tracking for tasks running in the guest.
> > > +
> > > +         If in doubt, say N.
> > > +
> > >  config CPUFREQ_DT_PLATDEV
> > >         tristate "Generic DT based cpufreq platdev driver"
> > >         depends on OF
> > > diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile
> > > index 8d141c71b016..eb72ecdc24db 100644
> > > --- a/drivers/cpufreq/Makefile
> > > +++ b/drivers/cpufreq/Makefile
> > > @@ -16,6 +16,7 @@ obj-$(CONFIG_CPU_FREQ_GOV_ATTR_SET)   += cpufreq_governor_attr_set.o
> > >
> > >  obj-$(CONFIG_CPUFREQ_DT)               += cpufreq-dt.o
> > >  obj-$(CONFIG_CPUFREQ_DT_PLATDEV)       += cpufreq-dt-platdev.o
> > > +obj-$(CONFIG_CPUFREQ_VIRT)             += virtual-cpufreq.o
> > >
> > >  # Traces
> > >  CFLAGS_amd-pstate-trace.o               := -I$(src)
> > > diff --git a/drivers/cpufreq/virtual-cpufreq.c b/drivers/cpufreq/virtual-cpufreq.c
> > > new file mode 100644
> > > index 000000000000..59ce2bda3913
> > > --- /dev/null
> > > +++ b/drivers/cpufreq/virtual-cpufreq.c
> > > @@ -0,0 +1,335 @@
> > > +// SPDX-License-Identifier: GPL-2.0-only
> > > +/*
> > > + * Copyright (C) 2024 Google LLC
> > > + */
> > > +
> > > +#include <linux/arch_topology.h>
> > > +#include <linux/cpufreq.h>
> > > +#include <linux/init.h>
> > > +#include <linux/sched.h>
> > > +#include <linux/kernel.h>
> > > +#include <linux/module.h>
> > > +#include <linux/of_address.h>
> > > +#include <linux/of_platform.h>
> > > +#include <linux/platform_device.h>
> > > +#include <linux/slab.h>
> > > +
> > > +/*
> > > + * CPU0..CPUn
> > > + * +-------------+-------------------------------+--------+-------+
> > > + * | Register    | Description                   | Offset |   Len |
> > > + * +-------------+-------------------------------+--------+-------+
> > > + * | cur_perf    | read this register to get     |    0x0 |   0x4 |
> > > + * |             | the current perf (integer val |        |       |
> > > + * |             | representing perf relative to |        |       |
> > > + * |             | max performance)              |        |       |
> > > + * |             | that vCPU is running at       |        |       |
> > > + * +-------------+-------------------------------+--------+-------+
> > > + * | set_perf    | write to this register to set |    0x4 |   0x4 |
> > > + * |             | perf value of the vCPU        |        |       |
> > > + * +-------------+-------------------------------+--------+-------+
> > > + * | perftbl_len | number of entries in perf     |    0x8 |   0x4 |
> > > + * |             | table. A single entry in the  |        |       |
> > > + * |             | perf table denotes no table   |        |       |
> > > + * |             | and the entry contains        |        |       |
> > > + * |             | the maximum perf value        |        |       |
> > > + * |             | that this vCPU supports.      |        |       |
> > > + * |             | The guest can request any     |        |       |
> > > + * |             | value between 1 and max perf  |        |       |
> > > + * |             | when perftbls are not used.   |        |       |
> > > + * +---------------------------------------------+--------+-------+
> > > + * | perftbl_sel | write to this register to     |    0xc |   0x4 |
> > > + * |             | select perf table entry to    |        |       |
> > > + * |             | read from                     |        |       |
> > > + * +---------------------------------------------+--------+-------+
> > > + * | perftbl_rd  | read this register to get     |   0x10 |   0x4 |
> > > + * |             | perf value of the selected    |        |       |
> > > + * |             | entry based on perftbl_sel    |        |       |
> > > + * +---------------------------------------------+--------+-------+
> > > + * | perf_domain | performance domain number     |   0x14 |   0x4 |
> > > + * |             | that this vCPU belongs to.    |        |       |
> > > + * |             | vCPUs sharing the same perf   |        |       |
> > > + * |             | domain number are part of the |        |       |
> > > + * |             | same performance domain.      |        |       |
> > > + * +-------------+-------------------------------+--------+-------+
> > > + */
> > > +
> > > +#define REG_CUR_PERF_STATE_OFFSET 0x0
> > > +#define REG_SET_PERF_STATE_OFFSET 0x4
> > > +#define REG_PERFTBL_LEN_OFFSET 0x8
> > > +#define REG_PERFTBL_SEL_OFFSET 0xc
> > > +#define REG_PERFTBL_RD_OFFSET 0x10
> > > +#define REG_PERF_DOMAIN_OFFSET 0x14
> > > +#define PER_CPU_OFFSET 0x1000
> > > +
> > > +#define PERFTBL_MAX_ENTRIES 64U
> > > +
> > > +static void __iomem *base;
> > > +static DEFINE_PER_CPU(u32, perftbl_num_entries);
> > > +
> > > +static void virt_scale_freq_tick(void)
> > > +{
> > > +       int cpu = smp_processor_id();
> > > +       u32 max_freq = (u32)cpufreq_get_hw_max_freq(cpu);
> > > +       u64 cur_freq;
> > > +       unsigned long scale;
> > > +
> > > +       cur_freq = (u64)readl_relaxed(base + cpu * PER_CPU_OFFSET
> > > +                       + REG_CUR_PERF_STATE_OFFSET);
> > > +
> > > +       cur_freq <<= SCHED_CAPACITY_SHIFT;
> > > +       scale = (unsigned long)div_u64(cur_freq, max_freq);
> > > +       scale = min(scale, SCHED_CAPACITY_SCALE);
> > > +
> > > +       this_cpu_write(arch_freq_scale, scale);
> > > +}
> > > +
> > > +static struct scale_freq_data virt_sfd = {
> > > +       .source = SCALE_FREQ_SOURCE_VIRT,
> > > +       .set_freq_scale = virt_scale_freq_tick,
> > > +};
> > > +
> > > +static unsigned int virt_cpufreq_set_perf(struct cpufreq_policy *policy,
> > > +                                         unsigned int target_freq)
> > > +{
> > > +       writel_relaxed(target_freq,
> > > +                      base + policy->cpu * PER_CPU_OFFSET + REG_SET_PERF_STATE_OFFSET);
> > > +       return 0;
> > > +}
> > > +
> > > +static unsigned int virt_cpufreq_fast_switch(struct cpufreq_policy *policy,
> > > +                                            unsigned int target_freq)
> > > +{
> > > +       virt_cpufreq_set_perf(policy, target_freq);
> > > +       return target_freq;
> > > +}
> > > +
> > > +static u32 virt_cpufreq_get_perftbl_entry(int cpu, u32 idx)
> > > +{
> > > +       writel_relaxed(idx, base + cpu * PER_CPU_OFFSET +
> > > +                      REG_PERFTBL_SEL_OFFSET);
> > > +       return readl_relaxed(base + cpu * PER_CPU_OFFSET +
> > > +                            REG_PERFTBL_RD_OFFSET);
> > > +}
> > > +
> > > +static int virt_cpufreq_target(struct cpufreq_policy *policy,
> > > +                              unsigned int target_freq,
> > > +                              unsigned int relation)
> > > +{
> > > +       struct cpufreq_freqs freqs;
> > > +       int ret = 0;
> > > +
> > > +       freqs.old = policy->cur;
> > > +       freqs.new = target_freq;
> > > +
> > > +       cpufreq_freq_transition_begin(policy, &freqs);
> > > +       ret = virt_cpufreq_set_perf(policy, target_freq);
> > > +       cpufreq_freq_transition_end(policy, &freqs, ret != 0);
> > > +
> > > +       return ret;
> > > +}
> > > +
> > > +static int virt_cpufreq_get_sharing_cpus(struct cpufreq_policy *policy)
> > > +{
> > > +       u32 cur_perf_domain, perf_domain;
> > > +       struct device *cpu_dev;
> > > +       int cpu;
> > > +
> > > +       cur_perf_domain = readl_relaxed(base + policy->cpu *
> > > +                                       PER_CPU_OFFSET + REG_PERF_DOMAIN_OFFSET);
> > > +
> > > +       for_each_possible_cpu(cpu) {
> > > +               cpu_dev = get_cpu_device(cpu);
> > > +               if (!cpu_dev)
> > > +                       continue;
> > > +
> > > +               perf_domain = readl_relaxed(base + cpu *
> > > +                                           PER_CPU_OFFSET + REG_PERF_DOMAIN_OFFSET);
> > > +
> > > +               if (perf_domain == cur_perf_domain)
> > > +                       cpumask_set_cpu(cpu, policy->cpus);
> > > +       }
> > > +
> > > +       return 0;
> > > +}
> > > +
> > > +static int virt_cpufreq_get_freq_info(struct cpufreq_policy *policy)
> > > +{
> > > +       struct cpufreq_frequency_table *table;
> > > +       u32 num_perftbl_entries, idx;
> > > +
> > > +       num_perftbl_entries = per_cpu(perftbl_num_entries, policy->cpu);
> > > +
> > > +       if (num_perftbl_entries == 1) {
> > > +               policy->cpuinfo.min_freq = 1;
> > > +               policy->cpuinfo.max_freq = virt_cpufreq_get_perftbl_entry(policy->cpu, 0);
> > > +
> > > +               policy->min = policy->cpuinfo.min_freq;
> > > +               policy->max = policy->cpuinfo.max_freq;
> > > +
> > > +               policy->cur = policy->max;
> > > +               return 0;
> > > +       }
> > > +
> > > +       table = kcalloc(num_perftbl_entries + 1, sizeof(*table), GFP_KERNEL);
> > > +       if (!table)
> > > +               return -ENOMEM;
> > > +
> > > +       for (idx = 0; idx < num_perftbl_entries; idx++)
> > > +               table[idx].frequency = virt_cpufreq_get_perftbl_entry(policy->cpu, idx);
> > > +
> > > +       table[idx].frequency = CPUFREQ_TABLE_END;
> > > +       policy->freq_table = table;
> > > +
> > > +       return 0;
> > > +}
> > > +
> > > +static int virt_cpufreq_cpu_init(struct cpufreq_policy *policy)
> > > +{
> > > +       struct device *cpu_dev;
> > > +       int ret;
> > > +
> > > +       cpu_dev = get_cpu_device(policy->cpu);
> > > +       if (!cpu_dev)
> > > +               return -ENODEV;
> > > +
> > > +       ret = virt_cpufreq_get_freq_info(policy);
> > > +       if (ret) {
> > > +               dev_warn(cpu_dev, "failed to get cpufreq info\n");
> > > +               return ret;
> > > +       }
> > > +
> > > +       ret = virt_cpufreq_get_sharing_cpus(policy);
> > > +       if (ret) {
> > > +               dev_warn(cpu_dev, "failed to get sharing cpumask\n");
> > > +               return ret;
> > > +       }
> > > +
> > > +       /*
> > > +        * To simplify and improve latency of handling frequency requests on
> > > +        * the host side, this ensures that the vCPU thread triggering the MMIO
> > > +        * abort is the same thread whose performance constraints (Ex. uclamp
> > > +        * settings) need to be updated. This simplifies the VMM (Virtual
> > > +        * Machine Manager) having to find the correct vCPU thread and/or
> > > +        * facing permission issues when configuring other threads.
> > > +        */
> > > +       policy->dvfs_possible_from_any_cpu = false;
> > > +       policy->fast_switch_possible = true;
> > > +
> > > +       /*
> > > +        * Using the default SCALE_FREQ_SOURCE_CPUFREQ is insufficient since
> > > +        * the actual physical CPU frequency may not match requested frequency
> > > +        * from the vCPU thread due to frequency update latencies or other
> > > +        * inputs to the physical CPU frequency selection. This additional FIE
> > > +        * source allows for more accurate freq_scale updates and only takes
> > > +        * effect if another FIE source such as AMUs have not been registered.
> > > +        */
> > > +       topology_set_scale_freq_source(&virt_sfd, policy->cpus);
> > > +
> > > +       return 0;
> > > +}
> > > +
> > > +static int virt_cpufreq_cpu_exit(struct cpufreq_policy *policy)
> > > +{
> > > +       topology_clear_scale_freq_source(SCALE_FREQ_SOURCE_VIRT, policy->related_cpus);
> > > +       kfree(policy->freq_table);
> > > +       return 0;
> > > +}
> > > +
> > > +static int virt_cpufreq_online(struct cpufreq_policy *policy)
> > > +{
> > > +       /* Nothing to restore. */
> > > +       return 0;
> > > +}
> > > +
> > > +static int virt_cpufreq_offline(struct cpufreq_policy *policy)
> > > +{
> > > +       /* Dummy offline() to avoid exit() being called and freeing resources. */
> > > +       return 0;
> > > +}
> > > +
> > > +static int virt_cpufreq_verify_policy(struct cpufreq_policy_data *policy)
> > > +{
> > > +       if (policy->freq_table)
> > > +               return cpufreq_frequency_table_verify(policy, policy->freq_table);
> > > +
> > > +       cpufreq_verify_within_cpu_limits(policy);
> > > +       return 0;
> > > +}
> > > +
> > > +static struct cpufreq_driver cpufreq_virt_driver = {
> > > +       .name           = "virt-cpufreq",
> > > +       .init           = virt_cpufreq_cpu_init,
> > > +       .exit           = virt_cpufreq_cpu_exit,
> > > +       .online         = virt_cpufreq_online,
> > > +       .offline        = virt_cpufreq_offline,
> > > +       .verify         = virt_cpufreq_verify_policy,
> > > +       .target         = virt_cpufreq_target,
> > > +       .fast_switch    = virt_cpufreq_fast_switch,
> > > +       .attr           = cpufreq_generic_attr,
> > > +};
> > > +
> > > +static int virt_cpufreq_driver_probe(struct platform_device *pdev)
> > > +{
> > > +       u32 num_perftbl_entries;
> > > +       int ret, cpu;
> > > +
> > > +       base = devm_platform_ioremap_resource(pdev, 0);
> > > +       if (IS_ERR(base))
> > > +               return PTR_ERR(base);
> > > +
> > > +       for_each_possible_cpu(cpu) {
> > > +               num_perftbl_entries = readl_relaxed(base + cpu * PER_CPU_OFFSET +
> > > +                                                   REG_PERFTBL_LEN_OFFSET);
> > > +
> > > +               if (!num_perftbl_entries || num_perftbl_entries > PERFTBL_MAX_ENTRIES)
> > > +                       return -ENODEV;
> > > +
> > > +               per_cpu(perftbl_num_entries, cpu) = num_perftbl_entries;
> > > +       }
> > > +
> > > +       ret = cpufreq_register_driver(&cpufreq_virt_driver);
> > > +       if (ret) {
> > > +               dev_err(&pdev->dev, "Virtual CPUFreq driver failed to register: %d\n", ret);
> > > +               return ret;
> > > +       }
> > > +
> > > +       dev_dbg(&pdev->dev, "Virtual CPUFreq driver initialized\n");
> > > +       return 0;
> > > +}
> > > +
> > > +static int virt_cpufreq_driver_remove(struct platform_device *pdev)
> > > +{
> > > +       cpufreq_unregister_driver(&cpufreq_virt_driver);
> > > +       return 0;
> > > +}
> > > +
> > > +static const struct of_device_id virt_cpufreq_match[] = {
> > > +       { .compatible = "qemu,virtual-cpufreq", .data = NULL},
> > > +       {}
> > > +};
> > > +MODULE_DEVICE_TABLE(of, virt_cpufreq_match);
> > > +
> > > +static struct platform_driver virt_cpufreq_driver = {
> > > +       .probe = virt_cpufreq_driver_probe,
> > > +       .remove = virt_cpufreq_driver_remove,
> > > +       .driver = {
> > > +               .name = "virt-cpufreq",
> > > +               .of_match_table = virt_cpufreq_match,
> > > +       },
> > > +};
> > > +
> > > +static int __init virt_cpufreq_init(void)
> > > +{
> > > +       return platform_driver_register(&virt_cpufreq_driver);
> > > +}
> > > +postcore_initcall(virt_cpufreq_init);
> > > +
> > > +static void __exit virt_cpufreq_exit(void)
> > > +{
> > > +       platform_driver_unregister(&virt_cpufreq_driver);
> > > +}
> > > +module_exit(virt_cpufreq_exit);
> > > +
> > > +MODULE_DESCRIPTION("Virtual cpufreq driver");
> > > +MODULE_LICENSE("GPL");
> > > diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
> > > index b721f360d759..d5d848849408 100644
> > > --- a/include/linux/arch_topology.h
> > > +++ b/include/linux/arch_topology.h
> > > @@ -49,6 +49,7 @@ enum scale_freq_source {
> > >         SCALE_FREQ_SOURCE_CPUFREQ = 0,
> > >         SCALE_FREQ_SOURCE_ARCH,
> > >         SCALE_FREQ_SOURCE_CPPC,
> > > +       SCALE_FREQ_SOURCE_VIRT,
> > >  };
> > >
> > >  struct scale_freq_data {
> > > --
> > > 2.45.0.215.g3402c0e53f-goog
> > >
> >
Marc Zyngier July 10, 2024, 9:05 a.m. UTC | #6
On Fri, 28 Jun 2024 13:51:06 +0100,
Sudeep Holla <sudeep.holla@arm.com> wrote:
> 
> On Mon, May 20, 2024 at 09:30:52PM -0700, David Dai wrote:
> > Introduce a virtualized cpufreq driver for guest kernels to improve
> > performance and power of workloads within VMs.
> >
> > This driver does two main things:
> >
> > 1. Sends the frequency of vCPUs as a hint to the host. The host uses the
> > hint to schedule the vCPU threads and decide physical CPU frequency.
> >
> > 2. If a VM does not support a virtualized FIE(like AMUs), it queries the
> > host CPU frequency by reading a MMIO region of a virtual cpufreq device
> > to update the guest's frequency scaling factor periodically. This enables
> > accurate Per-Entity Load Tracking for tasks running in the guest.
> >
> > +
> > +/*
> > + * CPU0..CPUn
> > + * +-------------+-------------------------------+--------+-------+
> > + * | Register    | Description                   | Offset |   Len |
> > + * +-------------+-------------------------------+--------+-------+
> > + * | cur_perf    | read this register to get     |    0x0 |   0x4 |
> > + * |             | the current perf (integer val |        |       |
> > + * |             | representing perf relative to |        |       |
> > + * |             | max performance)              |        |       |
> > + * |             | that vCPU is running at       |        |       |
> > + * +-------------+-------------------------------+--------+-------+
> > + * | set_perf    | write to this register to set |    0x4 |   0x4 |
> > + * |             | perf value of the vCPU        |        |       |
> > + * +-------------+-------------------------------+--------+-------+
> > + * | perftbl_len | number of entries in perf     |    0x8 |   0x4 |
> > + * |             | table. A single entry in the  |        |       |
> > + * |             | perf table denotes no table   |        |       |
> > + * |             | and the entry contains        |        |       |
> > + * |             | the maximum perf value        |        |       |
> > + * |             | that this vCPU supports.      |        |       |
> > + * |             | The guest can request any     |        |       |
> > + * |             | value between 1 and max perf  |        |       |
> > + * |             | when perftbls are not used.   |        |       |
> > + * +---------------------------------------------+--------+-------+
> > + * | perftbl_sel | write to this register to     |    0xc |   0x4 |
> > + * |             | select perf table entry to    |        |       |
> > + * |             | read from                     |        |       |
> > + * +---------------------------------------------+--------+-------+
> > + * | perftbl_rd  | read this register to get     |   0x10 |   0x4 |
> > + * |             | perf value of the selected    |        |       |
> > + * |             | entry based on perftbl_sel    |        |       |
> > + * +---------------------------------------------+--------+-------+
> > + * | perf_domain | performance domain number     |   0x14 |   0x4 |
> > + * |             | that this vCPU belongs to.    |        |       |
> > + * |             | vCPUs sharing the same perf   |        |       |
> > + * |             | domain number are part of the |        |       |
> > + * |             | same performance domain.      |        |       |
> > + * +-------------+-------------------------------+--------+-------+
> > + */
> 
> I think it is good idea to version this table, so that it gives flexibility
> to update the entries. It is a must if we are getting away with DT. I didn't
> give complete information in my previous response where I agreed with Rafael.
> 
> I am not sure how much feasible it is, but can it be queried via KVM IOCTLs
> to VMM. Just a thought, I am exploring how to make this work even on ACPI
> systems. It is simpler if we neednot rely on DT or ACPI.

KVM should not have to know any of this. This is purely between a
contract (and a pretty weak one) between userspace and the guest.

	M.
diff mbox series

Patch

diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
index 94e55c40970a..9aa86bec5793 100644
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -217,6 +217,20 @@  config CPUFREQ_DT
 
 	  If in doubt, say N.
 
+config CPUFREQ_VIRT
+	tristate "Virtual cpufreq driver"
+	depends on OF && GENERIC_ARCH_TOPOLOGY
+	help
+	  This adds a virtualized cpufreq driver for guest kernels that
+	  read/writes to a MMIO region for a virtualized cpufreq device to
+	  communicate with the host. It sends performance requests to the host
+	  which gets used as a hint to schedule vCPU threads and select CPU
+	  frequency. If a VM does not support a virtualized FIE such as AMUs,
+	  it updates the frequency scaling factor by polling host CPU frequency
+	  to enable accurate Per-Entity Load Tracking for tasks running in the guest.
+
+	  If in doubt, say N.
+
 config CPUFREQ_DT_PLATDEV
 	tristate "Generic DT based cpufreq platdev driver"
 	depends on OF
diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile
index 8d141c71b016..eb72ecdc24db 100644
--- a/drivers/cpufreq/Makefile
+++ b/drivers/cpufreq/Makefile
@@ -16,6 +16,7 @@  obj-$(CONFIG_CPU_FREQ_GOV_ATTR_SET)	+= cpufreq_governor_attr_set.o
 
 obj-$(CONFIG_CPUFREQ_DT)		+= cpufreq-dt.o
 obj-$(CONFIG_CPUFREQ_DT_PLATDEV)	+= cpufreq-dt-platdev.o
+obj-$(CONFIG_CPUFREQ_VIRT)		+= virtual-cpufreq.o
 
 # Traces
 CFLAGS_amd-pstate-trace.o               := -I$(src)
diff --git a/drivers/cpufreq/virtual-cpufreq.c b/drivers/cpufreq/virtual-cpufreq.c
new file mode 100644
index 000000000000..59ce2bda3913
--- /dev/null
+++ b/drivers/cpufreq/virtual-cpufreq.c
@@ -0,0 +1,335 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2024 Google LLC
+ */
+
+#include <linux/arch_topology.h>
+#include <linux/cpufreq.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of_address.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+
+/*
+ * CPU0..CPUn
+ * +-------------+-------------------------------+--------+-------+
+ * | Register    | Description                   | Offset |   Len |
+ * +-------------+-------------------------------+--------+-------+
+ * | cur_perf    | read this register to get     |    0x0 |   0x4 |
+ * |             | the current perf (integer val |        |       |
+ * |             | representing perf relative to |        |       |
+ * |             | max performance)              |        |       |
+ * |             | that vCPU is running at       |        |       |
+ * +-------------+-------------------------------+--------+-------+
+ * | set_perf    | write to this register to set |    0x4 |   0x4 |
+ * |             | perf value of the vCPU        |        |       |
+ * +-------------+-------------------------------+--------+-------+
+ * | perftbl_len | number of entries in perf     |    0x8 |   0x4 |
+ * |             | table. A single entry in the  |        |       |
+ * |             | perf table denotes no table   |        |       |
+ * |             | and the entry contains        |        |       |
+ * |             | the maximum perf value        |        |       |
+ * |             | that this vCPU supports.      |        |       |
+ * |             | The guest can request any     |        |       |
+ * |             | value between 1 and max perf  |        |       |
+ * |             | when perftbls are not used.   |        |       |
+ * +---------------------------------------------+--------+-------+
+ * | perftbl_sel | write to this register to     |    0xc |   0x4 |
+ * |             | select perf table entry to    |        |       |
+ * |             | read from                     |        |       |
+ * +---------------------------------------------+--------+-------+
+ * | perftbl_rd  | read this register to get     |   0x10 |   0x4 |
+ * |             | perf value of the selected    |        |       |
+ * |             | entry based on perftbl_sel    |        |       |
+ * +---------------------------------------------+--------+-------+
+ * | perf_domain | performance domain number     |   0x14 |   0x4 |
+ * |             | that this vCPU belongs to.    |        |       |
+ * |             | vCPUs sharing the same perf   |        |       |
+ * |             | domain number are part of the |        |       |
+ * |             | same performance domain.      |        |       |
+ * +-------------+-------------------------------+--------+-------+
+ */
+
+#define REG_CUR_PERF_STATE_OFFSET 0x0
+#define REG_SET_PERF_STATE_OFFSET 0x4
+#define REG_PERFTBL_LEN_OFFSET 0x8
+#define REG_PERFTBL_SEL_OFFSET 0xc
+#define REG_PERFTBL_RD_OFFSET 0x10
+#define REG_PERF_DOMAIN_OFFSET 0x14
+#define PER_CPU_OFFSET 0x1000
+
+#define PERFTBL_MAX_ENTRIES 64U
+
+static void __iomem *base;
+static DEFINE_PER_CPU(u32, perftbl_num_entries);
+
+static void virt_scale_freq_tick(void)
+{
+	int cpu = smp_processor_id();
+	u32 max_freq = (u32)cpufreq_get_hw_max_freq(cpu);
+	u64 cur_freq;
+	unsigned long scale;
+
+	cur_freq = (u64)readl_relaxed(base + cpu * PER_CPU_OFFSET
+			+ REG_CUR_PERF_STATE_OFFSET);
+
+	cur_freq <<= SCHED_CAPACITY_SHIFT;
+	scale = (unsigned long)div_u64(cur_freq, max_freq);
+	scale = min(scale, SCHED_CAPACITY_SCALE);
+
+	this_cpu_write(arch_freq_scale, scale);
+}
+
+static struct scale_freq_data virt_sfd = {
+	.source = SCALE_FREQ_SOURCE_VIRT,
+	.set_freq_scale = virt_scale_freq_tick,
+};
+
+static unsigned int virt_cpufreq_set_perf(struct cpufreq_policy *policy,
+					  unsigned int target_freq)
+{
+	writel_relaxed(target_freq,
+		       base + policy->cpu * PER_CPU_OFFSET + REG_SET_PERF_STATE_OFFSET);
+	return 0;
+}
+
+static unsigned int virt_cpufreq_fast_switch(struct cpufreq_policy *policy,
+					     unsigned int target_freq)
+{
+	virt_cpufreq_set_perf(policy, target_freq);
+	return target_freq;
+}
+
+static u32 virt_cpufreq_get_perftbl_entry(int cpu, u32 idx)
+{
+	writel_relaxed(idx, base + cpu * PER_CPU_OFFSET +
+		       REG_PERFTBL_SEL_OFFSET);
+	return readl_relaxed(base + cpu * PER_CPU_OFFSET +
+			     REG_PERFTBL_RD_OFFSET);
+}
+
+static int virt_cpufreq_target(struct cpufreq_policy *policy,
+			       unsigned int target_freq,
+			       unsigned int relation)
+{
+	struct cpufreq_freqs freqs;
+	int ret = 0;
+
+	freqs.old = policy->cur;
+	freqs.new = target_freq;
+
+	cpufreq_freq_transition_begin(policy, &freqs);
+	ret = virt_cpufreq_set_perf(policy, target_freq);
+	cpufreq_freq_transition_end(policy, &freqs, ret != 0);
+
+	return ret;
+}
+
+static int virt_cpufreq_get_sharing_cpus(struct cpufreq_policy *policy)
+{
+	u32 cur_perf_domain, perf_domain;
+	struct device *cpu_dev;
+	int cpu;
+
+	cur_perf_domain = readl_relaxed(base + policy->cpu *
+					PER_CPU_OFFSET + REG_PERF_DOMAIN_OFFSET);
+
+	for_each_possible_cpu(cpu) {
+		cpu_dev = get_cpu_device(cpu);
+		if (!cpu_dev)
+			continue;
+
+		perf_domain = readl_relaxed(base + cpu *
+					    PER_CPU_OFFSET + REG_PERF_DOMAIN_OFFSET);
+
+		if (perf_domain == cur_perf_domain)
+			cpumask_set_cpu(cpu, policy->cpus);
+	}
+
+	return 0;
+}
+
+static int virt_cpufreq_get_freq_info(struct cpufreq_policy *policy)
+{
+	struct cpufreq_frequency_table *table;
+	u32 num_perftbl_entries, idx;
+
+	num_perftbl_entries = per_cpu(perftbl_num_entries, policy->cpu);
+
+	if (num_perftbl_entries == 1) {
+		policy->cpuinfo.min_freq = 1;
+		policy->cpuinfo.max_freq = virt_cpufreq_get_perftbl_entry(policy->cpu, 0);
+
+		policy->min = policy->cpuinfo.min_freq;
+		policy->max = policy->cpuinfo.max_freq;
+
+		policy->cur = policy->max;
+		return 0;
+	}
+
+	table = kcalloc(num_perftbl_entries + 1, sizeof(*table), GFP_KERNEL);
+	if (!table)
+		return -ENOMEM;
+
+	for (idx = 0; idx < num_perftbl_entries; idx++)
+		table[idx].frequency = virt_cpufreq_get_perftbl_entry(policy->cpu, idx);
+
+	table[idx].frequency = CPUFREQ_TABLE_END;
+	policy->freq_table = table;
+
+	return 0;
+}
+
+static int virt_cpufreq_cpu_init(struct cpufreq_policy *policy)
+{
+	struct device *cpu_dev;
+	int ret;
+
+	cpu_dev = get_cpu_device(policy->cpu);
+	if (!cpu_dev)
+		return -ENODEV;
+
+	ret = virt_cpufreq_get_freq_info(policy);
+	if (ret) {
+		dev_warn(cpu_dev, "failed to get cpufreq info\n");
+		return ret;
+	}
+
+	ret = virt_cpufreq_get_sharing_cpus(policy);
+	if (ret) {
+		dev_warn(cpu_dev, "failed to get sharing cpumask\n");
+		return ret;
+	}
+
+	/*
+	 * To simplify and improve latency of handling frequency requests on
+	 * the host side, this ensures that the vCPU thread triggering the MMIO
+	 * abort is the same thread whose performance constraints (Ex. uclamp
+	 * settings) need to be updated. This simplifies the VMM (Virtual
+	 * Machine Manager) having to find the correct vCPU thread and/or
+	 * facing permission issues when configuring other threads.
+	 */
+	policy->dvfs_possible_from_any_cpu = false;
+	policy->fast_switch_possible = true;
+
+	/*
+	 * Using the default SCALE_FREQ_SOURCE_CPUFREQ is insufficient since
+	 * the actual physical CPU frequency may not match requested frequency
+	 * from the vCPU thread due to frequency update latencies or other
+	 * inputs to the physical CPU frequency selection. This additional FIE
+	 * source allows for more accurate freq_scale updates and only takes
+	 * effect if another FIE source such as AMUs have not been registered.
+	 */
+	topology_set_scale_freq_source(&virt_sfd, policy->cpus);
+
+	return 0;
+}
+
+static int virt_cpufreq_cpu_exit(struct cpufreq_policy *policy)
+{
+	topology_clear_scale_freq_source(SCALE_FREQ_SOURCE_VIRT, policy->related_cpus);
+	kfree(policy->freq_table);
+	return 0;
+}
+
+static int virt_cpufreq_online(struct cpufreq_policy *policy)
+{
+	/* Nothing to restore. */
+	return 0;
+}
+
+static int virt_cpufreq_offline(struct cpufreq_policy *policy)
+{
+	/* Dummy offline() to avoid exit() being called and freeing resources. */
+	return 0;
+}
+
+static int virt_cpufreq_verify_policy(struct cpufreq_policy_data *policy)
+{
+	if (policy->freq_table)
+		return cpufreq_frequency_table_verify(policy, policy->freq_table);
+
+	cpufreq_verify_within_cpu_limits(policy);
+	return 0;
+}
+
+static struct cpufreq_driver cpufreq_virt_driver = {
+	.name		= "virt-cpufreq",
+	.init		= virt_cpufreq_cpu_init,
+	.exit		= virt_cpufreq_cpu_exit,
+	.online         = virt_cpufreq_online,
+	.offline        = virt_cpufreq_offline,
+	.verify		= virt_cpufreq_verify_policy,
+	.target		= virt_cpufreq_target,
+	.fast_switch	= virt_cpufreq_fast_switch,
+	.attr		= cpufreq_generic_attr,
+};
+
+static int virt_cpufreq_driver_probe(struct platform_device *pdev)
+{
+	u32 num_perftbl_entries;
+	int ret, cpu;
+
+	base = devm_platform_ioremap_resource(pdev, 0);
+	if (IS_ERR(base))
+		return PTR_ERR(base);
+
+	for_each_possible_cpu(cpu) {
+		num_perftbl_entries = readl_relaxed(base + cpu * PER_CPU_OFFSET +
+						    REG_PERFTBL_LEN_OFFSET);
+
+		if (!num_perftbl_entries || num_perftbl_entries > PERFTBL_MAX_ENTRIES)
+			return -ENODEV;
+
+		per_cpu(perftbl_num_entries, cpu) = num_perftbl_entries;
+	}
+
+	ret = cpufreq_register_driver(&cpufreq_virt_driver);
+	if (ret) {
+		dev_err(&pdev->dev, "Virtual CPUFreq driver failed to register: %d\n", ret);
+		return ret;
+	}
+
+	dev_dbg(&pdev->dev, "Virtual CPUFreq driver initialized\n");
+	return 0;
+}
+
+static int virt_cpufreq_driver_remove(struct platform_device *pdev)
+{
+	cpufreq_unregister_driver(&cpufreq_virt_driver);
+	return 0;
+}
+
+static const struct of_device_id virt_cpufreq_match[] = {
+	{ .compatible = "qemu,virtual-cpufreq", .data = NULL},
+	{}
+};
+MODULE_DEVICE_TABLE(of, virt_cpufreq_match);
+
+static struct platform_driver virt_cpufreq_driver = {
+	.probe = virt_cpufreq_driver_probe,
+	.remove = virt_cpufreq_driver_remove,
+	.driver = {
+		.name = "virt-cpufreq",
+		.of_match_table = virt_cpufreq_match,
+	},
+};
+
+static int __init virt_cpufreq_init(void)
+{
+	return platform_driver_register(&virt_cpufreq_driver);
+}
+postcore_initcall(virt_cpufreq_init);
+
+static void __exit virt_cpufreq_exit(void)
+{
+	platform_driver_unregister(&virt_cpufreq_driver);
+}
+module_exit(virt_cpufreq_exit);
+
+MODULE_DESCRIPTION("Virtual cpufreq driver");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index b721f360d759..d5d848849408 100644
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -49,6 +49,7 @@  enum scale_freq_source {
 	SCALE_FREQ_SOURCE_CPUFREQ = 0,
 	SCALE_FREQ_SOURCE_ARCH,
 	SCALE_FREQ_SOURCE_CPPC,
+	SCALE_FREQ_SOURCE_VIRT,
 };
 
 struct scale_freq_data {