Message ID | 20230731174613.4133167-3-davidai@google.com (mailing list archive) |
---|---|
State | RFC, archived |
Headers | show |
Series | Improve VM CPUfreq and task placement behavior | expand |
On 7/31/23 10:46, David Dai wrote: > diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig > index f429b9b37b76..3977ca796747 100644 > --- a/drivers/cpufreq/Kconfig > +++ b/drivers/cpufreq/Kconfig > @@ -217,6 +217,21 @@ config CPUFREQ_DT > > If in doubt, say N. > > +config CPUFREQ_VIRT > + tristate "Virtual cpufreq driver" > + depends on OF > + select PM_OPP > + help The 4 lines above should be indented with one tab (not 8 spaces). > + This adds a virtualized cpufreq driver for guest kernels that > + read/writes to a MMIO region for a virtualized cpufreq device to reads/writes to an MMIO region > + communicate with the host. It sends frequency updates to the host > + which gets used as a hint to schedule vCPU threads and select CPU > + frequency. If a VM does not support a virtualized FIE such as AMUs, > + it updates the frequency scaling factor by polling host CPU frequency > + to enable accurate Per-Entity Load Tracking for tasks running in the guest. > + > + If in doubt, say N.
Hi Randy, Thanks for reviewing, On Mon, Jul 31, 2023 at 3:02 PM Randy Dunlap <rdunlap@infradead.org> wrote: > > > > On 7/31/23 10:46, David Dai wrote: > > diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig > > index f429b9b37b76..3977ca796747 100644 > > --- a/drivers/cpufreq/Kconfig > > +++ b/drivers/cpufreq/Kconfig > > @@ -217,6 +217,21 @@ config CPUFREQ_DT > > > > If in doubt, say N. > > > > +config CPUFREQ_VIRT > > + tristate "Virtual cpufreq driver" > > + depends on OF > > + select PM_OPP > > + help > > The 4 lines above should be indented with one tab (not 8 spaces). Ok. > > > + This adds a virtualized cpufreq driver for guest kernels that > > + read/writes to a MMIO region for a virtualized cpufreq device to > > reads/writes to an MMIO region Will fix these, thanks! David > > > + communicate with the host. It sends frequency updates to the host > > + which gets used as a hint to schedule vCPU threads and select CPU > > + frequency. If a VM does not support a virtualized FIE such as AMUs, > > + it updates the frequency scaling factor by polling host CPU frequency > > + to enable accurate Per-Entity Load Tracking for tasks running in the guest. > > + > > + If in doubt, say N. > > -- > ~Randy
On 31-07-23, 10:46, David Dai wrote: > diff --git a/drivers/cpufreq/virtual-cpufreq.c b/drivers/cpufreq/virtual-cpufreq.c > new file mode 100644 > index 000000000000..66b0fd9b821c > --- /dev/null > +++ b/drivers/cpufreq/virtual-cpufreq.c > @@ -0,0 +1,237 @@ > +// SPDX-License-Identifier: GPL-2.0-only > +/* > + * Copyright (C) 2023 Google LLC > + */ > + > +#include <linux/arch_topology.h> > +#include <linux/cpufreq.h> > +#include <linux/init.h> > +#include <linux/sched.h> > +#include <linux/kernel.h> > +#include <linux/module.h> > +#include <linux/of_address.h> > +#include <linux/of_platform.h> > +#include <linux/pm_opp.h> > +#include <linux/slab.h> > + > +#define REG_CUR_FREQ_OFFSET 0x0 > +#define REG_SET_FREQ_OFFSET 0x4 > +#define PER_CPU_OFFSET 0x8 > + > +struct virt_cpufreq_ops { > + void (*set_freq)(struct cpufreq_policy *policy, u32 freq); > + u32 (*get_freq)(struct cpufreq_policy *policy); > +}; Since you only have one implementation currently, this isn't really required. Keep the data as NULL in `virt_cpufreq_match` and use writel/readl directly. This can be updated if we need more implementations later. > +struct virt_cpufreq_drv_data { > + void __iomem *base; > + const struct virt_cpufreq_ops *ops; > +}; > + > +static void virt_cpufreq_set_freq(struct cpufreq_policy *policy, u32 freq) > +{ > + struct virt_cpufreq_drv_data *data = policy->driver_data; > + > + writel_relaxed(freq, data->base + policy->cpu * PER_CPU_OFFSET > + + REG_SET_FREQ_OFFSET); > +} > + > +static u32 virt_cpufreq_get_freq(struct cpufreq_policy *policy) > +{ > + struct virt_cpufreq_drv_data *data = policy->driver_data; > + > + return readl_relaxed(data->base + policy->cpu * PER_CPU_OFFSET > + + REG_CUR_FREQ_OFFSET); This doesn't look properly aligned. Please run checkpatch (--strict (optional)). > +} > + > +static const struct virt_cpufreq_ops virt_freq_ops = { > + .set_freq = virt_cpufreq_set_freq, > + .get_freq = virt_cpufreq_get_freq, > +}; > + > +static void virt_scale_freq_tick(void) > +{ > + struct cpufreq_policy *policy = cpufreq_cpu_get(smp_processor_id()); > + struct virt_cpufreq_drv_data *data = policy->driver_data; > + u32 max_freq = (u32)policy->cpuinfo.max_freq; > + u64 cur_freq; > + u64 scale; > + > + cpufreq_cpu_put(policy); > + > + cur_freq = (u64)data->ops->get_freq(policy); > + cur_freq <<= SCHED_CAPACITY_SHIFT; > + scale = div_u64(cur_freq, max_freq); > + > + this_cpu_write(arch_freq_scale, (unsigned long)scale); > +} > + > +static struct scale_freq_data virt_sfd = { > + .source = SCALE_FREQ_SOURCE_VIRT, > + .set_freq_scale = virt_scale_freq_tick, > +}; > + > +static unsigned int virt_cpufreq_set_perf(struct cpufreq_policy *policy) > +{ > + struct virt_cpufreq_drv_data *data = policy->driver_data; > + /* > + * Use cached frequency to avoid rounding to freq table entries > + * and undo 25% frequency boost applied by schedutil. > + */ > + u32 freq = mult_frac(policy->cached_target_freq, 80, 100); > + > + data->ops->set_freq(policy, freq); > + return 0; > +} > + > +static unsigned int virt_cpufreq_fast_switch(struct cpufreq_policy *policy, > + unsigned int target_freq) > +{ > + virt_cpufreq_set_perf(policy); > + return target_freq; > +} > + > +static int virt_cpufreq_target_index(struct cpufreq_policy *policy, > + unsigned int index) > +{ > + return virt_cpufreq_set_perf(policy); > +} > + > +static int virt_cpufreq_cpu_init(struct cpufreq_policy *policy) > +{ > + struct virt_cpufreq_drv_data *drv_data = cpufreq_get_driver_data(); > + struct cpufreq_frequency_table *table; > + struct device *cpu_dev; > + int ret; > + > + cpu_dev = get_cpu_device(policy->cpu); > + if (!cpu_dev) > + return -ENODEV; > + > + ret = dev_pm_opp_of_add_table(cpu_dev); > + if (ret) > + return ret; > + > + ret = dev_pm_opp_get_opp_count(cpu_dev); > + if (ret <= 0) { > + dev_err(cpu_dev, "OPP table can't be empty\n"); > + return -ENODEV; > + } > + > + ret = dev_pm_opp_init_cpufreq_table(cpu_dev, &table); > + if (ret) { > + dev_err(cpu_dev, "failed to init cpufreq table: %d\n", ret); > + return ret; > + } > + > + policy->freq_table = table; > + policy->dvfs_possible_from_any_cpu = false; Why can't we call virt_cpufreq_target_index() from any CPU ? > + policy->fast_switch_possible = true; > + policy->driver_data = drv_data; > + > + /* > + * Only takes effect if another FIE source such as AMUs > + * have not been registered. > + */ > + topology_set_scale_freq_source(&virt_sfd, policy->cpus); > + > + return 0; > + > +} > + > +static int virt_cpufreq_cpu_exit(struct cpufreq_policy *policy) > +{ > + topology_clear_scale_freq_source(SCALE_FREQ_SOURCE_VIRT, policy->related_cpus); > + kfree(policy->freq_table); > + policy->freq_table = NULL; > + return 0; > +} > + > +static int virt_cpufreq_online(struct cpufreq_policy *policy) > +{ > + /* Nothing to restore. */ > + return 0; > +} > + > +static int virt_cpufreq_offline(struct cpufreq_policy *policy) > +{ > + /* Dummy offline() to avoid exit() being called and freeing resources. */ > + return 0; > +} > + > +static struct cpufreq_driver cpufreq_virt_driver = { > + .name = "virt-cpufreq", > + .init = virt_cpufreq_cpu_init, > + .exit = virt_cpufreq_cpu_exit, > + .online = virt_cpufreq_online, > + .offline = virt_cpufreq_offline, > + .verify = cpufreq_generic_frequency_table_verify, > + .target_index = virt_cpufreq_target_index, > + .fast_switch = virt_cpufreq_fast_switch, > + .attr = cpufreq_generic_attr, > +}; > + > +static int virt_cpufreq_driver_probe(struct platform_device *pdev) > +{ > + int ret; > + struct virt_cpufreq_drv_data *drv_data; > + > + drv_data = devm_kzalloc(&pdev->dev, sizeof(*drv_data), GFP_KERNEL); > + if (!drv_data) > + return -ENOMEM; > + > + drv_data->ops = of_device_get_match_data(&pdev->dev); > + if (!drv_data->ops) > + return -EINVAL; > + > + drv_data->base = devm_platform_ioremap_resource(pdev, 0); > + if (IS_ERR(drv_data->base)) > + return PTR_ERR(drv_data->base); > + > + cpufreq_virt_driver.driver_data = drv_data; > + > + ret = cpufreq_register_driver(&cpufreq_virt_driver); > + if (ret) { > + dev_err(&pdev->dev, "Virtual CPUFreq driver failed to register: %d\n", ret); > + return ret; > + } > + > + dev_dbg(&pdev->dev, "Virtual CPUFreq driver initialized\n"); > + return 0; > +} > + > +static int virt_cpufreq_driver_remove(struct platform_device *pdev) > +{ > + cpufreq_unregister_driver(&cpufreq_virt_driver); > + return 0; > +} > + > +static const struct of_device_id virt_cpufreq_match[] = { > + { .compatible = "virtual,cpufreq", .data = &virt_freq_ops}, > + {} > +}; > +MODULE_DEVICE_TABLE(of, virt_cpufreq_match); > + > +static struct platform_driver virt_cpufreq_driver = { > + .probe = virt_cpufreq_driver_probe, > + .remove = virt_cpufreq_driver_remove, > + .driver = { > + .name = "virt-cpufreq", > + .of_match_table = virt_cpufreq_match, > + }, > +}; > + > +static int __init virt_cpufreq_init(void) > +{ > + return platform_driver_register(&virt_cpufreq_driver); > +} > +postcore_initcall(virt_cpufreq_init); Why do you want to use this and not module_init() ? Then you can simply use `module_platform_driver()`. > + > +static void __exit virt_cpufreq_exit(void) > +{ > + platform_driver_unregister(&virt_cpufreq_driver); > +} > +module_exit(virt_cpufreq_exit); > + > +MODULE_DESCRIPTION("Virtual cpufreq driver"); > +MODULE_LICENSE("GPL");
Hi David, On Monday 31 Jul 2023 at 10:46:09 (-0700), David Dai wrote: > +static unsigned int virt_cpufreq_set_perf(struct cpufreq_policy *policy) > +{ > + struct virt_cpufreq_drv_data *data = policy->driver_data; > + /* > + * Use cached frequency to avoid rounding to freq table entries > + * and undo 25% frequency boost applied by schedutil. > + */ The VMM would be a better place for this scaling I think, the driver can't/shouldn't make assumptions about the governor it is running with given that this is a guest userspace decision essentially. IIRC the fast_switch() path is only used by schedutil, so one could probably make a case to scale things there, but it'd be inconsistent with the "slow" switch case, and would create a fragile dependency, so it's probably not worth pursuing. > + u32 freq = mult_frac(policy->cached_target_freq, 80, 100); > + > + data->ops->set_freq(policy, freq); > + return 0; > +} Thanks, Quentin
On Tuesday 01 Aug 2023 at 09:45:22 (+0000), Quentin Perret wrote: > Hi David, > > On Monday 31 Jul 2023 at 10:46:09 (-0700), David Dai wrote: > > +static unsigned int virt_cpufreq_set_perf(struct cpufreq_policy *policy) > > +{ > > + struct virt_cpufreq_drv_data *data = policy->driver_data; > > + /* > > + * Use cached frequency to avoid rounding to freq table entries > > + * and undo 25% frequency boost applied by schedutil. > > + */ > > The VMM would be a better place for this scaling I think, the driver > can't/shouldn't make assumptions about the governor it is running with > given that this is a guest userspace decision essentially. > > IIRC the fast_switch() path is only used by schedutil, so one could > probably make a case to scale things there, but it'd be inconsistent > with the "slow" switch case, and would create a fragile dependency, so > it's probably not worth pursuing. Alternatively we could make the schedutil margin configurable via the cmdline or something along those lines, so we can set it to 0 in the guest and avoid the issue entirely. Some partners have been asking for this IIRC , so I suspect there would be interest from other parties. Thanks, Quentin
On Tue, Aug 1, 2023 at 2:36 AM Viresh Kumar <viresh.kumar@linaro.org> wrote: > > On 31-07-23, 10:46, David Dai wrote: > > diff --git a/drivers/cpufreq/virtual-cpufreq.c b/drivers/cpufreq/virtual-cpufreq.c > > new file mode 100644 > > index 000000000000..66b0fd9b821c > > --- /dev/null > > +++ b/drivers/cpufreq/virtual-cpufreq.c > > @@ -0,0 +1,237 @@ > > +// SPDX-License-Identifier: GPL-2.0-only > > +/* > > + * Copyright (C) 2023 Google LLC > > + */ > > + > > +#include <linux/arch_topology.h> > > +#include <linux/cpufreq.h> > > +#include <linux/init.h> > > +#include <linux/sched.h> > > +#include <linux/kernel.h> > > +#include <linux/module.h> > > +#include <linux/of_address.h> > > +#include <linux/of_platform.h> > > +#include <linux/pm_opp.h> > > +#include <linux/slab.h> > > + > > +#define REG_CUR_FREQ_OFFSET 0x0 > > +#define REG_SET_FREQ_OFFSET 0x4 > > +#define PER_CPU_OFFSET 0x8 > > + > > +struct virt_cpufreq_ops { > > + void (*set_freq)(struct cpufreq_policy *policy, u32 freq); > > + u32 (*get_freq)(struct cpufreq_policy *policy); > > +}; > > Since you only have one implementation currently, this isn't really > required. Keep the data as NULL in `virt_cpufreq_match` and use > writel/readl directly. > > This can be updated if we need more implementations later. > > > +struct virt_cpufreq_drv_data { > > + void __iomem *base; > > + const struct virt_cpufreq_ops *ops; > > +}; > > + > > +static void virt_cpufreq_set_freq(struct cpufreq_policy *policy, u32 freq) > > +{ > > + struct virt_cpufreq_drv_data *data = policy->driver_data; > > + > > + writel_relaxed(freq, data->base + policy->cpu * PER_CPU_OFFSET > > + + REG_SET_FREQ_OFFSET); > > +} > > + > > +static u32 virt_cpufreq_get_freq(struct cpufreq_policy *policy) > > +{ > > + struct virt_cpufreq_drv_data *data = policy->driver_data; > > + > > + return readl_relaxed(data->base + policy->cpu * PER_CPU_OFFSET > > + + REG_CUR_FREQ_OFFSET); > > This doesn't look properly aligned. Please run checkpatch (--strict > (optional)). > > > +} > > + > > +static const struct virt_cpufreq_ops virt_freq_ops = { > > + .set_freq = virt_cpufreq_set_freq, > > + .get_freq = virt_cpufreq_get_freq, > > +}; > > + > > +static void virt_scale_freq_tick(void) > > +{ > > + struct cpufreq_policy *policy = cpufreq_cpu_get(smp_processor_id()); > > + struct virt_cpufreq_drv_data *data = policy->driver_data; > > + u32 max_freq = (u32)policy->cpuinfo.max_freq; > > + u64 cur_freq; > > + u64 scale; > > + > > + cpufreq_cpu_put(policy); > > + > > + cur_freq = (u64)data->ops->get_freq(policy); > > + cur_freq <<= SCHED_CAPACITY_SHIFT; > > + scale = div_u64(cur_freq, max_freq); > > + > > + this_cpu_write(arch_freq_scale, (unsigned long)scale); > > +} > > + > > +static struct scale_freq_data virt_sfd = { > > + .source = SCALE_FREQ_SOURCE_VIRT, > > + .set_freq_scale = virt_scale_freq_tick, > > +}; > > + > > +static unsigned int virt_cpufreq_set_perf(struct cpufreq_policy *policy) > > +{ > > + struct virt_cpufreq_drv_data *data = policy->driver_data; > > + /* > > + * Use cached frequency to avoid rounding to freq table entries > > + * and undo 25% frequency boost applied by schedutil. > > + */ > > + u32 freq = mult_frac(policy->cached_target_freq, 80, 100); > > + > > + data->ops->set_freq(policy, freq); > > + return 0; > > +} > > + > > +static unsigned int virt_cpufreq_fast_switch(struct cpufreq_policy *policy, > > + unsigned int target_freq) > > +{ > > + virt_cpufreq_set_perf(policy); > > + return target_freq; > > +} > > + > > +static int virt_cpufreq_target_index(struct cpufreq_policy *policy, > > + unsigned int index) > > +{ > > + return virt_cpufreq_set_perf(policy); > > +} > > + > > +static int virt_cpufreq_cpu_init(struct cpufreq_policy *policy) > > +{ > > + struct virt_cpufreq_drv_data *drv_data = cpufreq_get_driver_data(); > > + struct cpufreq_frequency_table *table; > > + struct device *cpu_dev; > > + int ret; > > + > > + cpu_dev = get_cpu_device(policy->cpu); > > + if (!cpu_dev) > > + return -ENODEV; > > + > > + ret = dev_pm_opp_of_add_table(cpu_dev); > > + if (ret) > > + return ret; > > + > > + ret = dev_pm_opp_get_opp_count(cpu_dev); > > + if (ret <= 0) { > > + dev_err(cpu_dev, "OPP table can't be empty\n"); > > + return -ENODEV; > > + } > > + > > + ret = dev_pm_opp_init_cpufreq_table(cpu_dev, &table); > > + if (ret) { > > + dev_err(cpu_dev, "failed to init cpufreq table: %d\n", ret); > > + return ret; > > + } > > + > > + policy->freq_table = table; > > + policy->dvfs_possible_from_any_cpu = false; > > Why can't we call virt_cpufreq_target_index() from any CPU ? This is mainly an optimization to reduce the latency of the "frequency change" which has a huge impact on the performance (as can be seen from the numbers in the cover letter). Setting this flag means that the vCPU thread triggering the MMIO handling (on the host side) is the thread on which the host needs to apply any uclamp settings. So this avoids the VMM having to look up the right vCPU thread that corresponds to this CPU, and any permissions issues wrt setting another threads uclamp, etc. This becomes even more important if/when BPF support is added for handling simple MMIO read/writes. Will Deacon has been working on the eBPF part[1] and IIUC, not setting this flag adds a lot of extra overhead on the BPF side. So, yeah, this flag is very helpful wrt reducing latency/simplifying host side implementation and that's why we want it here. [1] - https://kvm-forum.qemu.org/2023/talk/AZKC77/ Thanks, Saravana > > > + policy->fast_switch_possible = true; > > + policy->driver_data = drv_data; > > + > > + /* > > + * Only takes effect if another FIE source such as AMUs > > + * have not been registered. > > + */ > > + topology_set_scale_freq_source(&virt_sfd, policy->cpus); > > + > > + return 0; > > + > > +} > > + > > +static int virt_cpufreq_cpu_exit(struct cpufreq_policy *policy) > > +{ > > + topology_clear_scale_freq_source(SCALE_FREQ_SOURCE_VIRT, policy->related_cpus); > > + kfree(policy->freq_table); > > + policy->freq_table = NULL; > > + return 0; > > +} > > + > > +static int virt_cpufreq_online(struct cpufreq_policy *policy) > > +{ > > + /* Nothing to restore. */ > > + return 0; > > +} > > + > > +static int virt_cpufreq_offline(struct cpufreq_policy *policy) > > +{ > > + /* Dummy offline() to avoid exit() being called and freeing resources. */ > > + return 0; > > +} > > + > > +static struct cpufreq_driver cpufreq_virt_driver = { > > + .name = "virt-cpufreq", > > + .init = virt_cpufreq_cpu_init, > > + .exit = virt_cpufreq_cpu_exit, > > + .online = virt_cpufreq_online, > > + .offline = virt_cpufreq_offline, > > + .verify = cpufreq_generic_frequency_table_verify, > > + .target_index = virt_cpufreq_target_index, > > + .fast_switch = virt_cpufreq_fast_switch, > > + .attr = cpufreq_generic_attr, > > +}; > > + > > +static int virt_cpufreq_driver_probe(struct platform_device *pdev) > > +{ > > + int ret; > > + struct virt_cpufreq_drv_data *drv_data; > > + > > + drv_data = devm_kzalloc(&pdev->dev, sizeof(*drv_data), GFP_KERNEL); > > + if (!drv_data) > > + return -ENOMEM; > > + > > + drv_data->ops = of_device_get_match_data(&pdev->dev); > > + if (!drv_data->ops) > > + return -EINVAL; > > + > > + drv_data->base = devm_platform_ioremap_resource(pdev, 0); > > + if (IS_ERR(drv_data->base)) > > + return PTR_ERR(drv_data->base); > > + > > + cpufreq_virt_driver.driver_data = drv_data; > > + > > + ret = cpufreq_register_driver(&cpufreq_virt_driver); > > + if (ret) { > > + dev_err(&pdev->dev, "Virtual CPUFreq driver failed to register: %d\n", ret); > > + return ret; > > + } > > + > > + dev_dbg(&pdev->dev, "Virtual CPUFreq driver initialized\n"); > > + return 0; > > +} > > + > > +static int virt_cpufreq_driver_remove(struct platform_device *pdev) > > +{ > > + cpufreq_unregister_driver(&cpufreq_virt_driver); > > + return 0; > > +} > > + > > +static const struct of_device_id virt_cpufreq_match[] = { > > + { .compatible = "virtual,cpufreq", .data = &virt_freq_ops}, > > + {} > > +}; > > +MODULE_DEVICE_TABLE(of, virt_cpufreq_match); > > + > > +static struct platform_driver virt_cpufreq_driver = { > > + .probe = virt_cpufreq_driver_probe, > > + .remove = virt_cpufreq_driver_remove, > > + .driver = { > > + .name = "virt-cpufreq", > > + .of_match_table = virt_cpufreq_match, > > + }, > > +}; > > + > > +static int __init virt_cpufreq_init(void) > > +{ > > + return platform_driver_register(&virt_cpufreq_driver); > > +} > > +postcore_initcall(virt_cpufreq_init); > > Why do you want to use this and not module_init() ? Then you can > simply use `module_platform_driver()`. > > > + > > +static void __exit virt_cpufreq_exit(void) > > +{ > > + platform_driver_unregister(&virt_cpufreq_driver); > > +} > > +module_exit(virt_cpufreq_exit); > > + > > +MODULE_DESCRIPTION("Virtual cpufreq driver"); > > +MODULE_LICENSE("GPL"); > > -- > viresh
On Mon, Jul 31, 2023 at 10:46:09AM -0700, David Dai wrote: > Introduce a virtualized cpufreq driver for guest kernels to improve > performance and power of workloads within VMs. > > This driver does two main things: > > 1. Sends the frequency of vCPUs as a hint to the host. The host uses the > hint to schedule the vCPU threads and decide physical CPU frequency. > > 2. If a VM does not support a virtualized FIE(like AMUs), it queries the > host CPU frequency by reading a MMIO region of a virtual cpufreq device > to update the guest's frequency scaling factor periodically. This enables > accurate Per-Entity Load Tracking for tasks running in the guest. > > Co-developed-by: Saravana Kannan <saravanak@google.com> > Signed-off-by: Saravana Kannan <saravanak@google.com> > Signed-off-by: David Dai <davidai@google.com> [...] > +static void virt_scale_freq_tick(void) > +{ > + struct cpufreq_policy *policy = cpufreq_cpu_get(smp_processor_id()); > + struct virt_cpufreq_drv_data *data = policy->driver_data; > + u32 max_freq = (u32)policy->cpuinfo.max_freq; > + u64 cur_freq; > + u64 scale; > + > + cpufreq_cpu_put(policy); > + > + cur_freq = (u64)data->ops->get_freq(policy); > + cur_freq <<= SCHED_CAPACITY_SHIFT; > + scale = div_u64(cur_freq, max_freq); > + > + this_cpu_write(arch_freq_scale, (unsigned long)scale); > +} > + We expect the host to provide the frequency in kHz, can you please add a comment about it. It is not very obvious when you look at the REG_CUR_FREQ_OFFSET register name. > +static struct scale_freq_data virt_sfd = { > + .source = SCALE_FREQ_SOURCE_VIRT, > + .set_freq_scale = virt_scale_freq_tick, > +}; > + > +static unsigned int virt_cpufreq_set_perf(struct cpufreq_policy *policy) > +{ > + struct virt_cpufreq_drv_data *data = policy->driver_data; > + /* > + * Use cached frequency to avoid rounding to freq table entries > + * and undo 25% frequency boost applied by schedutil. > + */ > + u32 freq = mult_frac(policy->cached_target_freq, 80, 100); > + > + data->ops->set_freq(policy, freq); > + return 0; > +} Why do we undo the frequency boost? A governor may apply other boosts like RT (uclamp), iowait. It is not clear why we need to worry about governor policies here. > + > +static unsigned int virt_cpufreq_fast_switch(struct cpufreq_policy *policy, > + unsigned int target_freq) > +{ > + virt_cpufreq_set_perf(policy); > + return target_freq; > +} > + > +static int virt_cpufreq_target_index(struct cpufreq_policy *policy, > + unsigned int index) > +{ > + return virt_cpufreq_set_perf(policy); > +} > + > +static int virt_cpufreq_cpu_init(struct cpufreq_policy *policy) > +{ > + struct virt_cpufreq_drv_data *drv_data = cpufreq_get_driver_data(); > + struct cpufreq_frequency_table *table; > + struct device *cpu_dev; > + int ret; > + > + cpu_dev = get_cpu_device(policy->cpu); > + if (!cpu_dev) > + return -ENODEV; > + > + ret = dev_pm_opp_of_add_table(cpu_dev); > + if (ret) > + return ret; > + > + ret = dev_pm_opp_get_opp_count(cpu_dev); > + if (ret <= 0) { > + dev_err(cpu_dev, "OPP table can't be empty\n"); > + return -ENODEV; > + } > + > + ret = dev_pm_opp_init_cpufreq_table(cpu_dev, &table); > + if (ret) { > + dev_err(cpu_dev, "failed to init cpufreq table: %d\n", ret); > + return ret; > + } > + > + policy->freq_table = table; > + policy->dvfs_possible_from_any_cpu = false; > + policy->fast_switch_possible = true; > + policy->driver_data = drv_data; > + > + /* > + * Only takes effect if another FIE source such as AMUs > + * have not been registered. > + */ > + topology_set_scale_freq_source(&virt_sfd, policy->cpus); > + > + return 0; > + > +} > + Do we need to register as FIE source even with the below commit? By registering as a source, we are not supplying any accurate metric. We still fallback on the same source that cpufreq implements it. 874f63531064 ("cpufreq: report whether cpufreq supports Frequency Invariance (FI)") Thanks, Pavan
On 02-08-23, 15:16, Saravana Kannan wrote: > This is mainly an optimization to reduce the latency of the "frequency > change" which has a huge impact on the performance (as can be seen > from the numbers in the cover letter). > > Setting this flag means that the vCPU thread triggering the MMIO > handling (on the host side) is the thread on which the host needs to > apply any uclamp settings. So this avoids the VMM having to look up > the right vCPU thread that corresponds to this CPU, and any > permissions issues wrt setting another threads uclamp, etc. This > becomes even more important if/when BPF support is added for handling > simple MMIO read/writes. Will Deacon has been working on the eBPF > part[1] and IIUC, not setting this flag adds a lot of extra overhead > on the BPF side. > > So, yeah, this flag is very helpful wrt reducing latency/simplifying > host side implementation and that's why we want it here. > > [1] - https://kvm-forum.qemu.org/2023/talk/AZKC77/ Would be good to have a (big) comment in the code explaining that as it isn't obvious. Thanks.
Hi Viresh, Thanks for reviewing! On Tue, Aug 1, 2023 at 2:36 AM Viresh Kumar <viresh.kumar@linaro.org> wrote: > > On 31-07-23, 10:46, David Dai wrote: > > diff --git a/drivers/cpufreq/virtual-cpufreq.c b/drivers/cpufreq/virtual-cpufreq.c > > new file mode 100644 > > index 000000000000..66b0fd9b821c > > --- /dev/null > > +++ b/drivers/cpufreq/virtual-cpufreq.c > > @@ -0,0 +1,237 @@ > > +// SPDX-License-Identifier: GPL-2.0-only > > +/* > > + * Copyright (C) 2023 Google LLC > > + */ > > + > > +#include <linux/arch_topology.h> > > +#include <linux/cpufreq.h> > > +#include <linux/init.h> > > +#include <linux/sched.h> > > +#include <linux/kernel.h> > > +#include <linux/module.h> > > +#include <linux/of_address.h> > > +#include <linux/of_platform.h> > > +#include <linux/pm_opp.h> > > +#include <linux/slab.h> > > + > > +#define REG_CUR_FREQ_OFFSET 0x0 > > +#define REG_SET_FREQ_OFFSET 0x4 > > +#define PER_CPU_OFFSET 0x8 > > + > > +struct virt_cpufreq_ops { > > + void (*set_freq)(struct cpufreq_policy *policy, u32 freq); > > + u32 (*get_freq)(struct cpufreq_policy *policy); > > +}; > > Since you only have one implementation currently, this isn't really > required. Keep the data as NULL in `virt_cpufreq_match` and use > writel/readl directly. Okay, I’ll remove the ops for now and bring it back in the future if required. > > This can be updated if we need more implementations later. > > > +struct virt_cpufreq_drv_data { > > + void __iomem *base; > > + const struct virt_cpufreq_ops *ops; > > +}; > > + > > +static void virt_cpufreq_set_freq(struct cpufreq_policy *policy, u32 freq) > > +{ > > + struct virt_cpufreq_drv_data *data = policy->driver_data; > > + > > + writel_relaxed(freq, data->base + policy->cpu * PER_CPU_OFFSET > > + + REG_SET_FREQ_OFFSET); > > +} > > + > > +static u32 virt_cpufreq_get_freq(struct cpufreq_policy *policy) > > +{ > > + struct virt_cpufreq_drv_data *data = policy->driver_data; > > + > > + return readl_relaxed(data->base + policy->cpu * PER_CPU_OFFSET > > + + REG_CUR_FREQ_OFFSET); > > This doesn't look properly aligned. Please run checkpatch (--strict > (optional)). Ok. > > > +} > > + > > +static const struct virt_cpufreq_ops virt_freq_ops = { > > + .set_freq = virt_cpufreq_set_freq, > > + .get_freq = virt_cpufreq_get_freq, > > +}; > > + > > +static void virt_scale_freq_tick(void) > > +{ > > + struct cpufreq_policy *policy = cpufreq_cpu_get(smp_processor_id()); > > + struct virt_cpufreq_drv_data *data = policy->driver_data; > > + u32 max_freq = (u32)policy->cpuinfo.max_freq; > > + u64 cur_freq; > > + u64 scale; > > + > > + cpufreq_cpu_put(policy); > > + > > + cur_freq = (u64)data->ops->get_freq(policy); > > + cur_freq <<= SCHED_CAPACITY_SHIFT; > > + scale = div_u64(cur_freq, max_freq); > > + > > + this_cpu_write(arch_freq_scale, (unsigned long)scale); > > +} > > + > > +static struct scale_freq_data virt_sfd = { > > + .source = SCALE_FREQ_SOURCE_VIRT, > > + .set_freq_scale = virt_scale_freq_tick, > > +}; > > + > > +static unsigned int virt_cpufreq_set_perf(struct cpufreq_policy *policy) > > +{ > > + struct virt_cpufreq_drv_data *data = policy->driver_data; > > + /* > > + * Use cached frequency to avoid rounding to freq table entries > > + * and undo 25% frequency boost applied by schedutil. > > + */ > > + u32 freq = mult_frac(policy->cached_target_freq, 80, 100); > > + > > + data->ops->set_freq(policy, freq); > > + return 0; > > +} > > + > > +static unsigned int virt_cpufreq_fast_switch(struct cpufreq_policy *policy, > > + unsigned int target_freq) > > +{ > > + virt_cpufreq_set_perf(policy); > > + return target_freq; > > +} > > + > > +static int virt_cpufreq_target_index(struct cpufreq_policy *policy, > > + unsigned int index) > > +{ > > + return virt_cpufreq_set_perf(policy); > > +} > > + > > +static int virt_cpufreq_cpu_init(struct cpufreq_policy *policy) > > +{ > > + struct virt_cpufreq_drv_data *drv_data = cpufreq_get_driver_data(); > > + struct cpufreq_frequency_table *table; > > + struct device *cpu_dev; > > + int ret; > > + > > + cpu_dev = get_cpu_device(policy->cpu); > > + if (!cpu_dev) > > + return -ENODEV; > > + > > + ret = dev_pm_opp_of_add_table(cpu_dev); > > + if (ret) > > + return ret; > > + > > + ret = dev_pm_opp_get_opp_count(cpu_dev); > > + if (ret <= 0) { > > + dev_err(cpu_dev, "OPP table can't be empty\n"); > > + return -ENODEV; > > + } > > + > > + ret = dev_pm_opp_init_cpufreq_table(cpu_dev, &table); > > + if (ret) { > > + dev_err(cpu_dev, "failed to init cpufreq table: %d\n", ret); > > + return ret; > > + } > > + > > + policy->freq_table = table; > > + policy->dvfs_possible_from_any_cpu = false; > > Why can't we call virt_cpufreq_target_index() from any CPU ? > > > + policy->fast_switch_possible = true; > > + policy->driver_data = drv_data; > > + > > + /* > > + * Only takes effect if another FIE source such as AMUs > > + * have not been registered. > > + */ > > + topology_set_scale_freq_source(&virt_sfd, policy->cpus); > > + > > + return 0; > > + > > +} > > + > > +static int virt_cpufreq_cpu_exit(struct cpufreq_policy *policy) > > +{ > > + topology_clear_scale_freq_source(SCALE_FREQ_SOURCE_VIRT, policy->related_cpus); > > + kfree(policy->freq_table); > > + policy->freq_table = NULL; > > + return 0; > > +} > > + > > +static int virt_cpufreq_online(struct cpufreq_policy *policy) > > +{ > > + /* Nothing to restore. */ > > + return 0; > > +} > > + > > +static int virt_cpufreq_offline(struct cpufreq_policy *policy) > > +{ > > + /* Dummy offline() to avoid exit() being called and freeing resources. */ > > + return 0; > > +} > > + > > +static struct cpufreq_driver cpufreq_virt_driver = { > > + .name = "virt-cpufreq", > > + .init = virt_cpufreq_cpu_init, > > + .exit = virt_cpufreq_cpu_exit, > > + .online = virt_cpufreq_online, > > + .offline = virt_cpufreq_offline, > > + .verify = cpufreq_generic_frequency_table_verify, > > + .target_index = virt_cpufreq_target_index, > > + .fast_switch = virt_cpufreq_fast_switch, > > + .attr = cpufreq_generic_attr, > > +}; > > + > > +static int virt_cpufreq_driver_probe(struct platform_device *pdev) > > +{ > > + int ret; > > + struct virt_cpufreq_drv_data *drv_data; > > + > > + drv_data = devm_kzalloc(&pdev->dev, sizeof(*drv_data), GFP_KERNEL); > > + if (!drv_data) > > + return -ENOMEM; > > + > > + drv_data->ops = of_device_get_match_data(&pdev->dev); > > + if (!drv_data->ops) > > + return -EINVAL; > > + > > + drv_data->base = devm_platform_ioremap_resource(pdev, 0); > > + if (IS_ERR(drv_data->base)) > > + return PTR_ERR(drv_data->base); > > + > > + cpufreq_virt_driver.driver_data = drv_data; > > + > > + ret = cpufreq_register_driver(&cpufreq_virt_driver); > > + if (ret) { > > + dev_err(&pdev->dev, "Virtual CPUFreq driver failed to register: %d\n", ret); > > + return ret; > > + } > > + > > + dev_dbg(&pdev->dev, "Virtual CPUFreq driver initialized\n"); > > + return 0; > > +} > > + > > +static int virt_cpufreq_driver_remove(struct platform_device *pdev) > > +{ > > + cpufreq_unregister_driver(&cpufreq_virt_driver); > > + return 0; > > +} > > + > > +static const struct of_device_id virt_cpufreq_match[] = { > > + { .compatible = "virtual,cpufreq", .data = &virt_freq_ops}, > > + {} > > +}; > > +MODULE_DEVICE_TABLE(of, virt_cpufreq_match); > > + > > +static struct platform_driver virt_cpufreq_driver = { > > + .probe = virt_cpufreq_driver_probe, > > + .remove = virt_cpufreq_driver_remove, > > + .driver = { > > + .name = "virt-cpufreq", > > + .of_match_table = virt_cpufreq_match, > > + }, > > +}; > > + > > +static int __init virt_cpufreq_init(void) > > +{ > > + return platform_driver_register(&virt_cpufreq_driver); > > +} > > +postcore_initcall(virt_cpufreq_init); > > Why do you want to use this and not module_init() ? Then you can > simply use `module_platform_driver()`. We found that using postcore_init over module_init results in a small(2-3%) but measurable benefit during boot time for VMs, so this is an optimization that I’d prefer to keep. Thanks, David > > > + > > +static void __exit virt_cpufreq_exit(void) > > +{ > > + platform_driver_unregister(&virt_cpufreq_driver); > > +} > > +module_exit(virt_cpufreq_exit); > > + > > +MODULE_DESCRIPTION("Virtual cpufreq driver"); > > +MODULE_LICENSE("GPL"); > > -- > viresh
On 03-08-23, 09:50, David Dai wrote: > > Why do you want to use this and not module_init() ? Then you can > > simply use `module_platform_driver()`. > > We found that using postcore_init over module_init results in a > small(2-3%) but measurable benefit during boot time for VMs, so this > is an optimization that I’d prefer to keep. Okay. That's what platforms normally do (kick in cpufreq support earlier), so we can boot at a higher frequency. Just wasn't sure if it matters for this driver too.
On Tue, Aug 1, 2023 at 2:45 AM Quentin Perret <qperret@google.com> wrote: > > Hi David, > > On Monday 31 Jul 2023 at 10:46:09 (-0700), David Dai wrote: > > +static unsigned int virt_cpufreq_set_perf(struct cpufreq_policy *policy) > > +{ > > + struct virt_cpufreq_drv_data *data = policy->driver_data; > > + /* > > + * Use cached frequency to avoid rounding to freq table entries > > + * and undo 25% frequency boost applied by schedutil. > > + */ > > The VMM would be a better place for this scaling I think, the driver > can't/shouldn't make assumptions about the governor it is running with > given that this is a guest userspace decision essentially. > > IIRC the fast_switch() path is only used by schedutil, so one could > probably make a case to scale things there, but it'd be inconsistent > with the "slow" switch case, and would create a fragile dependency, so > it's probably not worth pursuing. Thanks for the input Quentin! David and I spend several hours over several days discussing this. We were trying to think through and decide if we were really removing the 25% margin applied by the guest side schedutil or the host side schedutil. We ran through different thought experiments on what would happen if the guest used ondemand/conservative/performance/powersave governors and what if in the future we had a configurable schedutil margin. We changed our opinions multiple times until we finally remembered this goal from my original presentation[1]: "On an idle host, running the use case in the host vs VM, should result in close to identical DVFS behavior of the physical CPUs and CPU selection for the threads." For that statement to be true when the guest uses ondemand/conservative governor, we have to remove the 25% margin applied by the host side schedutil governor. Otherwise, running the workload on the VM will result in frequencies 25% higher than running the same load on the host with ondemand/conservative governor. So, we finally concluded that we are really undoing the host side schedutil margin. And in that case, it makes sense to undo this in the VMM side. So, we'll go with your suggestion in this email instead of making the schedutil margin to be 0 for the guest. [1] - https://lpc.events/event/16/contributions/1195/attachments/970/1893/LPC%202022%20-%20VM%20DVFS.pdf Thanks, Saravana > > > + u32 freq = mult_frac(policy->cached_target_freq, 80, 100); > > + > > + data->ops->set_freq(policy, freq); > > + return 0; > > +} > > Thanks, > Quentin
On Wed, Aug 2, 2023 at 10:52 PM Viresh Kumar <viresh.kumar@linaro.org> wrote: > > On 02-08-23, 15:16, Saravana Kannan wrote: > > This is mainly an optimization to reduce the latency of the "frequency > > change" which has a huge impact on the performance (as can be seen > > from the numbers in the cover letter). > > > > Setting this flag means that the vCPU thread triggering the MMIO > > handling (on the host side) is the thread on which the host needs to > > apply any uclamp settings. So this avoids the VMM having to look up > > the right vCPU thread that corresponds to this CPU, and any > > permissions issues wrt setting another threads uclamp, etc. This > > becomes even more important if/when BPF support is added for handling > > simple MMIO read/writes. Will Deacon has been working on the eBPF > > part[1] and IIUC, not setting this flag adds a lot of extra overhead > > on the BPF side. > > > > So, yeah, this flag is very helpful wrt reducing latency/simplifying > > host side implementation and that's why we want it here. > > > > [1] - https://kvm-forum.qemu.org/2023/talk/AZKC77/ > > Would be good to have a (big) comment in the code explaining that as > it isn't obvious. Thanks. Will do. Thanks, Saravana
Hi Pavan, Thanks for reviewing! On Wed, Aug 2, 2023 at 9:18 PM Pavan Kondeti <quic_pkondeti@quicinc.com> wrote: > > On Mon, Jul 31, 2023 at 10:46:09AM -0700, David Dai wrote: > > Introduce a virtualized cpufreq driver for guest kernels to improve > > performance and power of workloads within VMs. > > > > This driver does two main things: > > > > 1. Sends the frequency of vCPUs as a hint to the host. The host uses the > > hint to schedule the vCPU threads and decide physical CPU frequency. > > > > 2. If a VM does not support a virtualized FIE(like AMUs), it queries the > > host CPU frequency by reading a MMIO region of a virtual cpufreq device > > to update the guest's frequency scaling factor periodically. This enables > > accurate Per-Entity Load Tracking for tasks running in the guest. > > > > Co-developed-by: Saravana Kannan <saravanak@google.com> > > Signed-off-by: Saravana Kannan <saravanak@google.com> > > Signed-off-by: David Dai <davidai@google.com> > > [...] > > > +static void virt_scale_freq_tick(void) > > +{ > > + struct cpufreq_policy *policy = cpufreq_cpu_get(smp_processor_id()); > > + struct virt_cpufreq_drv_data *data = policy->driver_data; > > + u32 max_freq = (u32)policy->cpuinfo.max_freq; > > + u64 cur_freq; > > + u64 scale; > > + > > + cpufreq_cpu_put(policy); > > + > > + cur_freq = (u64)data->ops->get_freq(policy); > > + cur_freq <<= SCHED_CAPACITY_SHIFT; > > + scale = div_u64(cur_freq, max_freq); > > + > > + this_cpu_write(arch_freq_scale, (unsigned long)scale); > > +} > > + > > We expect the host to provide the frequency in kHz, can you please add a > comment about it. It is not very obvious when you look at the > REG_CUR_FREQ_OFFSET register name. I’ll include a KHZ in the offset names. > > > +static struct scale_freq_data virt_sfd = { > > + .source = SCALE_FREQ_SOURCE_VIRT, > > + .set_freq_scale = virt_scale_freq_tick, > > +}; > > + > > +static unsigned int virt_cpufreq_set_perf(struct cpufreq_policy *policy) > > +{ > > + struct virt_cpufreq_drv_data *data = policy->driver_data; > > + /* > > + * Use cached frequency to avoid rounding to freq table entries > > + * and undo 25% frequency boost applied by schedutil. > > + */ > > + u32 freq = mult_frac(policy->cached_target_freq, 80, 100); > > + > > + data->ops->set_freq(policy, freq); > > + return 0; > > +} > > Why do we undo the frequency boost? A governor may apply other boosts > like RT (uclamp), iowait. It is not clear why we need to worry about > governor policies here. See Saravana’s response to Quentin for more details, but in short, we’ll remove this particular snippet in the driver. > > > + > > +static unsigned int virt_cpufreq_fast_switch(struct cpufreq_policy *policy, > > + unsigned int target_freq) > > +{ > > + virt_cpufreq_set_perf(policy); > > + return target_freq; > > +} > > + > > +static int virt_cpufreq_target_index(struct cpufreq_policy *policy, > > + unsigned int index) > > +{ > > + return virt_cpufreq_set_perf(policy); > > +} > > + > > +static int virt_cpufreq_cpu_init(struct cpufreq_policy *policy) > > +{ > > + struct virt_cpufreq_drv_data *drv_data = cpufreq_get_driver_data(); > > + struct cpufreq_frequency_table *table; > > + struct device *cpu_dev; > > + int ret; > > + > > + cpu_dev = get_cpu_device(policy->cpu); > > + if (!cpu_dev) > > + return -ENODEV; > > + > > + ret = dev_pm_opp_of_add_table(cpu_dev); > > + if (ret) > > + return ret; > > + > > + ret = dev_pm_opp_get_opp_count(cpu_dev); > > + if (ret <= 0) { > > + dev_err(cpu_dev, "OPP table can't be empty\n"); > > + return -ENODEV; > > + } > > + > > + ret = dev_pm_opp_init_cpufreq_table(cpu_dev, &table); > > + if (ret) { > > + dev_err(cpu_dev, "failed to init cpufreq table: %d\n", ret); > > + return ret; > > + } > > + > > + policy->freq_table = table; > > + policy->dvfs_possible_from_any_cpu = false; > > + policy->fast_switch_possible = true; > > + policy->driver_data = drv_data; > > + > > + /* > > + * Only takes effect if another FIE source such as AMUs > > + * have not been registered. > > + */ > > + topology_set_scale_freq_source(&virt_sfd, policy->cpus); > > + > > + return 0; > > + > > +} > > + > > Do we need to register as FIE source even with the below commit? By > registering as a source, we are not supplying any accurate metric. We > still fallback on the same source that cpufreq implements it. The arch_set_freq_scale() done at cpufreq driver’s frequency updates at cpufreq_freq_transition_end() and cpufreq_driver_fast_switch() only represent the guest’s frequency request. However, this does not accurately represent the physical CPU’s frequency that the vCPU is running on. E.g. There may be other processes sharing the same physical CPU that results in a much higher CPU frequency than what’s requested by the vCPU. Thanks, David > > 874f63531064 ("cpufreq: report whether cpufreq supports Frequency > Invariance (FI)") > > Thanks, > Pavan > > -- > To unsubscribe from this group and stop receiving emails from it, send an email to kernel-team+unsubscribe@android.com. >
On Fri, Aug 04, 2023 at 04:46:11PM -0700, David Dai wrote: > Hi Pavan, > > Thanks for reviewing! > > On Wed, Aug 2, 2023 at 9:18 PM Pavan Kondeti <quic_pkondeti@quicinc.com> wrote: > > > > On Mon, Jul 31, 2023 at 10:46:09AM -0700, David Dai wrote: > > > Introduce a virtualized cpufreq driver for guest kernels to improve > > > performance and power of workloads within VMs. > > > > > > This driver does two main things: > > > > > > 1. Sends the frequency of vCPUs as a hint to the host. The host uses the > > > hint to schedule the vCPU threads and decide physical CPU frequency. > > > > > > 2. If a VM does not support a virtualized FIE(like AMUs), it queries the > > > host CPU frequency by reading a MMIO region of a virtual cpufreq device > > > to update the guest's frequency scaling factor periodically. This enables > > > accurate Per-Entity Load Tracking for tasks running in the guest. > > > > > > Co-developed-by: Saravana Kannan <saravanak@google.com> > > > Signed-off-by: Saravana Kannan <saravanak@google.com> > > > Signed-off-by: David Dai <davidai@google.com> > > > > [...] > > > > > +static void virt_scale_freq_tick(void) > > > +{ > > > + struct cpufreq_policy *policy = cpufreq_cpu_get(smp_processor_id()); > > > + struct virt_cpufreq_drv_data *data = policy->driver_data; > > > + u32 max_freq = (u32)policy->cpuinfo.max_freq; > > > + u64 cur_freq; > > > + u64 scale; > > > + > > > + cpufreq_cpu_put(policy); > > > + > > > + cur_freq = (u64)data->ops->get_freq(policy); > > > + cur_freq <<= SCHED_CAPACITY_SHIFT; > > > + scale = div_u64(cur_freq, max_freq); > > > + > > > + this_cpu_write(arch_freq_scale, (unsigned long)scale); > > > +} > > > + > > > > We expect the host to provide the frequency in kHz, can you please add a > > comment about it. It is not very obvious when you look at the > > REG_CUR_FREQ_OFFSET register name. > > I’ll include a KHZ in the offset names. > Sure, that would help. Also, can you limit the scale to SCHED_CAPACITY_SCALE? It may be possible that host may be running at a higher frequency than max_freq advertised on this guest. > > > > > + > > > +static unsigned int virt_cpufreq_fast_switch(struct cpufreq_policy *policy, > > > + unsigned int target_freq) > > > +{ > > > + virt_cpufreq_set_perf(policy); > > > + return target_freq; > > > +} > > > + > > > +static int virt_cpufreq_target_index(struct cpufreq_policy *policy, > > > + unsigned int index) > > > +{ > > > + return virt_cpufreq_set_perf(policy); > > > +} > > > + > > > +static int virt_cpufreq_cpu_init(struct cpufreq_policy *policy) > > > +{ > > > + struct virt_cpufreq_drv_data *drv_data = cpufreq_get_driver_data(); > > > + struct cpufreq_frequency_table *table; > > > + struct device *cpu_dev; > > > + int ret; > > > + > > > + cpu_dev = get_cpu_device(policy->cpu); > > > + if (!cpu_dev) > > > + return -ENODEV; > > > + > > > + ret = dev_pm_opp_of_add_table(cpu_dev); > > > + if (ret) > > > + return ret; > > > + > > > + ret = dev_pm_opp_get_opp_count(cpu_dev); > > > + if (ret <= 0) { > > > + dev_err(cpu_dev, "OPP table can't be empty\n"); > > > + return -ENODEV; > > > + } > > > + > > > + ret = dev_pm_opp_init_cpufreq_table(cpu_dev, &table); > > > + if (ret) { > > > + dev_err(cpu_dev, "failed to init cpufreq table: %d\n", ret); > > > + return ret; > > > + } > > > + > > > + policy->freq_table = table; > > > + policy->dvfs_possible_from_any_cpu = false; > > > + policy->fast_switch_possible = true; > > > + policy->driver_data = drv_data; > > > + > > > + /* > > > + * Only takes effect if another FIE source such as AMUs > > > + * have not been registered. > > > + */ > > > + topology_set_scale_freq_source(&virt_sfd, policy->cpus); > > > + > > > + return 0; > > > + > > > +} > > > + > > > > Do we need to register as FIE source even with the below commit? By > > registering as a source, we are not supplying any accurate metric. We > > still fallback on the same source that cpufreq implements it. > > The arch_set_freq_scale() done at cpufreq driver’s frequency updates > at cpufreq_freq_transition_end() and cpufreq_driver_fast_switch() only > represent the guest’s frequency request. However, this does not > accurately represent the physical CPU’s frequency that the vCPU is > running on. E.g. There may be other processes sharing the same > physical CPU that results in a much higher CPU frequency than what’s > requested by the vCPU. > understood that policy->cur may not reflect the actual frequency. Is this something needs to be advertised to cpufreq core so that it query the underlying cpufreq driver and use it for frequency scale updates. This also gives userspace to read the actual frequency when read from sysfs. In fact, cpufreq_driver_fast_switch() comment says that cpufreq_driver::fast_switch() should return the *actual* frequency and the same is used to update frequency scale updates. I understand that it depends on other things like if host defer the frequency switch, the value read from REG_CUR_FREQ_OFFSET may reflect the old value.. May be a comment in code would help. Thanks, Pavan
Hi David, kernel test robot noticed the following build errors: [auto build test ERROR on rafael-pm/linux-next] [also build test ERROR on robh/for-next linus/master v6.5-rc5 next-20230809] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch#_base_tree_information] url: https://github.com/intel-lab-lkp/linux/commits/David-Dai/dt-bindings-cpufreq-add-bindings-for-virtual-cpufreq/20230801-014946 base: https://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git linux-next patch link: https://lore.kernel.org/r/20230731174613.4133167-3-davidai%40google.com patch subject: [PATCH v3 2/2] cpufreq: add virtual-cpufreq driver config: arm-randconfig-r061-20230812 (https://download.01.org/0day-ci/archive/20230812/202308121020.23DejpAv-lkp@intel.com/config) compiler: arm-linux-gnueabi-gcc (GCC) 12.3.0 reproduce: (https://download.01.org/0day-ci/archive/20230812/202308121020.23DejpAv-lkp@intel.com/reproduce) If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot <lkp@intel.com> | Closes: https://lore.kernel.org/oe-kbuild-all/202308121020.23DejpAv-lkp@intel.com/ All errors (new ones prefixed by >>): arm-linux-gnueabi-ld: drivers/cpufreq/virtual-cpufreq.o: in function `virt_cpufreq_cpu_exit': >> virtual-cpufreq.c:(.text+0xf8): undefined reference to `topology_clear_scale_freq_source' arm-linux-gnueabi-ld: drivers/cpufreq/virtual-cpufreq.o: in function `virt_cpufreq_cpu_init': >> virtual-cpufreq.c:(.text+0x1c8): undefined reference to `topology_set_scale_freq_source' arm-linux-gnueabi-ld: drivers/cpufreq/virtual-cpufreq.o: in function `virt_scale_freq_tick': >> virtual-cpufreq.c:(.text+0x330): undefined reference to `arch_freq_scale'
On Sun, Aug 6, 2023 at 8:22 PM Pavan Kondeti <quic_pkondeti@quicinc.com> wrote: > > On Fri, Aug 04, 2023 at 04:46:11PM -0700, David Dai wrote: > > Hi Pavan, > > > > Thanks for reviewing! > > > > On Wed, Aug 2, 2023 at 9:18 PM Pavan Kondeti <quic_pkondeti@quicinc.com> wrote: > > > > > > On Mon, Jul 31, 2023 at 10:46:09AM -0700, David Dai wrote: > > > > Introduce a virtualized cpufreq driver for guest kernels to improve > > > > performance and power of workloads within VMs. > > > > > > > > This driver does two main things: > > > > > > > > 1. Sends the frequency of vCPUs as a hint to the host. The host uses the > > > > hint to schedule the vCPU threads and decide physical CPU frequency. > > > > > > > > 2. If a VM does not support a virtualized FIE(like AMUs), it queries the > > > > host CPU frequency by reading a MMIO region of a virtual cpufreq device > > > > to update the guest's frequency scaling factor periodically. This enables > > > > accurate Per-Entity Load Tracking for tasks running in the guest. > > > > > > > > Co-developed-by: Saravana Kannan <saravanak@google.com> > > > > Signed-off-by: Saravana Kannan <saravanak@google.com> > > > > Signed-off-by: David Dai <davidai@google.com> > > > > > > [...] > > > > > > > +static void virt_scale_freq_tick(void) > > > > +{ > > > > + struct cpufreq_policy *policy = cpufreq_cpu_get(smp_processor_id()); > > > > + struct virt_cpufreq_drv_data *data = policy->driver_data; > > > > + u32 max_freq = (u32)policy->cpuinfo.max_freq; > > > > + u64 cur_freq; > > > > + u64 scale; > > > > + > > > > + cpufreq_cpu_put(policy); > > > > + > > > > + cur_freq = (u64)data->ops->get_freq(policy); > > > > + cur_freq <<= SCHED_CAPACITY_SHIFT; > > > > + scale = div_u64(cur_freq, max_freq); > > > > + > > > > + this_cpu_write(arch_freq_scale, (unsigned long)scale); > > > > +} > > > > + > > > > > > We expect the host to provide the frequency in kHz, can you please add a > > > comment about it. It is not very obvious when you look at the > > > REG_CUR_FREQ_OFFSET register name. > > > > I’ll include a KHZ in the offset names. > > > Hi Pavan, Apologies for the slow responses, I was out on vacation for the week prior to last week. > Sure, that would help. Also, can you limit the scale to > SCHED_CAPACITY_SCALE? It may be possible that host may be running at a > higher frequency than max_freq advertised on this guest. Good catch, will include a check to limit freq_scale to SCHED_CAPACITY_SCALE. > > > > > > > > + > > > > +static unsigned int virt_cpufreq_fast_switch(struct cpufreq_policy *policy, > > > > + unsigned int target_freq) > > > > +{ > > > > + virt_cpufreq_set_perf(policy); > > > > + return target_freq; > > > > +} > > > > + > > > > +static int virt_cpufreq_target_index(struct cpufreq_policy *policy, > > > > + unsigned int index) > > > > +{ > > > > + return virt_cpufreq_set_perf(policy); > > > > +} > > > > + > > > > +static int virt_cpufreq_cpu_init(struct cpufreq_policy *policy) > > > > +{ > > > > + struct virt_cpufreq_drv_data *drv_data = cpufreq_get_driver_data(); > > > > + struct cpufreq_frequency_table *table; > > > > + struct device *cpu_dev; > > > > + int ret; > > > > + > > > > + cpu_dev = get_cpu_device(policy->cpu); > > > > + if (!cpu_dev) > > > > + return -ENODEV; > > > > + > > > > + ret = dev_pm_opp_of_add_table(cpu_dev); > > > > + if (ret) > > > > + return ret; > > > > + > > > > + ret = dev_pm_opp_get_opp_count(cpu_dev); > > > > + if (ret <= 0) { > > > > + dev_err(cpu_dev, "OPP table can't be empty\n"); > > > > + return -ENODEV; > > > > + } > > > > + > > > > + ret = dev_pm_opp_init_cpufreq_table(cpu_dev, &table); > > > > + if (ret) { > > > > + dev_err(cpu_dev, "failed to init cpufreq table: %d\n", ret); > > > > + return ret; > > > > + } > > > > + > > > > + policy->freq_table = table; > > > > + policy->dvfs_possible_from_any_cpu = false; > > > > + policy->fast_switch_possible = true; > > > > + policy->driver_data = drv_data; > > > > + > > > > + /* > > > > + * Only takes effect if another FIE source such as AMUs > > > > + * have not been registered. > > > > + */ > > > > + topology_set_scale_freq_source(&virt_sfd, policy->cpus); > > > > + > > > > + return 0; > > > > + > > > > +} > > > > + > > > > > > Do we need to register as FIE source even with the below commit? By > > > registering as a source, we are not supplying any accurate metric. We > > > still fallback on the same source that cpufreq implements it. > > > > The arch_set_freq_scale() done at cpufreq driver’s frequency updates > > at cpufreq_freq_transition_end() and cpufreq_driver_fast_switch() only > > represent the guest’s frequency request. However, this does not > > accurately represent the physical CPU’s frequency that the vCPU is > > running on. E.g. There may be other processes sharing the same > > physical CPU that results in a much higher CPU frequency than what’s > > requested by the vCPU. > > > > understood that policy->cur may not reflect the actual frequency. Is this > something needs to be advertised to cpufreq core so that it query the > underlying cpufreq driver and use it for frequency scale updates. This > also gives userspace to read the actual frequency when read from sysfs. > Adding a ->get() callback in the driver to advertise to the cpufreq core does not actually update the freq_scale if fast_switch is enabled. Since fast_switch is required for performance reasons, I don’t see value in adding ->get(). > In fact, cpufreq_driver_fast_switch() comment says that > cpufreq_driver::fast_switch() should return the *actual* frequency and > the same is used to update frequency scale updates. I understand that it > depends on other things like if host defer the frequency switch, the > value read from REG_CUR_FREQ_OFFSET may reflect the old value.. > > May be a comment in code would help. > Sounds good, I'll include a comment. Thanks, David > Thanks, > Pavan > > -- > To unsubscribe from this group and stop receiving emails from it, send an email to kernel-team+unsubscribe@android.com. >
diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index f429b9b37b76..3977ca796747 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -217,6 +217,21 @@ config CPUFREQ_DT If in doubt, say N. +config CPUFREQ_VIRT + tristate "Virtual cpufreq driver" + depends on OF + select PM_OPP + help + This adds a virtualized cpufreq driver for guest kernels that + read/writes to a MMIO region for a virtualized cpufreq device to + communicate with the host. It sends frequency updates to the host + which gets used as a hint to schedule vCPU threads and select CPU + frequency. If a VM does not support a virtualized FIE such as AMUs, + it updates the frequency scaling factor by polling host CPU frequency + to enable accurate Per-Entity Load Tracking for tasks running in the guest. + + If in doubt, say N. + config CPUFREQ_DT_PLATDEV tristate "Generic DT based cpufreq platdev driver" depends on OF diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile index ef8510774913..bbc9f9bdfa4b 100644 --- a/drivers/cpufreq/Makefile +++ b/drivers/cpufreq/Makefile @@ -16,6 +16,7 @@ obj-$(CONFIG_CPU_FREQ_GOV_ATTR_SET) += cpufreq_governor_attr_set.o obj-$(CONFIG_CPUFREQ_DT) += cpufreq-dt.o obj-$(CONFIG_CPUFREQ_DT_PLATDEV) += cpufreq-dt-platdev.o +obj-$(CONFIG_CPUFREQ_VIRT) += virtual-cpufreq.o # Traces CFLAGS_amd-pstate-trace.o := -I$(src) diff --git a/drivers/cpufreq/virtual-cpufreq.c b/drivers/cpufreq/virtual-cpufreq.c new file mode 100644 index 000000000000..66b0fd9b821c --- /dev/null +++ b/drivers/cpufreq/virtual-cpufreq.c @@ -0,0 +1,237 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 Google LLC + */ + +#include <linux/arch_topology.h> +#include <linux/cpufreq.h> +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/of_address.h> +#include <linux/of_platform.h> +#include <linux/pm_opp.h> +#include <linux/slab.h> + +#define REG_CUR_FREQ_OFFSET 0x0 +#define REG_SET_FREQ_OFFSET 0x4 +#define PER_CPU_OFFSET 0x8 + +struct virt_cpufreq_ops { + void (*set_freq)(struct cpufreq_policy *policy, u32 freq); + u32 (*get_freq)(struct cpufreq_policy *policy); +}; + +struct virt_cpufreq_drv_data { + void __iomem *base; + const struct virt_cpufreq_ops *ops; +}; + +static void virt_cpufreq_set_freq(struct cpufreq_policy *policy, u32 freq) +{ + struct virt_cpufreq_drv_data *data = policy->driver_data; + + writel_relaxed(freq, data->base + policy->cpu * PER_CPU_OFFSET + + REG_SET_FREQ_OFFSET); +} + +static u32 virt_cpufreq_get_freq(struct cpufreq_policy *policy) +{ + struct virt_cpufreq_drv_data *data = policy->driver_data; + + return readl_relaxed(data->base + policy->cpu * PER_CPU_OFFSET + + REG_CUR_FREQ_OFFSET); +} + +static const struct virt_cpufreq_ops virt_freq_ops = { + .set_freq = virt_cpufreq_set_freq, + .get_freq = virt_cpufreq_get_freq, +}; + +static void virt_scale_freq_tick(void) +{ + struct cpufreq_policy *policy = cpufreq_cpu_get(smp_processor_id()); + struct virt_cpufreq_drv_data *data = policy->driver_data; + u32 max_freq = (u32)policy->cpuinfo.max_freq; + u64 cur_freq; + u64 scale; + + cpufreq_cpu_put(policy); + + cur_freq = (u64)data->ops->get_freq(policy); + cur_freq <<= SCHED_CAPACITY_SHIFT; + scale = div_u64(cur_freq, max_freq); + + this_cpu_write(arch_freq_scale, (unsigned long)scale); +} + +static struct scale_freq_data virt_sfd = { + .source = SCALE_FREQ_SOURCE_VIRT, + .set_freq_scale = virt_scale_freq_tick, +}; + +static unsigned int virt_cpufreq_set_perf(struct cpufreq_policy *policy) +{ + struct virt_cpufreq_drv_data *data = policy->driver_data; + /* + * Use cached frequency to avoid rounding to freq table entries + * and undo 25% frequency boost applied by schedutil. + */ + u32 freq = mult_frac(policy->cached_target_freq, 80, 100); + + data->ops->set_freq(policy, freq); + return 0; +} + +static unsigned int virt_cpufreq_fast_switch(struct cpufreq_policy *policy, + unsigned int target_freq) +{ + virt_cpufreq_set_perf(policy); + return target_freq; +} + +static int virt_cpufreq_target_index(struct cpufreq_policy *policy, + unsigned int index) +{ + return virt_cpufreq_set_perf(policy); +} + +static int virt_cpufreq_cpu_init(struct cpufreq_policy *policy) +{ + struct virt_cpufreq_drv_data *drv_data = cpufreq_get_driver_data(); + struct cpufreq_frequency_table *table; + struct device *cpu_dev; + int ret; + + cpu_dev = get_cpu_device(policy->cpu); + if (!cpu_dev) + return -ENODEV; + + ret = dev_pm_opp_of_add_table(cpu_dev); + if (ret) + return ret; + + ret = dev_pm_opp_get_opp_count(cpu_dev); + if (ret <= 0) { + dev_err(cpu_dev, "OPP table can't be empty\n"); + return -ENODEV; + } + + ret = dev_pm_opp_init_cpufreq_table(cpu_dev, &table); + if (ret) { + dev_err(cpu_dev, "failed to init cpufreq table: %d\n", ret); + return ret; + } + + policy->freq_table = table; + policy->dvfs_possible_from_any_cpu = false; + policy->fast_switch_possible = true; + policy->driver_data = drv_data; + + /* + * Only takes effect if another FIE source such as AMUs + * have not been registered. + */ + topology_set_scale_freq_source(&virt_sfd, policy->cpus); + + return 0; + +} + +static int virt_cpufreq_cpu_exit(struct cpufreq_policy *policy) +{ + topology_clear_scale_freq_source(SCALE_FREQ_SOURCE_VIRT, policy->related_cpus); + kfree(policy->freq_table); + policy->freq_table = NULL; + return 0; +} + +static int virt_cpufreq_online(struct cpufreq_policy *policy) +{ + /* Nothing to restore. */ + return 0; +} + +static int virt_cpufreq_offline(struct cpufreq_policy *policy) +{ + /* Dummy offline() to avoid exit() being called and freeing resources. */ + return 0; +} + +static struct cpufreq_driver cpufreq_virt_driver = { + .name = "virt-cpufreq", + .init = virt_cpufreq_cpu_init, + .exit = virt_cpufreq_cpu_exit, + .online = virt_cpufreq_online, + .offline = virt_cpufreq_offline, + .verify = cpufreq_generic_frequency_table_verify, + .target_index = virt_cpufreq_target_index, + .fast_switch = virt_cpufreq_fast_switch, + .attr = cpufreq_generic_attr, +}; + +static int virt_cpufreq_driver_probe(struct platform_device *pdev) +{ + int ret; + struct virt_cpufreq_drv_data *drv_data; + + drv_data = devm_kzalloc(&pdev->dev, sizeof(*drv_data), GFP_KERNEL); + if (!drv_data) + return -ENOMEM; + + drv_data->ops = of_device_get_match_data(&pdev->dev); + if (!drv_data->ops) + return -EINVAL; + + drv_data->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(drv_data->base)) + return PTR_ERR(drv_data->base); + + cpufreq_virt_driver.driver_data = drv_data; + + ret = cpufreq_register_driver(&cpufreq_virt_driver); + if (ret) { + dev_err(&pdev->dev, "Virtual CPUFreq driver failed to register: %d\n", ret); + return ret; + } + + dev_dbg(&pdev->dev, "Virtual CPUFreq driver initialized\n"); + return 0; +} + +static int virt_cpufreq_driver_remove(struct platform_device *pdev) +{ + cpufreq_unregister_driver(&cpufreq_virt_driver); + return 0; +} + +static const struct of_device_id virt_cpufreq_match[] = { + { .compatible = "virtual,cpufreq", .data = &virt_freq_ops}, + {} +}; +MODULE_DEVICE_TABLE(of, virt_cpufreq_match); + +static struct platform_driver virt_cpufreq_driver = { + .probe = virt_cpufreq_driver_probe, + .remove = virt_cpufreq_driver_remove, + .driver = { + .name = "virt-cpufreq", + .of_match_table = virt_cpufreq_match, + }, +}; + +static int __init virt_cpufreq_init(void) +{ + return platform_driver_register(&virt_cpufreq_driver); +} +postcore_initcall(virt_cpufreq_init); + +static void __exit virt_cpufreq_exit(void) +{ + platform_driver_unregister(&virt_cpufreq_driver); +} +module_exit(virt_cpufreq_exit); + +MODULE_DESCRIPTION("Virtual cpufreq driver"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index a07b510e7dc5..888282dce2ba 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -42,6 +42,7 @@ enum scale_freq_source { SCALE_FREQ_SOURCE_CPUFREQ = 0, SCALE_FREQ_SOURCE_ARCH, SCALE_FREQ_SOURCE_CPPC, + SCALE_FREQ_SOURCE_VIRT, }; struct scale_freq_data {