Message ID | 20210203135321.12253-2-ggherdovich@suse.cz (mailing list archive) |
---|---|
State | Superseded, archived |
Headers | show |
Series | AMD EPYC: fix schedutil perf regression (freq-invariance) | expand |
On Wed, Feb 3, 2021 at 2:53 PM Giovanni Gherdovich <ggherdovich@suse.cz> wrote: > [cut] > Fixes: 41ea667227ba ("x86, sched: Calculate frequency invariance for AMD systems") > Fixes: 976df7e5730e ("x86, sched: Use midpoint of max_boost and max_P for frequency invariance on AMD EPYC") > Reported-by: Michael Larabel <Michael@phoronix.com> > Tested-by: Michael Larabel <Michael@phoronix.com> > Signed-off-by: Giovanni Gherdovich <ggherdovich@suse.cz> > --- > drivers/cpufreq/acpi-cpufreq.c | 61 ++++++++++++++++++++++++++++++-- > drivers/cpufreq/cpufreq.c | 3 ++ > include/linux/cpufreq.h | 5 +++ > kernel/sched/cpufreq_schedutil.c | 8 +++-- I don't really think that it is necessary to modify schedutil to address this issue. > 4 files changed, 73 insertions(+), 4 deletions(-) > > diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c > index 1e4fbb002a31..a5facc6cad16 100644 > --- a/drivers/cpufreq/acpi-cpufreq.c > +++ b/drivers/cpufreq/acpi-cpufreq.c > @@ -26,6 +26,7 @@ > #include <linux/uaccess.h> > > #include <acpi/processor.h> > +#include <acpi/cppc_acpi.h> > > #include <asm/msr.h> > #include <asm/processor.h> > @@ -628,11 +629,57 @@ static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c) > } > #endif > > +#ifdef CONFIG_ACPI_CPPC_LIB > +static bool amd_max_boost(unsigned int max_freq, > + unsigned int *max_boost) Is there anything specific to AMD CPUs in this function? > +{ > + struct cppc_perf_caps perf_caps; > + u64 highest_perf, nominal_perf, perf_ratio; > + int ret; > + > + ret = cppc_get_perf_caps(0, &perf_caps); > + if (ret) { > + pr_debug("Could not retrieve perf counters (%d)\n", ret); > + return false; > + } > + > + highest_perf = perf_caps.highest_perf; > + nominal_perf = perf_caps.nominal_perf; > + > + if (!highest_perf || !nominal_perf) { > + pr_debug("Could not retrieve highest or nominal performance\n"); > + return false; > + } > + > + perf_ratio = div_u64(highest_perf * SCHED_CAPACITY_SCALE, nominal_perf); Why do you use SCHED_CAPACITY_SCALE here? And why does this multiply instead of shifting? > + if (perf_ratio <= SCHED_CAPACITY_SCALE) { > + pr_debug("Either perf_ratio is 0, or nominal >= highest performance\n"); > + return false; > + } > + > + *max_boost = max_freq * perf_ratio >> SCHED_CAPACITY_SHIFT; Is this assuming that max_freq corresponds to nominal_perf? > + if (!*max_boost) { > + pr_debug("max_boost seems to be zero\n"); > + return false; So this function may just return the max_boost value with 0 meaning a failure. > + } > + > + return true; > +} > +#else > +static bool amd_max_boost(unsigned int max_freq, > + unsigned int *max_boost) > +{ > + return false; > +} > +#endif > + > static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) > { > unsigned int i; > unsigned int valid_states = 0; > unsigned int cpu = policy->cpu; > + unsigned int freq, max_freq = 0; > + unsigned int max_boost; > struct acpi_cpufreq_data *data; > unsigned int result = 0; > struct cpuinfo_x86 *c = &cpu_data(policy->cpu); > @@ -779,15 +826,25 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) > freq_table[valid_states-1].frequency / 1000) > continue; > > + freq = perf->states[i].core_frequency * 1000; > freq_table[valid_states].driver_data = i; > - freq_table[valid_states].frequency = > - perf->states[i].core_frequency * 1000; > + freq_table[valid_states].frequency = freq; > + > + if (freq > max_freq) > + max_freq = freq; > + > valid_states++; > } > freq_table[valid_states].frequency = CPUFREQ_TABLE_END; > policy->freq_table = freq_table; > perf->state = 0; > > + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && > + amd_max_boost(max_freq, &max_boost)) { AFAICS, the issue is not limited to AMD CPUs . > + policy->cpuinfo.max_boost = max_boost; > + static_branch_enable(&cpufreq_amd_max_boost); > + } > + > switch (perf->control_register.space_id) { > case ACPI_ADR_SPACE_SYSTEM_IO: > /* > diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c > index d0a3525ce27f..b96677f6b57e 100644 > --- a/drivers/cpufreq/cpufreq.c > +++ b/drivers/cpufreq/cpufreq.c > @@ -2721,6 +2721,9 @@ int cpufreq_boost_enabled(void) > } > EXPORT_SYMBOL_GPL(cpufreq_boost_enabled); > > +DEFINE_STATIC_KEY_FALSE(cpufreq_amd_max_boost); > +EXPORT_SYMBOL_GPL(cpufreq_amd_max_boost); > + > /********************************************************************* > * REGISTER / UNREGISTER CPUFREQ DRIVER * > *********************************************************************/ > diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h > index 9c8b7437b6cd..341cac76d254 100644 > --- a/include/linux/cpufreq.h > +++ b/include/linux/cpufreq.h > @@ -40,9 +40,14 @@ enum cpufreq_table_sorting { > CPUFREQ_TABLE_SORTED_DESCENDING > }; > > +DECLARE_STATIC_KEY_FALSE(cpufreq_amd_max_boost); > + > +#define cpufreq_driver_has_max_boost() static_branch_unlikely(&cpufreq_amd_max_boost) > + > struct cpufreq_cpuinfo { > unsigned int max_freq; > unsigned int min_freq; > + unsigned int max_boost; > > /* in 10^(-9) s = nanoseconds */ > unsigned int transition_latency; > diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c > index 6931f0cdeb80..541f3db3f576 100644 > --- a/kernel/sched/cpufreq_schedutil.c > +++ b/kernel/sched/cpufreq_schedutil.c > @@ -159,8 +159,12 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, > unsigned long util, unsigned long max) > { > struct cpufreq_policy *policy = sg_policy->policy; > - unsigned int freq = arch_scale_freq_invariant() ? > - policy->cpuinfo.max_freq : policy->cur; > + unsigned int freq, max_freq; > + > + max_freq = cpufreq_driver_has_max_boost() ? > + policy->cpuinfo.max_boost : policy->cpuinfo.max_freq; > + > + freq = arch_scale_freq_invariant() ? max_freq : policy->cur; > > freq = map_util_freq(util, freq, max); > > --
On Wednesday, February 3, 2021 3:11:37 PM CET Rafael J. Wysocki wrote: > On Wed, Feb 3, 2021 at 2:53 PM Giovanni Gherdovich <ggherdovich@suse.cz> wrote: > > > > [cut] > > > Fixes: 41ea667227ba ("x86, sched: Calculate frequency invariance for AMD systems") > > Fixes: 976df7e5730e ("x86, sched: Use midpoint of max_boost and max_P for frequency invariance on AMD EPYC") > > Reported-by: Michael Larabel <Michael@phoronix.com> > > Tested-by: Michael Larabel <Michael@phoronix.com> > > Signed-off-by: Giovanni Gherdovich <ggherdovich@suse.cz> > > --- > > drivers/cpufreq/acpi-cpufreq.c | 61 ++++++++++++++++++++++++++++++-- > > drivers/cpufreq/cpufreq.c | 3 ++ > > include/linux/cpufreq.h | 5 +++ > > kernel/sched/cpufreq_schedutil.c | 8 +++-- > > I don't really think that it is necessary to modify schedutil to > address this issue. So below is a prototype of an alternative fix for the issue at hand. I can't really test it here, because there's no _CPC in the ACPI tables of my test machines, so testing it would be appreciated. However, AFAICS these machines are affected by the performance issue related to the scale-invariance when they are running acpi-cpufreq, so what we are doing here is not entirely sufficient. It looks like the scale-invariance code should ask the cpufreq driver about the maximum frequency and note that cpufreq drivers may be changed on the fly. What the patch below does is to add an extra entry to the frequency table for each CPU to represent the maximum "boost" frequency, so as to cause that frequency to be used as cpuinfo.max_freq. The reason why I think it is better to extend the frequency tables instead of simply increasing the frequency for the "P0" entry is because the latter may cause "turbo" frequency to be asked for less often. --- drivers/cpufreq/acpi-cpufreq.c | 107 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 95 insertions(+), 12 deletions(-) Index: linux-pm/drivers/cpufreq/acpi-cpufreq.c =================================================================== --- linux-pm.orig/drivers/cpufreq/acpi-cpufreq.c +++ linux-pm/drivers/cpufreq/acpi-cpufreq.c @@ -26,6 +26,7 @@ #include <linux/uaccess.h> #include <acpi/processor.h> +#include <acpi/cppc_acpi.h> #include <asm/msr.h> #include <asm/processor.h> @@ -53,6 +54,7 @@ struct acpi_cpufreq_data { unsigned int resume; unsigned int cpu_feature; unsigned int acpi_perf_cpu; + unsigned int first_perf_state; cpumask_var_t freqdomain_cpus; void (*cpu_freq_write)(struct acpi_pct_register *reg, u32 val); u32 (*cpu_freq_read)(struct acpi_pct_register *reg); @@ -221,10 +223,10 @@ static unsigned extract_msr(struct cpufr perf = to_perf_data(data); - cpufreq_for_each_entry(pos, policy->freq_table) + cpufreq_for_each_entry(pos, policy->freq_table + data->first_perf_state) if (msr == perf->states[pos->driver_data].status) return pos->frequency; - return policy->freq_table[0].frequency; + return policy->freq_table[data->first_perf_state].frequency; } static unsigned extract_freq(struct cpufreq_policy *policy, u32 val) @@ -363,6 +365,7 @@ static unsigned int get_cur_freq_on_cpu( struct cpufreq_policy *policy; unsigned int freq; unsigned int cached_freq; + unsigned int state; pr_debug("%s (%d)\n", __func__, cpu); @@ -374,7 +377,11 @@ static unsigned int get_cur_freq_on_cpu( if (unlikely(!data || !policy->freq_table)) return 0; - cached_freq = policy->freq_table[to_perf_data(data)->state].frequency; + state = to_perf_data(data)->state; + if (state < data->first_perf_state) + state = data->first_perf_state; + + cached_freq = policy->freq_table[state].frequency; freq = extract_freq(policy, get_cur_val(cpumask_of(cpu), data)); if (freq != cached_freq) { /* @@ -628,16 +635,54 @@ static int acpi_cpufreq_blacklist(struct } #endif +#ifdef CONFIG_ACPI_CPPC_LIB +static u64 get_max_boost_ratio(unsigned int cpu) +{ + struct cppc_perf_caps perf_caps; + u64 highest_perf, nominal_perf; + int ret; + + if (acpi_pstate_strict) + return 0; + + ret = cppc_get_perf_caps(cpu, &perf_caps); + if (ret) { + pr_debug("CPU%d: Unable to get performance capabilities (%d)\n", + cpu, ret); + return 0; + } + + highest_perf = perf_caps.highest_perf; + nominal_perf = perf_caps.nominal_perf; + + if (!highest_perf || !nominal_perf) { + pr_debug("CPU%d: highest or nominal performance missing\n", cpu); + return 0; + } + + if (highest_perf < nominal_perf) { + pr_debug("CPU%d: nominal performance above highest\n", cpu); + return 0; + } + + return div_u64(highest_perf << SCHED_CAPACITY_SHIFT, nominal_perf); +} +#else +static inline u64 get_max_boost_ratio(unsigned int cpu) { return 0; } +#endif + static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) { - unsigned int i; - unsigned int valid_states = 0; - unsigned int cpu = policy->cpu; + struct cpufreq_frequency_table *freq_table; + struct acpi_processor_performance *perf; struct acpi_cpufreq_data *data; + unsigned int cpu = policy->cpu; + struct cpuinfo_x86 *c = &cpu_data(cpu); + unsigned int valid_states = 0; unsigned int result = 0; - struct cpuinfo_x86 *c = &cpu_data(policy->cpu); - struct acpi_processor_performance *perf; - struct cpufreq_frequency_table *freq_table; + unsigned int state_count; + u64 max_boost_ratio; + unsigned int i; #ifdef CONFIG_SMP static int blacklisted; #endif @@ -750,8 +795,20 @@ static int acpi_cpufreq_cpu_init(struct goto err_unreg; } - freq_table = kcalloc(perf->state_count + 1, sizeof(*freq_table), - GFP_KERNEL); + state_count = perf->state_count + 1; + + max_boost_ratio = get_max_boost_ratio(cpu); + if (max_boost_ratio) { + /* + * Make a room for one more entry to represent the highest + * available "boost" frequency. + */ + state_count++; + valid_states++; + data->first_perf_state = valid_states; + } + + freq_table = kcalloc(state_count, sizeof(*freq_table), GFP_KERNEL); if (!freq_table) { result = -ENOMEM; goto err_unreg; @@ -785,6 +842,30 @@ static int acpi_cpufreq_cpu_init(struct valid_states++; } freq_table[valid_states].frequency = CPUFREQ_TABLE_END; + + if (max_boost_ratio) { + unsigned int state = data->first_perf_state; + unsigned int freq = freq_table[state].frequency; + + /* + * Because the loop above sorts the freq_table entries in the + * descending order, freq is the maximum frequency in the table. + * Assume that it corresponds to the CPPC nominal frequency and + * use it to populate the frequency field of the extra "boost" + * frequency entry. + */ + freq_table[0].frequency = freq * max_boost_ratio >> SCHED_CAPACITY_SHIFT; + /* + * The purpose of the extra "boost" frequency entry is to make + * the rest of cpufreq aware of the real maximum frequency, but + * the way to request it is the same as for the first_perf_state + * entry that is expected to cover the entire range of "boost" + * frequencies of the CPU, so copy the driver_data value from + * that entry. + */ + freq_table[0].driver_data = freq_table[state].driver_data; + } + policy->freq_table = freq_table; perf->state = 0; @@ -858,8 +939,10 @@ static void acpi_cpufreq_cpu_ready(struc { struct acpi_processor_performance *perf = per_cpu_ptr(acpi_perf_data, policy->cpu); + struct acpi_cpufreq_data *data = policy->driver_data; + unsigned int freq = policy->freq_table[data->first_perf_state].frequency; - if (perf->states[0].core_frequency * 1000 != policy->cpuinfo.max_freq) + if (perf->states[0].core_frequency * 1000 != freq) pr_warn(FW_WARN "P-state 0 is not max freq\n"); }
On 2/3/21 12:25 PM, Rafael J. Wysocki wrote: > On Wednesday, February 3, 2021 3:11:37 PM CET Rafael J. Wysocki wrote: >> On Wed, Feb 3, 2021 at 2:53 PM Giovanni Gherdovich <ggherdovich@suse.cz> wrote: >> [cut] >> >>> Fixes: 41ea667227ba ("x86, sched: Calculate frequency invariance for AMD systems") >>> Fixes: 976df7e5730e ("x86, sched: Use midpoint of max_boost and max_P for frequency invariance on AMD EPYC") >>> Reported-by: Michael Larabel <Michael@phoronix.com> >>> Tested-by: Michael Larabel <Michael@phoronix.com> >>> Signed-off-by: Giovanni Gherdovich <ggherdovich@suse.cz> >>> --- >>> drivers/cpufreq/acpi-cpufreq.c | 61 ++++++++++++++++++++++++++++++-- >>> drivers/cpufreq/cpufreq.c | 3 ++ >>> include/linux/cpufreq.h | 5 +++ >>> kernel/sched/cpufreq_schedutil.c | 8 +++-- >> I don't really think that it is necessary to modify schedutil to >> address this issue. > So below is a prototype of an alternative fix for the issue at hand. > > I can't really test it here, because there's no _CPC in the ACPI tables of my > test machines, so testing it would be appreciated. However, AFAICS these > machines are affected by the performance issue related to the scale-invariance > when they are running acpi-cpufreq, so what we are doing here is not entirely > sufficient. I have benchmarks running on several Ryzen and EPYC systems with this patch. The full batch of tests won't be done until tomorrow, but in looking at the data so far from an AMD EPYC 7F72 2P server over the past few hours, this patch does provide fairly comparable numbers to Giovanni's patch. There were a few outliers so far but waiting to see with the complete set of results. At the very least it's clear enough already this new patch is at least an improvement over the current 5.11 upstream state with schedutil on AMD. Michael > > It looks like the scale-invariance code should ask the cpufreq driver about > the maximum frequency and note that cpufreq drivers may be changed on the > fly. > > What the patch below does is to add an extra entry to the frequency table for > each CPU to represent the maximum "boost" frequency, so as to cause that > frequency to be used as cpuinfo.max_freq. > > The reason why I think it is better to extend the frequency tables instead > of simply increasing the frequency for the "P0" entry is because the latter > may cause "turbo" frequency to be asked for less often. > > --- > drivers/cpufreq/acpi-cpufreq.c | 107 ++++++++++++++++++++++++++++++++++++----- > 1 file changed, 95 insertions(+), 12 deletions(-) > > Index: linux-pm/drivers/cpufreq/acpi-cpufreq.c > =================================================================== > --- linux-pm.orig/drivers/cpufreq/acpi-cpufreq.c > +++ linux-pm/drivers/cpufreq/acpi-cpufreq.c > @@ -26,6 +26,7 @@ > #include <linux/uaccess.h> > > #include <acpi/processor.h> > +#include <acpi/cppc_acpi.h> > > #include <asm/msr.h> > #include <asm/processor.h> > @@ -53,6 +54,7 @@ struct acpi_cpufreq_data { > unsigned int resume; > unsigned int cpu_feature; > unsigned int acpi_perf_cpu; > + unsigned int first_perf_state; > cpumask_var_t freqdomain_cpus; > void (*cpu_freq_write)(struct acpi_pct_register *reg, u32 val); > u32 (*cpu_freq_read)(struct acpi_pct_register *reg); > @@ -221,10 +223,10 @@ static unsigned extract_msr(struct cpufr > > perf = to_perf_data(data); > > - cpufreq_for_each_entry(pos, policy->freq_table) > + cpufreq_for_each_entry(pos, policy->freq_table + data->first_perf_state) > if (msr == perf->states[pos->driver_data].status) > return pos->frequency; > - return policy->freq_table[0].frequency; > + return policy->freq_table[data->first_perf_state].frequency; > } > > static unsigned extract_freq(struct cpufreq_policy *policy, u32 val) > @@ -363,6 +365,7 @@ static unsigned int get_cur_freq_on_cpu( > struct cpufreq_policy *policy; > unsigned int freq; > unsigned int cached_freq; > + unsigned int state; > > pr_debug("%s (%d)\n", __func__, cpu); > > @@ -374,7 +377,11 @@ static unsigned int get_cur_freq_on_cpu( > if (unlikely(!data || !policy->freq_table)) > return 0; > > - cached_freq = policy->freq_table[to_perf_data(data)->state].frequency; > + state = to_perf_data(data)->state; > + if (state < data->first_perf_state) > + state = data->first_perf_state; > + > + cached_freq = policy->freq_table[state].frequency; > freq = extract_freq(policy, get_cur_val(cpumask_of(cpu), data)); > if (freq != cached_freq) { > /* > @@ -628,16 +635,54 @@ static int acpi_cpufreq_blacklist(struct > } > #endif > > +#ifdef CONFIG_ACPI_CPPC_LIB > +static u64 get_max_boost_ratio(unsigned int cpu) > +{ > + struct cppc_perf_caps perf_caps; > + u64 highest_perf, nominal_perf; > + int ret; > + > + if (acpi_pstate_strict) > + return 0; > + > + ret = cppc_get_perf_caps(cpu, &perf_caps); > + if (ret) { > + pr_debug("CPU%d: Unable to get performance capabilities (%d)\n", > + cpu, ret); > + return 0; > + } > + > + highest_perf = perf_caps.highest_perf; > + nominal_perf = perf_caps.nominal_perf; > + > + if (!highest_perf || !nominal_perf) { > + pr_debug("CPU%d: highest or nominal performance missing\n", cpu); > + return 0; > + } > + > + if (highest_perf < nominal_perf) { > + pr_debug("CPU%d: nominal performance above highest\n", cpu); > + return 0; > + } > + > + return div_u64(highest_perf << SCHED_CAPACITY_SHIFT, nominal_perf); > +} > +#else > +static inline u64 get_max_boost_ratio(unsigned int cpu) { return 0; } > +#endif > + > static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) > { > - unsigned int i; > - unsigned int valid_states = 0; > - unsigned int cpu = policy->cpu; > + struct cpufreq_frequency_table *freq_table; > + struct acpi_processor_performance *perf; > struct acpi_cpufreq_data *data; > + unsigned int cpu = policy->cpu; > + struct cpuinfo_x86 *c = &cpu_data(cpu); > + unsigned int valid_states = 0; > unsigned int result = 0; > - struct cpuinfo_x86 *c = &cpu_data(policy->cpu); > - struct acpi_processor_performance *perf; > - struct cpufreq_frequency_table *freq_table; > + unsigned int state_count; > + u64 max_boost_ratio; > + unsigned int i; > #ifdef CONFIG_SMP > static int blacklisted; > #endif > @@ -750,8 +795,20 @@ static int acpi_cpufreq_cpu_init(struct > goto err_unreg; > } > > - freq_table = kcalloc(perf->state_count + 1, sizeof(*freq_table), > - GFP_KERNEL); > + state_count = perf->state_count + 1; > + > + max_boost_ratio = get_max_boost_ratio(cpu); > + if (max_boost_ratio) { > + /* > + * Make a room for one more entry to represent the highest > + * available "boost" frequency. > + */ > + state_count++; > + valid_states++; > + data->first_perf_state = valid_states; > + } > + > + freq_table = kcalloc(state_count, sizeof(*freq_table), GFP_KERNEL); > if (!freq_table) { > result = -ENOMEM; > goto err_unreg; > @@ -785,6 +842,30 @@ static int acpi_cpufreq_cpu_init(struct > valid_states++; > } > freq_table[valid_states].frequency = CPUFREQ_TABLE_END; > + > + if (max_boost_ratio) { > + unsigned int state = data->first_perf_state; > + unsigned int freq = freq_table[state].frequency; > + > + /* > + * Because the loop above sorts the freq_table entries in the > + * descending order, freq is the maximum frequency in the table. > + * Assume that it corresponds to the CPPC nominal frequency and > + * use it to populate the frequency field of the extra "boost" > + * frequency entry. > + */ > + freq_table[0].frequency = freq * max_boost_ratio >> SCHED_CAPACITY_SHIFT; > + /* > + * The purpose of the extra "boost" frequency entry is to make > + * the rest of cpufreq aware of the real maximum frequency, but > + * the way to request it is the same as for the first_perf_state > + * entry that is expected to cover the entire range of "boost" > + * frequencies of the CPU, so copy the driver_data value from > + * that entry. > + */ > + freq_table[0].driver_data = freq_table[state].driver_data; > + } > + > policy->freq_table = freq_table; > perf->state = 0; > > @@ -858,8 +939,10 @@ static void acpi_cpufreq_cpu_ready(struc > { > struct acpi_processor_performance *perf = per_cpu_ptr(acpi_perf_data, > policy->cpu); > + struct acpi_cpufreq_data *data = policy->driver_data; > + unsigned int freq = policy->freq_table[data->first_perf_state].frequency; > > - if (perf->states[0].core_frequency * 1000 != policy->cpuinfo.max_freq) > + if (perf->states[0].core_frequency * 1000 != freq) > pr_warn(FW_WARN "P-state 0 is not max freq\n"); > } > > > >
On Thu, Feb 4, 2021 at 12:36 AM Michael Larabel <Michael@phoronix.com> wrote: > > On 2/3/21 12:25 PM, Rafael J. Wysocki wrote: > > On Wednesday, February 3, 2021 3:11:37 PM CET Rafael J. Wysocki wrote: > >> On Wed, Feb 3, 2021 at 2:53 PM Giovanni Gherdovich <ggherdovich@suse.cz> wrote: > >> [cut] > >> > >>> Fixes: 41ea667227ba ("x86, sched: Calculate frequency invariance for AMD systems") > >>> Fixes: 976df7e5730e ("x86, sched: Use midpoint of max_boost and max_P for frequency invariance on AMD EPYC") > >>> Reported-by: Michael Larabel <Michael@phoronix.com> > >>> Tested-by: Michael Larabel <Michael@phoronix.com> > >>> Signed-off-by: Giovanni Gherdovich <ggherdovich@suse.cz> > >>> --- > >>> drivers/cpufreq/acpi-cpufreq.c | 61 ++++++++++++++++++++++++++++++-- > >>> drivers/cpufreq/cpufreq.c | 3 ++ > >>> include/linux/cpufreq.h | 5 +++ > >>> kernel/sched/cpufreq_schedutil.c | 8 +++-- > >> I don't really think that it is necessary to modify schedutil to > >> address this issue. > > So below is a prototype of an alternative fix for the issue at hand. > > > > I can't really test it here, because there's no _CPC in the ACPI tables of my > > test machines, so testing it would be appreciated. However, AFAICS these > > machines are affected by the performance issue related to the scale-invariance > > when they are running acpi-cpufreq, so what we are doing here is not entirely > > sufficient. > > > I have benchmarks running on several Ryzen and EPYC systems with this > patch. The full batch of tests won't be done until tomorrow, but in > looking at the data so far from an AMD EPYC 7F72 2P server over the past > few hours, this patch does provide fairly comparable numbers to > Giovanni's patch. There were a few outliers so far but waiting to see > with the complete set of results. At the very least it's clear enough > already this new patch is at least an improvement over the current 5.11 > upstream state with schedutil on AMD. Thanks for the feedback, much appreciated! Let me submit the patch properly, then.
On Wed, 2021-02-03 at 19:25 +0100, Rafael J. Wysocki wrote: > [cut] > > So below is a prototype of an alternative fix for the issue at hand. > > I can't really test it here, because there's no _CPC in the ACPI tables of my > test machines, so testing it would be appreciated. However, AFAICS these > machines are affected by the performance issue related to the scale-invariance > when they are running acpi-cpufreq, so what we are doing here is not entirely > sufficient. > > It looks like the scale-invariance code should ask the cpufreq driver about > the maximum frequency and note that cpufreq drivers may be changed on the > fly. > > What the patch below does is to add an extra entry to the frequency table for > each CPU to represent the maximum "boost" frequency, so as to cause that > frequency to be used as cpuinfo.max_freq. > > The reason why I think it is better to extend the frequency tables instead > of simply increasing the frequency for the "P0" entry is because the latter > may cause "turbo" frequency to be asked for less often. > > --- > drivers/cpufreq/acpi-cpufreq.c | 107 ++++++++++++++++++++++++++++++++++++----- > 1 file changed, 95 insertions(+), 12 deletions(-) Hello Rafael, thanks for looking at this. Your patch is indeed cleaner than the one I proposed. Preliminary testing is favorable; more tests are running. Results from your patch are in the fourth column below; the performance from v5.10 looks restored. I'll follow up once the tests I queued are completed. Giovanni TEST : Intel Open Image Denoise, www.openimagedenoise.org INVOCATION : ./denoise -hdr memorial.pfm -out out.pfm -bench 200 -threads $NTHREADS CPU : MODEL : 2x AMD EPYC 7742 FREQUENCY TABLE : P2: 1.50 GHz P1: 2.00 GHz P0: 2.25 GHz MAX BOOST : 3.40 GHz Results: threads, msecs (ratio). Lower is better. v5.10 v5.11-rc4 v5.11-rc4-ggherdov v5.11-rc6-rafael ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 1 1069.85 (1.00) 1071.84 (1.00) 1070.42 (1.00) 1069.12 (1.00) 2 542.24 (1.00) 544.40 (1.00) 544.48 (1.00) 540.81 (1.00) 4 278.00 (1.00) 278.44 (1.00) 277.72 (1.00) 277.79 (1.00) 8 149.81 (1.00) 149.61 (1.00) 149.87 (1.00) 149.51 (1.00) 16 79.01 (1.00) 79.31 (1.00) 78.94 (1.00) 79.02 (1.00) 24 58.01 (1.00) 58.51 (1.01) 58.15 (1.00) 57.84 (1.00) 32 46.58 (1.00) 48.30 (1.04) 46.66 (1.00) 46.70 (1.00) 48 37.29 (1.00) 51.29 (1.38) 37.27 (1.00) 38.10 (1.02) 64 34.01 (1.00) 49.59 (1.46) 33.71 (0.99) 34.51 (1.01) 80 31.09 (1.00) 44.27 (1.42) 31.33 (1.01) 31.11 (1.00) 96 28.56 (1.00) 40.82 (1.43) 28.47 (1.00) 28.65 (1.00) 112 28.09 (1.00) 40.06 (1.43) 28.63 (1.02) 28.38 (1.01) 120 28.73 (1.00) 39.78 (1.38) 28.14 (0.98) 28.16 (0.98) 128 28.93 (1.00) 39.60 (1.37) 29.38 (1.02) 28.55 (0.99)
On Thu, Feb 4, 2021 at 2:49 PM Giovanni Gherdovich <ggherdovich@suse.cz> wrote: > > On Wed, 2021-02-03 at 19:25 +0100, Rafael J. Wysocki wrote: > > [cut] > > > > So below is a prototype of an alternative fix for the issue at hand. > > > > I can't really test it here, because there's no _CPC in the ACPI tables of my > > test machines, so testing it would be appreciated. However, AFAICS these > > machines are affected by the performance issue related to the scale-invariance > > when they are running acpi-cpufreq, so what we are doing here is not entirely > > sufficient. > > > > It looks like the scale-invariance code should ask the cpufreq driver about > > the maximum frequency and note that cpufreq drivers may be changed on the > > fly. > > > > What the patch below does is to add an extra entry to the frequency table for > > each CPU to represent the maximum "boost" frequency, so as to cause that > > frequency to be used as cpuinfo.max_freq. > > > > The reason why I think it is better to extend the frequency tables instead > > of simply increasing the frequency for the "P0" entry is because the latter > > may cause "turbo" frequency to be asked for less often. > > > > --- > > drivers/cpufreq/acpi-cpufreq.c | 107 ++++++++++++++++++++++++++++++++++++----- > > 1 file changed, 95 insertions(+), 12 deletions(-) > > Hello Rafael, > > thanks for looking at this. Your patch is indeed cleaner than the one I proposed. > > Preliminary testing is favorable; more tests are running. > > Results from your patch are in the fourth column below; the performance from > v5.10 looks restored. > > I'll follow up once the tests I queued are completed. Thank you! > TEST : Intel Open Image Denoise, www.openimagedenoise.org > INVOCATION : ./denoise -hdr memorial.pfm -out out.pfm -bench 200 -threads $NTHREADS > CPU : MODEL : 2x AMD EPYC 7742 > FREQUENCY TABLE : P2: 1.50 GHz > P1: 2.00 GHz > P0: 2.25 GHz > MAX BOOST : 3.40 GHz > > Results: threads, msecs (ratio). Lower is better. > > v5.10 v5.11-rc4 v5.11-rc4-ggherdov v5.11-rc6-rafael > ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ > 1 1069.85 (1.00) 1071.84 (1.00) 1070.42 (1.00) 1069.12 (1.00) > 2 542.24 (1.00) 544.40 (1.00) 544.48 (1.00) 540.81 (1.00) > 4 278.00 (1.00) 278.44 (1.00) 277.72 (1.00) 277.79 (1.00) > 8 149.81 (1.00) 149.61 (1.00) 149.87 (1.00) 149.51 (1.00) > 16 79.01 (1.00) 79.31 (1.00) 78.94 (1.00) 79.02 (1.00) > 24 58.01 (1.00) 58.51 (1.01) 58.15 (1.00) 57.84 (1.00) > 32 46.58 (1.00) 48.30 (1.04) 46.66 (1.00) 46.70 (1.00) > 48 37.29 (1.00) 51.29 (1.38) 37.27 (1.00) 38.10 (1.02) > 64 34.01 (1.00) 49.59 (1.46) 33.71 (0.99) 34.51 (1.01) > 80 31.09 (1.00) 44.27 (1.42) 31.33 (1.01) 31.11 (1.00) > 96 28.56 (1.00) 40.82 (1.43) 28.47 (1.00) 28.65 (1.00) > 112 28.09 (1.00) 40.06 (1.43) 28.63 (1.02) 28.38 (1.01) > 120 28.73 (1.00) 39.78 (1.38) 28.14 (0.98) 28.16 (0.98) > 128 28.93 (1.00) 39.60 (1.37) 29.38 (1.02) 28.55 (0.99)
On 2/4/21 7:40 AM, Rafael J. Wysocki wrote: > On Thu, Feb 4, 2021 at 12:36 AM Michael Larabel <Michael@phoronix.com> wrote: >> On 2/3/21 12:25 PM, Rafael J. Wysocki wrote: >>> On Wednesday, February 3, 2021 3:11:37 PM CET Rafael J. Wysocki wrote: >>>> On Wed, Feb 3, 2021 at 2:53 PM Giovanni Gherdovich <ggherdovich@suse.cz> wrote: >>>> [cut] >>>> >>>>> Fixes: 41ea667227ba ("x86, sched: Calculate frequency invariance for AMD systems") >>>>> Fixes: 976df7e5730e ("x86, sched: Use midpoint of max_boost and max_P for frequency invariance on AMD EPYC") >>>>> Reported-by: Michael Larabel <Michael@phoronix.com> >>>>> Tested-by: Michael Larabel <Michael@phoronix.com> >>>>> Signed-off-by: Giovanni Gherdovich <ggherdovich@suse.cz> >>>>> --- >>>>> drivers/cpufreq/acpi-cpufreq.c | 61 ++++++++++++++++++++++++++++++-- >>>>> drivers/cpufreq/cpufreq.c | 3 ++ >>>>> include/linux/cpufreq.h | 5 +++ >>>>> kernel/sched/cpufreq_schedutil.c | 8 +++-- >>>> I don't really think that it is necessary to modify schedutil to >>>> address this issue. >>> So below is a prototype of an alternative fix for the issue at hand. >>> >>> I can't really test it here, because there's no _CPC in the ACPI tables of my >>> test machines, so testing it would be appreciated. However, AFAICS these >>> machines are affected by the performance issue related to the scale-invariance >>> when they are running acpi-cpufreq, so what we are doing here is not entirely >>> sufficient. >> >> I have benchmarks running on several Ryzen and EPYC systems with this >> patch. The full batch of tests won't be done until tomorrow, but in >> looking at the data so far from an AMD EPYC 7F72 2P server over the past >> few hours, this patch does provide fairly comparable numbers to >> Giovanni's patch. There were a few outliers so far but waiting to see >> with the complete set of results. At the very least it's clear enough >> already this new patch is at least an improvement over the current 5.11 >> upstream state with schedutil on AMD. > Thanks for the feedback, much appreciated! > > Let me submit the patch properly, then. Everything continues looking good in running this patch on a variety of AMD Zen2/Zen3 systems. As Giovanni has been focusing on EPYC testing, I have been running several Ryzen laptops/desktop for more exposure and there it's looking very good. On a Ryzen 9 5900X[1] when looking at this latest patch against current 5.11 Git and 5.10, the performance is recovered and in some cases better off now than 5.10 with Schedutil. No anomalies there and with other Zen 2/3 desktops and Zen 2 notebook the performance relative to 5.10 is comparable or in some cases better while all indications point to the 5.11 regression being addressed. Some of the slower systems still finishing up but no unexpected results yet and likely just redundant testing at this point. Tests on EPYC hardware has also been looking good. With some 44 tests on an EPYC 7F72 2P setup[2] when taking the geometric mean of all the data finding it rightly in line with the prior patch from Giovanni. EPYC 7702 and EPYC 7F52 1P performance similarly showing no regression any longer with the patched kernel. This patch also seemed to help CPUFreq ondemand performance improve as well in some cases. Will advise if hitting anything unexpected with the remainder of the testing but all is looking solid at this point and a definite improvement over the current 5.11 Git state. Tested-by: Michael Larabel <Michael@phoronix.com> [1] https://openbenchmarking.org/result/2102048-PTS-RYZEN95920 (5.10 stable vs. 5.11 Git vs. patched.) [2] https://openbenchmarking.org/result/2102048-HA-AMDEPYC7F37 (Giovanni's earlier patch against this new patch, compared to unpatched Linux 5.11 Git and then Linux 5.11 with CPUfreq performance governor.) Michael
On Fri, Feb 5, 2021 at 12:04 AM Michael Larabel <Michael@phoronix.com> wrote: > > On 2/4/21 7:40 AM, Rafael J. Wysocki wrote: > > On Thu, Feb 4, 2021 at 12:36 AM Michael Larabel <Michael@phoronix.com> wrote: > >> On 2/3/21 12:25 PM, Rafael J. Wysocki wrote: > >>> On Wednesday, February 3, 2021 3:11:37 PM CET Rafael J. Wysocki wrote: > >>>> On Wed, Feb 3, 2021 at 2:53 PM Giovanni Gherdovich <ggherdovich@suse.cz> wrote: > >>>> [cut] > >>>> > >>>>> Fixes: 41ea667227ba ("x86, sched: Calculate frequency invariance for AMD systems") > >>>>> Fixes: 976df7e5730e ("x86, sched: Use midpoint of max_boost and max_P for frequency invariance on AMD EPYC") > >>>>> Reported-by: Michael Larabel <Michael@phoronix.com> > >>>>> Tested-by: Michael Larabel <Michael@phoronix.com> > >>>>> Signed-off-by: Giovanni Gherdovich <ggherdovich@suse.cz> > >>>>> --- > >>>>> drivers/cpufreq/acpi-cpufreq.c | 61 ++++++++++++++++++++++++++++++-- > >>>>> drivers/cpufreq/cpufreq.c | 3 ++ > >>>>> include/linux/cpufreq.h | 5 +++ > >>>>> kernel/sched/cpufreq_schedutil.c | 8 +++-- > >>>> I don't really think that it is necessary to modify schedutil to > >>>> address this issue. > >>> So below is a prototype of an alternative fix for the issue at hand. > >>> > >>> I can't really test it here, because there's no _CPC in the ACPI tables of my > >>> test machines, so testing it would be appreciated. However, AFAICS these > >>> machines are affected by the performance issue related to the scale-invariance > >>> when they are running acpi-cpufreq, so what we are doing here is not entirely > >>> sufficient. > >> > >> I have benchmarks running on several Ryzen and EPYC systems with this > >> patch. The full batch of tests won't be done until tomorrow, but in > >> looking at the data so far from an AMD EPYC 7F72 2P server over the past > >> few hours, this patch does provide fairly comparable numbers to > >> Giovanni's patch. There were a few outliers so far but waiting to see > >> with the complete set of results. At the very least it's clear enough > >> already this new patch is at least an improvement over the current 5.11 > >> upstream state with schedutil on AMD. > > Thanks for the feedback, much appreciated! > > > > Let me submit the patch properly, then. > > > Everything continues looking good in running this patch on a variety of > AMD Zen2/Zen3 systems. > > As Giovanni has been focusing on EPYC testing, I have been running > several Ryzen laptops/desktop for more exposure and there it's looking > very good. On a Ryzen 9 5900X[1] when looking at this latest patch > against current 5.11 Git and 5.10, the performance is recovered and in > some cases better off now than 5.10 with Schedutil. No anomalies there > and with other Zen 2/3 desktops and Zen 2 notebook the performance > relative to 5.10 is comparable or in some cases better while all > indications point to the 5.11 regression being addressed. Some of the > slower systems still finishing up but no unexpected results yet and > likely just redundant testing at this point. > > Tests on EPYC hardware has also been looking good. With some 44 tests on > an EPYC 7F72 2P setup[2] when taking the geometric mean of all the data > finding it rightly in line with the prior patch from Giovanni. EPYC 7702 > and EPYC 7F52 1P performance similarly showing no regression any longer > with the patched kernel. This patch also seemed to help CPUFreq ondemand > performance improve as well in some cases. > > Will advise if hitting anything unexpected with the remainder of the > testing but all is looking solid at this point and a definite > improvement over the current 5.11 Git state. > > Tested-by: Michael Larabel <Michael@phoronix.com> Thank you for all of the verification work, much appreciated! > [1] https://openbenchmarking.org/result/2102048-PTS-RYZEN95920 (5.10 > stable vs. 5.11 Git vs. patched.) > [2] https://openbenchmarking.org/result/2102048-HA-AMDEPYC7F37 > (Giovanni's earlier patch against this new patch, compared to unpatched > Linux 5.11 Git and then Linux 5.11 with CPUfreq performance governor.)
diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index 1e4fbb002a31..a5facc6cad16 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -26,6 +26,7 @@ #include <linux/uaccess.h> #include <acpi/processor.h> +#include <acpi/cppc_acpi.h> #include <asm/msr.h> #include <asm/processor.h> @@ -628,11 +629,57 @@ static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c) } #endif +#ifdef CONFIG_ACPI_CPPC_LIB +static bool amd_max_boost(unsigned int max_freq, + unsigned int *max_boost) +{ + struct cppc_perf_caps perf_caps; + u64 highest_perf, nominal_perf, perf_ratio; + int ret; + + ret = cppc_get_perf_caps(0, &perf_caps); + if (ret) { + pr_debug("Could not retrieve perf counters (%d)\n", ret); + return false; + } + + highest_perf = perf_caps.highest_perf; + nominal_perf = perf_caps.nominal_perf; + + if (!highest_perf || !nominal_perf) { + pr_debug("Could not retrieve highest or nominal performance\n"); + return false; + } + + perf_ratio = div_u64(highest_perf * SCHED_CAPACITY_SCALE, nominal_perf); + if (perf_ratio <= SCHED_CAPACITY_SCALE) { + pr_debug("Either perf_ratio is 0, or nominal >= highest performance\n"); + return false; + } + + *max_boost = max_freq * perf_ratio >> SCHED_CAPACITY_SHIFT; + if (!*max_boost) { + pr_debug("max_boost seems to be zero\n"); + return false; + } + + return true; +} +#else +static bool amd_max_boost(unsigned int max_freq, + unsigned int *max_boost) +{ + return false; +} +#endif + static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) { unsigned int i; unsigned int valid_states = 0; unsigned int cpu = policy->cpu; + unsigned int freq, max_freq = 0; + unsigned int max_boost; struct acpi_cpufreq_data *data; unsigned int result = 0; struct cpuinfo_x86 *c = &cpu_data(policy->cpu); @@ -779,15 +826,25 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) freq_table[valid_states-1].frequency / 1000) continue; + freq = perf->states[i].core_frequency * 1000; freq_table[valid_states].driver_data = i; - freq_table[valid_states].frequency = - perf->states[i].core_frequency * 1000; + freq_table[valid_states].frequency = freq; + + if (freq > max_freq) + max_freq = freq; + valid_states++; } freq_table[valid_states].frequency = CPUFREQ_TABLE_END; policy->freq_table = freq_table; perf->state = 0; + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && + amd_max_boost(max_freq, &max_boost)) { + policy->cpuinfo.max_boost = max_boost; + static_branch_enable(&cpufreq_amd_max_boost); + } + switch (perf->control_register.space_id) { case ACPI_ADR_SPACE_SYSTEM_IO: /* diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index d0a3525ce27f..b96677f6b57e 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2721,6 +2721,9 @@ int cpufreq_boost_enabled(void) } EXPORT_SYMBOL_GPL(cpufreq_boost_enabled); +DEFINE_STATIC_KEY_FALSE(cpufreq_amd_max_boost); +EXPORT_SYMBOL_GPL(cpufreq_amd_max_boost); + /********************************************************************* * REGISTER / UNREGISTER CPUFREQ DRIVER * *********************************************************************/ diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 9c8b7437b6cd..341cac76d254 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -40,9 +40,14 @@ enum cpufreq_table_sorting { CPUFREQ_TABLE_SORTED_DESCENDING }; +DECLARE_STATIC_KEY_FALSE(cpufreq_amd_max_boost); + +#define cpufreq_driver_has_max_boost() static_branch_unlikely(&cpufreq_amd_max_boost) + struct cpufreq_cpuinfo { unsigned int max_freq; unsigned int min_freq; + unsigned int max_boost; /* in 10^(-9) s = nanoseconds */ unsigned int transition_latency; diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 6931f0cdeb80..541f3db3f576 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -159,8 +159,12 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, unsigned long util, unsigned long max) { struct cpufreq_policy *policy = sg_policy->policy; - unsigned int freq = arch_scale_freq_invariant() ? - policy->cpuinfo.max_freq : policy->cur; + unsigned int freq, max_freq; + + max_freq = cpufreq_driver_has_max_boost() ? + policy->cpuinfo.max_boost : policy->cpuinfo.max_freq; + + freq = arch_scale_freq_invariant() ? max_freq : policy->cur; freq = map_util_freq(util, freq, max);