Message ID: 53962072.6010702@semaphore.gr
State: Not Applicable, archived
On 06/09/2014 02:00 PM, Stratos Karafotis wrote:
> Add stats file in debugfs under the driver's parent directory
> (pstate_snb) which counts the time in nsecs per requested
> P-state and the number of times the specific state
> was requested.
>
> The file presents the statistics per logical CPU in the
> following format. The time is displayed in msecs:
>

NAK

This adds significantly to the memory footprint to gather information
that is available by post-processing the perf tracepoint information.
The increase isn't horrible on single-socket desktop processor machines,
but it gets big with server-class machines. One vendor I have talked to
considers a machine with 1024 CPUs to be a SMALL machine.

> CPU0
> P-state        Time     Count
>      16     4882777     23632
>      17       21210       174
>      18      549781      3300
>      19       51171       461
>      20       35487       394
>      21       18173       219
>      22       13752       258
>      23        6048       172
>      24        7754       177
>      25        4587       151
>      26        5465       162
>      27        1432        47
>      28         863        54
>      29        1448        50
>      30        1030        47
>      31        1472        62
>      32        2221        68
>      33        1869        60
>      34        2140        70
>      39       85446      3803
>
> ...
>
> The file can be used for debugging but also for monitoring
> various system workloads.
>
> Also, make debugfs_parent local, as we never remove the
> driver's debugfs files.
>
> Signed-off-by: Stratos Karafotis <stratosk@semaphore.gr>
> ---
>  drivers/cpufreq/intel_pstate.c | 80 +++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 79 insertions(+), 1 deletion(-)
On 10/06/2014 06:47 PM, Dirk Brandewie wrote:
> On 06/09/2014 02:00 PM, Stratos Karafotis wrote:
>> Add stats file in debugfs under the driver's parent directory
>> (pstate_snb) which counts the time in nsecs per requested
>> P-state and the number of times the specific state
>> was requested.
>>
>> The file presents the statistics per logical CPU in the
>> following format. The time is displayed in msecs:
>>
>
> NAK
>
> This adds significantly to the memory footprint to gather information
> that is available by post-processing the perf tracepoint information.
> The increase isn't horrible on single-socket desktop processor machines,
> but it gets big with server-class machines. One vendor I have talked to
> considers a machine with 1024 CPUs to be a SMALL machine.
>

If I am not wrong, the sizeof of struct pstate_stat is 20B. On my CPU
with 20 P-states, we need 400B per logical CPU (3200B total on my
desktop), plus 64B for the stats pointers.

In your example this would need about 400KB - 500KB?
Is it too much for a 1024-CPU system?

I think it's a useful piece of info that we can have directly, without
post-processing tracepoints.
Is it acceptable to conditionally compile it with a new CONFIG option?

Thanks,
Stratos
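A quick note on the arithmetic above: on x86_64 the two u64 members force 8-byte alignment, so struct pstate_stat is padded to 24 bytes rather than 20, making the per-CPU cost slightly higher than estimated. A minimal userspace sketch of the check (illustrative only, not from the thread; uint64_t stands in for the kernel's u64):

#include <stdint.h>
#include <stdio.h>

/* Mirror of the patch's struct pstate_stat, with fixed-width types
 * standing in for the kernel's u64. */
struct pstate_stat {
	int pstate;
	uint64_t time;
	uint64_t count;
};

int main(void)
{
	size_t entries = 20;	/* the 20 P-states from the quoted estimate */

	/* On x86_64 the int is padded out to 8 bytes so the following
	 * u64 stays aligned: sizeof reports 24, not 20. */
	printf("sizeof(struct pstate_stat) = %zu\n",
	       sizeof(struct pstate_stat));
	printf("per-CPU stats array        = %zu bytes\n",
	       entries * sizeof(struct pstate_stat));
	return 0;
}

With 24-byte entries, 20 entries per CPU come to 480 bytes, so the 1024-CPU estimate still lands in the same few-hundred-KB range.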
On 06/10/2014 09:21 AM, Stratos Karafotis wrote:
> On 10/06/2014 06:47 PM, Dirk Brandewie wrote:
>> NAK
>>
>> This adds significantly to the memory footprint to gather information
>> that is available by post-processing the perf tracepoint information.
>> The increase isn't horrible on single-socket desktop processor machines,
>> but it gets big with server-class machines. One vendor I have talked to
>> considers a machine with 1024 CPUs to be a SMALL machine.
>>
>
> If I am not wrong, the sizeof of struct pstate_stat is 20B. On my CPU
> with 20 P-states, we need 400B per logical CPU (3200B total on my
> desktop), plus 64B for the stats pointers.
>
> In your example this would need about 400KB - 500KB?
> Is it too much for a 1024-CPU system?

For something that will likely not be used, IMO yes.

>
> I think it's a useful piece of info that we can have directly, without
> post-processing tracepoints.
> Is it acceptable to conditionally compile it with a new CONFIG option?

I can see where the information could be useful, but the set of people
that would find it useful is very small. Having information about
residency since boot is interesting, but just barely. This file will
encourage people to build tools/scripts that rely on it, and they will
complain bitterly if/when it changes or goes away, so you would be
creating a de facto ABI in debugfs.

This functionality will *not* be supportable on upcoming processors
where HWP is being used. See section 14.4 of the current SDM vol. 3:
http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-system-programming-manual-325384.pdf
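For context, the post-processing Dirk refers to can live entirely in userspace. A minimal aggregator sketch (illustrative, not from the thread): it rebuilds the same per-P-state Time/Count table from "pstate duration_us" pairs on stdin, which are assumed to have been extracted beforehand from the driver's power:pstate_sample tracepoint via perf script (the exact trace field layout varies by kernel version):

#include <stdint.h>
#include <stdio.h>

#define MAX_PSTATE 64	/* hardware P-state numbers are small integers */

int main(void)
{
	uint64_t time_us[MAX_PSTATE] = { 0 };
	uint64_t count[MAX_PSTATE] = { 0 };
	unsigned int pstate, duration;

	/* Accumulate one (pstate, duration_us) sample per input line. */
	while (scanf("%u %u", &pstate, &duration) == 2) {
		if (pstate >= MAX_PSTATE)
			continue;
		time_us[pstate] += duration;
		count[pstate]++;
	}

	/* Print the same table the patch exposes, time in msecs. */
	printf("P-state        Time     Count\n");
	for (pstate = 0; pstate < MAX_PSTATE; pstate++)
		if (count[pstate])
			printf("%7u %11llu %9llu\n", pstate,
			       (unsigned long long)(time_us[pstate] / 1000),
			       (unsigned long long)count[pstate]);
	return 0;
}

This keeps the kernel footprint at zero and only pays the cost while a trace is actually being recorded, which is the core of the NAK.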
On 10/06/2014 08:05 PM, Dirk Brandewie wrote:
> On 06/10/2014 09:21 AM, Stratos Karafotis wrote:
>> I think it's a useful piece of info that we can have directly, without
>> post-processing tracepoints.
>> Is it acceptable to conditionally compile it with a new CONFIG option?
>
> I can see where the information could be useful, but the set of people
> that would find it useful is very small. Having information about
> residency since boot is interesting, but just barely. This file will
> encourage people to build tools/scripts that rely on it, and they will
> complain bitterly if/when it changes or goes away, so you would be
> creating a de facto ABI in debugfs.
>
> This functionality will *not* be supportable on upcoming processors
> where HWP is being used. See section 14.4 of the current SDM vol. 3:
> http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-system-programming-manual-325384.pdf
>

I will drop this patch in v2.

Thanks a lot for your comments and your time!

Stratos
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 31e2ae5..3a49269 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -86,6 +86,12 @@ struct _pid {
 	int32_t last_err;
 };
 
+struct pstate_stat {
+	int pstate;
+	u64 time;
+	u64 count;
+};
+
 struct cpudata {
 	int cpu;
 
@@ -99,6 +105,7 @@ struct cpudata {
 	u64	prev_aperf;
 	u64	prev_mperf;
 	struct sample sample;
+	struct pstate_stat *stats;
 };
 
 static struct cpudata **all_cpu_data;
@@ -256,9 +263,59 @@ static struct pid_param pid_files[] = {
 	{NULL, NULL}
 };
 
-static struct dentry *debugfs_parent;
+static inline unsigned int stats_state_index(struct cpudata *cpu, int pstate)
+{
+	if (pstate <= cpu->pstate.max_pstate)
+		return pstate - cpu->pstate.min_pstate;
+	else
+		return cpu->pstate.max_pstate - cpu->pstate.min_pstate + 1;
+}
+
+static int stats_debug_show(struct seq_file *m, void *unused)
+{
+	struct cpudata *cpu;
+	int i, j, cnt;
+
+	get_online_cpus();
+	for_each_online_cpu(i) {
+		if (all_cpu_data[i])
+			cpu = all_cpu_data[i];
+		else
+			continue;
+
+		seq_printf(m, "CPU%u\n", i);
+		seq_puts(m, "P-state Time Count\n");
+
+		cnt = cpu->pstate.max_pstate - cpu->pstate.min_pstate + 2;
+		for (j = 0; j < cnt; j++)
+			seq_printf(m, "%7u %11llu %9llu\n",
+				   cpu->stats[j].pstate,
+				   cpu->stats[j].time / USEC_PER_MSEC,
+				   cpu->stats[j].count);
+
+		seq_puts(m, "\n");
+	}
+	put_online_cpus();
+
+	return 0;
+}
+
+static int stats_debug_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, stats_debug_show, inode->i_private);
+}
+
+static const struct file_operations fops_stats_pstate = {
+	.open		= stats_debug_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.owner		= THIS_MODULE,
+};
+
 static void intel_pstate_debug_expose_params(void)
 {
+	struct dentry *debugfs_parent;
 	int i = 0;
 
 	debugfs_parent = debugfs_create_dir("pstate_snb", NULL);
@@ -270,6 +327,8 @@ static void intel_pstate_debug_expose_params(void)
 				    &fops_pid_param);
 		i++;
 	}
+	debugfs_create_file("stats", S_IRUSR | S_IRGRP, debugfs_parent, NULL,
+			    &fops_stats_pstate);
 }
 
 /************************** debugfs end ************************/
@@ -610,6 +669,7 @@ static inline void intel_pstate_calc_scaled_busy(struct cpudata *cpu)
 	int32_t core_busy, max_pstate, current_pstate, sample_ratio;
 	u32 duration_us;
 	u32 sample_time;
+	unsigned int i;
 
 	core_busy = cpu->sample.core_pct_busy;
 	max_pstate = int_tofp(cpu->pstate.max_pstate);
@@ -626,6 +686,10 @@ static inline void intel_pstate_calc_scaled_busy(struct cpudata *cpu)
 	}
 
 	cpu->sample.busy_scaled = core_busy;
+
+	i = stats_state_index(cpu, cpu->pstate.current_pstate);
+	cpu->stats[i].time += duration_us;
+	cpu->stats[i].count++;
 }
 
 static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu)
@@ -692,6 +756,7 @@ MODULE_DEVICE_TABLE(x86cpu, intel_pstate_cpu_ids);
 static int intel_pstate_init_cpu(unsigned int cpunum)
 {
 	struct cpudata *cpu;
+	unsigned int i, cnt;
 
 	all_cpu_data[cpunum] = kzalloc(sizeof(struct cpudata), GFP_KERNEL);
 	if (!all_cpu_data[cpunum])
@@ -701,6 +766,17 @@ static int intel_pstate_init_cpu(unsigned int cpunum)
 
 	intel_pstate_get_cpu_pstates(cpu);
 
+	/* cnt equals the number of P-states + 1 (for the turbo P-state) */
+	cnt = cpu->pstate.max_pstate - cpu->pstate.min_pstate + 2;
+	cpu->stats = kzalloc(sizeof(*cpu->stats) * cnt, GFP_KERNEL);
+	if (!cpu->stats) {
+		kfree(all_cpu_data[cpunum]);
+		return -ENOMEM;
+	}
+	for (i = 0; i < cnt - 1; i++)
+		cpu->stats[i].pstate = cpu->pstate.min_pstate + i;
+	cpu->stats[cnt - 1].pstate = cpu->pstate.turbo_pstate;
+
 	cpu->cpu = cpunum;
 
 	init_timer_deferrable(&cpu->timer);
@@ -779,6 +855,7 @@ static void intel_pstate_stop_cpu(struct cpufreq_policy *policy)
 
 	del_timer_sync(&all_cpu_data[cpu_num]->timer);
 	intel_pstate_set_pstate(cpu, cpu->pstate.min_pstate);
+	kfree(all_cpu_data[cpu_num]->stats);
 	kfree(all_cpu_data[cpu_num]);
 	all_cpu_data[cpu_num] = NULL;
 }
@@ -980,6 +1057,7 @@ out:
 	for_each_online_cpu(cpu) {
 		if (all_cpu_data[cpu]) {
 			del_timer_sync(&all_cpu_data[cpu]->timer);
+			kfree(all_cpu_data[cpu]->stats);
 			kfree(all_cpu_data[cpu]);
 		}
 	}
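To make the indexing in stats_state_index() concrete: with the example CPU from the commit message (min_pstate 16, max_pstate 34, turbo_pstate 39), the non-turbo states occupy slots 0..18 and every turbo request is folded into one extra slot, which is why the example table ends with a single row for P-state 39. A standalone sketch of the mapping (the P-state values are assumptions taken from that example, not part of the patch):

#include <stdio.h>

/* Example CPU from the commit message. */
#define MIN_PSTATE	16
#define MAX_PSTATE	34

/* Same logic as the patch's stats_state_index(), without the cpudata
 * plumbing: cnt = MAX - MIN + 2 slots, the last one shared by all
 * turbo P-states. */
static unsigned int stats_state_index(int pstate)
{
	if (pstate <= MAX_PSTATE)
		return pstate - MIN_PSTATE;
	return MAX_PSTATE - MIN_PSTATE + 1;
}

int main(void)
{
	printf("pstate 16 -> slot %u\n", stats_state_index(16));	/* 0  */
	printf("pstate 34 -> slot %u\n", stats_state_index(34));	/* 18 */
	printf("pstate 39 -> slot %u\n", stats_state_index(39));	/* 19 */
	return 0;
}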
Add stats file in debugfs under the driver's parent directory
(pstate_snb) which counts the time in nsecs per requested
P-state and the number of times the specific state
was requested.

The file presents the statistics per logical CPU in the
following format. The time is displayed in msecs:

CPU0
P-state        Time     Count
     16     4882777     23632
     17       21210       174
     18      549781      3300
     19       51171       461
     20       35487       394
     21       18173       219
     22       13752       258
     23        6048       172
     24        7754       177
     25        4587       151
     26        5465       162
     27        1432        47
     28         863        54
     29        1448        50
     30        1030        47
     31        1472        62
     32        2221        68
     33        1869        60
     34        2140        70
     39       85446      3803

...

The file can be used for debugging but also for monitoring
various system workloads.

Also, make debugfs_parent local, as we never remove the
driver's debugfs files.

Signed-off-by: Stratos Karafotis <stratosk@semaphore.gr>
---
 drivers/cpufreq/intel_pstate.c | 80 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 79 insertions(+), 1 deletion(-)
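For completeness, reading the proposed file needs nothing more than opening it. A minimal sketch (the path assumes debugfs mounted at /sys/kernel/debug and the pstate_snb directory this patch creates):

#include <stdio.h>

int main(void)
{
	char buf[4096];
	size_t n;
	/* "pstate_snb" and "stats" come from debugfs_create_dir() and
	 * debugfs_create_file() in the patch above. */
	FILE *f = fopen("/sys/kernel/debug/pstate_snb/stats", "r");

	if (!f) {
		perror("open stats");
		return 1;
	}
	while ((n = fread(buf, 1, sizeof(buf), f)) > 0)
		fwrite(buf, 1, n, stdout);
	fclose(f);
	return 0;
}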