diff mbox series

[RESEND,2/2] /proc/stat: Simplify iowait and idle calculations when cpu is offline

Message ID 20200909144122.77210-3-tom.hromatka@oracle.com (mailing list archive)
State New, archived
Headers show
Series iowait and idle fixes in /proc/stat | expand

Commit Message

Tom Hromatka Sept. 9, 2020, 2:41 p.m. UTC
A customer reported that when a cpu goes offline, the iowait and idle
times reported in /proc/stat will sometimes spike.  This is being
caused by a different data source being used for these values when a
cpu is offline.

Prior to this patch:

put the system under heavy load so that there is little idle time

	       user nice system    idle iowait
	cpu  109515   17  32111  220686    607

take cpu1 offline

	       user nice system    idle iowait
	cpu  113742   17  32721  220724    612

bring cpu1 back online

	       user nice system    idle iowait
	cpu  118332   17  33430  220687    607

To prevent this, let's use the same data source whether a cpu is
online or not.

With this patch:

put the system under heavy load so that there is little idle time

	       user nice system    idle iowait
	cpu   14096   16   4646  157687    426

take cpu1 offline

	       user nice system    idle iowait
	cpu   21614   16   7179  157687    426

bring cpu1 back online

	       user nice system    idle iowait
	cpu   27362   16   9555  157688    426

Signed-off-by: Tom Hromatka <tom.hromatka@oracle.com>
---
 fs/proc/stat.c | 24 ++++++------------------
 1 file changed, 6 insertions(+), 18 deletions(-)

Comments

Alexey Dobriyan Sept. 10, 2020, 12:14 p.m. UTC | #1
On Wed, Sep 09, 2020 at 08:41:22AM -0600, Tom Hromatka wrote:
>  static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
>  {
> -	u64 idle, idle_usecs = -1ULL;
> +	u64 idle, idle_usecs;
>  
> -	if (cpu_online(cpu))
> -		idle_usecs = get_cpu_idle_time_us(cpu, NULL);
> -
> -	if (idle_usecs == -1ULL)
> -		/* !NO_HZ or cpu offline so we can rely on cpustat.idle */
> -		idle = kcs->cpustat[CPUTIME_IDLE];
> -	else
> -		idle = idle_usecs * NSEC_PER_USEC;
> +	idle_usecs = get_cpu_idle_time_us(cpu, NULL);
> +	idle = idle_usecs * NSEC_PER_USEC;
>  
>  	return idle;
>  }
>  
>  static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
>  {
> -	u64 iowait, iowait_usecs = -1ULL;
> -
> -	if (cpu_online(cpu))
> -		iowait_usecs = get_cpu_iowait_time_us(cpu, NULL);
> +	u64 iowait, iowait_usecs;
>  
> -	if (iowait_usecs == -1ULL)
> -		/* !NO_HZ or cpu offline so we can rely on cpustat.iowait */
> -		iowait = kcs->cpustat[CPUTIME_IOWAIT];
> -	else
> -		iowait = iowait_usecs * NSEC_PER_USEC;
> +	iowait_usecs = get_cpu_iowait_time_us(cpu, NULL);
> +	iowait = iowait_usecs * NSEC_PER_USEC;

You can gc variables in both cases:

	return get_cpu_iowait_time_us() * NSEC_PER_USEC;
Thomas Gleixner Sept. 13, 2020, 9:35 p.m. UTC | #2
On Wed, Sep 09 2020 at 08:41, Tom Hromatka wrote:

> A customer reported that when a cpu goes offline, the iowait and idle
> times reported in /proc/stat will sometimes spike.  This is being
> caused by a different data source being used for these values when a
> cpu is offline.
>
> Prior to this patch:
>
> put the system under heavy load so that there is little idle time
>
> 	       user nice system    idle iowait
> 	cpu  109515   17  32111  220686    607
>
> take cpu1 offline
>
> 	       user nice system    idle iowait
> 	cpu  113742   17  32721  220724    612
>
> bring cpu1 back online
>
> 	       user nice system    idle iowait
> 	cpu  118332   17  33430  220687    607
>
> To prevent this, let's use the same data source whether a cpu is
> online or not.

Let's use? Your patch makes it use the same data source.

And again, neither the customer story nor the numbers are helpful to
understand the underlying problem. Also this lacks a reference to the
previous change which preserves the times across a CPU offline/online
sequence.

> diff --git a/fs/proc/stat.c b/fs/proc/stat.c
> index 46b3293015fe..35b92539e711 100644
> --- a/fs/proc/stat.c
> +++ b/fs/proc/stat.c
> @@ -47,32 +47,20 @@ static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
>  
>  static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
>  {
> -	u64 idle, idle_usecs = -1ULL;
> +	u64 idle, idle_usecs;
>  
> -	if (cpu_online(cpu))
> -		idle_usecs = get_cpu_idle_time_us(cpu, NULL);
> -
> -	if (idle_usecs == -1ULL)
> -		/* !NO_HZ or cpu offline so we can rely on cpustat.idle */
> -		idle = kcs->cpustat[CPUTIME_IDLE];
> -	else
> -		idle = idle_usecs * NSEC_PER_USEC;
> +	idle_usecs = get_cpu_idle_time_us(cpu, NULL);
> +	idle = idle_usecs * NSEC_PER_USEC;
>  
>  	return idle;

        return get_cpu_idle_time_us(cpu, NULL) * NSEC_PER_USEC;

perhaps?

Thanks,

        tglx
diff mbox series

Patch

diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 46b3293015fe..35b92539e711 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -47,32 +47,20 @@  static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
 
 static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
 {
-	u64 idle, idle_usecs = -1ULL;
+	u64 idle, idle_usecs;
 
-	if (cpu_online(cpu))
-		idle_usecs = get_cpu_idle_time_us(cpu, NULL);
-
-	if (idle_usecs == -1ULL)
-		/* !NO_HZ or cpu offline so we can rely on cpustat.idle */
-		idle = kcs->cpustat[CPUTIME_IDLE];
-	else
-		idle = idle_usecs * NSEC_PER_USEC;
+	idle_usecs = get_cpu_idle_time_us(cpu, NULL);
+	idle = idle_usecs * NSEC_PER_USEC;
 
 	return idle;
 }
 
 static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
 {
-	u64 iowait, iowait_usecs = -1ULL;
-
-	if (cpu_online(cpu))
-		iowait_usecs = get_cpu_iowait_time_us(cpu, NULL);
+	u64 iowait, iowait_usecs;
 
-	if (iowait_usecs == -1ULL)
-		/* !NO_HZ or cpu offline so we can rely on cpustat.iowait */
-		iowait = kcs->cpustat[CPUTIME_IOWAIT];
-	else
-		iowait = iowait_usecs * NSEC_PER_USEC;
+	iowait_usecs = get_cpu_iowait_time_us(cpu, NULL);
+	iowait = iowait_usecs * NSEC_PER_USEC;
 
 	return iowait;
 }