[RFC,1/9] schedule: Introduce per-pcpu time accounting
diff mbox series

Message ID 1568197942-15374-2-git-send-email-andrii.anisov@gmail.com
State New
Headers show
Series
  • Changes to time accounting
Related show

Commit Message

Andrii Anisov Sept. 11, 2019, 10:32 a.m. UTC
From: Andrii Anisov <andrii_anisov@epam.com>

Introduce per-pcpu time accounting what includes the following states:

TACC_HYP - the pcpu executes hypervisor code like softirq processing
           (including scheduling), tasklets and context switches
TACC_GUEST - the pcpu executes guests code
TACC_IDLE - the low-power state of the pcpu
TACC_IRQ - the pcpu performs interrupts processing, without separation to
           guest or hypervisor interrupts
TACC_GSYNC - the pcpu executes hypervisor code to process synchronous trap
             from the guest. E.g. hypercall processing or io emulation.

Currently, the only reenterant state is TACC_IRQ. It is assumed, no changes
to state other than TACC_IRQ could happen until we return from nested
interrupts. IRQ time is accounted in a distinct way comparing to other states.
It is acumulated between other states transition moments, and is substracted
from the old state on states transion calculation.

Signed-off-by: Andrii Anisov <andrii_anisov@epam.com>
---
 xen/common/schedule.c   | 81 +++++++++++++++++++++++++++++++++++++++++++++++++
 xen/include/xen/sched.h | 27 +++++++++++++++++
 2 files changed, 108 insertions(+)

Comments

Volodymyr Babchuk Sept. 11, 2019, 6:01 p.m. UTC | #1
Andrii,

Andrii Anisov writes:

> From: Andrii Anisov <andrii_anisov@epam.com>
>
> Introduce per-pcpu time accounting what includes the following states:
>
> TACC_HYP - the pcpu executes hypervisor code like softirq processing
>            (including scheduling), tasklets and context switches
> TACC_GUEST - the pcpu executes guests code
> TACC_IDLE - the low-power state of the pcpu
Is it really low-power?

> TACC_IRQ - the pcpu performs interrupts processing, without separation to
>            guest or hypervisor interrupts
I think, word "distinguishing" would be better than "separation"

> TACC_GSYNC - the pcpu executes hypervisor code to process synchronous trap
>              from the guest. E.g. hypercall processing or io emulation.
>
> Currently, the only reenterant state is TACC_IRQ. It is assumed, no changes
> to state other than TACC_IRQ could happen until we return from nested
> interrupts. IRQ time is accounted in a distinct way comparing to other states.
> It is acumulated between other states transition moments, and is substracted
> from the old state on states transion calculation.
>
> Signed-off-by: Andrii Anisov <andrii_anisov@epam.com>
> ---
>  xen/common/schedule.c   | 81 +++++++++++++++++++++++++++++++++++++++++++++++++
>  xen/include/xen/sched.h | 27 +++++++++++++++++
>  2 files changed, 108 insertions(+)
>
> diff --git a/xen/common/schedule.c b/xen/common/schedule.c
> index 7b71581..6dd6603 100644
> --- a/xen/common/schedule.c
> +++ b/xen/common/schedule.c
> @@ -1539,6 +1539,87 @@ static void schedule(void)
>      context_switch(prev, next);
>  }
>  
> +DEFINE_PER_CPU(struct tacc, tacc);
> +
> +static void tacc_state_change(enum TACC_STATES new_state)
> +{
> +    s_time_t now, delta;
> +    struct tacc* tacc = &this_cpu(tacc);
> +    unsigned long flags;
> +
> +    local_irq_save(flags);
> +
> +    now = NOW();
> +    delta = now - tacc->state_entry_time;
> +
> +    /* We do not expect states reenterability (at least through this function)*/
> +    ASSERT(new_state != tacc->state);
> +
> +    tacc->state_time[tacc->state] += delta - tacc->irq_time;
> +    tacc->state_time[TACC_IRQ] += tacc->irq_time;
> +    tacc->irq_time = 0;
> +    tacc->state = new_state;
> +    tacc->state_entry_time = now;
> +
> +    local_irq_restore(flags);
> +}
> +
> +void tacc_hyp(int place)
I believe, you want some enum for this "place" parameter type
> +{
> +//    printk("\ttacc_hyp %u, place %d\n", smp_processor_id(), place);
Please, don't push commented-out code. BTW, I think, it is possible to
add some TACC_DEBUG facilities to enable/disable this traces during
compile-time.

Also, looks like you don't use "place" parameter at all.

Lastly, I believe that this function (and other similar functions below)
can be defined as "static inline" in a header file.

> +    tacc_state_change(TACC_HYP);
> +}
> +
> +void tacc_guest(int place)
> +{
> +//    printk("\ttacc_guest %u, place %d\n", smp_processor_id(), place);
> +    tacc_state_change(TACC_GUEST);
> +}
> +
> +void tacc_idle(int place)
> +{
> +//    printk("\tidle cpu %u, place %d\n", smp_processor_id(), place);
> +    tacc_state_change(TACC_IDLE);
> +}
> +
> +void tacc_gsync(int place)
> +{
> +//    printk("\ttacc_gsync %u, place %d\n", smp_processor_id(), place);
> +    tacc_state_change(TACC_GSYNC);
> +}
> +
> +void tacc_irq_enter(int place)
> +{
> +    struct tacc* tacc = &this_cpu(tacc);
> +
> +//    printk("\ttacc_irq_enter %u, place %d, cnt %d\n", smp_processor_id(), place, this_cpu(tacc).irq_cnt);
> +    ASSERT(!local_irq_is_enabled());
> +    ASSERT(tacc->irq_cnt >= 0);
You can make irq_cnt unsigned and drop this assert.

> +
> +    if ( tacc->irq_cnt == 0 )
> +    {
> +        tacc->irq_enter_time = NOW();
> +    }
Coding style:

---
Braces should be omitted for blocks with a single statement. e.g.,

if ( condition )
    single_statement();
---

> +
> +    tacc->irq_cnt++;
> +}
> +
> +void tacc_irq_exit(int place)
> +{
> +    struct tacc* tacc = &this_cpu(tacc);
> +
> +//    printk("\ttacc_irq_exit %u, place %d, cnt %d\n", smp_processor_id(), place, tacc->irq_cnt);
> +    ASSERT(!local_irq_is_enabled());
> +    ASSERT(tacc->irq_cnt > 0);
> +    if ( tacc->irq_cnt == 1 )
> +    {
> +        tacc->irq_time = NOW() - tacc->irq_enter_time;
> +        tacc->irq_enter_time = 0;
> +    }
> +
> +    tacc->irq_cnt--;
What if, you IRQ will arrive right after this? I believe, you will lose
some of the accumulated time.

> +}
> +
>  void context_saved(struct vcpu *prev)
>  {
>      /* Clear running flag /after/ writing context to memory. */
> diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
> index e3601c1..04a8724 100644
> --- a/xen/include/xen/sched.h
> +++ b/xen/include/xen/sched.h
> @@ -1002,6 +1002,33 @@ extern void dump_runq(unsigned char key);
>  
>  void arch_do_physinfo(struct xen_sysctl_physinfo *pi);
>  
> +enum TACC_STATES {
If I remember correct, enum names should in lower case

> +    TACC_HYP = 0,
> +    TACC_GUEST = 1,
> +    TACC_IDLE = 2,
> +    TACC_IRQ = 3,
> +    TACC_GSYNC = 4,
> +    TACC_STATES_MAX
> +};
> +
> +struct tacc
> +{
> +    s_time_t state_time[TACC_STATES_MAX];
> +    s_time_t state_entry_time;
> +    int state;
enum, maybe?

> +
> +    s_time_t guest_time;
> +
> +    s_time_t irq_enter_time;
> +    s_time_t irq_time;
> +    int irq_cnt;
> +};
> +
> +DECLARE_PER_CPU(struct tacc, tacc);
> +
> +void tacc_hyp(int place);
> +void tacc_idle(int place);
What about functions from sched.c? Should they be declared there?

> +
>  #endif /* __SCHED_H__ */
>  
>  /*
Andrii Anisov Sept. 12, 2019, 10:26 a.m. UTC | #2
Hello Volodymyr,

On 11.09.19 21:01, Volodymyr Babchuk wrote:
>> Introduce per-pcpu time accounting what includes the following states:
>>
>> TACC_HYP - the pcpu executes hypervisor code like softirq processing
>>             (including scheduling), tasklets and context switches
>> TACC_GUEST - the pcpu executes guests code
>> TACC_IDLE - the low-power state of the pcpu
> Is it really low-power?

It is rather matter of scheduling design. It differs from OS to OS, even from arch to arch. See [1].
Me personally tend to treat only low-power state as a true idle.
As a bad (IMO) example I can give the current XEN mainline. Pretty heavy tasks could be performed by the idle vcpu, and they are accounted as idle. This may mislead, for example, cpufreq governor.

>> TACC_IRQ - the pcpu performs interrupts processing, without separation to
>>             guest or hypervisor interrupts
> I think, word "distinguishing" would be better than "separation"

Why so?

> 
>> TACC_GSYNC - the pcpu executes hypervisor code to process synchronous trap
>>               from the guest. E.g. hypercall processing or io emulation.
>>
>> Currently, the only reenterant state is TACC_IRQ. It is assumed, no changes
>> to state other than TACC_IRQ could happen until we return from nested
>> interrupts. IRQ time is accounted in a distinct way comparing to other states.
>> It is acumulated between other states transition moments, and is substracted
>> from the old state on states transion calculation.
>>
>> Signed-off-by: Andrii Anisov <andrii_anisov@epam.com>
>> ---
>>   xen/common/schedule.c   | 81 +++++++++++++++++++++++++++++++++++++++++++++++++
>>   xen/include/xen/sched.h | 27 +++++++++++++++++
>>   2 files changed, 108 insertions(+)
>>
>> diff --git a/xen/common/schedule.c b/xen/common/schedule.c
>> index 7b71581..6dd6603 100644
>> --- a/xen/common/schedule.c
>> +++ b/xen/common/schedule.c
>> @@ -1539,6 +1539,87 @@ static void schedule(void)
>>       context_switch(prev, next);
>>   }
>>   
>> +DEFINE_PER_CPU(struct tacc, tacc);
>> +
>> +static void tacc_state_change(enum TACC_STATES new_state)
>> +{
>> +    s_time_t now, delta;
>> +    struct tacc* tacc = &this_cpu(tacc);
>> +    unsigned long flags;
>> +
>> +    local_irq_save(flags);
>> +
>> +    now = NOW();
>> +    delta = now - tacc->state_entry_time;
>> +
>> +    /* We do not expect states reenterability (at least through this function)*/
>> +    ASSERT(new_state != tacc->state);
>> +
>> +    tacc->state_time[tacc->state] += delta - tacc->irq_time;
>> +    tacc->state_time[TACC_IRQ] += tacc->irq_time;
>> +    tacc->irq_time = 0;
>> +    tacc->state = new_state;
>> +    tacc->state_entry_time = now;
>> +
>> +    local_irq_restore(flags);
>> +}
>> +
>> +void tacc_hyp(int place)
> I believe, you want some enum for this "place" parameter type
>> +{
>> +//    printk("\ttacc_hyp %u, place %d\n", smp_processor_id(), place);
> Please, don't push commented-out code. BTW, I think, it is possible to
> add some TACC_DEBUG facilities to enable/disable this traces during
> compile-time.
> 
> Also, looks like you don't use "place" parameter at all.

Since that is the RFC, I've comforted myself with leaving my debug code in place. I hope it should not be confusing.

> 
> Lastly, I believe that this function (and other similar functions below)
> can be defined as "static inline" in a header file

Not this time. They are mostly called from asm (at least now).

> 
>> +    tacc_state_change(TACC_HYP);
>> +}
>> +
>> +void tacc_guest(int place)
>> +{
>> +//    printk("\ttacc_guest %u, place %d\n", smp_processor_id(), place);
>> +    tacc_state_change(TACC_GUEST);
>> +}
>> +
>> +void tacc_idle(int place)
>> +{
>> +//    printk("\tidle cpu %u, place %d\n", smp_processor_id(), place);
>> +    tacc_state_change(TACC_IDLE);
>> +}
>> +
>> +void tacc_gsync(int place)
>> +{
>> +//    printk("\ttacc_gsync %u, place %d\n", smp_processor_id(), place);
>> +    tacc_state_change(TACC_GSYNC);
>> +}
>> +
>> +void tacc_irq_enter(int place)
>> +{
>> +    struct tacc* tacc = &this_cpu(tacc);
>> +
>> +//    printk("\ttacc_irq_enter %u, place %d, cnt %d\n", smp_processor_id(), place, this_cpu(tacc).irq_cnt);
>> +    ASSERT(!local_irq_is_enabled());
>> +    ASSERT(tacc->irq_cnt >= 0);
> You can make irq_cnt unsigned and drop this assert.

No. Otherwise one might miss proper call sequence when utilize this for the different arch, and have no notice from the debug assertion.

> 
>> +
>> +    if ( tacc->irq_cnt == 0 )
>> +    {
>> +        tacc->irq_enter_time = NOW();
>> +    }
> Coding style:
> 
> ---
> Braces should be omitted for blocks with a single statement. e.g.,
> 
> if ( condition )
>      single_statement();
> ---
> 

OK.

>> +
>> +    tacc->irq_cnt++;
>> +}
>> +
>> +void tacc_irq_exit(int place)
>> +{
>> +    struct tacc* tacc = &this_cpu(tacc);
>> +
>> +//    printk("\ttacc_irq_exit %u, place %d, cnt %d\n", smp_processor_id(), place, tacc->irq_cnt);
>> +    ASSERT(!local_irq_is_enabled());
>> +    ASSERT(tacc->irq_cnt > 0);
>> +    if ( tacc->irq_cnt == 1 )
>> +    {
>> +        tacc->irq_time = NOW() - tacc->irq_enter_time;
>> +        tacc->irq_enter_time = 0;
>> +    }
>> +
>> +    tacc->irq_cnt--;
> What if, you IRQ will arrive right after this? I believe, you will lose
> some of the accumulated time.

See ASSERT(!local_irq_is_enabled()) above.

> 
>> +}
>> +
>>   void context_saved(struct vcpu *prev)
>>   {
>>       /* Clear running flag /after/ writing context to memory. */
>> diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
>> index e3601c1..04a8724 100644
>> --- a/xen/include/xen/sched.h
>> +++ b/xen/include/xen/sched.h
>> @@ -1002,6 +1002,33 @@ extern void dump_runq(unsigned char key);
>>   
>>   void arch_do_physinfo(struct xen_sysctl_physinfo *pi);
>>   
>> +enum TACC_STATES {
> If I remember correct, enum names should in lower case

Ugh...

> 
>> +    TACC_HYP = 0,
>> +    TACC_GUEST = 1,
>> +    TACC_IDLE = 2,
>> +    TACC_IRQ = 3,
>> +    TACC_GSYNC = 4,
>> +    TACC_STATES_MAX
>> +};
>> +
>> +struct tacc
>> +{
>> +    s_time_t state_time[TACC_STATES_MAX];
>> +    s_time_t state_entry_time;
>> +    int state;
> enum, maybe?

Maybe.

> 
>> +
>> +    s_time_t guest_time;
>> +
>> +    s_time_t irq_enter_time;
>> +    s_time_t irq_time;
>> +    int irq_cnt;
>> +};
>> +
>> +DECLARE_PER_CPU(struct tacc, tacc);
>> +
>> +void tacc_hyp(int place);
>> +void tacc_idle(int place);
> What about functions from sched.c? Should they be declared there?

Maybe.

> 
>> +
>>   #endif /* __SCHED_H__ */
>>   
>>   /*
> 
> 

[1] https://elixir.bootlin.com/linux/latest/source/kernel/sched/cputime.c#L429

Patch
diff mbox series

diff --git a/xen/common/schedule.c b/xen/common/schedule.c
index 7b71581..6dd6603 100644
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -1539,6 +1539,87 @@  static void schedule(void)
     context_switch(prev, next);
 }
 
+DEFINE_PER_CPU(struct tacc, tacc);
+
+static void tacc_state_change(enum TACC_STATES new_state)
+{
+    s_time_t now, delta;
+    struct tacc* tacc = &this_cpu(tacc);
+    unsigned long flags;
+
+    local_irq_save(flags);
+
+    now = NOW();
+    delta = now - tacc->state_entry_time;
+
+    /* We do not expect states reenterability (at least through this function)*/
+    ASSERT(new_state != tacc->state);
+
+    tacc->state_time[tacc->state] += delta - tacc->irq_time;
+    tacc->state_time[TACC_IRQ] += tacc->irq_time;
+    tacc->irq_time = 0;
+    tacc->state = new_state;
+    tacc->state_entry_time = now;
+
+    local_irq_restore(flags);
+}
+
+void tacc_hyp(int place)
+{
+//    printk("\ttacc_hyp %u, place %d\n", smp_processor_id(), place);
+    tacc_state_change(TACC_HYP);
+}
+
+void tacc_guest(int place)
+{
+//    printk("\ttacc_guest %u, place %d\n", smp_processor_id(), place);
+    tacc_state_change(TACC_GUEST);
+}
+
+void tacc_idle(int place)
+{
+//    printk("\tidle cpu %u, place %d\n", smp_processor_id(), place);
+    tacc_state_change(TACC_IDLE);
+}
+
+void tacc_gsync(int place)
+{
+//    printk("\ttacc_gsync %u, place %d\n", smp_processor_id(), place);
+    tacc_state_change(TACC_GSYNC);
+}
+
+void tacc_irq_enter(int place)
+{
+    struct tacc* tacc = &this_cpu(tacc);
+
+//    printk("\ttacc_irq_enter %u, place %d, cnt %d\n", smp_processor_id(), place, this_cpu(tacc).irq_cnt);
+    ASSERT(!local_irq_is_enabled());
+    ASSERT(tacc->irq_cnt >= 0);
+
+    if ( tacc->irq_cnt == 0 )
+    {
+        tacc->irq_enter_time = NOW();
+    }
+
+    tacc->irq_cnt++;
+}
+
+void tacc_irq_exit(int place)
+{
+    struct tacc* tacc = &this_cpu(tacc);
+
+//    printk("\ttacc_irq_exit %u, place %d, cnt %d\n", smp_processor_id(), place, tacc->irq_cnt);
+    ASSERT(!local_irq_is_enabled());
+    ASSERT(tacc->irq_cnt > 0);
+    if ( tacc->irq_cnt == 1 )
+    {
+        tacc->irq_time = NOW() - tacc->irq_enter_time;
+        tacc->irq_enter_time = 0;
+    }
+
+    tacc->irq_cnt--;
+}
+
 void context_saved(struct vcpu *prev)
 {
     /* Clear running flag /after/ writing context to memory. */
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index e3601c1..04a8724 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -1002,6 +1002,33 @@  extern void dump_runq(unsigned char key);
 
 void arch_do_physinfo(struct xen_sysctl_physinfo *pi);
 
+enum TACC_STATES {
+    TACC_HYP = 0,
+    TACC_GUEST = 1,
+    TACC_IDLE = 2,
+    TACC_IRQ = 3,
+    TACC_GSYNC = 4,
+    TACC_STATES_MAX
+};
+
+struct tacc
+{
+    s_time_t state_time[TACC_STATES_MAX];
+    s_time_t state_entry_time;
+    int state;
+
+    s_time_t guest_time;
+
+    s_time_t irq_enter_time;
+    s_time_t irq_time;
+    int irq_cnt;
+};
+
+DECLARE_PER_CPU(struct tacc, tacc);
+
+void tacc_hyp(int place);
+void tacc_idle(int place);
+
 #endif /* __SCHED_H__ */
 
 /*