diff mbox series

[RFC,1/9] schedule: Introduce per-pcpu time accounting

Message ID 1568197942-15374-2-git-send-email-andrii.anisov@gmail.com (mailing list archive)
State New, archived
Headers show
Series Changes to time accounting | expand

Commit Message

Andrii Anisov Sept. 11, 2019, 10:32 a.m. UTC
From: Andrii Anisov <andrii_anisov@epam.com>

Introduce per-pcpu time accounting which includes the following states:

TACC_HYP - the pcpu executes hypervisor code like softirq processing
           (including scheduling), tasklets and context switches
TACC_GUEST - the pcpu executes guests code
TACC_IDLE - the low-power state of the pcpu
TACC_IRQ - the pcpu performs interrupts processing, without separation to
           guest or hypervisor interrupts
TACC_GSYNC - the pcpu executes hypervisor code to process synchronous trap
             from the guest. E.g. hypercall processing or io emulation.

Currently, the only reentrant state is TACC_IRQ. It is assumed that no changes
to a state other than TACC_IRQ can happen until we return from nested
interrupts. IRQ time is accounted differently from the other states:
it is accumulated between the transition moments of the other states, and is
subtracted from the old state's time when a state transition is calculated.

Signed-off-by: Andrii Anisov <andrii_anisov@epam.com>
---
 xen/common/schedule.c   | 81 +++++++++++++++++++++++++++++++++++++++++++++++++
 xen/include/xen/sched.h | 27 +++++++++++++++++
 2 files changed, 108 insertions(+)

Comments

Volodymyr Babchuk Sept. 11, 2019, 6:01 p.m. UTC | #1
Andrii,

Andrii Anisov writes:

> From: Andrii Anisov <andrii_anisov@epam.com>
>
> Introduce per-pcpu time accounting what includes the following states:
>
> TACC_HYP - the pcpu executes hypervisor code like softirq processing
>            (including scheduling), tasklets and context switches
> TACC_GUEST - the pcpu executes guests code
> TACC_IDLE - the low-power state of the pcpu
Is it really low-power?

> TACC_IRQ - the pcpu performs interrupts processing, without separation to
>            guest or hypervisor interrupts
I think, word "distinguishing" would be better than "separation"

> TACC_GSYNC - the pcpu executes hypervisor code to process synchronous trap
>              from the guest. E.g. hypercall processing or io emulation.
>
> Currently, the only reenterant state is TACC_IRQ. It is assumed, no changes
> to state other than TACC_IRQ could happen until we return from nested
> interrupts. IRQ time is accounted in a distinct way comparing to other states.
> It is acumulated between other states transition moments, and is substracted
> from the old state on states transion calculation.
>
> Signed-off-by: Andrii Anisov <andrii_anisov@epam.com>
> ---
>  xen/common/schedule.c   | 81 +++++++++++++++++++++++++++++++++++++++++++++++++
>  xen/include/xen/sched.h | 27 +++++++++++++++++
>  2 files changed, 108 insertions(+)
>
> diff --git a/xen/common/schedule.c b/xen/common/schedule.c
> index 7b71581..6dd6603 100644
> --- a/xen/common/schedule.c
> +++ b/xen/common/schedule.c
> @@ -1539,6 +1539,87 @@ static void schedule(void)
>      context_switch(prev, next);
>  }
>  
> +DEFINE_PER_CPU(struct tacc, tacc);
> +
> +static void tacc_state_change(enum TACC_STATES new_state)
> +{
> +    s_time_t now, delta;
> +    struct tacc* tacc = &this_cpu(tacc);
> +    unsigned long flags;
> +
> +    local_irq_save(flags);
> +
> +    now = NOW();
> +    delta = now - tacc->state_entry_time;
> +
> +    /* We do not expect states reenterability (at least through this function)*/
> +    ASSERT(new_state != tacc->state);
> +
> +    tacc->state_time[tacc->state] += delta - tacc->irq_time;
> +    tacc->state_time[TACC_IRQ] += tacc->irq_time;
> +    tacc->irq_time = 0;
> +    tacc->state = new_state;
> +    tacc->state_entry_time = now;
> +
> +    local_irq_restore(flags);
> +}
> +
> +void tacc_hyp(int place)
I believe, you want some enum for this "place" parameter type
> +{
> +//    printk("\ttacc_hyp %u, place %d\n", smp_processor_id(), place);
Please don't push commented-out code. BTW, I think it is possible to
add some TACC_DEBUG facility to enable/disable these traces at
compile time.

Also, looks like you don't use "place" parameter at all.

Lastly, I believe that this function (and other similar functions below)
can be defined as "static inline" in a header file.

> +    tacc_state_change(TACC_HYP);
> +}
> +
> +void tacc_guest(int place)
> +{
> +//    printk("\ttacc_guest %u, place %d\n", smp_processor_id(), place);
> +    tacc_state_change(TACC_GUEST);
> +}
> +
> +void tacc_idle(int place)
> +{
> +//    printk("\tidle cpu %u, place %d\n", smp_processor_id(), place);
> +    tacc_state_change(TACC_IDLE);
> +}
> +
> +void tacc_gsync(int place)
> +{
> +//    printk("\ttacc_gsync %u, place %d\n", smp_processor_id(), place);
> +    tacc_state_change(TACC_GSYNC);
> +}
> +
> +void tacc_irq_enter(int place)
> +{
> +    struct tacc* tacc = &this_cpu(tacc);
> +
> +//    printk("\ttacc_irq_enter %u, place %d, cnt %d\n", smp_processor_id(), place, this_cpu(tacc).irq_cnt);
> +    ASSERT(!local_irq_is_enabled());
> +    ASSERT(tacc->irq_cnt >= 0);
You can make irq_cnt unsigned and drop this assert.

> +
> +    if ( tacc->irq_cnt == 0 )
> +    {
> +        tacc->irq_enter_time = NOW();
> +    }
Coding style:

---
Braces should be omitted for blocks with a single statement. e.g.,

if ( condition )
    single_statement();
---

> +
> +    tacc->irq_cnt++;
> +}
> +
> +void tacc_irq_exit(int place)
> +{
> +    struct tacc* tacc = &this_cpu(tacc);
> +
> +//    printk("\ttacc_irq_exit %u, place %d, cnt %d\n", smp_processor_id(), place, tacc->irq_cnt);
> +    ASSERT(!local_irq_is_enabled());
> +    ASSERT(tacc->irq_cnt > 0);
> +    if ( tacc->irq_cnt == 1 )
> +    {
> +        tacc->irq_time = NOW() - tacc->irq_enter_time;
> +        tacc->irq_enter_time = 0;
> +    }
> +
> +    tacc->irq_cnt--;
What if an IRQ arrives right after this? I believe you will lose
some of the accumulated time.

> +}
> +
>  void context_saved(struct vcpu *prev)
>  {
>      /* Clear running flag /after/ writing context to memory. */
> diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
> index e3601c1..04a8724 100644
> --- a/xen/include/xen/sched.h
> +++ b/xen/include/xen/sched.h
> @@ -1002,6 +1002,33 @@ extern void dump_runq(unsigned char key);
>  
>  void arch_do_physinfo(struct xen_sysctl_physinfo *pi);
>  
> +enum TACC_STATES {
If I remember correctly, enum names should be in lower case

> +    TACC_HYP = 0,
> +    TACC_GUEST = 1,
> +    TACC_IDLE = 2,
> +    TACC_IRQ = 3,
> +    TACC_GSYNC = 4,
> +    TACC_STATES_MAX
> +};
> +
> +struct tacc
> +{
> +    s_time_t state_time[TACC_STATES_MAX];
> +    s_time_t state_entry_time;
> +    int state;
enum, maybe?

> +
> +    s_time_t guest_time;
> +
> +    s_time_t irq_enter_time;
> +    s_time_t irq_time;
> +    int irq_cnt;
> +};
> +
> +DECLARE_PER_CPU(struct tacc, tacc);
> +
> +void tacc_hyp(int place);
> +void tacc_idle(int place);
What about functions from sched.c? Should they be declared there?

> +
>  #endif /* __SCHED_H__ */
>  
>  /*
Andrii Anisov Sept. 12, 2019, 10:26 a.m. UTC | #2
Hello Volodymyr,

On 11.09.19 21:01, Volodymyr Babchuk wrote:
>> Introduce per-pcpu time accounting what includes the following states:
>>
>> TACC_HYP - the pcpu executes hypervisor code like softirq processing
>>             (including scheduling), tasklets and context switches
>> TACC_GUEST - the pcpu executes guests code
>> TACC_IDLE - the low-power state of the pcpu
> Is it really low-power?

It is rather a matter of scheduling design. It differs from OS to OS, and even from arch to arch. See [1].
I personally tend to treat only the low-power state as true idle.
As a bad (IMO) example I can give the current XEN mainline. Pretty heavy tasks can be performed by the idle vcpu, and they are accounted as idle. This may mislead, for example, the cpufreq governor.

>> TACC_IRQ - the pcpu performs interrupts processing, without separation to
>>             guest or hypervisor interrupts
> I think, word "distinguishing" would be better than "separation"

Why so?

> 
>> TACC_GSYNC - the pcpu executes hypervisor code to process synchronous trap
>>               from the guest. E.g. hypercall processing or io emulation.
>>
>> Currently, the only reenterant state is TACC_IRQ. It is assumed, no changes
>> to state other than TACC_IRQ could happen until we return from nested
>> interrupts. IRQ time is accounted in a distinct way comparing to other states.
>> It is acumulated between other states transition moments, and is substracted
>> from the old state on states transion calculation.
>>
>> Signed-off-by: Andrii Anisov <andrii_anisov@epam.com>
>> ---
>>   xen/common/schedule.c   | 81 +++++++++++++++++++++++++++++++++++++++++++++++++
>>   xen/include/xen/sched.h | 27 +++++++++++++++++
>>   2 files changed, 108 insertions(+)
>>
>> diff --git a/xen/common/schedule.c b/xen/common/schedule.c
>> index 7b71581..6dd6603 100644
>> --- a/xen/common/schedule.c
>> +++ b/xen/common/schedule.c
>> @@ -1539,6 +1539,87 @@ static void schedule(void)
>>       context_switch(prev, next);
>>   }
>>   
>> +DEFINE_PER_CPU(struct tacc, tacc);
>> +
>> +static void tacc_state_change(enum TACC_STATES new_state)
>> +{
>> +    s_time_t now, delta;
>> +    struct tacc* tacc = &this_cpu(tacc);
>> +    unsigned long flags;
>> +
>> +    local_irq_save(flags);
>> +
>> +    now = NOW();
>> +    delta = now - tacc->state_entry_time;
>> +
>> +    /* We do not expect states reenterability (at least through this function)*/
>> +    ASSERT(new_state != tacc->state);
>> +
>> +    tacc->state_time[tacc->state] += delta - tacc->irq_time;
>> +    tacc->state_time[TACC_IRQ] += tacc->irq_time;
>> +    tacc->irq_time = 0;
>> +    tacc->state = new_state;
>> +    tacc->state_entry_time = now;
>> +
>> +    local_irq_restore(flags);
>> +}
>> +
>> +void tacc_hyp(int place)
> I believe, you want some enum for this "place" parameter type
>> +{
>> +//    printk("\ttacc_hyp %u, place %d\n", smp_processor_id(), place);
> Please, don't push commented-out code. BTW, I think, it is possible to
> add some TACC_DEBUG facilities to enable/disable this traces during
> compile-time.
> 
> Also, looks like you don't use "place" parameter at all.

Since this is an RFC, I allowed myself to leave my debug code in place. I hope it is not confusing.

> 
> Lastly, I believe that this function (and other similar functions below)
> can be defined as "static inline" in a header file

Not this time. They are mostly called from asm (at least now).

> 
>> +    tacc_state_change(TACC_HYP);
>> +}
>> +
>> +void tacc_guest(int place)
>> +{
>> +//    printk("\ttacc_guest %u, place %d\n", smp_processor_id(), place);
>> +    tacc_state_change(TACC_GUEST);
>> +}
>> +
>> +void tacc_idle(int place)
>> +{
>> +//    printk("\tidle cpu %u, place %d\n", smp_processor_id(), place);
>> +    tacc_state_change(TACC_IDLE);
>> +}
>> +
>> +void tacc_gsync(int place)
>> +{
>> +//    printk("\ttacc_gsync %u, place %d\n", smp_processor_id(), place);
>> +    tacc_state_change(TACC_GSYNC);
>> +}
>> +
>> +void tacc_irq_enter(int place)
>> +{
>> +    struct tacc* tacc = &this_cpu(tacc);
>> +
>> +//    printk("\ttacc_irq_enter %u, place %d, cnt %d\n", smp_processor_id(), place, this_cpu(tacc).irq_cnt);
>> +    ASSERT(!local_irq_is_enabled());
>> +    ASSERT(tacc->irq_cnt >= 0);
> You can make irq_cnt unsigned and drop this assert.

No. Otherwise one might get the call sequence wrong when using this on a different arch, and get no notice from the debug assertion.

> 
>> +
>> +    if ( tacc->irq_cnt == 0 )
>> +    {
>> +        tacc->irq_enter_time = NOW();
>> +    }
> Coding style:
> 
> ---
> Braces should be omitted for blocks with a single statement. e.g.,
> 
> if ( condition )
>      single_statement();
> ---
> 

OK.

>> +
>> +    tacc->irq_cnt++;
>> +}
>> +
>> +void tacc_irq_exit(int place)
>> +{
>> +    struct tacc* tacc = &this_cpu(tacc);
>> +
>> +//    printk("\ttacc_irq_exit %u, place %d, cnt %d\n", smp_processor_id(), place, tacc->irq_cnt);
>> +    ASSERT(!local_irq_is_enabled());
>> +    ASSERT(tacc->irq_cnt > 0);
>> +    if ( tacc->irq_cnt == 1 )
>> +    {
>> +        tacc->irq_time = NOW() - tacc->irq_enter_time;
>> +        tacc->irq_enter_time = 0;
>> +    }
>> +
>> +    tacc->irq_cnt--;
> What if, you IRQ will arrive right after this? I believe, you will lose
> some of the accumulated time.

See ASSERT(!local_irq_is_enabled()) above.

> 
>> +}
>> +
>>   void context_saved(struct vcpu *prev)
>>   {
>>       /* Clear running flag /after/ writing context to memory. */
>> diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
>> index e3601c1..04a8724 100644
>> --- a/xen/include/xen/sched.h
>> +++ b/xen/include/xen/sched.h
>> @@ -1002,6 +1002,33 @@ extern void dump_runq(unsigned char key);
>>   
>>   void arch_do_physinfo(struct xen_sysctl_physinfo *pi);
>>   
>> +enum TACC_STATES {
> If I remember correct, enum names should in lower case

Ugh...

> 
>> +    TACC_HYP = 0,
>> +    TACC_GUEST = 1,
>> +    TACC_IDLE = 2,
>> +    TACC_IRQ = 3,
>> +    TACC_GSYNC = 4,
>> +    TACC_STATES_MAX
>> +};
>> +
>> +struct tacc
>> +{
>> +    s_time_t state_time[TACC_STATES_MAX];
>> +    s_time_t state_entry_time;
>> +    int state;
> enum, maybe?

Maybe.

> 
>> +
>> +    s_time_t guest_time;
>> +
>> +    s_time_t irq_enter_time;
>> +    s_time_t irq_time;
>> +    int irq_cnt;
>> +};
>> +
>> +DECLARE_PER_CPU(struct tacc, tacc);
>> +
>> +void tacc_hyp(int place);
>> +void tacc_idle(int place);
> What about functions from sched.c? Should they be declared there?

Maybe.

> 
>> +
>>   #endif /* __SCHED_H__ */
>>   
>>   /*
> 
> 

[1] https://elixir.bootlin.com/linux/latest/source/kernel/sched/cputime.c#L429
Julien Grall Oct. 28, 2019, 2:28 p.m. UTC | #3
Hi Andrii,

Sorry for the late answer. It would be good to get a review from the scheduler 
maintainers (Dario, George) to make sure they are happy with the suggested 
states here.

Please see my comments below.

On 11/09/2019 11:32, Andrii Anisov wrote:
> From: Andrii Anisov <andrii_anisov@epam.com>
> 
> Introduce per-pcpu time accounting what includes the following states:

I think we need a very detailed description of each state. Otherwise it will be
hard to know how to categorize things.

> 
> TACC_HYP - the pcpu executes hypervisor code like softirq processing
>             (including scheduling), tasklets and context switches

IMHO, "like" is too weak here. What exactly do you plan to introduce?

For instance, on Arm, you consider that leave_hypervisor_tail() is part of 
TACC_HYP. This function will include some handling for synchronous trap.

> TACC_GUEST - the pcpu executes guests code

Looking at the arm64 code, you are executing some hypervisor code here. I agree
it is impossible not to run any hypervisor code with TACC_GUEST, but I think
this should be clarified in the documentation.

> TACC_IDLE - the low-power state of the pcpu

Did you intend to mean "idle vCPU" is in use?

> TACC_IRQ - the pcpu performs interrupts processing, without separation to
>             guest or hypervisor interrupts
> TACC_GSYNC - the pcpu executes hypervisor code to process synchronous trap
>               from the guest. E.g. hypercall processing or io emulation.
> 
> Currently, the only reenterant state is TACC_IRQ. It is assumed, no changes
> to state other than TACC_IRQ could happen until we return from nested
> interrupts. IRQ time is accounted in a distinct way comparing to other states.

s/comparing/compare/

> It is acumulated between other states transition moments, and is substracted

s/acumulated/accumulated/ s/substracted/subtracted/

> from the old state on states transion calculation.

s/transion/transition/

> 
> Signed-off-by: Andrii Anisov <andrii_anisov@epam.com>
> ---
>   xen/common/schedule.c   | 81 +++++++++++++++++++++++++++++++++++++++++++++++++
>   xen/include/xen/sched.h | 27 +++++++++++++++++
>   2 files changed, 108 insertions(+)
> 
> diff --git a/xen/common/schedule.c b/xen/common/schedule.c
> index 7b71581..6dd6603 100644
> --- a/xen/common/schedule.c
> +++ b/xen/common/schedule.c
> @@ -1539,6 +1539,87 @@ static void schedule(void)
>       context_switch(prev, next);
>   }
>   
> +DEFINE_PER_CPU(struct tacc, tacc);
> +
> +static void tacc_state_change(enum TACC_STATES new_state)

This should never be called with the TACC_IRQ, right?

> +{
> +    s_time_t now, delta;
> +    struct tacc* tacc = &this_cpu(tacc);
> +    unsigned long flags;
> +
> +    local_irq_save(flags);
> +
> +    now = NOW();
> +    delta = now - tacc->state_entry_time;
> +
> +    /* We do not expect states reenterability (at least through this function)*/
> +    ASSERT(new_state != tacc->state);
> +
> +    tacc->state_time[tacc->state] += delta - tacc->irq_time;
> +    tacc->state_time[TACC_IRQ] += tacc->irq_time;
> +    tacc->irq_time = 0;
> +    tacc->state = new_state;
> +    tacc->state_entry_time = now;
> +
> +    local_irq_restore(flags);
> +}
> +
> +void tacc_hyp(int place)

Place is never used except for your commented printk. So what's the goal for it?

Also, is it really necessary to provide helper for each state? Couldn't we just 
introduce one functions doing all the state?

> +{
> +//    printk("\ttacc_hyp %u, place %d\n", smp_processor_id(), place);
> +    tacc_state_change(TACC_HYP);
> +}
> +
> +void tacc_guest(int place)
> +{
> +//    printk("\ttacc_guest %u, place %d\n", smp_processor_id(), place);
> +    tacc_state_change(TACC_GUEST);
> +}
> +
> +void tacc_idle(int place)
> +{
> +//    printk("\tidle cpu %u, place %d\n", smp_processor_id(), place);
> +    tacc_state_change(TACC_IDLE);
> +}
> +
> +void tacc_gsync(int place)
> +{
> +//    printk("\ttacc_gsync %u, place %d\n", smp_processor_id(), place);
> +    tacc_state_change(TACC_GSYNC);
> +}
> +
> +void tacc_irq_enter(int place)
> +{
> +    struct tacc* tacc = &this_cpu(tacc);
> +
> +//    printk("\ttacc_irq_enter %u, place %d, cnt %d\n", smp_processor_id(), place, this_cpu(tacc).irq_cnt);
> +    ASSERT(!local_irq_is_enabled());
> +    ASSERT(tacc->irq_cnt >= 0);
> +
> +    if ( tacc->irq_cnt == 0 )
> +    {
> +        tacc->irq_enter_time = NOW();
> +    }
> +
> +    tacc->irq_cnt++;
> +}
> +
> +void tacc_irq_exit(int place)
> +{
> +    struct tacc* tacc = &this_cpu(tacc);
> +
> +//    printk("\ttacc_irq_exit %u, place %d, cnt %d\n", smp_processor_id(), place, tacc->irq_cnt);
> +    ASSERT(!local_irq_is_enabled());
> +    ASSERT(tacc->irq_cnt > 0);
> +    if ( tacc->irq_cnt == 1 )
> +    {
> +        tacc->irq_time = NOW() - tacc->irq_enter_time;

If I understand correctly, you will use irq_time to update TACC_IRQ in 
tacc_state_change(). It may be possible to receive another interrupt before the 
state is changed (e.g. HYP -> GUEST). This means only the time for the last IRQ 
received would be accounted.

> +        tacc->irq_enter_time = 0;
> +    }
> +
> +    tacc->irq_cnt--;
> +}
> +
>   void context_saved(struct vcpu *prev)
>   {
>       /* Clear running flag /after/ writing context to memory. */
> diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
> index e3601c1..04a8724 100644
> --- a/xen/include/xen/sched.h
> +++ b/xen/include/xen/sched.h
> @@ -1002,6 +1002,33 @@ extern void dump_runq(unsigned char key);
>   
>   void arch_do_physinfo(struct xen_sysctl_physinfo *pi);
>   
> +enum TACC_STATES {

We don't tend to use all uppercases for enum name.

> +    TACC_HYP = 0,

enum begins at 0 and increment by one every time. So there is no need to 
hardcode a number.

Also, looking at the code, I think you rely on the first state to be TACC_HYP. 
Am I correct?

> +    TACC_GUEST = 1,
> +    TACC_IDLE = 2,
> +    TACC_IRQ = 3,
> +    TACC_GSYNC = 4,
> +    TACC_STATES_MAX
> +};

It would be good to document all the states in the header as well.

> +
> +struct tacc

Please document the structure.

> +{
> +    s_time_t state_time[TACC_STATES_MAX];
> +    s_time_t state_entry_time;
> +    int state;

This should be the enum you used above here.

> +
> +    s_time_t guest_time;

This is not used.

> +
> +    s_time_t irq_enter_time;
> +    s_time_t irq_time;
> +    int irq_cnt;
Why do you need this to be signed?

> +};
> +
> +DECLARE_PER_CPU(struct tacc, tacc);
> +
> +void tacc_hyp(int place);
> +void tacc_idle(int place);
> +
>   #endif /* __SCHED_H__ */
>   
>   /*
> 

Cheers,
Andrii Anisov Nov. 6, 2019, 11:24 a.m. UTC | #4
Hello Julien,

On 28.10.19 16:28, Julien Grall wrote:
> It would be good to get a review from the scheduler maintainers (Dario, George) to make sure they are happy with the suggested states here.

I would not say I'm completely happy with this set of states. I'd like to have a discussion on this topic with the scheduler maintainers, also because they could have a different view from the x86 world.

>> Introduce per-pcpu time accounting what includes the following states:
> 
> I think we need a very detailed description of each states. Otherwise it will be hard to know how to categorize it.

I agree that we need a very detailed description of each state. Ask questions if something is unclear or doubtful. I guess we could have something better after a Q&A process.

> 
>>
>> TACC_HYP - the pcpu executes hypervisor code like softirq processing
>>             (including scheduling), tasklets and context switches
> 
> IHMO, "like" is too weak here. What do you exactly plan to introduce?

I think this should be everything the hypervisor does except hypercall and IO emulation (which is TACC_GSYNC).

> 
> For instance, on Arm, you consider that leave_hypervisor_tail() is part of TACC_HYP. This function will include some handling for synchronous trap.

I guess you are talking about `p2m_flush_vm`. I have doubts here, and am open to suggestions.


>> TACC_GUEST - the pcpu executes guests code
> 
> Looking at the arm64 code, you are executing some hypervisor code here. I agree this is impossible to not run any hypervisor code with TACC_GUEST, but I think this should be clarified in the documentation.

Do you mean adding few words about still having some hypervisor code near the actual context switch from/to guest (entry/return_from_trap)?

> 
>> TACC_IDLE - the low-power state of the pcpu
> 
> Did you intend to mean "idle vCPU" is in use?

No. I did mean what is written.
Currently, the idle vcpu does hypervisor work (e.g. tasklets) along with the low-power mode. IMO we have to separate them.

> 
>> TACC_IRQ - the pcpu performs interrupts processing, without separation to
>>             guest or hypervisor interrupts
>> TACC_GSYNC - the pcpu executes hypervisor code to process synchronous trap
>>               from the guest. E.g. hypercall processing or io emulation.
>>
>> Currently, the only reenterant state is TACC_IRQ. It is assumed, no changes
>> to state other than TACC_IRQ could happen until we return from nested
>> interrupts. IRQ time is accounted in a distinct way comparing to other states.
> 
> s/comparing/compare/

OK.

> 
>> It is acumulated between other states transition moments, and is substracted
> 
> s/acumulated/accumulated/ s/substracted/subtracted/

OK.

> 
>> from the old state on states transion calculation.
[1]
> 
> s/transion/transition/

OK.

> 
>>
>> Signed-off-by: Andrii Anisov <andrii_anisov@epam.com>
>> ---
>>   xen/common/schedule.c   | 81 +++++++++++++++++++++++++++++++++++++++++++++++++
>>   xen/include/xen/sched.h | 27 +++++++++++++++++
>>   2 files changed, 108 insertions(+)
>>
>> diff --git a/xen/common/schedule.c b/xen/common/schedule.c
>> index 7b71581..6dd6603 100644
>> --- a/xen/common/schedule.c
>> +++ b/xen/common/schedule.c
>> @@ -1539,6 +1539,87 @@ static void schedule(void)
>>       context_switch(prev, next);
>>   }
>> +DEFINE_PER_CPU(struct tacc, tacc);
>> +
>> +static void tacc_state_change(enum TACC_STATES new_state)
> 
> This should never be called with the TACC_IRQ, right?

Yes. Actually, tacc->state should never be TACC_IRQ.
Because of TACC_IRQ reenterability it is handled through the tacc->irq_cnt and tacc->irq_enter_time.

> 
>> +{
>> +    s_time_t now, delta;
>> +    struct tacc* tacc = &this_cpu(tacc);
>> +    unsigned long flags;
>> +
>> +    local_irq_save(flags);
>> +
>> +    now = NOW();
>> +    delta = now - tacc->state_entry_time;
>> +
>> +    /* We do not expect states reenterability (at least through this function)*/
>> +    ASSERT(new_state != tacc->state);
>> +
>> +    tacc->state_time[tacc->state] += delta - tacc->irq_time;
>> +    tacc->state_time[TACC_IRQ] += tacc->irq_time;
>> +    tacc->irq_time = 0;
>> +    tacc->state = new_state;
>> +    tacc->state_entry_time = now;
>> +
>> +    local_irq_restore(flags);
>> +}
>> +
>> +void tacc_hyp(int place)
> 
> Place is never used except for your commented printk. So what's the goal for it?

Place is just a piece of code used for debugging, as is the printk. I kept it here because this series is very much an RFC, but it could be removed.

> Also, is it really necessary to provide helper for each state? Couldn't we just introduce one functions doing all the state?

I'd like to call that stuff from assembler without parameters. But I have no strong opinion here.
  
>> +{
>> +//    printk("\ttacc_hyp %u, place %d\n", smp_processor_id(), place);
>> +    tacc_state_change(TACC_HYP);
>> +}
>> +
>> +void tacc_guest(int place)
>> +{
>> +//    printk("\ttacc_guest %u, place %d\n", smp_processor_id(), place);
>> +    tacc_state_change(TACC_GUEST);
>> +}
>> +
>> +void tacc_idle(int place)
>> +{
>> +//    printk("\tidle cpu %u, place %d\n", smp_processor_id(), place);
>> +    tacc_state_change(TACC_IDLE);
>> +}
>> +
>> +void tacc_gsync(int place)
>> +{
>> +//    printk("\ttacc_gsync %u, place %d\n", smp_processor_id(), place);
>> +    tacc_state_change(TACC_GSYNC);
>> +}
>> +
>> +void tacc_irq_enter(int place)
>> +{
>> +    struct tacc* tacc = &this_cpu(tacc);
>> +
>> +//    printk("\ttacc_irq_enter %u, place %d, cnt %d\n", smp_processor_id(), place, this_cpu(tacc).irq_cnt);
>> +    ASSERT(!local_irq_is_enabled());
>> +    ASSERT(tacc->irq_cnt >= 0);
>> +
>> +    if ( tacc->irq_cnt == 0 )
>> +    {
>> +        tacc->irq_enter_time = NOW();
>> +    }
>> +
>> +    tacc->irq_cnt++;
>> +}
>> +
>> +void tacc_irq_exit(int place)
>> +{
>> +    struct tacc* tacc = &this_cpu(tacc);
>> +
>> +//    printk("\ttacc_irq_exit %u, place %d, cnt %d\n", smp_processor_id(), place, tacc->irq_cnt);
>> +    ASSERT(!local_irq_is_enabled());
>> +    ASSERT(tacc->irq_cnt > 0);
>> +    if ( tacc->irq_cnt == 1 )
>> +    {
>> +        tacc->irq_time = NOW() - tacc->irq_enter_time;
> 
> If I understand correctly, you will use irq_time to update TACC_IRQ in tacc_state_change(). It may be possible to receive another interrupt before the state is changed (e.g. HYP -> GUEST). This means only the time for the last IRQ received would be accounted.

I do lock IRQs for state change. Shouldn't that protect it?

> 
>> +        tacc->irq_enter_time = 0;
>> +    }
>> +
>> +    tacc->irq_cnt--;
>> +}
>> +
>>   void context_saved(struct vcpu *prev)
>>   {
>>       /* Clear running flag /after/ writing context to memory. */
>> diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
>> index e3601c1..04a8724 100644
>> --- a/xen/include/xen/sched.h
>> +++ b/xen/include/xen/sched.h
>> @@ -1002,6 +1002,33 @@ extern void dump_runq(unsigned char key);
>>   void arch_do_physinfo(struct xen_sysctl_physinfo *pi);
>> +enum TACC_STATES {
> 
> We don't tend to use all uppercases for enum name.

OK.

> 
>> +    TACC_HYP = 0,
> 
> enum begins at 0 and increment by one every time. So there is no need to hardcode a number.
> 
> Also, looking at the code, I think you rely on the first state to be TACC_HYP. Am I correct?

TACC_HYP is expected to be the initial state of the PCPU.

> 
>> +    TACC_GUEST = 1,
>> +    TACC_IDLE = 2,
>> +    TACC_IRQ = 3,
>> +    TACC_GSYNC = 4,
>> +    TACC_STATES_MAX
>> +};
> > It would be good to document all the states in the header as well.

OK.

> 
>> +
>> +struct tacc
> 
> Please document the structure.

OK.

> 
>> +{
>> +    s_time_t state_time[TACC_STATES_MAX];
>> +    s_time_t state_entry_time;
>> +    int state;
> 
> This should be the enum you used above here.

Yep.

>> +
>> +    s_time_t guest_time;
> 
> This is not used.

Yep, will drop it.

> 
>> +
>> +    s_time_t irq_enter_time;
>> +    s_time_t irq_time;
>> +    int irq_cnt;
> Why do you need this to be signed?

For assertion.
  
>> +};
>> +
>> +DECLARE_PER_CPU(struct tacc, tacc);
>> +
>> +void tacc_hyp(int place);
>> +void tacc_idle(int place);
>> +
>>   #endif /* __SCHED_H__ */
>>   /*
>>
> 
> Cheers,
>
Volodymyr Babchuk May 26, 2020, 2:27 a.m. UTC | #5
Hello All,

This is gentle reminder about this RFC. 

Sadly, Andrii Anisov has left our team. But I'm committed to continuing
his work on time accounting and real time scheduling.

I do realize, that proposed patches have become moldy. I can rebase
them onto current master, if it would help. 

On Wed, 2019-11-06 at 13:24 +0200, Andrii Anisov wrote:
> Hello Julien,
> 
> On 28.10.19 16:28, Julien Grall wrote:
> > It would be good to get a review from the scheduler maintainers (Dario, George) to make sure they are happy with the suggested states here.
> 
> I would not say I'm completely happy with this set of states. I'd like to have a discussion on this topic with scheduler maintainers. Also because they could have a different view from x86 world.

I would love to hear any input on this topic from a general scheduling
approach standpoint and from the x86 point of view.

> > > Introduce per-pcpu time accounting what includes the following states:
> > 
> > I think we need a very detailed description of each states. Otherwise it will be hard to know how to categorize it.
> 
> I agree that we need a very detailed description of each states. Ask questions if something is not clear or doubtful. I guess we could have something better after Q&A process.
> 
> > > TACC_HYP - the pcpu executes hypervisor code like softirq processing
> > >             (including scheduling), tasklets and context switches
> > 
> > IHMO, "like" is too weak here. What do you exactly plan to introduce?
> 
> I think this should be what hypervisor does except hypercall and IO emulation (what is TACC_GSYNC).
> 
> > For instance, on Arm, you consider that leave_hypervisor_tail() is part of TACC_HYP. This function will include some handling for synchronous trap.
> 
> I guess you are saying about `p2m_flush_vm`. I doubt here, and open for suggestions.
> 
> 
> > > TACC_GUEST - the pcpu executes guests code
> > 
> > Looking at the arm64 code, you are executing some hypervisor code here. I agree this is impossible to not run any hypervisor code with TACC_GUEST, but I think this should be clarified in the documentation.
> 
> Do you mean adding few words about still having some hypervisor code near the actual context switch from/to guest (entry/return_from_trap)?
> 
> > > TACC_IDLE - the low-power state of the pcpu
> > 
> > Did you intend to mean "idle vCPU" is in use?
> 
> No. I did mean what is written.
> Currently, the idle vcpu does hypervisor work (e.g. tasklets) along with the low-power mode. IMO we have to separate them.
> 
> > > TACC_IRQ - the pcpu performs interrupts processing, without separation to
> > >             guest or hypervisor interrupts
> > > TACC_GSYNC - the pcpu executes hypervisor code to process synchronous trap
> > >               from the guest. E.g. hypercall processing or io emulation.
> > > 
> > > Currently, the only reenterant state is TACC_IRQ. It is assumed, no changes
> > > to state other than TACC_IRQ could happen until we return from nested
> > > interrupts. IRQ time is accounted in a distinct way comparing to other states.
> > 
> > s/comparing/compare/
> 
> OK.
> 
> > > It is acumulated between other states transition moments, and is substracted
> > 
> > s/acumulated/accumulated/ s/substracted/subtracted/
> 
> OK.
> 
> > > from the old state on states transion calculation.
> [1]
> > s/transion/transition/
> 
> OK.
> 
> > > Signed-off-by: Andrii Anisov <andrii_anisov@epam.com>
> > > ---
> > >   xen/common/schedule.c   | 81 +++++++++++++++++++++++++++++++++++++++++++++++++
> > >   xen/include/xen/sched.h | 27 +++++++++++++++++
> > >   2 files changed, 108 insertions(+)
> > > 
> > > diff --git a/xen/common/schedule.c b/xen/common/schedule.c
> > > index 7b71581..6dd6603 100644
> > > --- a/xen/common/schedule.c
> > > +++ b/xen/common/schedule.c
> > > @@ -1539,6 +1539,87 @@ static void schedule(void)
> > >       context_switch(prev, next);
> > >   }
> > > +DEFINE_PER_CPU(struct tacc, tacc);
> > > +
> > > +static void tacc_state_change(enum TACC_STATES new_state)
> > 
> > This should never be called with the TACC_IRQ, right?
> 
> Yes. Actually, tacc->state should never be TACC_IRQ.
> Because of TACC_IRQ reenterability it is handled through the tacc->irq_cnt and tacc->irq_enter_time.
> 
> > > +{
> > > +    s_time_t now, delta;
> > > +    struct tacc* tacc = &this_cpu(tacc);
> > > +    unsigned long flags;
> > > +
> > > +    local_irq_save(flags);
> > > +
> > > +    now = NOW();
> > > +    delta = now - tacc->state_entry_time;
> > > +
> > > +    /* We do not expect states reenterability (at least through this function)*/
> > > +    ASSERT(new_state != tacc->state);
> > > +
> > > +    tacc->state_time[tacc->state] += delta - tacc->irq_time;
> > > +    tacc->state_time[TACC_IRQ] += tacc->irq_time;
> > > +    tacc->irq_time = 0;
> > > +    tacc->state = new_state;
> > > +    tacc->state_entry_time = now;
> > > +
> > > +    local_irq_restore(flags);
> > > +}
> > > +
> > > +void tacc_hyp(int place)
> > 
> > Place is never used except for your commented printk. So what's the goal for it?
> 
> Place is just a piece of code used for debugging, as well as printk. I keept it here because this series is very RFC, yet it could be removed.
> 
> > Also, is it really necessary to provide helper for each state? Couldn't we just introduce one functions doing all the state?
> 
> I'd like calling that stuff from assembler without parameters. But have no strong opinion here.
>   
> > > +{
> > > +//    printk("\ttacc_hyp %u, place %d\n", smp_processor_id(), place);
> > > +    tacc_state_change(TACC_HYP);
> > > +}
> > > +
> > > +void tacc_guest(int place)
> > > +{
> > > +//    printk("\ttacc_guest %u, place %d\n", smp_processor_id(), place);
> > > +    tacc_state_change(TACC_GUEST);
> > > +}
> > > +
> > > +void tacc_idle(int place)
> > > +{
> > > +//    printk("\tidle cpu %u, place %d\n", smp_processor_id(), place);
> > > +    tacc_state_change(TACC_IDLE);
> > > +}
> > > +
> > > +void tacc_gsync(int place)
> > > +{
> > > +//    printk("\ttacc_gsync %u, place %d\n", smp_processor_id(), place);
> > > +    tacc_state_change(TACC_GSYNC);
> > > +}
> > > +
> > > +void tacc_irq_enter(int place)
> > > +{
> > > +    struct tacc* tacc = &this_cpu(tacc);
> > > +
> > > +//    printk("\ttacc_irq_enter %u, place %d, cnt %d\n", smp_processor_id(), place, this_cpu(tacc).irq_cnt);
> > > +    ASSERT(!local_irq_is_enabled());
> > > +    ASSERT(tacc->irq_cnt >= 0);
> > > +
> > > +    if ( tacc->irq_cnt == 0 )
> > > +    {
> > > +        tacc->irq_enter_time = NOW();
> > > +    }
> > > +
> > > +    tacc->irq_cnt++;
> > > +}
> > > +
> > > +void tacc_irq_exit(int place)
> > > +{
> > > +    struct tacc* tacc = &this_cpu(tacc);
> > > +
> > > +//    printk("\ttacc_irq_exit %u, place %d, cnt %d\n", smp_processor_id(), place, tacc->irq_cnt);
> > > +    ASSERT(!local_irq_is_enabled());
> > > +    ASSERT(tacc->irq_cnt > 0);
> > > +    if ( tacc->irq_cnt == 1 )
> > > +    {
> > > +        tacc->irq_time = NOW() - tacc->irq_enter_time;
> > 
> > If I understand correctly, you will use irq_time to update TACC_IRQ in tacc_state_change(). It may be possible to receive another interrupt before the state is changed (e.g. HYP -> GUEST). This means only the time for the last IRQ received would be accounted.
> 
> I do lock IRQs for state change. Shouldn't that protect it?
> 
> > > +        tacc->irq_enter_time = 0;
> > > +    }
> > > +
> > > +    tacc->irq_cnt--;
> > > +}
> > > +
> > >   void context_saved(struct vcpu *prev)
> > >   {
> > >       /* Clear running flag /after/ writing context to memory. */
> > > diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
> > > index e3601c1..04a8724 100644
> > > --- a/xen/include/xen/sched.h
> > > +++ b/xen/include/xen/sched.h
> > > @@ -1002,6 +1002,33 @@ extern void dump_runq(unsigned char key);
> > >   void arch_do_physinfo(struct xen_sysctl_physinfo *pi);
> > > +enum TACC_STATES {
> > 
> > We don't tend to use all uppercases for enum name.
> 
> OK.
> 
> > > +    TACC_HYP = 0,
> > 
> > enum begins at 0 and increment by one every time. So there is no need to hardcode a number.
> > 
> > Also, looking at the code, I think you rely on the first state to be TACC_HYP. Am I correct?
> 
> TACC_HYP is expected to be the initial state of the PCPU.
> 
> > > +    TACC_GUEST = 1,
> > > +    TACC_IDLE = 2,
> > > +    TACC_IRQ = 3,
> > > +    TACC_GSYNC = 4,
> > > +    TACC_STATES_MAX
> > > +};
> > > It would be good to document all the states in the header as well.
> 
> OK.
> 
> > > +
> > > +struct tacc
> > 
> > Please document the structure.
> 
> OK.
> 
> > > +{
> > > +    s_time_t state_time[TACC_STATES_MAX];
> > > +    s_time_t state_entry_time;
> > > +    int state;
> > 
> > This should be the enum you used above here.
> 
> Yep.
> 
> > > +
> > > +    s_time_t guest_time;
> > 
> > This is not used.
> 
> Yep, will drop it.
> 
> > > +
> > > +    s_time_t irq_enter_time;
> > > +    s_time_t irq_time;
> > > +    int irq_cnt;
> > Why do you need this to be signed?
> 
> For assertion.
>   
> > > +};
> > > +
> > > +DECLARE_PER_CPU(struct tacc, tacc);
> > > +
> > > +void tacc_hyp(int place);
> > > +void tacc_idle(int place);
> > > +
> > >   #endif /* __SCHED_H__ */
> > >   /*
> > > 
> > 
> > Cheers,
> >
Dario Faggioli May 29, 2020, 8:48 a.m. UTC | #6
On Tue, 2020-05-26 at 02:27 +0000, Volodymyr Babchuk wrote:
> Hello All,
> 
Hello Volodymyr,

> This is gentle reminder about this RFC. 
> 
> Sadly, Andrii Anisov has left our team. But I'm commited to continue
> his work on time accounting and real time scheduling.
> 
Ok, so, first of all, sorry that this has not been properly addressed.

I personally never forgot about it or anything... Still, I haven't been
able to look into it properly.

> I do realize, that proposed patches have become moldy. I can rebase
> them onto current master, if it would help. 
>
As a matter of fact, it would. Especially considering that, AFAICT,
this pre-dates core-scheduling.

So, if you're really keen on doing such a rebase and resending, I will be
happy to have a look at how it ends up looking.

Thanks and Regards
Volodymyr Babchuk June 2, 2020, 1:12 a.m. UTC | #7
On Fri, 2020-05-29 at 10:48 +0200, Dario Faggioli wrote:
> On Tue, 2020-05-26 at 02:27 +0000, Volodymyr Babchuk wrote:
> > Hello All,
> > 
> Hello Volodymyr,
> 

Hi Dario,

> > This is gentle reminder about this RFC. 
> > 
> > Sadly, Andrii Anisov has left our team. But I'm commited to continue
> > his work on time accounting and real time scheduling.
> > 
> Ok, so, first of all, sorry that this has not been properly addressed.
> 
> I personally never forgot about it or anything... Still, I haven't been
> able to look into it properly.
> 

I see.. Anyways, thanks for the reply. 

Actually, I tried not only to rebase this patch series onto the current
mainline, but also to add x86 support. This gave me a deeper
understanding of the inner workings. At least I hope so :)

Anyways, I want to discuss the matter before continuing reworking the
patches. The goal of those patches is to account guest time more
precisely. 

Right now I can see only two main reasons why a guest can be charged
for time it didn't use: interrupts and soft irqs.

- do_softirq() is called every time we leave hypervisor mode. It is
used to do housekeeping for the hypervisor itself. But some random
guest will be charged for the time spent in do_softirq() unless this
function is called on an idle vcpu.

- also, a pCPU can be interrupted by an IRQ assigned to some other guest
or to the hypervisor itself. But the time spent in the interrupt handler
will be charged to the guest being interrupted.

So, basically, to account guest time correctly, we need to subtract
time spent in do_softirq() and in do_IRQ().

Actually, we can charge the correct guest for time spent in do_IRQ(),
because the handler code will eventually know the target vCPU for the
interrupt. There is a technical problem with interrupt nesting. We will
need some stack to track nesting correctly. But this is doable.

Just for statistical purposes we can track hypervisor time somewhere,
but it is not needed for scheduling decisions.

Am I missing something?
Dario Faggioli June 3, 2020, 3:22 p.m. UTC | #8
On Tue, 2020-06-02 at 01:12 +0000, Volodymyr Babchuk wrote:
> On Fri, 2020-05-29 at 10:48 +0200, Dario Faggioli wrote:
> > 
> Actually, I tried to not only rebase this patch series to the current
> mainline, but also to add x86 support. This gave me deeper
> unsterstanding of the inner workings. At least I hope so :)
> 
Right.

> Anyways, I want to discuss the matter before continuing reworking the
> patches. The goal of those patches is to account guest time more
> precisely. 
> 
Yes, I agree. IIRC, the patches are doing more than that, e.g.,
discriminating between the runtime of the idle vCPUs and the time
during which the CPUs were actually idle, and even trying to classify
somehow what the hypervisor was actually doing (guest sync, etc).

But, indeed, I would very much start with the one you stated above, as
a goal.

> Right now I can see only two main reasons, when guest can be charged
> for a time it dindn't used: interrupts and soft irqs. 
> 
> - do_softirq() is called every time we leave hypervisor mode. It is
> used to do housekeeping for the hypervisor itself. But, some random
> guest will charged for time spent in do_softirq() unless this
> function
> is not called on a idle vcpu.
> 
> - also, pCPU can be interrupted by IRQ assigned to some other guest
> or
> to hypervisor itself. But time spent in interrupt handler will be
> charged for a guest being interrupted.
> 
I think those are the ones, yes.

> So, basically, to account guest time correctly, we need to substract
> time spent in do_softirq() and in do_IRQ(). 
> 
That's how I'd try to do this, if it were me doing it.

> Actually, we can charge the correct guest for time spent in do_IRQ(),
> because handler code will eventually know target vCPU for the
> interrupt. There is technical problem with interrupt nesting. We will
> need some stack to track nesting correctly. But this is doable.
> 
Yes, there's this, and maybe a few other "dependencies" that we may
discuss, and try to track and account for, for even greater
fairness. But maybe this can come as a second step?

> Just for statistical purposes we can track hypervisor time somwhere,
> but it is not needed for scheduling decisions.
> 
What we need is, I think, a way to tell the user/admin that that time
is being spent in the hypervisor. E.g., if we were spending (let's
exaggerate) 20% of the time processing interrupts and softirqs, the
user would see some of this 20% load coming from each guest. It
certainly wasn't ideal, but we do not want that 20% to suddenly
vanish either.

> Am I missing something?
>
To me, it seems you're not. :-)

Regards
diff mbox series

Patch

diff --git a/xen/common/schedule.c b/xen/common/schedule.c
index 7b71581..6dd6603 100644
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -1539,6 +1539,87 @@  static void schedule(void)
     context_switch(prev, next);
 }
 
+DEFINE_PER_CPU(struct tacc, tacc);
+
+static void tacc_state_change(enum TACC_STATES new_state)
+{
+    s_time_t now, delta;
+    struct tacc* tacc = &this_cpu(tacc);
+    unsigned long flags;
+
+    local_irq_save(flags);
+
+    now = NOW();
+    delta = now - tacc->state_entry_time;
+
+    /* We do not expect states reenterability (at least through this function)*/
+    ASSERT(new_state != tacc->state);
+
+    tacc->state_time[tacc->state] += delta - tacc->irq_time;
+    tacc->state_time[TACC_IRQ] += tacc->irq_time;
+    tacc->irq_time = 0;
+    tacc->state = new_state;
+    tacc->state_entry_time = now;
+
+    local_irq_restore(flags);
+}
+
+void tacc_hyp(int place)
+{
+//    printk("\ttacc_hyp %u, place %d\n", smp_processor_id(), place);
+    tacc_state_change(TACC_HYP);
+}
+
+void tacc_guest(int place)
+{
+//    printk("\ttacc_guest %u, place %d\n", smp_processor_id(), place);
+    tacc_state_change(TACC_GUEST);
+}
+
+void tacc_idle(int place)
+{
+//    printk("\tidle cpu %u, place %d\n", smp_processor_id(), place);
+    tacc_state_change(TACC_IDLE);
+}
+
+void tacc_gsync(int place)
+{
+//    printk("\ttacc_gsync %u, place %d\n", smp_processor_id(), place);
+    tacc_state_change(TACC_GSYNC);
+}
+
+void tacc_irq_enter(int place)
+{
+    struct tacc* tacc = &this_cpu(tacc);
+
+//    printk("\ttacc_irq_enter %u, place %d, cnt %d\n", smp_processor_id(), place, this_cpu(tacc).irq_cnt);
+    ASSERT(!local_irq_is_enabled());
+    ASSERT(tacc->irq_cnt >= 0);
+
+    if ( tacc->irq_cnt == 0 )
+    {
+        tacc->irq_enter_time = NOW();
+    }
+
+    tacc->irq_cnt++;
+}
+
+void tacc_irq_exit(int place)
+{
+    struct tacc* tacc = &this_cpu(tacc);
+
+//    printk("\ttacc_irq_exit %u, place %d, cnt %d\n", smp_processor_id(), place, tacc->irq_cnt);
+    ASSERT(!local_irq_is_enabled());
+    ASSERT(tacc->irq_cnt > 0);
+    if ( tacc->irq_cnt == 1 )
+    {
+        tacc->irq_time = NOW() - tacc->irq_enter_time;
+        tacc->irq_enter_time = 0;
+    }
+
+    tacc->irq_cnt--;
+}
+
 void context_saved(struct vcpu *prev)
 {
     /* Clear running flag /after/ writing context to memory. */
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index e3601c1..04a8724 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -1002,6 +1002,33 @@  extern void dump_runq(unsigned char key);
 
 void arch_do_physinfo(struct xen_sysctl_physinfo *pi);
 
+enum TACC_STATES {
+    TACC_HYP = 0,
+    TACC_GUEST = 1,
+    TACC_IDLE = 2,
+    TACC_IRQ = 3,
+    TACC_GSYNC = 4,
+    TACC_STATES_MAX
+};
+
+struct tacc
+{
+    s_time_t state_time[TACC_STATES_MAX];
+    s_time_t state_entry_time;
+    int state;
+
+    s_time_t guest_time;
+
+    s_time_t irq_enter_time;
+    s_time_t irq_time;
+    int irq_cnt;
+};
+
+DECLARE_PER_CPU(struct tacc, tacc);
+
+void tacc_hyp(int place);
+void tacc_idle(int place);
+
 #endif /* __SCHED_H__ */
 
 /*