
[RFC,v1,2/6] sched: track time spent in hypervisor tasks

Message ID 20200612002205.174295-3-volodymyr_babchuk@epam.com (mailing list archive)
State New, archived
Series Fair scheduling

Commit Message

Volodymyr Babchuk June 12, 2020, 12:22 a.m. UTC
In most cases hypervisor code performs guest-related jobs. Tasks like
hypercall handling or MMIO access emulation are done on behalf of the
calling vCPU, so it is okay to charge the time spent in the hypervisor
to the current vCPU.

But there are also tasks that do not originate from guests. This
includes things like TLB flushing or running tasklets. We don't want to
charge the time spent in these tasks to the total scheduling unit run
time, so we need to track the time spent in such housekeeping tasks
separately.

Those hypervisor tasks are run from the do_softirq() function, so we'll
install our hooks there.

TODO: This change is not tested on ARM, and we'll probably get a
failing assertion there. This is because the ARM code returns from
schedule() and thus has a chance to reach the end of do_softirq().

Signed-off-by: Volodymyr Babchuk <volodymyr_babchuk@epam.com>
---
 xen/common/sched/core.c | 32 ++++++++++++++++++++++++++++++++
 xen/common/softirq.c    |  2 ++
 xen/include/xen/sched.h | 16 +++++++++++++++-
 3 files changed, 49 insertions(+), 1 deletion(-)

Comments

Jürgen Groß June 12, 2020, 4:43 a.m. UTC | #1
On 12.06.20 02:22, Volodymyr Babchuk wrote:
> In most cases hypervisor code performs guest-related jobs. Tasks like
> hypercall handling or MMIO access emulation are done on behalf of the
> calling vCPU, so it is okay to charge the time spent in the hypervisor
> to the current vCPU.
> 
> But there are also tasks that do not originate from guests. This
> includes things like TLB flushing or running tasklets. We don't want to
> charge the time spent in these tasks to the total scheduling unit run
> time, so we need to track the time spent in such housekeeping tasks
> separately.
> 
> Those hypervisor tasks are run from the do_softirq() function, so we'll
> install our hooks there.
> 
> TODO: This change is not tested on ARM, and we'll probably get a
> failing assertion there. This is because the ARM code returns from
> schedule() and thus has a chance to reach the end of do_softirq().
> 
> Signed-off-by: Volodymyr Babchuk <volodymyr_babchuk@epam.com>
> ---
>   xen/common/sched/core.c | 32 ++++++++++++++++++++++++++++++++
>   xen/common/softirq.c    |  2 ++
>   xen/include/xen/sched.h | 16 +++++++++++++++-
>   3 files changed, 49 insertions(+), 1 deletion(-)
> 
> diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c
> index 8f642ada05..d597811fef 100644
> --- a/xen/common/sched/core.c
> +++ b/xen/common/sched/core.c
> @@ -945,6 +945,37 @@ void vcpu_end_irq_handler(void)
>       atomic_add(delta, &current->sched_unit->irq_time);
>   }
>   
> +void vcpu_begin_hyp_task(struct vcpu *v)
> +{
> +    if ( is_idle_vcpu(v) )
> +        return;
> +
> +    ASSERT(!v->in_hyp_task);
> +
> +    v->hyp_entry_time = NOW();
> +#ifndef NDEBUG
> +    v->in_hyp_task = true;
> +#endif
> +}
> +
> +void vcpu_end_hyp_task(struct vcpu *v)
> +{
> +    int delta;
> +
> +    if ( is_idle_vcpu(v) )
> +        return;
> +
> +    ASSERT(v->in_hyp_task);
> +
> +    /* We assume that hypervisor task time will not overflow int */

This will definitely happen for long running VMs. Please use a 64-bit
variable.
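E.g. (just an untested sketch; NOW() returns s_time_t, a signed 64-bit
nanosecond value, so keeping the subtraction at full width would be):

    s_time_t delta = NOW() - v->hyp_entry_time;

The hyp_time accumulator would then have to grow to 64 bits as well,
instead of being an atomic_t (e.g. by updating it under the scheduler
lock).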

> +    delta = NOW() - v->hyp_entry_time;
> +    atomic_add(delta, &v->sched_unit->hyp_time);
> +
> +#ifndef NDEBUG
> +    v->in_hyp_task = false;
> +#endif
> +}
> +
>   /*
>    * Do the actual movement of an unit from old to new CPU. Locks for *both*
>    * CPUs needs to have been taken already when calling this!
> @@ -2615,6 +2646,7 @@ static void schedule(void)
>   
>       SCHED_STAT_CRANK(sched_run);
>   
> +    vcpu_end_hyp_task(current);
>       rcu_read_lock(&sched_res_rculock);
>   
>       lock = pcpu_schedule_lock_irq(cpu);
> diff --git a/xen/common/softirq.c b/xen/common/softirq.c
> index 063e93cbe3..03a29384d1 100644
> --- a/xen/common/softirq.c
> +++ b/xen/common/softirq.c
> @@ -71,7 +71,9 @@ void process_pending_softirqs(void)
>   void do_softirq(void)
>   {
>       ASSERT_NOT_IN_ATOMIC();
> +    vcpu_begin_hyp_task(current);
>       __do_softirq(0);
> +    vcpu_end_hyp_task(current);

This won't work for scheduling. current will either have changed, or in
the x86 case __do_softirq() might just not return. You need to handle
that case explicitly in schedule() (you did that for the old vcpu, but
for the case where schedule() returns you need to call
vcpu_begin_hyp_task(current) there).
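E.g., as a sketch, on the path where schedule() returns:

    /* schedule() is returning (we kept running on this vcpu instead of
     * exiting to a guest), so resume hypervisor time accounting. */
    vcpu_begin_hyp_task(current);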


Juergen
Volodymyr Babchuk June 12, 2020, 11:30 a.m. UTC | #2
On Fri, 2020-06-12 at 06:43 +0200, Jürgen Groß wrote:
> On 12.06.20 02:22, Volodymyr Babchuk wrote:
> > +void vcpu_end_hyp_task(struct vcpu *v)
> > +{
> > +    int delta;
> > +
> > +    if ( is_idle_vcpu(v) )
> > +        return;
> > +
> > +    ASSERT(v->in_hyp_task);
> > +
> > +    /* We assume that hypervisor task time will not overflow int */
> 
> This will definitely happen for long running VMs. Please use a 64-bit
> variable.
> 

It is not supposed to hold long time spans, as I described in the reply
to the previous email.

> > +    delta = NOW() - v->hyp_entry_time;
> > +    atomic_add(delta, &v->sched_unit->hyp_time);
> > +
> > +#ifndef NDEBUG
> > +    v->in_hyp_task = false;
> > +#endif
> > +}
> > +
> >   /*
> >    * Do the actual movement of an unit from old to new CPU. Locks for *both*
> >    * CPUs needs to have been taken already when calling this!
> > @@ -2615,6 +2646,7 @@ static void schedule(void)
> >   
> >       SCHED_STAT_CRANK(sched_run);
> >   
> > +    vcpu_end_hyp_task(current);
> >       rcu_read_lock(&sched_res_rculock);
> >   
> >       lock = pcpu_schedule_lock_irq(cpu);
> > diff --git a/xen/common/softirq.c b/xen/common/softirq.c
> > index 063e93cbe3..03a29384d1 100644
> > --- a/xen/common/softirq.c
> > +++ b/xen/common/softirq.c
> > @@ -71,7 +71,9 @@ void process_pending_softirqs(void)
> >   void do_softirq(void)
> >   {
> >       ASSERT_NOT_IN_ATOMIC();
> > +    vcpu_begin_hyp_task(current);
> >       __do_softirq(0);
> > +    vcpu_end_hyp_task(current);
> 
> This won't work for scheduling. current will either have changed, or in
> the x86 case __do_softirq() might just not return. You need to handle
> that case explicitly in schedule() (you did that for the old vcpu, but
> for the case where schedule() returns you need to call
> vcpu_begin_hyp_task(current) there).
> 

Well, this is one of the questions I wanted to discuss. I certainly need
to call vcpu_begin_hyp_task(current) after a context switch. But what is
the right place? If my understanding is right, the code on the x86
platform will never reach this point. Or am I wrong there?
Jürgen Groß June 12, 2020, 11:40 a.m. UTC | #3
On 12.06.20 13:30, Volodymyr Babchuk wrote:
> On Fri, 2020-06-12 at 06:43 +0200, Jürgen Groß wrote:
>> On 12.06.20 02:22, Volodymyr Babchuk wrote:
>>> +void vcpu_end_hyp_task(struct vcpu *v)
>>> +{
>>> +    int delta;
>>> +
>>> +    if ( is_idle_vcpu(v) )
>>> +        return;
>>> +
>>> +    ASSERT(v->in_hyp_task);
>>> +
>>> +    /* We assume that hypervisor task time will not overflow int */
>>
>> This will definitely happen for long running VMs. Please use a 64-bit
>> variable.
>>
> 
> It is not supposed to hold long time spans, as I described in the reply
> to the previous email.
> 
>>> +    delta = NOW() - v->hyp_entry_time;
>>> +    atomic_add(delta, &v->sched_unit->hyp_time);
>>> +
>>> +#ifndef NDEBUG
>>> +    v->in_hyp_task = false;
>>> +#endif
>>> +}
>>> +
>>>    /*
>>>     * Do the actual movement of an unit from old to new CPU. Locks for *both*
>>>     * CPUs needs to have been taken already when calling this!
>>> @@ -2615,6 +2646,7 @@ static void schedule(void)
>>>    
>>>        SCHED_STAT_CRANK(sched_run);
>>>    
>>> +    vcpu_end_hyp_task(current);
>>>        rcu_read_lock(&sched_res_rculock);
>>>    
>>>        lock = pcpu_schedule_lock_irq(cpu);
>>> diff --git a/xen/common/softirq.c b/xen/common/softirq.c
>>> index 063e93cbe3..03a29384d1 100644
>>> --- a/xen/common/softirq.c
>>> +++ b/xen/common/softirq.c
>>> @@ -71,7 +71,9 @@ void process_pending_softirqs(void)
>>>    void do_softirq(void)
>>>    {
>>>        ASSERT_NOT_IN_ATOMIC();
>>> +    vcpu_begin_hyp_task(current);
>>>        __do_softirq(0);
>>> +    vcpu_end_hyp_task(current);
>>
>> This won't work for scheduling. current will either have changed, or in
>> the x86 case __do_softirq() might just not return. You need to handle
>> that case explicitly in schedule() (you did that for the old vcpu, but
>> for the case where schedule() returns you need to call
>> vcpu_begin_hyp_task(current) there).
>>
> 
> Well, this is one of the questions I wanted to discuss. I certainly need
> to call vcpu_begin_hyp_task(current) after a context switch. But what is
> the right place? If my understanding is right, the code on the x86
> platform will never reach this point. Or am I wrong there?

No, this is correct.

You can add the call to context_switch() just after set_current() has
been called.
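As a rough sketch (the exact surroundings of set_current() in
context_switch() differ between architectures):

    set_current(next);
    /* current is now 'next'; this pairs with the vcpu_end_hyp_task()
     * call at the end of do_softirq() once we unwind back there. */
    vcpu_begin_hyp_task(current);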


Juergen
Jan Beulich June 16, 2020, 10:10 a.m. UTC | #4
On 12.06.2020 02:22, Volodymyr Babchuk wrote:
> In most cases hypervisor code performs guest-related jobs. Tasks like
> hypercall handling or MMIO access emulation are done on behalf of the
> calling vCPU, so it is okay to charge the time spent in the hypervisor
> to the current vCPU.
> 
> But there are also tasks that do not originate from guests. This
> includes things like TLB flushing or running tasklets. We don't want to
> charge the time spent in these tasks to the total scheduling unit run
> time, so we need to track the time spent in such housekeeping tasks
> separately.
> 
> Those hypervisor tasks are run from the do_softirq() function, so we'll
> install our hooks there.

I can see the point and desire, but it feels like you're moving from
one kind of unfairness to another: A softirq may very well be on
behalf of a specific vCPU, in which case not charging current should
lead to charging that specific one (which may still be current then).
Even more than for TLB flushes this may be relevant for the cases
where (on x86) we issue WBINVD on behalf of a guest.

Jan
Volodymyr Babchuk June 18, 2020, 2:50 a.m. UTC | #5
Hi Jan,

Jan Beulich writes:

> On 12.06.2020 02:22, Volodymyr Babchuk wrote:
>> In most cases hypervisor code performs guest-related jobs. Tasks like
>> hypercall handling or MMIO access emulation are done on behalf of the
>> calling vCPU, so it is okay to charge the time spent in the hypervisor
>> to the current vCPU.
>> 
>> But there are also tasks that do not originate from guests. This
>> includes things like TLB flushing or running tasklets. We don't want to
>> charge the time spent in these tasks to the total scheduling unit run
>> time, so we need to track the time spent in such housekeeping tasks
>> separately.
>> 
>> Those hypervisor tasks are run from the do_softirq() function, so we'll
>> install our hooks there.
>
> I can see the point and desire, but it feels like you're moving from
> one kind of unfairness to another: A softirq may very well be on
> behalf of a specific vCPU, in which case not charging current should
> lead to charging that specific one (which may still be current then).
> Even more than for TLB flushes this may be relevant for the cases
> where (on x86) we issue WBINVD on behalf of a guest.

I agree with you. We discussed something similar with Dario, but in the
do_IRQ() context: we can determine which vCPU we are handling an
interrupt for, and we can charge that vCPU for the time spent. The same
holds for the cases you described: for some softirqs there is a known
benefactor, so we can charge it for the time spent.
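Roughly, I imagine something like this (just a sketch, width issues
aside; softirq_benefactor() and the guest_time field are invented here
purely for illustration):

    /* Hypothetical: a softirq with a known benefactor vCPU. */
    struct vcpu *v = softirq_benefactor(nr);
    s_time_t t0 = NOW();

    softirq_handlers[nr]();

    /* Charge the benefactor if there is one, otherwise account the
     * time as housekeeping work, as this patch already does. */
    atomic_add(NOW() - t0, v ? &v->sched_unit->guest_time
                             : &current->sched_unit->hyp_time);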

Dario and I agreed to implement this in the second stage. I'm working on
the next version of the patches and I'll look at this more closely.
There is a possibility that I'll introduce that feature, but I'll need
some help from you or some other x86 expert.

Anyway, are you okay with the general approach? We will work out the
details, but I want to be sure that I'm moving in the right direction.
Jan Beulich June 18, 2020, 6:34 a.m. UTC | #6
On 18.06.2020 04:50, Volodymyr Babchuk wrote:
> Anyway, are you okay with the general approach? We will work out the
> details, but I want to be sure that I'm moving in the right direction.

I'm certainly okay with the goal; I didn't look closely enough to say
I'm okay with the approach - I trust Dario there.

Jan
Volodymyr Babchuk Sept. 24, 2020, 6:08 p.m. UTC | #7
Hello Jürgen,

Jürgen Groß writes:

> On 12.06.20 13:30, Volodymyr Babchuk wrote:
>> On Fri, 2020-06-12 at 06:43 +0200, Jürgen Groß wrote:
>>> On 12.06.20 02:22, Volodymyr Babchuk wrote:

[...]
>>>> +    delta = NOW() - v->hyp_entry_time;
>>>> +    atomic_add(delta, &v->sched_unit->hyp_time);
>>>> +
>>>> +#ifndef NDEBUG
>>>> +    v->in_hyp_task = false;
>>>> +#endif
>>>> +}
>>>> +
>>>>    /*
>>>>     * Do the actual movement of an unit from old to new CPU. Locks for *both*
>>>>     * CPUs needs to have been taken already when calling this!
>>>> @@ -2615,6 +2646,7 @@ static void schedule(void)
>>>>           SCHED_STAT_CRANK(sched_run);
>>>>    +    vcpu_end_hyp_task(current);
>>>>        rcu_read_lock(&sched_res_rculock);
>>>>           lock = pcpu_schedule_lock_irq(cpu);
>>>> diff --git a/xen/common/softirq.c b/xen/common/softirq.c
>>>> index 063e93cbe3..03a29384d1 100644
>>>> --- a/xen/common/softirq.c
>>>> +++ b/xen/common/softirq.c
>>>> @@ -71,7 +71,9 @@ void process_pending_softirqs(void)
>>>>    void do_softirq(void)
>>>>    {
>>>>        ASSERT_NOT_IN_ATOMIC();
>>>> +    vcpu_begin_hyp_task(current);
>>>>        __do_softirq(0);
>>>> +    vcpu_end_hyp_task(current);
>>>
>>> This won't work for scheduling. current will either have changed, or in
>>> the x86 case __do_softirq() might just not return. You need to handle
>>> that case explicitly in schedule() (you did that for the old vcpu, but
>>> for the case where schedule() returns you need to call
>>> vcpu_begin_hyp_task(current) there).
>>>
>> Well, this is one of the questions I wanted to discuss. I certainly
>> need to call vcpu_begin_hyp_task(current) after a context switch. But
>> what is the right place? If my understanding is right, the code on the
>> x86 platform will never reach this point. Or am I wrong there?
>
> No, this is correct.
>
> You can add the call to context_switch() just after set_current() has
> been called.

Looks like I'm missing something there. If I get this right, the code
you mentioned is executed right before leaving the hypervisor.

So, as I see it, the functions are called in the following way (on x86):

1. do_softirq() calls vcpu_begin_hyp_task() and then executes
__do_softirq()

2. __do_softirq() performs various jobs and eventually calls schedule()

3. schedule() calls vcpu_end_hyp_task() and makes a scheduling decision,
which leads to a call to context_switch()

4. At the end of context_switch() we exit the hypervisor and enter the
VM. At least, this is how I understand the

       nextd->arch.ctxt_switch->tail(next);

call.
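
Or, as a call tree (again, just my understanding of the x86 path):

    do_softirq()
        vcpu_begin_hyp_task(current)
        __do_softirq(0)
            schedule()
                vcpu_end_hyp_task(current)
                context_switch()
                    nextd->arch.ctxt_switch->tail(next)  /* exits to the
                                                            guest, never
                                                            returns */
        vcpu_end_hyp_task(current)   /* never reached on this path */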

So, no need to call vcpu_begin_hyp_task() in context_switch() for x86.

On ARM, this is a different story. There, I am calling
vcpu_begin_hyp_task() after set_current(), because the ARM code will
eventually return to do_softirq(), where the corresponding
vcpu_end_hyp_task() will be called.

I have put a bunch of ASSERTs in place to ensure that
vcpu_begin_hyp_task() and vcpu_end_hyp_task() are not called twice, and
that vcpu_end_hyp_task() is called after vcpu_begin_hyp_task(). Those
ASSERTs are not failing, so I assume I did all this the right way :)
Dario Faggioli Sept. 25, 2020, 5:22 p.m. UTC | #8
On Thu, 2020-09-24 at 18:08 +0000, Volodymyr Babchuk wrote:
> So, as I see it, the functions are called in the following way (on
> x86):
> 
> 1. do_softirq() calls vcpu_begin_hyp_task() and then executes
> __do_softirq()
> 
> 2. __do_softirq() performs various jobs and eventually calls schedule()
> 
> 3. schedule() calls vcpu_end_hyp_task() and makes a scheduling decision,
> which leads to a call to context_switch()
> 
> 4. At the end of context_switch() we exit the hypervisor and enter the
> VM. At least, this is how I understand the
> 
>        nextd->arch.ctxt_switch->tail(next);
> 
> call.
> 
> So, no need to call vcpu_begin_hyp_task() in context_switch() for
> x86.
> 
Mmm... This looks correct to me too.

And what about the cases where schedule() does return?

Are these also fine because they're handled within __do_softirq()
(i.e., without actually going back to do_softirq() and hence never
calling end_hyp_task() for a second time)?


> I have put a bunch of ASSERTs in place to ensure that
> vcpu_begin_hyp_task() and vcpu_end_hyp_task() are not called twice, and
> that vcpu_end_hyp_task() is called after vcpu_begin_hyp_task(). Those
> ASSERTs are not failing, so I assume I did all this the right way :)
> 
Yeah, good to know. :-)

Are you doing these tests with both core-scheduling disabled and
enabled?

Regards
Volodymyr Babchuk Sept. 25, 2020, 8:21 p.m. UTC | #9
Hi Dario,


Dario Faggioli writes:

> On Thu, 2020-09-24 at 18:08 +0000, Volodymyr Babchuk wrote:
>> So, as I see it, the functions are called in the following way (on
>> x86):
>> 
>> 1. do_softirq() calls vcpu_begin_hyp_task() and then executes
>> __do_softirq()
>> 
>> 2. __do_softirq() performs various jobs and eventually calls schedule()
>> 
>> 3. schedule() calls vcpu_end_hyp_task() and makes a scheduling decision,
>> which leads to a call to context_switch()
>> 
>> 4. At the end of context_switch() we exit the hypervisor and enter the
>> VM. At least, this is how I understand the
>> 
>>        nextd->arch.ctxt_switch->tail(next);
>> 
>> call.
>> 
>> So, no need to call vcpu_begin_hyp_task() in context_switch() for
>> x86.
>> 
> Mmm... This looks correct to me too.
>
> And what about the cases where schedule() does return?

Can it return on x86? I want to test this case, but how do I force it?
The null scheduler, perhaps?

> Are these also fine because they're handled within __do_softirq()
> (i.e., without actually going back to do_softirq() and hence never
> calling end_hyp_task() for a second time)?

I'm afraid there will be a bug: schedule() calls end_hyp_task(), and if
execution eventually returns from __do_softirq() to do_softirq(),
end_hyp_task() will be called twice.

>
>> I have put a bunch of ASSERTs in place to ensure that
>> vcpu_begin_hyp_task() and vcpu_end_hyp_task() are not called twice, and
>> that vcpu_end_hyp_task() is called after vcpu_begin_hyp_task(). Those
>> ASSERTs are not failing, so I assume I did all this the right way :)
>> 
> Yeah, good to know. :-)
>
> Are you doing these tests with both core-scheduling disabled and
> enabled?

Good question. On x86 I am running Xen in QEMU. With -smp 2 it sees two
CPUs:

(XEN) Brought up 2 CPUs
(XEN) Scheduling granularity: cpu, 1 CPU per sched-resource

You are right, I need to try other variants of scheduling granularity.

Do you by any chance know how to emulate a more complex setup in QEMU?
Also, what is the preferred way to test/debug Xen on x86?
Dario Faggioli Sept. 25, 2020, 9:42 p.m. UTC | #10
On Fri, 2020-09-25 at 20:21 +0000, Volodymyr Babchuk wrote:
> Hi Dario,
> 
Hi! :-)

> Dario Faggioli writes:
> > And what about the cases where schedule() does return?
> 
> Can it return on x86? I want to test this case, but how do I force it?
> The null scheduler, perhaps?
> 
> > Are these also fine because they're handled within __do_softirq()
> > (i.e., without actually going back to do_softirq() and hence never
> > calling end_hyp_task() for a second time)?
> 
> I'm afraid there will be a bug: schedule() calls end_hyp_task(), and if
> execution eventually returns from __do_softirq() to do_softirq(),
> end_hyp_task() will be called twice.
>
Yeah, exactly. That's why I was asking whether you had verified that we
actually never get to this. Either because we context switch, or because
we stay inside __do_softirq() and never go back to do_softirq().

I was, in fact, referring to all the various cases of handling primary
and secondary scheduling requests when core-scheduling is enabled.

> > > I have put a bunch of ASSERTs in place to ensure that
> > > vcpu_begin_hyp_task() and vcpu_end_hyp_task() are not called twice,
> > > and that vcpu_end_hyp_task() is called after vcpu_begin_hyp_task().
> > > Those ASSERTs are not failing, so I assume I did all this the right
> > > way :)
> > > 
> > Yeah, good to know. :-)
> > 
> > Are you doing these tests with both core-scheduling disabled and
> > enabled?
> 
> Good question. On x86 I am running Xen in QEMU. With -smp 2 it sees
> two CPUs:
> 
> (XEN) Brought up 2 CPUs
> (XEN) Scheduling granularity: cpu, 1 CPU per sched-resource
> 
> You are right, I need to try other variants of scheduling
> granularity.
> 
> Do you by any chance know how to emulate a more complex setup in QEMU?
>
Like enabling a virtual topology, on top of which you could test core
(or socket) scheduling? If yes, indeed you can do that in QEMU:

https://www.qemu.org/docs/master/qemu-doc.html

-smp [cpus=]n[,cores=cores][,threads=threads][,dies=dies]
     [,sockets=sockets][,maxcpus=maxcpus]

Simulate an SMP system with n CPUs. On the PC target, up to 255 CPUs are
supported. On the Sparc32 target, Linux limits the number of usable CPUs
to 4. For the PC target, the number of cores per die, the number of
threads per core, the number of dies per package and the total number of
sockets can be specified. Missing values will be computed. If any of the
three values is given, the total number of CPUs n can be omitted. maxcpus
specifies the maximum number of hotpluggable CPUs.

Once you have an SMT virtual topology, you can boot Xen inside, with a
higher scheduling granularity.

A (rather big!) example would be:

-smp 224,sockets=4,cores=28,threads=2

You can even define a virtual NUMA topology, if you want.

And you can pin the vCPUs to the physical CPUs of the host, in such a
way that the virtual topology is mapped to the physical one. This is
good for performance, but it also increases the accuracy of testing a
little bit.
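
For instance, a hypothetical invocation for such a test could look like
this (paths, sizes and the dom0 kernel arguments are made up; QEMU loads
Xen as a multiboot kernel, and sched-gran=core on Xen's command line
then raises the scheduling granularity):

    qemu-system-x86_64 -machine q35,accel=kvm -m 4096 \
        -smp 8,sockets=1,cores=4,threads=2 \
        -kernel xen.gz \
        -append "console=com1 sched-gran=core" \
        -initrd "vmlinuz console=hvc0,initrd.img" \
        -serial stdio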

> Also, what is the preferred way to test/debug Xen on x86?
> 
I test on real hardware, at least most of the time, if this is what
you're asking.

Checking if the code is "functionally correct" is ok-ish if done in a VM
first. But then, especially for scheduling-related things, where timing
plays a rather significant role, I personally prefer to test on actual
hardware sooner rather than later.

Regards

Patch

diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c
index 8f642ada05..d597811fef 100644
--- a/xen/common/sched/core.c
+++ b/xen/common/sched/core.c
@@ -945,6 +945,37 @@  void vcpu_end_irq_handler(void)
     atomic_add(delta, &current->sched_unit->irq_time);
 }
 
+void vcpu_begin_hyp_task(struct vcpu *v)
+{
+    if ( is_idle_vcpu(v) )
+        return;
+
+    ASSERT(!v->in_hyp_task);
+
+    v->hyp_entry_time = NOW();
+#ifndef NDEBUG
+    v->in_hyp_task = true;
+#endif
+}
+
+void vcpu_end_hyp_task(struct vcpu *v)
+{
+    int delta;
+
+    if ( is_idle_vcpu(v) )
+        return;
+
+    ASSERT(v->in_hyp_task);
+
+    /* We assume that hypervisor task time will not overflow int */
+    delta = NOW() - v->hyp_entry_time;
+    atomic_add(delta, &v->sched_unit->hyp_time);
+
+#ifndef NDEBUG
+    v->in_hyp_task = false;
+#endif
+}
+
 /*
  * Do the actual movement of an unit from old to new CPU. Locks for *both*
  * CPUs needs to have been taken already when calling this!
@@ -2615,6 +2646,7 @@  static void schedule(void)
 
     SCHED_STAT_CRANK(sched_run);
 
+    vcpu_end_hyp_task(current);
     rcu_read_lock(&sched_res_rculock);
 
     lock = pcpu_schedule_lock_irq(cpu);
diff --git a/xen/common/softirq.c b/xen/common/softirq.c
index 063e93cbe3..03a29384d1 100644
--- a/xen/common/softirq.c
+++ b/xen/common/softirq.c
@@ -71,7 +71,9 @@  void process_pending_softirqs(void)
 void do_softirq(void)
 {
     ASSERT_NOT_IN_ATOMIC();
+    vcpu_begin_hyp_task(current);
     __do_softirq(0);
+    vcpu_end_hyp_task(current);
 }
 
 void open_softirq(int nr, softirq_handler handler)
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index ceed53364b..51dc7c4551 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -239,7 +239,12 @@  struct vcpu
 
     /* Fair scheduling state */
     uint64_t         irq_entry_time;
+    uint64_t         hyp_entry_time;
     unsigned int     irq_nesting;
+#ifndef NDEBUG
+    bool             in_hyp_task;
+#endif
+
     /* Tasklet for continue_hypercall_on_cpu(). */
     struct tasklet   continue_hypercall_tasklet;
 
@@ -279,8 +284,9 @@  struct sched_unit {
     /* Vcpu state summary. */
     unsigned int           runstate_cnt[4];
 
-    /* Fair scheduling correction value */
+    /* Fair scheduling correction values */
     atomic_t               irq_time;
+    atomic_t               hyp_time;
 
     /* Bitmask of CPUs on which this VCPU may run. */
     cpumask_var_t          cpu_hard_affinity;
@@ -703,6 +709,14 @@  void vcpu_sleep_sync(struct vcpu *v);
 void vcpu_begin_irq_handler(void);
 void vcpu_end_irq_handler(void);
 
+/*
+ * Report to scheduler when we are doing housekeeping tasks on the
+ * current vcpu. This is called during do_softirq() but can be called
+ * anywhere else.
+ */
+void vcpu_begin_hyp_task(struct vcpu *v);
+void vcpu_end_hyp_task(struct vcpu *v);
+
 /*
  * Force synchronisation of given VCPU's state. If it is currently descheduled,
  * this call will ensure that all its state is committed to memory and that