diff mbox

[v6,2/3] KVM: dynamic halt_poll_ns adjustment

Message ID BLU436-SMTP5094DCF6122F1E1D93C68B80690@phx.gbl (mailing list archive)
State New, archived
Headers show

Commit Message

Wanpeng Li Sept. 2, 2015, 7:29 a.m. UTC
There is a downside of always-poll since polling still happens for idle 
vCPUs, which can waste cpu usage. This patch adds the ability to adjust 
halt_poll_ns dynamically, to grow halt_poll_ns when a short halt is detected, 
and to shrink halt_poll_ns when a long halt is detected.

There are two new kernel parameters for changing the halt_poll_ns:
halt_poll_ns_grow and halt_poll_ns_shrink. 

                        no-poll      always-poll    dynamic-poll
-----------------------------------------------------------------------
Idle (nohz) vCPU %c0     0.15%        0.3%            0.2%  
Idle (250HZ) vCPU %c0    1.1%         4.6%~14%        1.2%
TCP_RR latency           34us         27us            26.7us

"Idle (X) vCPU %c0" is the percent of time the physical cpu spent in
c0 over 60 seconds (each vCPU is pinned to a pCPU). (nohz) means the
guest was tickless. (250HZ) means the guest was ticking at 250HZ.

The big win is with ticking operating systems. Running the linux guest
with nohz=off (and HZ=250), we save 3.4%~12.8% CPUs/second and get close 
to no-polling overhead levels by using the dynamic-poll. The savings
should be even higher for higher frequency ticks.
	
Suggested-by: David Matlack <dmatlack@google.com>
Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
---
 virt/kvm/kvm_main.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 58 insertions(+), 3 deletions(-)

Comments

David Matlack Sept. 2, 2015, 6:09 p.m. UTC | #1
On Wed, Sep 2, 2015 at 12:29 AM, Wanpeng Li <wanpeng.li@hotmail.com> wrote:
> There is a downside of always-poll since poll is still happened for idle
> vCPUs which can waste cpu usage. This patch adds the ability to adjust
> halt_poll_ns dynamically, to grow halt_poll_ns when shot halt is detected,
> and to shrink halt_poll_ns when long halt is detected.
>
> There are two new kernel parameters for changing the halt_poll_ns:
> halt_poll_ns_grow and halt_poll_ns_shrink.
>
>                         no-poll      always-poll    dynamic-poll
> -----------------------------------------------------------------------
> Idle (nohz) vCPU %c0     0.15%        0.3%            0.2%
> Idle (250HZ) vCPU %c0    1.1%         4.6%~14%        1.2%
> TCP_RR latency           34us         27us            26.7us
>
> "Idle (X) vCPU %c0" is the percent of time the physical cpu spent in
> c0 over 60 seconds (each vCPU is pinned to a pCPU). (nohz) means the
> guest was tickless. (250HZ) means the guest was ticking at 250HZ.
>
> The big win is with ticking operating systems. Running the linux guest
> with nohz=off (and HZ=250), we save 3.4%~12.8% CPUs/second and get close
> to no-polling overhead levels by using the dynamic-poll. The savings
> should be even higher for higher frequency ticks.
>
> Suggested-by: David Matlack <dmatlack@google.com>
> Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
> ---
>  virt/kvm/kvm_main.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 58 insertions(+), 3 deletions(-)
>
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index c06e57c..3cff02f 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -66,9 +66,18 @@
>  MODULE_AUTHOR("Qumranet");
>  MODULE_LICENSE("GPL");
>
> -static unsigned int halt_poll_ns;
> +/* halt polling only reduces halt latency by 5-7 us, 500us is enough */
> +static unsigned int halt_poll_ns = 500000;
>  module_param(halt_poll_ns, uint, S_IRUGO | S_IWUSR);
>
> +/* Default doubles per-vcpu halt_poll_ns. */
> +static unsigned int halt_poll_ns_grow = 2;
> +module_param(halt_poll_ns_grow, int, S_IRUGO);
> +
> +/* Default resets per-vcpu halt_poll_ns . */
> +static unsigned int halt_poll_ns_shrink;
> +module_param(halt_poll_ns_shrink, int, S_IRUGO);
> +
>  /*
>   * Ordering of locks:
>   *
> @@ -1907,6 +1916,31 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
>  }
>  EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
>
> +static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
> +{
> +       int val = vcpu->halt_poll_ns;
> +
> +       /* 10us base */
> +       if (val == 0 && halt_poll_ns_grow)
> +               val = 10000;
> +       else
> +               val *= halt_poll_ns_grow;
> +
> +       vcpu->halt_poll_ns = val;
> +}
> +
> +static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
> +{
> +       int val = vcpu->halt_poll_ns;
> +
> +       if (halt_poll_ns_shrink == 0)
> +               val = 0;
> +       else
> +               val /= halt_poll_ns_shrink;
> +
> +       vcpu->halt_poll_ns = val;
> +}
> +
>  static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
>  {
>         if (kvm_arch_vcpu_runnable(vcpu)) {
> @@ -1929,6 +1963,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
>         ktime_t start, cur;
>         DEFINE_WAIT(wait);
>         bool waited = false;
> +       u64 poll_ns = 0, wait_ns = 0, block_ns = 0;
>
>         start = cur = ktime_get();
>         if (vcpu->halt_poll_ns) {
> @@ -1941,10 +1976,15 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
>                          */
>                         if (kvm_vcpu_check_block(vcpu) < 0) {
>                                 ++vcpu->stat.halt_successful_poll;
> -                               goto out;
> +                               break;
>                         }
>                         cur = ktime_get();
>                 } while (single_task_running() && ktime_before(cur, stop));
> +
> +               if (ktime_before(cur, stop)) {

You can't use 'cur' to tell if the interrupt arrived. single_task_running()
can break us out of the loop before 'stop'.

> +                       poll_ns = ktime_to_ns(cur) - ktime_to_ns(start);

Put this line before the if(). block_ns should always include the time
spent polling; even if polling does not succeed.

> +                       goto out;
> +               }
>         }
>
>         for (;;) {
> @@ -1959,9 +1999,24 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
>
>         finish_wait(&vcpu->wq, &wait);
>         cur = ktime_get();
> +       wait_ns = ktime_to_ns(cur) - ktime_to_ns(start);
>
>  out:
> -       trace_kvm_vcpu_wakeup(ktime_to_ns(cur) - ktime_to_ns(start), waited);
> +       block_ns = poll_ns + wait_ns;
> +
> +       if (halt_poll_ns) {

If you want, you can leave this if() out and save some indentation.

> +               if (block_ns <= vcpu->halt_poll_ns)
> +                       ;
> +               /* we had a long block, shrink polling */
> +               else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns)
> +                       shrink_halt_poll_ns(vcpu);
> +               /* we had a short halt and our poll time is too small */
> +               else if (vcpu->halt_poll_ns < halt_poll_ns &&
> +                       block_ns < halt_poll_ns)
> +                       grow_halt_poll_ns(vcpu);
> +       }
> +
> +       trace_kvm_vcpu_wakeup(block_ns, waited);
>  }
>  EXPORT_SYMBOL_GPL(kvm_vcpu_block);
>
> --
> 1.9.1
>
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini Sept. 2, 2015, 7:12 p.m. UTC | #2
On 02/09/2015 20:09, David Matlack wrote:
> On Wed, Sep 2, 2015 at 12:29 AM, Wanpeng Li <wanpeng.li@hotmail.com> wrote:
>> There is a downside of always-poll since poll is still happened for idle
>> vCPUs which can waste cpu usage. This patch adds the ability to adjust
>> halt_poll_ns dynamically, to grow halt_poll_ns when shot halt is detected,
>> and to shrink halt_poll_ns when long halt is detected.
>>
>> There are two new kernel parameters for changing the halt_poll_ns:
>> halt_poll_ns_grow and halt_poll_ns_shrink.
>>
>>                         no-poll      always-poll    dynamic-poll
>> -----------------------------------------------------------------------
>> Idle (nohz) vCPU %c0     0.15%        0.3%            0.2%
>> Idle (250HZ) vCPU %c0    1.1%         4.6%~14%        1.2%
>> TCP_RR latency           34us         27us            26.7us
>>
>> "Idle (X) vCPU %c0" is the percent of time the physical cpu spent in
>> c0 over 60 seconds (each vCPU is pinned to a pCPU). (nohz) means the
>> guest was tickless. (250HZ) means the guest was ticking at 250HZ.
>>
>> The big win is with ticking operating systems. Running the linux guest
>> with nohz=off (and HZ=250), we save 3.4%~12.8% CPUs/second and get close
>> to no-polling overhead levels by using the dynamic-poll. The savings
>> should be even higher for higher frequency ticks.
>>
>> Suggested-by: David Matlack <dmatlack@google.com>
>> Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
>> ---
>>  virt/kvm/kvm_main.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++---
>>  1 file changed, 58 insertions(+), 3 deletions(-)
>>
>> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
>> index c06e57c..3cff02f 100644
>> --- a/virt/kvm/kvm_main.c
>> +++ b/virt/kvm/kvm_main.c
>> @@ -66,9 +66,18 @@
>>  MODULE_AUTHOR("Qumranet");
>>  MODULE_LICENSE("GPL");
>>
>> -static unsigned int halt_poll_ns;
>> +/* halt polling only reduces halt latency by 5-7 us, 500us is enough */
>> +static unsigned int halt_poll_ns = 500000;
>>  module_param(halt_poll_ns, uint, S_IRUGO | S_IWUSR);
>>
>> +/* Default doubles per-vcpu halt_poll_ns. */
>> +static unsigned int halt_poll_ns_grow = 2;
>> +module_param(halt_poll_ns_grow, int, S_IRUGO);
>> +
>> +/* Default resets per-vcpu halt_poll_ns . */
>> +static unsigned int halt_poll_ns_shrink;
>> +module_param(halt_poll_ns_shrink, int, S_IRUGO);
>> +
>>  /*
>>   * Ordering of locks:
>>   *
>> @@ -1907,6 +1916,31 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
>>  }
>>  EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
>>
>> +static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
>> +{
>> +       int val = vcpu->halt_poll_ns;
>> +
>> +       /* 10us base */
>> +       if (val == 0 && halt_poll_ns_grow)
>> +               val = 10000;
>> +       else
>> +               val *= halt_poll_ns_grow;
>> +
>> +       vcpu->halt_poll_ns = val;
>> +}
>> +
>> +static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
>> +{
>> +       int val = vcpu->halt_poll_ns;
>> +
>> +       if (halt_poll_ns_shrink == 0)
>> +               val = 0;
>> +       else
>> +               val /= halt_poll_ns_shrink;
>> +
>> +       vcpu->halt_poll_ns = val;
>> +}
>> +
>>  static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
>>  {
>>         if (kvm_arch_vcpu_runnable(vcpu)) {
>> @@ -1929,6 +1963,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
>>         ktime_t start, cur;
>>         DEFINE_WAIT(wait);
>>         bool waited = false;
>> +       u64 poll_ns = 0, wait_ns = 0, block_ns = 0;
>>
>>         start = cur = ktime_get();
>>         if (vcpu->halt_poll_ns) {
>> @@ -1941,10 +1976,15 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
>>                          */
>>                         if (kvm_vcpu_check_block(vcpu) < 0) {
>>                                 ++vcpu->stat.halt_successful_poll;
>> -                               goto out;
>> +                               break;
>>                         }
>>                         cur = ktime_get();
>>                 } while (single_task_running() && ktime_before(cur, stop));
>> +
>> +               if (ktime_before(cur, stop)) {
> 
> You can't use 'cur' to tell if the interrupt arrived. single_task_running()
> can break us out of the loop before 'stop'.

Ah, I thought this was on purpose. :)

If !single_task_running(), it is okay to keep vcpu->halt_poll_ns high,
because the physical CPU is not going to be idle anyway.  Resetting the
timer as soon as single_task_running() becomes false will not cost much
CPU time.

Does it make sense?

Paolo

>> +                       poll_ns = ktime_to_ns(cur) - ktime_to_ns(start);
> 
> Put this line before the if(). block_ns should always include the time
> spent polling; even if polling does not succeed.
> 
>> +                       goto out;
>> +               }
>>         }
>>
>>         for (;;) {
>> @@ -1959,9 +1999,24 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
>>
>>         finish_wait(&vcpu->wq, &wait);
>>         cur = ktime_get();
>> +       wait_ns = ktime_to_ns(cur) - ktime_to_ns(start);
>>
>>  out:
>> -       trace_kvm_vcpu_wakeup(ktime_to_ns(cur) - ktime_to_ns(start), waited);
>> +       block_ns = poll_ns + wait_ns;
>> +
>> +       if (halt_poll_ns) {
> 
> If you want, you can leave this if() out and save some indentation.
> 
>> +               if (block_ns <= vcpu->halt_poll_ns)
>> +                       ;
>> +               /* we had a long block, shrink polling */
>> +               else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns)
>> +                       shrink_halt_poll_ns(vcpu);
>> +               /* we had a short halt and our poll time is too small */
>> +               else if (vcpu->halt_poll_ns < halt_poll_ns &&
>> +                       block_ns < halt_poll_ns)
>> +                       grow_halt_poll_ns(vcpu);
>> +       }
>> +
>> +       trace_kvm_vcpu_wakeup(block_ns, waited);
>>  }
>>  EXPORT_SYMBOL_GPL(kvm_vcpu_block);
>>
>> --
>> 1.9.1
>>
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Matlack Sept. 2, 2015, 7:23 p.m. UTC | #3
On Wed, Sep 2, 2015 at 12:12 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
>
> On 02/09/2015 20:09, David Matlack wrote:
>> On Wed, Sep 2, 2015 at 12:29 AM, Wanpeng Li <wanpeng.li@hotmail.com> wrote:
>>> There is a downside of always-poll since poll is still happened for idle
>>> vCPUs which can waste cpu usage. This patch adds the ability to adjust
>>> halt_poll_ns dynamically, to grow halt_poll_ns when shot halt is detected,
>>> and to shrink halt_poll_ns when long halt is detected.
>>>
>>> There are two new kernel parameters for changing the halt_poll_ns:
>>> halt_poll_ns_grow and halt_poll_ns_shrink.
>>>
>>>                         no-poll      always-poll    dynamic-poll
>>> -----------------------------------------------------------------------
>>> Idle (nohz) vCPU %c0     0.15%        0.3%            0.2%
>>> Idle (250HZ) vCPU %c0    1.1%         4.6%~14%        1.2%
>>> TCP_RR latency           34us         27us            26.7us
>>>
>>> "Idle (X) vCPU %c0" is the percent of time the physical cpu spent in
>>> c0 over 60 seconds (each vCPU is pinned to a pCPU). (nohz) means the
>>> guest was tickless. (250HZ) means the guest was ticking at 250HZ.
>>>
>>> The big win is with ticking operating systems. Running the linux guest
>>> with nohz=off (and HZ=250), we save 3.4%~12.8% CPUs/second and get close
>>> to no-polling overhead levels by using the dynamic-poll. The savings
>>> should be even higher for higher frequency ticks.
>>>
>>> Suggested-by: David Matlack <dmatlack@google.com>
>>> Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
>>> ---
>>>  virt/kvm/kvm_main.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++---
>>>  1 file changed, 58 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
>>> index c06e57c..3cff02f 100644
>>> --- a/virt/kvm/kvm_main.c
>>> +++ b/virt/kvm/kvm_main.c
>>> @@ -66,9 +66,18 @@
>>>  MODULE_AUTHOR("Qumranet");
>>>  MODULE_LICENSE("GPL");
>>>
>>> -static unsigned int halt_poll_ns;
>>> +/* halt polling only reduces halt latency by 5-7 us, 500us is enough */
>>> +static unsigned int halt_poll_ns = 500000;
>>>  module_param(halt_poll_ns, uint, S_IRUGO | S_IWUSR);
>>>
>>> +/* Default doubles per-vcpu halt_poll_ns. */
>>> +static unsigned int halt_poll_ns_grow = 2;
>>> +module_param(halt_poll_ns_grow, int, S_IRUGO);
>>> +
>>> +/* Default resets per-vcpu halt_poll_ns . */
>>> +static unsigned int halt_poll_ns_shrink;
>>> +module_param(halt_poll_ns_shrink, int, S_IRUGO);
>>> +
>>>  /*
>>>   * Ordering of locks:
>>>   *
>>> @@ -1907,6 +1916,31 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
>>>  }
>>>  EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
>>>
>>> +static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
>>> +{
>>> +       int val = vcpu->halt_poll_ns;
>>> +
>>> +       /* 10us base */
>>> +       if (val == 0 && halt_poll_ns_grow)
>>> +               val = 10000;
>>> +       else
>>> +               val *= halt_poll_ns_grow;
>>> +
>>> +       vcpu->halt_poll_ns = val;
>>> +}
>>> +
>>> +static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
>>> +{
>>> +       int val = vcpu->halt_poll_ns;
>>> +
>>> +       if (halt_poll_ns_shrink == 0)
>>> +               val = 0;
>>> +       else
>>> +               val /= halt_poll_ns_shrink;
>>> +
>>> +       vcpu->halt_poll_ns = val;
>>> +}
>>> +
>>>  static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
>>>  {
>>>         if (kvm_arch_vcpu_runnable(vcpu)) {
>>> @@ -1929,6 +1963,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
>>>         ktime_t start, cur;
>>>         DEFINE_WAIT(wait);
>>>         bool waited = false;
>>> +       u64 poll_ns = 0, wait_ns = 0, block_ns = 0;
>>>
>>>         start = cur = ktime_get();
>>>         if (vcpu->halt_poll_ns) {
>>> @@ -1941,10 +1976,15 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
>>>                          */
>>>                         if (kvm_vcpu_check_block(vcpu) < 0) {
>>>                                 ++vcpu->stat.halt_successful_poll;
>>> -                               goto out;
>>> +                               break;
>>>                         }
>>>                         cur = ktime_get();
>>>                 } while (single_task_running() && ktime_before(cur, stop));
>>> +
>>> +               if (ktime_before(cur, stop)) {
>>
>> You can't use 'cur' to tell if the interrupt arrived. single_task_running()
>> can break us out of the loop before 'stop'.
>
> Ah, I thought this was on purpose. :)
>
> If !single_task_running(), it is okay to keep vcpu->halt_poll_ns high,
> because the physical CPU is not going to be idle anyway.  Resetting the
> timer as soon as single_task_running() becomes false will not cost much
> CPU time.

Good point. I agree we can keep halt_poll_ns high in this case.

I actually wasn't thinking about vcpu->halt_poll_ns though. If
single_task_running() breaks us out of the loop we will "goto out" instead
of scheduling. My suspicion is this will cause us to loop calling
kvm_vcpu_block and starve the waiting task (at least until need_resched()),
which would break the "only hog the cpu when idle" aspect of halt-polling.

>
> Does it make sense?
>
> Paolo
>
>>> +                       poll_ns = ktime_to_ns(cur) - ktime_to_ns(start);
>>
>> Put this line before the if(). block_ns should always include the time
>> spent polling; even if polling does not succeed.
>>
>>> +                       goto out;
>>> +               }
>>>         }
>>>
>>>         for (;;) {
>>> @@ -1959,9 +1999,24 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
>>>
>>>         finish_wait(&vcpu->wq, &wait);
>>>         cur = ktime_get();
>>> +       wait_ns = ktime_to_ns(cur) - ktime_to_ns(start);
>>>
>>>  out:
>>> -       trace_kvm_vcpu_wakeup(ktime_to_ns(cur) - ktime_to_ns(start), waited);
>>> +       block_ns = poll_ns + wait_ns;
>>> +
>>> +       if (halt_poll_ns) {
>>
>> If you want, you can leave this if() out and save some indentation.
>>
>>> +               if (block_ns <= vcpu->halt_poll_ns)
>>> +                       ;
>>> +               /* we had a long block, shrink polling */
>>> +               else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns)
>>> +                       shrink_halt_poll_ns(vcpu);
>>> +               /* we had a short halt and our poll time is too small */
>>> +               else if (vcpu->halt_poll_ns < halt_poll_ns &&
>>> +                       block_ns < halt_poll_ns)
>>> +                       grow_halt_poll_ns(vcpu);
>>> +       }
>>> +
>>> +       trace_kvm_vcpu_wakeup(block_ns, waited);
>>>  }
>>>  EXPORT_SYMBOL_GPL(kvm_vcpu_block);
>>>
>>> --
>>> 1.9.1
>>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe kvm" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini Sept. 3, 2015, 7:31 a.m. UTC | #4
On 02/09/2015 21:23, David Matlack wrote:
> 
> I actually wasn't thinking about vcpu->halt_poll_ns though. If
> single_task_running() breaks us out of the loop we will "goto out" instead
> of scheduling. My suspicion is this will cause us to loop calling
> kvm_vcpu_block and starve the waiting task (at least until need_resched()),
> which would break the "only hog the cpu when idle" aspect of halt-polling.

That's definitely a bug, yes.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Wanpeng Li Sept. 3, 2015, 8:52 a.m. UTC | #5
On 9/3/15 3:31 PM, Paolo Bonzini wrote:
>
> On 02/09/2015 21:23, David Matlack wrote:
>> I actually wasn't thinking about vcpu->halt_poll_ns though. If
>> single_task_running() breaks us out of the loop we will "goto out" instead
>> of scheduling. My suspicion is this will cause us to loop calling
>> kvm_vcpu_block and starve the waiting task (at least until need_resched()),
>> which would break the "only hog the cpu when idle" aspect of halt-polling.
> That's definitely a bug, yes.

Ok, I will send out v7 to fix this this Sunday, since there is a 
vacation in my country currently.

Regards,
Wanpeng Li

>
> Paolo

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Wanpeng Li Sept. 3, 2015, 9:23 a.m. UTC | #6
On 9/3/15 2:09 AM, David Matlack wrote:
[...]
>> +
>>   static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
>>   {
>>          if (kvm_arch_vcpu_runnable(vcpu)) {
>> @@ -1929,6 +1963,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
>>          ktime_t start, cur;
>>          DEFINE_WAIT(wait);
>>          bool waited = false;
>> +       u64 poll_ns = 0, wait_ns = 0, block_ns = 0;
>>
>>          start = cur = ktime_get();
>>          if (vcpu->halt_poll_ns) {
>> @@ -1941,10 +1976,15 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
>>                           */
>>                          if (kvm_vcpu_check_block(vcpu) < 0) {
>>                                  ++vcpu->stat.halt_successful_poll;
>> -                               goto out;
>> +                               break;
>>                          }
>>                          cur = ktime_get();
>>                  } while (single_task_running() && ktime_before(cur, stop));
>> +
>> +               if (ktime_before(cur, stop)) {
> You can't use 'cur' to tell if the interrupt arrived. single_task_running()
> can break us out of the loop before 'stop'.
>
>> +                       poll_ns = ktime_to_ns(cur) - ktime_to_ns(start);
> Put this line before the if(). block_ns should always include the time
> spent polling; even if polling does not succeed.

How about something like:

@@ -1941,10 +1976,14 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
                          */
                         if (kvm_vcpu_check_block(vcpu) < 0) {
++vcpu->stat.halt_successful_poll;
-                               goto out;
+                               break;
                         }
                         cur = ktime_get();
                 } while (single_task_running() && ktime_before(cur, stop));
+
+               poll_ns = ktime_to_ns(cur) - ktime_to_ns(start);
+               if (ktime_before(cur, stop) && single_task_running())
+                       goto out;
         }

>
>> +                       goto out;
>> +               }
>>          }
>>
>>          for (;;) {
>> @@ -1959,9 +1999,24 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
>>
>>          finish_wait(&vcpu->wq, &wait);
>>          cur = ktime_get();
>> +       wait_ns = ktime_to_ns(cur) - ktime_to_ns(start);
>>
>>   out:
>> -       trace_kvm_vcpu_wakeup(ktime_to_ns(cur) - ktime_to_ns(start), waited);
>> +       block_ns = poll_ns + wait_ns;
>> +
>> +       if (halt_poll_ns) {
> If you want, you can leave this if() out and save some indentation.

Then we will miss the tracepoint.

Regards,
Wanpeng Li

>
>> +               if (block_ns <= vcpu->halt_poll_ns)
>> +                       ;
>> +               /* we had a long block, shrink polling */
>> +               else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns)
>> +                       shrink_halt_poll_ns(vcpu);
>> +               /* we had a short halt and our poll time is too small */
>> +               else if (vcpu->halt_poll_ns < halt_poll_ns &&
>> +                       block_ns < halt_poll_ns)
>> +                       grow_halt_poll_ns(vcpu);
>> +       }
>> +
>> +       trace_kvm_vcpu_wakeup(block_ns, waited);
>>   }
>>   EXPORT_SYMBOL_GPL(kvm_vcpu_block);
>>
>> --
>> 1.9.1
>>

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Matlack Sept. 3, 2015, 4:07 p.m. UTC | #7
On Thu, Sep 3, 2015 at 2:23 AM, Wanpeng Li <wanpeng.li@hotmail.com> wrote:
>
> How about something like:
>
> @@ -1941,10 +1976,14 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
>                          */
>                         if (kvm_vcpu_check_block(vcpu) < 0) {
> ++vcpu->stat.halt_successful_poll;
> -                               goto out;
> +                               break;
>                         }
>                         cur = ktime_get();
>                 } while (single_task_running() && ktime_before(cur, stop));
> +
> +               poll_ns = ktime_to_ns(cur) - ktime_to_ns(start);
> +               if (ktime_before(cur, stop) && single_task_running())
> +                       goto out;

I would prefer an explicit signal (e.g. set a bool to true before breaking out
of the loop, and check it here) to avoid duplicating the loop exit condition.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Wanpeng Li Sept. 4, 2015, 12:15 a.m. UTC | #8
On 9/4/15 12:07 AM, David Matlack wrote:
> On Thu, Sep 3, 2015 at 2:23 AM, Wanpeng Li <wanpeng.li@hotmail.com> wrote:
>> How about something like:
>>
>> @@ -1941,10 +1976,14 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
>>                           */
>>                          if (kvm_vcpu_check_block(vcpu) < 0) {
>> ++vcpu->stat.halt_successful_poll;
>> -                               goto out;
>> +                               break;
>>                          }
>>                          cur = ktime_get();
>>                  } while (single_task_running() && ktime_before(cur, stop));
>> +
>> +               poll_ns = ktime_to_ns(cur) - ktime_to_ns(start);
>> +               if (ktime_before(cur, stop) && single_task_running())
>> +                       goto out;
> I would prefer an explicit signal (e.g. set a bool to true before breaking out
> of the loop, and check it here) to avoid duplicating the loop exit condition.

Fix it in v7, thanks for your review, David! ;-)

Regards,
Wanpeng Li
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Wanpeng Li Sept. 4, 2015, 1:16 a.m. UTC | #9
On 9/3/15 2:09 AM, David Matlack wrote:
> On Wed, Sep 2, 2015 at 12:29 AM, Wanpeng Li <wanpeng.li@hotmail.com> wrote:
>> There is a downside of always-poll since poll is still happened for idle
>> vCPUs which can waste cpu usage. This patch adds the ability to adjust
>> halt_poll_ns dynamically, to grow halt_poll_ns when shot halt is detected,
>> and to shrink halt_poll_ns when long halt is detected.
>>
>> There are two new kernel parameters for changing the halt_poll_ns:
>> halt_poll_ns_grow and halt_poll_ns_shrink.
>>
>>                          no-poll      always-poll    dynamic-poll
>> -----------------------------------------------------------------------
>> Idle (nohz) vCPU %c0     0.15%        0.3%            0.2%
>> Idle (250HZ) vCPU %c0    1.1%         4.6%~14%        1.2%
>> TCP_RR latency           34us         27us            26.7us
>>
>> "Idle (X) vCPU %c0" is the percent of time the physical cpu spent in
>> c0 over 60 seconds (each vCPU is pinned to a pCPU). (nohz) means the
>> guest was tickless. (250HZ) means the guest was ticking at 250HZ.
>>
>> The big win is with ticking operating systems. Running the linux guest
>> with nohz=off (and HZ=250), we save 3.4%~12.8% CPUs/second and get close
>> to no-polling overhead levels by using the dynamic-poll. The savings
>> should be even higher for higher frequency ticks.
>>
>> Suggested-by: David Matlack <dmatlack@google.com>
>> Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
>> ---
>>   virt/kvm/kvm_main.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++---
>>   1 file changed, 58 insertions(+), 3 deletions(-)
>>
>> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
>> index c06e57c..3cff02f 100644
>> --- a/virt/kvm/kvm_main.c
>> +++ b/virt/kvm/kvm_main.c
>> @@ -66,9 +66,18 @@
>>   MODULE_AUTHOR("Qumranet");
>>   MODULE_LICENSE("GPL");
>>
>> -static unsigned int halt_poll_ns;
>> +/* halt polling only reduces halt latency by 5-7 us, 500us is enough */
>> +static unsigned int halt_poll_ns = 500000;
>>   module_param(halt_poll_ns, uint, S_IRUGO | S_IWUSR);
>>
>> +/* Default doubles per-vcpu halt_poll_ns. */
>> +static unsigned int halt_poll_ns_grow = 2;
>> +module_param(halt_poll_ns_grow, int, S_IRUGO);
>> +
>> +/* Default resets per-vcpu halt_poll_ns . */
>> +static unsigned int halt_poll_ns_shrink;
>> +module_param(halt_poll_ns_shrink, int, S_IRUGO);
>> +
>>   /*
>>    * Ordering of locks:
>>    *
>> @@ -1907,6 +1916,31 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
>>   }
>>   EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
>>
>> +static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
>> +{
>> +       int val = vcpu->halt_poll_ns;
>> +
>> +       /* 10us base */
>> +       if (val == 0 && halt_poll_ns_grow)
>> +               val = 10000;
>> +       else
>> +               val *= halt_poll_ns_grow;
>> +
>> +       vcpu->halt_poll_ns = val;
>> +}
>> +
>> +static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
>> +{
>> +       int val = vcpu->halt_poll_ns;
>> +
>> +       if (halt_poll_ns_shrink == 0)
>> +               val = 0;
>> +       else
>> +               val /= halt_poll_ns_shrink;
>> +
>> +       vcpu->halt_poll_ns = val;
>> +}
>> +
>>   static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
>>   {
>>          if (kvm_arch_vcpu_runnable(vcpu)) {
>> @@ -1929,6 +1963,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
>>          ktime_t start, cur;
>>          DEFINE_WAIT(wait);
>>          bool waited = false;
>> +       u64 poll_ns = 0, wait_ns = 0, block_ns = 0;
>>
>>          start = cur = ktime_get();
>>          if (vcpu->halt_poll_ns) {
>> @@ -1941,10 +1976,15 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
>>                           */
>>                          if (kvm_vcpu_check_block(vcpu) < 0) {
>>                                  ++vcpu->stat.halt_successful_poll;
>> -                               goto out;
>> +                               break;
>>                          }
>>                          cur = ktime_get();
>>                  } while (single_task_running() && ktime_before(cur, stop));
>> +
>> +               if (ktime_before(cur, stop)) {
> You can't use 'cur' to tell if the interrupt arrived. single_task_running()
> can break us out of the loop before 'stop'.
>
>> +                       poll_ns = ktime_to_ns(cur) - ktime_to_ns(start);
> Put this line before the if(). block_ns should always include the time
> spent polling; even if polling does not succeed.

Ah, I remember why I keep the poll_ns calculation inside the if(). Actually,
poll_ns has already been accumulated into wait_ns (see how wait_ns is
calculated) if polling does not succeed. If I put the poll_ns calculation
before the if(), I would need to reset the variable 'start' to ktime_get().

Regards,
Wanpeng Li

>
>> +                       goto out;
>> +               }
>>          }
>>
>>          for (;;) {
>> @@ -1959,9 +1999,24 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
>>
>>          finish_wait(&vcpu->wq, &wait);
>>          cur = ktime_get();
>> +       wait_ns = ktime_to_ns(cur) - ktime_to_ns(start);
>>
>>   out:
>> -       trace_kvm_vcpu_wakeup(ktime_to_ns(cur) - ktime_to_ns(start), waited);
>> +       block_ns = poll_ns + wait_ns;
>> +
>> +       if (halt_poll_ns) {
> If you want, you can leave this if() out and save some indentation.
>
>> +               if (block_ns <= vcpu->halt_poll_ns)
>> +                       ;
>> +               /* we had a long block, shrink polling */
>> +               else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns)
>> +                       shrink_halt_poll_ns(vcpu);
>> +               /* we had a short halt and our poll time is too small */
>> +               else if (vcpu->halt_poll_ns < halt_poll_ns &&
>> +                       block_ns < halt_poll_ns)
>> +                       grow_halt_poll_ns(vcpu);
>> +       }
>> +
>> +       trace_kvm_vcpu_wakeup(block_ns, waited);
>>   }
>>   EXPORT_SYMBOL_GPL(kvm_vcpu_block);
>>
>> --
>> 1.9.1
>>

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Wanpeng Li Sept. 4, 2015, 1:42 a.m. UTC | #10
On 9/4/15 9:16 AM, Wanpeng Li wrote:
> [...]
>>> +
>>>   static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
>>>   {
>>>          if (kvm_arch_vcpu_runnable(vcpu)) {
>>> @@ -1929,6 +1963,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
>>>          ktime_t start, cur;
>>>          DEFINE_WAIT(wait);
>>>          bool waited = false;
>>> +       u64 poll_ns = 0, wait_ns = 0, block_ns = 0;
>>>
>>>          start = cur = ktime_get();
>>>          if (vcpu->halt_poll_ns) {
>>> @@ -1941,10 +1976,15 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
>>>                           */
>>>                          if (kvm_vcpu_check_block(vcpu) < 0) {
>>> ++vcpu->stat.halt_successful_poll;
>>> -                               goto out;
>>> +                               break;
>>>                          }
>>>                          cur = ktime_get();
>>>                  } while (single_task_running() && ktime_before(cur, 
>>> stop));
>>> +
>>> +               if (ktime_before(cur, stop)) {
>> You can't use 'cur' to tell if the interrupt arrived. 
>> single_task_running()
>> can break us out of the loop before 'stop'.
>>
>>> +                       poll_ns = ktime_to_ns(cur) - 
>>> ktime_to_ns(start);
>> Put this line before the if(). block_ns should always include the time
>> spent polling; even if polling does not succeed.
>
> Ah, I remember why I keep the poll_ns calculation inside the if(). 
> Actually, poll_ns has already been accumulated into wait_ns (see how 
> wait_ns is calculated) if polling does not succeed. If I put the poll_ns 
> calculation before the if(), I would need to reset the variable 'start' to ktime_get().

So how about something like:

@@ -1928,7 +1962,8 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
  {
         ktime_t start, cur;
         DEFINE_WAIT(wait);
-       bool waited = false;
+       bool polled = false, waited = false;
+       u64 poll_ns = 0, wait_ns = 0, block_ns = 0;

         start = cur = ktime_get();
         if (vcpu->halt_poll_ns) {
@@ -1940,11 +1975,17 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
                          * arrives.
                          */
                         if (kvm_vcpu_check_block(vcpu) < 0) {
+                               polled = true;
++vcpu->stat.halt_successful_poll;
-                               goto out;
+                               break;
                         }
                         cur = ktime_get();
                 } while (single_task_running() && ktime_before(cur, stop));
+
+               if (polled) {
+                       poll_ns = ktime_to_ns(cur) - ktime_to_ns(start);
+                       goto out;
+               }
         }


Regards,
Wanpeng Li

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index c06e57c..3cff02f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -66,9 +66,18 @@ 
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
-static unsigned int halt_poll_ns;
+/* halt polling only reduces halt latency by 5-7 us, 500us is enough */
+static unsigned int halt_poll_ns = 500000;
 module_param(halt_poll_ns, uint, S_IRUGO | S_IWUSR);
 
+/* Default doubles per-vcpu halt_poll_ns. */
+static unsigned int halt_poll_ns_grow = 2;
+module_param(halt_poll_ns_grow, int, S_IRUGO);
+
+/* Default resets per-vcpu halt_poll_ns . */
+static unsigned int halt_poll_ns_shrink;
+module_param(halt_poll_ns_shrink, int, S_IRUGO);
+
 /*
  * Ordering of locks:
  *
@@ -1907,6 +1916,31 @@  void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
 
+static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
+{
+	int val = vcpu->halt_poll_ns;
+
+	/* 10us base */
+	if (val == 0 && halt_poll_ns_grow)
+		val = 10000;
+	else
+		val *= halt_poll_ns_grow;
+
+	vcpu->halt_poll_ns = val;
+}
+
+static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
+{
+	int val = vcpu->halt_poll_ns;
+
+	if (halt_poll_ns_shrink == 0)
+		val = 0;
+	else
+		val /= halt_poll_ns_shrink;
+
+	vcpu->halt_poll_ns = val;
+}
+
 static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
 {
 	if (kvm_arch_vcpu_runnable(vcpu)) {
@@ -1929,6 +1963,7 @@  void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 	ktime_t start, cur;
 	DEFINE_WAIT(wait);
 	bool waited = false;
+	u64 poll_ns = 0, wait_ns = 0, block_ns = 0;
 
 	start = cur = ktime_get();
 	if (vcpu->halt_poll_ns) {
@@ -1941,10 +1976,15 @@  void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 			 */
 			if (kvm_vcpu_check_block(vcpu) < 0) {
 				++vcpu->stat.halt_successful_poll;
-				goto out;
+				break;
 			}
 			cur = ktime_get();
 		} while (single_task_running() && ktime_before(cur, stop));
+
+		if (ktime_before(cur, stop)) {
+			poll_ns = ktime_to_ns(cur) - ktime_to_ns(start);
+			goto out;
+		}
 	}
 
 	for (;;) {
@@ -1959,9 +1999,24 @@  void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 
 	finish_wait(&vcpu->wq, &wait);
 	cur = ktime_get();
+	wait_ns = ktime_to_ns(cur) - ktime_to_ns(start);
 
 out:
-	trace_kvm_vcpu_wakeup(ktime_to_ns(cur) - ktime_to_ns(start), waited);
+	block_ns = poll_ns + wait_ns;
+
+	if (halt_poll_ns) {
+		if (block_ns <= vcpu->halt_poll_ns)
+			;
+		/* we had a long block, shrink polling */
+		else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns)
+			shrink_halt_poll_ns(vcpu);
+		/* we had a short halt and our poll time is too small */
+		else if (vcpu->halt_poll_ns < halt_poll_ns &&
+			block_ns < halt_poll_ns)
+			grow_halt_poll_ns(vcpu);
+	}
+
+	trace_kvm_vcpu_wakeup(block_ns, waited);
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_block);