diff mbox

[v3,5/5] hpet 'driftfix': add code in hpet_timer() to compensate delayed callbacks and coalesced interrupts

Message ID 1304000700-3640-6-git-send-email-uobergfe@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Ulrich Obergfell April 28, 2011, 2:25 p.m. UTC
Loss of periodic timer interrupts caused by delayed callbacks and by
interrupt coalescing is compensated by gradually injecting additional
interrupts during subsequent timer intervals, starting at a rate of
one additional interrupt per interval. The injection of additional
interrupts is based on a backlog of unaccounted HPET clock periods
(new HPETTimer field 'ticks_not_accounted'). The backlog increases
due to delayed callbacks and coalesced interrupts, and it decreases
if an interrupt was injected successfully. If the backlog increases
while compensation is still in progress, the rate at which additional
interrupts are injected is increased too. A limit is imposed on the
backlog and on the rate.

Injecting additional timer interrupts to compensate lost interrupts
can alleviate long term time drift. However, on a short time scale,
this method can have the side effect of making virtual machine time
intermittently pass slower and faster than real time (depending on
the guest's time keeping algorithm). Compensation is disabled by
default and can be enabled for guests where this behaviour may be
acceptable.

Signed-off-by: Ulrich Obergfell <uobergfe@redhat.com>
---
 hw/hpet.c |   63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 61 insertions(+), 2 deletions(-)

Comments

Marcelo Tosatti May 3, 2011, 7:03 p.m. UTC | #1
On Thu, Apr 28, 2011 at 04:25:00PM +0200, Ulrich Obergfell wrote:
> Loss of periodic timer interrupts caused by delayed callbacks and by
> interrupt coalescing is compensated by gradually injecting additional
> interrupts during subsequent timer intervals, starting at a rate of
> one additional interrupt per interval. The injection of additional
> interrupts is based on a backlog of unaccounted HPET clock periods
> (new HPETTimer field 'ticks_not_accounted'). The backlog increases
> due to delayed callbacks and coalesced interrupts, and it decreases
> if an interrupt was injected successfully. If the backlog increases
> while compensation is still in progress, the rate at which additional
> interrupts are injected is increased too. A limit is imposed on the
> backlog and on the rate.
> 
> Injecting additional timer interrupts to compensate lost interrupts
> can alleviate long term time drift. However, on a short time scale,
> this method can have the side effect of making virtual machine time
> intermittently pass slower and faster than real time (depending on
> the guest's time keeping algorithm). Compensation is disabled by
> default and can be enabled for guests where this behaviour may be
> acceptable.
> 
> Signed-off-by: Ulrich Obergfell <uobergfe@redhat.com>
> ---
>  hw/hpet.c |   63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 files changed, 61 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/hpet.c b/hw/hpet.c
> index 35466ae..92d5f58 100644
> --- a/hw/hpet.c
> +++ b/hw/hpet.c
> @@ -32,6 +32,7 @@
>  #include "sysbus.h"
>  #include "mc146818rtc.h"
>  #include "sysemu.h"
> +#include <assert.h>
>  
>  //#define HPET_DEBUG
>  #ifdef HPET_DEBUG
> @@ -42,6 +43,9 @@
>  
>  #define HPET_MSI_SUPPORT        0
>  
> +#define MAX_TICKS_NOT_ACCOUNTED     (uint64_t)500000000 /* 5 sec */
> +#define MAX_IRQ_RATE                (uint32_t)10
> +
>  struct HPETState;
>  typedef struct HPETTimer {  /* timers */
>      uint8_t tn;             /*timer number*/
> @@ -326,28 +330,63 @@ static const VMStateDescription vmstate_hpet = {
>      }
>  };
>  
> +static bool hpet_timer_has_tick_backlog(HPETTimer *t)
> +{
> +    uint64_t backlog = t->ticks_not_accounted - (t->period + t->prev_period);
> +    return (backlog >= t->period);
> +}
> +
>  /*
>   * timer expiration callback
>   */
>  static void hpet_timer(void *opaque)
>  {
>      HPETTimer *t = opaque;
> +    HPETState *s = t->state;
>      uint64_t diff;
> -
> +    int irq_delivered = 0;
> +    uint32_t irq_count = 0;
>      uint64_t period = t->period;
>      uint64_t cur_tick = hpet_get_ticks(t->state);
>  
> +    if (s->driftfix && !t->ticks_not_accounted) {
> +        t->ticks_not_accounted = t->prev_period = t->period;
> +    }
>      if (timer_is_periodic(t) && period != 0) {
>          if (t->config & HPET_TN_32BIT) {
>              while (hpet_time_after(cur_tick, t->cmp)) {
>                  t->cmp = (uint32_t)(t->cmp + t->period);
> +                t->ticks_not_accounted += t->period;
> +                irq_count++;
>              }
>          } else {
>              while (hpet_time_after64(cur_tick, t->cmp)) {
>                  t->cmp += period;
> +                t->ticks_not_accounted += period;
> +                irq_count++;
>              }
>          }
>          diff = hpet_calculate_diff(t, cur_tick);
> +        if (s->driftfix) {
> +            if (t->ticks_not_accounted > MAX_TICKS_NOT_ACCOUNTED) {
> +                t->ticks_not_accounted = t->period + t->prev_period;
> +            }
> +            if (hpet_timer_has_tick_backlog(t)) {
> +                if (t->irq_rate == 1 || irq_count > 1) {
> +                    t->irq_rate++;
> +                    t->irq_rate = MIN(t->irq_rate, MAX_IRQ_RATE);
> +                }
> +                if (t->divisor == 0) {
> +                    assert(irq_count);
> +                }
> +                if (irq_count) {
> +                    t->divisor = t->irq_rate;
> +                }
> +                diff /= t->divisor--;
> +            } else {
> +                t->irq_rate = 1;
> +            }
> +        }
>          qemu_mod_timer(t->qemu_timer,
>                         qemu_get_clock_ns(vm_clock) + (int64_t)ticks_to_ns(diff));
>      } else if (t->config & HPET_TN_32BIT && !timer_is_periodic(t)) {
> @@ -358,7 +397,22 @@ static void hpet_timer(void *opaque)
>              t->wrap_flag = 0;
>          }
>      }
> -    update_irq(t, 1);
> +    if (s->driftfix && timer_is_periodic(t) && period != 0) {
> +        if (t->ticks_not_accounted >= t->period + t->prev_period) {
> +            irq_delivered = update_irq(t, 1);
> +            if (irq_delivered) {
> +                t->ticks_not_accounted -= t->prev_period;
> +                t->prev_period = t->period;
> +            } else {
> +                if (irq_count) {
> +                    t->irq_rate++;
> +                    t->irq_rate = MIN(t->irq_rate, MAX_IRQ_RATE);
> +                }
> +            }
> +        }
> +    } else {
> +        update_irq(t, 1);
> +    }
>  }

Hi Ulrich,

Whats prev_period for, since in practice the period will not change
between interrupts (OS programs comparator once, or perhaps twice during
bootup) ?

Other than that, shouldnt reset accounting variables to init state on
write to GLOBAL_ENABLE_CFG / writes to main counter?

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Glauber Costa May 3, 2011, 10:08 p.m. UTC | #2
On Tue, 2011-05-03 at 16:03 -0300, Marcelo Tosatti wrote:
> On Thu, Apr 28, 2011 at 04:25:00PM +0200, Ulrich Obergfell wrote:
> > Loss of periodic timer interrupts caused by delayed callbacks and by
> > interrupt coalescing is compensated by gradually injecting additional
> > interrupts during subsequent timer intervals, starting at a rate of
> > one additional interrupt per interval. The injection of additional
> > interrupts is based on a backlog of unaccounted HPET clock periods
> > (new HPETTimer field 'ticks_not_accounted'). The backlog increases
> > due to delayed callbacks and coalesced interrupts, and it decreases
> > if an interrupt was injected successfully. If the backlog increases
> > while compensation is still in progress, the rate at which additional
> > interrupts are injected is increased too. A limit is imposed on the
> > backlog and on the rate.
> > 
> > Injecting additional timer interrupts to compensate lost interrupts
> > can alleviate long term time drift. However, on a short time scale,
> > this method can have the side effect of making virtual machine time
> > intermittently pass slower and faster than real time (depending on
> > the guest's time keeping algorithm). Compensation is disabled by
> > default and can be enabled for guests where this behaviour may be
> > acceptable.
> > 
> > Signed-off-by: Ulrich Obergfell <uobergfe@redhat.com>
> > ---
> >  hw/hpet.c |   63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> >  1 files changed, 61 insertions(+), 2 deletions(-)
> > 
> > diff --git a/hw/hpet.c b/hw/hpet.c
> > index 35466ae..92d5f58 100644
> > --- a/hw/hpet.c
> > +++ b/hw/hpet.c
> > @@ -32,6 +32,7 @@
> >  #include "sysbus.h"
> >  #include "mc146818rtc.h"
> >  #include "sysemu.h"
> > +#include <assert.h>
> >  
> >  //#define HPET_DEBUG
> >  #ifdef HPET_DEBUG
> > @@ -42,6 +43,9 @@
> >  
> >  #define HPET_MSI_SUPPORT        0
> >  
> > +#define MAX_TICKS_NOT_ACCOUNTED     (uint64_t)500000000 /* 5 sec */
> > +#define MAX_IRQ_RATE                (uint32_t)10
> > +
> >  struct HPETState;
> >  typedef struct HPETTimer {  /* timers */
> >      uint8_t tn;             /*timer number*/
> > @@ -326,28 +330,63 @@ static const VMStateDescription vmstate_hpet = {
> >      }
> >  };
> >  
> > +static bool hpet_timer_has_tick_backlog(HPETTimer *t)
> > +{
> > +    uint64_t backlog = t->ticks_not_accounted - (t->period + t->prev_period);
> > +    return (backlog >= t->period);
> > +}
> > +
> >  /*
> >   * timer expiration callback
> >   */
> >  static void hpet_timer(void *opaque)
> >  {
> >      HPETTimer *t = opaque;
> > +    HPETState *s = t->state;
> >      uint64_t diff;
> > -
> > +    int irq_delivered = 0;
> > +    uint32_t irq_count = 0;
> >      uint64_t period = t->period;
> >      uint64_t cur_tick = hpet_get_ticks(t->state);
> >  
> > +    if (s->driftfix && !t->ticks_not_accounted) {
> > +        t->ticks_not_accounted = t->prev_period = t->period;
> > +    }
> >      if (timer_is_periodic(t) && period != 0) {
> >          if (t->config & HPET_TN_32BIT) {
> >              while (hpet_time_after(cur_tick, t->cmp)) {
> >                  t->cmp = (uint32_t)(t->cmp + t->period);
> > +                t->ticks_not_accounted += t->period;
> > +                irq_count++;
> >              }
> >          } else {
> >              while (hpet_time_after64(cur_tick, t->cmp)) {
> >                  t->cmp += period;
> > +                t->ticks_not_accounted += period;
> > +                irq_count++;
> >              }
> >          }
> >          diff = hpet_calculate_diff(t, cur_tick);
> > +        if (s->driftfix) {
> > +            if (t->ticks_not_accounted > MAX_TICKS_NOT_ACCOUNTED) {
> > +                t->ticks_not_accounted = t->period + t->prev_period;
> > +            }
> > +            if (hpet_timer_has_tick_backlog(t)) {
> > +                if (t->irq_rate == 1 || irq_count > 1) {
> > +                    t->irq_rate++;
> > +                    t->irq_rate = MIN(t->irq_rate, MAX_IRQ_RATE);
> > +                }
> > +                if (t->divisor == 0) {
> > +                    assert(irq_count);
> > +                }
> > +                if (irq_count) {
> > +                    t->divisor = t->irq_rate;
> > +                }
> > +                diff /= t->divisor--;
> > +            } else {
> > +                t->irq_rate = 1;
> > +            }
> > +        }
> >          qemu_mod_timer(t->qemu_timer,
> >                         qemu_get_clock_ns(vm_clock) + (int64_t)ticks_to_ns(diff));
> >      } else if (t->config & HPET_TN_32BIT && !timer_is_periodic(t)) {
> > @@ -358,7 +397,22 @@ static void hpet_timer(void *opaque)
> >              t->wrap_flag = 0;
> >          }
> >      }
> > -    update_irq(t, 1);
> > +    if (s->driftfix && timer_is_periodic(t) && period != 0) {
> > +        if (t->ticks_not_accounted >= t->period + t->prev_period) {
> > +            irq_delivered = update_irq(t, 1);
> > +            if (irq_delivered) {
> > +                t->ticks_not_accounted -= t->prev_period;
> > +                t->prev_period = t->period;
> > +            } else {
> > +                if (irq_count) {
> > +                    t->irq_rate++;
> > +                    t->irq_rate = MIN(t->irq_rate, MAX_IRQ_RATE);
> > +                }
> > +            }
> > +        }
> > +    } else {
> > +        update_irq(t, 1);
> > +    }
> >  }
> 
> Hi Ulrich,
> 
> Whats prev_period for, since in practice the period will not change
> between interrupts (OS programs comparator once, or perhaps twice during
> bootup) ?

Actually, some systems, like Windows 2000-and-something change this
period quite heavily under multimedia workloads.

> Other than that, shouldnt reset accounting variables to init state on
> write to GLOBAL_ENABLE_CFG / writes to main counter?
> 


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marcelo Tosatti May 3, 2011, 10:55 p.m. UTC | #3
On Tue, May 03, 2011 at 07:08:25PM -0300, Glauber Costa wrote:
> On Tue, 2011-05-03 at 16:03 -0300, Marcelo Tosatti wrote:
> > On Thu, Apr 28, 2011 at 04:25:00PM +0200, Ulrich Obergfell wrote:
> > > Loss of periodic timer interrupts caused by delayed callbacks and by
> > > interrupt coalescing is compensated by gradually injecting additional
> > > interrupts during subsequent timer intervals, starting at a rate of
> > > one additional interrupt per interval. The injection of additional
> > > interrupts is based on a backlog of unaccounted HPET clock periods
> > > (new HPETTimer field 'ticks_not_accounted'). The backlog increases
> > > due to delayed callbacks and coalesced interrupts, and it decreases
> > > if an interrupt was injected successfully. If the backlog increases
> > > while compensation is still in progress, the rate at which additional
> > > interrupts are injected is increased too. A limit is imposed on the
> > > backlog and on the rate.
> > > 
> > > Injecting additional timer interrupts to compensate lost interrupts
> > > can alleviate long term time drift. However, on a short time scale,
> > > this method can have the side effect of making virtual machine time
> > > intermittently pass slower and faster than real time (depending on
> > > the guest's time keeping algorithm). Compensation is disabled by
> > > default and can be enabled for guests where this behaviour may be
> > > acceptable.
> > > 
> > > Signed-off-by: Ulrich Obergfell <uobergfe@redhat.com>
> > > ---
> > >  hw/hpet.c |   63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> > >  1 files changed, 61 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/hw/hpet.c b/hw/hpet.c
> > > index 35466ae..92d5f58 100644
> > > --- a/hw/hpet.c
> > > +++ b/hw/hpet.c
> > > @@ -32,6 +32,7 @@
> > >  #include "sysbus.h"
> > >  #include "mc146818rtc.h"
> > >  #include "sysemu.h"
> > > +#include <assert.h>
> > >  
> > >  //#define HPET_DEBUG
> > >  #ifdef HPET_DEBUG
> > > @@ -42,6 +43,9 @@
> > >  
> > >  #define HPET_MSI_SUPPORT        0
> > >  
> > > +#define MAX_TICKS_NOT_ACCOUNTED     (uint64_t)500000000 /* 5 sec */
> > > +#define MAX_IRQ_RATE                (uint32_t)10
> > > +
> > >  struct HPETState;
> > >  typedef struct HPETTimer {  /* timers */
> > >      uint8_t tn;             /*timer number*/
> > > @@ -326,28 +330,63 @@ static const VMStateDescription vmstate_hpet = {
> > >      }
> > >  };
> > >  
> > > +static bool hpet_timer_has_tick_backlog(HPETTimer *t)
> > > +{
> > > +    uint64_t backlog = t->ticks_not_accounted - (t->period + t->prev_period);
> > > +    return (backlog >= t->period);
> > > +}
> > > +
> > >  /*
> > >   * timer expiration callback
> > >   */
> > >  static void hpet_timer(void *opaque)
> > >  {
> > >      HPETTimer *t = opaque;
> > > +    HPETState *s = t->state;
> > >      uint64_t diff;
> > > -
> > > +    int irq_delivered = 0;
> > > +    uint32_t irq_count = 0;
> > >      uint64_t period = t->period;
> > >      uint64_t cur_tick = hpet_get_ticks(t->state);
> > >  
> > > +    if (s->driftfix && !t->ticks_not_accounted) {
> > > +        t->ticks_not_accounted = t->prev_period = t->period;
> > > +    }
> > >      if (timer_is_periodic(t) && period != 0) {
> > >          if (t->config & HPET_TN_32BIT) {
> > >              while (hpet_time_after(cur_tick, t->cmp)) {
> > >                  t->cmp = (uint32_t)(t->cmp + t->period);
> > > +                t->ticks_not_accounted += t->period;
> > > +                irq_count++;
> > >              }
> > >          } else {
> > >              while (hpet_time_after64(cur_tick, t->cmp)) {
> > >                  t->cmp += period;
> > > +                t->ticks_not_accounted += period;
> > > +                irq_count++;
> > >              }
> > >          }
> > >          diff = hpet_calculate_diff(t, cur_tick);
> > > +        if (s->driftfix) {
> > > +            if (t->ticks_not_accounted > MAX_TICKS_NOT_ACCOUNTED) {
> > > +                t->ticks_not_accounted = t->period + t->prev_period;
> > > +            }
> > > +            if (hpet_timer_has_tick_backlog(t)) {
> > > +                if (t->irq_rate == 1 || irq_count > 1) {
> > > +                    t->irq_rate++;
> > > +                    t->irq_rate = MIN(t->irq_rate, MAX_IRQ_RATE);
> > > +                }
> > > +                if (t->divisor == 0) {
> > > +                    assert(irq_count);
> > > +                }
> > > +                if (irq_count) {
> > > +                    t->divisor = t->irq_rate;
> > > +                }
> > > +                diff /= t->divisor--;
> > > +            } else {
> > > +                t->irq_rate = 1;
> > > +            }
> > > +        }
> > >          qemu_mod_timer(t->qemu_timer,
> > >                         qemu_get_clock_ns(vm_clock) + (int64_t)ticks_to_ns(diff));
> > >      } else if (t->config & HPET_TN_32BIT && !timer_is_periodic(t)) {
> > > @@ -358,7 +397,22 @@ static void hpet_timer(void *opaque)
> > >              t->wrap_flag = 0;
> > >          }
> > >      }
> > > -    update_irq(t, 1);
> > > +    if (s->driftfix && timer_is_periodic(t) && period != 0) {
> > > +        if (t->ticks_not_accounted >= t->period + t->prev_period) {
				             ^^^^^^^^^^

> > > +            irq_delivered = update_irq(t, 1);
> > > +            if (irq_delivered) {
> > > +                t->ticks_not_accounted -= t->prev_period;
> > > +                t->prev_period = t->period;
> > > +            } else {
> > > +                if (irq_count) {
> > > +                    t->irq_rate++;
> > > +                    t->irq_rate = MIN(t->irq_rate, MAX_IRQ_RATE);
> > > +                }
> > > +            }
> > > +        }
> > > +    } else {
> > > +        update_irq(t, 1);
> > > +    }
> > >  }
> > 
> > Hi Ulrich,
> > 
> > Whats prev_period for, since in practice the period will not change
> > between interrupts (OS programs comparator once, or perhaps twice during
> > bootup) ?
> 
> Actually, some systems, like Windows 2000-and-something change this
> period quite heavily under multimedia workloads.

I see. Still, using the period at the previous timer handler instance in
the decision whether to inject an interrupt to the guest makes no sense
to me.

> > Other than that, shouldnt reset accounting variables to init state on
> > write to GLOBAL_ENABLE_CFG / writes to main counter?

What i meant to ask here is whether the reinjection state (including
ticks_not_accounted) should be reset whenever a different period is
programmed.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ulrich Obergfell May 4, 2011, 8:06 a.m. UTC | #4
Hi Marcelo,
 
> Whats prev_period for, since in practice the period will not change
> between interrupts (OS programs comparator once, or perhaps twice
> during bootup) ?

'prev_period' is needed if a guest o/s changes the comparator period
'on the fly' (without stopping and restarting the timer).


             guest o/s changes period
               |
  ti(n-1)      |        ti(n)                          ti(n+1)
    |          v          |                              |
    +---------------------+------------------------------+

     <--- prev_period ---> <---------- period ---------->


The idea is that each timer interrupt represents a certain quantum
of time (the comparator period). If a guest o/s changes the period
between timer interrupt 'n-1' and timer interrupt 'n', I think the
new value should not take effect before timer interrupt 'n'. Timer
interrupt 'n' still represents the old/previous quantum, and timer
interrupt 'n+1' represents the new quantum.

Hence, the patch decrements 'ticks_not_accounted' by 'prev_period'
and sets 'prev_period' to 'period' when an interrupt was delivered
to the guest o/s.

+            irq_delivered = update_irq(t, 1);
+            if (irq_delivered) {
+                t->ticks_not_accounted -= t->prev_period;
+                t->prev_period = t->period;
+            } else {

Most of the time 'prev_period' is equal to 'period'. It should only
be different in the scenario shown above.


Regards,

Uli
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marcelo Tosatti May 4, 2011, 9:09 a.m. UTC | #5
On Wed, May 04, 2011 at 04:06:59AM -0400, Ulrich Obergfell wrote:
> 
> Hi Marcelo,
>  
> > Whats prev_period for, since in practice the period will not change
> > between interrupts (OS programs comparator once, or perhaps twice
> > during bootup) ?
> 
> 'prev_period' is needed if a guest o/s changes the comparator period
> 'on the fly' (without stopping and restarting the timer).
> 
> 
>              guest o/s changes period
>                |
>   ti(n-1)      |        ti(n)                          ti(n+1)
>     |          v          |                              |
>     +---------------------+------------------------------+
> 
>      <--- prev_period ---> <---------- period ---------->
> 
> 
> The idea is that each timer interrupt represents a certain quantum
> of time (the comparator period). If a guest o/s changes the period
> between timer interrupt 'n-1' and timer interrupt 'n', I think the
> new value should not take effect before timer interrupt 'n'. Timer
> interrupt 'n' still represents the old/previous quantum, and timer
> interrupt 'n+1' represents the new quantum.
> 
> Hence, the patch decrements 'ticks_not_accounted' by 'prev_period'
> and sets 'prev_period' to 'period' when an interrupt was delivered
> to the guest o/s.
> 
> +            irq_delivered = update_irq(t, 1);
> +            if (irq_delivered) {
> +                t->ticks_not_accounted -= t->prev_period;
> +                t->prev_period = t->period;
> +            } else {
> 
> Most of the time 'prev_period' is equal to 'period'. It should only
> be different in the scenario shown above.

OK, makes sense. You should probably reset ticks_not_accounted to zero
on HPET initialization (for example, to avoid miscalibration when
kexec'ing a new kernel).

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Glauber Costa May 4, 2011, 1:36 p.m. UTC | #6
On Wed, 2011-05-04 at 06:09 -0300, Marcelo Tosatti wrote:
> On Wed, May 04, 2011 at 04:06:59AM -0400, Ulrich Obergfell wrote:
> > 
> > Hi Marcelo,
> >  
> > > Whats prev_period for, since in practice the period will not change
> > > between interrupts (OS programs comparator once, or perhaps twice
> > > during bootup) ?
> > 
> > 'prev_period' is needed if a guest o/s changes the comparator period
> > 'on the fly' (without stopping and restarting the timer).
> > 
> > 
> >              guest o/s changes period
> >                |
> >   ti(n-1)      |        ti(n)                          ti(n+1)
> >     |          v          |                              |
> >     +---------------------+------------------------------+
> > 
> >      <--- prev_period ---> <---------- period ---------->
> > 
> > 
> > The idea is that each timer interrupt represents a certain quantum
> > of time (the comparator period). If a guest o/s changes the period
> > between timer interrupt 'n-1' and timer interrupt 'n', I think the
> > new value should not take effect before timer interrupt 'n'. Timer
> > interrupt 'n' still represents the old/previous quantum, and timer
> > interrupt 'n+1' represents the new quantum.
> > 
> > Hence, the patch decrements 'ticks_not_accounted' by 'prev_period'
> > and sets 'prev_period' to 'period' when an interrupt was delivered
> > to the guest o/s.
> > 
> > +            irq_delivered = update_irq(t, 1);
> > +            if (irq_delivered) {
> > +                t->ticks_not_accounted -= t->prev_period;
> > +                t->prev_period = t->period;
> > +            } else {
> > 
> > Most of the time 'prev_period' is equal to 'period'. It should only
> > be different in the scenario shown above.
> 
> OK, makes sense. You should probably reset ticks_not_accounted to zero
> on HPET initialization (for example, to avoid miscalibration when
> kexec'ing a new kernel).

Everybody resetting the machine in anyway is expected to force devices
to be reinitialized, right ?
I may be wrong, but I was under the impression that kexec would do this
as well. In this case, the reset function should be enough.


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov May 4, 2011, 1:46 p.m. UTC | #7
On Wed, May 04, 2011 at 10:36:12AM -0300, Glauber Costa wrote:
> On Wed, 2011-05-04 at 06:09 -0300, Marcelo Tosatti wrote:
> > On Wed, May 04, 2011 at 04:06:59AM -0400, Ulrich Obergfell wrote:
> > > 
> > > Hi Marcelo,
> > >  
> > > > Whats prev_period for, since in practice the period will not change
> > > > between interrupts (OS programs comparator once, or perhaps twice
> > > > during bootup) ?
> > > 
> > > 'prev_period' is needed if a guest o/s changes the comparator period
> > > 'on the fly' (without stopping and restarting the timer).
> > > 
> > > 
> > >              guest o/s changes period
> > >                |
> > >   ti(n-1)      |        ti(n)                          ti(n+1)
> > >     |          v          |                              |
> > >     +---------------------+------------------------------+
> > > 
> > >      <--- prev_period ---> <---------- period ---------->
> > > 
> > > 
> > > The idea is that each timer interrupt represents a certain quantum
> > > of time (the comparator period). If a guest o/s changes the period
> > > between timer interrupt 'n-1' and timer interrupt 'n', I think the
> > > new value should not take effect before timer interrupt 'n'. Timer
> > > interrupt 'n' still represents the old/previous quantum, and timer
> > > interrupt 'n+1' represents the new quantum.
> > > 
> > > Hence, the patch decrements 'ticks_not_accounted' by 'prev_period'
> > > and sets 'prev_period' to 'period' when an interrupt was delivered
> > > to the guest o/s.
> > > 
> > > +            irq_delivered = update_irq(t, 1);
> > > +            if (irq_delivered) {
> > > +                t->ticks_not_accounted -= t->prev_period;
> > > +                t->prev_period = t->period;
> > > +            } else {
> > > 
> > > Most of the time 'prev_period' is equal to 'period'. It should only
> > > be different in the scenario shown above.
> > 
> > OK, makes sense. You should probably reset ticks_not_accounted to zero
> > on HPET initialization (for example, to avoid miscalibration when
> > kexec'ing a new kernel).
> 
> Everybody resetting the machine in anyway is expected to force devices
> to be reinitialized, right ?
> I may be wrong, but I was under the impression that kexec would do this
> as well. In this case, the reset function should be enough.
> 
kexec does not reset a machine. That's the whole point of kexec in
fact.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Glauber Costa May 4, 2011, 1:47 p.m. UTC | #8
On Wed, 2011-05-04 at 16:46 +0300, Gleb Natapov wrote:
> On Wed, May 04, 2011 at 10:36:12AM -0300, Glauber Costa wrote:
> > On Wed, 2011-05-04 at 06:09 -0300, Marcelo Tosatti wrote:
> > > On Wed, May 04, 2011 at 04:06:59AM -0400, Ulrich Obergfell wrote:
> > > > 
> > > > Hi Marcelo,
> > > >  
> > > > > Whats prev_period for, since in practice the period will not change
> > > > > between interrupts (OS programs comparator once, or perhaps twice
> > > > > during bootup) ?
> > > > 
> > > > 'prev_period' is needed if a guest o/s changes the comparator period
> > > > 'on the fly' (without stopping and restarting the timer).
> > > > 
> > > > 
> > > >              guest o/s changes period
> > > >                |
> > > >   ti(n-1)      |        ti(n)                          ti(n+1)
> > > >     |          v          |                              |
> > > >     +---------------------+------------------------------+
> > > > 
> > > >      <--- prev_period ---> <---------- period ---------->
> > > > 
> > > > 
> > > > The idea is that each timer interrupt represents a certain quantum
> > > > of time (the comparator period). If a guest o/s changes the period
> > > > between timer interrupt 'n-1' and timer interrupt 'n', I think the
> > > > new value should not take effect before timer interrupt 'n'. Timer
> > > > interrupt 'n' still represents the old/previous quantum, and timer
> > > > interrupt 'n+1' represents the new quantum.
> > > > 
> > > > Hence, the patch decrements 'ticks_not_accounted' by 'prev_period'
> > > > and sets 'prev_period' to 'period' when an interrupt was delivered
> > > > to the guest o/s.
> > > > 
> > > > +            irq_delivered = update_irq(t, 1);
> > > > +            if (irq_delivered) {
> > > > +                t->ticks_not_accounted -= t->prev_period;
> > > > +                t->prev_period = t->period;
> > > > +            } else {
> > > > 
> > > > Most of the time 'prev_period' is equal to 'period'. It should only
> > > > be different in the scenario shown above.
> > > 
> > > OK, makes sense. You should probably reset ticks_not_accounted to zero
> > > on HPET initialization (for example, to avoid miscalibration when
> > > kexec'ing a new kernel).
> > 
> > Everybody resetting the machine in anyway is expected to force devices
> > to be reinitialized, right ?
> > I may be wrong, but I was under the impression that kexec would do this
> > as well. In this case, the reset function should be enough.
> > 
> kexec does not reset a machine. That's the whole point of kexec in
> fact.
Sure thing, but doesn't it force the initialization routine of the devices themselves, without
going through the bios ?

> --
> 			Gleb.


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gleb Natapov May 4, 2011, 1:55 p.m. UTC | #9
On Wed, May 04, 2011 at 10:47:40AM -0300, Glauber Costa wrote:
> On Wed, 2011-05-04 at 16:46 +0300, Gleb Natapov wrote:
> > On Wed, May 04, 2011 at 10:36:12AM -0300, Glauber Costa wrote:
> > > On Wed, 2011-05-04 at 06:09 -0300, Marcelo Tosatti wrote:
> > > > On Wed, May 04, 2011 at 04:06:59AM -0400, Ulrich Obergfell wrote:
> > > > > 
> > > > > Hi Marcelo,
> > > > >  
> > > > > > Whats prev_period for, since in practice the period will not change
> > > > > > between interrupts (OS programs comparator once, or perhaps twice
> > > > > > during bootup) ?
> > > > > 
> > > > > 'prev_period' is needed if a guest o/s changes the comparator period
> > > > > 'on the fly' (without stopping and restarting the timer).
> > > > > 
> > > > > 
> > > > >              guest o/s changes period
> > > > >                |
> > > > >   ti(n-1)      |        ti(n)                          ti(n+1)
> > > > >     |          v          |                              |
> > > > >     +---------------------+------------------------------+
> > > > > 
> > > > >      <--- prev_period ---> <---------- period ---------->
> > > > > 
> > > > > 
> > > > > The idea is that each timer interrupt represents a certain quantum
> > > > > of time (the comparator period). If a guest o/s changes the period
> > > > > between timer interrupt 'n-1' and timer interrupt 'n', I think the
> > > > > new value should not take effect before timer interrupt 'n'. Timer
> > > > > interrupt 'n' still represents the old/previous quantum, and timer
> > > > > interrupt 'n+1' represents the new quantum.
> > > > > 
> > > > > Hence, the patch decrements 'ticks_not_accounted' by 'prev_period'
> > > > > and sets 'prev_period' to 'period' when an interrupt was delivered
> > > > > to the guest o/s.
> > > > > 
> > > > > +            irq_delivered = update_irq(t, 1);
> > > > > +            if (irq_delivered) {
> > > > > +                t->ticks_not_accounted -= t->prev_period;
> > > > > +                t->prev_period = t->period;
> > > > > +            } else {
> > > > > 
> > > > > Most of the time 'prev_period' is equal to 'period'. It should only
> > > > > be different in the scenario shown above.
> > > > 
> > > > OK, makes sense. You should probably reset ticks_not_accounted to zero
> > > > on HPET initialization (for example, to avoid miscalibration when
> > > > kexec'ing a new kernel).
> > > 
> > > Everybody resetting the machine in anyway is expected to force devices
> > > to be reinitialized, right ?
> > > I may be wrong, but I was under the impression that kexec would do this
> > > as well. In this case, the reset function should be enough.
> > > 
> > kexec does not reset a machine. That's the whole point of kexec in
> > fact.
> Sure thing, but doesn't it force the initialization routine of the devices themselves, without
> going through the bios ?
> 
It just starts new kernel. New kernel's init runs as usual and
re-initialize everything. No it doesn't go through the BIOS. What for?
Actually I happily use kexec on IBM blade server where BIOS run takes no
less then 5 minutes and kexec reboots instantly.

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ulrich Obergfell May 5, 2011, 8:07 a.m. UTC | #10
Hi Marcelo,
 
> Other than that, shouldnt reset accounting variables to init state on
> write to GLOBAL_ENABLE_CFG / writes to main counter?

I'd suggest to initialize/reset the driftfix-related fields in the
'HPETTimer' structure (including the backlog of unaccounted ticks)
in the following situations.


- When the guest o/s sets the 'CFG_ENABLE' bit (overall enable) in
  the General Configuration Register.

  This is the place in hpet_ram_writel():

        case HPET_CFG:
            ...
            if (activating_bit(old_val, new_val, HPET_CFG_ENABLE)) {
                ...
                for (i = 0; i < s->num_timers; i++) {
                    if ((&s->timer[i])->cmp != ~0ULL) {
                        // initialize / reset fields here
                        hpet_set_timer(&s->timer[i]);


- When the guest o/s sets the 'TN_ENABLE' bit (timer N interrupt
  enable) in the Timer N Configuration and Capabilities Register.

  This is the place in hpet_ram_writel():

        case HPET_TN_CFG:
            ...
            if (activating_bit(old_val, new_val, HPET_TN_ENABLE)) {
                // initialize / reset fields here
                hpet_set_timer(timer);


This should cover cases such as ...

- first time initialization of HPET & timers during guest o/s boot.
  (includes kexec and reboot)

- if a guest o/s stops and restarts the timer.
  (for example, to change the comparator register value or
   to switch a timer between periodic mode and non-periodic mode)


Regards,

Uli
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marcelo Tosatti May 6, 2011, 2:39 p.m. UTC | #11
On Thu, May 05, 2011 at 04:07:19AM -0400, Ulrich Obergfell wrote:
> 
> Hi Marcelo,
>  
> > Other than that, shouldnt reset accounting variables to init state on
> > write to GLOBAL_ENABLE_CFG / writes to main counter?
> 
> I'd suggest to initialize/reset the driftfix-related fields in the
> 'HPETTimer' structure (including the backlog of unaccounted ticks)
> in the following situations.
> 
> 
> - When the guest o/s sets the 'CFG_ENABLE' bit (overall enable) in
>   the General Configuration Register.
> 
>   This is the place in hpet_ram_writel():
> 
>         case HPET_CFG:
>             ...
>             if (activating_bit(old_val, new_val, HPET_CFG_ENABLE)) {
>                 ...
>                 for (i = 0; i < s->num_timers; i++) {
>                     if ((&s->timer[i])->cmp != ~0ULL) {
>                         // initialize / reset fields here
>                         hpet_set_timer(&s->timer[i]);
> 
> 
> - When the guest o/s sets the 'TN_ENABLE' bit (timer N interrupt
>   enable) in the Timer N Configuration and Capabilities Register.
> 
>   This is the place in hpet_ram_writel():
> 
>         case HPET_TN_CFG:
>             ...
>             if (activating_bit(old_val, new_val, HPET_TN_ENABLE)) {
>                 // initialize / reset fields here
>                 hpet_set_timer(timer);
> 
> 
> This should cover cases such as ...
> 
> - first time initialization of HPET & timers during guest o/s boot.
>   (includes kexec and reboot)
> 
> - if a guest o/s stops and restarts the timer.
>   (for example, to change the comparator register value or
>    to switch a timer between periodic mode and non-periodic mode)
> 
> 
> Regards,
> 
> Uli

Uli, looks good.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/hw/hpet.c b/hw/hpet.c
index 35466ae..92d5f58 100644
--- a/hw/hpet.c
+++ b/hw/hpet.c
@@ -32,6 +32,7 @@ 
 #include "sysbus.h"
 #include "mc146818rtc.h"
 #include "sysemu.h"
+#include <assert.h>
 
 //#define HPET_DEBUG
 #ifdef HPET_DEBUG
@@ -42,6 +43,9 @@ 
 
 #define HPET_MSI_SUPPORT        0
 
+#define MAX_TICKS_NOT_ACCOUNTED     (uint64_t)500000000 /* 5 sec */
+#define MAX_IRQ_RATE                (uint32_t)10
+
 struct HPETState;
 typedef struct HPETTimer {  /* timers */
     uint8_t tn;             /*timer number*/
@@ -326,28 +330,63 @@  static const VMStateDescription vmstate_hpet = {
     }
 };
 
+static bool hpet_timer_has_tick_backlog(HPETTimer *t)
+{
+    uint64_t backlog = t->ticks_not_accounted - (t->period + t->prev_period);
+    return (backlog >= t->period);
+}
+
 /*
  * timer expiration callback
  */
 static void hpet_timer(void *opaque)
 {
     HPETTimer *t = opaque;
+    HPETState *s = t->state;
     uint64_t diff;
-
+    int irq_delivered = 0;
+    uint32_t irq_count = 0;
     uint64_t period = t->period;
     uint64_t cur_tick = hpet_get_ticks(t->state);
 
+    if (s->driftfix && !t->ticks_not_accounted) {
+        t->ticks_not_accounted = t->prev_period = t->period;
+    }
     if (timer_is_periodic(t) && period != 0) {
         if (t->config & HPET_TN_32BIT) {
             while (hpet_time_after(cur_tick, t->cmp)) {
                 t->cmp = (uint32_t)(t->cmp + t->period);
+                t->ticks_not_accounted += t->period;
+                irq_count++;
             }
         } else {
             while (hpet_time_after64(cur_tick, t->cmp)) {
                 t->cmp += period;
+                t->ticks_not_accounted += period;
+                irq_count++;
             }
         }
         diff = hpet_calculate_diff(t, cur_tick);
+        if (s->driftfix) {
+            if (t->ticks_not_accounted > MAX_TICKS_NOT_ACCOUNTED) {
+                t->ticks_not_accounted = t->period + t->prev_period;
+            }
+            if (hpet_timer_has_tick_backlog(t)) {
+                if (t->irq_rate == 1 || irq_count > 1) {
+                    t->irq_rate++;
+                    t->irq_rate = MIN(t->irq_rate, MAX_IRQ_RATE);
+                }
+                if (t->divisor == 0) {
+                    assert(irq_count);
+                }
+                if (irq_count) {
+                    t->divisor = t->irq_rate;
+                }
+                diff /= t->divisor--;
+            } else {
+                t->irq_rate = 1;
+            }
+        }
         qemu_mod_timer(t->qemu_timer,
                        qemu_get_clock_ns(vm_clock) + (int64_t)ticks_to_ns(diff));
     } else if (t->config & HPET_TN_32BIT && !timer_is_periodic(t)) {
@@ -358,7 +397,22 @@  static void hpet_timer(void *opaque)
             t->wrap_flag = 0;
         }
     }
-    update_irq(t, 1);
+    if (s->driftfix && timer_is_periodic(t) && period != 0) {
+        if (t->ticks_not_accounted >= t->period + t->prev_period) {
+            irq_delivered = update_irq(t, 1);
+            if (irq_delivered) {
+                t->ticks_not_accounted -= t->prev_period;
+                t->prev_period = t->period;
+            } else {
+                if (irq_count) {
+                    t->irq_rate++;
+                    t->irq_rate = MIN(t->irq_rate, MAX_IRQ_RATE);
+                }
+            }
+        }
+    } else {
+        update_irq(t, 1);
+    }
 }
 
 static void hpet_set_timer(HPETTimer *t)
@@ -697,6 +751,11 @@  static void hpet_reset(DeviceState *d)
         timer->config |=  0x00000004ULL << 32;
         timer->period = 0ULL;
         timer->wrap_flag = 0;
+
+        timer->prev_period = 0;
+        timer->ticks_not_accounted = 0;
+        timer->irq_rate = 1;
+        timer->divisor = 1;
     }
 
     s->hpet_counter = 0ULL;