diff mbox series

[RFC,v2,3/5] target/arm/kvm: Implement virtual time adjustment

Message ID 20191212173320.11610-4-drjones@redhat.com (mailing list archive)
State New, archived
Headers show
Series target/arm/kvm: Adjust virtual time | expand

Commit Message

Andrew Jones Dec. 12, 2019, 5:33 p.m. UTC
When a VM is stopped (guest is paused) guest virtual time
should stop counting. Otherwise, when the VM is resumed it
will experience time jumps and its kernel may report soft
lockups. Not counting virtual time while the VM is stopped
has the side effect of making the guest's time appear to lag
when compared with real time, and even with time derived from
the physical counter. For this reason, this change, which is
enabled by default, comes with a KVM CPU feature allowing it
to be disabled, restoring legacy behavior.

This patch only provides the implementation of the virtual
time adjustment. A subsequent patch will provide the CPU
property allowing the change to be enabled and disabled.

Reported-by: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
Signed-off-by: Andrew Jones <drjones@redhat.com>
---
 target/arm/cpu.h     |  9 +++++++++
 target/arm/kvm.c     | 48 ++++++++++++++++++++++++++++++++++++++++++++
 target/arm/kvm32.c   |  3 +++
 target/arm/kvm64.c   |  3 +++
 target/arm/kvm_arm.h | 23 +++++++++++++++++++++
 5 files changed, 86 insertions(+)

Comments

Peter Maydell Dec. 16, 2019, 3:14 p.m. UTC | #1
On Thu, 12 Dec 2019 at 17:33, Andrew Jones <drjones@redhat.com> wrote:
>
> When a VM is stopped (guest is paused) guest virtual time
> should stop counting. Otherwise, when the VM is resumed it
> will experience time jumps and its kernel may report soft
> lockups. Not counting virtual time while the VM is stopped
> has the side effect of making the guest's time appear to lag
> when compared with real time, and even with time derived from
> the physical counter. For this reason, this change, which is
> enabled by default, comes with a KVM CPU feature allowing it
> to be disabled, restoring legacy behavior.
>
> This patch only provides the implementation of the virtual
> time adjustment. A subsequent patch will provide the CPU
> property allowing the change to be enabled and disabled.
>
> Reported-by: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
> Signed-off-by: Andrew Jones <drjones@redhat.com>
> ---
>  target/arm/cpu.h     |  9 +++++++++
>  target/arm/kvm.c     | 48 ++++++++++++++++++++++++++++++++++++++++++++
>  target/arm/kvm32.c   |  3 +++
>  target/arm/kvm64.c   |  3 +++
>  target/arm/kvm_arm.h | 23 +++++++++++++++++++++
>  5 files changed, 86 insertions(+)
>
> diff --git a/target/arm/cpu.h b/target/arm/cpu.h
> index 83a809d4bac4..a79ea74125b3 100644
> --- a/target/arm/cpu.h
> +++ b/target/arm/cpu.h
> @@ -821,6 +821,15 @@ struct ARMCPU {
>      /* KVM init features for this CPU */
>      uint32_t kvm_init_features[7];
>
> +    /* KVM CPU features */
> +    bool kvm_adjvtime;
> +
> +    /* VCPU virtual counter value used with kvm_adjvtime */
> +    uint64_t kvm_vtime;

How does this new state interact with migration ?

> +
> +    /* True if the run state is, or transitioning from, RUN_STATE_PAUSED */
> +    bool runstate_paused;
> +
>      /* Uniprocessor system with MP extensions */
>      bool mp_is_up;
>
> diff --git a/target/arm/kvm.c b/target/arm/kvm.c
> index 5b82cefef608..a55fe7d7aefd 100644
> --- a/target/arm/kvm.c
> +++ b/target/arm/kvm.c
> @@ -348,6 +348,24 @@ void kvm_arm_register_device(MemoryRegion *mr, uint64_t devid, uint64_t group,
>      memory_region_ref(kd->mr);
>  }
>
> +void kvm_arm_vm_state_change(void *opaque, int running, RunState state)
> +{
> +    CPUState *cs = opaque;
> +    ARMCPU *cpu = ARM_CPU(cs);
> +
> +    if (running) {
> +        if (cpu->kvm_adjvtime && cpu->runstate_paused) {
> +            kvm_arm_set_virtual_time(cs, cpu->kvm_vtime);
> +        }
> +        cpu->runstate_paused = false;
> +    } else if (state == RUN_STATE_PAUSED) {
> +        cpu->runstate_paused = true;
> +        if (cpu->kvm_adjvtime) {
> +            kvm_arm_get_virtual_time(cs, &cpu->kvm_vtime);
> +        }
> +    }
> +}

How does this interact with the usual register sync to/from
KVM (ie kvm_arch_get_registers(), which I think will do a
GET_ONE_REG read of the TIMER_CNT register the way it does
any other sysreg, inside write_kvmstate_to_list(), plus
kvm_arch_set_registers() which does the write back to the
kernel in write_list_to_kvmstate()) ? Presumably we want this
version to take precedence by the set_virtual_time call
happening after the kvm_arch_set_registers, but is this
guaranteed ?

thanks
-- PMM
Peter Maydell Dec. 16, 2019, 3:40 p.m. UTC | #2
On Mon, 16 Dec 2019 at 15:14, Peter Maydell <peter.maydell@linaro.org> wrote:
> How does this interact with the usual register sync to/from
> KVM (ie kvm_arch_get_registers(), which I think will do a
> GET_ONE_REG read of the TIMER_CNT register the way it does
> any other sysreg, inside write_kvmstate_to_list(), plus
> kvm_arch_set_registers() which does the write back to the
> kernel in write_list_to_kvmstate()) ? Presumably we want this
> version to take precedence by the set_virtual_time call
> happening after the kvm_arch_set_registers, but is this
> guaranteed ?

...you might also want to look at the effects of simply
removing the KVM_REG_ARM_TIMER_CNT entry from the
'non_runtime_cpregs[]' array -- in commit 4b7a6bf402bd064
we explicitly stopped reading/writing this register's value
to/from the kernel except for inbound migration, and it
feels like this patchset is now rolling back that approach,
so maybe we should also be (configurably) rolling back some
of its implementation rather than just leaving it in place.

I note also that the commit message there had a remark
about inconsistencies between VCPUs -- is the right thing
to handle this per-VM rather than per-VCPU somehow?

thanks
-- PMM
Andrew Jones Dec. 16, 2019, 4:36 p.m. UTC | #3
On Mon, Dec 16, 2019 at 03:14:00PM +0000, Peter Maydell wrote:
> On Thu, 12 Dec 2019 at 17:33, Andrew Jones <drjones@redhat.com> wrote:
> >
> > When a VM is stopped (guest is paused) guest virtual time
> > should stop counting. Otherwise, when the VM is resumed it
> > will experience time jumps and its kernel may report soft
> > lockups. Not counting virtual time while the VM is stopped
> > has the side effect of making the guest's time appear to lag
> > when compared with real time, and even with time derived from
> > the physical counter. For this reason, this change, which is
> > enabled by default, comes with a KVM CPU feature allowing it
> > to be disabled, restoring legacy behavior.
> >
> > This patch only provides the implementation of the virtual
> > time adjustment. A subsequent patch will provide the CPU
> > property allowing the change to be enabled and disabled.
> >
> > Reported-by: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
> > Signed-off-by: Andrew Jones <drjones@redhat.com>
> > ---
> >  target/arm/cpu.h     |  9 +++++++++
> >  target/arm/kvm.c     | 48 ++++++++++++++++++++++++++++++++++++++++++++
> >  target/arm/kvm32.c   |  3 +++
> >  target/arm/kvm64.c   |  3 +++
> >  target/arm/kvm_arm.h | 23 +++++++++++++++++++++
> >  5 files changed, 86 insertions(+)
> >
> > diff --git a/target/arm/cpu.h b/target/arm/cpu.h
> > index 83a809d4bac4..a79ea74125b3 100644
> > --- a/target/arm/cpu.h
> > +++ b/target/arm/cpu.h
> > @@ -821,6 +821,15 @@ struct ARMCPU {
> >      /* KVM init features for this CPU */
> >      uint32_t kvm_init_features[7];
> >
> > +    /* KVM CPU features */
> > +    bool kvm_adjvtime;
> > +
> > +    /* VCPU virtual counter value used with kvm_adjvtime */
> > +    uint64_t kvm_vtime;
> 
> How does this new state interact with migration ?

I don't think we should need to worry about this state, because migration
will do its own save/restore of the virtual counter, and as that restore
comes later, it'll take precedence. We still need this state for the usual
save/restore when not migrating, though, because KVM_REG_ARM_TIMER_CNT is
a non-runtime cpreg with its level set to KVM_PUT_FULL_STATE.

> 
> > +
> > +    /* True if the run state is, or transitioning from, RUN_STATE_PAUSED */
> > +    bool runstate_paused;
> > +
> >      /* Uniprocessor system with MP extensions */
> >      bool mp_is_up;
> >
> > diff --git a/target/arm/kvm.c b/target/arm/kvm.c
> > index 5b82cefef608..a55fe7d7aefd 100644
> > --- a/target/arm/kvm.c
> > +++ b/target/arm/kvm.c
> > @@ -348,6 +348,24 @@ void kvm_arm_register_device(MemoryRegion *mr, uint64_t devid, uint64_t group,
> >      memory_region_ref(kd->mr);
> >  }
> >
> > +void kvm_arm_vm_state_change(void *opaque, int running, RunState state)
> > +{
> > +    CPUState *cs = opaque;
> > +    ARMCPU *cpu = ARM_CPU(cs);
> > +
> > +    if (running) {
> > +        if (cpu->kvm_adjvtime && cpu->runstate_paused) {
> > +            kvm_arm_set_virtual_time(cs, cpu->kvm_vtime);
> > +        }
> > +        cpu->runstate_paused = false;
> > +    } else if (state == RUN_STATE_PAUSED) {
> > +        cpu->runstate_paused = true;
> > +        if (cpu->kvm_adjvtime) {
> > +            kvm_arm_get_virtual_time(cs, &cpu->kvm_vtime);
> > +        }
> > +    }
> > +}
> 
> How does this interact with the usual register sync to/from
> KVM (ie kvm_arch_get_registers(), which I think will do a
> GET_ONE_REG read of the TIMER_CNT register the way it does
> any other sysreg, inside write_kvmstate_to_list(), plus
> kvm_arch_set_registers() which does the write back to the
> kernel in write_list_to_kvmstate()) ?

It will, but only when level == KVM_PUT_FULL_STATE.


> Presumably we want this
> version to take precedence by the set_virtual_time call
> happening after the kvm_arch_set_registers, but is this
> guaranteed ?

Actually it doesn't really matter which takes precedence (I don't think),
which is why we can rely on the usual save/restore for migration. We only
need the new state this patch adds because we don't have any recent state
otherwise, and because then we can be selective and only do the
save/restore when transitioning to/from paused state.

Thanks,
drew
Andrew Jones Dec. 16, 2019, 4:43 p.m. UTC | #4
On Mon, Dec 16, 2019 at 03:40:16PM +0000, Peter Maydell wrote:
> On Mon, 16 Dec 2019 at 15:14, Peter Maydell <peter.maydell@linaro.org> wrote:
> > How does this interact with the usual register sync to/from
> > KVM (ie kvm_arch_get_registers(), which I think will do a
> > GET_ONE_REG read of the TIMER_CNT register the way it does
> > any other sysreg, inside write_kvmstate_to_list(), plus
> > kvm_arch_set_registers() which does the write back to the
> > kernel in write_list_to_kvmstate()) ? Presumably we want this
> > version to take precedence by the set_virtual_time call
> > happening after the kvm_arch_set_registers, but is this
> > guaranteed ?
> 
> ...you might also want to look at the effects of simply
> removing the KVM_REG_ARM_TIMER_CNT entry from the
> 'non_runtime_cpregs[]' array -- in commit 4b7a6bf402bd064
> we explicitly stopped reading/writing this register's value
> to/from the kernel except for inbound migration, and it
> feels like this patchset is now rolling back that approach,
> so maybe we should also be (configurably) rolling back some
> of its implementation rather than just leaving it in place.

I feel like I already considered that, maybe even tried it, a few months
ago when I first looked at this. I must have decided against it for some
reason at the time, but I don't recall what. Now I can say the reason is
because we only do this save/restore when we transition to/from paused
state, though.

> 
> I note also that the commit message there had a remark
> about inconsistencies between VCPUs -- is the right thing
> to handle this per-VM rather than per-VCPU somehow?

per-VM would make sense, because the counters should be synchronized
among the VCPUs. KVM does that for us, though, so whichever VCPU last
restores its counter is the one that will determine the final value.

Maybe we should have a VM ioctl instead, but ATM we only have VCPU ioctls.

Thanks,
drew
Peter Maydell Dec. 16, 2019, 6:06 p.m. UTC | #5
On Mon, 16 Dec 2019 at 16:44, Andrew Jones <drjones@redhat.com> wrote:
>
> On Mon, Dec 16, 2019 at 03:40:16PM +0000, Peter Maydell wrote:
> > On Mon, 16 Dec 2019 at 15:14, Peter Maydell <peter.maydell@linaro.org> wrote:
> > > How does this interact with the usual register sync to/from
> > > KVM (ie kvm_arch_get_registers(), which I think will do a
> > > GET_ONE_REG read of the TIMER_CNT register the way it does
> > > any other sysreg, inside write_kvmstate_to_list(), plus
> > > kvm_arch_set_registers() which does the write back to the
> > > kernel in write_list_to_kvmstate()) ? Presumably we want this
> > > version to take precedence by the set_virtual_time call
> > > happening after the kvm_arch_set_registers, but is this
> > > guaranteed ?
> >
> > ...you might also want to look at the effects of simply
> > removing the KVM_REG_ARM_TIMER_CNT entry from the
> > 'non_runtime_cpregs[]' array -- in commit 4b7a6bf402bd064
> > we explicitly stopped reading/writing this register's value
> > to/from the kernel except for inbound migration, and it
> > feels like this patchset is now rolling back that approach,
> > so maybe we should also be (configurably) rolling back some
> > of its implementation rather than just leaving it in place.
>
> I feel like I already considered that, maybe even tried it, a few months
> ago when I first looked at this. I must have decided against it for some
> reason at the time, but I don't recall what. Now I can say the reason is
> because we only do this save/restore when we transition to/from paused
> state, though.

I found the thread which discussed the bug which originally
caused us to add commit 4b7a6bf402bd064:
https://lists.cs.columbia.edu/pipermail/kvmarm/2015-July/015665.html
 -- there are some codepaths which cause us to do a sync from/to
KVM for one VCPU while others are still running. If we do a
read-CNT-and-write-back then we effectively cause time to jump
backwards for the other still-running CPUs.

So we do still want to have TIMER_CNT listed as being KVM_PUT_FULL_STATE
regardless, or we re-introduce that bug.

Your approach in this patchset reads and writes on vm-paused,
so it won't have the pre-2015 problems.

It still feels odd that we're storing this bit of guest state
in two places now though -- in kvm_vtime, and also in its usual
place in the cpreg_array data structures (we write back the
value from kvm_vtime when the VM starts running, and we write
back the value from the cpreg_array for a PUT_FULL_STATE, which
the comments claim is only on startup or when we just loaded
migration state (and also undocumentedly but reasonably on
cpu-hotplug, which arm doesn't have yet).

I've just spent a little while digging through code, and
haven't been able to satisfy myself on the ordering of which
writeback wins: for a loadvm I think we first do a
cpu_synchronize_all_post_init() (writing back the counter
value from the migration data) and then after than we will
unpause the VM -- why doesn't this overwrite the correct
value with the wrong value from kvm_vtime ?

I just noticed also that the logic used in this patch
doesn't match what other architectures do in their vm_state_change
function -- eg cpu_ppc_clock_vm_state_change() has an
"if (running) { load } else { save }", and kvmclock_vm_state_change()
for i386 also has "if (running) { ... } else { ... }", though
it has an extra wrinkle where it captures "are we PAUSED?"
to use in the pre_save function; the comment above
kvmclock_pre_save() suggests maybe that would be useful for other
than x86, too. kvm_s390_tod_vm_state_change() has
logic that's a slightly more complicated variation on just
testing the 'running' flag, but it doesn't look at the
specific new state.

> > I note also that the commit message there had a remark
> > about inconsistencies between VCPUs -- is the right thing
> > to handle this per-VM rather than per-VCPU somehow?
>
> per-VM would make sense, because the counters should be synchronized
> among the VCPUs. KVM does that for us, though, so whichever VCPU last
> restores its counter is the one that will determine the final value.
>
> Maybe we should have a VM ioctl instead, but ATM we only have VCPU ioctls.

I meant more "only do the save/load once per VM in QEMU but
do it by working with just one VCPU". But I guess since migration
works on all the VCPUs we're ok to do pause-resume the same way
(and I've now tracked down what the 'inconsistencies between VCPUs'
were: they're when we were syncing the CNT value for one vCPU when
others were still running.)

thanks
-- PMM
Andrew Jones Dec. 19, 2019, 2:30 p.m. UTC | #6
On Mon, Dec 16, 2019 at 06:06:30PM +0000, Peter Maydell wrote:
> On Mon, 16 Dec 2019 at 16:44, Andrew Jones <drjones@redhat.com> wrote:
> >
> > On Mon, Dec 16, 2019 at 03:40:16PM +0000, Peter Maydell wrote:
> > > On Mon, 16 Dec 2019 at 15:14, Peter Maydell <peter.maydell@linaro.org> wrote:
> > > > How does this interact with the usual register sync to/from
> > > > KVM (ie kvm_arch_get_registers(), which I think will do a
> > > > GET_ONE_REG read of the TIMER_CNT register the way it does
> > > > any other sysreg, inside write_kvmstate_to_list(), plus
> > > > kvm_arch_set_registers() which does the write back to the
> > > > kernel in write_list_to_kvmstate()) ? Presumably we want this
> > > > version to take precedence by the set_virtual_time call
> > > > happening after the kvm_arch_set_registers, but is this
> > > > guaranteed ?
> > >
> > > ...you might also want to look at the effects of simply
> > > removing the KVM_REG_ARM_TIMER_CNT entry from the
> > > 'non_runtime_cpregs[]' array -- in commit 4b7a6bf402bd064
> > > we explicitly stopped reading/writing this register's value
> > > to/from the kernel except for inbound migration, and it
> > > feels like this patchset is now rolling back that approach,
> > > so maybe we should also be (configurably) rolling back some
> > > of its implementation rather than just leaving it in place.
> >
> > I feel like I already considered that, maybe even tried it, a few months
> > ago when I first looked at this. I must have decided against it for some
> > reason at the time, but I don't recall what. Now I can say the reason is
> > because we only do this save/restore when we transition to/from paused
> > state, though.
> 
> I found the thread which discussed the bug which originally
> caused us to add commit 4b7a6bf402bd064:
> https://lists.cs.columbia.edu/pipermail/kvmarm/2015-July/015665.html
>  -- there are some codepaths which cause us to do a sync from/to
> KVM for one VCPU while others are still running. If we do a
> read-CNT-and-write-back then we effectively cause time to jump
> backwards for the other still-running CPUs.
> 
> So we do still want to have TIMER_CNT listed as being KVM_PUT_FULL_STATE
> regardless, or we re-introduce that bug.

Thanks for digging that up. I now recall also having read that history
back when I first discovered KVM_REG_ARM_TIMER_CNT was special.

> 
> Your approach in this patchset reads and writes on vm-paused,
> so it won't have the pre-2015 problems.
> 
> It still feels odd that we're storing this bit of guest state
> in two places now though -- in kvm_vtime, and also in its usual
> place in the cpreg_array data structures (we write back the
> value from kvm_vtime when the VM starts running, and we write
> back the value from the cpreg_array for a PUT_FULL_STATE, which
> the comments claim is only on startup or when we just loaded
> migration state (and also undocumentedly but reasonably on
> cpu-hotplug, which arm doesn't have yet).
> 
> I've just spent a little while digging through code, and
> haven't been able to satisfy myself on the ordering of which
> writeback wins: for a loadvm I think we first do a
> cpu_synchronize_all_post_init() (writing back the counter
> value from the migration data) and then after than we will
> unpause the VM -- why doesn't this overwrite the correct
> value with the wrong value from kvm_vtime ?

Hmm... I think I may have gotten lost when I went through this before.
I just went through again, and still won't claim that I'm not a bit
lost, but it does appear I got it backwards. When I get a chance I'll
try to test this properly.

We could use the same location as normal, in the cpreg_array. I'd just
need to add a search of cpreg_indexes[] in order to get the index
needed for cpreg_values[]. 

> 
> I just noticed also that the logic used in this patch
> doesn't match what other architectures do in their vm_state_change
> function -- eg cpu_ppc_clock_vm_state_change() has an
> "if (running) { load } else { save }", and kvmclock_vm_state_change()
> for i386 also has "if (running) { ... } else { ... }", though
> it has an extra wrinkle where it captures "are we PAUSED?"
> to use in the pre_save function; the comment above
> kvmclock_pre_save() suggests maybe that would be useful for other
> than x86, too. kvm_s390_tod_vm_state_change() has
> logic that's a slightly more complicated variation on just
> testing the 'running' flag, but it doesn't look at the
> specific new state.

Yes, originally I had just if (running) {} else {}, but after looking at
https://lists.gnu.org/archive/html/qemu-devel/2019-03/msg03695.html and
seeing that the other architectures were careful to track the "are we
paused" state, I got the feeling that we should be more specific and
changed to if (running) {} else if (paused) {}. That's probably wrong,
though, if we want to track all vm-stopped time.

> 
> > > I note also that the commit message there had a remark
> > > about inconsistencies between VCPUs -- is the right thing
> > > to handle this per-VM rather than per-VCPU somehow?
> >
> > per-VM would make sense, because the counters should be synchronized
> > among the VCPUs. KVM does that for us, though, so whichever VCPU last
> > restores its counter is the one that will determine the final value.
> >
> > Maybe we should have a VM ioctl instead, but ATM we only have VCPU ioctls.
> 
> I meant more "only do the save/load once per VM in QEMU but
> do it by working with just one VCPU". But I guess since migration
> works on all the VCPUs we're ok to do pause-resume the same way
> (and I've now tracked down what the 'inconsistencies between VCPUs'
> were: they're when we were syncing the CNT value for one vCPU when
> others were still running.)
> 
> thanks
> -- PMM
> 

Thanks,
drew
Andrew Jones Jan. 20, 2020, 9:40 a.m. UTC | #7
On Thu, Dec 19, 2019 at 03:30:12PM +0100, Andrew Jones wrote:
> On Mon, Dec 16, 2019 at 06:06:30PM +0000, Peter Maydell wrote:
> > Your approach in this patchset reads and writes on vm-paused,
> > so it won't have the pre-2015 problems.
> > 
> > It still feels odd that we're storing this bit of guest state
> > in two places now though -- in kvm_vtime, and also in its usual
> > place in the cpreg_array data structures (we write back the
> > value from kvm_vtime when the VM starts running, and we write
> > back the value from the cpreg_array for a PUT_FULL_STATE, which
> > the comments claim is only on startup or when we just loaded
> > migration state (and also undocumentedly but reasonably on
> > cpu-hotplug, which arm doesn't have yet).

I tried to get rid of the extra state location (kvm_vtime), but we still
need it because kvm_arch_get_registers() doesn't take 'level', like
kvm_arch_put_registers() does. Maybe it should? Without being able to
filter out TIMER_CNT at get time too, then if we migrate a paused guest
we'll resume with vtime including the ticks between the pause and the
start of the migration. Adding the additional state (kvm_vtime) and a
cpu_pre_save() hook to fixup the cpreg value is a possible way to resolve
that. That's what I've done for v3, which I'll post shortly.

> > 
> > I've just spent a little while digging through code, and
> > haven't been able to satisfy myself on the ordering of which
> > writeback wins: for a loadvm I think we first do a
> > cpu_synchronize_all_post_init() (writing back the counter
> > value from the migration data) and then after than we will
> > unpause the VM -- why doesn't this overwrite the correct
> > value with the wrong value from kvm_vtime ?

It wasn't overwriting because we weren't detecting a runstate
transition from paused to running. However, for v3, I've dropped
the explicit running/pause transition checking and now ensured
we get the right value with a cpu_post_load() hook.

> 
> > I just noticed also that the logic used in this patch
> > doesn't match what other architectures do in their vm_state_change
> > function -- eg cpu_ppc_clock_vm_state_change() has an
> > "if (running) { load } else { save }", and kvmclock_vm_state_change()
> > for i386 also has "if (running) { ... } else { ... }", though
> > it has an extra wrinkle where it captures "are we PAUSED?"
> > to use in the pre_save function; the comment above
> > kvmclock_pre_save() suggests maybe that would be useful for other
> > than x86, too. kvm_s390_tod_vm_state_change() has
> > logic that's a slightly more complicated variation on just
> > testing the 'running' flag, but it doesn't look at the
> > specific new state.

I think I've mimicked this logic now for arm in v3.

Thanks,
drew
diff mbox series

Patch

diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index 83a809d4bac4..a79ea74125b3 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -821,6 +821,15 @@  struct ARMCPU {
     /* KVM init features for this CPU */
     uint32_t kvm_init_features[7];
 
+    /* KVM CPU features */
+    bool kvm_adjvtime;
+
+    /* VCPU virtual counter value used with kvm_adjvtime */
+    uint64_t kvm_vtime;
+
+    /* True if the run state is, or transitioning from, RUN_STATE_PAUSED */
+    bool runstate_paused;
+
     /* Uniprocessor system with MP extensions */
     bool mp_is_up;
 
diff --git a/target/arm/kvm.c b/target/arm/kvm.c
index 5b82cefef608..a55fe7d7aefd 100644
--- a/target/arm/kvm.c
+++ b/target/arm/kvm.c
@@ -348,6 +348,24 @@  void kvm_arm_register_device(MemoryRegion *mr, uint64_t devid, uint64_t group,
     memory_region_ref(kd->mr);
 }
 
+void kvm_arm_vm_state_change(void *opaque, int running, RunState state)
+{
+    CPUState *cs = opaque;
+    ARMCPU *cpu = ARM_CPU(cs);
+
+    if (running) {
+        if (cpu->kvm_adjvtime && cpu->runstate_paused) {
+            kvm_arm_set_virtual_time(cs, cpu->kvm_vtime);
+        }
+        cpu->runstate_paused = false;
+    } else if (state == RUN_STATE_PAUSED) {
+        cpu->runstate_paused = true;
+        if (cpu->kvm_adjvtime) {
+            kvm_arm_get_virtual_time(cs, &cpu->kvm_vtime);
+        }
+    }
+}
+
 static int compare_u64(const void *a, const void *b)
 {
     if (*(uint64_t *)a > *(uint64_t *)b) {
@@ -579,6 +597,36 @@  int kvm_arm_sync_mpstate_to_qemu(ARMCPU *cpu)
     return 0;
 }
 
+void kvm_arm_get_virtual_time(CPUState *cs, uint64_t *cnt)
+{
+    struct kvm_one_reg reg = {
+        .id = KVM_REG_ARM_TIMER_CNT,
+        .addr = (uintptr_t)cnt,
+    };
+    int ret;
+
+    ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
+    if (ret) {
+        error_report("Failed to get KVM_REG_ARM_TIMER_CNT");
+        abort();
+    }
+}
+
+void kvm_arm_set_virtual_time(CPUState *cs, uint64_t cnt)
+{
+    struct kvm_one_reg reg = {
+        .id = KVM_REG_ARM_TIMER_CNT,
+        .addr = (uintptr_t)&cnt,
+    };
+    int ret;
+
+    ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
+    if (ret) {
+        error_report("Failed to set KVM_REG_ARM_TIMER_CNT");
+        abort();
+    }
+}
+
 int kvm_put_vcpu_events(ARMCPU *cpu)
 {
     CPUARMState *env = &cpu->env;
diff --git a/target/arm/kvm32.c b/target/arm/kvm32.c
index 32bf8d6757c4..3a8b437eef0b 100644
--- a/target/arm/kvm32.c
+++ b/target/arm/kvm32.c
@@ -16,6 +16,7 @@ 
 #include "qemu-common.h"
 #include "cpu.h"
 #include "qemu/timer.h"
+#include "sysemu/runstate.h"
 #include "sysemu/kvm.h"
 #include "kvm_arm.h"
 #include "internals.h"
@@ -198,6 +199,8 @@  int kvm_arch_init_vcpu(CPUState *cs)
         return -EINVAL;
     }
 
+    qemu_add_vm_change_state_handler(kvm_arm_vm_state_change, cs);
+
     /* Determine init features for this CPU */
     memset(cpu->kvm_init_features, 0, sizeof(cpu->kvm_init_features));
     if (cpu->start_powered_off) {
diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c
index 5cafcb7d36dd..e486eaf1f944 100644
--- a/target/arm/kvm64.c
+++ b/target/arm/kvm64.c
@@ -23,6 +23,7 @@ 
 #include "qemu/host-utils.h"
 #include "qemu/main-loop.h"
 #include "exec/gdbstub.h"
+#include "sysemu/runstate.h"
 #include "sysemu/kvm.h"
 #include "sysemu/kvm_int.h"
 #include "kvm_arm.h"
@@ -735,6 +736,8 @@  int kvm_arch_init_vcpu(CPUState *cs)
         return -EINVAL;
     }
 
+    qemu_add_vm_change_state_handler(kvm_arm_vm_state_change, cs);
+
     /* Determine init features for this CPU */
     memset(cpu->kvm_init_features, 0, sizeof(cpu->kvm_init_features));
     if (cpu->start_powered_off) {
diff --git a/target/arm/kvm_arm.h b/target/arm/kvm_arm.h
index 8e14d400e8ab..16b53e45377d 100644
--- a/target/arm/kvm_arm.h
+++ b/target/arm/kvm_arm.h
@@ -232,6 +232,24 @@  void kvm_arm_sve_get_vls(CPUState *cs, unsigned long *map);
  */
 void kvm_arm_set_cpu_features_from_host(ARMCPU *cpu);
 
+/**
+ * void kvm_arm_get_virtual_time:
+ * @cs: CPUState
+ * @cnt: the virtual counter to fill in
+ *
+ * Gets the VCPU's virtual counter and stores it in @cnt.
+ */
+void kvm_arm_get_virtual_time(CPUState *cs, uint64_t *cnt);
+
+/**
+ * void kvm_arm_set_virtual_time:
+ * @cs: CPUState
+ * @cnt: new virtual counter value
+ *
+ * Sets the VCPU's virtual counter to @cnt.
+ */
+void kvm_arm_set_virtual_time(CPUState *cs, uint64_t cnt);
+
 /**
  * kvm_arm_aarch32_supported:
  * @cs: CPUState
@@ -288,6 +306,8 @@  void kvm_arm_pmu_set_irq(CPUState *cs, int irq);
 void kvm_arm_pmu_init(CPUState *cs);
 int kvm_arm_set_irq(int cpu, int irqtype, int irq, int level);
 
+void kvm_arm_vm_state_change(void *opaque, int running, RunState state);
+
 #else
 
 static inline void kvm_arm_set_cpu_features_from_host(ARMCPU *cpu)
@@ -324,6 +344,9 @@  static inline int kvm_arm_vgic_probe(void)
     return 0;
 }
 
+static inline void kvm_arm_get_virtual_time(CPUState *cs, uint64_t *cnt) {}
+static inline void kvm_arm_set_virtual_time(CPUState *cs, uint64_t cnt) {}
+
 static inline void kvm_arm_pmu_set_irq(CPUState *cs, int irq) {}
 static inline void kvm_arm_pmu_init(CPUState *cs) {}