diff mbox series

[02/10] KVM: arm64: Implement initial support for KVM_CAP_SYSTEM_COUNTER_STATE

Message ID 20210608214742.1897483-3-oupton@google.com (mailing list archive)
State New, archived
Headers show
Series KVM: Add idempotent controls for migrating system counter state | expand

Commit Message

Oliver Upton June 8, 2021, 9:47 p.m. UTC
ARMv8 provides for a virtual counter-timer offset that is added to guest
views of the virtual counter-timer (CNTVOFF_EL2). To date, KVM has not
provided userspace with any perception of this, and instead affords a
value-based scheme of migrating the virtual counter-timer by directly
reading/writing the guest's CNTVCT_EL0. This is problematic because
counters continue to elapse while the register is being written, meaning
it is possible for drift to sneak into the guest's time scale. This is
exacerbated by the fact that KVM will calculate an appropriate
CNTVOFF_EL2 every time the register is written, which will be broadcast
to all virtual CPUs. The only possible way to avoid causing guest time
to drift is to restore counter-timers by offset.

Implement initial support for KVM_{GET,SET}_SYSTEM_COUNTER_STATE ioctls
to migrate the value of CNTVOFF_EL2. These ioctls yield precise control
of the virtual counter-timers to userspace, allowing it to define its
own heuristics for managing vCPU offsets.

Reviewed-by: Jim Mattson <jmattson@google.com>
Reviewed-by: Jing Zhang <jingzhangos@google.com>
Reviewed-by: Peter Shier <pshier@google.com>
Signed-off-by: Oliver Upton <oupton@google.com>
---
 arch/arm64/include/asm/kvm_host.h |  5 +++++
 arch/arm64/include/uapi/asm/kvm.h | 10 ++++++++++
 arch/arm64/kvm/arch_timer.c       | 22 ++++++++++++++++++++++
 arch/arm64/kvm/arm.c              | 25 +++++++++++++++++++++++++
 4 files changed, 62 insertions(+)

Comments

Oliver Upton June 8, 2021, 9:55 p.m. UTC | #1
On Tue, Jun 8, 2021 at 4:48 PM Oliver Upton <oupton@google.com> wrote:
>
> ARMv8 provides for a virtual counter-timer offset that is added to guest
> views of the virtual counter-timer (CNTVOFF_EL2). To date, KVM has not
> provided userspace with any perception of this, and instead affords a
> value-based scheme of migrating the virtual counter-timer by directly
> reading/writing the guest's CNTVCT_EL0. This is problematic because
> counters continue to elapse while the register is being written, meaning
> it is possible for drift to sneak in to the guest's time scale. This is
> exacerbated by the fact that KVM will calculate an appropriate
> CNTVOFF_EL2 every time the register is written, which will be broadcast
> to all virtual CPUs. The only possible way to avoid causing guest time
> to drift is to restore counter-timers by offset.
>
> Implement initial support for KVM_{GET,SET}_SYSTEM_COUNTER_STATE ioctls
> to migrate the value of CNTVOFF_EL2. These ioctls yield precise control
> of the virtual counter-timers to userspace, allowing it to define its
> own heuristics for managing vCPU offsets.
>
> Reviewed-by: Jim Mattson <jmattson@google.com>
> Reviewed-by: Jing Zhang <jingzhangos@google.com>
> Reviewed-by: Peter Shier <pshier@google.com>
> Signed-off-by: Oliver Upton <oupton@google.com>
> ---
>  arch/arm64/include/asm/kvm_host.h |  5 +++++
>  arch/arm64/include/uapi/asm/kvm.h | 10 ++++++++++
>  arch/arm64/kvm/arch_timer.c       | 22 ++++++++++++++++++++++
>  arch/arm64/kvm/arm.c              | 25 +++++++++++++++++++++++++
>  4 files changed, 62 insertions(+)
>
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index 7cd7d5c8c4bc..31107d5e61af 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -781,4 +781,9 @@ void __init kvm_hyp_reserve(void);
>  static inline void kvm_hyp_reserve(void) { }
>  #endif
>
> +int kvm_arm_vcpu_get_system_counter_state(struct kvm_vcpu *vcpu,
> +                                         struct kvm_system_counter_state *state);
> +int kvm_arm_vcpu_set_system_counter_state(struct kvm_vcpu *vcpu,
> +                                         struct kvm_system_counter_state *state);
> +
>  #endif /* __ARM64_KVM_HOST_H__ */
> diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
> index 24223adae150..d3987089c524 100644
> --- a/arch/arm64/include/uapi/asm/kvm.h
> +++ b/arch/arm64/include/uapi/asm/kvm.h
> @@ -184,6 +184,16 @@ struct kvm_vcpu_events {
>         __u32 reserved[12];
>  };
>
> +/* for KVM_{GET,SET}_SYSTEM_COUNTER_STATE */
> +struct kvm_system_counter_state {
> +       /* indicates what fields are valid in the structure */
> +       __u32 flags;
> +       __u32 pad;
> +       /* guest counter-timer offset, relative to host cntpct_el0 */
> +       __u64 cntvoff;
> +       __u64 rsvd[7];
> +};
> +
>  /* If you need to interpret the index values, here is the key: */
>  #define KVM_REG_ARM_COPROC_MASK                0x000000000FFF0000
>  #define KVM_REG_ARM_COPROC_SHIFT       16
> diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c
> index 74e0699661e9..955a7a183362 100644
> --- a/arch/arm64/kvm/arch_timer.c
> +++ b/arch/arm64/kvm/arch_timer.c
> @@ -1259,3 +1259,25 @@ int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
>
>         return -ENXIO;
>  }
> +
> +int kvm_arm_vcpu_get_system_counter_state(struct kvm_vcpu *vcpu,
> +                                         struct kvm_system_counter_state *state)
> +{
> +       if (state->flags)
> +               return -EINVAL;
> +
> +       state->cntvoff = timer_get_offset(vcpu_vtimer(vcpu));
> +
> +       return 0;
> +}
> +
> +int kvm_arm_vcpu_set_system_counter_state(struct kvm_vcpu *vcpu,
> +                                         struct kvm_system_counter_state *state)
> +{
> +       if (state->flags)
> +               return -EINVAL;
> +
> +       timer_set_offset(vcpu_vtimer(vcpu), state->cntvoff);

Adding some discussion that Ricardo and I had regarding this portion
of the patch:

Ricardo asks if it would make more sense to have the
KVM_SET_SYSTEM_COUNTER_STATE ioctl broadcast the counter offset to all
vCPUs, like we do for the value-based SET_REG() implementation. To me,
the broadcasting was more necessary for the value-based interface as
it is difficult/impossible to synchronize by value, but now some of
the onus to do the right thing might be on the VMM. No strong opinions
either way, so open to suggestions here.

--
Thanks,
Oliver

> +
> +       return 0;
> +}
> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> index 1126eae27400..b78ffb4db9dd 100644
> --- a/arch/arm64/kvm/arm.c
> +++ b/arch/arm64/kvm/arm.c
> @@ -207,6 +207,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>         case KVM_CAP_SET_GUEST_DEBUG:
>         case KVM_CAP_VCPU_ATTRIBUTES:
>         case KVM_CAP_PTP_KVM:
> +       case KVM_CAP_SYSTEM_COUNTER_STATE:
>                 r = 1;
>                 break;
>         case KVM_CAP_SET_GUEST_DEBUG2:
> @@ -1273,6 +1274,30 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>
>                 return kvm_arm_vcpu_finalize(vcpu, what);
>         }
> +       case KVM_GET_SYSTEM_COUNTER_STATE: {
> +               struct kvm_system_counter_state state;
> +
> +               if (copy_from_user(&state, argp, sizeof(state)))
> +                       return -EFAULT;
> +
> +               r = kvm_arm_vcpu_get_system_counter_state(vcpu, &state);
> +               if (r)
> +                       break;
> +
> +               if (copy_to_user(argp, &state, sizeof(state)))
> +                       return -EFAULT;
> +
> +               break;
> +       }
> +       case KVM_SET_SYSTEM_COUNTER_STATE: {
> +               struct kvm_system_counter_state state;
> +
> +               if (copy_from_user(&state, argp, sizeof(state)))
> +                       return -EFAULT;
> +
> +               r = kvm_arm_vcpu_set_system_counter_state(vcpu, &state);
> +               break;
> +       }
>         default:
>                 r = -EINVAL;
>         }
> --
> 2.32.0.rc1.229.g3e70b5a671-goog
>
Marc Zyngier June 9, 2021, 10:23 a.m. UTC | #2
Hi Oliver,

Please Cc the KVM/arm64 reviewers (now added). Also, please consider
subscribing to the kvmarm mailing list so that I don't have to
manually approve your posts ;-).

On Tue, 08 Jun 2021 22:47:34 +0100,
Oliver Upton <oupton@google.com> wrote:
> 
> ARMv8 provides for a virtual counter-timer offset that is added to guest
> views of the virtual counter-timer (CNTVOFF_EL2). To date, KVM has not
> provided userspace with any perception of this, and instead affords a
> value-based scheme of migrating the virtual counter-timer by directly
> reading/writing the guest's CNTVCT_EL0. This is problematic because
> counters continue to elapse while the register is being written, meaning
> it is possible for drift to sneak in to the guest's time scale. This is
> exacerbated by the fact that KVM will calculate an appropriate
> CNTVOFF_EL2 every time the register is written, which will be broadcast
> to all virtual CPUs. The only possible way to avoid causing guest time
> to drift is to restore counter-timers by offset.

Well, the current method has one huge advantage: time can never go
backward from the guest PoV if you restore what you have saved. Yes,
time can elapse, but you don't even need to migrate to observe that.

>
> Implement initial support for KVM_{GET,SET}_SYSTEM_COUNTER_STATE ioctls
> to migrate the value of CNTVOFF_EL2. These ioctls yield precise control
> of the virtual counter-timers to userspace, allowing it to define its
> own heuristics for managing vCPU offsets.

I'm not really in favour of inventing a completely new API, for
multiple reasons:

- CNTVOFF is an EL2 concept. I'd rather not expose it as such as it
  becomes really confusing with NV (which does expose its own CNTVOFF
  via the ONE_REG interface)

- You seem to allow each vcpu to get its own offset. I don't think
  that's right. The architecture defines that all PEs have the same
  view of the counters, and an EL1 guest should be given that
  illusion.

- by having a parallel save/restore interface, you make it harder to
  reason about what happens with concurrent calls to both interfaces

- the userspace API is already horribly bloated, and I'm not overly
  keen on adding more if we can avoid it.

I'd rather you extend the current ONE_REG interface and make it modal,
either allowing the restore of an absolute value or an offset for
CNTVCT_EL0. This would also keep a consistent behaviour when restoring
vcpus. The same logic would apply to the physical offset.

As for how to make it modal, we have plenty of bits left in the
ONE_REG encoding. Pick one, and make that a "relative" attribute. This
will result in some minor surgery in the get/set code paths, but at
least no entirely new mechanism.

One question though: how do you plan to reliably compute the offset?
As far as I can see, it is subject to the same issues you described
above (while the guest is being restored, time flies), and you have
the added risk of exposing a counter going backward from a guest
perspective.

Thanks,

	M.
Oliver Upton June 9, 2021, 2:51 p.m. UTC | #3
On Wed, Jun 9, 2021 at 5:23 AM Marc Zyngier <maz@kernel.org> wrote:
>
> Hi Oliver,
>
> Please Cc the KVM/arm64 reviewers (now added). Also, please consider
> subscribing to the kvmarm mailing list so that I don't have to
> manually approve your posts ;-).

/facepalm

Thought I had done this already. Re-requested to join kvmarm@. Seems
that gmail politely decided the mailing list was spam, so no
confirmation email came through.

> On Tue, 08 Jun 2021 22:47:34 +0100,
> Oliver Upton <oupton@google.com> wrote:
> >
> > ARMv8 provides for a virtual counter-timer offset that is added to guest
> > views of the virtual counter-timer (CNTVOFF_EL2). To date, KVM has not
> > provided userspace with any perception of this, and instead affords a
> > value-based scheme of migrating the virtual counter-timer by directly
> > reading/writing the guest's CNTVCT_EL0. This is problematic because
> > counters continue to elapse while the register is being written, meaning
> > it is possible for drift to sneak in to the guest's time scale. This is
> > exacerbated by the fact that KVM will calculate an appropriate
> > CNTVOFF_EL2 every time the register is written, which will be broadcast
> > to all virtual CPUs. The only possible way to avoid causing guest time
> > to drift is to restore counter-timers by offset.
>
> Well, the current method has one huge advantage: time can never go
> backward from the guest PoV if you restore what you have saved. Yes,
> time can elapse, but you don't even need to migrate to observe that.
>
> >
> > Implement initial support for KVM_{GET,SET}_SYSTEM_COUNTER_STATE ioctls
> > to migrate the value of CNTVOFF_EL2. These ioctls yield precise control
> > of the virtual counter-timers to userspace, allowing it to define its
> > own heuristics for managing vCPU offsets.
>
> I'm not really in favour of inventing a completely new API, for
> multiple reasons:
>
> - CNTVOFF is an EL2 concept. I'd rather not expose it as such as it
>   becomes really confusing with NV (which does expose its own CNTVOFF
>   via the ONE_REG interface)

Very true. At least on x86, there's a fair bit of plumbing to handle
the KVM-owned L0 offset reg and the guest-owned L1 offset reg.

> - You seem to allow each vcpu to get its own offset. I don't think
>   that's right. The architecture defines that all PEs have the same
>   view of the counters, and an EL1 guest should be given that
>   illusion.

Agreed. I would have preferred a VM-wide ioctl to do this, but since
x86 explicitly allows for drifted TSCs that can't be the case in a
generic ioctl. I can do the same broadcasting as we do in the case of
a VMM write to CNTVCT_EL0.

> - by having a parallel save/restore interface, you make it harder to
>   reason about what happens with concurrent calls to both interfaces
>
> - the userspace API is already horribly bloated, and I'm not overly
>   keen on adding more if we can avoid it.

Pssh. My ioctl numbers aren't _too_ close to the limit ;-)

>
> I'd rather you extend the current ONE_REG interface and make it modal,
> either allowing the restore of an absolute value or an offset for
> CNTVCT_EL0. This would also keep a consistent behaviour when restoring
> vcpus. The same logic would apply to the physical offset.
>
> As for how to make it modal, we have plenty of bits left in the
> ONE_REG encoding. Pick one, and make that a "relative" attribute. This
> will result in some minor surgery in the get/set code paths, but at
> least no entirely new mechanism.

Yeah, it'd be good to do it w/o adding new plumbing. The only reason
I'd considered it is because x86 might necessitate it. Not wanting to
apply bad convention to other arches, but keeping at least a somewhat
consistent UAPI would be nice.

> One question though: how do you plan to reliably compute the offset?
> As far as I can see, it is subject to the same issues you described
> above (while the guest is being restored, time flies), and you have
> the added risk of exposing a counter going backward from a guest
> perspective.

Indeed, we do have the risk of time going backwards, but I'd say that
the VMM shares in the responsibility to provide a consistent view of
the counter too.

Here's how I envisioned it working:

Record the time, cycles, and offset (T0, C0, Off0) when saving the
counter state. Record time and cycles (T1, C1) again when trying to
restore counter state. Compute the new offset:

Off1 = Off0 - (T1-T0) * CNTFRQ - (C0 - C1).

The primary concern here is idempotence. Once Off1 is calculated, it
doesn't matter how much time elapses between the calculation and the
call into KVM, it will always produce the intended result. If instead
we restore the counters by-value (whilst trying to account for elapsed
time), my impression is that we'd do the following: Record time and
guest counter (T0, G0) when saving counter state. Record time again
when trying to restore counter state.

In userspace, compute the time elapsed and fold it into the guest counter (G1):

G1 = G0 + (T1-T0) * CNTFRQ

And then in the kernel:

CNTVOFF = G1 - CNTPCT

Any number of things can happen in between the kernel and userspace
portions of this operation, causing some drift of the VM's counter.
Fundamentally I suppose the issue we have is that we sample the host
counter twice (once in userspace for T1, and again in the kernel for
CNTPCT), when really we'd want to only do so once.

So, open to any suggestions where we avoid the issue of causing the
guest counter to drift, offsets only seemed to be the easiest thing
given that they ought to be constant for the lifetime of a VM on a
host and are the backing state used by hardware.

--
Thanks,
Oliver

>
> Thanks,
>
>         M.
>
> --
> Without deviation from the norm, progress is not possible.
Paolo Bonzini June 10, 2021, 6:26 a.m. UTC | #4
On 09/06/21 12:23, Marc Zyngier wrote:
>> Implement initial support for KVM_{GET,SET}_SYSTEM_COUNTER_STATE ioctls
>> to migrate the value of CNTVOFF_EL2. These ioctls yield precise control
>> of the virtual counter-timers to userspace, allowing it to define its
>> own heuristics for managing vCPU offsets.
> I'm not really in favour of inventing a completely new API, for
> multiple reasons:
> 
> - CNTVOFF is an EL2 concept. I'd rather not expose it as such as it
>    becomes really confusing with NV (which does expose its own CNTVOFF
>    via the ONE_REG interface)
> 
> - You seem to allow each vcpu to get its own offset. I don't think
>    that's right. The architecture defines that all PEs have the same
>    view of the counters, and an EL1 guest should be given that
>    illusion.
> 
> - by having a parallel save/restore interface, you make it harder to
>    reason about what happens with concurrent calls to both interfaces
> 
> - the userspace API is already horribly bloated, and I'm not overly
>    keen on adding more if we can avoid it.
> 
> I'd rather you extend the current ONE_REG interface and make it modal,
> either allowing the restore of an absolute value or an offset for
> CNTVCT_EL0. This would also keep a consistent behaviour when restoring
> vcpus. The same logic would apply to the physical offset.

What about using KVM_GET/SET_DEVICE_ATTR?  It makes sense that the guest 
value for nested virt goes through ONE_REG, while the host value goes 
through DEVICE_ATTR.

Paolo
Paolo Bonzini June 10, 2021, 6:54 a.m. UTC | #5
On 09/06/21 16:51, Oliver Upton wrote:
>> - You seem to allow each vcpu to get its own offset. I don't think
>>    that's right. The architecture defines that all PEs have the same
>>    view of the counters, and an EL1 guest should be given that
>>    illusion.
> Agreed. I would have preferred a VM-wide ioctl to do this, but since
> x86 explicitly allows for drifted TSCs that can't be the case in a
> generic ioctl. I can do the same broadcasting as we do in the case of
> a VMM write to CNTVCT_EL0.
> 

If you use VM-wide GET/SET_DEVICE_ATTR, please make it retrieve the host 
CLOCK_REALTIME and counter at the same time.

Paolo
diff mbox series

Patch

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 7cd7d5c8c4bc..31107d5e61af 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -781,4 +781,9 @@  void __init kvm_hyp_reserve(void);
 static inline void kvm_hyp_reserve(void) { }
 #endif
 
+int kvm_arm_vcpu_get_system_counter_state(struct kvm_vcpu *vcpu,
+					  struct kvm_system_counter_state *state);
+int kvm_arm_vcpu_set_system_counter_state(struct kvm_vcpu *vcpu,
+					  struct kvm_system_counter_state *state);
+
 #endif /* __ARM64_KVM_HOST_H__ */
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index 24223adae150..d3987089c524 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -184,6 +184,16 @@  struct kvm_vcpu_events {
 	__u32 reserved[12];
 };
 
+/* for KVM_{GET,SET}_SYSTEM_COUNTER_STATE */
+struct kvm_system_counter_state {
+	/* indicates what fields are valid in the structure */
+	__u32 flags;
+	__u32 pad;
+	/* guest counter-timer offset, relative to host cntpct_el0 */
+	__u64 cntvoff;
+	__u64 rsvd[7];
+};
+
 /* If you need to interpret the index values, here is the key: */
 #define KVM_REG_ARM_COPROC_MASK		0x000000000FFF0000
 #define KVM_REG_ARM_COPROC_SHIFT	16
diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c
index 74e0699661e9..955a7a183362 100644
--- a/arch/arm64/kvm/arch_timer.c
+++ b/arch/arm64/kvm/arch_timer.c
@@ -1259,3 +1259,25 @@  int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
 
 	return -ENXIO;
 }
+
+int kvm_arm_vcpu_get_system_counter_state(struct kvm_vcpu *vcpu,
+					  struct kvm_system_counter_state *state)
+{
+	if (state->flags)
+		return -EINVAL;
+
+	state->cntvoff = timer_get_offset(vcpu_vtimer(vcpu));
+
+	return 0;
+}
+
+int kvm_arm_vcpu_set_system_counter_state(struct kvm_vcpu *vcpu,
+					  struct kvm_system_counter_state *state)
+{
+	if (state->flags)
+		return -EINVAL;
+
+	timer_set_offset(vcpu_vtimer(vcpu), state->cntvoff);
+
+	return 0;
+}
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 1126eae27400..b78ffb4db9dd 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -207,6 +207,7 @@  int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_SET_GUEST_DEBUG:
 	case KVM_CAP_VCPU_ATTRIBUTES:
 	case KVM_CAP_PTP_KVM:
+	case KVM_CAP_SYSTEM_COUNTER_STATE:
 		r = 1;
 		break;
 	case KVM_CAP_SET_GUEST_DEBUG2:
@@ -1273,6 +1274,30 @@  long kvm_arch_vcpu_ioctl(struct file *filp,
 
 		return kvm_arm_vcpu_finalize(vcpu, what);
 	}
+	case KVM_GET_SYSTEM_COUNTER_STATE: {
+		struct kvm_system_counter_state state;
+
+		if (copy_from_user(&state, argp, sizeof(state)))
+			return -EFAULT;
+
+		r = kvm_arm_vcpu_get_system_counter_state(vcpu, &state);
+		if (r)
+			break;
+
+		if (copy_to_user(argp, &state, sizeof(state)))
+			return -EFAULT;
+
+		break;
+	}
+	case KVM_SET_SYSTEM_COUNTER_STATE: {
+		struct kvm_system_counter_state state;
+
+		if (copy_from_user(&state, argp, sizeof(state)))
+			return -EFAULT;
+
+		r = kvm_arm_vcpu_set_system_counter_state(vcpu, &state);
+		break;
+	}
 	default:
 		r = -EINVAL;
 	}