diff mbox series

[08/16] KVM: arm64: timers: Allow userspace to set the counter offsets

Message ID 20230216142123.2638675-9-maz@kernel.org (mailing list archive)
State New, archived
Headers show
Series KVM: arm64: Rework timer offsetting for fun and profit | expand

Commit Message

Marc Zyngier Feb. 16, 2023, 2:21 p.m. UTC
And this is the moment you have all been waiting for: setting the
counter offsets from userspace.

We expose a brand new capability that reports the ability to set
the offsets for both the virtual and physical sides, independently.

In keeping with the architecture, the offsets are expressed as
a delta that is substracted from the physical counter value.

Once this new API is used, there is no going back, and the counters
cannot be written to to set the offsets implicitly (the writes
are instead ignored).

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/include/asm/kvm_host.h |  4 +++
 arch/arm64/include/uapi/asm/kvm.h | 15 ++++++++++
 arch/arm64/kvm/arch_timer.c       | 46 +++++++++++++++++++++++++++----
 arch/arm64/kvm/arm.c              |  8 ++++++
 include/uapi/linux/kvm.h          |  3 ++
 5 files changed, 71 insertions(+), 5 deletions(-)

Comments

Oliver Upton Feb. 16, 2023, 10:09 p.m. UTC | #1
Hi Marc,

On Thu, Feb 16, 2023 at 02:21:15PM +0000, Marc Zyngier wrote:
> And this is the moment you have all been waiting for: setting the
> counter offsets from userspace.
> 
> We expose a brand new capability that reports the ability to set
> the offsets for both the virtual and physical sides, independently.
> 
> In keeping with the architecture, the offsets are expressed as
> a delta that is substracted from the physical counter value.
> 
> Once this new API is used, there is no going back, and the counters
> cannot be written to to set the offsets implicitly (the writes
> are instead ignored).

Is there any particular reason to use an explicit ioctl as opposed to
the KVM_{GET,SET}_DEVICE_ATTR ioctls? Dunno where you stand on it, but I
quite like that interface for simple state management. We also avoid
eating up more UAPI bits in the global namespace.

You could also split up the two offsets as individual attributes, but I
really couldn't care less. Its userspace's problem after all!

And on that note, any VMM patches to match this implementation? kvmtool
would suffice, just want something that runs a real VM and pokes these
ioctls.

> Signed-off-by: Marc Zyngier <maz@kernel.org>
> ---
>  arch/arm64/include/asm/kvm_host.h |  4 +++
>  arch/arm64/include/uapi/asm/kvm.h | 15 ++++++++++
>  arch/arm64/kvm/arch_timer.c       | 46 +++++++++++++++++++++++++++----
>  arch/arm64/kvm/arm.c              |  8 ++++++
>  include/uapi/linux/kvm.h          |  3 ++
>  5 files changed, 71 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index 3adac0c5e175..8514a37cf8d5 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -221,6 +221,8 @@ struct kvm_arch {
>  #define KVM_ARCH_FLAG_EL1_32BIT				4
>  	/* PSCI SYSTEM_SUSPEND enabled for the guest */
>  #define KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED		5
> +	/* VM counter offsets */
> +#define KVM_ARCH_FLAG_COUNTER_OFFSETS			6

nit: I'm not too bothered by the test_bit(...) mouthful when its used
sparingly but it may be nice to have a helper if it is repeated enough
times.

>  	unsigned long flags;
>  
> @@ -1010,6 +1012,8 @@ int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
>  
>  long kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
>  				struct kvm_arm_copy_mte_tags *copy_tags);
> +int kvm_vm_ioctl_set_counter_offsets(struct kvm *kvm,
> +				     struct kvm_arm_counter_offsets *offsets);
>  
>  /* Guest/host FPSIMD coordination helpers */
>  int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu);
> diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
> index f8129c624b07..2d7557a160bd 100644
> --- a/arch/arm64/include/uapi/asm/kvm.h
> +++ b/arch/arm64/include/uapi/asm/kvm.h
> @@ -198,6 +198,21 @@ struct kvm_arm_copy_mte_tags {
>  	__u64 reserved[2];
>  };
>  
> +/*
> + * Counter/Timer offset structure. Describe the virtual/physical offsets.
> + * To be used with KVM_ARM_SET_CNT_OFFSETS.
> + */
> +struct kvm_arm_counter_offsets {
> +	__u64 virtual_offset;
> +	__u64 physical_offset;
> +
> +#define KVM_COUNTER_SET_VOFFSET_FLAG	(1UL << 0)
> +#define KVM_COUNTER_SET_POFFSET_FLAG	(1UL << 1)
> +
> +	__u64 flags;
> +	__u64 reserved;
> +};
> +
>  #define KVM_ARM_TAGS_TO_GUEST		0
>  #define KVM_ARM_TAGS_FROM_GUEST		1
>  
> diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c
> index 444ea6dca218..b04544b702f9 100644
> --- a/arch/arm64/kvm/arch_timer.c
> +++ b/arch/arm64/kvm/arch_timer.c
> @@ -852,9 +852,11 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
>  	ptimer->vcpu = vcpu;
>  	ptimer->offset.vm_offset = &vcpu->kvm->arch.offsets.poffset;
>  
> -	/* Synchronize cntvoff across all vtimers of a VM. */
> -	timer_set_offset(vtimer, kvm_phys_timer_read());
> -	timer_set_offset(ptimer, 0);
> +	/* Synchronize offsets across timers of a VM if not already provided */
> +	if (!test_bit(KVM_ARCH_FLAG_COUNTER_OFFSETS, &vcpu->kvm->arch.flags)) {
> +		timer_set_offset(vtimer, kvm_phys_timer_read());
> +		timer_set_offset(ptimer, 0);
> +	}
>  
>  	hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
>  	timer->bg_timer.function = kvm_bg_timer_expire;
> @@ -898,8 +900,11 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
>  		kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value);
>  		break;
>  	case KVM_REG_ARM_TIMER_CNT:
> -		timer = vcpu_vtimer(vcpu);
> -		timer_set_offset(timer, kvm_phys_timer_read() - value);
> +		if (!test_bit(KVM_ARCH_FLAG_COUNTER_OFFSETS,
> +			      &vcpu->kvm->arch.flags)) {
> +			timer = vcpu_vtimer(vcpu);
> +			timer_set_offset(timer, kvm_phys_timer_read() - value);
> +		}

If we've already got userspace to buy in to the new UAPI, should we
return an error instead of silently failing? Might be good to catch
misbehavior in the act, even if it is benign as this patch is written.

>  		break;
>  	case KVM_REG_ARM_TIMER_CVAL:
>  		timer = vcpu_vtimer(vcpu);
> @@ -909,6 +914,13 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
>  		timer = vcpu_ptimer(vcpu);
>  		kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value);
>  		break;
> +	case KVM_REG_ARM_PTIMER_CNT:
> +		if (!test_bit(KVM_ARCH_FLAG_COUNTER_OFFSETS,
> +			      &vcpu->kvm->arch.flags)) {
> +			timer = vcpu_ptimer(vcpu);
> +			timer_set_offset(timer, kvm_phys_timer_read() - value);
> +		}
> +		break;
>  	case KVM_REG_ARM_PTIMER_CVAL:
>  		timer = vcpu_ptimer(vcpu);
>  		kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value);
> @@ -1446,3 +1458,27 @@ int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
>  
>  	return -ENXIO;
>  }
> +
> +int kvm_vm_ioctl_set_counter_offsets(struct kvm *kvm,
> +				     struct kvm_arm_counter_offsets *offsets)
> +{
> +	if (offsets->reserved ||
> +	    (offsets->flags & ~(KVM_COUNTER_SET_VOFFSET_FLAG |
> +				KVM_COUNTER_SET_POFFSET_FLAG)))
> +		return -EINVAL;
> +
> +	if (!lock_all_vcpus(kvm))
> +		return -EBUSY;

Is there any reason why we can't just order this ioctl before vCPU
creation altogether, or is there a need to do this at runtime? We're
about to tolerate multiple writers to the offset value, and I think the
only thing we need to guarantee is that the below flag is set before
vCPU ioctls have a chance to run.

Actually, the one benefit of enforcing a strict ordering is that you can
hide the old register indices completely from KVM_GET_REG_LIST to
provide further discouragement to userspace.

Otherwise, if you choose to keep it this way then the requirement to
have no vCPU ioctls in flight needs to be explicitly documented as that
is a bit of a tricky interface.

> +	set_bit(KVM_ARCH_FLAG_COUNTER_OFFSETS, &kvm->arch.flags);
> +
> +	if (offsets->flags & KVM_COUNTER_SET_VOFFSET_FLAG)
> +		kvm->arch.offsets.voffset = offsets->virtual_offset;
> +
> +	if (offsets->flags & KVM_COUNTER_SET_POFFSET_FLAG)
> +		kvm->arch.offsets.poffset = offsets->physical_offset;
> +
> +	unlock_all_vcpus(kvm);
> +
> +	return 0;
> +}
Marc Zyngier Feb. 17, 2023, 10:17 a.m. UTC | #2
Hi Oliver,

On Thu, 16 Feb 2023 22:09:47 +0000,
Oliver Upton <oliver.upton@linux.dev> wrote:
> 
> Hi Marc,
> 
> On Thu, Feb 16, 2023 at 02:21:15PM +0000, Marc Zyngier wrote:
> > And this is the moment you have all been waiting for: setting the
> > counter offsets from userspace.
> > 
> > We expose a brand new capability that reports the ability to set
> > the offsets for both the virtual and physical sides, independently.
> > 
> > In keeping with the architecture, the offsets are expressed as
> > a delta that is substracted from the physical counter value.
> > 
> > Once this new API is used, there is no going back, and the counters
> > cannot be written to to set the offsets implicitly (the writes
> > are instead ignored).
> 
> Is there any particular reason to use an explicit ioctl as opposed to
> the KVM_{GET,SET}_DEVICE_ATTR ioctls? Dunno where you stand on it, but I
> quite like that interface for simple state management. We also avoid
> eating up more UAPI bits in the global namespace.

The problem with that is that it requires yet another KVM device for
this, and I'm lazy. It also makes it a bit harder for the VMM to buy
into this (need to track another FD, for example).

> You could also split up the two offsets as individual attributes, but I
> really couldn't care less. Its userspace's problem after all!

The current approach also allows you to deal with the two offsets
individually (you can update one or both as you see fit).

> 
> And on that note, any VMM patches to match this implementation? kvmtool
> would suffice, just want something that runs a real VM and pokes these
> ioctls.

I did hack kvmtool for that, but that's very uninteresting, as it
doesn't do any save/restore. I want to have a go at QEMU in my copious
spare time, stay tuned.

> 
> > Signed-off-by: Marc Zyngier <maz@kernel.org>
> > ---
> >  arch/arm64/include/asm/kvm_host.h |  4 +++
> >  arch/arm64/include/uapi/asm/kvm.h | 15 ++++++++++
> >  arch/arm64/kvm/arch_timer.c       | 46 +++++++++++++++++++++++++++----
> >  arch/arm64/kvm/arm.c              |  8 ++++++
> >  include/uapi/linux/kvm.h          |  3 ++
> >  5 files changed, 71 insertions(+), 5 deletions(-)
> > 
> > diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> > index 3adac0c5e175..8514a37cf8d5 100644
> > --- a/arch/arm64/include/asm/kvm_host.h
> > +++ b/arch/arm64/include/asm/kvm_host.h
> > @@ -221,6 +221,8 @@ struct kvm_arch {
> >  #define KVM_ARCH_FLAG_EL1_32BIT				4
> >  	/* PSCI SYSTEM_SUSPEND enabled for the guest */
> >  #define KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED		5
> > +	/* VM counter offsets */
> > +#define KVM_ARCH_FLAG_COUNTER_OFFSETS			6
> 
> nit: I'm not too bothered by the test_bit(...) mouthful when its used
> sparingly but it may be nice to have a helper if it is repeated enough
> times.

3 times. I'll slap a helper in, can't hurt.

> 
> >  	unsigned long flags;
> >  
> > @@ -1010,6 +1012,8 @@ int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
> >  
> >  long kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
> >  				struct kvm_arm_copy_mte_tags *copy_tags);
> > +int kvm_vm_ioctl_set_counter_offsets(struct kvm *kvm,
> > +				     struct kvm_arm_counter_offsets *offsets);
> >  
> >  /* Guest/host FPSIMD coordination helpers */
> >  int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu);
> > diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
> > index f8129c624b07..2d7557a160bd 100644
> > --- a/arch/arm64/include/uapi/asm/kvm.h
> > +++ b/arch/arm64/include/uapi/asm/kvm.h
> > @@ -198,6 +198,21 @@ struct kvm_arm_copy_mte_tags {
> >  	__u64 reserved[2];
> >  };
> >  
> > +/*
> > + * Counter/Timer offset structure. Describe the virtual/physical offsets.
> > + * To be used with KVM_ARM_SET_CNT_OFFSETS.
> > + */
> > +struct kvm_arm_counter_offsets {
> > +	__u64 virtual_offset;
> > +	__u64 physical_offset;
> > +
> > +#define KVM_COUNTER_SET_VOFFSET_FLAG	(1UL << 0)
> > +#define KVM_COUNTER_SET_POFFSET_FLAG	(1UL << 1)
> > +
> > +	__u64 flags;
> > +	__u64 reserved;
> > +};
> > +
> >  #define KVM_ARM_TAGS_TO_GUEST		0
> >  #define KVM_ARM_TAGS_FROM_GUEST		1
> >  
> > diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c
> > index 444ea6dca218..b04544b702f9 100644
> > --- a/arch/arm64/kvm/arch_timer.c
> > +++ b/arch/arm64/kvm/arch_timer.c
> > @@ -852,9 +852,11 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
> >  	ptimer->vcpu = vcpu;
> >  	ptimer->offset.vm_offset = &vcpu->kvm->arch.offsets.poffset;
> >  
> > -	/* Synchronize cntvoff across all vtimers of a VM. */
> > -	timer_set_offset(vtimer, kvm_phys_timer_read());
> > -	timer_set_offset(ptimer, 0);
> > +	/* Synchronize offsets across timers of a VM if not already provided */
> > +	if (!test_bit(KVM_ARCH_FLAG_COUNTER_OFFSETS, &vcpu->kvm->arch.flags)) {
> > +		timer_set_offset(vtimer, kvm_phys_timer_read());
> > +		timer_set_offset(ptimer, 0);
> > +	}
> >  
> >  	hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
> >  	timer->bg_timer.function = kvm_bg_timer_expire;
> > @@ -898,8 +900,11 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
> >  		kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value);
> >  		break;
> >  	case KVM_REG_ARM_TIMER_CNT:
> > -		timer = vcpu_vtimer(vcpu);
> > -		timer_set_offset(timer, kvm_phys_timer_read() - value);
> > +		if (!test_bit(KVM_ARCH_FLAG_COUNTER_OFFSETS,
> > +			      &vcpu->kvm->arch.flags)) {
> > +			timer = vcpu_vtimer(vcpu);
> > +			timer_set_offset(timer, kvm_phys_timer_read() - value);
> > +		}
> 
> If we've already got userspace to buy in to the new UAPI, should we
> return an error instead of silently failing? Might be good to catch
> misbehavior in the act, even if it is benign as this patch is
> written.

I don't necessarily see it as a misbehaviour. I'm trying to make it
cheap for people to integrate this into their flow, and no two VMM do
that the same way (plus, I only have QEMU as an example -- I'm sure
Amazon, Google, and whoever do that in very different ways).

It is also pretty annoying, as the VMM now has to filter the registers
it restores. This is extra work. Instead, the VMM knows that having
used the new API, the registers become WI.

> 
> >  		break;
> >  	case KVM_REG_ARM_TIMER_CVAL:
> >  		timer = vcpu_vtimer(vcpu);
> > @@ -909,6 +914,13 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
> >  		timer = vcpu_ptimer(vcpu);
> >  		kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value);
> >  		break;
> > +	case KVM_REG_ARM_PTIMER_CNT:
> > +		if (!test_bit(KVM_ARCH_FLAG_COUNTER_OFFSETS,
> > +			      &vcpu->kvm->arch.flags)) {
> > +			timer = vcpu_ptimer(vcpu);
> > +			timer_set_offset(timer, kvm_phys_timer_read() - value);
> > +		}
> > +		break;
> >  	case KVM_REG_ARM_PTIMER_CVAL:
> >  		timer = vcpu_ptimer(vcpu);
> >  		kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value);
> > @@ -1446,3 +1458,27 @@ int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
> >  
> >  	return -ENXIO;
> >  }
> > +
> > +int kvm_vm_ioctl_set_counter_offsets(struct kvm *kvm,
> > +				     struct kvm_arm_counter_offsets *offsets)
> > +{
> > +	if (offsets->reserved ||
> > +	    (offsets->flags & ~(KVM_COUNTER_SET_VOFFSET_FLAG |
> > +				KVM_COUNTER_SET_POFFSET_FLAG)))
> > +		return -EINVAL;
> > +
> > +	if (!lock_all_vcpus(kvm))
> > +		return -EBUSY;
> 
> Is there any reason why we can't just order this ioctl before vCPU
> creation altogether, or is there a need to do this at runtime? We're
> about to tolerate multiple writers to the offset value, and I think the
> only thing we need to guarantee is that the below flag is set before
> vCPU ioctls have a chance to run.

Again, we don't know for sure whether the final offset is available
before vcpu creation time. My idea for QEMU would be to perform the
offset adjustment as late as possible, right before executing the VM,
after having restored the vcpus with whatever value they had.

> Actually, the one benefit of enforcing a strict ordering is that you can
> hide the old register indices completely from KVM_GET_REG_LIST to
> provide further discouragement to userspace.

But if you do that, how do you know what offset to program on the
target? You need the VM's view of time. You could compute it from the
physical counter and the offset locally, but you now need to transfer
it across. That's a change in the wire protocol, which is far more
than I'm prepared to bite.

> Otherwise, if you choose to keep it this way then the requirement to
> have no vCPU ioctls in flight needs to be explicitly documented as that
> is a bit of a tricky interface.

Ah, good point. Completely missed that.

Thanks,

	M.
Oliver Upton Feb. 17, 2023, 10:11 p.m. UTC | #3
On Fri, Feb 17, 2023 at 10:17:27AM +0000, Marc Zyngier wrote:
> Hi Oliver,
> 
> On Thu, 16 Feb 2023 22:09:47 +0000,
> Oliver Upton <oliver.upton@linux.dev> wrote:
> > 
> > Hi Marc,
> > 
> > On Thu, Feb 16, 2023 at 02:21:15PM +0000, Marc Zyngier wrote:
> > > And this is the moment you have all been waiting for: setting the
> > > counter offsets from userspace.
> > > 
> > > We expose a brand new capability that reports the ability to set
> > > the offsets for both the virtual and physical sides, independently.
> > > 
> > > In keeping with the architecture, the offsets are expressed as
> > > a delta that is substracted from the physical counter value.
> > > 
> > > Once this new API is used, there is no going back, and the counters
> > > cannot be written to to set the offsets implicitly (the writes
> > > are instead ignored).
> > 
> > Is there any particular reason to use an explicit ioctl as opposed to
> > the KVM_{GET,SET}_DEVICE_ATTR ioctls? Dunno where you stand on it, but I
> > quite like that interface for simple state management. We also avoid
> > eating up more UAPI bits in the global namespace.
> 
> The problem with that is that it requires yet another KVM device for
> this, and I'm lazy. It also makes it a bit harder for the VMM to buy
> into this (need to track another FD, for example).

You can also accept the device ioctls on the actual VM FD, quite like
we do for the vCPU right now. And hey, I've got a patch that gets you
most of the way there!

https://lore.kernel.org/kvmarm/20230211013759.3556016-3-oliver.upton@linux.dev/

> > Is there any reason why we can't just order this ioctl before vCPU
> > creation altogether, or is there a need to do this at runtime? We're
> > about to tolerate multiple writers to the offset value, and I think the
> > only thing we need to guarantee is that the below flag is set before
> > vCPU ioctls have a chance to run.
> 
> Again, we don't know for sure whether the final offset is available
> before vcpu creation time. My idea for QEMU would be to perform the
> offset adjustment as late as possible, right before executing the VM,
> after having restored the vcpus with whatever value they had.

So how does userspace work out an offset based on available information?
The part that hasn't clicked for me yet is where userspace gets the
current value of the true physical counter to calculate an offset.

We could make it ABI that the guest's physical counter matches that of
the host by default. Of course, that has been the case since the
beginning of time but it is now directly user-visible.

The only part I don't like about that is that we aren't fully creating
an abstraction around host and guest system time. So here's my current
mental model of how we represent the generic timer to userspace:

				+-----------------------+
				|	   		|
				| Host System Counter	|
				|	   (1) 		|
				+-----------------------+
				    	   |
			       +-----------+-----------+
			       |		       |
       +-----------------+  +-----+		    +-----+  +--------------------+
       | (2) CNTPOFF_EL2 |--| sub |		    | sub |--| (3) CNTVOFF_EL2    |
       +-----------------+  +-----+	     	    +-----+  +--------------------+
			       |           	       |
			       |		       |
		     +-----------------+	 +----------------+
		     | (5) CNTPCT_EL0  |         | (4) CNTVCT_EL0 |
		     +-----------------+	 +----------------+

AFAICT, this UAPI exposes abstractions for (2) and (3) to userspace, but
userspace cannot directly get at (1).

Chewing on this a bit more, I don't think userspace has any business
messing with virtual and physical time independently, especially when
nested virtualization comes into play.

I think the illusion to userspace needs to be built around the notion of
a system counter:

                                +-----------------------+
                                |                       |
                                | Host System Counter   |
                                |          (1)          |
                                +-----------------------+
					   |
					   |
					+-----+   +-------------------+
					| sub |---| (6) system_offset |
					+-----+   +-------------------+
					   |
					   |
                                +-----------------------+
                                |                       |
                                | Guest System Counter  |
                                |          (7)          |
                                +-----------------------+
                                           |
                               +-----------+-----------+
                               |                       |
       +-----------------+  +-----+                 +-----+  +--------------------+
       | (2) CNTPOFF_EL2 |--| sub |                 | sub |--| (3) CNTVOFF_EL2    |
       +-----------------+  +-----+                 +-----+  +--------------------+
                               |                       |
                               |                       |
                     +-----------------+         +----------------+
                     | (5) CNTPCT_EL0  |         | (4) CNTVCT_EL0 |
                     +-----------------+         +----------------+

And from a UAPI perspective, we would either expose (1) and (6) to let
userspace calculate an offset or simply allow (7) to be directly
read/written.

That frees up the meaning of the counter offsets as being purely a
virtual EL2 thing. These registers would reset to 0, and non-NV guests
could never change their value.

Under the hood KVM would program the true offset registers as:

	CNT{P,V}OFF_EL2 = 'virtual CNT{P,V}OFF_EL2' + system_offset

With this we would effectively configure CNTPCT = CNTVCT = 0 at the
point of VM creation. Only crappy thing is it requires full physical
counter/timer emulation for non-ECV systems, but the guest shouldn't be
using the physical counter in the first place.

Yes, this sucks for guests running on hosts w/ NV but not ECV. If anyone
can tell me how an L0 hypervisor is supposed to do NV without ECV, I'm
all ears.

Does any of what I've written make remote sense or have I gone entirely
off the rails with my ASCII art? :)
Marc Zyngier Feb. 22, 2023, 11:56 a.m. UTC | #4
On Fri, 17 Feb 2023 22:11:36 +0000,
Oliver Upton <oliver.upton@linux.dev> wrote:
> 
> On Fri, Feb 17, 2023 at 10:17:27AM +0000, Marc Zyngier wrote:
> > Hi Oliver,
> > 
> > On Thu, 16 Feb 2023 22:09:47 +0000,
> > Oliver Upton <oliver.upton@linux.dev> wrote:
> > > 
> > > Hi Marc,
> > > 
> > > On Thu, Feb 16, 2023 at 02:21:15PM +0000, Marc Zyngier wrote:
> > > > And this is the moment you have all been waiting for: setting the
> > > > counter offsets from userspace.
> > > > 
> > > > We expose a brand new capability that reports the ability to set
> > > > the offsets for both the virtual and physical sides, independently.
> > > > 
> > > > In keeping with the architecture, the offsets are expressed as
> > > > a delta that is substracted from the physical counter value.
> > > > 
> > > > Once this new API is used, there is no going back, and the counters
> > > > cannot be written to to set the offsets implicitly (the writes
> > > > are instead ignored).
> > > 
> > > Is there any particular reason to use an explicit ioctl as opposed to
> > > the KVM_{GET,SET}_DEVICE_ATTR ioctls? Dunno where you stand on it, but I
> > > quite like that interface for simple state management. We also avoid
> > > eating up more UAPI bits in the global namespace.
> > 
> > The problem with that is that it requires yet another KVM device for
> > this, and I'm lazy. It also makes it a bit harder for the VMM to buy
> > into this (need to track another FD, for example).
> 
> You can also accept the device ioctls on the actual VM FD, quite like
> we do for the vCPU right now. And hey, I've got a patch that gets you
> most of the way there!
> 
> https://lore.kernel.org/kvmarm/20230211013759.3556016-3-oliver.upton@linux.dev/

Huh... I don't know yet if I love it or hate it.At the end of the day,
this is just another ioctl, so I don't care either way.

> > > Is there any reason why we can't just order this ioctl before vCPU
> > > creation altogether, or is there a need to do this at runtime? We're
> > > about to tolerate multiple writers to the offset value, and I think the
> > > only thing we need to guarantee is that the below flag is set before
> > > vCPU ioctls have a chance to run.
> > 
> > Again, we don't know for sure whether the final offset is available
> > before vcpu creation time. My idea for QEMU would be to perform the
> > offset adjustment as late as possible, right before executing the VM,
> > after having restored the vcpus with whatever value they had.
> 
> So how does userspace work out an offset based on available information?
> The part that hasn't clicked for me yet is where userspace gets the
> current value of the true physical counter to calculate an offset.

What's wrong with CNTVCT_EL0?

> We could make it ABI that the guest's physical counter matches that of
> the host by default. Of course, that has been the case since the
> beginning of time but it is now directly user-visible.
> 
> The only part I don't like about that is that we aren't fully creating
> an abstraction around host and guest system time. So here's my current
> mental model of how we represent the generic timer to userspace:
> 
> 				+-----------------------+
> 				|	   		|
> 				| Host System Counter	|
> 				|	   (1) 		|
> 				+-----------------------+
> 				    	   |
> 			       +-----------+-----------+
> 			       |		       |
>        +-----------------+  +-----+		    +-----+  +--------------------+
>        | (2) CNTPOFF_EL2 |--| sub |		    | sub |--| (3) CNTVOFF_EL2    |
>        +-----------------+  +-----+	     	    +-----+  +--------------------+
> 			       |           	       |
> 			       |		       |
> 		     +-----------------+	 +----------------+
> 		     | (5) CNTPCT_EL0  |         | (4) CNTVCT_EL0 |
> 		     +-----------------+	 +----------------+
> 
> AFAICT, this UAPI exposes abstractions for (2) and (3) to userspace, but
> userspace cannot directly get at (1).

Of course it can! CNTVCT_EL0 is accessible from userspace, and is
guaranteed to have an offset of 0 on a host.

> 
> Chewing on this a bit more, I don't think userspace has any business
> messing with virtual and physical time independently, especially when
> nested virtualization comes into play.

Well, NV already ignores the virtual offset completely (see how the
virtual timer gets its offset reassigned at reset time).

> 
> I think the illusion to userspace needs to be built around the notion of
> a system counter:
> 
>                                 +-----------------------+
>                                 |                       |
>                                 | Host System Counter   |
>                                 |          (1)          |
>                                 +-----------------------+
> 					   |
> 					   |
> 					+-----+   +-------------------+
> 					| sub |---| (6) system_offset |
> 					+-----+   +-------------------+
> 					   |
> 					   |
>                                 +-----------------------+
>                                 |                       |
>                                 | Guest System Counter  |
>                                 |          (7)          |
>                                 +-----------------------+
>                                            |
>                                +-----------+-----------+
>                                |                       |
>        +-----------------+  +-----+                 +-----+  +--------------------+
>        | (2) CNTPOFF_EL2 |--| sub |                 | sub |--| (3) CNTVOFF_EL2    |
>        +-----------------+  +-----+                 +-----+  +--------------------+
>                                |                       |
>                                |                       |
>                      +-----------------+         +----------------+
>                      | (5) CNTPCT_EL0  |         | (4) CNTVCT_EL0 |
>                      +-----------------+         +----------------+
> 
> And from a UAPI perspective, we would either expose (1) and (6) to let
> userspace calculate an offset or simply allow (7) to be directly
> read/written.

I previously toyed with this idea, and I really like it. However, the
problem with this is that it breaks the current behaviour of having
two different values for CNTVCT and CNTPCT in the guest, and CNTPCT
representing the counter value on the host.

Such a VM cannot be migrated *today*, but not everybody cares about
migration. My "dual offset" approach allows the current behaviour to
persist, and such a VM to be migrated. The luser even gets the choice
of preserving counter continuity in the guest or to stay without a
physical offset and reflect the host's counter.

Is it a good behaviour? Of course not. Does anyone depend on it? I
have no idea, but odds are that someone does. Can we break their toys?
The jury is still out.

> 
> That frees up the meaning of the counter offsets as being purely a
> virtual EL2 thing. These registers would reset to 0, and non-NV guests
> could never change their value.
> 
> Under the hood KVM would program the true offset registers as:
> 
> 	CNT{P,V}OFF_EL2 = 'virtual CNT{P,V}OFF_EL2' + system_offset
> 
> With this we would effectively configure CNTPCT = CNTVCT = 0 at the
> point of VM creation. Only crappy thing is it requires full physical
> counter/timer emulation for non-ECV systems, but the guest shouldn't be
> using the physical counter in the first place.

And I think that's the point where we differ. I can completely imagine
some in-VM code using the physical counter to export some timestamping
to the host (for tracing purposes, amongst other things).

> Yes, this sucks for guests running on hosts w/ NV but not ECV. If anyone
> can tell me how an L0 hypervisor is supposed to do NV without ECV, I'm
> all ears.

You absolutely can run with NV2 without ECV. You just get a bad
quality of emulation for the EL0 timers. But that's about it.

> Does any of what I've written make remote sense or have I gone entirely
> off the rails with my ASCII art? :)

Your ASCII art is beautiful, only a tad too wide! ;-) What you suggest
makes a lot of sense, but it leaves existing behaviours in the lurch.
Can we pretend they don't exist? You tell me!

Thanks,

	M.
Oliver Upton Feb. 22, 2023, 4:34 p.m. UTC | #5
On Wed, Feb 22, 2023 at 11:56:53AM +0000, Marc Zyngier wrote:

[...]

> > AFAICT, this UAPI exposes abstractions for (2) and (3) to userspace, but
> > userspace cannot directly get at (1).
> 
> Of course it can! CNTVCT_EL0 is accessible from userspace, and is
> guaranteed to have an offset of 0 on a host.

Derp, yes :)

> > 
> > Chewing on this a bit more, I don't think userspace has any business
> > messing with virtual and physical time independently, especially when
> > nested virtualization comes into play.
> 
> Well, NV already ignores the virtual offset completely (see how the
> virtual timer gets its offset reassigned at reset time).

I'll need to have a look at that, but if we need to ignore user input on
the shiny new interface for NV then I really do wonder if it is the
right fit.

> I previously toyed with this idea, and I really like it. However, the
> problem with this is that it breaks the current behaviour of having
> two different values for CNTVCT and CNTPCT in the guest, and CNTPCT
> representing the counter value on the host.
> 
> Such a VM cannot be migrated *today*, but not everybody cares about
> migration. My "dual offset" approach allows the current behaviour to
> persist, and such a VM to be migrated. The luser even gets the choice
> of preserving counter continuity in the guest or to stay without a
> physical offset and reflect the host's counter.
> 
> Is it a good behaviour? Of course not. Does anyone depend on it? I
> have no idea, but odds are that someone does. Can we break their toys?
> The jury is still out.

Well, I think the new interface presents an opportunity to change the
rules around counter migration, and the illusion of two distinct offsets
for physical / virtual counters will need to be broken soon enough for
NV. Do we need to bend over backwards for a theoretical use case with
the new UAPI? If anyone depends on the existing behavior then they can
continue to use the old UAPI to partially migrate the guest counters.

My previous suggestion of tying the physical and virtual counters
together at VM creation would definitely break such a use case, though,
so we'd be at the point of requiring explicit opt-in from userspace.

> > 
> > That frees up the meaning of the counter offsets as being purely a
> > virtual EL2 thing. These registers would reset to 0, and non-NV guests
> > could never change their value.
> > 
> > Under the hood KVM would program the true offset registers as:
> > 
> > 	CNT{P,V}OFF_EL2 = 'virtual CNT{P,V}OFF_EL2' + system_offset
> > 
> > With this we would effectively configure CNTPCT = CNTVCT = 0 at the
> > point of VM creation. Only crappy thing is it requires full physical
> > counter/timer emulation for non-ECV systems, but the guest shouldn't be
> > using the physical counter in the first place.
> 
> And I think that's the point where we differ. I can completely imagine
> some in-VM code using the physical counter to export some timestamping
> to the host (for tracing purposes, amongst other things).

So in this case the guest and userspace would already be in cahoots, so
userspace could choose to not use UAPI. Hell, if userspace did
absolutely nothing then it all continues to work.

> > Yes, this sucks for guests running on hosts w/ NV but not ECV. If anyone
> > can tell me how an L0 hypervisor is supposed to do NV without ECV, I'm
> > all ears.
> 
> You absolutely can run with NV2 without ECV. You just get a bad
> quality of emulation for the EL0 timers. But that's about it.a

'do NV well', I should've said :)

> > Does any of what I've written make remote sense or have I gone entirely
> > off the rails with my ASCII art? :)
> 
> Your ASCII art is beautiful, only a tad too wide! ;-) What you suggest
> makes a lot of sense, but it leaves existing behaviours in the lurch.
> Can we pretend they don't exist? You tell me!

Oh, we're definitely on the hook for any existing misuse of observable
KVM behavior. I just think if we're at the point of adding new UAPI we
may as well lay down some new rules with userspace to avoid surprises.

OTOH, ignoring the virtual offset for NV is another way out of the mess,
but it just bothers me we're about to ignore input on a brand new
UAPI...
Marc Zyngier Feb. 23, 2023, 6:25 p.m. UTC | #6
On Wed, 22 Feb 2023 16:34:32 +0000,
Oliver Upton <oliver.upton@linux.dev> wrote:
> 
> On Wed, Feb 22, 2023 at 11:56:53AM +0000, Marc Zyngier wrote:
> 
> > > Chewing on this a bit more, I don't think userspace has any business
> > > messing with virtual and physical time independently, especially when
> > > nested virtualization comes into play.
> > 
> > Well, NV already ignores the virtual offset completely (see how the
> > virtual timer gets its offset reassigned at reset time).
> 
> I'll need to have a look at that, but if we need to ignore user input on
> the shiny new interface for NV then I really do wonder if it is the
> right fit.

The thing is, I'm not keen making the interface modal. Modes suck and
lead to invasive userspace changes. I'd rather ignore irrelevant
parameters than change the way userspace works. And you don't *have*
to provide the virtual offset with NV.

> > I previously toyed with this idea, and I really like it. However, the
> > problem with this is that it breaks the current behaviour of having
> > two different values for CNTVCT and CNTPCT in the guest, and CNTPCT
> > representing the counter value on the host.
> > 
> > Such a VM cannot be migrated *today*, but not everybody cares about
> > migration. My "dual offset" approach allows the current behaviour to
> > persist, and such a VM to be migrated. The luser even gets the choice
> > of preserving counter continuity in the guest or to stay without a
> > physical offset and reflect the host's counter.
> > 
> > Is it a good behaviour? Of course not. Does anyone depend on it? I
> > have no idea, but odds are that someone does. Can we break their toys?
> > The jury is still out.
> 
> Well, I think the new interface presents an opportunity to change the
> rules around counter migration, and the illusion of two distinct offsets
> for physical / virtual counters will need to be broken soon enough for
> NV.

Broken in the kernel, sure. Do we need to involve userspace in what
was an initial mis-design of the API? Changing these rules mean
changing the way userspace works, and I'm not keen on going there.

> Do we need to bend over backwards for a theoretical use case with
> the new UAPI? If anyone depends on the existing behavior then they can
> continue to use the old UAPI to partially migrate the guest counters.

I don't buy the old/new thing. My take is that these things should be
cumulative if there isn't a hard reason to break the existing API.

> My previous suggestion of tying the physical and virtual counters
> together at VM creation would definitely break such a use case, though,
> so we'd be at the point of requiring explicit opt-in from userspace.

I'm trying to find a middle ground, so bear with me. Here's the
situation as I see it:

(1) a VM that is migrating today can only set the virtual offset and
    doesn't affect the physical counter. This behaviour must be
    preserved in we cannot prove that nobody relies on it.

(2) setting the physical offset could be done by two means:

    (a) writing the counter register (like we do for CNTVCT)
    (b) providing an offset via a side channel

I think (1) must stay forever, just like we still support the old
GICv2 implicit initialisation.

(2a) is also desirable as it requires no extra work on the VMM side.
Just restore the damn thing, and nothing changes (we're finally able
to migrate the physical timer). (2b) really is icing on the cake.

The question is whether we can come up with an API offering (2b) that
still allows (1) and (2a). I'd be happy with a new API that, when
used, resets both offsets to the same value, matching your pretty
picture. But the dual offsetting still has to exist internally.

When it comes to NV, it uses either the physical offset that has been
provided by writing CNTPCT, or the one that has been written via the
new API. Under the hood, this is the same piece of data, of course.

The only meaningful difference with my initial proposal is that there
is no new virtual offset API. It is either register writes that obey
the same rules as before, or a single offset setting.

> > And I think that's the point where we differ. I can completely imagine
> > some in-VM code using the physical counter to export some timestamping
> > to the host (for tracing purposes, amongst other things).
> 
> So in this case the guest and userspace would already be in cahoots, so
> userspace could choose to not use UAPI. Hell, if userspace did
> absolutely nothing then it all continues to work.

Userspace, yes. Not necessarily the VMM. Let's say my guest spits a
bunch of timestamped traces over a network connection, which I then
correlate with host traces (all similarities with a shipping product
are completely fortuitous...). The VMM isn't involved at all here.

> Oh, we're definitely on the hook for any existing misuse of observable
> KVM behavior. I just think if we're at the point of adding new UAPI we
> may as well lay down some new rules with userspace to avoid surprises.
> 
> OTOH, ignoring the virtual offset for NV is another way out of the mess,
> but it just bothers me we're about to ignore input on a brand new
> UAPI...

I think my single-offset suggestion should answer your questioning.

Thoughts?

	M.
Marc Zyngier Feb. 24, 2023, 11:24 a.m. UTC | #7
On Thu, 23 Feb 2023 22:41:13 +0000,
Colton Lewis <coltonlewis@google.com> wrote:
> 
> Marc Zyngier <maz@kernel.org> writes:
> 
> > Once this new API is used, there is no going back, and the counters
> > cannot be written to to set the offsets implicitly (the writes
> > are instead ignored).
> 
> Why do this? I can't see a reason for disabling the other API the first
> time this one is used.

I can't see a reason not to. The new API is VM-wide. The old one
operates on a per-vcpu basis. What sense does it make to accept
something that directly conflicts with the previous actions from
userspace?

Once userspace has bought into the new API, it should use it
consistently.  The only reason we don't reject the write with an error
is to allow userspace to keep using the vcpu register dump as an
opaque object that it doesn't have to scan and amend.

> 
> > In keeping with the architecture, the offsets are expressed as
> > a delta that is substracted from the physical counter value.
>                   ^
> nit: subtracted
> 
> > +/*
> > + * Counter/Timer offset structure. Describe the virtual/physical offsets.
> > + * To be used with KVM_ARM_SET_CNT_OFFSETS.
> > + */
> > +struct kvm_arm_counter_offsets {
> > +	__u64 virtual_offset;
> > +	__u64 physical_offset;
> > +
> > +#define KVM_COUNTER_SET_VOFFSET_FLAG	(1UL << 0)
> > +#define KVM_COUNTER_SET_POFFSET_FLAG	(1UL << 1)
> > +
> > +	__u64 flags;
> > +	__u64 reserved;
> > +};
> > +
> 
> It looks weird to have the #defines in the middle of the struct like
> that. I think it would be easier to read with the #defines before the
> struct.

I do like it, as it perfectly shows in which context these #defines
are valid. This is also a common idiom used all over the existing KVM
code (just take a look at kvm_run for the canonical example).

> 
> > @@ -852,9 +852,11 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
> >   	ptimer->vcpu = vcpu;
> >   	ptimer->offset.vm_offset = &vcpu->kvm->arch.offsets.poffset;
> 
> > -	/* Synchronize cntvoff across all vtimers of a VM. */
> > -	timer_set_offset(vtimer, kvm_phys_timer_read());
> > -	timer_set_offset(ptimer, 0);
> > +	/* Synchronize offsets across timers of a VM if not already provided */
> > +	if (!test_bit(KVM_ARCH_FLAG_COUNTER_OFFSETS, &vcpu->kvm->arch.flags)) {
> > +		timer_set_offset(vtimer, kvm_phys_timer_read());
> > +		timer_set_offset(ptimer, 0);
> > +	}
> 
> >   	hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
> >   	timer->bg_timer.function = kvm_bg_timer_expire;
> 
> The code says "assign the offsets if the KVM_ARCH_FLAG_COUNTER_OFFSETS
> flag is not on". The flag name is confusing and made it hard for me to
> understand the intent. I think the intent is to only assign the offsets
> if the user has not called the API to provide some offsets (that would
> have been assigned in the API call along with flipping the flag
> on). With that in mind, I would prefer the flag name reference the
> user. KVM_ARCH_FLAG_USER_OFFSETS

All offsets are provided by the user, no matter what API they used, so
I don't think this adds much clarity. The real distinction is between
the offsets being set by writing a vcpu attribute or a VM attribute.

By this token, I'd suggest KVM_ARM_FLAG_VM_COUNTER_OFFSETS.

Thoughts?

	M.
Oliver Upton March 8, 2023, 7:46 a.m. UTC | #8
Hey Marc,

On Thu, Feb 23, 2023 at 06:25:57PM +0000, Marc Zyngier wrote:

[...]

> > Do we need to bend over backwards for a theoretical use case with
> > the new UAPI? If anyone depends on the existing behavior then they can
> > continue to use the old UAPI to partially migrate the guest counters.
> 
> I don't buy the old/new thing. My take is that these things should be
> cumulative if there isn't a hard reason to break the existing API.

Unsurprisingly, I may have been a bit confusing in my replies to you.

I have zero interest in breaking the existing API. Any suggestion of
'changing the rules' was more along the lines of providing an alternate
scheme for the counters and letting the quirks of the old interface
continue.

> > My previous suggestion of tying the physical and virtual counters
> > together at VM creation would definitely break such a use case, though,
> > so we'd be at the point of requiring explicit opt-in from userspace.
> 
> I'm trying to find a middle ground, so bear with me. Here's the
> situation as I see it:
> 
> (1) a VM that is migrating today can only set the virtual offset and
>     doesn't affect the physical counter. This behaviour must be
>     preserved in we cannot prove that nobody relies on it.
> 
> (2) setting the physical offset could be done by two means:
> 
>     (a) writing the counter register (like we do for CNTVCT)
>     (b) providing an offset via a side channel
> 
> I think (1) must stay forever, just like we still support the old
> GICv2 implicit initialisation.

No argument here. Unless userspace pokes some new bit of UAPI, the old
behavior of CNTVCT must live on.

> (2a) is also desirable as it requires no extra work on the VMM side.
> Just restore the damn thing, and nothing changes (we're finally able
> to migrate the physical timer). (2b) really is icing on the cake.
> 
> The question is whether we can come up with an API offering (2b) that
> still allows (1) and (2a). I'd be happy with a new API that, when
> used, resets both offsets to the same value, matching your pretty
> picture. But the dual offsetting still has to exist internally.
> 
> When it comes to NV, it uses either the physical offset that has been
> provided by writing CNTPCT, or the one that has been written via the
> new API. Under the hood, this is the same piece of data, of course.
> 
> The only meaningful difference with my initial proposal is that there
> is no new virtual offset API. It is either register writes that obey
> the same rules as before, or a single offset setting.

I certainly agree that (2a) is highly desirable to get existing VMMs to
'do the right thing' for free. Playing devil's advocate, would this not
also break the tracing example you've given of correlating timestamps
between the host and guest? I wouldn't expect a userspace + VM tracing
contraption to live migrate but restoring from a snapshot seems
plausible.

Regardless, I like the general direction you've proposed. IIUC, you'll
want to go ahead with ignoring writes to CNT{P,V}CT if the offset was
written by userspace, right?
Oliver Upton March 8, 2023, 7:53 a.m. UTC | #9
On Wed, Mar 08, 2023 at 07:46:00AM +0000, Oliver Upton wrote:
> Hey Marc,
> 
> On Thu, Feb 23, 2023 at 06:25:57PM +0000, Marc Zyngier wrote:
> 
> [...]
> 
> > > Do we need to bend over backwards for a theoretical use case with
> > > the new UAPI? If anyone depends on the existing behavior then they can
> > > continue to use the old UAPI to partially migrate the guest counters.
> > 
> > I don't buy the old/new thing. My take is that these things should be
> > cumulative if there isn't a hard reason to break the existing API.
> 
> Unsurprisingly, I may have been a bit confusing in my replies to you.
> 
> I have zero interest in breaking the existing API. Any suggestion of
> 'changing the rules' was more along the lines of providing an alternate
> scheme for the counters and letting the quirks of the old interface
> continue.
> 
> > > My previous suggestion of tying the physical and virtual counters
> > > together at VM creation would definitely break such a use case, though,
> > > so we'd be at the point of requiring explicit opt-in from userspace.
> > 
> > I'm trying to find a middle ground, so bear with me. Here's the
> > situation as I see it:
> > 
> > (1) a VM that is migrating today can only set the virtual offset and
> >     doesn't affect the physical counter. This behaviour must be
> >     preserved in we cannot prove that nobody relies on it.
> > 
> > (2) setting the physical offset could be done by two means:
> > 
> >     (a) writing the counter register (like we do for CNTVCT)
> >     (b) providing an offset via a side channel
> > 
> > I think (1) must stay forever, just like we still support the old
> > GICv2 implicit initialisation.
> 
> No argument here. Unless userspace pokes some new bit of UAPI, the old
> behavior of CNTVCT must live on.
> 
> > (2a) is also desirable as it requires no extra work on the VMM side.
> > Just restore the damn thing, and nothing changes (we're finally able
> > to migrate the physical timer). (2b) really is icing on the cake.
> > 
> > The question is whether we can come up with an API offering (2b) that
> > still allows (1) and (2a). I'd be happy with a new API that, when
> > used, resets both offsets to the same value, matching your pretty
> > picture. But the dual offsetting still has to exist internally.
> > 
> > When it comes to NV, it uses either the physical offset that has been
> > provided by writing CNTPCT, or the one that has been written via the
> > new API. Under the hood, this is the same piece of data, of course.
> > 
> > The only meaningful difference with my initial proposal is that there
> > is no new virtual offset API. It is either register writes that obey
> > the same rules as before, or a single offset setting.
> 
> I certainly agree that (2a) is highly desirable to get existing VMMs to
> 'do the right thing' for free. Playing devil's advocate, would this not
> also break the tracing example you've given of correlating timestamps
> between the host and guest? I wouldn't expect a userspace + VM tracing
> contraption to live migrate but restoring from a snapshot seems
> plausible.

The problem I'm alluding to here is that the VMM will save/restore
the physical counter value and cause KVM to offset the physical counter.
Live migration is a pretty obvious example, but resuming from a snapshot
after resetting a system be similarly affected.

> Regardless, I like the general direction you've proposed. IIUC, you'll
> want to go ahead with ignoring writes to CNT{P,V}CT if the offset was
> written by userspace, right?
> 
> -- 
> Thanks,
> Oliver
>
Marc Zyngier March 9, 2023, 8:25 a.m. UTC | #10
Hi Oliver,

On Wed, 08 Mar 2023 07:46:00 +0000,
Oliver Upton <oliver.upton@linux.dev> wrote:

[...]

> I certainly agree that (2a) is highly desirable to get existing VMMs to
> 'do the right thing' for free. Playing devil's advocate, would this not
> also break the tracing example you've given of correlating timestamps
> between the host and guest? I wouldn't expect a userspace + VM tracing
> contraption to live migrate but restoring from a snapshot seems
> plausible.

It really depends when this VM was saved.

If you saved it on an old host that doesn't expose CNTPCT and restore
it on a new host, the physical offset is still zero (this is already
special-cased), and there is no difference in behaviour.

If you saved it from a host that does expose CNTPCT, then the
behaviour changes. But should we really care?

> Regardless, I like the general direction you've proposed. IIUC, you'll
> want to go ahead with ignoring writes to CNT{P,V}CT if the offset was
> written by userspace, right?

That'd be my preference in order not to break the "blind restore"
behaviour that QEMU already uses for about everything.

I'll repost the series shortly, as it has grown some extra goodies
such as moving the PPI settings out of the way...

Thanks,

	M.
Marc Zyngier March 9, 2023, 8:29 a.m. UTC | #11
On Wed, 08 Mar 2023 07:53:46 +0000,
Oliver Upton <oliver.upton@linux.dev> wrote:
> 
> > I certainly agree that (2a) is highly desirable to get existing VMMs to
> > 'do the right thing' for free. Playing devil's advocate, would this not
> > also break the tracing example you've given of correlating timestamps
> > between the host and guest? I wouldn't expect a userspace + VM tracing
> > contraption to live migrate but restoring from a snapshot seems
> > plausible.
> 
> The problem I'm alluding to here is that the VMM will save/restore
> the physical counter value and cause KVM to offset the physical counter.
> Live migration is a pretty obvious example, but resuming from a snapshot
> after resetting a system be similarly affected.

My take on this is that if you have produced the snapshot on a
pre-CNTPCT host, there will be no change in behaviour. If you've
produced the snapshot on a new host, you get the new behaviour.

I am willing to be accommodating to the use case, but only to a
certain extent! ;-)

	M.
diff mbox series

Patch

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 3adac0c5e175..8514a37cf8d5 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -221,6 +221,8 @@  struct kvm_arch {
 #define KVM_ARCH_FLAG_EL1_32BIT				4
 	/* PSCI SYSTEM_SUSPEND enabled for the guest */
 #define KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED		5
+	/* VM counter offsets */
+#define KVM_ARCH_FLAG_COUNTER_OFFSETS			6
 
 	unsigned long flags;
 
@@ -1010,6 +1012,8 @@  int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
 
 long kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
 				struct kvm_arm_copy_mte_tags *copy_tags);
+int kvm_vm_ioctl_set_counter_offsets(struct kvm *kvm,
+				     struct kvm_arm_counter_offsets *offsets);
 
 /* Guest/host FPSIMD coordination helpers */
 int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu);
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index f8129c624b07..2d7557a160bd 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -198,6 +198,21 @@  struct kvm_arm_copy_mte_tags {
 	__u64 reserved[2];
 };
 
+/*
+ * Counter/Timer offset structure. Describe the virtual/physical offsets.
+ * To be used with KVM_ARM_SET_CNT_OFFSETS.
+ */
+struct kvm_arm_counter_offsets {
+	__u64 virtual_offset;
+	__u64 physical_offset;
+
+#define KVM_COUNTER_SET_VOFFSET_FLAG	(1UL << 0)
+#define KVM_COUNTER_SET_POFFSET_FLAG	(1UL << 1)
+
+	__u64 flags;
+	__u64 reserved;
+};
+
 #define KVM_ARM_TAGS_TO_GUEST		0
 #define KVM_ARM_TAGS_FROM_GUEST		1
 
diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c
index 444ea6dca218..b04544b702f9 100644
--- a/arch/arm64/kvm/arch_timer.c
+++ b/arch/arm64/kvm/arch_timer.c
@@ -852,9 +852,11 @@  void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
 	ptimer->vcpu = vcpu;
 	ptimer->offset.vm_offset = &vcpu->kvm->arch.offsets.poffset;
 
-	/* Synchronize cntvoff across all vtimers of a VM. */
-	timer_set_offset(vtimer, kvm_phys_timer_read());
-	timer_set_offset(ptimer, 0);
+	/* Synchronize offsets across timers of a VM if not already provided */
+	if (!test_bit(KVM_ARCH_FLAG_COUNTER_OFFSETS, &vcpu->kvm->arch.flags)) {
+		timer_set_offset(vtimer, kvm_phys_timer_read());
+		timer_set_offset(ptimer, 0);
+	}
 
 	hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
 	timer->bg_timer.function = kvm_bg_timer_expire;
@@ -898,8 +900,11 @@  int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
 		kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value);
 		break;
 	case KVM_REG_ARM_TIMER_CNT:
-		timer = vcpu_vtimer(vcpu);
-		timer_set_offset(timer, kvm_phys_timer_read() - value);
+		if (!test_bit(KVM_ARCH_FLAG_COUNTER_OFFSETS,
+			      &vcpu->kvm->arch.flags)) {
+			timer = vcpu_vtimer(vcpu);
+			timer_set_offset(timer, kvm_phys_timer_read() - value);
+		}
 		break;
 	case KVM_REG_ARM_TIMER_CVAL:
 		timer = vcpu_vtimer(vcpu);
@@ -909,6 +914,13 @@  int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
 		timer = vcpu_ptimer(vcpu);
 		kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value);
 		break;
+	case KVM_REG_ARM_PTIMER_CNT:
+		if (!test_bit(KVM_ARCH_FLAG_COUNTER_OFFSETS,
+			      &vcpu->kvm->arch.flags)) {
+			timer = vcpu_ptimer(vcpu);
+			timer_set_offset(timer, kvm_phys_timer_read() - value);
+		}
+		break;
 	case KVM_REG_ARM_PTIMER_CVAL:
 		timer = vcpu_ptimer(vcpu);
 		kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value);
@@ -1446,3 +1458,27 @@  int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
 
 	return -ENXIO;
 }
+
+int kvm_vm_ioctl_set_counter_offsets(struct kvm *kvm,
+				     struct kvm_arm_counter_offsets *offsets)
+{
+	if (offsets->reserved ||
+	    (offsets->flags & ~(KVM_COUNTER_SET_VOFFSET_FLAG |
+				KVM_COUNTER_SET_POFFSET_FLAG)))
+		return -EINVAL;
+
+	if (!lock_all_vcpus(kvm))
+		return -EBUSY;
+
+	set_bit(KVM_ARCH_FLAG_COUNTER_OFFSETS, &kvm->arch.flags);
+
+	if (offsets->flags & KVM_COUNTER_SET_VOFFSET_FLAG)
+		kvm->arch.offsets.voffset = offsets->virtual_offset;
+
+	if (offsets->flags & KVM_COUNTER_SET_POFFSET_FLAG)
+		kvm->arch.offsets.poffset = offsets->physical_offset;
+
+	unlock_all_vcpus(kvm);
+
+	return 0;
+}
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 097750a01497..1182d8ce7319 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -220,6 +220,7 @@  int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_VCPU_ATTRIBUTES:
 	case KVM_CAP_PTP_KVM:
 	case KVM_CAP_ARM_SYSTEM_SUSPEND:
+	case KVM_CAP_COUNTER_OFFSETS:
 		r = 1;
 		break;
 	case KVM_CAP_SET_GUEST_DEBUG2:
@@ -1479,6 +1480,13 @@  long kvm_arch_vm_ioctl(struct file *filp,
 			return -EFAULT;
 		return kvm_vm_ioctl_mte_copy_tags(kvm, &copy_tags);
 	}
+	case KVM_ARM_SET_CNT_OFFSETS: {
+		struct kvm_arm_counter_offsets offsets;
+
+		if (copy_from_user(&offsets, argp, sizeof(offsets)))
+			return -EFAULT;
+		return kvm_vm_ioctl_set_counter_offsets(kvm, &offsets);
+	}
 	default:
 		return -EINVAL;
 	}
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 55155e262646..3753765dbc4f 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1175,6 +1175,7 @@  struct kvm_ppc_resize_hpt {
 #define KVM_CAP_DIRTY_LOG_RING_ACQ_REL 223
 #define KVM_CAP_S390_PROTECTED_ASYNC_DISABLE 224
 #define KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP 225
+#define KVM_CAP_COUNTER_OFFSETS 226
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1534,6 +1535,8 @@  struct kvm_s390_ucas_mapping {
 #define KVM_SET_PMU_EVENT_FILTER  _IOW(KVMIO,  0xb2, struct kvm_pmu_event_filter)
 #define KVM_PPC_SVM_OFF		  _IO(KVMIO,  0xb3)
 #define KVM_ARM_MTE_COPY_TAGS	  _IOR(KVMIO,  0xb4, struct kvm_arm_copy_mte_tags)
+/* Available with KVM_CAP_COUNTER_OFFSETS */
+#define KVM_ARM_SET_CNT_OFFSETS	  _IOW(KVMIO,  0xb5, struct kvm_arm_counter_offsets)
 
 /* ioctl for vm fd */
 #define KVM_CREATE_DEVICE	  _IOWR(KVMIO,  0xe0, struct kvm_create_device)