diff mbox series

[v2,03/12] KVM: x86: Expose TSC offset controls to userspace

Message ID 20210716212629.2232756-4-oupton@google.com (mailing list archive)
State New, archived
Headers show
Series KVM: Add idempotent controls for migrating system counter state | expand

Commit Message

Oliver Upton July 16, 2021, 9:26 p.m. UTC
To date, VMM-directed TSC synchronization and migration has been a bit
messy. KVM has some baked-in heuristics around TSC writes to infer if
the VMM is attempting to synchronize. This is problematic, as it depends
on host userspace writing to the guest's TSC within 1 second of the last
write.

A much cleaner approach to configuring the guest's views of the TSC is to
simply migrate the TSC offset for every vCPU. Offsets are idempotent,
and thus not subject to change depending on when the VMM actually
reads/writes values from/to KVM. The VMM can then read the TSC once with
KVM_GET_CLOCK to capture a (realtime, host_tsc) pair at the instant when
the guest is paused.

Cc: David Matlack <dmatlack@google.com>
Signed-off-by: Oliver Upton <oupton@google.com>
---
 arch/x86/include/asm/kvm_host.h |   1 +
 arch/x86/include/uapi/asm/kvm.h |   4 +
 arch/x86/kvm/x86.c              | 166 ++++++++++++++++++++++++++++++++
 3 files changed, 171 insertions(+)

Comments

Paolo Bonzini July 18, 2021, 8:42 p.m. UTC | #1
On 16/07/21 23:26, Oliver Upton wrote:
> To date, VMM-directed TSC synchronization and migration has been a bit
> messy. KVM has some baked-in heuristics around TSC writes to infer if
> the VMM is attempting to synchronize. This is problematic, as it depends
> on host userspace writing to the guest's TSC within 1 second of the last
> write.
> 
> A much cleaner approach to configuring the guest's views of the TSC is to
> simply migrate the TSC offset for every vCPU. Offsets are idempotent,
> and thus not subject to change depending on when the VMM actually
> reads/writes values from/to KVM. The VMM can then read the TSC once with
> KVM_GET_CLOCK to capture a (realtime, host_tsc) pair at the instant when
> the guest is paused.
> 
> Cc: David Matlack <dmatlack@google.com>
> Signed-off-by: Oliver Upton <oupton@gooogle.com>
> ---
>   arch/x86/include/asm/kvm_host.h |   1 +
>   arch/x86/include/uapi/asm/kvm.h |   4 +
>   arch/x86/kvm/x86.c              | 166 ++++++++++++++++++++++++++++++++
>   3 files changed, 171 insertions(+)

This is missing documentation.  The documentation should also include 
the algorithm in https://www.spinics.net/lists/kvm-arm/msg47383.html 
(modulo the fact that KVM_GET/SET_CLOCK return or pass realtime_ns 
rather than kvmclock_ns - realtime_ns; which is fine of course).

Paolo

> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index e527d7259415..45134b7b14d6 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1070,6 +1070,7 @@ struct kvm_arch {
>   	u64 last_tsc_nsec;
>   	u64 last_tsc_write;
>   	u32 last_tsc_khz;
> +	u64 last_tsc_offset;
>   	u64 cur_tsc_nsec;
>   	u64 cur_tsc_write;
>   	u64 cur_tsc_offset;
> diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
> index a6c327f8ad9e..0b22e1e84e78 100644
> --- a/arch/x86/include/uapi/asm/kvm.h
> +++ b/arch/x86/include/uapi/asm/kvm.h
> @@ -503,4 +503,8 @@ struct kvm_pmu_event_filter {
>   #define KVM_PMU_EVENT_ALLOW 0
>   #define KVM_PMU_EVENT_DENY 1
>   
> +/* for KVM_{GET,SET,HAS}_DEVICE_ATTR */
> +#define KVM_VCPU_TSC_CTRL 0 /* control group for the timestamp counter (TSC) */
> +#define   KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */
> +
>   #endif /* _ASM_X86_KVM_H */
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index e1b7c8b67428..d22de0a1988a 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2411,6 +2411,11 @@ static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset)
>   	static_call(kvm_x86_write_tsc_offset)(vcpu, vcpu->arch.tsc_offset);
>   }
>   
> +static u64 kvm_vcpu_read_tsc_offset(struct kvm_vcpu *vcpu)
> +{
> +	return vcpu->arch.l1_tsc_offset;
> +}
> +
>   static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier)
>   {
>   	vcpu->arch.l1_tsc_scaling_ratio = l1_multiplier;
> @@ -2467,6 +2472,7 @@ static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc,
>   	kvm->arch.last_tsc_nsec = ns;
>   	kvm->arch.last_tsc_write = tsc;
>   	kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
> +	kvm->arch.last_tsc_offset = offset;
>   
>   	vcpu->arch.last_guest_tsc = tsc;
>   
> @@ -4914,6 +4920,136 @@ static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
>   	return 0;
>   }
>   
> +static int kvm_arch_tsc_has_attr(struct kvm_vcpu *vcpu,
> +				 struct kvm_device_attr *attr)
> +{
> +	int r;
> +
> +	switch (attr->attr) {
> +	case KVM_VCPU_TSC_OFFSET:
> +		r = 0;
> +		break;
> +	default:
> +		r = -ENXIO;
> +	}
> +
> +	return r;
> +}
> +
> +static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu,
> +				 struct kvm_device_attr *attr)
> +{
> +	void __user *uaddr = (void __user *)attr->addr;
> +	int r;
> +
> +	switch (attr->attr) {
> +	case KVM_VCPU_TSC_OFFSET: {
> +		u64 offset;
> +
> +		offset = kvm_vcpu_read_tsc_offset(vcpu);
> +		r = -EFAULT;
> +		if (copy_to_user(uaddr, &offset, sizeof(offset)))
> +			break;
> +
> +		r = 0;
> +	}
> +	default:
> +		r = -ENXIO;
> +	}
> +
> +	return r;
> +}
> +
> +static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu,
> +				 struct kvm_device_attr *attr)
> +{
> +	void __user *uaddr = (void __user *)attr->addr;
> +	struct kvm *kvm = vcpu->kvm;
> +	int r;
> +
> +	switch (attr->attr) {
> +	case KVM_VCPU_TSC_OFFSET: {
> +		u64 offset, tsc, ns;
> +		unsigned long flags;
> +		bool matched;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&offset, uaddr, sizeof(offset)))
> +			break;
> +
> +		raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
> +
> +		matched = (vcpu->arch.virtual_tsc_khz &&
> +			   kvm->arch.last_tsc_khz == vcpu->arch.virtual_tsc_khz &&
> +			   kvm->arch.last_tsc_offset == offset);
> +
> +		tsc = kvm_scale_tsc(vcpu, rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset;
> +		ns = get_kvmclock_base_ns();
> +
> +		__kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched);
> +		raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
> +
> +		r = 0;
> +		break;
> +	}
> +	default:
> +		r = -ENXIO;
> +	}
> +
> +	return r;
> +}
> +
> +static int kvm_vcpu_ioctl_has_device_attr(struct kvm_vcpu *vcpu,
> +					  struct kvm_device_attr *attr)
> +{
> +	int r;
> +
> +	switch (attr->group) {
> +	case KVM_VCPU_TSC_CTRL:
> +		r = kvm_arch_tsc_has_attr(vcpu, attr);
> +		break;
> +	default:
> +		r = -ENXIO;
> +		break;
> +	}
> +
> +	return r;
> +}
> +
> +static int kvm_vcpu_ioctl_get_device_attr(struct kvm_vcpu *vcpu,
> +					  struct kvm_device_attr *attr)
> +{
> +	int r;
> +
> +	switch (attr->group) {
> +	case KVM_VCPU_TSC_CTRL:
> +		r = kvm_arch_tsc_get_attr(vcpu, attr);
> +		break;
> +	default:
> +		r = -ENXIO;
> +		break;
> +	}
> +
> +	return r;
> +}
> +
> +static int kvm_vcpu_ioctl_set_device_attr(struct kvm_vcpu *vcpu,
> +					  struct kvm_device_attr *attr)
> +{
> +	int r;
> +
> +	switch (attr->group) {
> +	case KVM_VCPU_TSC_CTRL:
> +		r = kvm_arch_tsc_set_attr(vcpu, attr);
> +		break;
> +	default:
> +		r = -ENXIO;
> +		break;
> +	}
> +
> +	return r;
> +}
> +
>   static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
>   				     struct kvm_enable_cap *cap)
>   {
> @@ -5368,6 +5504,36 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>   		r = __set_sregs2(vcpu, u.sregs2);
>   		break;
>   	}
> +	case KVM_HAS_DEVICE_ATTR: {
> +		struct kvm_device_attr attr;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&attr, argp, sizeof(attr)))
> +			goto out;
> +
> +		r = kvm_vcpu_ioctl_has_device_attr(vcpu, &attr);
> +		break;
> +	}
> +	case KVM_GET_DEVICE_ATTR: {
> +		struct kvm_device_attr attr;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&attr, argp, sizeof(attr)))
> +			goto out;
> +
> +		r = kvm_vcpu_ioctl_get_device_attr(vcpu, &attr);
> +		break;
> +	}
> +	case KVM_SET_DEVICE_ATTR: {
> +		struct kvm_device_attr attr;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&attr, argp, sizeof(attr)))
> +			goto out;
> +
> +		r = kvm_vcpu_ioctl_set_device_attr(vcpu, &attr);
> +		break;
> +	}
>   	default:
>   		r = -EINVAL;
>   	}
>
kernel test robot July 18, 2021, 8:50 p.m. UTC | #2
Hi Oliver,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on kvm/queue]
[also build test WARNING on vhost/linux-next v5.14-rc1 next-20210716]
[cannot apply to kvmarm/next]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Oliver-Upton/KVM-Add-idempotent-controls-for-migrating-system-counter-state/20210718-103407
base:   https://git.kernel.org/pub/scm/virt/kvm/kvm.git queue
config: i386-randconfig-a013-20210718 (attached as .config)
compiler: gcc-10 (Ubuntu 10.3.0-1ubuntu1~20.04) 10.3.0
reproduce (this is a W=1 build):
        # https://github.com/0day-ci/linux/commit/1e9a9a8e956c70659f72ad66a7d8176ff4756061
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Oliver-Upton/KVM-Add-idempotent-controls-for-migrating-system-counter-state/20210718-103407
        git checkout 1e9a9a8e956c70659f72ad66a7d8176ff4756061
        # save the attached .config to linux build tree
        make W=1 ARCH=i386 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   arch/x86/kvm/x86.c: In function 'get_kvmclock_and_realtime':
   arch/x86/kvm/x86.c:2844:7: error: implicit declaration of function 'kvm_get_walltime_and_clockread' [-Werror=implicit-function-declaration]
    2844 |   if (kvm_get_walltime_and_clockread(&ts, &tsc_val)) {
         |       ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   arch/x86/kvm/x86.c: In function 'kvm_arch_tsc_get_attr':
>> arch/x86/kvm/x86.c:4945:23: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]
    4945 |  void __user *uaddr = (void __user *)attr->addr;
         |                       ^
   arch/x86/kvm/x86.c: In function 'kvm_arch_tsc_set_attr':
   arch/x86/kvm/x86.c:4969:23: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]
    4969 |  void __user *uaddr = (void __user *)attr->addr;
         |                       ^
   arch/x86/kvm/x86.c: In function 'kvm_arch_tsc_get_attr':
   arch/x86/kvm/x86.c:4949:28: warning: this statement may fall through [-Wimplicit-fallthrough=]
    4949 |  case KVM_VCPU_TSC_OFFSET: {
         |                            ^
   arch/x86/kvm/x86.c:4959:2: note: here
    4959 |  default:
         |  ^~~~~~~
   cc1: some warnings being treated as errors


vim +4945 arch/x86/kvm/x86.c

  4941	
  4942	static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu,
  4943					 struct kvm_device_attr *attr)
  4944	{
> 4945		void __user *uaddr = (void __user *)attr->addr;
  4946		int r;
  4947	
  4948		switch (attr->attr) {
  4949		case KVM_VCPU_TSC_OFFSET: {
  4950			u64 offset;
  4951	
  4952			offset = kvm_vcpu_read_tsc_offset(vcpu);
  4953			r = -EFAULT;
  4954			if (copy_to_user(uaddr, &offset, sizeof(offset)))
  4955				break;
  4956	
  4957			r = 0;
  4958		}
  4959		default:
  4960			r = -ENXIO;
  4961		}
  4962	
  4963		return r;
  4964	}
  4965	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
diff mbox series

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index e527d7259415..45134b7b14d6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1070,6 +1070,7 @@  struct kvm_arch {
 	u64 last_tsc_nsec;
 	u64 last_tsc_write;
 	u32 last_tsc_khz;
+	u64 last_tsc_offset;
 	u64 cur_tsc_nsec;
 	u64 cur_tsc_write;
 	u64 cur_tsc_offset;
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index a6c327f8ad9e..0b22e1e84e78 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -503,4 +503,8 @@  struct kvm_pmu_event_filter {
 #define KVM_PMU_EVENT_ALLOW 0
 #define KVM_PMU_EVENT_DENY 1
 
+/* for KVM_{GET,SET,HAS}_DEVICE_ATTR */
+#define KVM_VCPU_TSC_CTRL 0 /* control group for the timestamp counter (TSC) */
+#define   KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */
+
 #endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e1b7c8b67428..d22de0a1988a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2411,6 +2411,11 @@  static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset)
 	static_call(kvm_x86_write_tsc_offset)(vcpu, vcpu->arch.tsc_offset);
 }
 
+static u64 kvm_vcpu_read_tsc_offset(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.l1_tsc_offset; /* L1 value, not arch.tsc_offset */
+}
+
 static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier)
 {
 	vcpu->arch.l1_tsc_scaling_ratio = l1_multiplier;
@@ -2467,6 +2472,7 @@  static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc,
 	kvm->arch.last_tsc_nsec = ns;
 	kvm->arch.last_tsc_write = tsc;
 	kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
+	kvm->arch.last_tsc_offset = offset;
 
 	vcpu->arch.last_guest_tsc = tsc;
 
@@ -4914,6 +4920,136 @@  static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+/*
+ * KVM_HAS_DEVICE_ATTR backend for the KVM_VCPU_TSC_CTRL group.
+ *
+ * Returns 0 if attr->attr is KVM_VCPU_TSC_OFFSET and -ENXIO for any other
+ * attribute.  The vCPU itself is not consulted.
+ */
+static int kvm_arch_tsc_has_attr(struct kvm_vcpu *vcpu,
+				 struct kvm_device_attr *attr)
+{
+	if (attr->attr == KVM_VCPU_TSC_OFFSET)
+		return 0;
+
+	/* Unknown attribute. */
+	return -ENXIO;
+}
+
+/*
+ * Read a KVM_VCPU_TSC_CTRL attribute: KVM_VCPU_TSC_OFFSET copies the offset
+ * from kvm_vcpu_read_tsc_offset() to the u64 that attr->addr points at.
+ * Returns 0, -EFAULT if the copy faults, or -ENXIO for an unknown attribute.
+ */
+static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu,
+				 struct kvm_device_attr *attr)
+{
+	/* Go via unsigned long: attr->addr is 64-bit even on 32-bit builds. */
+	void __user *uaddr = (void __user *)(unsigned long)attr->addr;
+	u64 offset;
+
+	switch (attr->attr) {
+	case KVM_VCPU_TSC_OFFSET:
+		offset = kvm_vcpu_read_tsc_offset(vcpu);
+		if (copy_to_user(uaddr, &offset, sizeof(offset)))
+			return -EFAULT;
+		/* Return here; falling into default would report -ENXIO. */
+		return 0;
+	default:
+		return -ENXIO;
+	}
+}
+
+/*
+ * Write a KVM_VCPU_TSC_CTRL attribute.  KVM_VCPU_TSC_OFFSET reads a u64 from
+ * attr->addr and applies it via __kvm_synchronize_tsc() under tsc_write_lock.
+ * Returns 0, -EFAULT if the copy faults, or -ENXIO for an unknown attribute.
+ */
+static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu,
+				 struct kvm_device_attr *attr)
+{
+	/* Go via unsigned long: attr->addr is 64-bit even on 32-bit builds. */
+	void __user *uaddr = (void __user *)(unsigned long)attr->addr;
+	struct kvm *kvm = vcpu->kvm;
+
+	switch (attr->attr) {
+	case KVM_VCPU_TSC_OFFSET: {
+		u64 offset, tsc, ns;
+		unsigned long flags;
+		bool matched;
+
+		if (copy_from_user(&offset, uaddr, sizeof(offset)))
+			return -EFAULT;
+
+		raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
+		/* Unchanged frequency and offset since the last sync? */
+		matched = (vcpu->arch.virtual_tsc_khz &&
+			   kvm->arch.last_tsc_khz == vcpu->arch.virtual_tsc_khz &&
+			   kvm->arch.last_tsc_offset == offset);
+
+		tsc = kvm_scale_tsc(vcpu, rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset;
+		ns = get_kvmclock_base_ns();
+
+		__kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched);
+		raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
+		return 0;
+	}
+	default:
+		return -ENXIO;
+	}
+}
+
+/*
+ * Dispatch a vCPU KVM_HAS_DEVICE_ATTR request on attr->group.
+ *
+ * Only KVM_VCPU_TSC_CTRL is recognised, in which case the result of
+ * kvm_arch_tsc_has_attr() is returned; every other group yields -ENXIO.
+ */
+static int kvm_vcpu_ioctl_has_device_attr(struct kvm_vcpu *vcpu,
+					  struct kvm_device_attr *attr)
+{
+	switch (attr->group) {
+	case KVM_VCPU_TSC_CTRL:
+		return kvm_arch_tsc_has_attr(vcpu, attr);
+	default:
+		return -ENXIO;
+	}
+}
+
+/*
+ * Dispatch a vCPU KVM_GET_DEVICE_ATTR request on attr->group.
+ *
+ * Only KVM_VCPU_TSC_CTRL is recognised, in which case the result of
+ * kvm_arch_tsc_get_attr() is returned; every other group yields -ENXIO.
+ */
+static int kvm_vcpu_ioctl_get_device_attr(struct kvm_vcpu *vcpu,
+					  struct kvm_device_attr *attr)
+{
+	switch (attr->group) {
+	case KVM_VCPU_TSC_CTRL:
+		return kvm_arch_tsc_get_attr(vcpu, attr);
+	default:
+		return -ENXIO;
+	}
+}
+
+/*
+ * Dispatch a vCPU KVM_SET_DEVICE_ATTR request on attr->group.
+ *
+ * Only KVM_VCPU_TSC_CTRL is recognised, in which case the result of
+ * kvm_arch_tsc_set_attr() is returned; every other group yields -ENXIO.
+ */
+static int kvm_vcpu_ioctl_set_device_attr(struct kvm_vcpu *vcpu,
+					  struct kvm_device_attr *attr)
+{
+	switch (attr->group) {
+	case KVM_VCPU_TSC_CTRL:
+		return kvm_arch_tsc_set_attr(vcpu, attr);
+	default:
+		return -ENXIO;
+	}
+}
+
 static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 				     struct kvm_enable_cap *cap)
 {
@@ -5368,6 +5504,36 @@  long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = __set_sregs2(vcpu, u.sregs2);
 		break;
 	}
+	case KVM_HAS_DEVICE_ATTR: {
+		struct kvm_device_attr attr;
+
+		r = -EFAULT;
+		if (copy_from_user(&attr, argp, sizeof(attr)))
+			goto out;
+
+		r = kvm_vcpu_ioctl_has_device_attr(vcpu, &attr);
+		break;
+	}
+	case KVM_GET_DEVICE_ATTR: {
+		struct kvm_device_attr attr;
+
+		r = -EFAULT;
+		if (copy_from_user(&attr, argp, sizeof(attr)))
+			goto out;
+
+		r = kvm_vcpu_ioctl_get_device_attr(vcpu, &attr);
+		break;
+	}
+	case KVM_SET_DEVICE_ATTR: {
+		struct kvm_device_attr attr;
+
+		r = -EFAULT;
+		if (copy_from_user(&attr, argp, sizeof(attr)))
+			goto out;
+
+		r = kvm_vcpu_ioctl_set_device_attr(vcpu, &attr);
+		break;
+	}
 	default:
 		r = -EINVAL;
 	}