diff mbox series

[v2,4/4] x86/kvm: Add guest side support for virtual suspend time injection

Message ID 20210806190607.v2.4.I2cbcd43256eacc3c92274adff6d0458b6a9c15ee@changeid (mailing list archive)
State New, archived
Headers show
Series x86/kvm: Virtual suspend time injection support | expand

Commit Message

Hikaru Nishida Aug. 6, 2021, 10:07 a.m. UTC
This patch implements virtual suspend time injection support for kvm
guests. If this functionality is enabled and the host supports
KVM_FEATURE_HOST_SUSPEND_TIME,
the guest will register struct kvm_host_suspend_time through an MSR to
get how much time the guest spent during the host's suspension.
The host will notify the guest of updates to the structure (which happen
if the host went into suspension while the guest was running) through an
irq, and the irq will trigger the adjustment of CLOCK_BOOTTIME inside the guest.

Before this patch, there was no way to adjust CLOCK_BOOTTIME without
actually suspending the kernel. However, some guest applications rely on
the fact that there will be some difference between CLOCK_BOOTTIME and
CLOCK_MONOTONIC after execution has been suspended, and they will
break if we just pause the guest instead of actually suspending it.
Suspending the guest kernels is one way to solve the problem, but
if we could adjust the clocks without actually suspending them, we can
reduce the overhead of the guest's suspend/resume cycles on every host
suspension. So this change will be useful for devices which
experience suspend/resume frequently.

Signed-off-by: Hikaru Nishida <hikalium@chromium.org>
---

 arch/x86/Kconfig                    | 13 ++++++++++
 arch/x86/include/asm/idtentry.h     |  4 +++
 arch/x86/include/asm/kvm_para.h     |  9 +++++++
 arch/x86/kernel/kvmclock.c          | 40 +++++++++++++++++++++++++++++
 include/linux/timekeeper_internal.h |  4 +++
 kernel/time/timekeeping.c           | 33 ++++++++++++++++++++++++
 6 files changed, 103 insertions(+)

Comments

Thomas Gleixner Aug. 10, 2021, 3:48 p.m. UTC | #1
On Fri, Aug 06 2021 at 19:07, Hikaru Nishida wrote:
>  arch/x86/Kconfig                    | 13 ++++++++++
>  arch/x86/include/asm/idtentry.h     |  4 +++
>  arch/x86/include/asm/kvm_para.h     |  9 +++++++
>  arch/x86/kernel/kvmclock.c          | 40 +++++++++++++++++++++++++++++
>  include/linux/timekeeper_internal.h |  4 +++
>  kernel/time/timekeeping.c           | 33 ++++++++++++++++++++++++

Again, this wants to be split into infrastructure and usage.

> --- a/include/linux/timekeeper_internal.h
> +++ b/include/linux/timekeeper_internal.h
> @@ -124,6 +124,10 @@ struct timekeeper {
> 	u32			ntp_err_mult;
> 	/* Flag used to avoid updating NTP twice with same second */
> 	u32			skip_second_overflow;
> +#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING_GUEST
> +	/* suspend_time_injected keeps the duration injected through kvm */
> +	u64			suspend_time_injected;

This is KVM only, so please can we have a name for that struct member
which reflects this?

> +#endif
>  #ifdef CONFIG_DEBUG_TIMEKEEPING
> 	long			last_warning;
> 	/*

> diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
> index 3ac3fb479981..424c61d38646 100644
> --- a/kernel/time/timekeeping.c
> +++ b/kernel/time/timekeeping.c
> @@ -2125,6 +2125,39 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset,
>  	return offset;
>  }
>  
> +#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING_GUEST
> +/*
> + * timekeeping_inject_virtual_suspend_time - Inject virtual suspend time
> + * when requested by the kvm host.

If this is an attempt to provide a kernel-doc comment for this function,
then it's clearly a failed attempt and aside of that malformatted.

> + * This function should be called under irq context.

Why? There is no reason for being called from interrupt context and
nothing enforces it.

> + */
> +void timekeeping_inject_virtual_suspend_time(void)
> +{
> +	/*
> +	 * Only updates shadow_timekeeper so the change will be reflected
> +	 * on the next call of timekeeping_advance().

No. That's broken.

    timekeeping_inject_virtual_suspend_time();

    do_settimeofday() or do_adjtimex()

       timekeeping_update(tk, TK_MIRROR...);

and your change to the shadow timekeeper is gone.

Of course there is also no justification for this approach. What's wrong
with updating it right away?

> +	 */
> +	struct timekeeper *tk = &shadow_timekeeper;
> +	unsigned long flags;
> +	struct timespec64 delta;
> +	u64 suspend_time;

Please sort variables in reverse fir tree order and not randomly as you
see fit.

> +
> +	raw_spin_lock_irqsave(&timekeeper_lock, flags);
> +	suspend_time = kvm_get_suspend_time();
> +	if (suspend_time > tk->suspend_time_injected) {
> +		/*
> +		 * Do injection only if the time is not injected yet.
> +		 * suspend_time and tk->suspend_time_injected values are
> +		 * cummrative, so take a diff and inject the duration.

cummrative?

> +		 */
> +		delta = ns_to_timespec64(suspend_time - tk->suspend_time_injected);
> +		__timekeeping_inject_sleeptime(tk, &delta);
> +		tk->suspend_time_injected = suspend_time;

It's absolutely unclear how this storage and diff magic works and the
comment is not helping someone not familiar with the implementation of
kvm_get_suspend_time() and the related code at all. Please explain
non-obvious logic properly.

Thanks,

        tglx
diff mbox series

Patch

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 45463c65ea0a..760fe7f04170 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -822,6 +822,19 @@  config KVM_GUEST
 	  underlying device model, the host provides the guest with
 	  timing infrastructure such as time of day, and system time
 
+config KVM_VIRT_SUSPEND_TIMING_GUEST
+	bool "Virtual suspend time injection (guest side)"
+	depends on KVM_GUEST
+	default n
+	help
+	 This option makes the guest's clocks reflect the host's suspension.
+	 In other words, the guest's CLOCK_MONOTONIC will stop and
+	 CLOCK_BOOTTIME will keep running during the host's suspension.
+	 This feature will only be effective when both guest and host enable
+	 this option.
+
+	 If unsure, say N.
+
 config ARCH_CPUIDLE_HALTPOLL
 	def_bool n
 	prompt "Disable host haltpoll when loading haltpoll driver"
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index 1345088e9902..38f37c2a6063 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -671,6 +671,10 @@  DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_WAKEUP_VECTOR,	sysvec_kvm_posted_intr_wakeup
 DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_NESTED_VECTOR,	sysvec_kvm_posted_intr_nested_ipi);
 #endif
 
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING_GUEST
+DECLARE_IDTENTRY_SYSVEC(VIRT_SUSPEND_TIMING_VECTOR, sysvec_virtual_suspend_time);
+#endif
+
 #if IS_ENABLED(CONFIG_HYPERV)
 DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR,	sysvec_hyperv_callback);
 DECLARE_IDTENTRY_SYSVEC(HYPERV_REENLIGHTENMENT_VECTOR,	sysvec_hyperv_reenlightenment);
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 69299878b200..094023687c8b 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -16,6 +16,15 @@  static inline bool kvm_check_and_clear_guest_paused(void)
 }
 #endif /* CONFIG_KVM_GUEST */
 
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING_GUEST
+u64 kvm_get_suspend_time(void);
+#else
+static inline u64 kvm_get_suspend_time(void)
+{
+	return 0;
+}
+#endif /* CONFIG_KVM_VIRT_SUSPEND_TIMING_GUEST */
+
 #define KVM_HYPERCALL \
         ALTERNATIVE("vmcall", "vmmcall", X86_FEATURE_VMMCALL)
 
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index ad273e5861c1..1c92b54b1bce 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -16,11 +16,15 @@ 
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/set_memory.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
 
 #include <asm/hypervisor.h>
 #include <asm/mem_encrypt.h>
 #include <asm/x86_init.h>
 #include <asm/kvmclock.h>
+#include <asm/desc.h>
+#include <asm/idtentry.h>
 
 static int kvmclock __initdata = 1;
 static int kvmclock_vsyscall __initdata = 1;
@@ -48,6 +52,9 @@  early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
 
 static struct pvclock_vsyscall_time_info
 			hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __bss_decrypted __aligned(PAGE_SIZE);
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING_GUEST
+static struct kvm_suspend_time suspend_time __bss_decrypted;
+#endif
 static struct pvclock_wall_clock wall_clock __bss_decrypted;
 static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
 static struct pvclock_vsyscall_time_info *hvclock_mem;
@@ -163,6 +170,18 @@  static int kvm_cs_enable(struct clocksource *cs)
 	return 0;
 }
 
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING_GUEST
+/*
+ * kvm_get_suspend_time - Return the total time, in nanoseconds, that the
+ * host spent in a suspend state while this guest was running.
+ * (Not the duration of the last host suspension but the cumulative time.)
+ */
+u64 kvm_get_suspend_time(void)
+{
+	return suspend_time.suspend_time_ns;
+}
+#endif /* CONFIG_KVM_VIRT_SUSPEND_TIMING_GUEST */
+
 struct clocksource kvm_clock = {
 	.name	= "kvm-clock",
 	.read	= kvm_clock_get_cycles,
@@ -290,6 +309,18 @@  static int kvmclock_setup_percpu(unsigned int cpu)
 	return p ? 0 : -ENOMEM;
 }
 
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING_GUEST
+void timekeeping_inject_virtual_suspend_time(void);
+DEFINE_IDTENTRY_SYSVEC(sysvec_virtual_suspend_time)
+{
+	struct pt_regs *old_regs = set_irq_regs(regs);
+
+	timekeeping_inject_virtual_suspend_time();
+
+	set_irq_regs(old_regs);
+}
+#endif
+
 void __init kvmclock_init(void)
 {
 	u8 flags;
@@ -304,6 +335,15 @@  void __init kvmclock_init(void)
 		return;
 	}
 
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING_GUEST
+	if (kvm_para_has_feature(KVM_FEATURE_HOST_SUSPEND_TIME)) {
+		alloc_intr_gate(VIRT_SUSPEND_TIMING_VECTOR, asm_sysvec_virtual_suspend_time);
+		/* Register the suspend time structure */
+		wrmsrl(MSR_KVM_HOST_SUSPEND_TIME,
+		       slow_virt_to_phys(&suspend_time) | KVM_MSR_ENABLED);
+	}
+#endif
+
 	if (cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "kvmclock:setup_percpu",
 			      kvmclock_setup_percpu, NULL) < 0) {
 		return;
diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h
index 84ff2844df2a..a5fd515f0a9d 100644
--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -124,6 +124,10 @@  struct timekeeper {
 	u32			ntp_err_mult;
 	/* Flag used to avoid updating NTP twice with same second */
 	u32			skip_second_overflow;
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING_GUEST
+	/* suspend_time_injected keeps the duration injected through kvm */
+	u64			suspend_time_injected;
+#endif
 #ifdef CONFIG_DEBUG_TIMEKEEPING
 	long			last_warning;
 	/*
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 3ac3fb479981..424c61d38646 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -2125,6 +2125,39 @@  static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset,
 	return offset;
 }
 
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING_GUEST
+/*
+ * timekeeping_inject_virtual_suspend_time - Inject virtual suspend time
+ * when requested by the kvm host.
+ * This function should be called under irq context.
+ */
+void timekeeping_inject_virtual_suspend_time(void)
+{
+	/*
+	 * Only updates shadow_timekeeper so the change will be reflected
+	 * on the next call of timekeeping_advance().
+	 */
+	struct timekeeper *tk = &shadow_timekeeper;
+	unsigned long flags;
+	struct timespec64 delta;
+	u64 suspend_time;
+
+	raw_spin_lock_irqsave(&timekeeper_lock, flags);
+	suspend_time = kvm_get_suspend_time();
+	if (suspend_time > tk->suspend_time_injected) {
+		/*
+		 * Do injection only if the time is not injected yet.
+		 * suspend_time and tk->suspend_time_injected values are
+		 * cummrative, so take a diff and inject the duration.
+		 */
+		delta = ns_to_timespec64(suspend_time - tk->suspend_time_injected);
+		__timekeeping_inject_sleeptime(tk, &delta);
+		tk->suspend_time_injected = suspend_time;
+	}
+	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+}
+#endif
+
 /*
  * timekeeping_advance - Updates the timekeeper to the current time and
  * current NTP tick length