@@ -824,6 +824,19 @@ config KVM_GUEST
underlying device model, the host provides the guest with
timing infrastructure such as time of day, and system time
+config KVM_VIRT_SUSPEND_TIMING_GUEST
+ bool "Guest support for virtual suspend time injection"
+ depends on KVM_GUEST
+ default n
+ help
+ This option makes host suspension visible to the guest's clocks:
+ the guest's CLOCK_MONOTONIC stops while the host is suspended,
+ whereas CLOCK_BOOTTIME keeps advancing across the suspension.
+ This feature is only effective when both the guest and the host
+ support it. For the host side, see KVM_VIRT_SUSPEND_TIMING.
+
+ If unsure, say N.
+
config ARCH_CPUIDLE_HALTPOLL
def_bool n
prompt "Disable host haltpoll when loading haltpoll driver"
@@ -686,7 +686,7 @@ DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_xen_hvm_callback);
#endif
#ifdef CONFIG_KVM_GUEST
-DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_kvm_asyncpf_interrupt);
+DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_kvm_hv_callback);
#endif
#undef X86_TRAP_OTHER
@@ -20,4 +20,13 @@ static inline struct pvclock_vsyscall_time_info *this_cpu_hvclock(void)
return this_cpu_read(hv_clock_per_cpu);
}
+/*
+ * Cumulative host suspend time as seen by the guest. Without
+ * CONFIG_KVM_VIRT_SUSPEND_TIMING_GUEST a stub reports 0, so callers do
+ * not need their own #ifdef.
+ */
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING_GUEST
+u64 kvm_get_suspend_time(void);
+#else
+static inline u64 kvm_get_suspend_time(void) { return 0; }
+#endif /* CONFIG_KVM_VIRT_SUSPEND_TIMING_GUEST */
+
#endif /* _ASM_X86_KVM_CLOCK_H */
@@ -40,6 +40,7 @@
#include <asm/ptrace.h>
#include <asm/reboot.h>
#include <asm/svm.h>
+#include <asm/kvmclock.h>
DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled);
@@ -270,7 +271,7 @@ noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
return true;
}
-DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt)
+DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_hv_callback)
{
struct pt_regs *old_regs = set_irq_regs(regs);
u32 token;
@@ -286,6 +287,8 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt)
wrmsrl(MSR_KVM_ASYNC_PF_ACK, 1);
}
+ timekeeping_inject_virtual_suspend_time(kvm_get_suspend_time());
+
set_irq_regs(old_regs);
}
@@ -710,10 +713,13 @@ static void __init kvm_guest_init(void)
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
apic_set_eoi_write(kvm_guest_apic_eoi_write);
- if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) {
+ if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf)
static_branch_enable(&kvm_async_pf_enabled);
- alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_kvm_asyncpf_interrupt);
- }
+
+ if ((kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) ||
+ kvm_para_has_feature(KVM_FEATURE_HOST_SUSPEND_TIME))
+ alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR,
+ asm_sysvec_kvm_hv_callback);
#ifdef CONFIG_SMP
if (pv_tlb_flush_supported()) {
@@ -16,11 +16,15 @@
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/set_memory.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
#include <asm/hypervisor.h>
#include <asm/mem_encrypt.h>
#include <asm/x86_init.h>
#include <asm/kvmclock.h>
+#include <asm/desc.h>
+#include <asm/idtentry.h>
static int kvmclock __initdata = 1;
static int kvmclock_vsyscall __initdata = 1;
@@ -48,6 +52,9 @@ early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
static struct pvclock_vsyscall_time_info
hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __bss_decrypted __aligned(PAGE_SIZE);
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING_GUEST
+/*
+ * Suspend time record. Its physical address is written to
+ * MSR_KVM_HOST_SUSPEND_TIME by kvmclock_init() when the host advertises
+ * KVM_FEATURE_HOST_SUSPEND_TIME; kvm_get_suspend_time() reads its
+ * suspend_time_ns field.
+ */
+static struct kvm_suspend_time suspend_time __bss_decrypted;
+#endif
static struct pvclock_wall_clock wall_clock __bss_decrypted;
static struct pvclock_vsyscall_time_info *hvclock_mem;
DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
@@ -281,6 +288,17 @@ static int kvmclock_setup_percpu(unsigned int cpu)
return p ? 0 : -ENOMEM;
}
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING_GUEST
+/**
+ * kvm_get_suspend_time - duration of host suspend.
+ *
+ * Reads suspend_time_ns from the suspend_time structure whose address
+ * kvmclock_init() registers with the host through
+ * MSR_KVM_HOST_SUSPEND_TIME. The structure is not written by this file;
+ * presumably the host updates it asynchronously (confirm against the host
+ * side), so the field is fetched with a single READ_ONCE() load instead
+ * of a plain access that the compiler could tear or repeat.
+ *
+ * Return: Cumulative duration of host suspend in nanoseconds.
+ */
+u64 kvm_get_suspend_time(void)
+{
+	return READ_ONCE(suspend_time.suspend_time_ns);
+}
+#endif
+
void __init kvmclock_init(void)
{
u8 flags;
@@ -295,6 +313,14 @@ void __init kvmclock_init(void)
return;
}
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING_GUEST
+ if (kvm_para_has_feature(KVM_FEATURE_HOST_SUSPEND_TIME)) {
+ /* Register the suspend time structure */
+ wrmsrl(MSR_KVM_HOST_SUSPEND_TIME,
+ slow_virt_to_phys(&suspend_time) | KVM_MSR_ENABLED);
+ }
+#endif
+
if (cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "kvmclock:setup_percpu",
kvmclock_setup_percpu, NULL) < 0) {
return;
@@ -1509,7 +1509,7 @@ DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
* memory is swapped out). Note, the corresponding "page ready" event
* which is injected when the memory becomes available, is delivered via
* an interrupt mechanism and not a #PF exception
- * (see arch/x86/kernel/kvm.c: sysvec_kvm_asyncpf_interrupt()).
+ * (see arch/x86/kernel/kvm.c: sysvec_kvm_hv_callback()).
*
* We are relying on the interrupted context being sane (valid RSP,
* relevant locks not held, etc.), which is fine as long as the
@@ -68,6 +68,8 @@ struct tk_read_base {
* shifted nano seconds.
* @ntp_error_shift: Shift conversion between clock shifted nano seconds and
* ntp shifted nano seconds.
+ * @kvm_suspend_time: The cumulative duration of suspend injected through KVM
+ * in nano seconds.
* @last_warning: Warning ratelimiter (DEBUG_TIMEKEEPING)
* @underflow_seen: Underflow warning flag (DEBUG_TIMEKEEPING)
* @overflow_seen: Overflow warning flag (DEBUG_TIMEKEEPING)
@@ -124,6 +126,9 @@ struct timekeeper {
u32 ntp_err_mult;
/* Flag used to avoid updating NTP twice with same second */
u32 skip_second_overflow;
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING_GUEST
+ u64 kvm_suspend_time;
+#endif
#ifdef CONFIG_DEBUG_TIMEKEEPING
long last_warning;
/*
@@ -310,4 +310,8 @@ void read_persistent_wall_and_boot_offset(struct timespec64 *wall_clock,
extern int update_persistent_clock64(struct timespec64 now);
#endif
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING_GUEST
+void timekeeping_inject_virtual_suspend_time(u64 total_duration_ns);
+#else
+/*
+ * No-op stub: the KVM hypervisor callback handler calls this function
+ * unconditionally, so it must also resolve when
+ * CONFIG_KVM_VIRT_SUSPEND_TIMING_GUEST is disabled.
+ */
+static inline void timekeeping_inject_virtual_suspend_time(u64 total_duration_ns)
+{
+}
+#endif
+
#endif
@@ -2133,6 +2133,52 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset,
return offset;
}
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING_GUEST
+/**
+ * timekeeping_inject_virtual_suspend_time - Inject virtual suspend time
+ * when requested by the kvm host.
+ * @total_duration_ns: Cumulative suspend time in nanoseconds. Only the part
+ *                     that exceeds what was already injected
+ *                     (tk->kvm_suspend_time) is applied.
+ *
+ * Calling this with a value that is not larger than the amount already
+ * injected is a no-op apart from taking and releasing timekeeper_lock.
+ */
+void timekeeping_inject_virtual_suspend_time(u64 total_duration_ns)
+{
+	struct timekeeper *tk = &tk_core.timekeeper;
+	bool injected = false;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&timekeeper_lock, flags);
+	if (total_duration_ns > tk->kvm_suspend_time) {
+		/*
+		 * Do injection only if the time is not injected yet.
+		 * total_duration_ns and tk->kvm_suspend_time are both
+		 * cumulative, so the delta between them is the amount of
+		 * adjustment. For example, if the host suspends 2 times
+		 * while the guest is running and each suspend lasts 5
+		 * seconds, total_duration_ns will be 5 seconds at the first
+		 * injection and tk->kvm_suspend_time was initialized to zero,
+		 * so the adjustment injected here will be 5 - 0 = 5 seconds
+		 * and tk->kvm_suspend_time will be updated to 5 seconds.
+		 * On the second injection after the second resume,
+		 * total_duration_ns will be 10 seconds and
+		 * tk->kvm_suspend_time will be 5 seconds, so 10 - 5 = 5
+		 * seconds of suspend time will be injected again.
+		 */
+		struct timespec64 delta =
+			ns_to_timespec64(total_duration_ns -
+					 tk->kvm_suspend_time);
+
+		tk->kvm_suspend_time = total_duration_ns;
+
+		write_seqcount_begin(&tk_core.seq);
+		timekeeping_forward_now(tk);
+		__timekeeping_inject_sleeptime(tk, &delta);
+		timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
+		write_seqcount_end(&tk_core.seq);
+
+		injected = true;
+	}
+	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+
+	/*
+	 * Signal hrtimers about the time change. This is done only after
+	 * timekeeper_lock has been dropped so that no other subsystem is
+	 * called into while the lock is held with interrupts disabled.
+	 */
+	if (injected)
+		clock_was_set_delayed();
+}
+#endif
+
/*
* timekeeping_advance - Updates the timekeeper to the current time and
* current NTP tick length
Add guest side implementation of KVM virtual suspend time injection.

How it works from the guest's view:
- The guest is paused without going through the suspend/resume path in
  the guest kernel.
- Before resuming the execution of the guest's vCPUs, the host adjusts
  the hardware clock (and kvm_clock) to the time before the suspend.
- By this action, the guest's CLOCK_MONOTONIC behaves as expected (it
  stops during the host's suspension).
- The guest receives an IRQ from the host that notifies it about the
  suspend, which was otherwise invisible to the guest. In the handler,
  the guest can adjust its CLOCK_BOOTTIME to reflect the suspension.
- Now, CLOCK_BOOTTIME includes the time passed during the host's
  suspension.

Signed-off-by: Hikaru Nishida <hikalium@chromium.org>
---
Changes in v3:
- Reused HYPERVISOR_CALLBACK_VECTOR IRQ instead of adding a new one.
- Extracted arch-independent parts.

 arch/x86/Kconfig                    | 13 ++++++++
 arch/x86/include/asm/idtentry.h     |  2 +-
 arch/x86/include/asm/kvmclock.h     |  9 ++++++
 arch/x86/kernel/kvm.c               | 14 ++++++---
 arch/x86/kernel/kvmclock.c          | 26 ++++++++++++++++
 arch/x86/mm/fault.c                 |  2 +-
 include/linux/timekeeper_internal.h |  5 ++++
 include/linux/timekeeping.h         |  4 +++
 kernel/time/timekeeping.c           | 46 +++++++++++++++++++++++++++++
 9 files changed, 115 insertions(+), 6 deletions(-)