From patchwork Tue Dec 15 04:08:33 2009
X-Patchwork-Submitter: Zachary Amsden
X-Patchwork-Id: 67479
From: Zachary Amsden
To: kvm@vger.kernel.org
Cc: Zachary Amsden, Avi Kivity, Marcelo Tosatti, Joerg Roedel,
	linux-kernel@vger.kernel.org, Dor Laor
Subject: [PATCH RFC: kvm tsc virtualization 06/20] Make TSC reference stable
	across frequency changes
Date: Mon, 14 Dec 2009 18:08:33 -1000
Message-Id: <1260850127-9766-7-git-send-email-zamsden@redhat.com>
In-Reply-To: <1260850127-9766-6-git-send-email-zamsden@redhat.com>
References: <1260850127-9766-1-git-send-email-zamsden@redhat.com>
	<1260850127-9766-2-git-send-email-zamsden@redhat.com>
	<1260850127-9766-3-git-send-email-zamsden@redhat.com>
	<1260850127-9766-4-git-send-email-zamsden@redhat.com>
	<1260850127-9766-5-git-send-email-zamsden@redhat.com>
	<1260850127-9766-6-git-send-email-zamsden@redhat.com>
Organization: Frobozz Magic Timekeeping Company
X-Mailing-List: kvm@vger.kernel.org

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4c4d2e0..144a820 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -741,9 +741,15 @@ static DEFINE_PER_CPU(unsigned long, cpu_tsc_multiplier);
 static DEFINE_PER_CPU(int, cpu_tsc_shift);
 static DEFINE_PER_CPU(s64, cpu_tsc_offset);
 static DEFINE_PER_CPU(u64, cpu_tsc_measure_base);
+static DEFINE_PER_CPU(atomic_t, cpu_tsc_synchronized);
 static int tsc_base_cpu = -1;
 static unsigned long ref_tsc_khz;
 
+static inline int cpu_is_tsc_synchronized(int cpu)
+{
+	return (atomic_read(&per_cpu(cpu_tsc_synchronized, cpu)) != 0);
+}
+
 static inline unsigned long div_precise(unsigned long hi, unsigned long lo,
 	unsigned long divisor, unsigned long *rptr)
 {
@@ -923,6 +929,7 @@ static void kvm_sync_tsc(void *cpup)
 		accumulator -= delta[i+SYNC_TRIES];
 		accumulator = accumulator / (SYNC_TRIES*2-12);
 		per_cpu(cpu_tsc_offset, new_cpu) = accumulator;
+		atomic_set(&per_cpu(cpu_tsc_synchronized, new_cpu), 1);
 		pr_debug("%s: OUT, cpu = %d, cpu_tsc_offset = %lld, cpu_tsc_multiplier=%ld, cpu_tsc_shift=%d\n", __func__, raw_smp_processor_id(), per_cpu(cpu_tsc_offset, new_cpu), per_cpu(cpu_tsc_multiplier, new_cpu), per_cpu(cpu_tsc_shift, new_cpu));
 	}
 	local_irq_restore(flags);
@@ -931,6 +938,11 @@ static void kvm_sync_tsc(void *cpup)
 static void kvm_do_sync_tsc(int cpu)
 {
 	spin_lock(&kvm_tsc_lock);
+
+	/* tsc_base_cpu can change without tsc_lock, so recheck */
+	if (unlikely(cpu == tsc_base_cpu))
+		goto out_unlock;
+
 	if (raw_smp_processor_id() != tsc_base_cpu) {
 		smp_call_function_single(tsc_base_cpu, kvm_sync_tsc,
 					 (void *)&cpu, 0);
@@ -940,6 +952,8 @@ static void kvm_do_sync_tsc(int cpu)
 		smp_call_function_single(tsc_base_cpu, kvm_sync_tsc,
 					 (void *)&cpu, 1);
 	}
+
+out_unlock:
 	spin_unlock(&kvm_tsc_lock);
 }
 
@@ -1656,7 +1670,6 @@ out:
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
 	kvm_x86_ops->vcpu_load(vcpu, cpu);
-	BUG_ON(per_cpu(cpu_tsc_khz, cpu) == 0);
 	kvm_request_guest_time_update(vcpu);
 }
 
@@ -3461,11 +3474,46 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
 
-static void bounce_off(void *info)
+static void evict(void *info)
 {
 	/* nothing */
 }
 
+static struct execute_work resync_work;
+static int work_scheduled;
+
+static void resync_user(struct work_struct *work)
+{
+	int cpu;
+
+	work_scheduled = 0;
+	for_each_online_cpu(cpu)
+		if (cpu != tsc_base_cpu)
+			kvm_do_sync_tsc(cpu);
+}
+
+static void resync(void *info)
+{
+	int cpu;
+	u64 tsc;
+
+	/* Fixup our own values to stay in sync with the reference */
+	cpu = raw_smp_processor_id();
+	tsc = compute_ref_tsc(cpu);
+	per_cpu(cpu_tsc_measure_base, cpu) = native_read_tsc();
+	per_cpu(cpu_tsc_offset, cpu) = tsc;
+	compute_best_multiplier(ref_tsc_khz, per_cpu(cpu_tsc_khz, cpu),
+		&per_cpu(cpu_tsc_multiplier, cpu),
+		&per_cpu(cpu_tsc_shift, cpu));
+	atomic_set(&per_cpu(cpu_tsc_synchronized, cpu), 1);
+
+	/* Then, get everybody else on board */
+	if (!work_scheduled) {
+		work_scheduled = 1;
+		execute_in_process_context(resync_user, &resync_work);
+	}
+}
+
 static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
 				     void *data)
 {
@@ -3474,39 +3522,68 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
 	struct kvm_vcpu *vcpu;
 	int i, send_ipi = 0;
 
-	if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
-		return 0;
-	if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
-		return 0;
-	per_cpu(cpu_tsc_khz, freq->cpu) = freq->new;
-
-	spin_lock(&kvm_lock);
-	list_for_each_entry(kvm, &vm_list, vm_list) {
-		kvm_for_each_vcpu(i, vcpu, kvm) {
-			if (vcpu->cpu != freq->cpu)
-				continue;
-			if (!kvm_request_guest_time_update(vcpu))
-				continue;
-			if (vcpu->cpu != smp_processor_id())
-				send_ipi++;
+	/*
+	 * There is no way to precisely know the TSC value at which time the
+	 * CPU frequency actually changed, and these callbacks may happen at
+	 * different times, thus there will be drift in the reference TSC
+	 * clock across all CPUs.  To avoid this problem, we forcibly evict
+	 * any CPUs which may be running in hardware virtualization.
+	 *
+	 * We do this by setting cpu_tsc_synchronized to zero and polling for
+	 * this value to change when entering hardware virtualization.
+	 */
+	if (val == CPUFREQ_PRECHANGE) {
+		get_online_cpus();
+		atomic_set(&per_cpu(cpu_tsc_synchronized, freq->cpu), 0);
+		spin_lock(&kvm_lock);
+		list_for_each_entry(kvm, &vm_list, vm_list) {
+			kvm_for_each_vcpu(i, vcpu, kvm) {
+				if (vcpu->cpu != freq->cpu)
+					continue;
+				if (vcpu->cpu != smp_processor_id())
+					send_ipi++;
+				kvm_request_guest_time_update(vcpu);
+			}
+		}
+		spin_unlock(&kvm_lock);
+		if (send_ipi) {
+			/*
+			 * In case we update the frequency for another cpu
+			 * (which might be in guest context) send an interrupt
+			 * to kick the cpu out of guest context.  Next time
+			 * guest context is entered kvmclock will be updated,
+			 * so the guest will not see stale values.
+			 */
+			smp_call_function_single(freq->cpu, evict, NULL, 1);
 		}
-	}
-	spin_unlock(&kvm_lock);
 
-	if (freq->old < freq->new && send_ipi) {
 		/*
-		 * We upscale the frequency.  Must make the guest
-		 * doesn't see old kvmclock values while running with
-		 * the new frequency, otherwise we risk the guest sees
-		 * time go backwards.
-		 *
-		 * In case we update the frequency for another cpu
-		 * (which might be in guest context) send an interrupt
-		 * to kick the cpu out of guest context.  Next time
-		 * guest context is entered kvmclock will be updated,
-		 * so the guest will not see stale values.
+		 * The update of the frequency can't happen while a VM
+		 * is running, nor can it happen during init when we can
+		 * race against the init code setting the first known freq.
+		 * Just use the vm_tsc_lock for a mutex.
 		 */
-		smp_call_function_single(freq->cpu, bounce_off, NULL, 1);
+		spin_lock(&kvm_tsc_lock);
+		per_cpu(cpu_tsc_khz, freq->cpu) = freq->new;
+		spin_unlock(&kvm_tsc_lock);
+
+		return 0;
+	}
+
+	/*
+	 * After the change, we must resynchronize affected CPUs.
+	 * If the master reference changes, that means all CPUs.
+	 * Note that some CPUs may be resynchronized twice, once
+	 * by the master, and once by themselves, depending on the
+	 * order in which this notifier is called; this is harmless.
+	 */
+	if (val == CPUFREQ_POSTCHANGE) {
+		if (freq->cpu == tsc_base_cpu)
+			smp_call_function_single(freq->cpu, resync, NULL, 1);
+		else if (cpu_is_tsc_synchronized(tsc_base_cpu))
+			/* Can't do this if base is not yet updated */
+			kvm_do_sync_tsc(freq->cpu);
+		put_online_cpus();
 	}
 	return 0;
 }
@@ -3536,8 +3613,7 @@ static int kvm_x86_cpu_hotplug(struct notifier_block *notifier,
 
 	case CPU_DYING:
 	case CPU_UP_CANCELED:
-		if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
-			per_cpu(cpu_tsc_khz, cpu) = 0;
+		atomic_set(&per_cpu(cpu_tsc_synchronized, cpu), 0);
 		break;
 
 	case CPU_ONLINE:
@@ -3590,6 +3666,12 @@ static void kvm_timer_init(void)
 		}
 	}
 #endif
+
+	/*
+	 * Register notifiers for both CPU add / remove and CPU frequency
+	 * change.  Must be careful to avoid subtle races, as frequency
+	 * can change at any time.
+	 */
 	register_cpu_notifier(&kvm_x86_cpu_notifier);
 	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
 		cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
@@ -3598,7 +3680,10 @@ static void kvm_timer_init(void)
 			unsigned long khz = cpufreq_get(cpu);
 			if (!khz)
 				khz = tsc_khz;
-			per_cpu(cpu_tsc_khz, cpu) = khz;
+			spin_lock(&kvm_tsc_lock);
+			if (!per_cpu(cpu_tsc_khz, cpu))
+				per_cpu(cpu_tsc_khz, cpu) = khz;
+			spin_unlock(&kvm_tsc_lock);
 		}
 	} else {
 		for_each_possible_cpu(cpu)
@@ -3606,12 +3691,7 @@ static void kvm_timer_init(void)
 	}
 	tsc_base_cpu = get_cpu();
 	ref_tsc_khz = per_cpu(cpu_tsc_khz, tsc_base_cpu);
-	per_cpu(cpu_tsc_multiplier, tsc_base_cpu) = 1;
-	per_cpu(cpu_tsc_shift, tsc_base_cpu) = 0;
-	per_cpu(cpu_tsc_offset, tsc_base_cpu) = 0;
-	for_each_online_cpu(cpu)
-		if (cpu != tsc_base_cpu)
-			kvm_do_sync_tsc(cpu);
+	resync(NULL);
 	put_cpu();
 }
 
@@ -4097,7 +4177,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	clear_bit(KVM_REQ_KICK, &vcpu->requests);
 	smp_mb__after_clear_bit();
 
-	if (vcpu->requests || need_resched() || signal_pending(current)) {
+	if (vcpu->requests || need_resched() || signal_pending(current) ||
+	    !cpu_is_tsc_synchronized(smp_processor_id())) {
 		set_bit(KVM_REQ_KICK, &vcpu->requests);
 		local_irq_enable();
 		preempt_enable();
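
For readers following the eviction logic: the PRECHANGE/POSTCHANGE handling
above boils down to clearing a per-cpu "synchronized" flag before the
frequency moves, kicking any vcpu that might be in guest mode, and refusing
to re-enter guest mode until the flag is set again.  The fragment below is a
stand-alone user-space sketch of that ordering; it is not part of the patch,
and names such as cpu_sync, enter_guest and change_frequency are invented
purely for the illustration.

/*
 * Stand-alone sketch of the evict-and-resynchronize ordering used by the
 * patch (user-space analogue, not kernel code): clear a per-cpu flag
 * before the frequency changes, update the clock parameters, then set
 * the flag again; the guest-entry path spins until the flag is set.
 */
#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

#define NR_CPUS 4

static atomic_int cpu_sync[NR_CPUS];	/* 1 = clock parameters are stable */
static atomic_long cpu_khz[NR_CPUS];	/* simulated per-cpu TSC frequency */

/* analogue of cpu_is_tsc_synchronized() */
static int cpu_is_synchronized(int cpu)
{
	return atomic_load(&cpu_sync[cpu]) != 0;
}

/*
 * Analogue of the check added to vcpu_enter_guest(): do not enter
 * "guest mode" while the clock parameters are being rewritten.
 */
static void enter_guest(int cpu)
{
	while (!cpu_is_synchronized(cpu))
		sched_yield();
	/* guest code would run here */
}

/* analogue of the PRECHANGE/POSTCHANGE notifier pair for one cpu */
static void change_frequency(int cpu, long new_khz)
{
	atomic_store(&cpu_sync[cpu], 0);	/* PRECHANGE: evict the cpu */
	atomic_store(&cpu_khz[cpu], new_khz);	/* frequency actually moves */
	/* ... recompute offset/multiplier/shift against the reference ... */
	atomic_store(&cpu_sync[cpu], 1);	/* POSTCHANGE: resynchronized */
}

static void *vcpu_thread(void *arg)
{
	int cpu = (int)(long)arg;
	int i;

	for (i = 0; i < 5; i++) {
		enter_guest(cpu);
		usleep(1000);
	}
	return NULL;
}

int main(void)
{
	pthread_t vcpu;
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		atomic_store(&cpu_khz[cpu], 2000000);
		atomic_store(&cpu_sync[cpu], 1);
	}

	pthread_create(&vcpu, NULL, vcpu_thread, (void *)0L);
	change_frequency(0, 1000000);	/* simulate a cpufreq transition */
	pthread_join(vcpu, NULL);
	printf("cpu0 now at %ld kHz\n", atomic_load(&cpu_khz[0]));
	return 0;
}

In the patch itself the flag is cpu_tsc_synchronized, the kick is the evict
IPI together with KVM_REQ_KICK, and the re-check happens in
vcpu_enter_guest() before interrupts are re-enabled.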