From patchwork Tue Dec 15 04:08:42 2009
From: Zachary Amsden <zamsden@redhat.com>
X-Patchwork-Id: 67467
To: kvm@vger.kernel.org
Cc: Zachary Amsden, Avi Kivity, Marcelo Tosatti, Joerg Roedel,
    linux-kernel@vger.kernel.org, Dor Laor
Subject: [PATCH RFC: kvm tsc virtualization 15/20] Fix longstanding races
Date: Mon, 14 Dec 2009 18:08:42 -1000
Message-Id: <1260850127-9766-16-git-send-email-zamsden@redhat.com>
Organization: Frobozz Magic Timekeeping Company

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7bc20ac..7e2ba3e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -747,6 +747,7 @@ struct cpu_tsc_vars
 	s64 tsc_offset;
 	u64 tsc_measure_base;
 	atomic_t tsc_synchronized;
+	u64 last_ref;
 };
 static DEFINE_PER_CPU(struct cpu_tsc_vars, cpu_tsc_vars);
 
@@ -3570,42 +3571,83 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
 
-static void evict(void *info)
+/*
+ * evict(struct cpufreq_freqs *data)
+ *
+ * This function is meant to be called from an IPI and runs with
+ * interrupts disabled.  It records the last good value of the
+ * reference TSC before optionally updating the CPU frequency and
+ * marking the CPU unsynchronized.  Since it is entered from an
+ * interrupt handler, it can be used to bring CPUs out of hardware
+ * virtualization and have them wait until their clock has been
+ * resynchronized.
+ */
+static void evict(void *data)
 {
-	/* nothing */
+	struct cpu_tsc_vars *cv = &__get_cpu_var(cpu_tsc_vars);
+	WARN_ON(!irqs_disabled());
+	cv->last_ref = compute_ref_tsc();
+	if (data) {
+		struct cpufreq_freqs *freq = data;
+		cv->tsc_khz = freq->new;
+	}
+	atomic_set(&cv->tsc_synchronized, 0);
 }
 
-static struct execute_work resync_work;
-static int work_scheduled;
-
-static void resync_user(struct work_struct *work)
+static long resync(void *unused)
 {
+	struct cpu_tsc_vars *cv = &__get_cpu_var(cpu_tsc_vars);
+	u64 tsc = 0;
 	int cpu;
 
-	work_scheduled = 0;
-	for_each_online_cpu(cpu)
-		if (cpu != tsc_base_cpu)
-			kvm_do_sync_tsc(cpu);
-}
+	/*
+	 * First, make sure we are on the right CPU; between when the work got
+	 * scheduled and when this runs, tsc_base_cpu could have changed.  We
+	 * lock out changes to tsc_base_cpu with get_cpu().  The cpu takedown
+	 * code can't run until all non-preempt sections are finished, so being
+	 * in a non-preempt section protects the base from changing.  Tricky.
+	 */
+	cpu = get_cpu();
+	if (cpu != tsc_base_cpu) {
+		put_cpu();
+		return -1;
+	}
 
-static void resync(void *info)
-{
-	struct cpu_tsc_vars *cv = &__get_cpu_var(cpu_tsc_vars);
-	u64 tsc;
+	/*
+	 * When the base CPU frequency changes independently of other
+	 * cores, the reference TSC can fall behind.  Rather than
+	 * introduce a complicated and fragile locking scheme for this
+	 * rare case, simply evict all CPUs from VM execution and take
+	 * the highest global reference clock.  This also allows us to
+	 * recover from skew events which slightly advance other clocks
+	 * relative to the base.
+	 */
+	on_each_cpu(evict, NULL, 1);
+	for_each_online_cpu(cpu)
+		if (per_cpu(cpu_tsc_vars, cpu).last_ref > tsc)
+			tsc = per_cpu(cpu_tsc_vars, cpu).last_ref;
 
 	/* Fixup our own values to stay in sync with the reference */
-	tsc = compute_ref_tsc();
 	cv->tsc_measure_base = native_read_tsc();
 	cv->tsc_offset = tsc;
+	cv->tsc_generation++;	/* XXX needed? */
 	compute_best_multiplier(ref_tsc_khz, cv->tsc_khz, &cv->tsc_multiplier,
 		&cv->tsc_shift);
 	atomic_set(&cv->tsc_synchronized, 1);
 
-	/* Then, get everybody else on board */
-	if (!work_scheduled) {
-		work_scheduled = 1;
-		execute_in_process_context(resync_user, &resync_work);
-	}
+	for_each_online_cpu(cpu)
+		kvm_do_sync_tsc(cpu);
+
+	put_cpu();
+	return 0;
+}
+
+static DEFINE_MUTEX(resync_lock);
+
+static void resync_all(void)
+{
+	mutex_lock(&resync_lock);
+	while (work_on_cpu(tsc_base_cpu, resync, NULL) != 0)
+		;	/* retry: the base CPU changed under us */
+	mutex_unlock(&resync_lock);
 }
 
 static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
@@ -3614,8 +3656,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
 	struct cpufreq_freqs *freq = data;
 	struct kvm *kvm;
 	struct kvm_vcpu *vcpu;
-	int i, send_ipi = 0;
-	struct cpu_tsc_vars *cv = &per_cpu(cpu_tsc_vars, freq->cpu);
+	int i;
 
 	/*
 	 * There is no way to precisely know the TSC value at which time the
@@ -3625,43 +3666,34 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
 	 * any CPUs which may be running in hardware virtualization.
 	 *
 	 * We do this by setting cpu_tsc_synchronized to zero and polling for
-	 * this value to change when entering hardware virtualization.
+	 * this value to change when entering hardware virtualization; this
+	 * will happen in the call to evict(), which also stores the new
+	 * frequency.
 	 */
 	if (val == CPUFREQ_PRECHANGE) {
-		get_online_cpus();
-		atomic_set(&cv->tsc_synchronized, 0);
 		spin_lock(&kvm_lock);
 		list_for_each_entry(kvm, &vm_list, vm_list) {
 			kvm_for_each_vcpu(i, vcpu, kvm) {
 				if (vcpu->cpu != freq->cpu)
 					continue;
-				if (vcpu->cpu != smp_processor_id())
-					send_ipi++;
 				kvm_request_guest_time_update(vcpu);
 			}
 		}
 		spin_unlock(&kvm_lock);
-		if (send_ipi) {
-			/*
-			 * In case we update the frequency for another cpu
-			 * (which might be in guest context) send an interrupt
-			 * to kick the cpu out of guest context.  Next time
-			 * guest context is entered kvmclock will be updated,
-			 * so the guest will not see stale values.
-			 */
-			smp_call_function_single(freq->cpu, evict, NULL, 1);
-		}
 		/*
-		 * The update of the frequency can't happen while a VM
-		 * is running, nor can it happen during init when we can
-		 * race against the init code setting the first known freq.
-		 * Just use the vm_tsc_lock for a mutex.
+		 * In case we update the frequency for another cpu (which might
+		 * be in guest context) we must send an interrupt to kick the
+		 * cpu out of guest context.  Next time guest context is
+		 * entered kvmclock will be updated, so the guest will not see
+		 * stale values.
+		 *
+		 * The update of the frequency can't happen while a VM is
+		 * running, nor can it happen while a resync might be running.
+		 * To avoid the locking problems which arise otherwise, always
+		 * update the tsc_khz value on the affected processor; this is
+		 * done by passing the freq data to evict().
 		 */
-		spin_lock(&kvm_tsc_lock);
-		cv->tsc_khz = freq->new;
-		spin_unlock(&kvm_tsc_lock);
-
+		smp_call_function_single(freq->cpu, evict, data, 1);
 		return 0;
 	}
 
@@ -3673,8 +3705,9 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
 	 * order in which this notifier is called; this is harmless.
 	 */
 	if (val == CPUFREQ_POSTCHANGE) {
+		get_online_cpus();
 		if (freq->cpu == tsc_base_cpu)
-			smp_call_function_single(freq->cpu, resync, NULL, 1);
+			resync_all();
 		else if (cpu_is_tsc_synchronized(tsc_base_cpu))
 			/* Can't do this if base is not yet updated */
 			kvm_do_sync_tsc(freq->cpu);
@@ -3694,19 +3727,17 @@ static int kvm_x86_cpu_hotplug(struct notifier_block *notifier,
 	val &= ~CPU_TASKS_FROZEN;
 	switch (val) {
-	case CPU_DOWN_PREPARE:
+	case CPU_DYING:
 		if (cpu == tsc_base_cpu) {
 			int new_cpu;
-			spin_lock(&kvm_tsc_lock);
 			for_each_online_cpu(new_cpu)
 				if (new_cpu != tsc_base_cpu)
 					break;
 			tsc_base_cpu = new_cpu;
-			spin_unlock(&kvm_tsc_lock);
 		}
 		break;
-	case CPU_DYING:
+	case CPU_DOWN_PREPARE:
 	case CPU_UP_CANCELED:
 		atomic_set(&per_cpu(cpu_tsc_vars, cpu).tsc_synchronized, 0);
 		break;
@@ -3784,18 +3815,18 @@ static void kvm_timer_init(void)
 			unsigned long khz = cpufreq_get(cpu);
 			if (!khz)
 				khz = tsc_khz;
-			spin_lock(&kvm_tsc_lock);
+			local_irq_disable();
 			if (!per_cpu(cpu_tsc_vars, cpu).tsc_khz)
 				per_cpu(cpu_tsc_vars, cpu).tsc_khz = khz;
-			spin_unlock(&kvm_tsc_lock);
+			local_irq_enable();
 		}
 	} else {
 		for_each_possible_cpu(cpu)
 			per_cpu(cpu_tsc_vars, cpu).tsc_khz = tsc_khz;
 	}
 	tsc_base_cpu = get_cpu();
-	resync(NULL);
 	put_cpu();
+	resync_all();
 }
 
 int kvm_arch_init(void *opaque)
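
A note for readers following the new resync() flow above: evict() snapshots a
per-CPU last_ref from an IPI, and resync() then adopts the maximum of all
snapshots as the new reference base, so the reference clock never appears to
move backward on any CPU.  Below is a minimal userspace sketch of just that
rebase step; NR_CPUS, the trimmed-down cpu_tsc_vars, and the rebase() helper
are illustrative stand-ins, not the kernel code.

#include <stdint.h>
#include <stdio.h>

#define NR_CPUS 4

/* Trimmed stand-in for the per-CPU state in the patch. */
struct cpu_tsc_vars {
        uint64_t last_ref;      /* reference TSC recorded by evict() */
};

static struct cpu_tsc_vars cpu_tsc_vars[NR_CPUS];

/*
 * Model of resync()'s rebase step: after evict() has run everywhere,
 * adopt the highest recorded reference TSC so that no CPU's clock can
 * appear to move backward.
 */
static uint64_t rebase(void)
{
        uint64_t tsc = 0;
        int cpu;

        for (cpu = 0; cpu < NR_CPUS; cpu++)
                if (cpu_tsc_vars[cpu].last_ref > tsc)
                        tsc = cpu_tsc_vars[cpu].last_ref;
        return tsc;
}

int main(void)
{
        /* Pretend evict() ran and CPU 2 had drifted slightly ahead. */
        uint64_t samples[NR_CPUS] = { 1000, 1002, 1010, 1001 };
        int cpu;

        for (cpu = 0; cpu < NR_CPUS; cpu++)
                cpu_tsc_vars[cpu].last_ref = samples[cpu];

        printf("new reference base: %llu\n",
               (unsigned long long)rebase());  /* prints 1010 */
        return 0;
}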
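
The rebase relies on compute_best_multiplier(ref_tsc_khz, cv->tsc_khz, ...) to
convert local TSC deltas into reference units; that helper is introduced
earlier in this series and is not shown in this patch.  The usual shape of
such a conversion is a mult/shift pair chosen so that (delta * mult) >> shift
approximates delta * ref_khz / local_khz in pure integer arithmetic.  The
sketch below is one plausible construction, not the series' actual code;
best_multiplier() and scale_delta() are made-up names, and it assumes the
frequency ratio fits a 32-bit multiplier and that delta * mult does not
overflow 64 bits.

#include <stdint.h>
#include <stdio.h>

/*
 * Pick mult/shift so that (delta * mult) >> shift ~= delta * a / b.
 * Illustrative only; assumes a/b fits a 32-bit multiplier and b != 0.
 */
static void best_multiplier(uint64_t a, uint64_t b,
                            uint64_t *mult, int *shift)
{
        int s = 0;

        /* Raise the shift while the multiplier still fits in 32 bits. */
        while (s < 31 && ((a << (s + 1)) / b) < (1ull << 32))
                s++;
        *mult = (a << s) / b;
        *shift = s;
}

static uint64_t scale_delta(uint64_t delta, uint64_t mult, int shift)
{
        /* Real implementations guard this multiply against overflow. */
        return (delta * mult) >> shift;
}

int main(void)
{
        uint64_t mult;
        int shift;

        /* Scale one second of a 2.0 GHz TSC to a 1.0 GHz reference. */
        best_multiplier(1000000, 2000000, &mult, &shift);
        printf("mult=%llu shift=%d scaled=%llu\n",
               (unsigned long long)mult, shift,
               (unsigned long long)scale_delta(2000000000ull, mult, shift));
        /* Expect scaled=1000000000: one second at 1 GHz. */
        return 0;
}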
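
Finally, the eviction handshake the notifier comment describes (clear
tsc_synchronized from the IPI, then have the VM-entry path poll until resync()
sets it again) can be modeled in userspace with a thread standing in for a
vcpu and a C11 atomic standing in for the per-CPU flag.  Again a sketch of the
idea, not the kernel code; build it with -pthread and note that the vcpu
thread prints only after the "resync" store releases it.

#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static atomic_int tsc_synchronized;     /* 0: evicted, 1: clock is good */

static void *vcpu_entry(void *unused)
{
        /* Poll, as the patch describes, until the resync path marks
         * this "CPU" synchronized again. */
        while (!atomic_load(&tsc_synchronized))
                sched_yield();
        puts("vcpu: clock resynchronized, entering guest");
        return NULL;
}

int main(void)
{
        pthread_t t;

        atomic_store(&tsc_synchronized, 0);     /* evict(): unsynchronized */
        pthread_create(&t, NULL, vcpu_entry, NULL);
        usleep(10000);                          /* frequency change settles */
        atomic_store(&tsc_synchronized, 1);     /* resync(): clock rebased */
        pthread_join(t, NULL);
        return 0;
}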