diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -148,6 +148,10 @@ bool kvm_ioeventfd_any_length_allowed;
 bool kvm_msi_use_devid;
 static bool kvm_immediate_exit;
 static hwaddr kvm_max_slot_size = ~0;
+static QemuMutex kvm_run_mutex;
+static QemuCond kvm_run_cond;
+static QemuCond kvm_run_inhibit_cond;
+static int kvm_run_inhibited;
 
 static const KVMCapabilityInfo kvm_required_capabilites[] = {
     KVM_CAP_INFO(USER_MEMORY),
@@ -1121,6 +1125,57 @@ static void kvm_region_del(MemoryListener *listener,
     memory_region_unref(section->mr);
 }
 
+/*
+ * Certain updates (e.g., resizing memory regions) require temporarily removing
+ * kvm memory slots. Prevent any VCPU from faulting by ensuring that all VCPUs
+ * have left KVM_RUN and won't re-enter it until unblocked.
+ */
+static void kvm_run_inhibit_begin(void)
+{
+    CPUState *cpu;
+
+    atomic_inc(&kvm_run_inhibited);
+    while (true) {
+        bool any_in_kernel = false;
+
+        CPU_FOREACH(cpu) {
+            if (atomic_read(&cpu->in_kernel)) {
+                any_in_kernel = true;
+                qemu_cpu_kick(cpu);
+            }
+        }
+        if (!any_in_kernel) {
+            break;
+        }
+        qemu_mutex_lock(&kvm_run_mutex);
+        qemu_cond_wait(&kvm_run_inhibit_cond, &kvm_run_mutex);
+        qemu_mutex_unlock(&kvm_run_mutex);
+    }
+}
+
+static void kvm_run_inhibit_end(void)
+{
+    atomic_dec(&kvm_run_inhibited);
+    qemu_mutex_lock(&kvm_run_mutex);
+    qemu_cond_broadcast(&kvm_run_cond);
+    qemu_mutex_unlock(&kvm_run_mutex);
+}
+
+static void kvm_region_resize(MemoryListener *listener,
+                              MemoryRegionSection *section, Int128 new)
+{
+    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
+    MemoryRegionSection new_section = *section;
+
+    new_section.size = new;
+
+    /* Inhibit KVM_RUN while we temporarily remove slots. */
+    kvm_run_inhibit_begin();
+    kvm_set_phys_mem(kml, section, false);
+    kvm_set_phys_mem(kml, &new_section, true);
+    kvm_run_inhibit_end();
+}
+
 static void kvm_log_sync(MemoryListener *listener,
                          MemoryRegionSection *section)
 {
@@ -1239,6 +1294,7 @@ void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
 
     kml->listener.region_add = kvm_region_add;
     kml->listener.region_del = kvm_region_del;
+    kml->listener.region_resize = kvm_region_resize;
     kml->listener.log_start = kvm_log_start;
     kml->listener.log_stop = kvm_log_stop;
     kml->listener.log_sync = kvm_log_sync;
@@ -1884,6 +1940,9 @@ static int kvm_init(MachineState *ms)
     assert(TARGET_PAGE_SIZE <= qemu_real_host_page_size);
 
     s->sigmask_len = 8;
+    qemu_mutex_init(&kvm_run_mutex);
+    qemu_cond_init(&kvm_run_cond);
+    qemu_cond_init(&kvm_run_inhibit_cond);
 
 #ifdef KVM_CAP_SET_GUEST_DEBUG
     QTAILQ_INIT(&s->kvm_sw_breakpoints);
@@ -2294,6 +2353,29 @@ static void kvm_eat_signals(CPUState *cpu)
     } while (sigismember(&chkset, SIG_IPI));
 }
 
+static void kvm_set_cpu_in_kernel(CPUState *cpu, bool in_kernel)
+{
+    atomic_set(&cpu->in_kernel, in_kernel);
+    if (in_kernel) {
+        /* wait until KVM_RUN is no longer inhibited */
+        while (unlikely(atomic_read(&kvm_run_inhibited))) {
+            atomic_set(&cpu->in_kernel, false);
+            qemu_mutex_lock(&kvm_run_mutex);
+            qemu_cond_broadcast(&kvm_run_inhibit_cond);
+            qemu_cond_wait(&kvm_run_cond, &kvm_run_mutex);
+            qemu_mutex_unlock(&kvm_run_mutex);
+            atomic_set(&cpu->in_kernel, true);
+        }
+    } else {
+        /* wake up anybody waiting to inhibit KVM_RUN */
+        if (unlikely(atomic_read(&kvm_run_inhibited))) {
+            qemu_mutex_lock(&kvm_run_mutex);
+            qemu_cond_broadcast(&kvm_run_inhibit_cond);
+            qemu_mutex_unlock(&kvm_run_mutex);
+        }
+    }
+}
+
 int kvm_cpu_exec(CPUState *cpu)
 {
     struct kvm_run *run = cpu->kvm_run;
@@ -2318,6 +2400,9 @@ int kvm_cpu_exec(CPUState *cpu)
         }
 
         kvm_arch_pre_run(cpu, run);
+
+        kvm_set_cpu_in_kernel(cpu, true);
+
         if (atomic_read(&cpu->exit_request)) {
             DPRINTF("interrupt exit requested\n");
             /*
@@ -2335,6 +2420,8 @@ int kvm_cpu_exec(CPUState *cpu)
 
         run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0);
 
+        kvm_set_cpu_in_kernel(cpu, false);
+
         attrs = kvm_arch_post_run(cpu, run);
 
 #ifdef KVM_HAVE_MCE_INJECTION
diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -431,6 +431,9 @@ struct CPUState {
     /* shared by kvm, hax and hvf */
     bool vcpu_dirty;
 
+    /* kvm only for now: VCPU is executing in the kernel (KVM_RUN) */
+    bool in_kernel;
+
     /* Used to keep track of an outstanding cpu throttle thread for migration
      * autoconverge
      */
virtio-mem wants to resize (esp. grow) memory regions while the guest is
already aware of them and makes use of them. Resizing a KVM slot can
currently only be done by removing it and re-adding it. While a KVM slot
is temporarily removed, VCPUs that try to read from it will fault. Let's
inhibit KVM_RUN while performing the resize. Keep it lightweight by
remembering, via one bool per VCPU, whether the VCPU is executing in the
kernel.

Note1: Instead of implementing region_resize(), we could also inhibit in
begin() and let the VCPUs continue to run in commit(). This would also
handle atomic splitting of memory regions. (I remember a BUG report but
cannot dig up the mail.) However, using the region_resize() callback we
can later wire up an ioctl that performs the resize atomically, and make
the inhibit conditional. Also, this way we inhibit KVM_RUN only when
resizing - not on every address space change. This will not affect
existing RT workloads (resizes currently only happen during reboot or at
the start of an incoming migration).

Note2: We cannot use pause_all_vcpus()/resume_all_vcpus(), as that
temporarily drops the BQL - something most callers cannot deal with when
trying to resize a memory region.

Signed-off-by: David Hildenbrand <david@redhat.com>
---
 accel/kvm/kvm-all.c   | 87 +++++++++++++++++++++++++++++++++++++++++++
 include/hw/core/cpu.h |  3 ++
 2 files changed, 90 insertions(+)
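For illustration only (this sketch is not part of the patch): assuming the
memory-core patch of this series that routes size changes of mapped,
resizable RAM regions through the new region_resize() callback, the caller
side could look roughly like this. memory_region_ram_resize() and
error_report_err() are existing QEMU APIs; example_device_grow() is an
invented helper name.

/*
 * Sketch only: grow a RAM region created with
 * memory_region_init_resizeable_ram() while the guest may already be
 * using it. The memory core would then invoke each listener's
 * region_resize() callback (here: kvm_region_resize()), which inhibits
 * KVM_RUN around the slot remove + re-add, so no VCPU faults on the
 * temporarily missing slot.
 */
#include "qemu/osdep.h"
#include "exec/memory.h"
#include "qapi/error.h"

static void example_device_grow(MemoryRegion *mr, uint64_t new_size)
{
    Error *err = NULL;

    /* Called with the BQL held; no pause_all_vcpus() required. */
    memory_region_ram_resize(mr, new_size, &err);
    if (err) {
        error_report_err(err);
    }
}

Note that new_size can only vary up to the max_size that was passed to
memory_region_init_resizeable_ram() when the region was created.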