@@ -46,6 +46,8 @@
#include "sysemu/hw_accel.h"
#include "kvm-cpus.h"
#include "sysemu/dirtylimit.h"
+#include "hw/core/cpu.h"
+#include "migration/migration.h"
#include "hw/boards.h"
#include "monitor/stats.h"
@@ -2463,6 +2465,8 @@ static int kvm_init(MachineState *ms)
}
}
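+ /* Probe whether KVM supports dirty quota based throttling of vcpus. */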
+ s->dirty_quota_supported = kvm_vm_check_extension(s, KVM_CAP_DIRTY_QUOTA);
+
/*
* KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is not needed when dirty ring is
* enabled. More importantly, KVM_DIRTY_LOG_INITIALLY_SET will assume no
@@ -2808,6 +2812,88 @@ static void kvm_eat_signals(CPUState *cpu)
} while (sigismember(&chkset, SIG_IPI));
}
+static void handle_dirty_quota_sleep(int64_t sleep_time)
+{
+ /*
+ * Cap the sleep at DIRTY_QUOTA_MAX_THROTTLE of one interval so that the
+ * vcpu is never throttled completely.
+ */
+ sleep_time = MIN(sleep_time,
+ DIRTY_QUOTA_MAX_THROTTLE * DIRTY_QUOTA_INTERVAL_SIZE);
+ /* Convert sleep time from nanoseconds to microseconds. */
+ g_usleep(sleep_time / 1000);
+}
+
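+/*
+ * Handle a dirty quota exhausted exit: returns the time, in nanoseconds,
+ * the vcpu should sleep before retrying, or 0 once the vcpu has been
+ * granted a fresh quota (either from the common pool or for a new interval).
+ */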
+static uint64_t handle_dirty_quota_exhausted(
+ CPUState *cpu, const uint64_t count, const uint64_t quota)
+{
+ MigrationState *s = migrate_get_current();
+ uint64_t time_to_sleep;
+ int64_t unclaimed_quota;
+ int64_t dirty_quota_overflow = (count - quota);
+ uint64_t dirty_rate_limit = qatomic_read(&s->per_vcpu_dirty_rate_limit);
+ uint64_t new_quota = (dirty_rate_limit * DIRTY_QUOTA_INTERVAL_SIZE) /
+ NANOSECONDS_PER_SECOND;
+ uint64_t current_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
+
+ /* Penalize the vCPU if it dirtied more pages than it was allowed to. */
+ if (dirty_quota_overflow > 0) {
+ time_to_sleep = (dirty_quota_overflow * NANOSECONDS_PER_SECOND) /
+ dirty_rate_limit;
+ cpu->dirty_quota_expiry_time = current_time + time_to_sleep;
+ return time_to_sleep;
+ }
+
+ /*
+ * If the current dirty quota interval hasn't ended, try using common quota
+ * if it is available, else sleep.
+ */
+ current_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
+ if (current_time < cpu->dirty_quota_expiry_time) {
+ qemu_spin_lock(&s->common_dirty_quota_lock);
+ if (s->common_dirty_quota > 0) {
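+ /* Enforcement is loose: this claim may drive the common quota negative. */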
+ s->common_dirty_quota -= new_quota;
+ qemu_spin_unlock(&s->common_dirty_quota_lock);
+ cpu->kvm_run->dirty_quota = count + new_quota;
+ return 0;
+ }
+
+ qemu_spin_unlock(&s->common_dirty_quota_lock);
+ current_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
+ /* If common quota isn't available, sleep for the remaining interval. */
+ if (current_time < cpu->dirty_quota_expiry_time) {
+ time_to_sleep = cpu->dirty_quota_expiry_time - current_time;
+ return time_to_sleep;
+ }
+ }
+
+ /*
+ * This is a fresh dirty quota interval. If the vcpu has not claimed its
+ * quota for the previous intervals, add them to the common quota.
+ */
+ current_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
+ unclaimed_quota = (current_time - cpu->dirty_quota_expiry_time) *
+ dirty_rate_limit / NANOSECONDS_PER_SECOND;
+ qemu_spin_lock(&s->common_dirty_quota_lock);
+ s->common_dirty_quota += unclaimed_quota;
+ qemu_spin_unlock(&s->common_dirty_quota_lock);
+
+ /* Allocate the vcpu this new interval's dirty quota. */
+ cpu->kvm_run->dirty_quota = count + new_quota;
+ cpu->dirty_quota_expiry_time = current_time + DIRTY_QUOTA_INTERVAL_SIZE;
+ return 0;
+}
+
+static void handle_kvm_exit_dirty_quota_exhausted(CPUState *cpu,
+ const uint64_t count, const uint64_t quota)
+{
+ uint64_t time_to_sleep;
+
+ do {
+ time_to_sleep = handle_dirty_quota_exhausted(cpu, count, quota);
+ if (time_to_sleep > 0) {
+ handle_dirty_quota_sleep(time_to_sleep);
+ }
+ } while (time_to_sleep != 0);
+}
+
int kvm_cpu_exec(CPUState *cpu)
{
struct kvm_run *run = cpu->kvm_run;
@@ -2943,6 +3029,11 @@ int kvm_cpu_exec(CPUState *cpu)
dirtylimit_vcpu_execute(cpu);
ret = 0;
break;
+ case KVM_EXIT_DIRTY_QUOTA_EXHAUSTED:
+ handle_kvm_exit_dirty_quota_exhausted(cpu,
+ run->dirty_quota_exit.count, run->dirty_quota_exit.quota);
+ ret = 0;
+ break;
case KVM_EXIT_SYSTEM_EVENT:
switch (run->system_event.type) {
case KVM_SYSTEM_EVENT_SHUTDOWN:
@@ -3009,6 +3009,9 @@ bool ram_block_discard_is_disabled(void);
*/
bool ram_block_discard_is_required(void);
+/* Start/stop dirty quota based throttling of vcpus during live migration. */
+void dirty_quota_migration_start(void);
+void dirty_quota_migration_stop(void);
+
#endif
#endif
@@ -36,6 +36,9 @@
typedef int (*WriteCoreDumpFunction)(const void *buf, size_t size,
void *opaque);
+/* Length of a dirty quota interval, in nanoseconds (10 ms). */
+#define DIRTY_QUOTA_INTERVAL_SIZE 10000000
+/* Maximum fraction of an interval for which a vcpu may be put to sleep. */
+#define DIRTY_QUOTA_MAX_THROTTLE .99
+
/**
* SECTION:cpu
* @section_id: QEMU-cpu
@@ -443,6 +446,8 @@ struct CPUState {
/* track IOMMUs whose translations we've cached in the TCG TLB */
GArray *iommu_notifiers;
+
+ /* Time (ns) at which this vcpu's current dirty quota interval expires */
+ uint64_t dirty_quota_expiry_time;
};
typedef QTAILQ_HEAD(CPUTailQ, CPUState) CPUTailQ;
@@ -110,6 +110,7 @@ struct KVMState
struct KVMDirtyRingReaper reaper;
NotifyVmexitOption notify_vmexit;
uint32_t notify_window;
+ bool dirty_quota_supported; /* Whether KVM supports dirty quota or not */
};
void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
@@ -272,6 +272,7 @@ struct kvm_xen_exit {
#define KVM_EXIT_RISCV_SBI 35
#define KVM_EXIT_RISCV_CSR 36
#define KVM_EXIT_NOTIFY 37
+#define KVM_EXIT_DIRTY_QUOTA_EXHAUSTED 38
/* For KVM_EXIT_INTERNAL_ERROR */
/* Emulate instruction failed. */
@@ -508,6 +509,11 @@ struct kvm_run {
#define KVM_NOTIFY_CONTEXT_INVALID (1 << 0)
__u32 flags;
} notify;
+ /* KVM_EXIT_DIRTY_QUOTA_EXHAUSTED */
+ struct {
+ __u64 count;
+ __u64 quota;
+ } dirty_quota_exit;
/* Fix the size of the union. */
char padding[256];
};
@@ -529,6 +535,8 @@ struct kvm_run {
struct kvm_sync_regs regs;
char padding[SYNC_REGS_SIZE_BYTES];
} s;
+
+ __u64 dirty_quota;
};
/* for KVM_REGISTER_COALESCED_MMIO / KVM_UNREGISTER_COALESCED_MMIO */
@@ -1175,6 +1183,7 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_VM_DISABLE_NX_HUGE_PAGES 220
#define KVM_CAP_S390_ZPCI_OP 221
#define KVM_CAP_S390_CPU_TOPOLOGY 222
+#define KVM_CAP_DIRTY_QUOTA 224
#ifdef KVM_CAP_IRQ_ROUTING
@@ -61,6 +61,8 @@
#include "sysemu/cpus.h"
#include "yank_functions.h"
#include "sysemu/qtest.h"
+#include "hw/core/cpu.h"
+#include "sysemu/kvm_int.h"
#define MAX_THROTTLE (128 << 20) /* Migration transfer speed throttling */
@@ -3685,8 +3687,11 @@ static void migration_update_counters(MigrationState *s,
int64_t current_time)
{
uint64_t transferred, transferred_pages, time_spent;
+ uint64_t pages_transferred_since_last_update, time_spent_since_last_update;
uint64_t current_bytes; /* bytes transferred since the beginning */
double bandwidth;
+ CPUState *cpu;
+ uint32_t nr_cpus = 0;
if (current_time < s->iteration_start_time + BUFFER_DELAY) {
return;
@@ -3706,6 +3711,23 @@ static void migration_update_counters(MigrationState *s,
s->pages_per_second = (double) transferred_pages /
(((double) time_spent / 1000.0));
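+
+ /*
+ * Refresh the per-vcpu dirty rate limit from the page transfer rate
+ * observed since the last update, divided evenly among the vcpus.
+ */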
+ if (kvm_state->dirty_quota_supported) {
+ CPU_FOREACH(cpu) {
+ nr_cpus++;
+ }
+ pages_transferred_since_last_update = transferred_pages -
+ s->last_counters_update.transferred_pages;
+ time_spent_since_last_update = time_spent -
+ s->last_counters_update.time_spent;
+ qatomic_set(&s->per_vcpu_dirty_rate_limit,
+ ((double) pages_transferred_since_last_update) /
+ (((double) time_spent_since_last_update) / 1000.0) /
+ ((double) nr_cpus));
+
+ s->last_counters_update.transferred_pages = transferred_pages;
+ s->last_counters_update.time_spent = time_spent;
+ }
+
/*
* if we haven't sent anything, we don't want to
* recalculate. 10000 is a small enough number for our purposes
@@ -249,6 +249,15 @@ struct MigrationState {
uint64_t iteration_initial_bytes;
/* time at the start of current iteration */
int64_t iteration_start_time;
+
+ /* state related to last migration counters update */
+ struct {
+ /* time spent from the start of iteration till the last update */
+ uint64_t time_spent;
+ /* pages already sent in the current iteration till the last update */
+ uint64_t transferred_pages;
+ } last_counters_update;
+
/*
* The final stage happens when the remaining data is smaller than
* this threshold; it's calculated from the requested downtime and
@@ -373,6 +382,28 @@ struct MigrationState {
* This save hostname when out-going migration starts
*/
char *hostname;
+
+ /*
+ * Dirty quota throttling tries to limit the guest's dirty rate to a
+ * fraction of the network throughput; dirty_quota_throttle_ratio is the
+ * divisor that defines this fraction (2 means half the throughput).
+ */
+ double dirty_quota_throttle_ratio;
+
+ /*
+ * For dirty quota throttling, this is the limit on the dirty rate of each
+ * vcpu. In some cases the limit is enforced loosely to avoid
+ * over-throttling the vcpus.
+ */
+ uint64_t per_vcpu_dirty_rate_limit;
+
+ /*
+ * If a vcpu doesn't claim its dirty quota for a given dirty quota interval,
+ * the unclaimed quota is added to a common quota. The common dirty quota
+ * can be claimed by any vcpu that has already used up its individual dirty
+ * quota for the current dirty quota interval.
+ */
+ QemuSpin common_dirty_quota_lock;
+ int64_t common_dirty_quota;
};
void migrate_set_state(int *state, int old_state, int new_state);
@@ -12,6 +12,7 @@
* Contributions after 2012-01-13 are licensed under the terms of the
* GNU GPL, version 2 or (at your option) any later version.
*/
#include "qemu/osdep.h"
+#include <linux/kvm.h>
#include "qemu/log.h"
@@ -34,6 +35,10 @@
#include "hw/boards.h"
#include "migration/vmstate.h"
#include "exec/address-spaces.h"
+#include "hw/core/cpu.h"
+#include "exec/target_page.h"
+#include "migration/migration.h"
+#include "sysemu/kvm_int.h"
//#define DEBUG_UNASSIGNED
@@ -2869,6 +2874,46 @@ static unsigned int postponed_stop_flags;
static VMChangeStateEntry *vmstate_change;
static void memory_global_dirty_log_stop_postponed_run(void);
+static void init_vcpu_dirty_quota(CPUState *cpu, run_on_cpu_data arg)
+{
+ uint64_t current_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
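+
+ /*
+ * Seed the quota with a single page: the first page this vcpu dirties
+ * triggers a KVM_EXIT_DIRTY_QUOTA_EXHAUSTED exit, from which it gets its
+ * first real dirty quota interval.
+ */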
+ cpu->kvm_run->dirty_quota = 1;
+ cpu->dirty_quota_expiry_time = current_time;
+}
+
+void dirty_quota_migration_start(void)
+{
+ MigrationState *s = migrate_get_current();
+ /* Assume an initial network bandwidth of 1 Gbps (125 MB/s). */
+ double pages_per_second = (((double) 1e9) / 8.0) /
+ (double) qemu_target_page_size();
+ uint32_t nr_cpus = 0;
+ CPUState *cpu;
+
+ if (!kvm_state->dirty_quota_supported) {
+ return;
+ }
+
+ CPU_FOREACH(cpu) {
+ nr_cpus++;
+ }
+ /*
+ * Currently we are hardcoding this to 2. There are plans to allow the user
+ * to manually select this ratio.
+ */
+ s->dirty_quota_throttle_ratio = 2;
+ qatomic_set(&s->per_vcpu_dirty_rate_limit,
+ pages_per_second / s->dirty_quota_throttle_ratio / nr_cpus);
+
+ qemu_spin_init(&s->common_dirty_quota_lock);
+ qemu_spin_lock(&s->common_dirty_quota_lock);
+ s->common_dirty_quota = 0;
+ qemu_spin_unlock(&s->common_dirty_quota_lock);
+
+ CPU_FOREACH(cpu) {
+ run_on_cpu(cpu, init_vcpu_dirty_quota, RUN_ON_CPU_NULL);
+ }
+}
+
void memory_global_dirty_log_start(unsigned int flags)
{
unsigned int old_flags;
@@ -2891,6 +2936,7 @@ void memory_global_dirty_log_start(unsigned int flags)
trace_global_dirty_changed(global_dirty_tracking);
if (!old_flags) {
+ dirty_quota_migration_start();
MEMORY_LISTENER_CALL_GLOBAL(log_global_start, Forward);
memory_region_transaction_begin();
memory_region_update_pending = true;
@@ -2898,6 +2944,23 @@ void memory_global_dirty_log_start(unsigned int flags)
}
}
+static void reset_vcpu_dirty_quota(CPUState *cpu, run_on_cpu_data arg)
+{
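+ /* Clearing the quota disables dirty quota throttling for this vcpu. */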
+ cpu->kvm_run->dirty_quota = 0;
+}
+
+void dirty_quota_migration_stop(void)
+{
+ CPUState *cpu;
+
+ if (!kvm_state->dirty_quota_supported) {
+ return;
+ }
+
+ CPU_FOREACH(cpu) {
+ run_on_cpu(cpu, reset_vcpu_dirty_quota, RUN_ON_CPU_NULL);
+ }
+}
+
static void memory_global_dirty_log_do_stop(unsigned int flags)
{
assert(flags && !(flags & (~GLOBAL_DIRTY_MASK)));
@@ -2907,6 +2970,7 @@ static void memory_global_dirty_log_do_stop(unsigned int flags)
trace_global_dirty_changed(global_dirty_tracking);
if (!global_dirty_tracking) {
+ dirty_quota_migration_stop();
memory_region_transaction_begin();
memory_region_update_pending = true;
memory_region_transaction_commit();