| Message ID | 20240403140116.3002809-3-vineeth@bitbyteword.org (mailing list archive) |
|---|---|
| State | New, archived |
| Series | Paravirt Scheduling (Dynamic vcpu priority management) |
Adding sched_ext folks

On Wed, Apr 3, 2024 at 10:01 AM Vineeth Pillai (Google)
<vineeth@bitbyteword.org> wrote:
>
> kvm uses the kernel's paravirt sched framework to assign an available
> pvsched driver for a guest. Guest vcpus register with the pvsched
> driver and call into the driver callbacks to notify it of the events
> it is interested in.
>
> This PoC doesn't do the callback on interrupt injection yet; that will
> be implemented in subsequent iterations.
>
> Signed-off-by: Vineeth Pillai (Google) <vineeth@bitbyteword.org>
> Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> ---
>  arch/x86/kvm/Kconfig     |  13 ++++
>  arch/x86/kvm/x86.c       |   3 +
>  include/linux/kvm_host.h |  32 +++++++++
>  virt/kvm/kvm_main.c      | 148 +++++++++++++++++++++++++++++++++++++++
>  4 files changed, 196 insertions(+)
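For context, the ops table that KVM calls into here is supplied by a pvsched driver registered with the kernel's paravirt sched framework (added earlier in this series, not shown in this patch). A minimal driver sketch follows. The struct pvsched_vcpu_ops callback names, the .events mask, and the PVSCHED_VCPU_* event bits are taken from the calls this patch makes; the .name and .owner fields and the pvsched_register_vcpu_ops()/pvsched_unregister_vcpu_ops() entry points are assumptions about the framework, not confirmed by this patch.

```c
/*
 * Minimal sketch of a pvsched driver, assuming the pvsched framework
 * introduced earlier in this series. Callback names and the .events
 * mask are inferred from this patch; .name, .owner, and the
 * registration helpers are assumptions.
 */
#include <linux/module.h>
#include <linux/pid.h>
#include <linux/pvsched.h>

static void demo_pvsched_vcpu_register(struct pid *pid)
{
        /* Set up per-vCPU state keyed by the vCPU thread's struct pid. */
}

static void demo_pvsched_vcpu_unregister(struct pid *pid)
{
        /* Tear down the per-vCPU state. */
}

static void demo_pvsched_vcpu_notify_event(void *addr, struct pid *pid,
                                           u32 events)
{
        /*
         * addr is the guest-allocated shared memory; this PoC still
         * passes NULL (see the TODO in __vcpu_pvsched_callback()).
         */
        if (events & PVSCHED_VCPU_HALT) {
                /* e.g. deboost the vCPU thread while the guest halts */
        }
}

static struct pvsched_vcpu_ops demo_ops = {
        .name                      = "demo", /* looked up via pvsched_get_vcpu_ops() */
        .events                    = PVSCHED_VCPU_VMENTER | PVSCHED_VCPU_VMEXIT |
                                     PVSCHED_VCPU_HALT,
        .pvsched_vcpu_register     = demo_pvsched_vcpu_register,
        .pvsched_vcpu_unregister   = demo_pvsched_vcpu_unregister,
        .pvsched_vcpu_notify_event = demo_pvsched_vcpu_notify_event,
        .owner                     = THIS_MODULE, /* assumed field */
};

static int __init demo_pvsched_init(void)
{
        /* Assumed registration entry point in the pvsched framework. */
        return pvsched_register_vcpu_ops(&demo_ops);
}

static void __exit demo_pvsched_exit(void)
{
        pvsched_unregister_vcpu_ops(&demo_ops); /* assumed counterpart */
}

module_init(demo_pvsched_init);
module_exit(demo_pvsched_exit);
MODULE_LICENSE("GPL");
```

KVM only invokes pvsched_vcpu_notify_event() for events the driver declared interest in, per the ops->events & events check in __vcpu_pvsched_callback() in the patch below.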
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 65ed14b6540b..c1776cdb5b65 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -189,4 +189,17 @@ config KVM_MAX_NR_VCPUS
 	  the memory footprint of each KVM guest, regardless of how many vCPUs are
 	  created for a given VM.
 
+config PARAVIRT_SCHED_KVM
+	bool "Enable paravirt scheduling capability for kvm"
+	depends on KVM
+	default n
+	help
+	  Paravirtualized scheduling facilitates the exchange of scheduling
+	  related information between the host and guest through shared memory,
+	  enhancing the efficiency of vCPU thread scheduling by the hypervisor.
+	  An illustrative use case involves dynamically boosting the priority of
+	  a vCPU thread when the guest is executing a latency-sensitive workload
+	  on that specific vCPU.
+	  This config enables paravirt scheduling in the kvm hypervisor.
+
 endif # VIRTUALIZATION
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ffe580169c93..d0abc2c64d47 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10896,6 +10896,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
 	preempt_disable();
 
+	kvm_vcpu_pvsched_notify(vcpu, PVSCHED_VCPU_VMENTER);
+
 	static_call(kvm_x86_prepare_switch_to_guest)(vcpu);
 
 	/*
@@ -11059,6 +11061,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	guest_timing_exit_irqoff();
 
 	local_irq_enable();
+	kvm_vcpu_pvsched_notify(vcpu, PVSCHED_VCPU_VMEXIT);
 	preempt_enable();
 
 	kvm_vcpu_srcu_read_lock(vcpu);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 179df96b20f8..6381569f3de8 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -45,6 +45,8 @@
 #include <asm/kvm_host.h>
 #include <linux/kvm_dirty_ring.h>
 
+#include <linux/pvsched.h>
+
 #ifndef KVM_MAX_VCPU_IDS
 #define KVM_MAX_VCPU_IDS KVM_MAX_VCPUS
 #endif
@@ -832,6 +834,11 @@ struct kvm {
 	bool vm_bugged;
 	bool vm_dead;
 
+#ifdef CONFIG_PARAVIRT_SCHED_KVM
+	spinlock_t pvsched_ops_lock;
+	struct pvsched_vcpu_ops __rcu *pvsched_ops;
+#endif
+
 #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
 	struct notifier_block pm_notifier;
 #endif
@@ -2413,4 +2420,29 @@ static inline int kvm_gmem_get_pfn(struct kvm *kvm,
 }
 #endif /* CONFIG_KVM_PRIVATE_MEM */
 
+#ifdef CONFIG_PARAVIRT_SCHED_KVM
+int kvm_vcpu_pvsched_notify(struct kvm_vcpu *vcpu, u32 events);
+int kvm_vcpu_pvsched_register(struct kvm_vcpu *vcpu);
+void kvm_vcpu_pvsched_unregister(struct kvm_vcpu *vcpu);
+
+int kvm_replace_pvsched_ops(struct kvm *kvm, char *name);
+#else
+static inline int kvm_vcpu_pvsched_notify(struct kvm_vcpu *vcpu, u32 events)
+{
+	return 0;
+}
+static inline int kvm_vcpu_pvsched_register(struct kvm_vcpu *vcpu)
+{
+	return 0;
+}
+static inline void kvm_vcpu_pvsched_unregister(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline int kvm_replace_pvsched_ops(struct kvm *kvm, char *name)
+{
+	return 0;
+}
+#endif
+
 #endif
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 0f50960b0e3a..0546814e4db7 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -170,6 +170,142 @@ bool kvm_is_zone_device_page(struct page *page)
 	return is_zone_device_page(page);
 }
 
+#ifdef CONFIG_PARAVIRT_SCHED_KVM
+typedef enum {
+	PVSCHED_CB_REGISTER = 1,
+	PVSCHED_CB_UNREGISTER = 2,
+	PVSCHED_CB_NOTIFY = 3
+} pvsched_vcpu_callback_t;
+
+/*
+ * Helper function to invoke the pvsched driver callback.
+ */
+static int __vcpu_pvsched_callback(struct kvm_vcpu *vcpu, u32 events,
+		pvsched_vcpu_callback_t action)
+{
+	int ret = 0;
+	struct pid *pid;
+	struct pvsched_vcpu_ops *ops;
+
+	rcu_read_lock();
+	ops = rcu_dereference(vcpu->kvm->pvsched_ops);
+	if (!ops) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	pid = rcu_dereference(vcpu->pid);
+	if (WARN_ON_ONCE(!pid)) {
+		ret = -EINVAL;
+		goto out;
+	}
+	get_pid(pid);
+	switch(action) {
+	case PVSCHED_CB_REGISTER:
+		ops->pvsched_vcpu_register(pid);
+		break;
+	case PVSCHED_CB_UNREGISTER:
+		ops->pvsched_vcpu_unregister(pid);
+		break;
+	case PVSCHED_CB_NOTIFY:
+		if (ops->events & events) {
+			ops->pvsched_vcpu_notify_event(
+				NULL, /* TODO: Pass guest allocated sharedmem addr */
+				pid,
+				ops->events & events);
+		}
+		break;
+	default:
+		WARN_ON_ONCE(1);
+	}
+	put_pid(pid);
+
+out:
+	rcu_read_unlock();
+	return ret;
+}
+
+int kvm_vcpu_pvsched_notify(struct kvm_vcpu *vcpu, u32 events)
+{
+	return __vcpu_pvsched_callback(vcpu, events, PVSCHED_CB_NOTIFY);
+}
+
+int kvm_vcpu_pvsched_register(struct kvm_vcpu *vcpu)
+{
+	return __vcpu_pvsched_callback(vcpu, 0, PVSCHED_CB_REGISTER);
+	/*
+	 * TODO: Action if the registration fails?
+	 */
+}
+
+void kvm_vcpu_pvsched_unregister(struct kvm_vcpu *vcpu)
+{
+	__vcpu_pvsched_callback(vcpu, 0, PVSCHED_CB_UNREGISTER);
+}
+
+/*
+ * Replaces the VM's current pvsched driver.
+ * if name is NULL or empty string, unassign the
+ * current driver.
+ */
+int kvm_replace_pvsched_ops(struct kvm *kvm, char *name)
+{
+	int ret = 0;
+	unsigned long i;
+	struct kvm_vcpu *vcpu = NULL;
+	struct pvsched_vcpu_ops *ops = NULL, *prev_ops;
+
+
+	spin_lock(&kvm->pvsched_ops_lock);
+
+	prev_ops = rcu_dereference(kvm->pvsched_ops);
+
+	/*
+	 * Unassign operation if the passed in value is
+	 * NULL or an empty string.
+	 */
+	if (name && *name) {
+		ops = pvsched_get_vcpu_ops(name);
+		if (!ops) {
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+
+	if (prev_ops) {
+		/*
+		 * Unregister current pvsched driver.
+		 */
+		kvm_for_each_vcpu(i, vcpu, kvm) {
+			kvm_vcpu_pvsched_unregister(vcpu);
+		}
+
+		pvsched_put_vcpu_ops(prev_ops);
+	}
+
+
+	rcu_assign_pointer(kvm->pvsched_ops, ops);
+	if (ops) {
+		/*
+		 * Register new pvsched driver.
+		 */
+		kvm_for_each_vcpu(i, vcpu, kvm) {
+			WARN_ON_ONCE(kvm_vcpu_pvsched_register(vcpu));
+		}
+	}
+
+out:
+	spin_unlock(&kvm->pvsched_ops_lock);
+
+	if (ret)
+		return ret;
+
+	synchronize_rcu();
+
+	return 0;
+}
+#endif
+
 /*
  * Returns a 'struct page' if the pfn is "valid" and backed by a refcounted
  * page, NULL otherwise. Note, the list of refcounted PG_reserved page types
@@ -508,6 +644,8 @@ static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
 	kvm_arch_vcpu_destroy(vcpu);
 	kvm_dirty_ring_free(&vcpu->dirty_ring);
 
+	kvm_vcpu_pvsched_unregister(vcpu);
+
 	/*
 	 * No need for rcu_read_lock as VCPU_RUN is the only place that changes
 	 * the vcpu->pid pointer, and at destruction time all file descriptors
@@ -1221,6 +1359,10 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
 
 	BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
 
+#ifdef CONFIG_PARAVIRT_SCHED_KVM
+	spin_lock_init(&kvm->pvsched_ops_lock);
+#endif
+
 	/*
 	 * Force subsequent debugfs file creations to fail if the VM directory
 	 * is not created (by kvm_create_vm_debugfs()).
@@ -1343,6 +1485,8 @@ static void kvm_destroy_vm(struct kvm *kvm)
 	int i;
 	struct mm_struct *mm = kvm->mm;
 
+	kvm_replace_pvsched_ops(kvm, NULL);
+
 	kvm_destroy_pm_notifier(kvm);
 	kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
 	kvm_destroy_vm_debugfs(kvm);
@@ -3779,6 +3923,8 @@ bool kvm_vcpu_block(struct kvm_vcpu *vcpu)
 		if (kvm_vcpu_check_block(vcpu) < 0)
 			break;
 
+		kvm_vcpu_pvsched_notify(vcpu, PVSCHED_VCPU_HALT);
+
 		waited = true;
 		schedule();
 	}
@@ -4434,6 +4580,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
 			/* The thread running this VCPU changed. */
 			struct pid *newpid;
 
+			kvm_vcpu_pvsched_unregister(vcpu);
 			r = kvm_arch_vcpu_run_pid_change(vcpu);
 			if (r)
 				break;
@@ -4442,6 +4589,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
 			rcu_assign_pointer(vcpu->pid, newpid);
 			if (oldpid)
 				synchronize_rcu();
+			kvm_vcpu_pvsched_register(vcpu);
 			put_pid(oldpid);
 		}
 		r = kvm_arch_vcpu_ioctl_run(vcpu);
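The use case named in the Kconfig help text, dynamically boosting a vCPU thread while the guest runs a latency-sensitive workload, would live in a driver's notify callback. Below is a hedged sketch built on the stock kernel APIs pid_task() and sched_setscheduler_nocheck(); the SCHED_FIFO policy, the priority values, and the choice to boost on VMENTER and deboost on HALT are illustrative assumptions, since this patch implements no boosting policy itself.

```c
/*
 * Illustrative notify handler for a pvsched driver: boost the vCPU
 * thread to an RT priority on VMENTER and restore SCHED_NORMAL on HALT.
 * Policy and priority values are assumptions for illustration only.
 */
#include <linux/pid.h>
#include <linux/pvsched.h>	/* assumed header from patch 1 of the series */
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <uapi/linux/sched/types.h>

static void demo_set_vcpu_prio(struct pid *pid, int policy, int prio)
{
        struct sched_param param = { .sched_priority = prio };
        struct task_struct *task;

        /* pid_task() must be called under RCU; pin the task across the call. */
        rcu_read_lock();
        task = pid_task(pid, PIDTYPE_PID);
        if (task)
                get_task_struct(task);
        rcu_read_unlock();

        if (!task)
                return;

        /* In-kernel variant that skips the permission checks. */
        sched_setscheduler_nocheck(task, policy, &param);
        put_task_struct(task);
}

static void demo_pvsched_vcpu_notify_event(void *addr, struct pid *pid,
                                           u32 events)
{
        if (events & PVSCHED_VCPU_VMENTER)
                demo_set_vcpu_prio(pid, SCHED_FIFO, 1);   /* illustrative boost */
        else if (events & PVSCHED_VCPU_HALT)
                demo_set_vcpu_prio(pid, SCHED_NORMAL, 0); /* drop the boost */
}
```

Note that kvm_replace_pvsched_ops() is only ever called in this patch from kvm_destroy_vm() with a NULL name, which detaches the current driver; exposing per-VM driver selection to the host is presumably left to a later patch in the series.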