[v2] KVM: Mark a vCPU as preempted/ready iff it's scheduled out while running

Message ID 20240307163541.92138-1-dmatlack@google.com (mailing list archive)
State New, archived

Commit Message

David Matlack March 7, 2024, 4:35 p.m. UTC
Mark a vCPU as preempted/ready if-and-only-if it's scheduled out while
running, i.e. do not mark a vCPU preempted/ready if it's scheduled out
during a non-KVM_RUN ioctl() or when userspace is doing KVM_RUN with
immediate_exit.

Commit 54aa83c90198 ("KVM: x86: do not set st->preempted when going back
to user space") stopped marking a vCPU as preempted when returning to
userspace, but if userspace then invokes a KVM vCPU ioctl() that gets
preempted, the vCPU will be marked preempted/ready. This is arguably
incorrect behavior, since the vCPU was not actually preempted while the
guest was running; it was preempted while doing something on behalf of
userspace.

This commit also avoids KVM dirtying guest memory after userspace has
paused vCPUs, e.g. for Live Migration, which allows userspace to collect
the final dirty bitmap before or in parallel with saving vCPU state
without having to worry about saving vCPU state triggering writes to
guest memory.

Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: David Matlack <dmatlack@google.com>
---
v2:
 - Drop Google-specific "PRODKERNEL: " shortlog prefix

v1: https://lore.kernel.org/kvm/20231218185850.1659570-1-dmatlack@google.com/

 include/linux/kvm_host.h | 1 +
 virt/kvm/kvm_main.c      | 5 ++++-
 2 files changed, 5 insertions(+), 1 deletion(-)


base-commit: 687d8f4c3dea0758afd748968d91288220bbe7e3
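
For context, the live-migration flow the commit message describes can be
sketched from userspace roughly as follows, assuming every vCPU thread has
already been forced out of KVM_RUN. The fds, slot number, bitmap, and x86's
struct kvm_regs are illustrative assumptions; VM setup and error reporting
are omitted:

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* With no vCPU inside KVM_RUN, this patch guarantees that saving vCPU
 * state cannot dirty guest memory via the preempted/ready flags, so the
 * bitmap collected here is final with respect to vCPU-triggered writes. */
static int collect_final_state(int vm_fd, int vcpu_fd, uint32_t slot,
			       void *bitmap, struct kvm_regs *regs)
{
	struct kvm_dirty_log log = {
		.slot = slot,
		.dirty_bitmap = bitmap,
	};

	if (ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log))
		return -1;

	/* Can run in parallel with, or after, the bitmap collection. */
	return ioctl(vcpu_fd, KVM_GET_REGS, regs);
}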

Comments

David Matlack April 2, 2024, 4:41 p.m. UTC | #1
On Thu, Mar 7, 2024 at 8:35 AM David Matlack <dmatlack@google.com> wrote:
>
> Mark a vCPU as preempted/ready if-and-only-if it's scheduled out while
> running, i.e. do not mark a vCPU preempted/ready if it's scheduled out
> during a non-KVM_RUN ioctl() or when userspace is doing KVM_RUN with
> immediate_exit.
>
> Commit 54aa83c90198 ("KVM: x86: do not set st->preempted when going back
> to user space") stopped marking a vCPU as preempted when returning to
> userspace, but if userspace then invokes a KVM vCPU ioctl() that gets
> preempted, the vCPU will be marked preempted/ready. This is arguably
> incorrect behavior, since the vCPU was not actually preempted while the
> guest was running; it was preempted while doing something on behalf of
> userspace.
>
> This commit also avoids KVM dirtying guest memory after userspace has
> paused vCPUs, e.g. for Live Migration, which allows userspace to collect
> the final dirty bitmap before or in parallel with saving vCPU state
> without having to worry about saving vCPU state triggering writes to
> guest memory.
>
> Suggested-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: David Matlack <dmatlack@google.com>

Gentle ping.
Sean Christopherson April 26, 2024, 9:01 p.m. UTC | #2
On Thu, Mar 07, 2024, David Matlack wrote:
> Mark a vCPU as preempted/ready if-and-only-if it's scheduled out while
> running, i.e. do not mark a vCPU preempted/ready if it's scheduled out
> during a non-KVM_RUN ioctl() or when userspace is doing KVM_RUN with
> immediate_exit.
>
> Commit 54aa83c90198 ("KVM: x86: do not set st->preempted when going back
> to user space") stopped marking a vCPU as preempted when returning to
> userspace, but if userspace then invokes a KVM vCPU ioctl() that gets
> preempted, the vCPU will be marked preempted/ready. This is arguably
> incorrect behavior, since the vCPU was not actually preempted while the
> guest was running; it was preempted while doing something on behalf of
> userspace.
> 
> This commit also avoids KVM dirtying guest memory after userspace has
> paused vCPUs, e.g. for Live Migration, which allows userspace to collect
> the final dirty bitmap before or in parallel with saving vCPU state
> without having to worry about saving vCPU state triggering writes to
> guest memory.
> 
> Suggested-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: David Matlack <dmatlack@google.com>
> ---
> v2:
>  - Drop Google-specific "PRODKERNEL: " shortlog prefix
> 
> v1: https://lore.kernel.org/kvm/20231218185850.1659570-1-dmatlack@google.com/
> 
>  include/linux/kvm_host.h | 1 +
>  virt/kvm/kvm_main.c      | 5 ++++-
>  2 files changed, 5 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 7e7fd25b09b3..5b2300614d22 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -378,6 +378,7 @@ struct kvm_vcpu {
>  		bool dy_eligible;
>  	} spin_loop;
>  #endif
> +	bool wants_to_run;
>  	bool preempted;
>  	bool ready;
>  	struct kvm_vcpu_arch arch;
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index ff588677beb7..3da1b2e3785d 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -4438,7 +4438,10 @@ static long kvm_vcpu_ioctl(struct file *filp,
>  				synchronize_rcu();
>  			put_pid(oldpid);
>  		}
> +		vcpu->wants_to_run = !vcpu->run->immediate_exit;
>  		r = kvm_arch_vcpu_ioctl_run(vcpu);
> +		vcpu->wants_to_run = false;
> +
>  		trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
>  		break;
>  	}
> @@ -6312,7 +6315,7 @@ static void kvm_sched_out(struct preempt_notifier *pn,
>  {
>  	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
>  
> -	if (current->on_rq) {
> +	if (current->on_rq && vcpu->wants_to_run) {
>  		WRITE_ONCE(vcpu->preempted, true);
>  		WRITE_ONCE(vcpu->ready, true);
>  	}
> 
> base-commit: 687d8f4c3dea0758afd748968d91288220bbe7e3

Long story short, I was playing around with wants_to_run for a few harebrained
ideas, and realized that there's a TOCTOU bug here.  Userspace can toggle
run->immediate_exit at will, e.g. can clear it after the kernel loads it to
compute vcpu->wants_to_run.

That's not fatal for this use case, since userspace would only be shooting itself
in the foot, but it leaves a very dangerous landmine, e.g. if something else in
KVM keys off of vcpu->wants_to_run, i.e. relies on wants_to_run being set
whenever KVM is in its core run loop.
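
To make the race concrete, a rough two-thread userspace sketch (the mmap()ed
kvm_run mapping and vCPU fd setup are omitted; names are illustrative):

#include <pthread.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static struct kvm_run *run;	/* mmap()ed kvm_run, setup omitted */
static int vcpu_fd;

static void *vcpu_thread(void *arg)
{
	run->immediate_exit = 1;
	/* kvm_vcpu_ioctl() loads immediate_exit (1) here and computes
	 * vcpu->wants_to_run = false. */
	ioctl(vcpu_fd, KVM_RUN, 0);
	return NULL;
}

static void *racing_thread(void *arg)
{
	/* If this store lands after kvm_vcpu_ioctl()'s load but before
	 * kvm_arch_vcpu_ioctl_run() re-reads immediate_exit, the arch
	 * code sees 0 and enters the guest with wants_to_run == false. */
	run->immediate_exit = 0;
	return NULL;
}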

To address that, I think we should have all architectures check wants_to_run, not
immediate_exit.  And loading immediate_exit needs to be done with READ_ONCE().

E.g. for x86 (every other arch has similar code)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e9ef1fa4b90b..1a2f6bf14fb2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -11396,7 +11396,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 
        kvm_vcpu_srcu_read_lock(vcpu);
        if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
-               if (kvm_run->immediate_exit) {
+               if (!vcpu->wants_to_run) {
                        r = -EINTR;
                        goto out;
                }
@@ -11474,7 +11474,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
                WARN_ON_ONCE(vcpu->mmio_needed);
        }
 
-       if (kvm_run->immediate_exit) {
+       if (!vcpu->wants_to_run) {
                r = -EINTR;
                goto out;
        }
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index f9b9ce0c3cd9..0c0aae224000 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1497,9 +1497,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
                                        struct kvm_guest_debug *dbg);
 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu);
 
-void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu);
-
-void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu, bool sched_in);
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu);
 int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id);
 int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 9501fbd5dfd2..4384bbdba65c 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4410,7 +4410,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
                                synchronize_rcu();
                        put_pid(oldpid);
                }
-               vcpu->wants_to_run = !vcpu->run->immediate_exit;
+               vcpu->wants_to_run = !READ_ONCE(vcpu->run->immediate_exit);
                r = kvm_arch_vcpu_ioctl_run(vcpu);
                vcpu->wants_to_run = false;


---

Hmm, and we should probably go a step further and actively prevent using
immediate_exit from the kernel, e.g. rename it to something scary like:

diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 2190adbe3002..9c5fe1dae744 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -196,7 +196,11 @@ struct kvm_xen_exit {
 struct kvm_run {
        /* in */
        __u8 request_interrupt_window;
+#ifndef __KERNEL__
        __u8 immediate_exit;
+#else
+       __u8 hidden_do_not_touch;
+#endif
        __u8 padding1[6];
 
        /* out */
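
Note that the rename wouldn't change the userspace contract: per
Documentation/virt/kvm/api.rst, immediate_exit is polled once when KVM_RUN
starts and, if non-zero, makes KVM_RUN return -EINTR without entering the
guest. A minimal sketch of that documented usage, with fd and kvm_run setup
omitted:

#include <errno.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* immediate_exit is typically set from a signal handler so a vCPU thread
 * can be kicked without losing the signal. */
static int run_once_immediate_exit(int vcpu_fd, struct kvm_run *run)
{
	run->immediate_exit = 1;
	if (ioctl(vcpu_fd, KVM_RUN, 0) < 0 && errno == EINTR)
		return 0;	/* KVM_RUN exited without entering the guest */
	return -1;
}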

Patch

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 7e7fd25b09b3..5b2300614d22 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -378,6 +378,7 @@ struct kvm_vcpu {
 		bool dy_eligible;
 	} spin_loop;
 #endif
+	bool wants_to_run;
 	bool preempted;
 	bool ready;
 	struct kvm_vcpu_arch arch;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index ff588677beb7..3da1b2e3785d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4438,7 +4438,10 @@ static long kvm_vcpu_ioctl(struct file *filp,
 				synchronize_rcu();
 			put_pid(oldpid);
 		}
+		vcpu->wants_to_run = !vcpu->run->immediate_exit;
 		r = kvm_arch_vcpu_ioctl_run(vcpu);
+		vcpu->wants_to_run = false;
+
 		trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
 		break;
 	}
@@ -6312,7 +6315,7 @@ static void kvm_sched_out(struct preempt_notifier *pn,
 {
 	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
 
-	if (current->on_rq) {
+	if (current->on_rq && vcpu->wants_to_run) {
 		WRITE_ONCE(vcpu->preempted, true);
 		WRITE_ONCE(vcpu->ready, true);
 	}