diff mbox series

[v5,10/30] KVM: Add arch hooks when VM is added/deleted

Message ID aab342d576fe22b8f5b27e61d4fc635d45a4f866.1663869838.git.isaku.yamahata@intel.com (mailing list archive)
State New, archived
Headers show
Series KVM: hardware enable/disable reorganize | expand

Commit Message

Isaku Yamahata Sept. 22, 2022, 6:20 p.m. UTC
From: Isaku Yamahata <isaku.yamahata@intel.com>

and pass kvm_usage_count with kvm_lock.  Move kvm_arch_post_init_vm() under
kvm_arch_add_vm().  Replace enable/disable_hardware_all() with the default
implementation of kvm_arch_add/del_vm().  Later kvm_arch_post_init_vm() is
deleted once x86 overrides kvm_arch_add_vm().

Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
---
 include/linux/kvm_host.h |   2 +
 virt/kvm/kvm_main.c      | 121 ++++++++++++++++++++-------------------
 2 files changed, 65 insertions(+), 58 deletions(-)

Comments

Isaku Yamahata Oct. 4, 2022, 12:16 a.m. UTC | #1
On Thu, Sep 22, 2022 at 11:20:39AM -0700,
isaku.yamahata@intel.com wrote:

> From: Isaku Yamahata <isaku.yamahata@intel.com>
> 
> and pass kvm_usage_count with kvm_lock.  Move kvm_arch_post_init_vm() under
> kvm_arch_add_vm().  Replace enable/disable_hardware_all() with the default
> implementation of kvm_arch_add/del_vm().  Later kvm_arch_post_init_vm() is
> deleted once x86 overrides kvm_arch_add_vm().
> 
> Suggested-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> ---
>  include/linux/kvm_host.h |   2 +
>  virt/kvm/kvm_main.c      | 121 ++++++++++++++++++++-------------------
>  2 files changed, 65 insertions(+), 58 deletions(-)
> 
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index eab352902de7..3fbb01bbac98 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1445,6 +1445,8 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu);
>  bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu);
>  bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu);
>  int kvm_arch_post_init_vm(struct kvm *kvm);
> +int kvm_arch_add_vm(struct kvm *kvm, int usage_count);
> +int kvm_arch_del_vm(int usage_count);
>  void kvm_arch_pre_destroy_vm(struct kvm *kvm);
>  int kvm_arch_create_vm_debugfs(struct kvm *kvm);
>  
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index c4b908553726..e2c8823786ff 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -142,8 +142,9 @@ static int kvm_no_compat_open(struct inode *inode, struct file *file)
>  #define KVM_COMPAT(c)	.compat_ioctl	= kvm_no_compat_ioctl,	\
>  			.open		= kvm_no_compat_open
>  #endif
> -static int hardware_enable_all(void);
> -static void hardware_disable_all(void);
> +static void hardware_enable_nolock(void *junk);
> +static void hardware_disable_nolock(void *junk);
> +static void kvm_del_vm(void);
>  
>  static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
>  
> @@ -1106,6 +1107,41 @@ int __weak kvm_arch_post_init_vm(struct kvm *kvm)
>  	return 0;
>  }
>  
> +/*
> + * Called after the VM is otherwise initialized, but just before adding it to
> + * the vm_list.
> + */
> +int __weak kvm_arch_add_vm(struct kvm *kvm, int usage_count)
> +{
> +	int r = 0;
> +
> +	if (usage_count != 1)
> +		return 0;

Oops. This line should be.
+               return kvm_arch_post_init_vm(kvm);

"KVM: x86: Duplicate arch callbacks related to pm events and compat check"
should have same line. "KVM: Eliminate kvm_arch_post_init_vm()" should include
kvm_mmu_post_init_vm().
It creates a kernel thread. kvm unit test didn't catch it.

Thanks,

> +
> +	atomic_set(&hardware_enable_failed, 0);
> +	on_each_cpu(hardware_enable_nolock, NULL, 1);
> +
> +	if (atomic_read(&hardware_enable_failed)) {
> +		r = -EBUSY;
> +		goto err;
> +	}
> +
> +	r = kvm_arch_post_init_vm(kvm);
> +err:
> +	if (r)
> +		on_each_cpu(hardware_disable_nolock, NULL, 1);
> +	return r;
> +}
> +
> +int __weak kvm_arch_del_vm(int usage_count)
> +{
> +	if (usage_count)
> +		return 0;
> +
> +	on_each_cpu(hardware_disable_nolock, NULL, 1);
> +	return 0;
> +}
> +
>  /*
>   * Called just after removing the VM from the vm_list, but before doing any
>   * other destruction.
Sean Christopherson Oct. 12, 2022, 8:43 p.m. UTC | #2
On Thu, Sep 22, 2022, isaku.yamahata@intel.com wrote:
> From: Isaku Yamahata <isaku.yamahata@intel.com>
> 
> and pass kvm_usage_count with kvm_lock.  Move kvm_arch_post_init_vm() under
> kvm_arch_add_vm().  Replace enable/disable_hardware_all() with the default
> implementation of kvm_arch_add/del_vm().  Later kvm_arch_post_init_vm() is
> deleted once x86 overrides kvm_arch_add_vm().

This needs to explain _why_ KVM is pivoting to add/remove hooks.

> Suggested-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> ---
>  include/linux/kvm_host.h |   2 +
>  virt/kvm/kvm_main.c      | 121 ++++++++++++++++++++-------------------
>  2 files changed, 65 insertions(+), 58 deletions(-)
> 
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index eab352902de7..3fbb01bbac98 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1445,6 +1445,8 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu);
>  bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu);
>  bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu);
>  int kvm_arch_post_init_vm(struct kvm *kvm);
> +int kvm_arch_add_vm(struct kvm *kvm, int usage_count);
> +int kvm_arch_del_vm(int usage_count);
>  void kvm_arch_pre_destroy_vm(struct kvm *kvm);
>  int kvm_arch_create_vm_debugfs(struct kvm *kvm);
>  
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index c4b908553726..e2c8823786ff 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -142,8 +142,9 @@ static int kvm_no_compat_open(struct inode *inode, struct file *file)
>  #define KVM_COMPAT(c)	.compat_ioctl	= kvm_no_compat_ioctl,	\
>  			.open		= kvm_no_compat_open
>  #endif
> -static int hardware_enable_all(void);
> -static void hardware_disable_all(void);
> +static void hardware_enable_nolock(void *junk);
> +static void hardware_disable_nolock(void *junk);
> +static void kvm_del_vm(void);

I think kvm_remove_vm() will be less confusing as "remove" is almost never used
to describe freeing something, whereas "delete" is somewhat interchangeable with
"free.  I.e. make it more obvious that the hook isn't intented to actually
delete/free a VM, rather it's there to remove/delete a VM from global tracking.

Ah, and this is especially true since the VM needs to be deleted from vm_list
before the is destroyed, but hardware needs to stay enabled until the VM is fully
destroyed.

Hmm, actually, I think even better would be to have kvm_remove_vm() and kvm_drop_vm()
to make it more obvious that there isn't 100% symmetry between "add" and "remove".

E.g. rename kvm_arch_pre_destroy_vm() => kvm_arch_remove_vm() and then we end up
with (see comments below for more details):

static int kvm_add_vm(struct kvm *kvm)
{
	/*
	 * During onlining a CPU, cpu_online_mask is set before kvm_online_cpu()
	 * is called, i.e. on_each_cpu() includes CPUs that have not yet been
	 * onlined by KVM.  Disable CPU hotplug to prevent enabling hardware on
	 * a CPU that hasn't yet done compatibility checks.
	 */
	cpus_read_lock();
	mutex_lock(&kvm_lock);
	r = kvm_arch_add_vm(kvm, ++kvm_usage_count);
	if (r) {
		--kvm_usage_count;
		goto out;
	}
	
	list_add(&kvm->vm_list, &vm_list);

out:
	mutex_unlock(&kvm_lock);
	cpus_read_unlock();
}

static void kvm_remove_vm(struct kvm *kvm)
{
	mutex_lock(&kvm_lock);
	list_del(&kvm->vm_list);
	mutex_unlock(&kvm_lock);
	kvm_arch_remove_vm(kvm);
}

static void kvm_drop_vm(void)
{
	cpus_read_lock();
	mutex_lock(&kvm_lock);
	WARN_ON_ONCE(!kvm_usage_count);
	kvm_usage_count--;
	kvm_arch_drop_vm(kvm_usage_count);
	mutex_unlock(&kvm_lock);
	cpus_read_unlock();
}

> @@ -1223,13 +1255,28 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
>  	if (r)
>  		goto out_err_no_debugfs;
>  
> -	r = kvm_arch_post_init_vm(kvm);
> -	if (r)
> -		goto out_err;
> -
> +	/*
> +	 * During onlining a CPU, cpu_online_mask is set before kvm_online_cpu()
> +	 * is called. on_each_cpu() between them includes the CPU. As a result,
> +	 * hardware_enable_nolock() may get invoked before kvm_online_cpu().
> +	 * This would enable hardware virtualization on that cpu without
> +	 * compatibility checks, which can potentially crash system or break
> +	 * running VMs.
> +	 *
> +	 * Disable CPU hotplug to prevent this case from happening.
> +	 */
> +	cpus_read_lock();
>  	mutex_lock(&kvm_lock);
> +	kvm_usage_count++;
> +	r = kvm_arch_add_vm(kvm, kvm_usage_count);
> +	if (r) {
> +		/* the following kvm_del_vm() decrements kvm_usage_count. */

This is buggy on two fronts.  kvm_usage_count needs to be protected with kvm_lock,
and AFAICT cpus_read_unlock() isn't called.

Invoking kvm_del_vm() in the error path is the source of both bugs.  Typically,
the paired "undo" of an operation should only be used if the "do" operation was
fully successful.

As above, move this to a helper to make juggling the locks less painful.
diff mbox series

Patch

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index eab352902de7..3fbb01bbac98 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1445,6 +1445,8 @@  int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu);
 bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu);
 bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu);
 int kvm_arch_post_init_vm(struct kvm *kvm);
+int kvm_arch_add_vm(struct kvm *kvm, int usage_count);
+int kvm_arch_del_vm(int usage_count);
 void kvm_arch_pre_destroy_vm(struct kvm *kvm);
 int kvm_arch_create_vm_debugfs(struct kvm *kvm);
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index c4b908553726..e2c8823786ff 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -142,8 +142,9 @@  static int kvm_no_compat_open(struct inode *inode, struct file *file)
 #define KVM_COMPAT(c)	.compat_ioctl	= kvm_no_compat_ioctl,	\
 			.open		= kvm_no_compat_open
 #endif
-static int hardware_enable_all(void);
-static void hardware_disable_all(void);
+static void hardware_enable_nolock(void *junk);
+static void hardware_disable_nolock(void *junk);
+static void kvm_del_vm(void);
 
 static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
 
@@ -1106,6 +1107,41 @@  int __weak kvm_arch_post_init_vm(struct kvm *kvm)
 	return 0;
 }
 
+/*
+ * Called after the VM is otherwise initialized, but just before adding it to
+ * the vm_list.
+ */
+int __weak kvm_arch_add_vm(struct kvm *kvm, int usage_count)
+{
+	int r = 0;
+
+	if (usage_count != 1)
+		return 0;
+
+	atomic_set(&hardware_enable_failed, 0);
+	on_each_cpu(hardware_enable_nolock, NULL, 1);
+
+	if (atomic_read(&hardware_enable_failed)) {
+		r = -EBUSY;
+		goto err;
+	}
+
+	r = kvm_arch_post_init_vm(kvm);
+err:
+	if (r)
+		on_each_cpu(hardware_disable_nolock, NULL, 1);
+	return r;
+}
+
+int __weak kvm_arch_del_vm(int usage_count)
+{
+	if (usage_count)
+		return 0;
+
+	on_each_cpu(hardware_disable_nolock, NULL, 1);
+	return 0;
+}
+
 /*
  * Called just after removing the VM from the vm_list, but before doing any
  * other destruction.
@@ -1203,10 +1239,6 @@  static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
 	if (r)
 		goto out_err_no_arch_destroy_vm;
 
-	r = hardware_enable_all();
-	if (r)
-		goto out_err_no_disable;
-
 #ifdef CONFIG_HAVE_KVM_IRQFD
 	INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
 #endif
@@ -1223,13 +1255,28 @@  static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
 	if (r)
 		goto out_err_no_debugfs;
 
-	r = kvm_arch_post_init_vm(kvm);
-	if (r)
-		goto out_err;
-
+	/*
+	 * During onlining a CPU, cpu_online_mask is set before kvm_online_cpu()
+	 * is called. on_each_cpu() between them includes the CPU. As a result,
+	 * hardware_enable_nolock() may get invoked before kvm_online_cpu().
+	 * This would enable hardware virtualization on that cpu without
+	 * compatibility checks, which can potentially crash system or break
+	 * running VMs.
+	 *
+	 * Disable CPU hotplug to prevent this case from happening.
+	 */
+	cpus_read_lock();
 	mutex_lock(&kvm_lock);
+	kvm_usage_count++;
+	r = kvm_arch_add_vm(kvm, kvm_usage_count);
+	if (r) {
+		/* the following kvm_del_vm() decrements kvm_usage_count. */
+		mutex_unlock(&kvm_lock);
+		goto out_err;
+	}
 	list_add(&kvm->vm_list, &vm_list);
 	mutex_unlock(&kvm_lock);
+	cpus_read_unlock();
 
 	preempt_notifier_inc();
 	kvm_init_pm_notifier(kvm);
@@ -1246,8 +1293,7 @@  static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
 		mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
 #endif
 out_err_no_mmu_notifier:
-	hardware_disable_all();
-out_err_no_disable:
+	kvm_del_vm();
 	kvm_arch_destroy_vm(kvm);
 out_err_no_arch_destroy_vm:
 	WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
@@ -1326,7 +1372,7 @@  static void kvm_destroy_vm(struct kvm *kvm)
 	cleanup_srcu_struct(&kvm->srcu);
 	kvm_arch_free_vm(kvm);
 	preempt_notifier_dec();
-	hardware_disable_all();
+	kvm_del_vm();
 	mmdrop(mm);
 	module_put(kvm_chardev_ops.owner);
 }
@@ -5075,56 +5121,15 @@  static int kvm_offline_cpu(unsigned int cpu)
 	return 0;
 }
 
-static void hardware_disable_all_nolock(void)
-{
-	BUG_ON(!kvm_usage_count);
-
-	kvm_usage_count--;
-	if (!kvm_usage_count)
-		on_each_cpu(hardware_disable_nolock, NULL, 1);
-}
-
-static void hardware_disable_all(void)
+static void kvm_del_vm(void)
 {
 	cpus_read_lock();
 	mutex_lock(&kvm_lock);
-	hardware_disable_all_nolock();
-	mutex_unlock(&kvm_lock);
-	cpus_read_unlock();
-}
-
-static int hardware_enable_all(void)
-{
-	int r = 0;
-
-	/*
-	 * During onlining a CPU, cpu_online_mask is set before kvm_online_cpu()
-	 * is called. on_each_cpu() between them includes the CPU. As a result,
-	 * hardware_enable_nolock() may get invoked before kvm_online_cpu().
-	 * This would enable hardware virtualization on that cpu without
-	 * compatibility checks, which can potentially crash system or break
-	 * running VMs.
-	 *
-	 * Disable CPU hotplug to prevent this case from happening.
-	 */
-	cpus_read_lock();
-	mutex_lock(&kvm_lock);
-
-	kvm_usage_count++;
-	if (kvm_usage_count == 1) {
-		atomic_set(&hardware_enable_failed, 0);
-		on_each_cpu(hardware_enable_nolock, NULL, 1);
-
-		if (atomic_read(&hardware_enable_failed)) {
-			hardware_disable_all_nolock();
-			r = -EBUSY;
-		}
-	}
-
+	WARN_ON_ONCE(!kvm_usage_count);
+	kvm_usage_count--;
+	kvm_arch_del_vm(kvm_usage_count);
 	mutex_unlock(&kvm_lock);
 	cpus_read_unlock();
-
-	return r;
 }
 
 static int kvm_reboot(struct notifier_block *notifier, unsigned long val,