
KVM: x86/mmu: Ensure NX huge page recovery thread is alive before waking

Message ID 20250124234623.3609069-1-seanjc@google.com (mailing list archive)
State New
Series KVM: x86/mmu: Ensure NX huge page recovery thread is alive before waking

Commit Message

Sean Christopherson Jan. 24, 2025, 11:46 p.m. UTC
When waking a VM's NX huge page recovery thread, ensure the thread is
actually alive before trying to wake it.  Now that the thread is spawned
on-demand during KVM_RUN, a VM without a recovery thread is reachable via
the related module params.

  BUG: kernel NULL pointer dereference, address: 0000000000000040
  #PF: supervisor read access in kernel mode
  #PF: error_code(0x0000) - not-present page
  Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
  RIP: 0010:vhost_task_wake+0x5/0x10
  Call Trace:
   <TASK>
   set_nx_huge_pages+0xcc/0x1e0 [kvm]
   param_attr_store+0x8a/0xd0
   module_attr_store+0x1a/0x30
   kernfs_fop_write_iter+0x12f/0x1e0
   vfs_write+0x233/0x3e0
   ksys_write+0x60/0xd0
   do_syscall_64+0x5b/0x160
   entry_SYSCALL_64_after_hwframe+0x4b/0x53
  RIP: 0033:0x7f3b52710104
   </TASK>
  Modules linked in: kvm_intel kvm
  CR2: 0000000000000040

Fixes: 931656b9e2ff ("kvm: defer huge page recovery vhost task to later")
Cc: stable@vger.kernel.org
Cc: Keith Busch <kbusch@kernel.org>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/mmu/mmu.c | 33 ++++++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 7 deletions(-)


base-commit: f7bafceba76e9ab475b413578c1757ee18c3e44b

Comments

Sean Christopherson Jan. 25, 2025, 12:50 a.m. UTC | #1
On Fri, Jan 24, 2025, Sean Christopherson wrote:
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index a45ae60e84ab..74c20dbb92da 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -7120,6 +7120,19 @@ static void mmu_destroy_caches(void)
>  	kmem_cache_destroy(mmu_page_header_cache);
>  }
>  
> +static void kvm_wake_nx_recovery_thread(struct kvm *kvm)
> +{
> +	/*
> +	 * The NX recovery thread is spawned on-demand at the first KVM_RUN and
> +	 * may not be valid even though the VM is globally visible.  Do nothing,
> +	 * as such a VM can't have any possible NX huge pages.
> +	 */
> +	struct vhost_task *nx_thread = READ_ONCE(kvm->arch.nx_huge_page_recovery_thread);
> +
> +	if (nx_thread)
> +		vhost_task_wake(nx_thread);

As mentioned in the original thread[*], I belatedly realized there's a race with
this approach.  If vhost_task_start() completes and kvm_nx_huge_page_recovery_worker()
runs before a parameter change, but the parameter change runs before the WRITE_ONCE(),
then the worker will run with stale params and could end up sleeping for far longer
than userspace wants.

I assume we could address that by taking kvm->arch.nx_once.mutex in this helper
instead of using the lockless approach.  I don't think that would lead to any
deadlocks?

[*] https://lore.kernel.org/all/Z5QsBXJ7rkJFDtmK@google.com
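[For illustration, a minimal sketch of the mutex-based alternative floated above, assuming the once's mutex is the "lock" member of struct once (per the call_once.h hunk later in the thread); this is not the approach that was ultimately applied:

	static void kvm_wake_nx_recovery_thread(struct kvm *kvm)
	{
		struct vhost_task *nx_thread;

		/*
		 * Serialize against kvm_mmu_start_lpage_recovery() so that a
		 * racing module param update either observes the fully started
		 * thread, or completes before the worker starts and thus before
		 * the worker can consume stale params.
		 */
		mutex_lock(&kvm->arch.nx_once.lock);
		nx_thread = kvm->arch.nx_huge_page_recovery_thread;
		if (nx_thread)
			vhost_task_wake(nx_thread);
		mutex_unlock(&kvm->arch.nx_once.lock);
	}
]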
Keith Busch Jan. 25, 2025, 4:11 a.m. UTC | #2
On Fri, Jan 24, 2025 at 03:46:23PM -0800, Sean Christopherson wrote:
> When waking a VM's NX huge page recovery thread, ensure the thread is
> actually alive before trying to wake it.  Now that the thread is spawned
> on-demand during KVM_RUN, a VM without a recovery thread is reachable via
> the related module params.

Oh, this is what I thought we could do. I should have read ahead. :)

> +static void kvm_wake_nx_recovery_thread(struct kvm *kvm)
> +{
> +	/*
> +	 * The NX recovery thread is spawned on-demand at the first KVM_RUN and
> +	 * may not be valid even though the VM is globally visible.  Do nothing,
> +	 * as such a VM can't have any possible NX huge pages.
> +	 */
> +	struct vhost_task *nx_thread = READ_ONCE(kvm->arch.nx_huge_page_recovery_thread);
> +
> +	if (nx_thread)
> +		vhost_task_wake(nx_thread);
> +}
> +
>  static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp)
>  {
>  	if (nx_hugepage_mitigation_hard_disabled)
> @@ -7180,7 +7193,7 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
>  			kvm_mmu_zap_all_fast(kvm);
>  			mutex_unlock(&kvm->slots_lock);
>  
> -			vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread);
> +			kvm_wake_nx_recovery_thread(kvm);
>  		}
>  		mutex_unlock(&kvm_lock);
>  	}
> @@ -7315,7 +7328,7 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel
>  		mutex_lock(&kvm_lock);
>  
>  		list_for_each_entry(kvm, &vm_list, vm_list)
> -			vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread);
> +			kvm_wake_nx_recovery_thread(kvm);
>  
>  		mutex_unlock(&kvm_lock);
>  	}
> @@ -7451,14 +7464,20 @@ static void kvm_mmu_start_lpage_recovery(struct once *once)
>  {
>  	struct kvm_arch *ka = container_of(once, struct kvm_arch, nx_once);
>  	struct kvm *kvm = container_of(ka, struct kvm, arch);
> +	struct vhost_task *nx_thread;
>  
>  	kvm->arch.nx_huge_page_last = get_jiffies_64();
> -	kvm->arch.nx_huge_page_recovery_thread = vhost_task_create(
> -		kvm_nx_huge_page_recovery_worker, kvm_nx_huge_page_recovery_worker_kill,
> -		kvm, "kvm-nx-lpage-recovery");
> +	nx_thread = vhost_task_create(kvm_nx_huge_page_recovery_worker,
> +				      kvm_nx_huge_page_recovery_worker_kill,
> +				      kvm, "kvm-nx-lpage-recovery");
>  
> -	if (kvm->arch.nx_huge_page_recovery_thread)
> -		vhost_task_start(kvm->arch.nx_huge_page_recovery_thread);
> +	if (!nx_thread)
> +		return;
> +
> +	vhost_task_start(nx_thread);
> +
> +	/* Make the task visible only once it is fully started. */
> +	WRITE_ONCE(kvm->arch.nx_huge_page_recovery_thread, nx_thread);

I believe the WRITE_ONCE needs to happen before the vhost_task_start to
ensure the parameter update callback can see it before it's started.
Sean Christopherson Jan. 27, 2025, 4:48 p.m. UTC | #3
On Fri, Jan 24, 2025, Keith Busch wrote:
> On Fri, Jan 24, 2025 at 03:46:23PM -0800, Sean Christopherson wrote:
> > +static void kvm_wake_nx_recovery_thread(struct kvm *kvm)
> > +{
> > +	/*
> > +	 * The NX recovery thread is spawned on-demand at the first KVM_RUN and
> > +	 * may not be valid even though the VM is globally visible.  Do nothing,
> > +	 * as such a VM can't have any possible NX huge pages.
> > +	 */
> > +	struct vhost_task *nx_thread = READ_ONCE(kvm->arch.nx_huge_page_recovery_thread);
> > +
> > +	if (nx_thread)
> > +		vhost_task_wake(nx_thread);
> > +}

...

> > +	nx_thread = vhost_task_create(kvm_nx_huge_page_recovery_worker,
> > +				      kvm_nx_huge_page_recovery_worker_kill,
> > +				      kvm, "kvm-nx-lpage-recovery");
> >  
> > -	if (kvm->arch.nx_huge_page_recovery_thread)
> > -		vhost_task_start(kvm->arch.nx_huge_page_recovery_thread);
> > +	if (!nx_thread)
> > +		return;
> > +
> > +	vhost_task_start(nx_thread);
> > +
> > +	/* Make the task visible only once it is fully started. */
> > +	WRITE_ONCE(kvm->arch.nx_huge_page_recovery_thread, nx_thread);
> 
> I believe the WRITE_ONCE needs to happen before the vhost_task_start to
> ensure the parameter update callback can see it before it's started.

It's not clear to me that calling vhost_task_wake() before vhost_task_start() is
allowed, which is why I deliberately waited until the task was started to make it
visible.  Though FWIW, doing "vhost_task_wake(nx_thread)" before vhost_task_start()
doesn't explode.

Ha!  There is another bug here, but we can smack 'em both with a bit of trickery
and do an optimized serialization in the process.

If vhost_task_create() fails, then the call_once() will "succeed" and mark the
structure as ONCE_COMPLETED.  The first KVM_RUN will fail with -ENOMEM, but any
subsequent calls will succeed, including in-flight KVM_RUNs on other threads.
Odds are good userspace will terminate the VM on -ENOMEM, but that's not guaranteed,
e.g. if userspace has logic to retry a few times before giving up.

If call_once() and its callback are modified to return errors, then we can abuse
call_once() to serialize against kvm_mmu_start_lpage_recovery() when waking the
recovery thread.  If the recovery thread is fully created, call_once() is a lockless
happy path, otherwise the wakeup path will serialize against the creation path
via the once's mutex.

Over two patches...

---
 arch/x86/kvm/mmu/mmu.c    | 46 ++++++++++++++++++++++++++++-----------
 include/linux/call_once.h | 16 ++++++++++----
 2 files changed, 45 insertions(+), 17 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index a45ae60e84ab..f3ad33cd68b3 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -7120,6 +7120,26 @@ static void mmu_destroy_caches(void)
 	kmem_cache_destroy(mmu_page_header_cache);
 }
 
+static int kvm_nx_recovery_thread_not_ready(struct once *once)
+{
+	return -ENOENT;
+}
+
+static void kvm_wake_nx_recovery_thread(struct kvm *kvm)
+{
+	/*
+	 * The NX recovery thread is spawned on-demand at the first KVM_RUN and
+	 * may not be started even though the VM is globally visible.  Abuse
+	 * call_once() to serialize against starting the recovery thread; if
+	 * this task's callback is invoked, then the thread hasn't been created
+	 * and the thread is guaranteed to see up-to-date parameters.
+	 */
+	if (call_once(&kvm->arch.nx_once, kvm_nx_recovery_thread_not_ready))
+		return;
+
+	vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread);
+}
+
 static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp)
 {
 	if (nx_hugepage_mitigation_hard_disabled)
@@ -7180,7 +7200,7 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
 			kvm_mmu_zap_all_fast(kvm);
 			mutex_unlock(&kvm->slots_lock);
 
-			vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread);
+			kvm_wake_nx_recovery_thread(kvm);
 		}
 		mutex_unlock(&kvm_lock);
 	}
@@ -7315,7 +7335,7 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel
 		mutex_lock(&kvm_lock);
 
 		list_for_each_entry(kvm, &vm_list, vm_list)
-			vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread);
+			kvm_wake_nx_recovery_thread(kvm);
 
 		mutex_unlock(&kvm_lock);
 	}
@@ -7447,7 +7467,7 @@ static bool kvm_nx_huge_page_recovery_worker(void *data)
 	return true;
 }
 
-static void kvm_mmu_start_lpage_recovery(struct once *once)
+static int kvm_mmu_start_lpage_recovery(struct once *once)
 {
 	struct kvm_arch *ka = container_of(once, struct kvm_arch, nx_once);
 	struct kvm *kvm = container_of(ka, struct kvm, arch);
@@ -7457,21 +7477,21 @@ static void kvm_mmu_start_lpage_recovery(struct once *once)
 		kvm_nx_huge_page_recovery_worker, kvm_nx_huge_page_recovery_worker_kill,
 		kvm, "kvm-nx-lpage-recovery");
 
-	if (kvm->arch.nx_huge_page_recovery_thread)
-		vhost_task_start(kvm->arch.nx_huge_page_recovery_thread);
-}
-
-int kvm_mmu_post_init_vm(struct kvm *kvm)
-{
-	if (nx_hugepage_mitigation_hard_disabled)
-		return 0;
-
-	call_once(&kvm->arch.nx_once, kvm_mmu_start_lpage_recovery);
 	if (!kvm->arch.nx_huge_page_recovery_thread)
 		return -ENOMEM;
+
+	vhost_task_start(kvm->arch.nx_huge_page_recovery_thread);
 	return 0;
 }
 
+int kvm_mmu_post_init_vm(struct kvm *kvm)
+{
+	if (nx_hugepage_mitigation_hard_disabled)
+		return 0;
+
+	return call_once(&kvm->arch.nx_once, kvm_mmu_start_lpage_recovery);
+}
+
 void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
 {
 	if (kvm->arch.nx_huge_page_recovery_thread)
diff --git a/include/linux/call_once.h b/include/linux/call_once.h
index 6261aa0b3fb0..9d47ed50139b 100644
--- a/include/linux/call_once.h
+++ b/include/linux/call_once.h
@@ -26,20 +26,28 @@ do {									\
 	__once_init((once), #once, &__key);				\
 } while (0)
 
-static inline void call_once(struct once *once, void (*cb)(struct once *))
+static inline int call_once(struct once *once, int (*cb)(struct once *))
 {
+        int r;
+
         /* Pairs with atomic_set_release() below.  */
         if (atomic_read_acquire(&once->state) == ONCE_COMPLETED)
-                return;
+                return 0;
 
         guard(mutex)(&once->lock);
         WARN_ON(atomic_read(&once->state) == ONCE_RUNNING);
         if (atomic_read(&once->state) != ONCE_NOT_STARTED)
-                return;
+                return -EINVAL;
 
         atomic_set(&once->state, ONCE_RUNNING);
-        cb(once);
+        r = cb(once);
+        if (r) {
+                atomic_set(&once->state, ONCE_NOT_STARTED);
+                return r;
+        }
+
         atomic_set_release(&once->state, ONCE_COMPLETED);
+        return 0;
 }
 
 #endif /* _LINUX_CALL_ONCE_H */

base-commit: f7bafceba76e9ab475b413578c1757ee18c3e44b
--
Keith Busch Jan. 27, 2025, 5:04 p.m. UTC | #4
On Mon, Jan 27, 2025 at 08:48:03AM -0800, Sean Christopherson wrote:
> If vhost_task_create() fails, then the call_once() will "succeed" and mark the
> structure as ONCE_COMPLETED.  The first KVM_RUN will fail with -ENOMEM, but any
> subsequent calls will succeed, including in-flight KVM_RUNs on other threads.

The criteria for returning -ENOMEM for any KVM_RUN is if we have a NULL
nx_huge_page_recovery_thread vhost_task. So I think that part, at least,
is fine.

The call_once is just needed to ensure that only the very first KVM_RUN
even tries to create it. If the vhost_task_create fails, then all the
KVM_RUN threads will see the NULL nx_huge_page_recovery_thread and
return -ENOMEM.

What you're suggesting here will allow a subsequent thread to attempt
creating the vhost task if the first one failed. Maybe you do want to
try again, but the current upstream code doesn't retry this, so I
thought it best to keep that behavior.
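
[For reference, the current upstream flow being described here, reconstructed from the removed lines of the kvm_mmu_post_init_vm() hunk above: the NULL check sits outside the once, so every KVM_RUN that reaches this point re-checks the pointer even though only the first caller attempts creation:

	int kvm_mmu_post_init_vm(struct kvm *kvm)
	{
		if (nx_hugepage_mitigation_hard_disabled)
			return 0;

		/* Only the very first caller tries to create the vhost task... */
		call_once(&kvm->arch.nx_once, kvm_mmu_start_lpage_recovery);

		/* ...but every caller fails with -ENOMEM if creation failed. */
		if (!kvm->arch.nx_huge_page_recovery_thread)
			return -ENOMEM;
		return 0;
	}
]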
Sean Christopherson Jan. 27, 2025, 5:19 p.m. UTC | #5
On Mon, Jan 27, 2025, Keith Busch wrote:
> On Mon, Jan 27, 2025 at 08:48:03AM -0800, Sean Christopherson wrote:
> > If vhost_task_create() fails, then the call_once() will "succeed" and mark the
> > structure as ONCE_COMPLETED.  The first KVM_RUN will fail with -ENOMEM, but any
> > subsequent calls will succeed, including in-flight KVM_RUNs on other threads.
> 
> The criteria for returning -ENOMEM for any KVM_RUN is if we have a NULL
> nx_huge_page_recovery_thread vhost_task. So I think that part, at least,
> is fine.
> 
> The call_once is just needed to ensure that only the very first KVM_RUN
> even tries to create it. If the vhost_task_create fails, then all the
> KVM_RUN threads will see the NULL nx_huge_page_recovery_thread and
> return -ENOMEM.

Ah, duh, because the check is performed by the caller, outside of the "once"
protection.

> What you're suggesting here will allow a subsequent thread to attempt
> creating the vhost task if the first one failed. Maybe you do want to
> try again, but the current upstream code doesn't retry this, so I
> thought it best to keep that behavior.

No strong opinion.  In practice, it's a moot point because the odds of a VM being
able to make forward progress if task creation hits an OOM are basically nil.

I'll defer to Paolo on what he thinks is best for the call_once() API.
Keith Busch Jan. 27, 2025, 6:22 p.m. UTC | #6
On Mon, Jan 27, 2025 at 08:48:03AM -0800, Sean Christopherson wrote:
> > > -		vhost_task_start(kvm->arch.nx_huge_page_recovery_thread);
> > > +	if (!nx_thread)
> > > +		return;
> > > +
> > > +	vhost_task_start(nx_thread);
> > > +
> > > +	/* Make the task visible only once it is fully started. */
> > > +	WRITE_ONCE(kvm->arch.nx_huge_page_recovery_thread, nx_thread);
> > 
> > I believe the WRITE_ONCE needs to happen before the vhost_task_start to
> > ensure the parameter update callback can see it before it's started.
> 
> It's not clear to me that calling vhost_task_wake() before vhost_task_start() is
> allowed, which is why I deliberately waited until the task was started to make it
> visible.  Though FWIW, doing "vhost_task_wake(nx_thread)" before vhost_task_start()
> doesn't explode.

Hm, it does look questionable to try to wake a process that hadn't been
started yet, but I think it may be okay: task state will be TASK_NEW
before vhost_task_start(), which looks like will cause wake_up_process()
to do nothing.
Paolo Bonzini Jan. 28, 2025, 3:41 p.m. UTC | #7
On 1/27/25 19:22, Keith Busch wrote:
>> It's not clear to me that calling vhost_task_wake() before vhost_task_start() is
>> allowed, which is why I deliberately waited until the task was started to make it
>> visible.  Though FWIW, doing "vhost_task_wake(nx_thread)" before vhost_task_start()
>> doesn't explode.
>
> Hm, it does look questionable to try to wake a process that hadn't been
> started yet, but I think it may be okay: task state will be TASK_NEW
> before vhost_task_start(), which looks like will cause wake_up_process()
> to do nothing.

Yes, it's okay because both wake_up_new_task() and try_to_wake_up() take
p->pi_lock.  try_to_wake_up() does not match either bit in TASK_NORMAL
(which is TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE) and does nothing.
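[Roughly, the call chain under discussion, paraphrasing the scheduler logic rather than quoting the kernel source verbatim:

	vhost_task_wake(vtsk)
	  wake_up_process(vtsk->task)
	    try_to_wake_up(p, TASK_NORMAL, 0)
	      /*
	       * Until vhost_task_start() calls wake_up_new_task(), p->__state
	       * is TASK_NEW, which matches neither bit of TASK_NORMAL
	       * (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE), so a premature
	       * wakeup is a nop.  Both paths take p->pi_lock, so there is no
	       * racy window between them.
	       */
]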

I'm queuing the patch with the store before vhost_task_start, and
acquire/release instead of just READ_ONCE/WRITE_ONCE.

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 74c20dbb92da..6d5708146384 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -7127,7 +7127,8 @@ static void kvm_wake_nx_recovery_thread(struct kvm *kvm)
  	 * may not be valid even though the VM is globally visible.  Do nothing,
  	 * as such a VM can't have any possible NX huge pages.
  	 */
-	struct vhost_task *nx_thread = READ_ONCE(kvm->arch.nx_huge_page_recovery_thread);
+	struct vhost_task *nx_thread =
+		smp_load_acquire(&kvm->arch.nx_huge_page_recovery_thread);
  
  	if (nx_thread)
  		vhost_task_wake(nx_thread);
@@ -7474,10 +7475,10 @@ static void kvm_mmu_start_lpage_recovery(struct once *once)
  	if (!nx_thread)
  		return;
  
-	vhost_task_start(nx_thread);
+	/* Make the task visible only once it is fully created. */
+	smp_store_release(&kvm->arch.nx_huge_page_recovery_thread, nx_thread);
  
-	/* Make the task visible only once it is fully started. */
-	WRITE_ONCE(kvm->arch.nx_huge_page_recovery_thread, nx_thread);
+	vhost_task_start(nx_thread);
  }
  
  int kvm_mmu_post_init_vm(struct kvm *kvm)
Keith Busch Jan. 28, 2025, 3:44 p.m. UTC | #8
On Tue, Jan 28, 2025 at 04:41:41PM +0100, Paolo Bonzini wrote:
> I'm queuing the patch with the store before vhost_task_start, and
> acquire/release instead of just READ_ONCE/WRITE_ONCE.

Thanks, looks good to me:

Reviewed-by: Keith Busch <kbusch@kernel.org>
 
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 74c20dbb92da..6d5708146384 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -7127,7 +7127,8 @@ static void kvm_wake_nx_recovery_thread(struct kvm *kvm)
>  	 * may not be valid even though the VM is globally visible.  Do nothing,
>  	 * as such a VM can't have any possible NX huge pages.
>  	 */
> -	struct vhost_task *nx_thread = READ_ONCE(kvm->arch.nx_huge_page_recovery_thread);
> +	struct vhost_task *nx_thread =
> +		smp_load_acquire(&kvm->arch.nx_huge_page_recovery_thread);
>  	if (nx_thread)
>  		vhost_task_wake(nx_thread);
> @@ -7474,10 +7475,10 @@ static void kvm_mmu_start_lpage_recovery(struct once *once)
>  	if (!nx_thread)
>  		return;
> -	vhost_task_start(nx_thread);
> +	/* Make the task visible only once it is fully created. */
> +	smp_store_release(&kvm->arch.nx_huge_page_recovery_thread, nx_thread);
> -	/* Make the task visible only once it is fully started. */
> -	WRITE_ONCE(kvm->arch.nx_huge_page_recovery_thread, nx_thread);
> +	vhost_task_start(nx_thread);
>  }
>  int kvm_mmu_post_init_vm(struct kvm *kvm)
>
Paolo Bonzini Feb. 4, 2025, 4:28 p.m. UTC | #9
Queued, thanks.

Paolo

Patch

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index a45ae60e84ab..74c20dbb92da 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -7120,6 +7120,19 @@  static void mmu_destroy_caches(void)
 	kmem_cache_destroy(mmu_page_header_cache);
 }
 
+static void kvm_wake_nx_recovery_thread(struct kvm *kvm)
+{
+	/*
+	 * The NX recovery thread is spawned on-demand at the first KVM_RUN and
+	 * may not be valid even though the VM is globally visible.  Do nothing,
+	 * as such a VM can't have any possible NX huge pages.
+	 */
+	struct vhost_task *nx_thread = READ_ONCE(kvm->arch.nx_huge_page_recovery_thread);
+
+	if (nx_thread)
+		vhost_task_wake(nx_thread);
+}
+
 static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp)
 {
 	if (nx_hugepage_mitigation_hard_disabled)
@@ -7180,7 +7193,7 @@  static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
 			kvm_mmu_zap_all_fast(kvm);
 			mutex_unlock(&kvm->slots_lock);
 
-			vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread);
+			kvm_wake_nx_recovery_thread(kvm);
 		}
 		mutex_unlock(&kvm_lock);
 	}
@@ -7315,7 +7328,7 @@  static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel
 		mutex_lock(&kvm_lock);
 
 		list_for_each_entry(kvm, &vm_list, vm_list)
-			vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread);
+			kvm_wake_nx_recovery_thread(kvm);
 
 		mutex_unlock(&kvm_lock);
 	}
@@ -7451,14 +7464,20 @@  static void kvm_mmu_start_lpage_recovery(struct once *once)
 {
 	struct kvm_arch *ka = container_of(once, struct kvm_arch, nx_once);
 	struct kvm *kvm = container_of(ka, struct kvm, arch);
+	struct vhost_task *nx_thread;
 
 	kvm->arch.nx_huge_page_last = get_jiffies_64();
-	kvm->arch.nx_huge_page_recovery_thread = vhost_task_create(
-		kvm_nx_huge_page_recovery_worker, kvm_nx_huge_page_recovery_worker_kill,
-		kvm, "kvm-nx-lpage-recovery");
+	nx_thread = vhost_task_create(kvm_nx_huge_page_recovery_worker,
+				      kvm_nx_huge_page_recovery_worker_kill,
+				      kvm, "kvm-nx-lpage-recovery");
 
-	if (kvm->arch.nx_huge_page_recovery_thread)
-		vhost_task_start(kvm->arch.nx_huge_page_recovery_thread);
+	if (!nx_thread)
+		return;
+
+	vhost_task_start(nx_thread);
+
+	/* Make the task visible only once it is fully started. */
+	WRITE_ONCE(kvm->arch.nx_huge_page_recovery_thread, nx_thread);
 }
 
 int kvm_mmu_post_init_vm(struct kvm *kvm)