
[v3] x86/sgx: Fix deadlock and race conditions between fork() and EPC reclaim

Message ID 20200404010741.24486-1-jarkko.sakkinen@linux.intel.com (mailing list archive)
State New, archived

Commit Message

Jarkko Sakkinen April 4, 2020, 1:07 a.m. UTC
From: Sean Christopherson <sean.j.christopherson@intel.com>

Drop the synchronize_srcu() from sgx_encl_mm_add() and replace it with a
mm_list versioning concept to avoid deadlock when adding a mm during
dup_mmap()/fork(), and to ensure copied PTEs are zapped.

When dup_mmap() runs, it holds mmap_sem for write in both the old mm and
new mm.  Invoking synchronize_srcu() while holding mmap_sem of a mm that
is already attached to the enclave will deadlock if the reclaimer is in
the process of walking mm_list, as the reclaimer will try to acquire
mmap_sem (of the old mm) while holding encl->srcu for read.

 INFO: task ksgxswapd:181 blocked for more than 120 seconds.
 ksgxswapd       D    0   181      2 0x80004000
 Call Trace:
  __schedule+0x2db/0x700
  schedule+0x44/0xb0
  rwsem_down_read_slowpath+0x370/0x470
  down_read+0x95/0xa0
  sgx_reclaim_pages+0x1d2/0x7d0
  ksgxswapd+0x151/0x2e0
  kthread+0x120/0x140
  ret_from_fork+0x35/0x40

 INFO: task fork_consistenc:18824 blocked for more than 120 seconds.
 fork_consistenc D    0 18824  18786 0x00004320
 Call Trace:
  __schedule+0x2db/0x700
  schedule+0x44/0xb0
  schedule_timeout+0x205/0x300
  wait_for_completion+0xb7/0x140
  __synchronize_srcu.part.22+0x81/0xb0
  synchronize_srcu_expedited+0x27/0x30
  synchronize_srcu+0x57/0xe0
  sgx_encl_mm_add+0x12b/0x160
  sgx_vma_open+0x22/0x40
  dup_mm+0x521/0x580
  copy_process+0x1a56/0x1b50
  _do_fork+0x85/0x3a0
  __x64_sys_clone+0x8e/0xb0
  do_syscall_64+0x57/0x1b0
  entry_SYSCALL_64_after_hwframe+0x44/0xa9

Furthermore, doing synchronize_srcu() in sgx_encl_mm_add() does not
prevent the new mm from having stale PTEs pointing at the EPC page to be
reclaimed.  dup_mmap() calls vm_ops->open()/sgx_encl_mm_add() _after_
PTEs are copied to the new mm, i.e. blocking fork() until reclaim zaps
the old mm is pointless as the stale PTEs have already been created in
the new mm.

All other flows that walk mm_list can safely race with dup_mmap() or are
protected by a different mechanism.  Add comments to all srcu readers
that don't check the list version to document why it's OK for the flow to
ignore the version.

Note, synchronize_srcu() is still needed when removing a mm from an
enclave, as the srcu readers must complete their walk before the mm can
be freed.  Removing a mm is never done while holding mmap_sem.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
---
v3:
* Sanitized the list version handling in sgx_reclaimer_block().
  With the fences it was quite complicated given that the version
  was read both at the beginning and at the end of the loop.
* Removed comment before cpumask_clear() because technically it is
  not part of this bug fix.
v2:
* Remove smp_wmb() as x86 does not reorder writes in the pipeline.
* Refine comments to be more to the point and easier to maintain when
  things change.
* Replace the ad hoc (goto-based) loop construct with a proper loop
  construct.
 arch/x86/kernel/cpu/sgx/encl.c    | 11 +++++++--
 arch/x86/kernel/cpu/sgx/encl.h    |  1 +
 arch/x86/kernel/cpu/sgx/ioctl.c   |  1 +
 arch/x86/kernel/cpu/sgx/reclaim.c | 41 ++++++++++++++++++++++---------
 4 files changed, 40 insertions(+), 14 deletions(-)
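
For illustration, the synchronization described in the commit message pairs a
writer-side version bump with a reader-side retry loop.  Stripped of the
surrounding reclaim logic, the scheme is roughly the sketch below (not the
literal hunks, which follow in full):

	/* Writer side, sgx_encl_mm_add(), mmap_sem held for write: */
	spin_lock(&encl->mm_lock);
	list_add_rcu(&encl_mm->list, &encl->mm_list);
	encl->mm_list_version++;
	spin_unlock(&encl->mm_lock);

	/* Reader side, reclaimer: re-walk until the version stops moving. */
	uint64_t version = 0, next;

	for ( ; ; ) {
		next = encl->mm_list_version;
		if (version == next)
			break;

		version = next;
		/* Don't walk an old list snapshot with a new version. */
		smp_rmb();

		idx = srcu_read_lock(&encl->srcu);
		list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
			/* zap stale PTEs in this mm */
		}
		srcu_read_unlock(&encl->srcu, idx);
	}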

Comments

Sean Christopherson April 6, 2020, 4:15 p.m. UTC | #1
On Sat, Apr 04, 2020 at 04:07:41AM +0300, Jarkko Sakkinen wrote:
> From: Sean Christopherson <sean.j.christopherson@intel.com>
> diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
> index e0124a2f22d5..5b15352b3d4f 100644
> --- a/arch/x86/kernel/cpu/sgx/encl.c
> +++ b/arch/x86/kernel/cpu/sgx/encl.c
> @@ -196,6 +196,9 @@ int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm)
>  	struct sgx_encl_mm *encl_mm;
>  	int ret;
>  
> +	/* mm_list can be accessed only by a single thread at a time. */

s/accessed/mutated


> +	lockdep_assert_held_write(&mm->mmap_sem);
> +
>  	if (atomic_read(&encl->flags) & SGX_ENCL_DEAD)
>  		return -EINVAL;
>  
> @@ -221,12 +224,16 @@ int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm)
>  		return ret;
>  	}
>  
> +	/*
> +	 * The page reclaimer uses list version for synchronization instead of
> +	 * synchronize_srcu() because otherwise we could conflict with
> +	 * dup_mmap().
> +	 */
>  	spin_lock(&encl->mm_lock);
>  	list_add_rcu(&encl_mm->list, &encl->mm_list);
> +	encl->mm_list_version++;
>  	spin_unlock(&encl->mm_lock);
>  
> -	synchronize_srcu(&encl->srcu);
> -
>  	return 0;
>  }
>  
> diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h
> index 44b353aa8866..aa664363f229 100644
> --- a/arch/x86/kernel/cpu/sgx/encl.h
> +++ b/arch/x86/kernel/cpu/sgx/encl.h
> @@ -74,6 +74,7 @@ struct sgx_encl {
>  	struct mutex lock;
>  	struct list_head mm_list;
>  	spinlock_t mm_lock;
> +	uint64_t mm_list_version;
>  	struct file *backing;
>  	struct kref refcount;
>  	struct srcu_struct srcu;
> diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
> index 3af0596530a8..9b516f41b4d9 100644
> --- a/arch/x86/kernel/cpu/sgx/ioctl.c
> +++ b/arch/x86/kernel/cpu/sgx/ioctl.c
> @@ -212,6 +212,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs)
>  	encl->base = secs->base;
>  	encl->size = secs->size;
>  	encl->ssaframesize = secs->ssa_frame_size;
> +	encl->mm_list_version = 1;

This is unnecessary.  A mm_list_version of '0' means the list walk started
when there were no mm structs associated with the enclave, i.e. skipping
everything related to walking the list is ok.  It's subtle, and I dislike
relying on that behavior, but IMO it's preferable to incorrectly implying
that a list version of '0' is somehow bad.

>  	/*
>  	 * Set SGX_ENCL_CREATED only after the enclave is fully prepped.  This
> diff --git a/arch/x86/kernel/cpu/sgx/reclaim.c b/arch/x86/kernel/cpu/sgx/reclaim.c
> index 39f0ddefbb79..3483e9bc590a 100644
> --- a/arch/x86/kernel/cpu/sgx/reclaim.c
> +++ b/arch/x86/kernel/cpu/sgx/reclaim.c
> @@ -186,26 +186,43 @@ static void sgx_reclaimer_block(struct sgx_epc_page *epc_page)
>  	struct sgx_encl *encl = page->encl;
>  	struct sgx_encl_mm *encl_mm;
>  	struct vm_area_struct *vma;
> +	uint64_t version, next;
>  	int idx, ret;
>  
> -	idx = srcu_read_lock(&encl->srcu);
> +	version = 0;
>  
> -	list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
> -		if (!mmget_not_zero(encl_mm->mm))
> -			continue;
> +	for ( ; ; ) {
> +		next = encl->mm_list_version;
>  
> -		down_read(&encl_mm->mm->mmap_sem);
> +		if (version == next)
> +			break;

Functionally this works, but I personally find the logic kludgy, and it
generates worse code.  Not that we're at the point where counting uops is a
top priority, but I don't think it makes sense to go out of our way to make
the resulting code worse.

The main issue is that the "0 is invalid" approach means the loop
termination condition is both likely and unlikely, e.g. the first test of
"version == next", when version is explicitly 0, is unlikely, but subsequent
checks are likely since racing with adding a mm is expected to be very rare.


Without "likely", it requires a taken Jcc to break the loop.

        next = encl->mm_list_version;
0xffffffff8102e423 <+51>:    mov    0x58(%r12),%r15

        if (version == next) // if (next == 0)
                break;
0xffffffff8102e43d <+77>:    test   %r15,%r15
0xffffffff8102e440 <+80>:    je     0xffffffff8102e51f <sgx_reclaimer_block+303>

        next = encl->mm_list_version;
0xffffffff8102e509 <+281>:   mov    0x58(%r12),%rax

        if (version == next), i.e. if (next != 0)
                break;
0xffffffff8102e50e <+286>:   cmp    %r15,%rax
0xffffffff8102e511 <+289>:   je     0xffffffff8102e51f <sgx_reclaimer_block+303>
0xffffffff8102e513 <+291>:   mov    %rax,%r15
0xffffffff8102e516 <+294>:   jmpq   0xffffffff8102e446 <sgx_reclaimer_block+86>
0xffffffff8102e51b <+299>:   ud2


Using likely results in even worse code because the guts of the loop get
moved out of line at the bottom of the function, and executing the first (and,
most likely, only) iteration of the loop requires a taken Jcc.

        next = encl->mm_list_version;
0xffffffff8102e41e <+46>:    mov    0x58(%rbx),%r12

        if (likely(version == next))  // if (next == 0)
0xffffffff8102e422 <+50>:    test   %r12,%r12
0xffffffff8102e425 <+53>:    jne    0xffffffff8102e4cf <sgx_reclaimer_block+223>

        ...

        next = encl->mm_list_version;
0xffffffff8102e5ad <+445>:   mov    0x58(%rbx),%rax

        if (likely(version == next))
0xffffffff8102e5b1 <+449>:   cmp    %r12,%rax
0xffffffff8102e5b4 <+452>:   je     0xffffffff8102e42b <sgx_reclaimer_block+59>
0xffffffff8102e5ba <+458>:   mov    %rax,%r12
0xffffffff8102e5bd <+461>:   jmpq   0xffffffff8102e4e6 <sgx_reclaimer_block+246>


Contrast that with the do-while form, which puts everything inline and does
not require a taken Jcc.  Note, the number of reads from encl->mm_list_version
is identical, i.e. the compiler isn't stupid.

        mm_list_version = encl->mm_list_version;
0xffffffff8102e441 <+49>:    mov    0x58(%r12),%rax
0xffffffff8102e452 <+66>:    mov    %rax,0x8(%rsp)

	...

        } while (unlikely(encl->mm_list_version != mm_list_version));
0xffffffff8102e527 <+279>:   mov    0x58(%r12),%rax
0xffffffff8102e52c <+284>:   cmp    0x8(%rsp),%rax
0xffffffff8102e531 <+289>:   jne    0xffffffff8102e5e5 <sgx_reclaimer_block+469>
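
In C, the do-while form being compared corresponds roughly to the following
shape (a sketch with the loop body elided, reconstructed from the annotations
above; the actual v2 code may differ in detail):

	uint64_t mm_list_version;

	do {
		mm_list_version = encl->mm_list_version;

		/* Fence reads, as in the patch below. */
		smp_rmb();

		idx = srcu_read_lock(&encl->srcu);

		list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
			/* ... same zap logic as in the hunk below ... */
		}

		srcu_read_unlock(&encl->srcu, idx);
	} while (unlikely(encl->mm_list_version != mm_list_version));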

>  
> -		ret = sgx_encl_find(encl_mm->mm, addr, &vma);
> -		if (!ret && encl == vma->vm_private_data)
> -			zap_vma_ptes(vma, addr, PAGE_SIZE);
> +		version = next;
>  
> -		up_read(&encl_mm->mm->mmap_sem);
> +		/* Fence reads as the CPU can reorder them. This guarantees
> +		 * that we don't access old list with a new version.
> +		 */
> +		smp_rmb();
>  
> -		mmput_async(encl_mm->mm);
> -	}
> +		idx = srcu_read_lock(&encl->srcu);
>  
> -	srcu_read_unlock(&encl->srcu, idx);
> +		list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
> +			if (!mmget_not_zero(encl_mm->mm))
> +				continue;
> +
> +			down_read(&encl_mm->mm->mmap_sem);
> +
> +			ret = sgx_encl_find(encl_mm->mm, addr, &vma);
> +			if (!ret && encl == vma->vm_private_data)
> +				zap_vma_ptes(vma, addr, PAGE_SIZE);
> +
> +			up_read(&encl_mm->mm->mmap_sem);
> +
> +			mmput_async(encl_mm->mm);
> +		}
> +
> +		srcu_read_unlock(&encl->srcu, idx);
> +	}
>  
>  	mutex_lock(&encl->lock);
>  
> -- 
> 2.25.1
>
Jarkko Sakkinen April 6, 2020, 8:41 p.m. UTC | #2
On Mon, Apr 06, 2020 at 09:15:57AM -0700, Sean Christopherson wrote:
> >  	encl->ssaframesize = secs->ssa_frame_size;
> > +	encl->mm_list_version = 1;
> 
> This is unnecessary.  A mm_list_version of '0' means the list walk started
> when there were no mm structs associated with the enclave, i.e. skipping
> everything related to walking the list is ok.  It's subtle, and I dislike
> relying on that behavior, but IMO it's preferable to incorrectly implying
> that a list version of '0' is somehow bad.

'0' means whatever the code requires it to mean. There is no absolute meaning.

> > +	for ( ; ; ) {
> > +		next = encl->mm_list_version;
> >  
> > -		down_read(&encl_mm->mm->mmap_sem);
> > +		if (version == next)
> > +			break;
> 
> Functionally this works, but I personally find the logic kludgy, and it
> generates worse code.  Not that we're at the point where counting uops is a
> top priority, but I don't think it makes sense to go out of our way to make
> the resulting code worse.

I'll pick v2 then.

/Jarkko

Patch

diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index e0124a2f22d5..5b15352b3d4f 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -196,6 +196,9 @@  int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm)
 	struct sgx_encl_mm *encl_mm;
 	int ret;
 
+	/* mm_list can be accessed only by a single thread at a time. */
+	lockdep_assert_held_write(&mm->mmap_sem);
+
 	if (atomic_read(&encl->flags) & SGX_ENCL_DEAD)
 		return -EINVAL;
 
@@ -221,12 +224,16 @@  int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm)
 		return ret;
 	}
 
+	/*
+	 * The page reclaimer uses list version for synchronization instead of
+	 * synchronize_srcu() because otherwise we could conflict with
+	 * dup_mmap().
+	 */
 	spin_lock(&encl->mm_lock);
 	list_add_rcu(&encl_mm->list, &encl->mm_list);
+	encl->mm_list_version++;
 	spin_unlock(&encl->mm_lock);
 
-	synchronize_srcu(&encl->srcu);
-
 	return 0;
 }
 
diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h
index 44b353aa8866..aa664363f229 100644
--- a/arch/x86/kernel/cpu/sgx/encl.h
+++ b/arch/x86/kernel/cpu/sgx/encl.h
@@ -74,6 +74,7 @@  struct sgx_encl {
 	struct mutex lock;
 	struct list_head mm_list;
 	spinlock_t mm_lock;
+	uint64_t mm_list_version;
 	struct file *backing;
 	struct kref refcount;
 	struct srcu_struct srcu;
diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
index 3af0596530a8..9b516f41b4d9 100644
--- a/arch/x86/kernel/cpu/sgx/ioctl.c
+++ b/arch/x86/kernel/cpu/sgx/ioctl.c
@@ -212,6 +212,7 @@  static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs)
 	encl->base = secs->base;
 	encl->size = secs->size;
 	encl->ssaframesize = secs->ssa_frame_size;
+	encl->mm_list_version = 1;
 
 	/*
 	 * Set SGX_ENCL_CREATED only after the enclave is fully prepped.  This
diff --git a/arch/x86/kernel/cpu/sgx/reclaim.c b/arch/x86/kernel/cpu/sgx/reclaim.c
index 39f0ddefbb79..3483e9bc590a 100644
--- a/arch/x86/kernel/cpu/sgx/reclaim.c
+++ b/arch/x86/kernel/cpu/sgx/reclaim.c
@@ -186,26 +186,43 @@  static void sgx_reclaimer_block(struct sgx_epc_page *epc_page)
 	struct sgx_encl *encl = page->encl;
 	struct sgx_encl_mm *encl_mm;
 	struct vm_area_struct *vma;
+	uint64_t version, next;
 	int idx, ret;
 
-	idx = srcu_read_lock(&encl->srcu);
+	version = 0;
 
-	list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
-		if (!mmget_not_zero(encl_mm->mm))
-			continue;
+	for ( ; ; ) {
+		next = encl->mm_list_version;
 
-		down_read(&encl_mm->mm->mmap_sem);
+		if (version == next)
+			break;
 
-		ret = sgx_encl_find(encl_mm->mm, addr, &vma);
-		if (!ret && encl == vma->vm_private_data)
-			zap_vma_ptes(vma, addr, PAGE_SIZE);
+		version = next;
 
-		up_read(&encl_mm->mm->mmap_sem);
+		/* Fence reads as the CPU can reorder them. This guarantees
+		 * that we don't access old list with a new version.
+		 */
+		smp_rmb();
 
-		mmput_async(encl_mm->mm);
-	}
+		idx = srcu_read_lock(&encl->srcu);
 
-	srcu_read_unlock(&encl->srcu, idx);
+		list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
+			if (!mmget_not_zero(encl_mm->mm))
+				continue;
+
+			down_read(&encl_mm->mm->mmap_sem);
+
+			ret = sgx_encl_find(encl_mm->mm, addr, &vma);
+			if (!ret && encl == vma->vm_private_data)
+				zap_vma_ptes(vma, addr, PAGE_SIZE);
+
+			up_read(&encl_mm->mm->mmap_sem);
+
+			mmput_async(encl_mm->mm);
+		}
+
+		srcu_read_unlock(&encl->srcu, idx);
+	}
 
 	mutex_lock(&encl->lock);