
mm/mmu_notifier.c: Fix race in mmu_interval_notifier_remove()

Message ID 20220414031810.1787209-1-apopple@nvidia.com (mailing list archive)
State New
Series mm/mmu_notifier.c: Fix race in mmu_interval_notifier_remove()

Commit Message

Alistair Popple April 14, 2022, 3:18 a.m. UTC
In some cases it is possible for mmu_interval_notifier_remove() to race
with mn_tree_inv_end(), allowing mmu_interval_notifier_remove() to
return while the notifier data structure is still in use. Consider the
following sequence:

CPU0 - mn_tree_inv_end()            CPU1 - mmu_interval_notifier_remove()
----------------------------------- ------------------------------------
                                    spin_lock(subscriptions->lock);
                                    seq = subscriptions->invalidate_seq;
spin_lock(subscriptions->lock);     spin_unlock(subscriptions->lock);
subscriptions->invalidate_seq++;
                                    wait_event(invalidate_seq != seq);
                                    return;
interval_tree_remove(interval_sub); kfree(interval_sub);
spin_unlock(subscriptions->lock);
wake_up_all();

As the wait_event() condition is true, it will return immediately. This
can lead to use-after-free errors if the caller frees the data
structure containing the interval notifier subscription while it is
still on a deferred list. Fix this by changing invalidate_seq to an
atomic type, as it is read outside of the lock, and by moving the
increment to after the deferred lists have been updated.

Signed-off-by: Alistair Popple <apopple@nvidia.com>
Fixes: 99cb252f5e68 ("mm/mmu_notifier: add an interval tree notifier")
---
 mm/mmu_notifier.c | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)
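
To make the fix easier to follow, here is a minimal userspace sketch of the
ordering the patch enforces. It is only a model: the names
model_subscriptions, model_inv_end and model_remove are invented for
illustration, a busy-wait stands in for wait_event()/wake_up_all(), and C11
release/acquire ordering stands in for the kernel's barrier and atomic read;
the real code is in mm/mmu_notifier.c below.

/*
 * Minimal userspace model of the ordering the patch enforces.  All
 * names here are hypothetical; the real code lives in mm/mmu_notifier.c.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

struct model_subscriptions {
        pthread_mutex_t lock;
        atomic_long invalidate_seq;
        bool entry_on_deferred_list;    /* stands in for the deferred hlist */
};

/* The mn_tree_inv_end() role (CPU0 in the diagram above). */
static void model_inv_end(struct model_subscriptions *s)
{
        pthread_mutex_lock(&s->lock);
        /* 1. Finish the deferred add/remove work first ... */
        s->entry_on_deferred_list = false;
        /*
         * 2. ... only then publish the new sequence number.  The release
         * ordering is the userspace stand-in for the patch's
         * smp_mb__before_atomic() + atomic_long_inc() pairing.
         */
        atomic_fetch_add_explicit(&s->invalidate_seq, 1, memory_order_release);
        pthread_mutex_unlock(&s->lock);
        /* wake_up_all(&subscriptions->wq) happens here in the kernel */
}

/* The mmu_interval_notifier_remove() role (CPU1 in the diagram above). */
static void model_remove(struct model_subscriptions *s)
{
        long seq;

        pthread_mutex_lock(&s->lock);
        s->entry_on_deferred_list = true;       /* queued for deferred removal */
        seq = atomic_load_explicit(&s->invalidate_seq, memory_order_relaxed);
        pthread_mutex_unlock(&s->lock);

        /* The kernel sleeps in wait_event() instead of spinning. */
        while (atomic_load_explicit(&s->invalidate_seq,
                                    memory_order_acquire) == seq)
                ;
        /*
         * Because the increment is ordered after the deferred-list update,
         * observing a new seq guarantees the entry is off the list and the
         * caller may free it.
         */
}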

Comments

Jason Gunthorpe April 14, 2022, 1:50 p.m. UTC | #1
On Thu, Apr 14, 2022 at 01:18:10PM +1000, Alistair Popple wrote:
> In some cases it is possible for mmu_interval_notifier_remove() to race
> with mn_tree_inv_end(), allowing mmu_interval_notifier_remove() to
> return while the notifier data structure is still in use. Consider the
> following sequence:
> 
> CPU0 - mn_tree_inv_end()            CPU1 - mmu_interval_notifier_remove()
>                                     spin_lock(subscriptions->lock);
>                                     seq = subscriptions->invalidate_seq;
> spin_lock(subscriptions->lock);     spin_unlock(subscriptions->lock);
> subscriptions->invalidate_seq++;
>                                     wait_event(invalidate_seq != seq);
>                                     return;
> interval_tree_remove(interval_sub); kfree(interval_sub);
> spin_unlock(subscriptions->lock);
> wake_up_all();
> 
> As the wait_event() condition is true, it will return immediately. This
> can lead to use-after-free errors if the caller frees the data
> structure containing the interval notifier subscription while it is
> still on a deferred list. Fix this by changing invalidate_seq to an
> atomic type, as it is read outside of the lock, and by moving the
> increment to after the deferred lists have been updated.

Oh, yes, that is a mistake.

I would not solve it with more unlocked atomics though, this is just a
simple case of a missing lock - can you look at this and if you like
it post it as a patch please?

diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 459d195d2ff64b..f45ff1b7626a62 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -1036,6 +1036,18 @@ int mmu_interval_notifier_insert_locked(
 }
 EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert_locked);
 
+static bool
+mmu_interval_seq_released(struct mmu_notifier_subscriptions *subscriptions,
+			  unsigned long seq)
+{
+	bool ret;
+
+	spin_lock(&subscriptions->lock);
+	ret = subscriptions->invalidate_seq != seq;
+	spin_unlock(&subscriptions->lock);
+	return ret;
+}
+
 /**
  * mmu_interval_notifier_remove - Remove a interval notifier
  * @interval_sub: Interval subscription to unregister
@@ -1083,7 +1095,7 @@ void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub)
 	lock_map_release(&__mmu_notifier_invalidate_range_start_map);
 	if (seq)
 		wait_event(subscriptions->wq,
-			   READ_ONCE(subscriptions->invalidate_seq) != seq);
+			   mmu_interval_seq_released(subscriptions, seq));
 
 	/* pairs with mmgrab in mmu_interval_notifier_insert() */
 	mmdrop(mm);

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
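
As a rough illustration of why evaluating the condition under the lock is
sufficient, here is a generic userspace sketch. The names guarded_state,
writer_inv_end and seq_released are hypothetical and this is not the
mmu_notifier implementation; it only models the locking pattern suggested
above.

#include <pthread.h>
#include <stdbool.h>

struct guarded_state {
        pthread_mutex_t lock;
        unsigned long seq;              /* only ever touched under 'lock' */
        bool entry_on_deferred_list;    /* ditto */
};

/*
 * Writer: the deferred-list drain and the seq bump sit in one critical
 * section, so a reader that takes the lock observes either neither
 * update or both of them.
 */
static void writer_inv_end(struct guarded_state *st)
{
        pthread_mutex_lock(&st->lock);
        st->entry_on_deferred_list = false;
        st->seq++;
        pthread_mutex_unlock(&st->lock);
        /* wake_up_all() would follow here */
}

/* Reader-side condition helper, analogous to mmu_interval_seq_released(). */
static bool seq_released(struct guarded_state *st, unsigned long seq)
{
        bool ret;

        pthread_mutex_lock(&st->lock);
        ret = st->seq != seq;
        pthread_mutex_unlock(&st->lock);
        return ret;
}

Because the condition is checked under the same lock the writer holds for
both updates, their relative order inside the writer's critical section no
longer matters, which is why no atomic type or explicit barrier is needed.
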
Alistair Popple April 19, 2022, 1:51 a.m. UTC | #2
Jason Gunthorpe <jgg@ziepe.ca> writes:

> On Thu, Apr 14, 2022 at 01:18:10PM +1000, Alistair Popple wrote:
>> In some cases it is possible for mmu_interval_notifier_remove() to race
>> with mn_tree_inv_end(), allowing mmu_interval_notifier_remove() to
>> return while the notifier data structure is still in use. Consider the
>> following sequence:
>>
>> CPU0 - mn_tree_inv_end()            CPU1 - mmu_interval_notifier_remove()
>>                                     spin_lock(subscriptions->lock);
>>                                     seq = subscriptions->invalidate_seq;
>> spin_lock(subscriptions->lock);     spin_unlock(subscriptions->lock);
>> subscriptions->invalidate_seq++;
>>                                     wait_event(invalidate_seq != seq);
>>                                     return;
>> interval_tree_remove(interval_sub); kfree(interval_sub);
>> spin_unlock(subscriptions->lock);
>> wake_up_all();
>>
>> As the wait_event() condition is true, it will return immediately. This
>> can lead to use-after-free errors if the caller frees the data
>> structure containing the interval notifier subscription while it is
>> still on a deferred list. Fix this by changing invalidate_seq to an
>> atomic type, as it is read outside of the lock, and by moving the
>> increment to after the deferred lists have been updated.
>
> Oh, yes, that is a mistake.
>
> I would not solve it with more unlocked atomics though, this is just a
> simple case of a missing lock - can you look at this and if you like
> it post it as a patch please?

Yep, that looks good and is easier to understand. For some reason I had assumed
the lack of locking was intentional. Will post the below fix as v2.

> diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
> index 459d195d2ff64b..f45ff1b7626a62 100644
> --- a/mm/mmu_notifier.c
> +++ b/mm/mmu_notifier.c
> @@ -1036,6 +1036,18 @@ int mmu_interval_notifier_insert_locked(
>  }
>  EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert_locked);
>
> +static bool
> +mmu_interval_seq_released(struct mmu_notifier_subscriptions *subscriptions,
> +			  unsigned long seq)
> +{
> +	bool ret;
> +
> +	spin_lock(&subscriptions->lock);
> +	ret = subscriptions->invalidate_seq != seq;
> +	spin_unlock(&subscriptions->lock);
> +	return ret;
> +}
> +
>  /**
>   * mmu_interval_notifier_remove - Remove a interval notifier
>   * @interval_sub: Interval subscription to unregister
> @@ -1083,7 +1095,7 @@ void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub)
>  	lock_map_release(&__mmu_notifier_invalidate_range_start_map);
>  	if (seq)
>  		wait_event(subscriptions->wq,
> -			   READ_ONCE(subscriptions->invalidate_seq) != seq);
> +			   mmu_interval_seq_released(subscriptions, seq));
>
>  	/* pairs with mmgrab in mmu_interval_notifier_insert() */
>  	mmdrop(mm);
>
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>

Patch

diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 3f3bbcd298c6..41eb5c2e09f4 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -40,7 +40,7 @@  struct mmu_notifier_subscriptions {
 	bool has_itree;
 	/* to serialize the list modifications and hlist_unhashed */
 	spinlock_t lock;
-	unsigned long invalidate_seq;
+	atomic_long_t invalidate_seq;
 	unsigned long active_invalidate_ranges;
 	struct rb_root_cached itree;
 	wait_queue_head_t wq;
@@ -87,7 +87,7 @@  static bool
 mn_itree_is_invalidating(struct mmu_notifier_subscriptions *subscriptions)
 {
 	lockdep_assert_held(&subscriptions->lock);
-	return subscriptions->invalidate_seq & 1;
+	return atomic_long_read(&subscriptions->invalidate_seq) & 1;
 }
 
 static struct mmu_interval_notifier *
@@ -103,12 +103,12 @@  mn_itree_inv_start_range(struct mmu_notifier_subscriptions *subscriptions,
 	node = interval_tree_iter_first(&subscriptions->itree, range->start,
 					range->end - 1);
 	if (node) {
-		subscriptions->invalidate_seq |= 1;
+		atomic_long_or(1, &subscriptions->invalidate_seq);
 		res = container_of(node, struct mmu_interval_notifier,
 				   interval_tree);
 	}
 
-	*seq = subscriptions->invalidate_seq;
+	*seq = atomic_long_read(&subscriptions->invalidate_seq);
 	spin_unlock(&subscriptions->lock);
 	return res;
 }
@@ -138,9 +138,6 @@  static void mn_itree_inv_end(struct mmu_notifier_subscriptions *subscriptions)
 		return;
 	}
 
-	/* Make invalidate_seq even */
-	subscriptions->invalidate_seq++;
-
 	/*
 	 * The inv_end incorporates a deferred mechanism like rtnl_unlock().
 	 * Adds and removes are queued until the final inv_end happens then
@@ -158,6 +155,13 @@  static void mn_itree_inv_end(struct mmu_notifier_subscriptions *subscriptions)
 					     &subscriptions->itree);
 		hlist_del(&interval_sub->deferred_item);
 	}
+
+	/* Pairs with the atomic_long_read in mmu_interval_notifier_remove(). */
+	smp_mb__before_atomic();
+
+	/* Make invalidate_seq even */
+	atomic_long_inc(&subscriptions->invalidate_seq);
+
 	spin_unlock(&subscriptions->lock);
 
 	wake_up_all(&subscriptions->wq);
@@ -232,7 +236,7 @@  mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub)
 	spin_lock(&subscriptions->lock);
 	/* Pairs with the WRITE_ONCE in mmu_interval_set_seq() */
 	seq = READ_ONCE(interval_sub->invalidate_seq);
-	is_invalidating = seq == subscriptions->invalidate_seq;
+	is_invalidating = seq == atomic_long_read(&subscriptions->invalidate_seq);
 	spin_unlock(&subscriptions->lock);
 
 	/*
@@ -246,7 +250,7 @@  mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub)
 	lock_map_release(&__mmu_notifier_invalidate_range_start_map);
 	if (is_invalidating)
 		wait_event(subscriptions->wq,
-			   READ_ONCE(subscriptions->invalidate_seq) != seq);
+			   atomic_long_read(&subscriptions->invalidate_seq) != seq);
 
 	/*
 	 * Notice that mmu_interval_read_retry() can already be true at this
@@ -648,7 +652,7 @@  int __mmu_notifier_register(struct mmu_notifier *subscription,
 
 		INIT_HLIST_HEAD(&subscriptions->list);
 		spin_lock_init(&subscriptions->lock);
-		subscriptions->invalidate_seq = 2;
+		atomic_long_set(&subscriptions->invalidate_seq, 2);
 		subscriptions->itree = RB_ROOT_CACHED;
 		init_waitqueue_head(&subscriptions->wq);
 		INIT_HLIST_HEAD(&subscriptions->deferred_list);
@@ -954,11 +958,11 @@  static int __mmu_interval_notifier_insert(
 			hlist_add_head(&interval_sub->deferred_item,
 				       &subscriptions->deferred_list);
 		else {
-			subscriptions->invalidate_seq |= 1;
+			atomic_long_or(1, &subscriptions->invalidate_seq);
 			interval_tree_insert(&interval_sub->interval_tree,
 					     &subscriptions->itree);
 		}
-		interval_sub->invalidate_seq = subscriptions->invalidate_seq;
+		interval_sub->invalidate_seq = atomic_long_read(&subscriptions->invalidate_seq);
 	} else {
 		WARN_ON(mn_itree_is_invalidating(subscriptions));
 		/*
@@ -968,7 +972,7 @@  static int __mmu_interval_notifier_insert(
 		 * soon.
 		 */
 		interval_sub->invalidate_seq =
-			subscriptions->invalidate_seq - 1;
+			atomic_long_read(&subscriptions->invalidate_seq) - 1;
 		interval_tree_insert(&interval_sub->interval_tree,
 				     &subscriptions->itree);
 	}
@@ -1066,7 +1070,7 @@  void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub)
 		} else {
 			hlist_add_head(&interval_sub->deferred_item,
 				       &subscriptions->deferred_list);
-			seq = subscriptions->invalidate_seq;
+			seq = atomic_long_read(&subscriptions->invalidate_seq);
 		}
 	} else {
 		if (WARN_ON(RB_EMPTY_NODE(&interval_sub->interval_tree.rb))) {
@@ -1086,7 +1090,7 @@  void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub)
 	lock_map_release(&__mmu_notifier_invalidate_range_start_map);
 	if (seq)
 		wait_event(subscriptions->wq,
-			   READ_ONCE(subscriptions->invalidate_seq) != seq);
+			   atomic_long_read(&subscriptions->invalidate_seq) != seq);
 
 	/* pairs with mmgrab in mmu_interval_notifier_insert() */
 	mmdrop(mm);