[RFC,04/11] mm/hmm: Simplify hmm_get_or_create and make it reliable

Message ID 20190523153436.19102-5-jgg@ziepe.ca (mailing list archive)
State New, archived
Series mm/hmm: Various revisions from a locking/code review

Commit Message

Jason Gunthorpe May 23, 2019, 3:34 p.m. UTC
From: Jason Gunthorpe <jgg@mellanox.com>

As coded this function can false-fail in various racy situations. Make it
reliable by running only under the write side of the mmap_sem and avoiding
the false-failing compare/exchange pattern.

Also make the locking very easy to understand by only ever reading or
writing mm->hmm while holding the write side of the mmap_sem.

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 mm/hmm.c | 75 ++++++++++++++++++++------------------------------------
 1 file changed, 27 insertions(+), 48 deletions(-)
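
For context, the "false-failing compare/exchange pattern" is the old
install-or-cleanup sequence below (a simplified sketch of the pre-patch
code path, not the exact source): if a dying hmm is still visible in
mm->hmm because its kref hit zero but the RCU free has not yet cleared
the pointer, mm_get_hmm() fails, a fresh hmm is allocated, and the
install then also fails -- so the caller gets NULL even though nothing
is actually out of memory.

	hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);
	/* ... init ... */
	spin_lock(&mm->page_table_lock);
	if (!mm->hmm)
		mm->hmm = hmm;		/* won the race: install */
	else
		cleanup = true;		/* mm->hmm still set: back out */
	spin_unlock(&mm->page_table_lock);

	if (cleanup)
		goto error;		/* false-fail: returns NULL */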

Comments

Ralph Campbell May 23, 2019, 11:38 p.m. UTC | #1
On 5/23/19 8:34 AM, Jason Gunthorpe wrote:
> From: Jason Gunthorpe <jgg@mellanox.com>
> 
> As coded this function can false-fail in various racy situations. Make it
> reliable by running only under the write side of the mmap_sem and avoiding
> the false-failing compare/exchange pattern.
> 
> Also make the locking very easy to understand by only ever reading or
> writing mm->hmm while holding the write side of the mmap_sem.
> 
> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
> ---
>   mm/hmm.c | 75 ++++++++++++++++++++------------------------------------
>   1 file changed, 27 insertions(+), 48 deletions(-)
> 
> diff --git a/mm/hmm.c b/mm/hmm.c
> index e27058e92508b9..ec54be54d81135 100644
> --- a/mm/hmm.c
> +++ b/mm/hmm.c
> @@ -40,16 +40,6 @@
>   #if IS_ENABLED(CONFIG_HMM_MIRROR)
>   static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
>   
> -static inline struct hmm *mm_get_hmm(struct mm_struct *mm)
> -{
> -	struct hmm *hmm = READ_ONCE(mm->hmm);
> -
> -	if (hmm && kref_get_unless_zero(&hmm->kref))
> -		return hmm;
> -
> -	return NULL;
> -}
> -
>   /**
>    * hmm_get_or_create - register HMM against an mm (HMM internal)
>    *
> @@ -64,11 +54,20 @@ static inline struct hmm *mm_get_hmm(struct mm_struct *mm)
>    */
>   static struct hmm *hmm_get_or_create(struct mm_struct *mm)
>   {
> -	struct hmm *hmm = mm_get_hmm(mm);
> -	bool cleanup = false;
> +	struct hmm *hmm;
>   
> -	if (hmm)
> -		return hmm;
> +	lockdep_assert_held_exclusive(mm->mmap_sem);
> +
> +	if (mm->hmm) {
> +		if (kref_get_unless_zero(&mm->hmm->kref))
> +			return mm->hmm;
> +		/*
> +		 * The hmm is being freed by some other CPU and is pending a
> +		 * RCU grace period, but this CPU can NULL now it since we
> +		 * have the mmap_sem.
> +		 */
> +		mm->hmm = NULL;

Shouldn't there be a "return NULL;" here so it doesn't fall through and
allocate a struct hmm below?

> +	}
>   
>   	hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);
>   	if (!hmm)
> @@ -85,54 +84,34 @@ static struct hmm *hmm_get_or_create(struct mm_struct *mm)
>   	hmm->mm = mm;
>   	mmgrab(hmm->mm);
>   
> -	spin_lock(&mm->page_table_lock);
> -	if (!mm->hmm)
> -		mm->hmm = hmm;
> -	else
> -		cleanup = true;
> -	spin_unlock(&mm->page_table_lock);
> -
> -	if (cleanup)
> -		goto error;
> -
> -	/*
> -	 * We should only get here if hold the mmap_sem in write mode ie on
> -	 * registration of first mirror through hmm_mirror_register()
> -	 */
>   	hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
> -	if (__mmu_notifier_register(&hmm->mmu_notifier, mm))
> -		goto error_mm;
> +	if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) {
> +		kfree(hmm);
> +		return NULL;
> +	}
>   
> +	mm->hmm = hmm;
>   	return hmm;
> -
> -error_mm:
> -	spin_lock(&mm->page_table_lock);
> -	if (mm->hmm == hmm)
> -		mm->hmm = NULL;
> -	spin_unlock(&mm->page_table_lock);
> -error:
> -	kfree(hmm);
> -	return NULL;
>   }
>   
>   static void hmm_fee_rcu(struct rcu_head *rcu)

I see Jerome already saw and named this hmm_free_rcu()
which I agree with.

>   {
> +	struct hmm *hmm = container_of(rcu, struct hmm, rcu);
> +
> +	down_write(&hmm->mm->mmap_sem);
> +	if (hmm->mm->hmm == hmm)
> +		hmm->mm->hmm = NULL;
> +	up_write(&hmm->mm->mmap_sem);
> +	mmdrop(hmm->mm);
> +
>   	kfree(container_of(rcu, struct hmm, rcu));
>   }
>   
>   static void hmm_free(struct kref *kref)
>   {
>   	struct hmm *hmm = container_of(kref, struct hmm, kref);
> -	struct mm_struct *mm = hmm->mm;
> -
> -	mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm);
>   
> -	spin_lock(&mm->page_table_lock);
> -	if (mm->hmm == hmm)
> -		mm->hmm = NULL;
> -	spin_unlock(&mm->page_table_lock);
> -
> -	mmdrop(hmm->mm);
> +	mmu_notifier_unregister_no_release(&hmm->mmu_notifier, hmm->mm);
>   	mmu_notifier_call_srcu(&hmm->rcu, hmm_fee_rcu);
>   }
>   
> 

-----------------------------------------------------------------------------------
This email message is for the sole use of the intended recipient(s) and may contain
confidential information.  Any unauthorized review, use, disclosure or distribution
is prohibited.  If you are not the intended recipient, please contact the sender by
reply email and destroy all copies of the original message.
-----------------------------------------------------------------------------------
Jason Gunthorpe May 24, 2019, 1:23 a.m. UTC | #2
On Thu, May 23, 2019 at 04:38:28PM -0700, Ralph Campbell wrote:
> 
> On 5/23/19 8:34 AM, Jason Gunthorpe wrote:
> > From: Jason Gunthorpe <jgg@mellanox.com>
> > 
> > As coded this function can false-fail in various racy situations. Make it
> > reliable by running only under the write side of the mmap_sem and avoiding
> > the false-failing compare/exchange pattern.
> > 
> > Also make the locking very easy to understand by only ever reading or
> > writing mm->hmm while holding the write side of the mmap_sem.
> > 
> > Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
> >   mm/hmm.c | 75 ++++++++++++++++++++------------------------------------
> >   1 file changed, 27 insertions(+), 48 deletions(-)
> > 
> > diff --git a/mm/hmm.c b/mm/hmm.c
> > index e27058e92508b9..ec54be54d81135 100644
> > +++ b/mm/hmm.c
> > @@ -40,16 +40,6 @@
> >   #if IS_ENABLED(CONFIG_HMM_MIRROR)
> >   static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
> > -static inline struct hmm *mm_get_hmm(struct mm_struct *mm)
> > -{
> > -	struct hmm *hmm = READ_ONCE(mm->hmm);
> > -
> > -	if (hmm && kref_get_unless_zero(&hmm->kref))
> > -		return hmm;
> > -
> > -	return NULL;
> > -}
> > -
> >   /**
> >    * hmm_get_or_create - register HMM against an mm (HMM internal)
> >    *
> > @@ -64,11 +54,20 @@ static inline struct hmm *mm_get_hmm(struct mm_struct *mm)
> >    */
> >   static struct hmm *hmm_get_or_create(struct mm_struct *mm)
> >   {
> > -	struct hmm *hmm = mm_get_hmm(mm);
> > -	bool cleanup = false;
> > +	struct hmm *hmm;
> > -	if (hmm)
> > -		return hmm;
> > +	lockdep_assert_held_exclusive(mm->mmap_sem);
> > +
> > +	if (mm->hmm) {
> > +		if (kref_get_unless_zero(&mm->hmm->kref))
> > +			return mm->hmm;
> > +		/*
> > +		 * The hmm is being freed by some other CPU and is pending a
> > +		 * RCU grace period, but this CPU can NULL now it since we
> > +		 * have the mmap_sem.
> > +		 */
> > +		mm->hmm = NULL;
> 
> Shouldn't there be a "return NULL;" here so it doesn't fall through and
> allocate a struct hmm below?

No, this function should only return NULL on memory allocation
failure.

In this case another thread is busy freeing the hmm but wasn't able to
update mm->hmm to null due to a locking constraint. So we make it null
on behalf of the other thread and allocate a fresh new hmm that is
valid. The freeing thread will complete the free and do nothing with
mm->hmm.
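
In timeline form (a sketch of the interleaving described above, against
the post-patch code):

	CPU0 (freeing)                      CPU1 (hmm_get_or_create)
	--------------                      ------------------------
	kref_put() drops kref to zero
	hmm_free() queues the RCU callback
	(mm->hmm still points at the
	 dying hmm)
	                                    down_write(&mm->mmap_sem)
	                                    mm->hmm != NULL, but
	                                    kref_get_unless_zero() fails
	                                    mm->hmm = NULL on CPU0's behalf
	                                    allocates and installs a new hmm
	RCU callback runs: mm->hmm no
	longer points at the dying hmm,
	so it frees without touching it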

> >   static void hmm_fee_rcu(struct rcu_head *rcu)
> 
> I see Jerome already saw and named this hmm_free_rcu()
> which I agree with.

I do love my typos :)

> >   {
> > +	struct hmm *hmm = container_of(rcu, struct hmm, rcu);
> > +
> > +	down_write(&hmm->mm->mmap_sem);
> > +	if (hmm->mm->hmm == hmm)
> > +		hmm->mm->hmm = NULL;
> > +	up_write(&hmm->mm->mmap_sem);
> > +	mmdrop(hmm->mm);
> > +
> >   	kfree(container_of(rcu, struct hmm, rcu));
> >   }
> >   static void hmm_free(struct kref *kref)
> >   {
> >   	struct hmm *hmm = container_of(kref, struct hmm, kref);
> > -	struct mm_struct *mm = hmm->mm;
> > -
> > -	mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm);
> > -	spin_lock(&mm->page_table_lock);
> > -	if (mm->hmm == hmm)
> > -		mm->hmm = NULL;
> > -	spin_unlock(&mm->page_table_lock);
> > -
> > -	mmdrop(hmm->mm);
> > +	mmu_notifier_unregister_no_release(&hmm->mmu_notifier, hmm->mm);
> >   	mmu_notifier_call_srcu(&hmm->rcu, hmm_fee_rcu);
> >   }
> > 
> 
> This email message is for the sole use of the intended recipient(s) and may contain
> confidential information.  Any unauthorized review, use, disclosure or distribution
> is prohibited.  If you are not the intended recipient, please contact the sender by
> reply email and destroy all copies of the original message.

Ah, you should not send this trailer to the public mailing lists.

Thanks,
Jason
Ralph Campbell May 24, 2019, 5:06 p.m. UTC | #3
On 5/23/19 6:23 PM, Jason Gunthorpe wrote:
> On Thu, May 23, 2019 at 04:38:28PM -0700, Ralph Campbell wrote:
>>
>> On 5/23/19 8:34 AM, Jason Gunthorpe wrote:
>>> From: Jason Gunthorpe <jgg@mellanox.com>
>>>
>>> As coded this function can false-fail in various racy situations. Make it
>>> reliable by running only under the write side of the mmap_sem and avoiding
>>> the false-failing compare/exchange pattern.
>>>
>>> Also make the locking very easy to understand by only ever reading or
>>> writing mm->hmm while holding the write side of the mmap_sem.
>>>
>>> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
>>>    mm/hmm.c | 75 ++++++++++++++++++++------------------------------------
>>>    1 file changed, 27 insertions(+), 48 deletions(-)
>>>
>>> diff --git a/mm/hmm.c b/mm/hmm.c
>>> index e27058e92508b9..ec54be54d81135 100644
>>> +++ b/mm/hmm.c
>>> @@ -40,16 +40,6 @@
>>>    #if IS_ENABLED(CONFIG_HMM_MIRROR)
>>>    static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
>>> -static inline struct hmm *mm_get_hmm(struct mm_struct *mm)
>>> -{
>>> -	struct hmm *hmm = READ_ONCE(mm->hmm);
>>> -
>>> -	if (hmm && kref_get_unless_zero(&hmm->kref))
>>> -		return hmm;
>>> -
>>> -	return NULL;
>>> -}
>>> -
>>>    /**
>>>     * hmm_get_or_create - register HMM against an mm (HMM internal)
>>>     *
>>> @@ -64,11 +54,20 @@ static inline struct hmm *mm_get_hmm(struct mm_struct *mm)
>>>     */
>>>    static struct hmm *hmm_get_or_create(struct mm_struct *mm)
>>>    {
>>> -	struct hmm *hmm = mm_get_hmm(mm);
>>> -	bool cleanup = false;
>>> +	struct hmm *hmm;
>>> -	if (hmm)
>>> -		return hmm;
>>> +	lockdep_assert_held_exclusive(mm->mmap_sem);
>>> +
>>> +	if (mm->hmm) {
>>> +		if (kref_get_unless_zero(&mm->hmm->kref))
>>> +			return mm->hmm;
>>> +		/*
>>> +		 * The hmm is being freed by some other CPU and is pending a
>>> +		 * RCU grace period, but this CPU can NULL now it since we
>>> +		 * have the mmap_sem.
>>> +		 */
>>> +		mm->hmm = NULL;
>>
>> Shouldn't there be a "return NULL;" here so it doesn't fall through and
>> allocate a struct hmm below?
> 
> No, this function should only return NULL on memory allocation
> failure.
> 
> In this case another thread is busy freeing the hmm but wasn't able to
> update mm->hmm to null due to a locking constraint. So we make it null
> on behalf of the other thread and allocate a fresh new hmm that is
> valid. The freeing thread will complete the free and do nothing with
> mm->hmm.
> 
>>>    static void hmm_fee_rcu(struct rcu_head *rcu)
>>
>> I see Jerome already saw and named this hmm_free_rcu()
>> which I agree with.
> 
> I do love my typos :)
> 
>>>    {
>>> +	struct hmm *hmm = container_of(rcu, struct hmm, rcu);
>>> +
>>> +	down_write(&hmm->mm->mmap_sem);
>>> +	if (hmm->mm->hmm == hmm)
>>> +		hmm->mm->hmm = NULL;
>>> +	up_write(&hmm->mm->mmap_sem);
>>> +	mmdrop(hmm->mm);
>>> +
>>>    	kfree(container_of(rcu, struct hmm, rcu));
>>>    }
>>>    static void hmm_free(struct kref *kref)
>>>    {
>>>    	struct hmm *hmm = container_of(kref, struct hmm, kref);
>>> -	struct mm_struct *mm = hmm->mm;
>>> -
>>> -	mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm);
>>> -	spin_lock(&mm->page_table_lock);
>>> -	if (mm->hmm == hmm)
>>> -		mm->hmm = NULL;
>>> -	spin_unlock(&mm->page_table_lock);
>>> -
>>> -	mmdrop(hmm->mm);
>>> +	mmu_notifier_unregister_no_release(&hmm->mmu_notifier, hmm->mm);
>>>    	mmu_notifier_call_srcu(&hmm->rcu, hmm_fee_rcu);
>>>    }
>>>
>>
>> This email message is for the sole use of the intended recipient(s) and may contain
>> confidential information.  Any unauthorized review, use, disclosure or distribution
>> is prohibited.  If you are not the intended recipient, please contact the sender by
>> reply email and destroy all copies of the original message.
> 
> Ah, you should not send this trailer to the public mailing lists.
> 
> Thanks,
> Jason
> 

Sorry, I have to apply the "magic" header to suppress this each time I
send email to a public list.

Patch

diff --git a/mm/hmm.c b/mm/hmm.c
index e27058e92508b9..ec54be54d81135 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -40,16 +40,6 @@ 
 #if IS_ENABLED(CONFIG_HMM_MIRROR)
 static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
 
-static inline struct hmm *mm_get_hmm(struct mm_struct *mm)
-{
-	struct hmm *hmm = READ_ONCE(mm->hmm);
-
-	if (hmm && kref_get_unless_zero(&hmm->kref))
-		return hmm;
-
-	return NULL;
-}
-
 /**
  * hmm_get_or_create - register HMM against an mm (HMM internal)
  *
@@ -64,11 +54,20 @@  static inline struct hmm *mm_get_hmm(struct mm_struct *mm)
  */
 static struct hmm *hmm_get_or_create(struct mm_struct *mm)
 {
-	struct hmm *hmm = mm_get_hmm(mm);
-	bool cleanup = false;
+	struct hmm *hmm;
 
-	if (hmm)
-		return hmm;
+	lockdep_assert_held_exclusive(mm->mmap_sem);
+
+	if (mm->hmm) {
+		if (kref_get_unless_zero(&mm->hmm->kref))
+			return mm->hmm;
+		/*
+		 * The hmm is being freed by some other CPU and is pending a
+		 * RCU grace period, but this CPU can NULL now it since we
+		 * have the mmap_sem.
+		 */
+		mm->hmm = NULL;
+	}
 
 	hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);
 	if (!hmm)
@@ -85,54 +84,34 @@  static struct hmm *hmm_get_or_create(struct mm_struct *mm)
 	hmm->mm = mm;
 	mmgrab(hmm->mm);
 
-	spin_lock(&mm->page_table_lock);
-	if (!mm->hmm)
-		mm->hmm = hmm;
-	else
-		cleanup = true;
-	spin_unlock(&mm->page_table_lock);
-
-	if (cleanup)
-		goto error;
-
-	/*
-	 * We should only get here if hold the mmap_sem in write mode ie on
-	 * registration of first mirror through hmm_mirror_register()
-	 */
 	hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
-	if (__mmu_notifier_register(&hmm->mmu_notifier, mm))
-		goto error_mm;
+	if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) {
+		kfree(hmm);
+		return NULL;
+	}
 
+	mm->hmm = hmm;
 	return hmm;
-
-error_mm:
-	spin_lock(&mm->page_table_lock);
-	if (mm->hmm == hmm)
-		mm->hmm = NULL;
-	spin_unlock(&mm->page_table_lock);
-error:
-	kfree(hmm);
-	return NULL;
 }
 
 static void hmm_fee_rcu(struct rcu_head *rcu)
 {
+	struct hmm *hmm = container_of(rcu, struct hmm, rcu);
+
+	down_write(&hmm->mm->mmap_sem);
+	if (hmm->mm->hmm == hmm)
+		hmm->mm->hmm = NULL;
+	up_write(&hmm->mm->mmap_sem);
+	mmdrop(hmm->mm);
+
 	kfree(container_of(rcu, struct hmm, rcu));
 }
 
 static void hmm_free(struct kref *kref)
 {
 	struct hmm *hmm = container_of(kref, struct hmm, kref);
-	struct mm_struct *mm = hmm->mm;
-
-	mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm);
 
-	spin_lock(&mm->page_table_lock);
-	if (mm->hmm == hmm)
-		mm->hmm = NULL;
-	spin_unlock(&mm->page_table_lock);
-
-	mmdrop(hmm->mm);
+	mmu_notifier_unregister_no_release(&hmm->mmu_notifier, hmm->mm);
 	mmu_notifier_call_srcu(&hmm->rcu, hmm_fee_rcu);
 }
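
For reference, kref_get_unless_zero() -- which the new code relies on to
detect an hmm whose refcount has already hit zero -- only takes a
reference while the count is still nonzero. A minimal userspace analogue
of the pattern (a sketch using C11 atomics, not the kernel's actual
implementation in include/linux/kref.h):

	#include <stdatomic.h>
	#include <stdbool.h>

	struct obj {
		atomic_int refcount;
	};

	/* Take a reference only if the object is not already being freed. */
	static bool obj_get_unless_zero(struct obj *o)
	{
		int old = atomic_load(&o->refcount);

		while (old != 0) {
			/*
			 * cmpxchg: succeeds only if refcount is still 'old';
			 * on failure 'old' is reloaded and we retry.
			 */
			if (atomic_compare_exchange_weak(&o->refcount,
							 &old, old + 1))
				return true;
		}
		return false;	/* already zero: freeing is in progress */
	}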