[3/6] mm: hugetlb: fix a race between freeing and dissolving the page

Message ID 20210104065843.5658-3-songmuchun@bytedance.com (mailing list archive)
State New, archived
Series [1/6] mm: migrate: do not migrate HugeTLB page whose refcount is one

Commit Message

Muchun Song Jan. 4, 2021, 6:58 a.m. UTC
There is a race condition between __free_huge_page()
and dissolve_free_huge_page().

CPU0:                         CPU1:

// page_count(page) == 1
put_page(page)
  __free_huge_page(page)
                              dissolve_free_huge_page(page)
                                spin_lock(&hugetlb_lock)
                                // PageHuge(page) && !page_count(page)
                                update_and_free_page(page)
                                // page is freed to the buddy
                                spin_unlock(&hugetlb_lock)
    spin_lock(&hugetlb_lock)
    clear_page_huge_active(page)
    enqueue_huge_page(page)
    // It is wrong, the page is already freed
    spin_unlock(&hugetlb_lock)

The race window is between put_page() and the spin_lock() taken in
__free_huge_page().

We should make sure that the page is already on the free list
when it is dissolved.

Fixes: c8721bbbdd36 ("mm: memory-hotplug: enable memory hotplug to handle hugepage")
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
 mm/hugetlb.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

Comments

Mike Kravetz Jan. 5, 2021, midnight UTC | #1
On 1/3/21 10:58 PM, Muchun Song wrote:
> There is a race condition between __free_huge_page()
> and dissolve_free_huge_page().
> 
> CPU0:                         CPU1:
> 
> // page_count(page) == 1
> put_page(page)
>   __free_huge_page(page)
>                               dissolve_free_huge_page(page)
>                                 spin_lock(&hugetlb_lock)
>                                 // PageHuge(page) && !page_count(page)
>                                 update_and_free_page(page)
>                                 // page is freed to the buddy
>                                 spin_unlock(&hugetlb_lock)
>     spin_lock(&hugetlb_lock)
>     clear_page_huge_active(page)
>     enqueue_huge_page(page)
>     // It is wrong, the page is already freed
>     spin_unlock(&hugetlb_lock)
> 
> The race windows is between put_page() and spin_lock() which
> is in the __free_huge_page().
> 
> We should make sure that the page is already on the free list
> when it is dissolved.
> 
> Fixes: c8721bbbdd36 ("mm: memory-hotplug: enable memory hotplug to handle hugepage")
> Signed-off-by: Muchun Song <songmuchun@bytedance.com>
> ---
>  mm/hugetlb.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 48 insertions(+)
> 
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 1f3bf1710b66..72608008f8b4 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -79,6 +79,21 @@ DEFINE_SPINLOCK(hugetlb_lock);
>  static int num_fault_mutexes;
>  struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
>  
> +static inline bool PageHugeFreed(struct page *head)
> +{
> +	return page_private(head) == -1UL;

	return page_private(head + 4) == -1UL;

> +}
> +
> +static inline void SetPageHugeFreed(struct page *head)
> +{
> +	set_page_private(head + 4, -1UL);
> +}
> +
> +static inline void ClearPageHugeFreed(struct page *head)
> +{
> +	set_page_private(head + 4, 0);
> +}

It is unfortunate that we cannot use some existing value like
page_huge_active() to determine whether dissolve_free_huge_page() should
proceed with freeing the page to the buddy allocator.  If the existing check,

	if (!page_count(page)) {

was changed to

	if (!page_count(page) && !page_huge_active(page)) {

the race window would shrink.  However, the most straightforward
way to fully close the window is with the approach taken here.
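
For illustration, a condensed sketch of where that narrower check would sit
(illustrative only, not the posted patch; the surrounding
dissolve_free_huge_page() logic is abbreviated):

	/*
	 * Illustrative sketch only: the narrower check discussed above.
	 * It shrinks the race window but does not fully close it (see
	 * the follow-up discussion).
	 */
	spin_lock(&hugetlb_lock);
	if (PageHuge(page) && !page_count(page) && !page_huge_active(page)) {
		/*
		 * ... remove the page from the free list and release it
		 * to the buddy allocator ...
		 */
	}
	spin_unlock(&hugetlb_lock);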

> +
>  /* Forward declaration */
>  static int hugetlb_acct_memory(struct hstate *h, long delta);
>  
> @@ -1028,6 +1043,7 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
>  	list_move(&page->lru, &h->hugepage_freelists[nid]);
>  	h->free_huge_pages++;
>  	h->free_huge_pages_node[nid]++;
> +	SetPageHugeFreed(page);
>  }
>  
>  static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
> @@ -1044,6 +1060,7 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
>  
>  		list_move(&page->lru, &h->hugepage_activelist);
>  		set_page_refcounted(page);
> +		ClearPageHugeFreed(page);
>  		h->free_huge_pages--;
>  		h->free_huge_pages_node[nid]--;
>  		return page;
> @@ -1504,6 +1521,7 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
>  	spin_lock(&hugetlb_lock);
>  	h->nr_huge_pages++;
>  	h->nr_huge_pages_node[nid]++;
> +	ClearPageHugeFreed(page);
>  	spin_unlock(&hugetlb_lock);
>  }
>  
> @@ -1770,6 +1788,36 @@ int dissolve_free_huge_page(struct page *page)
>  		int nid = page_to_nid(head);
>  		if (h->free_huge_pages - h->resv_huge_pages == 0)
>  			goto out;
> +
> +		/*
> +		 * There is a race condition between __free_huge_page()
> +		 * and dissolve_free_huge_page().
> +		 *
> +		 * CPU0:                         CPU1:
> +		 *
> +		 * // page_count(page) == 1
> +		 * put_page(page)
> +		 *   __free_huge_page(page)
> +		 *                               dissolve_free_huge_page(page)
> +		 *                                 spin_lock(&hugetlb_lock)
> +		 *                                 // PageHuge(page) && !page_count(page)
> +		 *                                 update_and_free_page(page)
> +		 *                                 // page is freed to the buddy
> +		 *                                 spin_unlock(&hugetlb_lock)
> +		 *     spin_lock(&hugetlb_lock)
> +		 *     enqueue_huge_page(page)
> +		 *     // It is wrong, the page is already freed
> +		 *     spin_unlock(&hugetlb_lock)
> +		 *
> +		 * The race window is between put_page() and spin_lock() which
> +		 * is in the __free_huge_page().

IMO, the description of the race condition in the commit message is
sufficient.  It does not need to be here in the code.  The below comment
should be sufficient.
Muchun Song Jan. 5, 2021, 2:55 a.m. UTC | #2
On Tue, Jan 5, 2021 at 8:02 AM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>
> On 1/3/21 10:58 PM, Muchun Song wrote:
> > There is a race condition between __free_huge_page()
> > and dissolve_free_huge_page().
> >
> > CPU0:                         CPU1:
> >
> > // page_count(page) == 1
> > put_page(page)
> >   __free_huge_page(page)
> >                               dissolve_free_huge_page(page)
> >                                 spin_lock(&hugetlb_lock)
> >                                 // PageHuge(page) && !page_count(page)
> >                                 update_and_free_page(page)
> >                                 // page is freed to the buddy
> >                                 spin_unlock(&hugetlb_lock)
> >     spin_lock(&hugetlb_lock)
> >     clear_page_huge_active(page)
> >     enqueue_huge_page(page)
> >     // It is wrong, the page is already freed
> >     spin_unlock(&hugetlb_lock)
> >
> > The race windows is between put_page() and spin_lock() which
> > is in the __free_huge_page().
> >
> > We should make sure that the page is already on the free list
> > when it is dissolved.
> >
> > Fixes: c8721bbbdd36 ("mm: memory-hotplug: enable memory hotplug to handle hugepage")
> > Signed-off-by: Muchun Song <songmuchun@bytedance.com>
> > ---
> >  mm/hugetlb.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 48 insertions(+)
> >
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index 1f3bf1710b66..72608008f8b4 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -79,6 +79,21 @@ DEFINE_SPINLOCK(hugetlb_lock);
> >  static int num_fault_mutexes;
> >  struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
> >
> > +static inline bool PageHugeFreed(struct page *head)
> > +{
> > +     return page_private(head) == -1UL;
>
>         return page_private(head + 4) == -1UL;
>
> > +}
> > +
> > +static inline void SetPageHugeFreed(struct page *head)
> > +{
> > +     set_page_private(head + 4, -1UL);
> > +}
> > +
> > +static inline void ClearPageHugeFreed(struct page *head)
> > +{
> > +     set_page_private(head + 4, 0);
> > +}
>
> It is unfortunate that we can not use some existing value like
> page_huge_active() to determine if dissolve_free_huge_page() should
> proceed with freeing the page to buddy.  If the existing check,
>
>         if (!page_count(page)) {
>
> was changed to
>
>         if (!page_count(page) && !page_huge_active(page)) {
>
> the race window would be shrunk.  However, the most straight forward
> way to fully close the window is with the approach taken here.

I also considered this fix, but it is not enough.  In some places
(e.g. error handling routines) we just call put_page() to free the
HugeTLB page without ever setting it active.

If we relied on page_huge_active(), activeness would have to be set
before every put_page(), and we cannot guarantee that.
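
As an illustration of such a path, a hypothetical and simplified sketch
(example_error_path() and example_prepare() are made-up names; only the
shape of the error handling is the point):

	/*
	 * Hypothetical illustration (not a real kernel path): the page is
	 * dropped via put_page() on an error path before it is ever marked
	 * active, so a page_huge_active() test in dissolve_free_huge_page()
	 * could not tell it apart from a page still racing through
	 * __free_huge_page().
	 */
	static int example_error_path(struct vm_area_struct *vma,
				      unsigned long addr)
	{
		struct page *page = alloc_huge_page(vma, addr, 0);

		if (IS_ERR(page))
			return PTR_ERR(page);

		if (example_prepare(page)) {	/* hypothetical helper */
			put_page(page);		/* never marked active */
			return -ENOMEM;
		}

		/* the normal path would map the page and mark it active */
		return 0;
	}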

Thanks.

>
> > +
> >  /* Forward declaration */
> >  static int hugetlb_acct_memory(struct hstate *h, long delta);
> >
> > @@ -1028,6 +1043,7 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
> >       list_move(&page->lru, &h->hugepage_freelists[nid]);
> >       h->free_huge_pages++;
> >       h->free_huge_pages_node[nid]++;
> > +     SetPageHugeFreed(page);
> >  }
> >
> >  static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
> > @@ -1044,6 +1060,7 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
> >
> >               list_move(&page->lru, &h->hugepage_activelist);
> >               set_page_refcounted(page);
> > +             ClearPageHugeFreed(page);
> >               h->free_huge_pages--;
> >               h->free_huge_pages_node[nid]--;
> >               return page;
> > @@ -1504,6 +1521,7 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
> >       spin_lock(&hugetlb_lock);
> >       h->nr_huge_pages++;
> >       h->nr_huge_pages_node[nid]++;
> > +     ClearPageHugeFreed(page);
> >       spin_unlock(&hugetlb_lock);
> >  }
> >
> > @@ -1770,6 +1788,36 @@ int dissolve_free_huge_page(struct page *page)
> >               int nid = page_to_nid(head);
> >               if (h->free_huge_pages - h->resv_huge_pages == 0)
> >                       goto out;
> > +
> > +             /*
> > +              * There is a race condition between __free_huge_page()
> > +              * and dissolve_free_huge_page().
> > +              *
> > +              * CPU0:                         CPU1:
> > +              *
> > +              * // page_count(page) == 1
> > +              * put_page(page)
> > +              *   __free_huge_page(page)
> > +              *                               dissolve_free_huge_page(page)
> > +              *                                 spin_lock(&hugetlb_lock)
> > +              *                                 // PageHuge(page) && !page_count(page)
> > +              *                                 update_and_free_page(page)
> > +              *                                 // page is freed to the buddy
> > +              *                                 spin_unlock(&hugetlb_lock)
> > +              *     spin_lock(&hugetlb_lock)
> > +              *     enqueue_huge_page(page)
> > +              *     // It is wrong, the page is already freed
> > +              *     spin_unlock(&hugetlb_lock)
> > +              *
> > +              * The race window is between put_page() and spin_lock() which
> > +              * is in the __free_huge_page().
>
> IMO, the description of the race condition in the commit message is
> sufficient.  It does not need to be here in the code.  The below comment
> should be sufficient.

Thanks. Will do.

>
> --
> Mike Kravetz
>
> > +              *
> > +              * We should make sure that the page is already on the free list
> > +              * when it is dissolved.
> > +              */
> > +             if (unlikely(!PageHugeFreed(head)))
> > +                     goto out;
> > +
> >               /*
> >                * Move PageHWPoison flag from head page to the raw error page,
> >                * which makes any subpages rather than the error page reusable.
> >
Muchun Song Jan. 5, 2021, 6:12 a.m. UTC | #3
On Tue, Jan 5, 2021 at 8:02 AM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>
> On 1/3/21 10:58 PM, Muchun Song wrote:
> > There is a race condition between __free_huge_page()
> > and dissolve_free_huge_page().
> >
> > CPU0:                         CPU1:
> >
> > // page_count(page) == 1
> > put_page(page)
> >   __free_huge_page(page)
> >                               dissolve_free_huge_page(page)
> >                                 spin_lock(&hugetlb_lock)
> >                                 // PageHuge(page) && !page_count(page)
> >                                 update_and_free_page(page)
> >                                 // page is freed to the buddy
> >                                 spin_unlock(&hugetlb_lock)
> >     spin_lock(&hugetlb_lock)
> >     clear_page_huge_active(page)
> >     enqueue_huge_page(page)
> >     // It is wrong, the page is already freed
> >     spin_unlock(&hugetlb_lock)
> >
> > The race windows is between put_page() and spin_lock() which
> > is in the __free_huge_page().
> >
> > We should make sure that the page is already on the free list
> > when it is dissolved.
> >
> > Fixes: c8721bbbdd36 ("mm: memory-hotplug: enable memory hotplug to handle hugepage")
> > Signed-off-by: Muchun Song <songmuchun@bytedance.com>
> > ---
> >  mm/hugetlb.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 48 insertions(+)
> >
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index 1f3bf1710b66..72608008f8b4 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -79,6 +79,21 @@ DEFINE_SPINLOCK(hugetlb_lock);
> >  static int num_fault_mutexes;
> >  struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
> >
> > +static inline bool PageHugeFreed(struct page *head)
> > +{
> > +     return page_private(head) == -1UL;
>
>         return page_private(head + 4) == -1UL;

Thanks very much.  It was a mistake I made when rebasing.  Will
update it in the next version.

>
> > +}
> > +
> > +static inline void SetPageHugeFreed(struct page *head)
> > +{
> > +     set_page_private(head + 4, -1UL);
> > +}
> > +
> > +static inline void ClearPageHugeFreed(struct page *head)
> > +{
> > +     set_page_private(head + 4, 0);
> > +}
>
> It is unfortunate that we can not use some existing value like
> page_huge_active() to determine if dissolve_free_huge_page() should
> proceed with freeing the page to buddy.  If the existing check,
>
>         if (!page_count(page)) {
>
> was changed to
>
>         if (!page_count(page) && !page_huge_active(page)) {
>
> the race window would be shrunk.  However, the most straight forward
> way to fully close the window is with the approach taken here.
>
> > +
> >  /* Forward declaration */
> >  static int hugetlb_acct_memory(struct hstate *h, long delta);
> >
> > @@ -1028,6 +1043,7 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
> >       list_move(&page->lru, &h->hugepage_freelists[nid]);
> >       h->free_huge_pages++;
> >       h->free_huge_pages_node[nid]++;
> > +     SetPageHugeFreed(page);
> >  }
> >
> >  static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
> > @@ -1044,6 +1060,7 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
> >
> >               list_move(&page->lru, &h->hugepage_activelist);
> >               set_page_refcounted(page);
> > +             ClearPageHugeFreed(page);
> >               h->free_huge_pages--;
> >               h->free_huge_pages_node[nid]--;
> >               return page;
> > @@ -1504,6 +1521,7 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
> >       spin_lock(&hugetlb_lock);
> >       h->nr_huge_pages++;
> >       h->nr_huge_pages_node[nid]++;
> > +     ClearPageHugeFreed(page);
> >       spin_unlock(&hugetlb_lock);
> >  }
> >
> > @@ -1770,6 +1788,36 @@ int dissolve_free_huge_page(struct page *page)
> >               int nid = page_to_nid(head);
> >               if (h->free_huge_pages - h->resv_huge_pages == 0)
> >                       goto out;
> > +
> > +             /*
> > +              * There is a race condition between __free_huge_page()
> > +              * and dissolve_free_huge_page().
> > +              *
> > +              * CPU0:                         CPU1:
> > +              *
> > +              * // page_count(page) == 1
> > +              * put_page(page)
> > +              *   __free_huge_page(page)
> > +              *                               dissolve_free_huge_page(page)
> > +              *                                 spin_lock(&hugetlb_lock)
> > +              *                                 // PageHuge(page) && !page_count(page)
> > +              *                                 update_and_free_page(page)
> > +              *                                 // page is freed to the buddy
> > +              *                                 spin_unlock(&hugetlb_lock)
> > +              *     spin_lock(&hugetlb_lock)
> > +              *     enqueue_huge_page(page)
> > +              *     // It is wrong, the page is already freed
> > +              *     spin_unlock(&hugetlb_lock)
> > +              *
> > +              * The race window is between put_page() and spin_lock() which
> > +              * is in the __free_huge_page().
>
> IMO, the description of the race condition in the commit message is
> sufficient.  It does not need to be here in the code.  The below comment
> should be sufficient.
>
> --
> Mike Kravetz
>
> > +              *
> > +              * We should make sure that the page is already on the free list
> > +              * when it is dissolved.
> > +              */
> > +             if (unlikely(!PageHugeFreed(head)))
> > +                     goto out;
> > +
> >               /*
> >                * Move PageHWPoison flag from head page to the raw error page,
> >                * which makes any subpages rather than the error page reusable.
> >
Mike Kravetz Jan. 5, 2021, 11:22 p.m. UTC | #4
On 1/4/21 6:55 PM, Muchun Song wrote:
> On Tue, Jan 5, 2021 at 8:02 AM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>>
>> On 1/3/21 10:58 PM, Muchun Song wrote:
>>> There is a race condition between __free_huge_page()
>>> and dissolve_free_huge_page().
>>>
>>> CPU0:                         CPU1:
>>>
>>> // page_count(page) == 1
>>> put_page(page)
>>>   __free_huge_page(page)
>>>                               dissolve_free_huge_page(page)
>>>                                 spin_lock(&hugetlb_lock)
>>>                                 // PageHuge(page) && !page_count(page)
>>>                                 update_and_free_page(page)
>>>                                 // page is freed to the buddy
>>>                                 spin_unlock(&hugetlb_lock)
>>>     spin_lock(&hugetlb_lock)
>>>     clear_page_huge_active(page)
>>>     enqueue_huge_page(page)
>>>     // It is wrong, the page is already freed
>>>     spin_unlock(&hugetlb_lock)
>>>
>>> The race windows is between put_page() and spin_lock() which
>>> is in the __free_huge_page().
>>>
>>> We should make sure that the page is already on the free list
>>> when it is dissolved.
>>>
>>> Fixes: c8721bbbdd36 ("mm: memory-hotplug: enable memory hotplug to handle hugepage")
>>> Signed-off-by: Muchun Song <songmuchun@bytedance.com>
>>> ---
>>>  mm/hugetlb.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
>>>  1 file changed, 48 insertions(+)
>>>
>>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>>> index 1f3bf1710b66..72608008f8b4 100644
>>> --- a/mm/hugetlb.c
>>> +++ b/mm/hugetlb.c
>>> @@ -79,6 +79,21 @@ DEFINE_SPINLOCK(hugetlb_lock);
>>>  static int num_fault_mutexes;
>>>  struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
>>>
>>> +static inline bool PageHugeFreed(struct page *head)
>>> +{
>>> +     return page_private(head) == -1UL;
>>
>>         return page_private(head + 4) == -1UL;
>>
>>> +}
>>> +
>>> +static inline void SetPageHugeFreed(struct page *head)
>>> +{
>>> +     set_page_private(head + 4, -1UL);
>>> +}
>>> +
>>> +static inline void ClearPageHugeFreed(struct page *head)
>>> +{
>>> +     set_page_private(head + 4, 0);
>>> +}
>>
>> It is unfortunate that we can not use some existing value like
>> page_huge_active() to determine if dissolve_free_huge_page() should
>> proceed with freeing the page to buddy.  If the existing check,
>>
>>         if (!page_count(page)) {
>>
>> was changed to
>>
>>         if (!page_count(page) && !page_huge_active(page)) {
>>
>> the race window would be shrunk.  However, the most straight forward
>> way to fully close the window is with the approach taken here.
> 
> I also thought about this fix. But this is not enough. Because
> we just call put_page to free the HugeTLB page without
> setting activeness in some place (e.g. error handling
> routines).
> 
> If we use page_huge_active, we should set activeness
> before put_page. But we cannot guarantee this.

Just FYI,
I went back and explored the option of doing set_page_huge_active
when a page was put on the active list and clear_page_huge_active
when put on the free list.  This would be much like what you are
doing with PageHugeFreed.  Commit bcc54222309c, which added page_huge_active,
implied that this was possible.  Then I remembered a race fixed in
cb6acd01e2e4 that required delaying the call to set_page_huge_active
in hugetlb_no_page.  So, such a scheme would not work.

Also, it seems we could use head[3].mapping for PageHugeFreed?  Not much
of an advantage, but it does not add another tail page to those needed to
store page metadata, and it fits within the already defined
HUGETLB_CGROUP_MIN_ORDER.
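
A minimal sketch of what that alternative might look like (illustrative
only; the HPAGE_FREED_MAPPING sentinel is an assumption, not from the
posted patch):

	/*
	 * Illustrative sketch of the head[3].mapping alternative.  head[3]
	 * already lies within HUGETLB_CGROUP_MIN_ORDER, so no extra tail
	 * page is needed for this metadata.
	 */
	#define HPAGE_FREED_MAPPING	((struct address_space *)-1UL)

	static inline bool PageHugeFreed(struct page *head)
	{
		return head[3].mapping == HPAGE_FREED_MAPPING;
	}

	static inline void SetPageHugeFreed(struct page *head)
	{
		head[3].mapping = HPAGE_FREED_MAPPING;
	}

	static inline void ClearPageHugeFreed(struct page *head)
	{
		head[3].mapping = NULL;
	}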
Muchun Song Jan. 6, 2021, 6:05 a.m. UTC | #5
On Wed, Jan 6, 2021 at 7:22 AM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>
> On 1/4/21 6:55 PM, Muchun Song wrote:
> > On Tue, Jan 5, 2021 at 8:02 AM Mike Kravetz <mike.kravetz@oracle.com> wrote:
> >>
> >> On 1/3/21 10:58 PM, Muchun Song wrote:
> >>> There is a race condition between __free_huge_page()
> >>> and dissolve_free_huge_page().
> >>>
> >>> CPU0:                         CPU1:
> >>>
> >>> // page_count(page) == 1
> >>> put_page(page)
> >>>   __free_huge_page(page)
> >>>                               dissolve_free_huge_page(page)
> >>>                                 spin_lock(&hugetlb_lock)
> >>>                                 // PageHuge(page) && !page_count(page)
> >>>                                 update_and_free_page(page)
> >>>                                 // page is freed to the buddy
> >>>                                 spin_unlock(&hugetlb_lock)
> >>>     spin_lock(&hugetlb_lock)
> >>>     clear_page_huge_active(page)
> >>>     enqueue_huge_page(page)
> >>>     // It is wrong, the page is already freed
> >>>     spin_unlock(&hugetlb_lock)
> >>>
> >>> The race windows is between put_page() and spin_lock() which
> >>> is in the __free_huge_page().
> >>>
> >>> We should make sure that the page is already on the free list
> >>> when it is dissolved.
> >>>
> >>> Fixes: c8721bbbdd36 ("mm: memory-hotplug: enable memory hotplug to handle hugepage")
> >>> Signed-off-by: Muchun Song <songmuchun@bytedance.com>
> >>> ---
> >>>  mm/hugetlb.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
> >>>  1 file changed, 48 insertions(+)
> >>>
> >>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> >>> index 1f3bf1710b66..72608008f8b4 100644
> >>> --- a/mm/hugetlb.c
> >>> +++ b/mm/hugetlb.c
> >>> @@ -79,6 +79,21 @@ DEFINE_SPINLOCK(hugetlb_lock);
> >>>  static int num_fault_mutexes;
> >>>  struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
> >>>
> >>> +static inline bool PageHugeFreed(struct page *head)
> >>> +{
> >>> +     return page_private(head) == -1UL;
> >>
> >>         return page_private(head + 4) == -1UL;
> >>
> >>> +}
> >>> +
> >>> +static inline void SetPageHugeFreed(struct page *head)
> >>> +{
> >>> +     set_page_private(head + 4, -1UL);
> >>> +}
> >>> +
> >>> +static inline void ClearPageHugeFreed(struct page *head)
> >>> +{
> >>> +     set_page_private(head + 4, 0);
> >>> +}
> >>
> >> It is unfortunate that we can not use some existing value like
> >> page_huge_active() to determine if dissolve_free_huge_page() should
> >> proceed with freeing the page to buddy.  If the existing check,
> >>
> >>         if (!page_count(page)) {
> >>
> >> was changed to
> >>
> >>         if (!page_count(page) && !page_huge_active(page)) {
> >>
> >> the race window would be shrunk.  However, the most straight forward
> >> way to fully close the window is with the approach taken here.
> >
> > I also thought about this fix. But this is not enough. Because
> > we just call put_page to free the HugeTLB page without
> > setting activeness in some place (e.g. error handling
> > routines).
> >
> > If we use page_huge_active, we should set activeness
> > before put_page. But we cannot guarantee this.
>
> Just FYI,
> I went back and explored the option of doing set_page_huge_active
> when a page was put on the active list and clear_page_huge_active
> when put on the free list.  This would be much like what you are
> doing with PageHugeFreed.  Commit bcc54222309c which added page_huge_active
> implied that this was possible.  Then I remembered a race fixed in
> cb6acd01e2e4 that required delaying the call to set_page_huge_active
> in hugetlb_no_page.  So, such a scheme would not work.

Sounds like a tortuous story. :)

>
> Also,
> It seems we could use head[3].mapping for PageHugeFreed ?  Not much
> of an advantage.  It does not add another tail page needed to store
> page metadata.  And, this fits within the already defined
> HUGETLB_CGROUP_MIN_ORDER.

That is fine with me. Will do. Thanks.

> --
> Mike Kravetz

Patch

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 1f3bf1710b66..72608008f8b4 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -79,6 +79,21 @@  DEFINE_SPINLOCK(hugetlb_lock);
 static int num_fault_mutexes;
 struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
 
+static inline bool PageHugeFreed(struct page *head)
+{
+	return page_private(head) == -1UL;
+}
+
+static inline void SetPageHugeFreed(struct page *head)
+{
+	set_page_private(head + 4, -1UL);
+}
+
+static inline void ClearPageHugeFreed(struct page *head)
+{
+	set_page_private(head + 4, 0);
+}
+
 /* Forward declaration */
 static int hugetlb_acct_memory(struct hstate *h, long delta);
 
@@ -1028,6 +1043,7 @@  static void enqueue_huge_page(struct hstate *h, struct page *page)
 	list_move(&page->lru, &h->hugepage_freelists[nid]);
 	h->free_huge_pages++;
 	h->free_huge_pages_node[nid]++;
+	SetPageHugeFreed(page);
 }
 
 static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
@@ -1044,6 +1060,7 @@  static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
 
 		list_move(&page->lru, &h->hugepage_activelist);
 		set_page_refcounted(page);
+		ClearPageHugeFreed(page);
 		h->free_huge_pages--;
 		h->free_huge_pages_node[nid]--;
 		return page;
@@ -1504,6 +1521,7 @@  static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
 	spin_lock(&hugetlb_lock);
 	h->nr_huge_pages++;
 	h->nr_huge_pages_node[nid]++;
+	ClearPageHugeFreed(page);
 	spin_unlock(&hugetlb_lock);
 }
 
@@ -1770,6 +1788,36 @@  int dissolve_free_huge_page(struct page *page)
 		int nid = page_to_nid(head);
 		if (h->free_huge_pages - h->resv_huge_pages == 0)
 			goto out;
+
+		/*
+		 * There is a race condition between __free_huge_page()
+		 * and dissolve_free_huge_page().
+		 *
+		 * CPU0:                         CPU1:
+		 *
+		 * // page_count(page) == 1
+		 * put_page(page)
+		 *   __free_huge_page(page)
+		 *                               dissolve_free_huge_page(page)
+		 *                                 spin_lock(&hugetlb_lock)
+		 *                                 // PageHuge(page) && !page_count(page)
+		 *                                 update_and_free_page(page)
+		 *                                 // page is freed to the buddy
+		 *                                 spin_unlock(&hugetlb_lock)
+		 *     spin_lock(&hugetlb_lock)
+		 *     enqueue_huge_page(page)
+		 *     // It is wrong, the page is already freed
+		 *     spin_unlock(&hugetlb_lock)
+		 *
+		 * The race window is between put_page() and spin_lock() which
+		 * is in the __free_huge_page().
+		 *
+		 * We should make sure that the page is already on the free list
+		 * when it is dissolved.
+		 */
+		if (unlikely(!PageHugeFreed(head)))
+			goto out;
+
 		/*
 		 * Move PageHWPoison flag from head page to the raw error page,
 		 * which makes any subpages rather than the error page reusable.