
[RFC,2/8] hugetlb: recompute min_count when dropping hugetlb_lock

Message ID 20210319224209.150047-3-mike.kravetz@oracle.com (mailing list archive)
State New, archived
Series make hugetlb put_page safe for all calling contexts

Commit Message

Mike Kravetz March 19, 2021, 10:42 p.m. UTC
The routine set_max_huge_pages reduces the number of hugetlb_pages,
by calling free_pool_huge_page in a loop.  It does this as long as
persistent_huge_pages() is above a calculated min_count value.
However, this loop can conditionally drop hugetlb_lock and in some
circumstances free_pool_huge_page can drop hugetlb_lock.  If the
lock is dropped, counters could change and the calculated min_count
value may no longer be valid.

The routine try_to_free_low has the same issue.

Recalculate min_count in each loop iteration as hugetlb_lock may have
been dropped.

Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
---
 mm/hugetlb.c | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

Comments

Michal Hocko March 22, 2021, 2:07 p.m. UTC | #1
On Fri 19-03-21 15:42:03, Mike Kravetz wrote:
> The routine set_max_huge_pages reduces the number of hugetlb_pages,
> by calling free_pool_huge_page in a loop.  It does this as long as
> persistent_huge_pages() is above a calculated min_count value.
> However, this loop can conditionally drop hugetlb_lock and in some
> circumstances free_pool_huge_page can drop hugetlb_lock.  If the
> lock is dropped, counters could change and the calculated min_count
> value may no longer be valid.

OK, this one looks like a real bug fix introduced by 55f67141a8927.
Unless I am missing something we could release pages which are reserved
already.
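A hypothetical illustration with made-up numbers: say nr_huge_pages = 100,
free_huge_pages = 50, resv_huge_pages = 10 and the admin writes count = 40.
min_count is computed once as max(40, 10 + 100 - 50) = 60.  If the lock is
then dropped and other contexts reserve 20 more pages (resv_huge_pages = 30),
the floor should now be 80, yet the loop keeps shrinking the pool toward the
stale 60 and can free pages that are needed to back the new reservations.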
 
> The routine try_to_free_low has the same issue.
> 
> Recalculate min_count in each loop iteration as hugetlb_lock may have
> been dropped.
> 
> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
> ---
>  mm/hugetlb.c | 25 +++++++++++++++++++++----
>  1 file changed, 21 insertions(+), 4 deletions(-)
> 
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index d5be25f910e8..c537274c2a38 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -2521,11 +2521,20 @@ static void __init report_hugepages(void)
>  	}
>  }
>  
> +static inline unsigned long min_hp_count(struct hstate *h, unsigned long count)
> +{
> +	unsigned long min_count;
> +
> +	min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
> +	return max(count, min_count);

Just out of curiousity, is compiler allowed to inline this piece of code
and then cache the value? In other words do we need to make these
READ_ONCE or otherwise enforce the no-caching behavior?

> +}
> +
>  #ifdef CONFIG_HIGHMEM
>  static void try_to_free_low(struct hstate *h, unsigned long count,
>  						nodemask_t *nodes_allowed)
>  {
>  	int i;
> +	unsigned long min_count = min_hp_count(h, count);
>  
>  	if (hstate_is_gigantic(h))
>  		return;
> @@ -2534,7 +2543,7 @@ static void try_to_free_low(struct hstate *h, unsigned long count,
>  		struct page *page, *next;
>  		struct list_head *freel = &h->hugepage_freelists[i];
>  		list_for_each_entry_safe(page, next, freel, lru) {
> -			if (count >= h->nr_huge_pages)
> +			if (min_count >= h->nr_huge_pages)
>  				return;
>  			if (PageHighMem(page))
>  				continue;
> @@ -2542,6 +2551,12 @@ static void try_to_free_low(struct hstate *h, unsigned long count,
>  			update_and_free_page(h, page);
>  			h->free_huge_pages--;
>  			h->free_huge_pages_node[page_to_nid(page)]--;
> +
> +			/*
> +			 * update_and_free_page could have dropped lock so
> +			 * recompute min_count.
> +			 */
> +			min_count = min_hp_count(h, count);
>  		}
>  	}
>  }
> @@ -2695,13 +2710,15 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
>  	 * and won't grow the pool anywhere else. Not until one of the
>  	 * sysctls are changed, or the surplus pages go out of use.
>  	 */
> -	min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
> -	min_count = max(count, min_count);
> -	try_to_free_low(h, min_count, nodes_allowed);
> +	min_count = min_hp_count(h, count);
> +	try_to_free_low(h, count, nodes_allowed);
>  	while (min_count < persistent_huge_pages(h)) {
>  		if (!free_pool_huge_page(h, nodes_allowed, 0))
>  			break;
>  		cond_resched_lock(&hugetlb_lock);
> +
> +		/* Recompute min_count in case hugetlb_lock was dropped */
> +		min_count = min_hp_count(h, count);
>  	}
>  	while (count < persistent_huge_pages(h)) {
>  		if (!adjust_pool_surplus(h, nodes_allowed, 1))
> -- 
> 2.30.2
>
Mike Kravetz March 22, 2021, 11:07 p.m. UTC | #2
On 3/22/21 7:07 AM, Michal Hocko wrote:
> On Fri 19-03-21 15:42:03, Mike Kravetz wrote:
>> The routine set_max_huge_pages reduces the number of hugetlb_pages,
>> by calling free_pool_huge_page in a loop.  It does this as long as
>> persistent_huge_pages() is above a calculated min_count value.
>> However, this loop can conditionally drop hugetlb_lock and in some
>> circumstances free_pool_huge_page can drop hugetlb_lock.  If the
>> lock is dropped, counters could change and the calculated min_count
>> value may no longer be valid.
> 
> OK, this one looks like a real bug fix introduced by 55f67141a8927.
> Unless I am missing something we could release pages which are reserved
> already.
>  
>> The routine try_to_free_low has the same issue.
>>
>> Recalculate min_count in each loop iteration as hugetlb_lock may have
>> been dropped.
>>
>> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
>> ---
>>  mm/hugetlb.c | 25 +++++++++++++++++++++----
>>  1 file changed, 21 insertions(+), 4 deletions(-)
>>
>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>> index d5be25f910e8..c537274c2a38 100644
>> --- a/mm/hugetlb.c
>> +++ b/mm/hugetlb.c
>> @@ -2521,11 +2521,20 @@ static void __init report_hugepages(void)
>>  	}
>>  }
>>  
>> +static inline unsigned long min_hp_count(struct hstate *h, unsigned long count)
>> +{
>> +	unsigned long min_count;
>> +
>> +	min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
>> +	return max(count, min_count);
> 
> Just out of curiousity, is compiler allowed to inline this piece of code
> and then cache the value? In other words do we need to make these
> READ_ONCE or otherwise enforce the no-caching behavior?

I honestly do not know if the compiler is allowed to do that.  The
assembly code generated by my compiler does not cache the value, but
that does not guarantee anything.  I can add READ_ONCE to make the
function look something like:

static inline unsigned long min_hp_count(struct hstate *h, unsigned long count)
{
	unsigned long min_count;

	min_count = READ_ONCE(h->resv_huge_pages) + READ_ONCE(h->nr_huge_pages)
					- READ_ONCE(h->free_huge_pages);
	return max(count, min_count);
}
Michal Hocko March 23, 2021, 7:50 a.m. UTC | #3
On Mon 22-03-21 16:07:29, Mike Kravetz wrote:
> On 3/22/21 7:07 AM, Michal Hocko wrote:
> > On Fri 19-03-21 15:42:03, Mike Kravetz wrote:
> >> The routine set_max_huge_pages reduces the number of hugetlb_pages,
> >> by calling free_pool_huge_page in a loop.  It does this as long as
> >> persistent_huge_pages() is above a calculated min_count value.
> >> However, this loop can conditionally drop hugetlb_lock and in some
> >> circumstances free_pool_huge_page can drop hugetlb_lock.  If the
> >> lock is dropped, counters could change and the calculated min_count
> >> value may no longer be valid.
> > 
> > OK, this one looks like a real bug fix introduced by 55f67141a8927.
> > Unless I am missing something we could release pages which are reserved
> > already.
> >  
> >> The routine try_to_free_low has the same issue.
> >>
> >> Recalculate min_count in each loop iteration as hugetlb_lock may have
> >> been dropped.
> >>
> >> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
> >> ---
> >>  mm/hugetlb.c | 25 +++++++++++++++++++++----
> >>  1 file changed, 21 insertions(+), 4 deletions(-)
> >>
> >> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> >> index d5be25f910e8..c537274c2a38 100644
> >> --- a/mm/hugetlb.c
> >> +++ b/mm/hugetlb.c
> >> @@ -2521,11 +2521,20 @@ static void __init report_hugepages(void)
> >>  	}
> >>  }
> >>  
> >> +static inline unsigned long min_hp_count(struct hstate *h, unsigned long count)
> >> +{
> >> +	unsigned long min_count;
> >> +
> >> +	min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
> >> +	return max(count, min_count);
> > 
> > Just out of curiousity, is compiler allowed to inline this piece of code
> > and then cache the value? In other words do we need to make these
> > READ_ONCE or otherwise enforce the no-caching behavior?
> 
> I honestly do not know if the compiler is allowed to do that.  The
> assembly code generated by my compiler does not cache the value, but
> that does not guarantee anything.  I can add READ_ONCE to make the
> function look something like:
> 
> static inline unsigned long min_hp_count(struct hstate *h, unsigned long count)
> {
> 	unsigned long min_count;
> 
> 	min_count = READ_ONCE(h->resv_huge_pages) + READ_ONCE(h->nr_huge_pages)
> 					- READ_ONCE(h->free_huge_pages);
> 	return max(count, min_count);
> }

Maybe just forcing to never inline the function should be sufficient.
This is not a hot path to micro optimize for no function call. But there
are much more qualified people on the CC list on this matter who could
clarify. Peter?
Peter Zijlstra March 23, 2021, 8:01 a.m. UTC | #4
On Tue, Mar 23, 2021 at 08:50:53AM +0100, Michal Hocko wrote:

> > >> +static inline unsigned long min_hp_count(struct hstate *h, unsigned long count)
> > >> +{
> > >> +	unsigned long min_count;
> > >> +
> > >> +	min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
> > >> +	return max(count, min_count);
> > > 
> > > Just out of curiousity, is compiler allowed to inline this piece of code
> > > and then cache the value? In other words do we need to make these
> > > READ_ONCE or otherwise enforce the no-caching behavior?
> > 
> > I honestly do not know if the compiler is allowed to do that.  The
> > assembly code generated by my compiler does not cache the value, but
> > that does not guarantee anything.  I can add READ_ONCE to make the
> > function look something like:
> > 
> > static inline unsigned long min_hp_count(struct hstate *h, unsigned long count)
> > {
> > 	unsigned long min_count;
> > 
> > 	min_count = READ_ONCE(h->resv_huge_pages) + READ_ONCE(h->nr_huge_pages)
> > 					- READ_ONCE(h->free_huge_pages);
> > 	return max(count, min_count);
> > }
> 
> Maybe just forcing to never inline the function should be sufficient.
> This is not a hot path to micro optimize for no function call. But there
> are much more qualified people on the CC list on this matter who could
> clarify. Peter?

I'm not sure I understand the code right. But inline or not doesn't
matter, LTO completely ruins that game. Just like if it was a static
function, then the compiler is free to inline it, even if the function
lacks an inline attribute.

Basically, without READ_ONCE() the compiler is allowed to entirely elide
the load (and use a previous load), or to duplicate the load and do it
again later (reaching a different result).

Similarly, the compiler is allowed to byte-wise load the variable in any
random order and re-assemble.

If any of that is a problem, you have to use READ_ONCE().
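
For the record, a deliberately simplified illustration of the elided-load
case (nothing here is from mm/hugetlb.c; READ_ONCE() is shown in its
stripped-down volatile-cast form, the kernel's real macro adds type checks):

/* Simplified stand-in for the kernel macro; illustrative only. */
#define READ_ONCE(x)	(*(const volatile typeof(x) *)&(x))

int stop;			/* set to 1 by some other context */

void wait_plain(void)
{
	/*
	 * Plain load: the compiler may read 'stop' once and turn this
	 * into "if (!stop) for (;;);" -- the load is elided and a stale
	 * value is reused forever.
	 */
	while (!stop)
		;
}

void wait_once(void)
{
	/*
	 * Volatile access: a fresh, non-torn load is emitted on every
	 * iteration, so a concurrent update is eventually observed.
	 */
	while (!READ_ONCE(stop))
		;
}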
Michal Hocko March 23, 2021, 8:14 a.m. UTC | #5
On Tue 23-03-21 09:01:02, Peter Zijlstra wrote:
> On Tue, Mar 23, 2021 at 08:50:53AM +0100, Michal Hocko wrote:
> 
> > > >> +static inline unsigned long min_hp_count(struct hstate *h, unsigned long count)
> > > >> +{
> > > >> +	unsigned long min_count;
> > > >> +
> > > >> +	min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
> > > >> +	return max(count, min_count);
> > > > 
> > > > Just out of curiousity, is compiler allowed to inline this piece of code
> > > > and then cache the value? In other words do we need to make these
> > > > READ_ONCE or otherwise enforce the no-caching behavior?
> > > 
> > > I honestly do not know if the compiler is allowed to do that.  The
> > > assembly code generated by my compiler does not cache the value, but
> > > that does not guarantee anything.  I can add READ_ONCE to make the
> > > function look something like:
> > > 
> > > static inline unsigned long min_hp_count(struct hstate *h, unsigned long count)
> > > {
> > > 	unsigned long min_count;
> > > 
> > > 	min_count = READ_ONCE(h->resv_huge_pages) + READ_ONCE(h->nr_huge_pages)
> > > 					- READ_ONCE(h->free_huge_pages);
> > > 	return max(count, min_count);
> > > }
> > 
> > Maybe just forcing to never inline the function should be sufficient.
> > This is not a hot path to micro optimize for no function call. But there
> > are much more qualified people on the CC list on this matter who could
> > clarify. Peter?
> 
> I'm not sure I understand the code right.

We need to ensure the function is evaluated each time it is called
because it will be used after a lock is dropped and reacquired so
numbers could have changed. The point of wrapping this into a function
is to reduce the code duplication IIUC.

> But inline or not doesn't
> matter, LTO completely ruins that game. Just like if it was a static
> function, then the compiler is free to inline it, even if the function
> lacks an inline attribute.

OK

> Basically, without READ_ONCE() the compiler is allowed to entirely elide
> the load (and use a previous load), or to duplicate the load and do it
> again later (reaching a different result).
> 
> Similarly, the compiler is allowed to byte-wise load the variable in any
> random order and re-assemble.
> 
> If any of that is a problem, you have to use READ_ONCE().

Thanks for the confirmation!
Mike Kravetz March 23, 2021, 11:18 p.m. UTC | #6
On 3/23/21 1:14 AM, Michal Hocko wrote:
> On Tue 23-03-21 09:01:02, Peter Zijlstra wrote:
>> On Tue, Mar 23, 2021 at 08:50:53AM +0100, Michal Hocko wrote:
>>
>>>>>> +static inline unsigned long min_hp_count(struct hstate *h, unsigned long count)
>>>>>> +{
>>>>>> +	unsigned long min_count;
>>>>>> +
>>>>>> +	min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
>>>>>> +	return max(count, min_count);
>>>>>
>>>>> Just out of curiousity, is compiler allowed to inline this piece of code
>>>>> and then cache the value? In other words do we need to make these
>>>>> READ_ONCE or otherwise enforce the no-caching behavior?
>>>>
>>>> I honestly do not know if the compiler is allowed to do that.  The
>>>> assembly code generated by my compiler does not cache the value, but
>>>> that does not guarantee anything.  I can add READ_ONCE to make the
>>>> function look something like:
>>>>
>>>> static inline unsigned long min_hp_count(struct hstate *h, unsigned long count)
>>>> {
>>>> 	unsigned long min_count;
>>>>
>>>> 	min_count = READ_ONCE(h->resv_huge_pages) + READ_ONCE(h->nr_huge_pages)
>>>> 					- READ_ONCE(h->free_huge_pages);
>>>> 	return max(count, min_count);
>>>> }
>>>
>>> Maybe just forcing to never inline the function should be sufficient.
>>> This is not a hot path to micro optimize for no function call. But there
>>> are much more qualified people on the CC list on this matter who could
>>> clarify. Peter?
>>
>> I'm not sure I understand the code right.
> 
> We need to ensure the function is evaluated each time it is called
> because it will be used after a lock is dropped and reacquired so
> numbers could have changed. The point of wrapping this into a function
> is to reduce the code duplication IIUC.
> 
>> But inline or not doesn't
>> matter, LTO completely ruins that game. Just like if it was a static
>> function, then the compiler is free to inline it, even if the function
>> lacks an inline attribute.
> 
> OK
> 
>> Basically, without READ_ONCE() the compiler is allowed to entirely elide
>> the load (and use a previous load), or to duplicate the load and do it
>> again later (reaching a different result).
>>
>> Similarly, the compiler is allowed to byte-wise load the variable in any
>> random order and re-assemble.
>>
>> If any of that is a problem, you have to use READ_ONCE().
> 
> Thanks for the confirmation!
> 

Here is another thought.
In patch 5 you suggest removing all pages from hugetlb with the lock
held, and adding them to a list.  Then, drop the lock and free all
pages on the list.  If we do this, then the value computed here (min_count)
can not change while we are looping.  So, this patch would be unnecessary.
That is another argument in favor of batching the frees.

Unless there is something wrong in my thinking, I am going to take that
approach and drop this patch.
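
Roughly the shape I have in mind -- just a sketch, not the actual patch.
remove_pool_huge_page() is a made-up helper that unlinks one free page and
adjusts the counters with hugetlb_lock held, and the sketch assumes later
patches in this series make update_and_free_page() safe to call without
the lock:

	LIST_HEAD(page_list);
	struct page *page, *next;
	unsigned long min_count;

	/*
	 * Phase 1: hugetlb_lock held throughout, so min_count is computed
	 * once and cannot go stale while we collect pages.
	 */
	min_count = max(count, h->resv_huge_pages + h->nr_huge_pages -
				h->free_huge_pages);
	while (min_count < persistent_huge_pages(h)) {
		page = remove_pool_huge_page(h, nodes_allowed);	/* made up */
		if (!page)
			break;
		list_add(&page->lru, &page_list);
	}
	spin_unlock(&hugetlb_lock);

	/*
	 * Phase 2: lock dropped, free the detached pages.  The pool
	 * counters were already updated in phase 1.
	 */
	list_for_each_entry_safe(page, next, &page_list, lru) {
		list_del(&page->lru);
		update_and_free_page(h, page);
		cond_resched();
	}
	spin_lock(&hugetlb_lock);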
Michal Hocko March 24, 2021, 8:36 a.m. UTC | #7
On Tue 23-03-21 16:18:08, Mike Kravetz wrote:
[...]
> Here is another thought.
> In patch 5 you suggest removing all pages from hugetlb with the lock
> held, and adding them to a list.  Then, drop the lock and free all
> pages on the list.  If we do this, then the value computed here (min_count)
> can not change while we are looping.  So, this patch would be unnecessary.
> That is another argument in favor of batching the frees.
> 
> Unless there is something wrong in my thinking, I am going to take that
> approach and drop this patch.

Makes sense
Mike Kravetz March 24, 2021, 4:43 p.m. UTC | #8
On 3/24/21 1:36 AM, Michal Hocko wrote:
> On Tue 23-03-21 16:18:08, Mike Kravetz wrote:
> [...]
>> Here is another thought.
>> In patch 5 you suggest removing all pages from hugetlb with the lock
>> held, and adding them to a list.  Then, drop the lock and free all
>> pages on the list.  If we do this, then the value computed here (min_count)
>> can not change while we are looping.  So, this patch would be unnecessary.
>> That is another argument in favor of batching the frees.
>>
>> Unless there is something wrong in my thinking, I am going to take that
>> approach and drop this patch.
> 
> Makes sense
> 

I still think this is the way to go in this series.

However, Muchun's "Free some vmemmap pages of HugeTLB page" series would
likely want to drop the lock for each page as the free operation may
fail.  So, we may end up back with one lock cycle per page.  That is
something that will be discussed in that series.

Patch

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d5be25f910e8..c537274c2a38 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2521,11 +2521,20 @@  static void __init report_hugepages(void)
 	}
 }
 
+static inline unsigned long min_hp_count(struct hstate *h, unsigned long count)
+{
+	unsigned long min_count;
+
+	min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
+	return max(count, min_count);
+}
+
 #ifdef CONFIG_HIGHMEM
 static void try_to_free_low(struct hstate *h, unsigned long count,
 						nodemask_t *nodes_allowed)
 {
 	int i;
+	unsigned long min_count = min_hp_count(h, count);
 
 	if (hstate_is_gigantic(h))
 		return;
@@ -2534,7 +2543,7 @@  static void try_to_free_low(struct hstate *h, unsigned long count,
 		struct page *page, *next;
 		struct list_head *freel = &h->hugepage_freelists[i];
 		list_for_each_entry_safe(page, next, freel, lru) {
-			if (count >= h->nr_huge_pages)
+			if (min_count >= h->nr_huge_pages)
 				return;
 			if (PageHighMem(page))
 				continue;
@@ -2542,6 +2551,12 @@  static void try_to_free_low(struct hstate *h, unsigned long count,
 			update_and_free_page(h, page);
 			h->free_huge_pages--;
 			h->free_huge_pages_node[page_to_nid(page)]--;
+
+			/*
+			 * update_and_free_page could have dropped lock so
+			 * recompute min_count.
+			 */
+			min_count = min_hp_count(h, count);
 		}
 	}
 }
@@ -2695,13 +2710,15 @@  static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
 	 * and won't grow the pool anywhere else. Not until one of the
 	 * sysctls are changed, or the surplus pages go out of use.
 	 */
-	min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
-	min_count = max(count, min_count);
-	try_to_free_low(h, min_count, nodes_allowed);
+	min_count = min_hp_count(h, count);
+	try_to_free_low(h, count, nodes_allowed);
 	while (min_count < persistent_huge_pages(h)) {
 		if (!free_pool_huge_page(h, nodes_allowed, 0))
 			break;
 		cond_resched_lock(&hugetlb_lock);
+
+		/* Recompute min_count in case hugetlb_lock was dropped */
+		min_count = min_hp_count(h, count);
 	}
 	while (count < persistent_huge_pages(h)) {
 		if (!adjust_pool_surplus(h, nodes_allowed, 1))