[6/8] hugetlb: change free_pool_huge_page to remove_pool_huge_page

Message ID 20210325002835.216118-7-mike.kravetz@oracle.com (mailing list archive)
State New, archived
Series make hugetlb put_page safe for all calling contexts

Commit Message

Mike Kravetz March 25, 2021, 12:28 a.m. UTC
free_pool_huge_page was called with hugetlb_lock held.  It would remove
a hugetlb page, and then free the corresponding pages to the lower level
allocators such as buddy.  free_pool_huge_page was called in a loop to
remove hugetlb pages and these loops could hold the hugetlb_lock for a
considerable time.

Create new routine remove_pool_huge_page to replace free_pool_huge_page.
remove_pool_huge_page will remove the hugetlb page, and it must be
called with the hugetlb_lock held.  It will return the removed page and
it is the responsibility of the caller to free the page to the lower
level allocators.  The hugetlb_lock is dropped before freeing to these
allocators which results in shorter lock hold times.

Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
---
 mm/hugetlb.c | 88 ++++++++++++++++++++++++++++++----------------------
 1 file changed, 51 insertions(+), 37 deletions(-)
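
For orientation, the calling pattern introduced by this patch looks
roughly like the sketch below.  It is a simplified composite of the
return_unused_surplus_pages() and set_max_huge_pages() hunks in the
patch; pool_needs_shrinking() is only a stand-in for the actual loop
conditions, and nodes_allowed is assumed to be in scope.

	struct page *page, *t_page;
	LIST_HEAD(page_list);

	/* caller holds hugetlb_lock */
	while (pool_needs_shrinking(h)) {
		/* detach the page from the pool under hugetlb_lock */
		page = remove_pool_huge_page(h, nodes_allowed, 0);
		if (!page)
			break;
		/* list_add() initializes page->lru, no INIT_LIST_HEAD needed */
		list_add(&page->lru, &page_list);
	}

	/* free to the lower level allocators with the lock dropped */
	spin_unlock(&hugetlb_lock);
	list_for_each_entry_safe(page, t_page, &page_list, lru) {
		list_del(&page->lru);
		update_and_free_page(h, page);
		cond_resched();
	}
	spin_lock(&hugetlb_lock);	/* remaining pool accounting */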

Comments

Michal Hocko March 25, 2021, 11:06 a.m. UTC | #1
On Wed 24-03-21 17:28:33, Mike Kravetz wrote:
[...]
> @@ -2074,17 +2067,16 @@ static int gather_surplus_pages(struct hstate *h, long delta)
>   *    to the associated reservation map.
>   * 2) Free any unused surplus pages that may have been allocated to satisfy
>   *    the reservation.  As many as unused_resv_pages may be freed.
> - *
> - * Called with hugetlb_lock held.  However, the lock could be dropped (and
> - * reacquired) during calls to cond_resched_lock.  Whenever dropping the lock,
> - * we must make sure nobody else can claim pages we are in the process of
> - * freeing.  Do this by ensuring resv_huge_page always is greater than the
> - * number of huge pages we plan to free when dropping the lock.
>   */
>  static void return_unused_surplus_pages(struct hstate *h,
>  					unsigned long unused_resv_pages)
>  {
>  	unsigned long nr_pages;
> +	struct page *page, *t_page;
> +	struct list_head page_list;
> +
> +	/* Uncommit the reservation */
> +	h->resv_huge_pages -= unused_resv_pages;

Is this ok for cases where remove_pool_huge_page fails early? I have to
say I am kinda lost in the resv_huge_pages accounting here. The original
code was already quite suspicious to me, TBH.
>  
>  	/* Cannot return gigantic pages currently */
>  	if (hstate_is_gigantic(h))
> @@ -2101,24 +2093,27 @@ static void return_unused_surplus_pages(struct hstate *h,
>  	 * evenly across all nodes with memory. Iterate across these nodes
>  	 * until we can no longer free unreserved surplus pages. This occurs
>  	 * when the nodes with surplus pages have no free pages.
> -	 * free_pool_huge_page() will balance the freed pages across the
> +	 * remove_pool_huge_page() will balance the freed pages across the
>  	 * on-line nodes with memory and will handle the hstate accounting.
> -	 *
> -	 * Note that we decrement resv_huge_pages as we free the pages.  If
> -	 * we drop the lock, resv_huge_pages will still be sufficiently large
> -	 * to cover subsequent pages we may free.
>  	 */
> +	INIT_LIST_HEAD(&page_list);
>  	while (nr_pages--) {
> -		h->resv_huge_pages--;
> -		unused_resv_pages--;
> -		if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
> +		page = remove_pool_huge_page(h, &node_states[N_MEMORY], 1);
> +		if (!page)
>  			goto out;
> -		cond_resched_lock(&hugetlb_lock);
> +
> +		INIT_LIST_HEAD(&page->lru);

again unnecessary INIT_LIST_HEAD

> +		list_add(&page->lru, &page_list);
>  	}
>  
>  out:
> -	/* Fully uncommit the reservation */
> -	h->resv_huge_pages -= unused_resv_pages;
> +	spin_unlock(&hugetlb_lock);
> +	list_for_each_entry_safe(page, t_page, &page_list, lru) {
> +		list_del(&page->lru);
> +		update_and_free_page(h, page);
> +		cond_resched();
> +	}

You have the same construct at 3 different places; maybe it deserves a
little helper, update_and_free_page_batch.

> +	spin_lock(&hugetlb_lock);
>  }
>  
>  
> @@ -2648,6 +2643,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
>  			      nodemask_t *nodes_allowed)
>  {
>  	unsigned long min_count, ret;
> +	struct page *page, *t_page;
> +	struct list_head page_list;
>  	NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);
>  
>  	/*
> @@ -2757,11 +2754,28 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
>  	min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
>  	min_count = max(count, min_count);
>  	try_to_free_low(h, min_count, nodes_allowed);
> +
> +	/*
> +	 * Collect pages to be removed on list without dropping lock
> +	 */
> +	INIT_LIST_HEAD(&page_list);
>  	while (min_count < persistent_huge_pages(h)) {
> -		if (!free_pool_huge_page(h, nodes_allowed, 0))
> +		page = remove_pool_huge_page(h, nodes_allowed, 0);
> +		if (!page)
>  			break;
> -		cond_resched_lock(&hugetlb_lock);
> +
> +		INIT_LIST_HEAD(&page->lru);

INIT_LIST_HEAD again.

> +		list_add(&page->lru, &page_list);
>  	}
> +	/* free the pages after dropping lock */
> +	spin_unlock(&hugetlb_lock);
> +	list_for_each_entry_safe(page, t_page, &page_list, lru) {
> +		list_del(&page->lru);
> +		update_and_free_page(h, page);
> +		cond_resched();
> +	}
> +	spin_lock(&hugetlb_lock);
> +
>  	while (count < persistent_huge_pages(h)) {
>  		if (!adjust_pool_surplus(h, nodes_allowed, 1))
>  			break;
> -- 
> 2.30.2
Mike Kravetz March 25, 2021, 5:29 p.m. UTC | #2
On 3/25/21 4:06 AM, Michal Hocko wrote:
> On Wed 24-03-21 17:28:33, Mike Kravetz wrote:
> [...]
>> @@ -2074,17 +2067,16 @@ static int gather_surplus_pages(struct hstate *h, long delta)
>>   *    to the associated reservation map.
>>   * 2) Free any unused surplus pages that may have been allocated to satisfy
>>   *    the reservation.  As many as unused_resv_pages may be freed.
>> - *
>> - * Called with hugetlb_lock held.  However, the lock could be dropped (and
>> - * reacquired) during calls to cond_resched_lock.  Whenever dropping the lock,
>> - * we must make sure nobody else can claim pages we are in the process of
>> - * freeing.  Do this by ensuring resv_huge_page always is greater than the
>> - * number of huge pages we plan to free when dropping the lock.
>>   */
>>  static void return_unused_surplus_pages(struct hstate *h,
>>  					unsigned long unused_resv_pages)
>>  {
>>  	unsigned long nr_pages;
>> +	struct page *page, *t_page;
>> +	struct list_head page_list;
>> +
>> +	/* Uncommit the reservation */
>> +	h->resv_huge_pages -= unused_resv_pages;
> 
> Is this ok for cases where remove_pool_huge_page fails early? I have to
> say I am kinda lost in the resv_huge_pages accounting here. The original
> code was already quite suspicious to me, TBH.

Yes, it is safe.  The existing code will do the same, just in a different
way: it decrements resv_huge_pages once per page freed and then subtracts
whatever is left of unused_resv_pages at the 'out:' label, so the full
unused_resv_pages gets subtracted either way, even if free_pool_huge_page
fails early.

Some history is in the changelog for commit e5bbc8a6c992 ("mm/hugetlb.c:
fix reservation race when freeing surplus pages").  The race fixed by
that commit was introduced by the cond_resched_lock() which we are
removing in this patch.  Therefore, we can remove the tricky code that
was added to deal with dropping the lock.

I should add an explanation to the commit message.

Additionally, I suspect we may end up once again dropping the lock in
the below loop when adding vmemmap support.  Then, we would need to add
back the code in commit e5bbc8a6c992.  Sigh.

>>  
>>  	/* Cannot return gigantic pages currently */
>>  	if (hstate_is_gigantic(h))
>> @@ -2101,24 +2093,27 @@ static void return_unused_surplus_pages(struct hstate *h,
>>  	 * evenly across all nodes with memory. Iterate across these nodes
>>  	 * until we can no longer free unreserved surplus pages. This occurs
>>  	 * when the nodes with surplus pages have no free pages.
>> -	 * free_pool_huge_page() will balance the freed pages across the
>> +	 * remove_pool_huge_page() will balance the freed pages across the
>>  	 * on-line nodes with memory and will handle the hstate accounting.
>> -	 *
>> -	 * Note that we decrement resv_huge_pages as we free the pages.  If
>> -	 * we drop the lock, resv_huge_pages will still be sufficiently large
>> -	 * to cover subsequent pages we may free.
>>  	 */
>> +	INIT_LIST_HEAD(&page_list);
>>  	while (nr_pages--) {
>> -		h->resv_huge_pages--;
>> -		unused_resv_pages--;
>> -		if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
>> +		page = remove_pool_huge_page(h, &node_states[N_MEMORY], 1);
>> +		if (!page)
>>  			goto out;
>> -		cond_resched_lock(&hugetlb_lock);
>> +
>> +		INIT_LIST_HEAD(&page->lru);
> 
> again unnecessary INIT_LIST_HEAD
> 
>> +		list_add(&page->lru, &page_list);
>>  	}
>>  
>>  out:
>> -	/* Fully uncommit the reservation */
>> -	h->resv_huge_pages -= unused_resv_pages;
>> +	spin_unlock(&hugetlb_lock);
>> +	list_for_each_entry_safe(page, t_page, &page_list, lru) {
>> +		list_del(&page->lru);
>> +		update_and_free_page(h, page);
>> +		cond_resched();
>> +	}
> 
> You have the same construct at 3 different places; maybe it deserves a
> little helper, update_and_free_page_batch.

Sure.  I will add it.
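
A minimal sketch of what such a helper could look like, assuming the
update_and_free_page_batch name suggested above (the helper the series
actually adds may be shaped differently):

	/*
	 * Free a list of pages collected under hugetlb_lock back to the
	 * lower level allocators.  Called with hugetlb_lock held; the lock
	 * is dropped around the actual freeing and reacquired on return.
	 */
	static void update_and_free_page_batch(struct hstate *h,
					       struct list_head *page_list)
	{
		struct page *page, *t_page;

		spin_unlock(&hugetlb_lock);
		list_for_each_entry_safe(page, t_page, page_list, lru) {
			list_del(&page->lru);
			update_and_free_page(h, page);
			cond_resched();
		}
		spin_lock(&hugetlb_lock);
	}

Each of the three call sites would then just collect pages on a local
list and make a single call to the helper.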

Patch

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index fae7f034d1eb..a9785e73379f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1204,7 +1204,7 @@  static int hstate_next_node_to_alloc(struct hstate *h,
 }
 
 /*
- * helper for free_pool_huge_page() - return the previously saved
+ * helper for remove_pool_huge_page() - return the previously saved
  * node ["this node"] from which to free a huge page.  Advance the
  * next node id whether or not we find a free huge page to free so
  * that the next attempt to free addresses the next node.
@@ -1720,16 +1720,18 @@  static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
 }
 
 /*
- * Free huge page from pool from next node to free.
- * Attempt to keep persistent huge pages more or less
- * balanced over allowed nodes.
+ * Remove huge page from pool from next node to free.  Attempt to keep
+ * persistent huge pages more or less balanced over allowed nodes.
+ * This routine only 'removes' the hugetlb page.  The caller must make
+ * an additional call to free the page to low level allocators.
  * Called with hugetlb_lock locked.
  */
-static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
-							 bool acct_surplus)
+static struct page *remove_pool_huge_page(struct hstate *h,
+						nodemask_t *nodes_allowed,
+						 bool acct_surplus)
 {
 	int nr_nodes, node;
-	int ret = 0;
+	struct page *page = NULL;
 
 	for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
 		/*
@@ -1738,23 +1740,14 @@  static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
 		 */
 		if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
 		    !list_empty(&h->hugepage_freelists[node])) {
-			struct page *page =
-				list_entry(h->hugepage_freelists[node].next,
+			page = list_entry(h->hugepage_freelists[node].next,
 					  struct page, lru);
 			remove_hugetlb_page(h, page, acct_surplus);
-			/*
-			 * unlock/lock around update_and_free_page is temporary
-			 * and will be removed with subsequent patch.
-			 */
-			spin_unlock(&hugetlb_lock);
-			update_and_free_page(h, page);
-			spin_lock(&hugetlb_lock);
-			ret = 1;
 			break;
 		}
 	}
 
-	return ret;
+	return page;
 }
 
 /*
@@ -2074,17 +2067,16 @@  static int gather_surplus_pages(struct hstate *h, long delta)
  *    to the associated reservation map.
  * 2) Free any unused surplus pages that may have been allocated to satisfy
  *    the reservation.  As many as unused_resv_pages may be freed.
- *
- * Called with hugetlb_lock held.  However, the lock could be dropped (and
- * reacquired) during calls to cond_resched_lock.  Whenever dropping the lock,
- * we must make sure nobody else can claim pages we are in the process of
- * freeing.  Do this by ensuring resv_huge_page always is greater than the
- * number of huge pages we plan to free when dropping the lock.
  */
 static void return_unused_surplus_pages(struct hstate *h,
 					unsigned long unused_resv_pages)
 {
 	unsigned long nr_pages;
+	struct page *page, *t_page;
+	struct list_head page_list;
+
+	/* Uncommit the reservation */
+	h->resv_huge_pages -= unused_resv_pages;
 
 	/* Cannot return gigantic pages currently */
 	if (hstate_is_gigantic(h))
@@ -2101,24 +2093,27 @@  static void return_unused_surplus_pages(struct hstate *h,
 	 * evenly across all nodes with memory. Iterate across these nodes
 	 * until we can no longer free unreserved surplus pages. This occurs
 	 * when the nodes with surplus pages have no free pages.
-	 * free_pool_huge_page() will balance the freed pages across the
+	 * remove_pool_huge_page() will balance the freed pages across the
 	 * on-line nodes with memory and will handle the hstate accounting.
-	 *
-	 * Note that we decrement resv_huge_pages as we free the pages.  If
-	 * we drop the lock, resv_huge_pages will still be sufficiently large
-	 * to cover subsequent pages we may free.
 	 */
+	INIT_LIST_HEAD(&page_list);
 	while (nr_pages--) {
-		h->resv_huge_pages--;
-		unused_resv_pages--;
-		if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
+		page = remove_pool_huge_page(h, &node_states[N_MEMORY], 1);
+		if (!page)
 			goto out;
-		cond_resched_lock(&hugetlb_lock);
+
+		INIT_LIST_HEAD(&page->lru);
+		list_add(&page->lru, &page_list);
 	}
 
 out:
-	/* Fully uncommit the reservation */
-	h->resv_huge_pages -= unused_resv_pages;
+	spin_unlock(&hugetlb_lock);
+	list_for_each_entry_safe(page, t_page, &page_list, lru) {
+		list_del(&page->lru);
+		update_and_free_page(h, page);
+		cond_resched();
+	}
+	spin_lock(&hugetlb_lock);
 }
 
 
@@ -2648,6 +2643,8 @@  static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
 			      nodemask_t *nodes_allowed)
 {
 	unsigned long min_count, ret;
+	struct page *page, *t_page;
+	struct list_head page_list;
 	NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);
 
 	/*
@@ -2757,11 +2754,28 @@  static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
 	min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
 	min_count = max(count, min_count);
 	try_to_free_low(h, min_count, nodes_allowed);
+
+	/*
+	 * Collect pages to be removed on list without dropping lock
+	 */
+	INIT_LIST_HEAD(&page_list);
 	while (min_count < persistent_huge_pages(h)) {
-		if (!free_pool_huge_page(h, nodes_allowed, 0))
+		page = remove_pool_huge_page(h, nodes_allowed, 0);
+		if (!page)
 			break;
-		cond_resched_lock(&hugetlb_lock);
+
+		INIT_LIST_HEAD(&page->lru);
+		list_add(&page->lru, &page_list);
 	}
+	/* free the pages after dropping lock */
+	spin_unlock(&hugetlb_lock);
+	list_for_each_entry_safe(page, t_page, &page_list, lru) {
+		list_del(&page->lru);
+		update_and_free_page(h, page);
+		cond_resched();
+	}
+	spin_lock(&hugetlb_lock);
+
 	while (count < persistent_huge_pages(h)) {
 		if (!adjust_pool_surplus(h, nodes_allowed, 1))
 			break;