diff mbox series

[v4,4/8] hugetlb: perform vmemmap restoration on a list of pages

Message ID 20230918230202.254631-5-mike.kravetz@oracle.com (mailing list archive)
State New
Headers show
Series Batch hugetlb vmemmap modification operations | expand

Commit Message

Mike Kravetz Sept. 18, 2023, 11:01 p.m. UTC
The routine update_and_free_pages_bulk already performs vmemmap
restoration on the list of hugetlb pages in a separate step.  In
preparation for more functionality to be added in this step, create a
new routine hugetlb_vmemmap_restore_folios() that will restore
vmemmap for a list of folios.

This new routine must provide sufficient feedback about errors and
actual restoration performed so that update_and_free_pages_bulk can
perform optimally.

Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
---
 mm/hugetlb.c         | 36 ++++++++++++++++++------------------
 mm/hugetlb_vmemmap.c | 37 +++++++++++++++++++++++++++++++++++++
 mm/hugetlb_vmemmap.h | 11 +++++++++++
 3 files changed, 66 insertions(+), 18 deletions(-)

Comments

Muchun Song Sept. 19, 2023, 9:52 a.m. UTC | #1
On 2023/9/19 07:01, Mike Kravetz wrote:
> The routine update_and_free_pages_bulk already performs vmemmap
> restoration on the list of hugetlb pages in a separate step.  In
> preparation for more functionality to be added in this step, create a
> new routine hugetlb_vmemmap_restore_folios() that will restore
> vmemmap for a list of folios.
>
> This new routine must provide sufficient feedback about errors and
> actual restoration performed so that update_and_free_pages_bulk can
> perform optimally.
>
> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
> ---
>   mm/hugetlb.c         | 36 ++++++++++++++++++------------------
>   mm/hugetlb_vmemmap.c | 37 +++++++++++++++++++++++++++++++++++++
>   mm/hugetlb_vmemmap.h | 11 +++++++++++
>   3 files changed, 66 insertions(+), 18 deletions(-)
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index d6f3db3c1313..814bb1982274 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1836,36 +1836,36 @@ static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio,
>   
>   static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
>   {
> +	int ret;
> +	unsigned long restored;
>   	struct folio *folio, *t_folio;
> -	bool clear_dtor = false;
>   
>   	/*
> -	 * First allocate required vmemmmap (if necessary) for all folios on
> -	 * list.  If vmemmap can not be allocated, we can not free folio to
> -	 * lower level allocator, so add back as hugetlb surplus page.
> -	 * add_hugetlb_folio() removes the page from THIS list.
> -	 * Use clear_dtor to note if vmemmap was successfully allocated for
> -	 * ANY page on the list.
> +	 * First allocate required vmemmmap (if necessary) for all folios.
>   	 */
> -	list_for_each_entry_safe(folio, t_folio, list, lru) {
> -		if (folio_test_hugetlb_vmemmap_optimized(folio)) {
> -			if (hugetlb_vmemmap_restore(h, &folio->page)) {
> -				spin_lock_irq(&hugetlb_lock);
> +	ret = hugetlb_vmemmap_restore_folios(h, list, &restored);
> +
> +	/*
> +	 * If there was an error restoring vmemmap for ANY folios on the list,
> +	 * add them back as surplus hugetlb pages.  add_hugetlb_folio() removes
> +	 * the folio from THIS list.
> +	 */
> +	if (ret < 0) {
> +		spin_lock_irq(&hugetlb_lock);
> +		list_for_each_entry_safe(folio, t_folio, list, lru)
> +			if (folio_test_hugetlb_vmemmap_optimized(folio))
>   				add_hugetlb_folio(h, folio, true);
> -				spin_unlock_irq(&hugetlb_lock);
> -			} else
> -				clear_dtor = true;
> -		}
> +		spin_unlock_irq(&hugetlb_lock);
>   	}
>   
>   	/*
> -	 * If vmemmmap allocation was performed on any folio above, take lock
> -	 * to clear destructor of all folios on list.  This avoids the need to
> +	 * If vmemmmap allocation was performed on ANY folio , take lock to
> +	 * clear destructor of all folios on list.  This avoids the need to
>   	 * lock/unlock for each individual folio.
>   	 * The assumption is vmemmap allocation was performed on all or none
>   	 * of the folios on the list.  This is true expect in VERY rare cases.
>   	 */
> -	if (clear_dtor) {
> +	if (restored) {
>   		spin_lock_irq(&hugetlb_lock);
>   		list_for_each_entry(folio, list, lru)
>   			__clear_hugetlb_destructor(h, folio);
> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> index 4558b814ffab..463a4037ec6e 100644
> --- a/mm/hugetlb_vmemmap.c
> +++ b/mm/hugetlb_vmemmap.c
> @@ -480,6 +480,43 @@ int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head)
>   	return ret;
>   }
>   
> +/**
> + * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list.
> + * @h:		struct hstate.
> + * @folio_list:	list of folios.
> + * @restored:	Set to number of folios for which vmemmap was restored
> + *		successfully if caller passes a non-NULL pointer.
> + *
> + * Return: %0 if vmemmap exists for all folios on the list.  If an error is
> + *		encountered restoring vmemmap for ANY folio, an error code
> + *		will be returned to the caller.  It is then the responsibility
> + *		of the caller to check the hugetlb vmemmap optimized flag of
> + *		each folio to determine if vmemmap was actually restored.
> + */
> +int hugetlb_vmemmap_restore_folios(const struct hstate *h,
> +					struct list_head *folio_list,
> +					unsigned long *restored)
> +{
> +	unsigned long num_restored;
> +	struct folio *folio;
> +	int ret = 0, t_ret;
> +
> +	num_restored = 0;
> +	list_for_each_entry(folio, folio_list, lru) {
> +		if (folio_test_hugetlb_vmemmap_optimized(folio)) {
> +			t_ret = hugetlb_vmemmap_restore(h, &folio->page);

I still think we should free a non-optimized HugeTLB page if we
encounter an OOM situation instead of continuing to restore
vmemmap pages. Restoring vmemmap pages will only aggravate
the OOM situation. The suitable approach is to free a non-optimized
HugeTLB page to satisfy our allocation of vmemmap pages; what's
your opinion, Mike?

Thanks.

> +			if (t_ret)
> +				ret = t_ret;
> +			else
> +				num_restored++;
> +		}
> +	}
> +
> +	if (*restored)
> +		*restored = num_restored;
> +	return ret;
> +}
> +
>   /* Return true iff a HugeTLB whose vmemmap should and can be optimized. */
>   static bool vmemmap_should_optimize(const struct hstate *h, const struct page *head)
>   {
> diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
> index c512e388dbb4..bb58453c3cc0 100644
> --- a/mm/hugetlb_vmemmap.h
> +++ b/mm/hugetlb_vmemmap.h
> @@ -19,6 +19,8 @@
>   
>   #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
>   int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head);
> +int hugetlb_vmemmap_restore_folios(const struct hstate *h,
> +			struct list_head *folio_list, unsigned long *restored);
>   void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head);
>   void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list);
>   
> @@ -45,6 +47,15 @@ static inline int hugetlb_vmemmap_restore(const struct hstate *h, struct page *h
>   	return 0;
>   }
>   
> +static inline int hugetlb_vmemmap_restore_folios(const struct hstate *h,
> +					struct list_head *folio_list,
> +					unsigned long *restored)
> +{
> +	if (restored)
> +		*restored = 0;
> +	return 0;
> +}
> +
>   static inline void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
>   {
>   }
Mike Kravetz Sept. 19, 2023, 8:57 p.m. UTC | #2
On 09/19/23 17:52, Muchun Song wrote:
> 
> 
> On 2023/9/19 07:01, Mike Kravetz wrote:
> > The routine update_and_free_pages_bulk already performs vmemmap
> > restoration on the list of hugetlb pages in a separate step.  In
> > preparation for more functionality to be added in this step, create a
> > new routine hugetlb_vmemmap_restore_folios() that will restore
> > vmemmap for a list of folios.
> > 
> > This new routine must provide sufficient feedback about errors and
> > actual restoration performed so that update_and_free_pages_bulk can
> > perform optimally.
> > 
> > Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
> > ---
> >   mm/hugetlb.c         | 36 ++++++++++++++++++------------------
> >   mm/hugetlb_vmemmap.c | 37 +++++++++++++++++++++++++++++++++++++
> >   mm/hugetlb_vmemmap.h | 11 +++++++++++
> >   3 files changed, 66 insertions(+), 18 deletions(-)
> > 
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index d6f3db3c1313..814bb1982274 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -1836,36 +1836,36 @@ static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio,
> >   static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
> >   {
> > +	int ret;
> > +	unsigned long restored;
> >   	struct folio *folio, *t_folio;
> > -	bool clear_dtor = false;
> >   	/*
> > -	 * First allocate required vmemmmap (if necessary) for all folios on
> > -	 * list.  If vmemmap can not be allocated, we can not free folio to
> > -	 * lower level allocator, so add back as hugetlb surplus page.
> > -	 * add_hugetlb_folio() removes the page from THIS list.
> > -	 * Use clear_dtor to note if vmemmap was successfully allocated for
> > -	 * ANY page on the list.
> > +	 * First allocate required vmemmmap (if necessary) for all folios.
> >   	 */
> > -	list_for_each_entry_safe(folio, t_folio, list, lru) {
> > -		if (folio_test_hugetlb_vmemmap_optimized(folio)) {
> > -			if (hugetlb_vmemmap_restore(h, &folio->page)) {
> > -				spin_lock_irq(&hugetlb_lock);
> > +	ret = hugetlb_vmemmap_restore_folios(h, list, &restored);
> > +
> > +	/*
> > +	 * If there was an error restoring vmemmap for ANY folios on the list,
> > +	 * add them back as surplus hugetlb pages.  add_hugetlb_folio() removes
> > +	 * the folio from THIS list.
> > +	 */
> > +	if (ret < 0) {
> > +		spin_lock_irq(&hugetlb_lock);
> > +		list_for_each_entry_safe(folio, t_folio, list, lru)
> > +			if (folio_test_hugetlb_vmemmap_optimized(folio))
> >   				add_hugetlb_folio(h, folio, true);
> > -				spin_unlock_irq(&hugetlb_lock);
> > -			} else
> > -				clear_dtor = true;
> > -		}
> > +		spin_unlock_irq(&hugetlb_lock);
> >   	}
> >   	/*
> > -	 * If vmemmmap allocation was performed on any folio above, take lock
> > -	 * to clear destructor of all folios on list.  This avoids the need to
> > +	 * If vmemmmap allocation was performed on ANY folio , take lock to
> > +	 * clear destructor of all folios on list.  This avoids the need to
> >   	 * lock/unlock for each individual folio.
> >   	 * The assumption is vmemmap allocation was performed on all or none
> >   	 * of the folios on the list.  This is true expect in VERY rare cases.
> >   	 */
> > -	if (clear_dtor) {
> > +	if (restored) {
> >   		spin_lock_irq(&hugetlb_lock);
> >   		list_for_each_entry(folio, list, lru)
> >   			__clear_hugetlb_destructor(h, folio);
> > diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> > index 4558b814ffab..463a4037ec6e 100644
> > --- a/mm/hugetlb_vmemmap.c
> > +++ b/mm/hugetlb_vmemmap.c
> > @@ -480,6 +480,43 @@ int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head)
> >   	return ret;
> >   }
> > +/**
> > + * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list.
> > + * @h:		struct hstate.
> > + * @folio_list:	list of folios.
> > + * @restored:	Set to number of folios for which vmemmap was restored
> > + *		successfully if caller passes a non-NULL pointer.
> > + *
> > + * Return: %0 if vmemmap exists for all folios on the list.  If an error is
> > + *		encountered restoring vmemmap for ANY folio, an error code
> > + *		will be returned to the caller.  It is then the responsibility
> > + *		of the caller to check the hugetlb vmemmap optimized flag of
> > + *		each folio to determine if vmemmap was actually restored.
> > + */
> > +int hugetlb_vmemmap_restore_folios(const struct hstate *h,
> > +					struct list_head *folio_list,
> > +					unsigned long *restored)
> > +{
> > +	unsigned long num_restored;
> > +	struct folio *folio;
> > +	int ret = 0, t_ret;
> > +
> > +	num_restored = 0;
> > +	list_for_each_entry(folio, folio_list, lru) {
> > +		if (folio_test_hugetlb_vmemmap_optimized(folio)) {
> > +			t_ret = hugetlb_vmemmap_restore(h, &folio->page);
> 
> I still think we should free a non-optimized HugeTLB page if we
> encounter an OOM situation instead of continue to restore
> vemmmap pages. Restoring vmemmmap pages will only aggravate
> the OOM situation. The suitable appraoch is to free a non-optimized
> HugeTLB page to satisfy our allocation of vmemmap pages, what's
> your opinion, Mike?

I agree.

As you mentioned previously, this may complicate this code path a bit.
I will rewrite to make this happen.
Muchun Song Sept. 20, 2023, 2:56 a.m. UTC | #3
> On Sep 20, 2023, at 04:57, Mike Kravetz <mike.kravetz@oracle.com> wrote:
> 
> On 09/19/23 17:52, Muchun Song wrote:
>> 
>> 
>> On 2023/9/19 07:01, Mike Kravetz wrote:
>>> The routine update_and_free_pages_bulk already performs vmemmap
>>> restoration on the list of hugetlb pages in a separate step.  In
>>> preparation for more functionality to be added in this step, create a
>>> new routine hugetlb_vmemmap_restore_folios() that will restore
>>> vmemmap for a list of folios.
>>> 
>>> This new routine must provide sufficient feedback about errors and
>>> actual restoration performed so that update_and_free_pages_bulk can
>>> perform optimally.
>>> 
>>> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
>>> ---
>>>  mm/hugetlb.c         | 36 ++++++++++++++++++------------------
>>>  mm/hugetlb_vmemmap.c | 37 +++++++++++++++++++++++++++++++++++++
>>>  mm/hugetlb_vmemmap.h | 11 +++++++++++
>>>  3 files changed, 66 insertions(+), 18 deletions(-)
>>> 
>>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>>> index d6f3db3c1313..814bb1982274 100644
>>> --- a/mm/hugetlb.c
>>> +++ b/mm/hugetlb.c
>>> @@ -1836,36 +1836,36 @@ static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio,
>>>  static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
>>>  {
>>> + int ret;
>>> + unsigned long restored;
>>>   struct folio *folio, *t_folio;
>>> - bool clear_dtor = false;
>>>   /*
>>> -  * First allocate required vmemmmap (if necessary) for all folios on
>>> -  * list.  If vmemmap can not be allocated, we can not free folio to
>>> -  * lower level allocator, so add back as hugetlb surplus page.
>>> -  * add_hugetlb_folio() removes the page from THIS list.
>>> -  * Use clear_dtor to note if vmemmap was successfully allocated for
>>> -  * ANY page on the list.
>>> +  * First allocate required vmemmmap (if necessary) for all folios.
>>>    */
>>> - list_for_each_entry_safe(folio, t_folio, list, lru) {
>>> - if (folio_test_hugetlb_vmemmap_optimized(folio)) {
>>> - if (hugetlb_vmemmap_restore(h, &folio->page)) {
>>> - spin_lock_irq(&hugetlb_lock);
>>> + ret = hugetlb_vmemmap_restore_folios(h, list, &restored);
>>> +
>>> + /*
>>> +  * If there was an error restoring vmemmap for ANY folios on the list,
>>> +  * add them back as surplus hugetlb pages.  add_hugetlb_folio() removes
>>> +  * the folio from THIS list.
>>> +  */
>>> + if (ret < 0) {
>>> + spin_lock_irq(&hugetlb_lock);
>>> + list_for_each_entry_safe(folio, t_folio, list, lru)
>>> + if (folio_test_hugetlb_vmemmap_optimized(folio))
>>>   add_hugetlb_folio(h, folio, true);
>>> - spin_unlock_irq(&hugetlb_lock);
>>> - } else
>>> - clear_dtor = true;
>>> - }
>>> + spin_unlock_irq(&hugetlb_lock);
>>>   }
>>>   /*
>>> -  * If vmemmmap allocation was performed on any folio above, take lock
>>> -  * to clear destructor of all folios on list.  This avoids the need to
>>> +  * If vmemmmap allocation was performed on ANY folio , take lock to
>>> +  * clear destructor of all folios on list.  This avoids the need to
>>>    * lock/unlock for each individual folio.
>>>    * The assumption is vmemmap allocation was performed on all or none
>>>    * of the folios on the list.  This is true expect in VERY rare cases.
>>>    */
>>> - if (clear_dtor) {
>>> + if (restored) {
>>>   spin_lock_irq(&hugetlb_lock);
>>>   list_for_each_entry(folio, list, lru)
>>>   __clear_hugetlb_destructor(h, folio);
>>> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
>>> index 4558b814ffab..463a4037ec6e 100644
>>> --- a/mm/hugetlb_vmemmap.c
>>> +++ b/mm/hugetlb_vmemmap.c
>>> @@ -480,6 +480,43 @@ int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head)
>>>   return ret;
>>>  }
>>> +/**
>>> + * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list.
>>> + * @h: struct hstate.
>>> + * @folio_list: list of folios.
>>> + * @restored: Set to number of folios for which vmemmap was restored
>>> + * successfully if caller passes a non-NULL pointer.
>>> + *
>>> + * Return: %0 if vmemmap exists for all folios on the list.  If an error is
>>> + * encountered restoring vmemmap for ANY folio, an error code
>>> + * will be returned to the caller.  It is then the responsibility
>>> + * of the caller to check the hugetlb vmemmap optimized flag of
>>> + * each folio to determine if vmemmap was actually restored.
>>> + */
>>> +int hugetlb_vmemmap_restore_folios(const struct hstate *h,
>>> + struct list_head *folio_list,
>>> + unsigned long *restored)
>>> +{
>>> + unsigned long num_restored;
>>> + struct folio *folio;
>>> + int ret = 0, t_ret;
>>> +
>>> + num_restored = 0;
>>> + list_for_each_entry(folio, folio_list, lru) {
>>> + if (folio_test_hugetlb_vmemmap_optimized(folio)) {
>>> + t_ret = hugetlb_vmemmap_restore(h, &folio->page);
>> 
>> I still think we should free a non-optimized HugeTLB page if we
>> encounter an OOM situation instead of continue to restore
>> vemmmap pages. Restoring vmemmmap pages will only aggravate
>> the OOM situation. The suitable appraoch is to free a non-optimized
>> HugeTLB page to satisfy our allocation of vmemmap pages, what's
>> your opinion, Mike?
> 
> I agree.
> 
> As you mentioned previously, this may complicate this code path a bit.
> I will rewrite to make this happen.

Maybe we could introduce two lists passed to update_and_free_pages_bulk (this
will be easy for the callers of it): one for non-optimized huge pages,
another for optimized ones. In update_and_free_pages_bulk, we could first
free those non-optimized huge pages, and then restore vmemmap pages for
those optimized ones, in which case the code could be simple.
hugetlb_vmemmap_restore_folios() does not need to add complexity; it can
still continue to restore vmemmap pages and will stop once we encounter
an OOM situation.

Thanks.

> -- 
> Mike Kravetz
Muchun Song Sept. 20, 2023, 3:03 a.m. UTC | #4
> On Sep 20, 2023, at 10:56, Muchun Song <muchun.song@linux.dev> wrote:
> 
> 
> 
>> On Sep 20, 2023, at 04:57, Mike Kravetz <mike.kravetz@oracle.com> wrote:
>> 
>> On 09/19/23 17:52, Muchun Song wrote:
>>> 
>>> 
>>> On 2023/9/19 07:01, Mike Kravetz wrote:
>>>> The routine update_and_free_pages_bulk already performs vmemmap
>>>> restoration on the list of hugetlb pages in a separate step.  In
>>>> preparation for more functionality to be added in this step, create a
>>>> new routine hugetlb_vmemmap_restore_folios() that will restore
>>>> vmemmap for a list of folios.
>>>> 
>>>> This new routine must provide sufficient feedback about errors and
>>>> actual restoration performed so that update_and_free_pages_bulk can
>>>> perform optimally.
>>>> 
>>>> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
>>>> ---
>>>> mm/hugetlb.c         | 36 ++++++++++++++++++------------------
>>>> mm/hugetlb_vmemmap.c | 37 +++++++++++++++++++++++++++++++++++++
>>>> mm/hugetlb_vmemmap.h | 11 +++++++++++
>>>> 3 files changed, 66 insertions(+), 18 deletions(-)
>>>> 
>>>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>>>> index d6f3db3c1313..814bb1982274 100644
>>>> --- a/mm/hugetlb.c
>>>> +++ b/mm/hugetlb.c
>>>> @@ -1836,36 +1836,36 @@ static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio,
>>>> static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
>>>> {
>>>> + int ret;
>>>> + unsigned long restored;
>>>>  struct folio *folio, *t_folio;
>>>> - bool clear_dtor = false;
>>>>  /*
>>>> -  * First allocate required vmemmmap (if necessary) for all folios on
>>>> -  * list.  If vmemmap can not be allocated, we can not free folio to
>>>> -  * lower level allocator, so add back as hugetlb surplus page.
>>>> -  * add_hugetlb_folio() removes the page from THIS list.
>>>> -  * Use clear_dtor to note if vmemmap was successfully allocated for
>>>> -  * ANY page on the list.
>>>> +  * First allocate required vmemmmap (if necessary) for all folios.
>>>>   */
>>>> - list_for_each_entry_safe(folio, t_folio, list, lru) {
>>>> - if (folio_test_hugetlb_vmemmap_optimized(folio)) {
>>>> - if (hugetlb_vmemmap_restore(h, &folio->page)) {
>>>> - spin_lock_irq(&hugetlb_lock);
>>>> + ret = hugetlb_vmemmap_restore_folios(h, list, &restored);
>>>> +
>>>> + /*
>>>> +  * If there was an error restoring vmemmap for ANY folios on the list,
>>>> +  * add them back as surplus hugetlb pages.  add_hugetlb_folio() removes
>>>> +  * the folio from THIS list.
>>>> +  */
>>>> + if (ret < 0) {
>>>> + spin_lock_irq(&hugetlb_lock);
>>>> + list_for_each_entry_safe(folio, t_folio, list, lru)
>>>> + if (folio_test_hugetlb_vmemmap_optimized(folio))
>>>>  add_hugetlb_folio(h, folio, true);
>>>> - spin_unlock_irq(&hugetlb_lock);
>>>> - } else
>>>> - clear_dtor = true;
>>>> - }
>>>> + spin_unlock_irq(&hugetlb_lock);
>>>>  }
>>>>  /*
>>>> -  * If vmemmmap allocation was performed on any folio above, take lock
>>>> -  * to clear destructor of all folios on list.  This avoids the need to
>>>> +  * If vmemmmap allocation was performed on ANY folio , take lock to
>>>> +  * clear destructor of all folios on list.  This avoids the need to
>>>>   * lock/unlock for each individual folio.
>>>>   * The assumption is vmemmap allocation was performed on all or none
>>>>   * of the folios on the list.  This is true expect in VERY rare cases.
>>>>   */
>>>> - if (clear_dtor) {
>>>> + if (restored) {
>>>>  spin_lock_irq(&hugetlb_lock);
>>>>  list_for_each_entry(folio, list, lru)
>>>>  __clear_hugetlb_destructor(h, folio);
>>>> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
>>>> index 4558b814ffab..463a4037ec6e 100644
>>>> --- a/mm/hugetlb_vmemmap.c
>>>> +++ b/mm/hugetlb_vmemmap.c
>>>> @@ -480,6 +480,43 @@ int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head)
>>>>  return ret;
>>>> }
>>>> +/**
>>>> + * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list.
>>>> + * @h: struct hstate.
>>>> + * @folio_list: list of folios.
>>>> + * @restored: Set to number of folios for which vmemmap was restored
>>>> + * successfully if caller passes a non-NULL pointer.
>>>> + *
>>>> + * Return: %0 if vmemmap exists for all folios on the list.  If an error is
>>>> + * encountered restoring vmemmap for ANY folio, an error code
>>>> + * will be returned to the caller.  It is then the responsibility
>>>> + * of the caller to check the hugetlb vmemmap optimized flag of
>>>> + * each folio to determine if vmemmap was actually restored.
>>>> + */
>>>> +int hugetlb_vmemmap_restore_folios(const struct hstate *h,
>>>> + struct list_head *folio_list,
>>>> + unsigned long *restored)
>>>> +{
>>>> + unsigned long num_restored;
>>>> + struct folio *folio;
>>>> + int ret = 0, t_ret;
>>>> +
>>>> + num_restored = 0;
>>>> + list_for_each_entry(folio, folio_list, lru) {
>>>> + if (folio_test_hugetlb_vmemmap_optimized(folio)) {
>>>> + t_ret = hugetlb_vmemmap_restore(h, &folio->page);
>>> 
>>> I still think we should free a non-optimized HugeTLB page if we
>>> encounter an OOM situation instead of continue to restore
>>> vemmmap pages. Restoring vmemmmap pages will only aggravate
>>> the OOM situation. The suitable appraoch is to free a non-optimized
>>> HugeTLB page to satisfy our allocation of vmemmap pages, what's
>>> your opinion, Mike?
>> 
>> I agree.
>> 
>> As you mentioned previously, this may complicate this code path a bit.
>> I will rewrite to make this happen.
> 
> Maybe we could introduced two list passed to update_and_free_pages_bulk (this
> will be easy for the callers of it), one is for non-optimized huge page,
> another is optimized one. In update_and_free_pages_bulk, we could first
> free those non-optimized huge page, and then restore vemmmap pages for
> those optimized ones, in which case, the code could be simple.
> hugetlb_vmemmap_restore_folios() dose not need to add complexity, which
> still continue to restore vmemmap pages and will stop once we encounter
> an OOM situation.

BTW, maybe we should try again iff there are some non-optimized huge pages
whose vmemmap pages were restored successfully previously and could be freed
first, then continue to restore the vmemmap pages of the remaining huge pages.
I think the retry code could be done in update_and_free_pages_bulk() as well.

> 
> Thanks.
> 
>> -- 
>> Mike Kravetz
Mike Kravetz Sept. 21, 2023, 1:12 a.m. UTC | #5
On 09/20/23 11:03, Muchun Song wrote:
> > On Sep 20, 2023, at 10:56, Muchun Song <muchun.song@linux.dev> wrote:
> >> On Sep 20, 2023, at 04:57, Mike Kravetz <mike.kravetz@oracle.com> wrote:
> >> On 09/19/23 17:52, Muchun Song wrote:
> >>> On 2023/9/19 07:01, Mike Kravetz wrote:
> >>> 
> >>> I still think we should free a non-optimized HugeTLB page if we
> >>> encounter an OOM situation instead of continue to restore
> >>> vemmmap pages. Restoring vmemmmap pages will only aggravate
> >>> the OOM situation. The suitable appraoch is to free a non-optimized
> >>> HugeTLB page to satisfy our allocation of vmemmap pages, what's
> >>> your opinion, Mike?
> >> 
> >> I agree.
> >> 
> >> As you mentioned previously, this may complicate this code path a bit.
> >> I will rewrite to make this happen.
> > 
> > Maybe we could introduced two list passed to update_and_free_pages_bulk (this
> > will be easy for the callers of it), one is for non-optimized huge page,
> > another is optimized one. In update_and_free_pages_bulk, we could first
> > free those non-optimized huge page, and then restore vemmmap pages for
> > those optimized ones, in which case, the code could be simple.
> > hugetlb_vmemmap_restore_folios() dose not need to add complexity, which
> > still continue to restore vmemmap pages and will stop once we encounter
> > an OOM situation.

I am not sure if passing in optimized and non-optimized lists to
update_and_free_pages_bulk will help much.  IIUC, it will almost always
be the case where only one list has entries.  Is that mostly accurate?

> BTW, maybe we should try again iff there are some non-optimized huge page
> whose vmemmap pages are restored successfully previously and could be freed
> first, then continue to restore the vmemmap pages of the remaining huge pages.
> I think the retry code could be done in update_and_free_pages_bulk() as well.

I came up with a new routine to handle these ENOMEM returns from
hugetlb_vmemmap_restore_folios.  I 'think' it handles these situations.
Here is an updated version of this patch.  Sorry, diff makes it a bit
hard to read.

From b13bdccb01730f995191944769f87d0725c289ad Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Sun, 10 Sep 2023 16:14:50 -0700
Subject: [PATCH] hugetlb: perform vmemmap restoration on a list of pages

The routine update_and_free_pages_bulk already performs vmemmap
restoration on the list of hugetlb pages in a separate step.  In
preparation for more functionality to be added in this step, create a
new routine hugetlb_vmemmap_restore_folios() that will restore
vmemmap for a list of folios.

This new routine must provide sufficient feedback about errors and
actual restoration performed so that update_and_free_pages_bulk can
perform optimally.

Special care must be taken when encountering a ENOMEM error from
hugetlb_vmemmap_restore_folios.  We want to continue making as much
forward progress as possible.  A new routine bulk_vmemmap_restore_enomem
handles this specific situation.

Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
---
 mm/hugetlb.c         | 83 ++++++++++++++++++++++++++++++++++----------
 mm/hugetlb_vmemmap.c | 39 +++++++++++++++++++++
 mm/hugetlb_vmemmap.h | 11 ++++++
 3 files changed, 115 insertions(+), 18 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 70fedf8682c4..52abe56cf38a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1834,38 +1834,85 @@ static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio,
 		schedule_work(&free_hpage_work);
 }
 
-static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
+static void bulk_vmemmap_restore_enomem(struct hstate *h,
+						struct list_head *list,
+						unsigned long restored)
 {
 	struct folio *folio, *t_folio;
-	bool clear_dtor = false;
 
-	/*
-	 * First allocate required vmemmmap (if necessary) for all folios on
-	 * list.  If vmemmap can not be allocated, we can not free folio to
-	 * lower level allocator, so add back as hugetlb surplus page.
-	 * add_hugetlb_folio() removes the page from THIS list.
-	 * Use clear_dtor to note if vmemmap was successfully allocated for
-	 * ANY page on the list.
-	 */
-	list_for_each_entry_safe(folio, t_folio, list, lru) {
-		if (folio_test_hugetlb_vmemmap_optimized(folio)) {
+	if (restored) {
+		/*
+		 * On ENOMEM error, free any restored hugetlb pages so that
+		 * restore of the entire list can be retried.
+		 * The idea is that by freeing hugetlb pages with vmemmap
+		 * (those previously restored) we will free up memory so that
+		 * we can allocate vmemmap for more hugetlb pages.
+		 * We must examine and possibly free EVERY hugetlb page on list
+		 * in order to call hugetlb_vmemmap_restore_folios again.
+		 * This is not optimal, but is an error case that should not
+		 * happen frequently.
+		 */
+		list_for_each_entry_safe(folio, t_folio, list, lru)
+			if (!folio_test_hugetlb_vmemmap_optimized(folio)) {
+				list_del(&folio->lru);
+				spin_lock_irq(&hugetlb_lock);
+				__clear_hugetlb_destructor(h, folio);
+				spin_unlock_irq(&hugetlb_lock);
+				update_and_free_hugetlb_folio(h, folio, false);
+				cond_resched();
+			}
+	} else {
+		/*
+		 * In the case where vmemmap was not restored for ANY folios,
+		 * we loop through them trying to restore individually in the
+		 * hope that someone elsewhere may free enough memory.
+		 * If unable to restore a page, the hugetlb page is made a
+		 * surplus page and removed from the list.
+		 * If we are able to restore vmemmap for one hugetlb page, we free
+		 * it and quit processing the list to retry the bulk operation.
+		 */
+		list_for_each_entry_safe(folio, t_folio, list, lru)
 			if (hugetlb_vmemmap_restore(h, &folio->page)) {
 				spin_lock_irq(&hugetlb_lock);
 				add_hugetlb_folio(h, folio, true);
 				spin_unlock_irq(&hugetlb_lock);
-			} else
-				clear_dtor = true;
-		}
+			} else {
+				list_del(&folio->lru);
+				spin_lock_irq(&hugetlb_lock);
+				__clear_hugetlb_destructor(h, folio);
+				spin_unlock_irq(&hugetlb_lock);
+				update_and_free_hugetlb_folio(h, folio, false);
+				break;
+			}
 	}
+}
+
+static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
+{
+	int ret;
+	unsigned long restored;
+	struct folio *folio, *t_folio;
 
 	/*
-	 * If vmemmmap allocation was performed on any folio above, take lock
-	 * to clear destructor of all folios on list.  This avoids the need to
+	 * First allocate required vmemmap (if necessary) for all folios.
+	 * Carefully handle ENOMEM errors and free up any available hugetlb
+	 * pages in order to make forward progress.
+	 */
+retry:
+	ret = hugetlb_vmemmap_restore_folios(h, list, &restored);
+	if (ret == -ENOMEM) {
+		bulk_vmemmap_restore_enomem(h, list, restored);
+		goto retry;
+	}
+
+	/*
+	 * If vmemmap allocation was performed on ANY folio, take lock to
+	 * clear destructor of all folios on list.  This avoids the need to
 	 * lock/unlock for each individual folio.
 	 * The assumption is vmemmap allocation was performed on all or none
 	 * of the folios on the list.  This is true expect in VERY rare cases.
 	 */
-	if (clear_dtor) {
+	if (restored) {
 		spin_lock_irq(&hugetlb_lock);
 		list_for_each_entry(folio, list, lru)
 			__clear_hugetlb_destructor(h, folio);
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 4558b814ffab..cc91edbfb68b 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -480,6 +480,45 @@ int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head)
 	return ret;
 }
 
+/**
+ * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list.
+ * @h:		struct hstate.
+ * @folio_list:	list of folios.
+ * @restored:	Set to number of folios for which vmemmap was restored
+ *		successfully if caller passes a non-NULL pointer.
+ *
+ * Return: %0 if vmemmap exists for all folios on the list.  If an error is
+ *		encountered restoring vmemmap for ANY folio, an error code
+ *		will be returned to the caller.  It is then the responsibility
+ *		of the caller to check the hugetlb vmemmap optimized flag of
+ *		each folio to determine if vmemmap was actually restored.
+ *		Note that processing is stopped when first error is encountered.
+ */
+int hugetlb_vmemmap_restore_folios(const struct hstate *h,
+					struct list_head *folio_list,
+					unsigned long *restored)
+{
+	unsigned long num_restored;
+	struct folio *folio;
+	int ret = 0;
+
+	num_restored = 0;
+	list_for_each_entry(folio, folio_list, lru) {
+		if (folio_test_hugetlb_vmemmap_optimized(folio)) {
+			ret = hugetlb_vmemmap_restore(h, &folio->page);
+			if (ret)
+				goto out;
+			else
+				num_restored++;
+		}
+	}
+
+out:
+	if (restored)
+		*restored = num_restored;
+	return ret;
+}
+
 /* Return true iff a HugeTLB whose vmemmap should and can be optimized. */
 static bool vmemmap_should_optimize(const struct hstate *h, const struct page *head)
 {
diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
index c512e388dbb4..bb58453c3cc0 100644
--- a/mm/hugetlb_vmemmap.h
+++ b/mm/hugetlb_vmemmap.h
@@ -19,6 +19,8 @@
 
 #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head);
+int hugetlb_vmemmap_restore_folios(const struct hstate *h,
+			struct list_head *folio_list, unsigned long *restored);
 void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head);
 void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list);
 
@@ -45,6 +47,15 @@ static inline int hugetlb_vmemmap_restore(const struct hstate *h, struct page *h
 	return 0;
 }
 
+static inline int hugetlb_vmemmap_restore_folios(const struct hstate *h,
+					struct list_head *folio_list,
+					unsigned long *restored)
+{
+	if (restored)
+		*restored = 0;
+	return 0;
+}
+
 static inline void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
 {
 }
Muchun Song Sept. 21, 2023, 9:31 a.m. UTC | #6
On 2023/9/21 09:12, Mike Kravetz wrote:
> On 09/20/23 11:03, Muchun Song wrote:
>>> On Sep 20, 2023, at 10:56, Muchun Song <muchun.song@linux.dev> wrote:
>>>> On Sep 20, 2023, at 04:57, Mike Kravetz <mike.kravetz@oracle.com> wrote:
>>>> On 09/19/23 17:52, Muchun Song wrote:
>>>>> On 2023/9/19 07:01, Mike Kravetz wrote:
>>>>>
>>>>> I still think we should free a non-optimized HugeTLB page if we
>>>>> encounter an OOM situation instead of continue to restore
>>>>> vemmmap pages. Restoring vmemmmap pages will only aggravate
>>>>> the OOM situation. The suitable appraoch is to free a non-optimized
>>>>> HugeTLB page to satisfy our allocation of vmemmap pages, what's
>>>>> your opinion, Mike?
>>>> I agree.
>>>>
>>>> As you mentioned previously, this may complicate this code path a bit.
>>>> I will rewrite to make this happen.
>>> Maybe we could introduced two list passed to update_and_free_pages_bulk (this
>>> will be easy for the callers of it), one is for non-optimized huge page,
>>> another is optimized one. In update_and_free_pages_bulk, we could first
>>> free those non-optimized huge page, and then restore vemmmap pages for
>>> those optimized ones, in which case, the code could be simple.
>>> hugetlb_vmemmap_restore_folios() dose not need to add complexity, which
>>> still continue to restore vmemmap pages and will stop once we encounter
>>> an OOM situation.
> I am not sure if passing in optimized and non-optimized lists to
> update_and_free_pages_bulk will help much.  IIUC, it will almost always
> be the case where only one list has entries.  Is that mostly accurate?

I think you are right. It will be less helpful since most of
pages will be not optimized when HVO is enabled.

>> BTW, maybe we should try again iff there are some non-optimized huge page
>> whose vmemmap pages are restored successfully previously and could be freed
>> first, then continue to restore the vmemmap pages of the remaining huge pages.
>> I think the retry code could be done in update_and_free_pages_bulk() as well.
> I came up with a new routine to handle these ENOMEM returns from
> hugetlb_vmemmap_restore_folios.  I 'think' it handles these situations.
> Here is an updated version of this patch.  Sorry, diff makes it a bit
> hard to read.
>
>  From b13bdccb01730f995191944769f87d0725c289ad Mon Sep 17 00:00:00 2001
> From: Mike Kravetz <mike.kravetz@oracle.com>
> Date: Sun, 10 Sep 2023 16:14:50 -0700
> Subject: [PATCH] hugetlb: perform vmemmap restoration on a list of pages
>
> The routine update_and_free_pages_bulk already performs vmemmap
> restoration on the list of hugetlb pages in a separate step.  In
> preparation for more functionality to be added in this step, create a
> new routine hugetlb_vmemmap_restore_folios() that will restore
> vmemmap for a list of folios.
>
> This new routine must provide sufficient feedback about errors and
> actual restoration performed so that update_and_free_pages_bulk can
> perform optimally.
>
> Special care must be taken when encountering a ENOMEM error from
> hugetlb_vmemmap_restore_folios.  We want to continue making as much
> forward progress as possible.  A new routine bulk_vmemmap_restore_enomem
> handles this specific situation.
>
> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
> ---
>   mm/hugetlb.c         | 83 ++++++++++++++++++++++++++++++++++----------
>   mm/hugetlb_vmemmap.c | 39 +++++++++++++++++++++
>   mm/hugetlb_vmemmap.h | 11 ++++++
>   3 files changed, 115 insertions(+), 18 deletions(-)
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 70fedf8682c4..52abe56cf38a 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1834,38 +1834,85 @@ static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio,
>   		schedule_work(&free_hpage_work);
>   }
>   
> -static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
> +static void bulk_vmemmap_restore_enomem(struct hstate *h,
> +						struct list_head *list,
> +						unsigned long restored)
>   {
>   	struct folio *folio, *t_folio;
> -	bool clear_dtor = false;
>   
> -	/*
> -	 * First allocate required vmemmmap (if necessary) for all folios on
> -	 * list.  If vmemmap can not be allocated, we can not free folio to
> -	 * lower level allocator, so add back as hugetlb surplus page.
> -	 * add_hugetlb_folio() removes the page from THIS list.
> -	 * Use clear_dtor to note if vmemmap was successfully allocated for
> -	 * ANY page on the list.
> -	 */
> -	list_for_each_entry_safe(folio, t_folio, list, lru) {
> -		if (folio_test_hugetlb_vmemmap_optimized(folio)) {
> +	if (restored) {
> +		/*
> +		 * On ENOMEM error, free any restored hugetlb pages so that
> +		 * restore of the entire list can be retried.
> +		 * The idea is that by freeing hugetlb pages with vmemmap
> +		 * (those previously restored) we will free up memory so that
> +		 * we can allocate vmemmap for more hugetlb pages.
> +		 * We must examine and possibly free EVERY hugetlb page on list
> +		 * in order to call hugetlb_vmemmap_restore_folios again.
> +		 * This is not optimal, but is an error case that should not
> +		 * happen frequently.
> +		 */
> +		list_for_each_entry_safe(folio, t_folio, list, lru)
> +			if (!folio_test_hugetlb_vmemmap_optimized(folio)) {
> +				list_del(&folio->lru);
> +				spin_lock_irq(&hugetlb_lock);
> +				__clear_hugetlb_destructor(h, folio);
> +				spin_unlock_irq(&hugetlb_lock);
> +				update_and_free_hugetlb_folio(h, folio, false);
> +				cond_resched();
> +			}
> +	} else {
> +		/*
> +		 * In the case where vmemmap was not restored for ANY folios,
> +		 * we loop through them trying to restore individually in the
> +		 * hope that someone elsewhere may free enough memory.
> +		 * If unable to restore a page, the hugetlb page is made a
> +		 * surplus page and removed from the list.
> +		 * If are able to restore vmemmap for one hugetlb page, we free
> +		 * it and quit processing the list to retry the bulk operation.
> +		 */
> +		list_for_each_entry_safe(folio, t_folio, list, lru)
>   			if (hugetlb_vmemmap_restore(h, &folio->page)) {
>   				spin_lock_irq(&hugetlb_lock);
>   				add_hugetlb_folio(h, folio, true);
>   				spin_unlock_irq(&hugetlb_lock);
> -			} else
> -				clear_dtor = true;
> -		}
> +			} else {
> +				list_del(&folio->lru);
> +				spin_lock_irq(&hugetlb_lock);
> +				__clear_hugetlb_destructor(h, folio);
> +				spin_unlock_irq(&hugetlb_lock);
> +				update_and_free_hugetlb_folio(h, folio, false);
> +				break;
> +			}
>   	}
> +}
> +
> +static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
> +{
> +	int ret;
> +	unsigned long restored;
> +	struct folio *folio, *t_folio;
>   
>   	/*
> -	 * If vmemmmap allocation was performed on any folio above, take lock
> -	 * to clear destructor of all folios on list.  This avoids the need to
> +	 * First allocate required vmemmmap (if necessary) for all folios.
> +	 * Carefully handle ENOMEM errors and free up any available hugetlb
> +	 * pages in order to make forward progress.
> +	 */
> +retry:
> +	ret = hugetlb_vmemmap_restore_folios(h, list, &restored);
> +	if (ret == -ENOMEM) {
> +		bulk_vmemmap_restore_enomem(h, list, restored);
> +		goto retry;
> +	}
> +
> +	/*
> +	 * If vmemmmap allocation was performed on ANY folio , take lock to
> +	 * clear destructor of all folios on list.  This avoids the need to
>   	 * lock/unlock for each individual folio.
>   	 * The assumption is vmemmap allocation was performed on all or none
>   	 * of the folios on the list.  This is true expect in VERY rare cases.
>   	 */
> -	if (clear_dtor) {
> +	if (restored) {
>   		spin_lock_irq(&hugetlb_lock);
>   		list_for_each_entry(folio, list, lru)
>   			__clear_hugetlb_destructor(h, folio);
> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> index 4558b814ffab..cc91edbfb68b 100644
> --- a/mm/hugetlb_vmemmap.c
> +++ b/mm/hugetlb_vmemmap.c
> @@ -480,6 +480,45 @@ int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head)
>   	return ret;
>   }
>   
> +/**
> + * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list.
> + * @h:		struct hstate.
> + * @folio_list:	list of folios.
> + * @restored:	Set to number of folios for which vmemmap was restored
> + *		successfully if caller passes a non-NULL pointer.
> + *
> + * Return: %0 if vmemmap exists for all folios on the list.  If an error is
> + *		encountered restoring vmemmap for ANY folio, an error code
> + *		will be returned to the caller.  It is then the responsibility
> + *		of the caller to check the hugetlb vmemmap optimized flag of
> + *		each folio to determine if vmemmap was actually restored.
> + *		Note that processing is stopped when first error is encountered.
> + */
> +int hugetlb_vmemmap_restore_folios(const struct hstate *h,
> +					struct list_head *folio_list,
> +					unsigned long *restored)

How about changing parameter of @restored to a list_head type which
returns the non-optimized (previously) or vmemmap-restored-sucessful 
huge pages?
In which case, the caller could traverse this returned list to free
them first like you have implemented in bulk_vmemmap_restore_enomem(),
it will be more efficient. The meaning of returned value should also
be changed accordingly since update_and_free_pages_bulk() wants to
whether there is a vmemmap-optimized huge page being restored sucessfully
to determine if it should clear hugetlb flag. So 
hugetlb_vmemmap_restore_folios()
could return how many huge pages being restored successful, if a negative
number is returned meaning there is some error in the process of restoring
of vmemmap.

Thanks.

> +{
> +	unsigned long num_restored;
> +	struct folio *folio;
> +	int ret = 0;
> +
> +	num_restored = 0;
> +	list_for_each_entry(folio, folio_list, lru) {
> +		if (folio_test_hugetlb_vmemmap_optimized(folio)) {
> +			ret = hugetlb_vmemmap_restore(h, &folio->page);
> +			if (ret)
> +				goto out;
> +			else
> +				num_restored++;
> +		}
> +	}
> +
> +out:
> +	if (*restored)
> +		*restored = num_restored;
> +	return ret;
> +}
> +
>   /* Return true iff a HugeTLB whose vmemmap should and can be optimized. */
>   static bool vmemmap_should_optimize(const struct hstate *h, const struct page *head)
>   {
> diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
> index c512e388dbb4..bb58453c3cc0 100644
> --- a/mm/hugetlb_vmemmap.h
> +++ b/mm/hugetlb_vmemmap.h
> @@ -19,6 +19,8 @@
>   
>   #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
>   int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head);
> +int hugetlb_vmemmap_restore_folios(const struct hstate *h,
> +			struct list_head *folio_list, unsigned long *restored);
>   void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head);
>   void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list);
>   
> @@ -45,6 +47,15 @@ static inline int hugetlb_vmemmap_restore(const struct hstate *h, struct page *h
>   	return 0;
>   }
>   
> +static inline int hugetlb_vmemmap_restore_folios(const struct hstate *h,
> +					struct list_head *folio_list,
> +					unsigned long *restored)
> +{
> +	if (restored)
> +		*restored = 0;
> +	return 0;
> +}
> +
>   static inline void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
>   {
>   }
Muchun Song Sept. 21, 2023, 9:47 a.m. UTC | #7
On 2023/9/21 17:31, Muchun Song wrote:
>
>
> On 2023/9/21 09:12, Mike Kravetz wrote:
>> On 09/20/23 11:03, Muchun Song wrote:
>>>> On Sep 20, 2023, at 10:56, Muchun Song <muchun.song@linux.dev> wrote:
>>>>> On Sep 20, 2023, at 04:57, Mike Kravetz <mike.kravetz@oracle.com> 
>>>>> wrote:
>>>>> On 09/19/23 17:52, Muchun Song wrote:
>>>>>> On 2023/9/19 07:01, Mike Kravetz wrote:
>>>>>>
>>>>>> I still think we should free a non-optimized HugeTLB page if we
>>>>>> encounter an OOM situation instead of continue to restore
>>>>>> vemmmap pages. Restoring vmemmmap pages will only aggravate
>>>>>> the OOM situation. The suitable appraoch is to free a non-optimized
>>>>>> HugeTLB page to satisfy our allocation of vmemmap pages, what's
>>>>>> your opinion, Mike?
>>>>> I agree.
>>>>>
>>>>> As you mentioned previously, this may complicate this code path a 
>>>>> bit.
>>>>> I will rewrite to make this happen.
>>>> Maybe we could introduced two list passed to 
>>>> update_and_free_pages_bulk (this
>>>> will be easy for the callers of it), one is for non-optimized huge 
>>>> page,
>>>> another is optimized one. In update_and_free_pages_bulk, we could 
>>>> first
>>>> free those non-optimized huge page, and then restore vemmmap pages for
>>>> those optimized ones, in which case, the code could be simple.
>>>> hugetlb_vmemmap_restore_folios() dose not need to add complexity, 
>>>> which
>>>> still continue to restore vmemmap pages and will stop once we 
>>>> encounter
>>>> an OOM situation.
>> I am not sure if passing in optimized and non-optimized lists to
>> update_and_free_pages_bulk will help much.  IIUC, it will almost always
>> be the case where only one list has entries.  Is that mostly accurate?
>
> I think you are right. It will be less helpful since most of
> pages will be not optimized when HVO is enabled.

Sorry, correction: **not** should be deleted.

>
>>> BTW, maybe we should try again iff there are some non-optimized huge 
>>> page
>>> whose vmemmap pages are restored successfully previously and could 
>>> be freed
>>> first, then continue to restore the vmemmap pages of the remaining 
>>> huge pages.
>>> I think the retry code could be done in update_and_free_pages_bulk() 
>>> as well.
>> I came up with a new routine to handle these ENOMEM returns from
>> hugetlb_vmemmap_restore_folios.  I 'think' it handles these situations.
>> Here is an updated version of this patch.  Sorry, diff makes it a bit
>> hard to read.
>>
>>  From b13bdccb01730f995191944769f87d0725c289ad Mon Sep 17 00:00:00 2001
>> From: Mike Kravetz <mike.kravetz@oracle.com>
>> Date: Sun, 10 Sep 2023 16:14:50 -0700
>> Subject: [PATCH] hugetlb: perform vmemmap restoration on a list of pages
>>
>> The routine update_and_free_pages_bulk already performs vmemmap
>> restoration on the list of hugetlb pages in a separate step.  In
>> preparation for more functionality to be added in this step, create a
>> new routine hugetlb_vmemmap_restore_folios() that will restore
>> vmemmap for a list of folios.
>>
>> This new routine must provide sufficient feedback about errors and
>> actual restoration performed so that update_and_free_pages_bulk can
>> perform optimally.
>>
>> Special care must be taken when encountering a ENOMEM error from
>> hugetlb_vmemmap_restore_folios.  We want to continue making as much
>> forward progress as possible.  A new routine bulk_vmemmap_restore_enomem
>> handles this specific situation.
>>
>> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
>> ---
>>   mm/hugetlb.c         | 83 ++++++++++++++++++++++++++++++++++----------
>>   mm/hugetlb_vmemmap.c | 39 +++++++++++++++++++++
>>   mm/hugetlb_vmemmap.h | 11 ++++++
>>   3 files changed, 115 insertions(+), 18 deletions(-)
>>
>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>> index 70fedf8682c4..52abe56cf38a 100644
>> --- a/mm/hugetlb.c
>> +++ b/mm/hugetlb.c
>> @@ -1834,38 +1834,85 @@ static void 
>> update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio,
>>           schedule_work(&free_hpage_work);
>>   }
>>   -static void update_and_free_pages_bulk(struct hstate *h, struct 
>> list_head *list)
>> +static void bulk_vmemmap_restore_enomem(struct hstate *h,
>> +                        struct list_head *list,
>> +                        unsigned long restored)
>>   {
>>       struct folio *folio, *t_folio;
>> -    bool clear_dtor = false;
>>   -    /*
>> -     * First allocate required vmemmmap (if necessary) for all 
>> folios on
>> -     * list.  If vmemmap can not be allocated, we can not free folio to
>> -     * lower level allocator, so add back as hugetlb surplus page.
>> -     * add_hugetlb_folio() removes the page from THIS list.
>> -     * Use clear_dtor to note if vmemmap was successfully allocated for
>> -     * ANY page on the list.
>> -     */
>> -    list_for_each_entry_safe(folio, t_folio, list, lru) {
>> -        if (folio_test_hugetlb_vmemmap_optimized(folio)) {
>> +    if (restored) {
>> +        /*
>> +         * On ENOMEM error, free any restored hugetlb pages so that
>> +         * restore of the entire list can be retried.
>> +         * The idea is that by freeing hugetlb pages with vmemmap
>> +         * (those previously restored) we will free up memory so that
>> +         * we can allocate vmemmap for more hugetlb pages.
>> +         * We must examine and possibly free EVERY hugetlb page on list
>> +         * in order to call hugetlb_vmemmap_restore_folios again.
>> +         * This is not optimal, but is an error case that should not
>> +         * happen frequently.
>> +         */
>> +        list_for_each_entry_safe(folio, t_folio, list, lru)
>> +            if (!folio_test_hugetlb_vmemmap_optimized(folio)) {
>> +                list_del(&folio->lru);
>> +                spin_lock_irq(&hugetlb_lock);
>> +                __clear_hugetlb_destructor(h, folio);
>> +                spin_unlock_irq(&hugetlb_lock);
>> +                update_and_free_hugetlb_folio(h, folio, false);
>> +                cond_resched();
>> +            }
>> +    } else {
>> +        /*
>> +         * In the case where vmemmap was not restored for ANY folios,
>> +         * we loop through them trying to restore individually in the
>> +         * hope that someone elsewhere may free enough memory.
>> +         * If unable to restore a page, the hugetlb page is made a
>> +         * surplus page and removed from the list.
>> +         * If are able to restore vmemmap for one hugetlb page, we free
>> +         * it and quit processing the list to retry the bulk operation.
>> +         */
>> +        list_for_each_entry_safe(folio, t_folio, list, lru)
>>               if (hugetlb_vmemmap_restore(h, &folio->page)) {
>>                   spin_lock_irq(&hugetlb_lock);
>>                   add_hugetlb_folio(h, folio, true);
>>                   spin_unlock_irq(&hugetlb_lock);
>> -            } else
>> -                clear_dtor = true;
>> -        }
>> +            } else {
>> +                list_del(&folio->lru);
>> +                spin_lock_irq(&hugetlb_lock);
>> +                __clear_hugetlb_destructor(h, folio);
>> +                spin_unlock_irq(&hugetlb_lock);
>> +                update_and_free_hugetlb_folio(h, folio, false);
>> +                break;
>> +            }
>>       }
>> +}
>> +
>> +static void update_and_free_pages_bulk(struct hstate *h, struct 
>> list_head *list)
>> +{
>> +    int ret;
>> +    unsigned long restored;
>> +    struct folio *folio, *t_folio;
>>         /*
>> -     * If vmemmmap allocation was performed on any folio above, take 
>> lock
>> -     * to clear destructor of all folios on list.  This avoids the 
>> need to
>> +     * First allocate required vmemmmap (if necessary) for all folios.
>> +     * Carefully handle ENOMEM errors and free up any available hugetlb
>> +     * pages in order to make forward progress.
>> +     */
>> +retry:
>> +    ret = hugetlb_vmemmap_restore_folios(h, list, &restored);
>> +    if (ret == -ENOMEM) {
>> +        bulk_vmemmap_restore_enomem(h, list, restored);
>> +        goto retry;
>> +    }
>> +
>> +    /*
>> +     * If vmemmmap allocation was performed on ANY folio , take lock to
>> +     * clear destructor of all folios on list.  This avoids the need to
>>        * lock/unlock for each individual folio.
>>        * The assumption is vmemmap allocation was performed on all or 
>> none
>>        * of the folios on the list.  This is true expect in VERY rare 
>> cases.
>>        */
>> -    if (clear_dtor) {
>> +    if (restored) {
>>           spin_lock_irq(&hugetlb_lock);
>>           list_for_each_entry(folio, list, lru)
>>               __clear_hugetlb_destructor(h, folio);
>> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
>> index 4558b814ffab..cc91edbfb68b 100644
>> --- a/mm/hugetlb_vmemmap.c
>> +++ b/mm/hugetlb_vmemmap.c
>> @@ -480,6 +480,45 @@ int hugetlb_vmemmap_restore(const struct hstate 
>> *h, struct page *head)
>>       return ret;
>>   }
>>   +/**
>> + * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio 
>> on the list.
>> + * @h:        struct hstate.
>> + * @folio_list:    list of folios.
>> + * @restored:    Set to number of folios for which vmemmap was restored
>> + *        successfully if caller passes a non-NULL pointer.
>> + *
>> + * Return: %0 if vmemmap exists for all folios on the list.  If an 
>> error is
>> + *        encountered restoring vmemmap for ANY folio, an error code
>> + *        will be returned to the caller.  It is then the 
>> responsibility
>> + *        of the caller to check the hugetlb vmemmap optimized flag of
>> + *        each folio to determine if vmemmap was actually restored.
>> + *        Note that processing is stopped when first error is 
>> encountered.
>> + */
>> +int hugetlb_vmemmap_restore_folios(const struct hstate *h,
>> +                    struct list_head *folio_list,
>> +                    unsigned long *restored)
>
> How about changing parameter of @restored to a list_head type which
> returns the non-optimized (previously) or vmemmap-restored-sucessful 
> huge pages?
> In which case, the caller could traverse this returned list to free
> them first like you have implemented in bulk_vmemmap_restore_enomem(),
> it will be more efficient. The meaning of returned value should also
> be changed accordingly since update_and_free_pages_bulk() wants to
> whether there is a vmemmap-optimized huge page being restored sucessfully
> to determine if it should clear hugetlb flag. So 
> hugetlb_vmemmap_restore_folios()
> could return how many huge pages being restored successful, if a negative
> number is returned meaning there is some error in the process of 
> restoring
> of vmemmap.
>
> Thanks.
>
>> +{
>> +    unsigned long num_restored;
>> +    struct folio *folio;
>> +    int ret = 0;
>> +
>> +    num_restored = 0;
>> +    list_for_each_entry(folio, folio_list, lru) {
>> +        if (folio_test_hugetlb_vmemmap_optimized(folio)) {
>> +            ret = hugetlb_vmemmap_restore(h, &folio->page);
>> +            if (ret)
>> +                goto out;
>> +            else
>> +                num_restored++;
>> +        }
>> +    }
>> +
>> +out:
>> +    if (*restored)
>> +        *restored = num_restored;
>> +    return ret;
>> +}
>> +
>>   /* Return true iff a HugeTLB whose vmemmap should and can be 
>> optimized. */
>>   static bool vmemmap_should_optimize(const struct hstate *h, const 
>> struct page *head)
>>   {
>> diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
>> index c512e388dbb4..bb58453c3cc0 100644
>> --- a/mm/hugetlb_vmemmap.h
>> +++ b/mm/hugetlb_vmemmap.h
>> @@ -19,6 +19,8 @@
>>     #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
>>   int hugetlb_vmemmap_restore(const struct hstate *h, struct page 
>> *head);
>> +int hugetlb_vmemmap_restore_folios(const struct hstate *h,
>> +            struct list_head *folio_list, unsigned long *restored);
>>   void hugetlb_vmemmap_optimize(const struct hstate *h, struct page 
>> *head);
>>   void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct 
>> list_head *folio_list);
>>   @@ -45,6 +47,15 @@ static inline int hugetlb_vmemmap_restore(const 
>> struct hstate *h, struct page *h
>>       return 0;
>>   }
>>   +static inline int hugetlb_vmemmap_restore_folios(const struct 
>> hstate *h,
>> +                    struct list_head *folio_list,
>> +                    unsigned long *restored)
>> +{
>> +    if (restored)
>> +        *restored = 0;
>> +    return 0;
>> +}
>> +
>>   static inline void hugetlb_vmemmap_optimize(const struct hstate *h, 
>> struct page *head)
>>   {
>>   }
>
Mike Kravetz Sept. 21, 2023, 9:58 p.m. UTC | #8
On 09/21/23 17:31, Muchun Song wrote:
> 
> 
> On 2023/9/21 09:12, Mike Kravetz wrote:
> > On 09/20/23 11:03, Muchun Song wrote:
> > > > On Sep 20, 2023, at 10:56, Muchun Song <muchun.song@linux.dev> wrote:
> > > > > On Sep 20, 2023, at 04:57, Mike Kravetz <mike.kravetz@oracle.com> wrote:
> > > > > On 09/19/23 17:52, Muchun Song wrote:
> > > > > > On 2023/9/19 07:01, Mike Kravetz wrote:
> > +/**
> > + * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list.
> > + * @h:		struct hstate.
> > + * @folio_list:	list of folios.
> > + * @restored:	Set to number of folios for which vmemmap was restored
> > + *		successfully if caller passes a non-NULL pointer.
> > + *
> > + * Return: %0 if vmemmap exists for all folios on the list.  If an error is
> > + *		encountered restoring vmemmap for ANY folio, an error code
> > + *		will be returned to the caller.  It is then the responsibility
> > + *		of the caller to check the hugetlb vmemmap optimized flag of
> > + *		each folio to determine if vmemmap was actually restored.
> > + *		Note that processing is stopped when first error is encountered.
> > + */
> > +int hugetlb_vmemmap_restore_folios(const struct hstate *h,
> > +					struct list_head *folio_list,
> > +					unsigned long *restored)
> 
> How about changing parameter of @restored to a list_head type which
> returns the non-optimized (previously) or vmemmap-restored-sucessful huge
> pages?
> In which case, the caller could traverse this returned list to free
> them first like you have implemented in bulk_vmemmap_restore_enomem(),
> it will be more efficient. The meaning of returned value should also
> be changed accordingly since update_and_free_pages_bulk() wants to
> whether there is a vmemmap-optimized huge page being restored sucessfully
> to determine if it should clear hugetlb flag. So
> hugetlb_vmemmap_restore_folios()
> could return how many huge pages being restored successful, if a negative
> number is returned meaning there is some error in the process of restoring
> of vmemmap.
> 

I had similar thoughts.  An updated patch based on this approach is below.
When creating the patch, I discovered that using the function return code
for both number of vmemmap restored pages as well as error code was
unnecessary.  Just checking !list_empty() of non-optimized pages tells us
if any were restored or could be freed.

From b79f6eeb7a11644830bddfc43d2219c149d26405 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Sun, 10 Sep 2023 16:14:50 -0700
Subject: [PATCH] hugetlb: perform vmemmap restoration on a list of pages

The routine update_and_free_pages_bulk already performs vmemmap
restoration on the list of hugetlb pages in a separate step.  In
preparation for more functionality to be added in this step, create a
new routine hugetlb_vmemmap_restore_folios() that will restore
vmemmap for a list of folios.

This new routine must provide sufficient feedback about errors and
actual restoration performed so that update_and_free_pages_bulk can
perform optimally.

Special care must be taken when encountering an error from
hugetlb_vmemmap_restore_folios.  We want to continue making as much
forward progress as possible.  A new routine bulk_vmemmap_restore_error
handles this specific situation.

Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
---
 mm/hugetlb.c         | 94 +++++++++++++++++++++++++++++++-------------
 mm/hugetlb_vmemmap.c | 36 +++++++++++++++++
 mm/hugetlb_vmemmap.h | 10 +++++
 3 files changed, 112 insertions(+), 28 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 70fedf8682c4..11de3f885065 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1834,50 +1834,88 @@ static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio,
 		schedule_work(&free_hpage_work);
 }
 
-static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
+static void bulk_vmemmap_restore_error(struct hstate *h,
+					struct list_head *list,
+					struct list_head *non_op_folios)
 {
 	struct folio *folio, *t_folio;
-	bool clear_dtor = false;
 
-	/*
-	 * First allocate required vmemmmap (if necessary) for all folios on
-	 * list.  If vmemmap can not be allocated, we can not free folio to
-	 * lower level allocator, so add back as hugetlb surplus page.
-	 * add_hugetlb_folio() removes the page from THIS list.
-	 * Use clear_dtor to note if vmemmap was successfully allocated for
-	 * ANY page on the list.
-	 */
-	list_for_each_entry_safe(folio, t_folio, list, lru) {
-		if (folio_test_hugetlb_vmemmap_optimized(folio)) {
+	if (!list_empty(non_op_folios)) {
+		/*
+		 * Free any restored hugetlb pages so that restore of the
+		 * entire list can be retried.
+		 * The idea is that in the common case of ENOMEM errors freeing
+		 * hugetlb pages with vmemmap we will free up memory so that we
+		 * can allocate vmemmap for more hugetlb pages.
+		 */
+		list_for_each_entry_safe(folio, t_folio, non_op_folios, lru) {
+			list_del(&folio->lru);
+			spin_lock_irq(&hugetlb_lock);
+			__clear_hugetlb_destructor(h, folio);
+			spin_unlock_irq(&hugetlb_lock);
+			update_and_free_hugetlb_folio(h, folio, false);
+			cond_resched();
+		}
+	} else {
+		/*
+		 * In the case where vmemmap was not restored for ANY folios,
+		 * we loop through them trying to restore individually in the
+		 * hope that someone elsewhere may have done something to cause
+		 * success (such as freeing some memory).
+		 * If unable to restore a hugetlb page, the hugetlb page is
+		 * made a surplus page and removed from the list.
+		 * If we are able to restore vmemmap for one hugetlb page, we free
+		 * it and quit processing the list to retry the bulk operation.
+		 */
+		list_for_each_entry_safe(folio, t_folio, list, lru)
 			if (hugetlb_vmemmap_restore(h, &folio->page)) {
 				spin_lock_irq(&hugetlb_lock);
 				add_hugetlb_folio(h, folio, true);
 				spin_unlock_irq(&hugetlb_lock);
-			} else
-				clear_dtor = true;
-		}
+			} else {
+				list_del(&folio->lru);
+				spin_lock_irq(&hugetlb_lock);
+				__clear_hugetlb_destructor(h, folio);
+				spin_unlock_irq(&hugetlb_lock);
+				update_and_free_hugetlb_folio(h, folio, false);
+				cond_resched();
+				break;
+			}
 	}
+}
+
+static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
+{
+	int ret;
+	LIST_HEAD(non_op_folio);
+	struct folio *folio, *t_folio;
 
 	/*
-	 * If vmemmmap allocation was performed on any folio above, take lock
-	 * to clear destructor of all folios on list.  This avoids the need to
-	 * lock/unlock for each individual folio.
-	 * The assumption is vmemmap allocation was performed on all or none
-	 * of the folios on the list.  This is true expect in VERY rare cases.
+	 * First allocate required vmemmap (if necessary) for all folios.
+	 * Carefully handle errors and free up any available hugetlb pages
+	 * in an effort to make forward progress.
 	 */
-	if (clear_dtor) {
+retry:
+	ret = hugetlb_vmemmap_restore_folios(h, list, &non_op_folio);
+	if (ret < 0) {
+		bulk_vmemmap_restore_error(h, list, &non_op_folio);
+		goto retry;
+	}
+
+	/*
+	 * At this point, list should be empty, and there should only be
+	 * pages on the non_op_folio list.  Free those entries.  Do note
+	 * that the non_op_folio list could be empty.
+	 */
+	VM_WARN_ON(!list_empty(list));
+	if (!list_empty(&non_op_folio)) {
 		spin_lock_irq(&hugetlb_lock);
-		list_for_each_entry(folio, list, lru)
+		list_for_each_entry(folio, &non_op_folio, lru)
 			__clear_hugetlb_destructor(h, folio);
 		spin_unlock_irq(&hugetlb_lock);
 	}
 
-	/*
-	 * Free folios back to low level allocators.  vmemmap and destructors
-	 * were taken care of above, so update_and_free_hugetlb_folio will
-	 * not need to take hugetlb lock.
-	 */
-	list_for_each_entry_safe(folio, t_folio, list, lru) {
+	list_for_each_entry_safe(folio, t_folio, &non_op_folio, lru) {
 		update_and_free_hugetlb_folio(h, folio, false);
 		cond_resched();
 	}
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 4558b814ffab..f827d4efcf8e 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -480,6 +480,42 @@ int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head)
 	return ret;
 }
 
+/**
+ * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list.
+ * @h:			hstate.
+ * @folio_list:		list of folios.
+ * @non_op_list:	Output list of folios for which vmemmap exists.
+ *
+ * Return: %0 if vmemmap exists for all folios on the list and all entries have
+ *		been moved to non_op_list.  If an error is encountered restoring
+ *		vmemmap for ANY folio, an error code will be returned to the
+ *		caller.  Processing of entries stops when the first error is
+ *		encountered.  Folios processed before the error with vmemmap
+ *		will reside on the non_op_list.  The folio that experienced the
+ *		error and all non-processed folios will remain on folio_list.
+ */
+int hugetlb_vmemmap_restore_folios(const struct hstate *h,
+					struct list_head *folio_list,
+					struct list_head *non_op_list)
+{
+	struct folio *folio, *t_folio;
+	int ret = 0;
+
+	list_for_each_entry_safe(folio, t_folio, folio_list, lru) {
+		if (folio_test_hugetlb_vmemmap_optimized(folio)) {
+			ret = hugetlb_vmemmap_restore(h, &folio->page);
+			if (ret)
+				goto out;
+		}
+
+		/* Add non-optimized folios to output list */
+		list_move(&folio->lru, non_op_list);
+	}
+
+out:
+	return ret;
+}
+
 /* Return true iff a HugeTLB whose vmemmap should and can be optimized. */
 static bool vmemmap_should_optimize(const struct hstate *h, const struct page *head)
 {
diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
index c512e388dbb4..e6378ae5c5b6 100644
--- a/mm/hugetlb_vmemmap.h
+++ b/mm/hugetlb_vmemmap.h
@@ -19,6 +19,9 @@
 
 #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head);
+int hugetlb_vmemmap_restore_folios(const struct hstate *h,
+					struct list_head *folio_list,
+					struct list_head *non_op_folios);
 void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head);
 void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list);
 
@@ -45,6 +48,13 @@ static inline int hugetlb_vmemmap_restore(const struct hstate *h, struct page *h
 	return 0;
 }
 
+static int hugetlb_vmemmap_restore_folios(const struct hstate *h,
+					struct list_head *folio_list,
+					struct list_head *non_op_folios)
+{
+	return 0;
+}
+
 static inline void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
 {
 }
Muchun Song Sept. 22, 2023, 8:19 a.m. UTC | #9
On 2023/9/22 05:58, Mike Kravetz wrote:
> On 09/21/23 17:31, Muchun Song wrote:
>>
>> On 2023/9/21 09:12, Mike Kravetz wrote:
>>> On 09/20/23 11:03, Muchun Song wrote:
>>>>> On Sep 20, 2023, at 10:56, Muchun Song <muchun.song@linux.dev> wrote:
>>>>>> On Sep 20, 2023, at 04:57, Mike Kravetz <mike.kravetz@oracle.com> wrote:
>>>>>> On 09/19/23 17:52, Muchun Song wrote:
>>>>>>> On 2023/9/19 07:01, Mike Kravetz wrote:
>>> +/**
>>> + * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list.
>>> + * @h:		struct hstate.
>>> + * @folio_list:	list of folios.
>>> + * @restored:	Set to number of folios for which vmemmap was restored
>>> + *		successfully if caller passes a non-NULL pointer.
>>> + *
>>> + * Return: %0 if vmemmap exists for all folios on the list.  If an error is
>>> + *		encountered restoring vmemmap for ANY folio, an error code
>>> + *		will be returned to the caller.  It is then the responsibility
>>> + *		of the caller to check the hugetlb vmemmap optimized flag of
>>> + *		each folio to determine if vmemmap was actually restored.
>>> + *		Note that processing is stopped when first error is encountered.
>>> + */
>>> +int hugetlb_vmemmap_restore_folios(const struct hstate *h,
>>> +					struct list_head *folio_list,
>>> +					unsigned long *restored)
>> How about changing parameter of @restored to a list_head type which
>> returns the non-optimized (previously) or vmemmap-restored-sucessful huge
>> pages?
>> In which case, the caller could traverse this returned list to free
>> them first like you have implemented in bulk_vmemmap_restore_enomem(),
>> it will be more efficient. The meaning of returned value should also
>> be changed accordingly since update_and_free_pages_bulk() wants to
>> whether there is a vmemmap-optimized huge page being restored sucessfully
>> to determine if it should clear hugetlb flag. So
>> hugetlb_vmemmap_restore_folios()
>> could return how many huge pages being restored successful, if a negative
>> number is returned meaning there is some error in the process of restoring
>> of vmemmap.
>>
> I had similar thoughts.  An updated patch based on this approach is below.
> When creating the patch, I discovered that using the function return code
> for both number of vmemmap restored pages as well as error code was
> unnecessary.  Just checking !list_empty() of non-optimized pages tells us
> if any were restored or could be freed.

I also thought about this. But there is a little different. If HVO
is disabled, we will always clear the hugetlb flag twice since the
list couldn't be empty, I thought it is an optimization for HVO-disabled
case.

>
>  From b79f6eeb7a11644830bddfc43d2219c149d26405 Mon Sep 17 00:00:00 2001
> From: Mike Kravetz <mike.kravetz@oracle.com>
> Date: Sun, 10 Sep 2023 16:14:50 -0700
> Subject: [PATCH] hugetlb: perform vmemmap restoration on a list of pages
>
> The routine update_and_free_pages_bulk already performs vmemmap
> restoration on the list of hugetlb pages in a separate step.  In
> preparation for more functionality to be added in this step, create a
> new routine hugetlb_vmemmap_restore_folios() that will restore
> vmemmap for a list of folios.
>
> This new routine must provide sufficient feedback about errors and
> actual restoration performed so that update_and_free_pages_bulk can
> perform optimally.
>
> Special care must be taken when encountering an error from
> hugetlb_vmemmap_restore_folios.  We want to continue making as much
> forward progress as possible.  A new routine bulk_vmemmap_restore_error
> handles this specific situation.
>
> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
> ---
>   mm/hugetlb.c         | 94 +++++++++++++++++++++++++++++++-------------
>   mm/hugetlb_vmemmap.c | 36 +++++++++++++++++
>   mm/hugetlb_vmemmap.h | 10 +++++
>   3 files changed, 112 insertions(+), 28 deletions(-)
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 70fedf8682c4..11de3f885065 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1834,50 +1834,88 @@ static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio,
>   		schedule_work(&free_hpage_work);
>   }
>   
> -static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
> +static void bulk_vmemmap_restore_error(struct hstate *h,
> +					struct list_head *list,
> +					struct list_head *non_op_folios)
>   {
>   	struct folio *folio, *t_folio;
> -	bool clear_dtor = false;
>   
> -	/*
> -	 * First allocate required vmemmmap (if necessary) for all folios on
> -	 * list.  If vmemmap can not be allocated, we can not free folio to
> -	 * lower level allocator, so add back as hugetlb surplus page.
> -	 * add_hugetlb_folio() removes the page from THIS list.
> -	 * Use clear_dtor to note if vmemmap was successfully allocated for
> -	 * ANY page on the list.
> -	 */
> -	list_for_each_entry_safe(folio, t_folio, list, lru) {
> -		if (folio_test_hugetlb_vmemmap_optimized(folio)) {
> +	if (!list_empty(non_op_folios)) {
> +		/*
> +		 * Free any restored hugetlb pages so that restore of the
> +		 * entire list can be retried.
> +		 * The idea is that in the common case of ENOMEM errors freeing
> +		 * hugetlb pages with vmemmap we will free up memory so that we
> +		 * can allocate vmemmap for more hugetlb pages.
> +		 */
> +		list_for_each_entry_safe(folio, t_folio, non_op_folios, lru) {
> +			list_del(&folio->lru);
> +			spin_lock_irq(&hugetlb_lock);
> +			__clear_hugetlb_destructor(h, folio);
> +			spin_unlock_irq(&hugetlb_lock);
> +			update_and_free_hugetlb_folio(h, folio, false);
> +			cond_resched();
> +		}
> +	} else {
> +		/*
> +		 * In the case where vmemmap was not restored for ANY folios,
> +		 * we loop through them trying to restore individually in the
> +		 * hope that someone elsewhere may have done something to cause
> +		 * success (such as freeing some memory).
> +		 * If unable to restore a hugetlb page, the hugetlb page is
> +		 * made a surplus page and removed from the list.
> +		 * If are able to restore vmemmap for one hugetlb page, we free
> +		 * it and quit processing the list to retry the bulk operation.
> +		 */
> +		list_for_each_entry_safe(folio, t_folio, list, lru)
>   			if (hugetlb_vmemmap_restore(h, &folio->page)) {
>   				spin_lock_irq(&hugetlb_lock);
>   				add_hugetlb_folio(h, folio, true);
>   				spin_unlock_irq(&hugetlb_lock);
> -			} else
> -				clear_dtor = true;
> -		}
> +			} else {
> +				list_del(&folio->lru);
> +				spin_lock_irq(&hugetlb_lock);
> +				__clear_hugetlb_destructor(h, folio);
> +				spin_unlock_irq(&hugetlb_lock);
> +				update_and_free_hugetlb_folio(h, folio, false);
> +				cond_resched();
> +				break;
> +			}
>   	}
> +}
> +
> +static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
> +{
> +	int ret;
> +	LIST_HEAD(non_op_folio);
> +	struct folio *folio, *t_folio;
>   
>   	/*
> -	 * If vmemmmap allocation was performed on any folio above, take lock
> -	 * to clear destructor of all folios on list.  This avoids the need to
> -	 * lock/unlock for each individual folio.
> -	 * The assumption is vmemmap allocation was performed on all or none
> -	 * of the folios on the list.  This is true expect in VERY rare cases.
> +	 * First allocate required vmemmmap (if necessary) for all folios.
> +	 * Carefully handle errors and free up any available hugetlb pages
> +	 * in an effort to make forward progress.
>   	 */
> -	if (clear_dtor) {
> +retry:
> +	ret = hugetlb_vmemmap_restore_folios(h, list, &non_op_folio);
> +	if (ret < 0) {
> +		bulk_vmemmap_restore_error(h, list, &non_op_folio);
> +		goto retry;
> +	}
> +
> +	/*
> +	 * At this point, list should be empty, and there should only be
> +	 * pages on the non_op_folio list.  free those entries.  Do note
> +	 * that the non_op_folio list could be empty.
> +	 */
> +	VM_WARN_ON(!list_empty(list));
> +	if (!list_empty(&non_op_folio)) {
>   		spin_lock_irq(&hugetlb_lock);
> -		list_for_each_entry(folio, list, lru)
> +		list_for_each_entry(folio, &non_op_folio, lru)
>   			__clear_hugetlb_destructor(h, folio);
>   		spin_unlock_irq(&hugetlb_lock);
>   	}
>   
> -	/*
> -	 * Free folios back to low level allocators.  vmemmap and destructors
> -	 * were taken care of above, so update_and_free_hugetlb_folio will
> -	 * not need to take hugetlb lock.
> -	 */
> -	list_for_each_entry_safe(folio, t_folio, list, lru) {
> +	list_for_each_entry_safe(folio, t_folio, &non_op_folio, lru) {
>   		update_and_free_hugetlb_folio(h, folio, false);
>   		cond_resched();
>   	}
> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> index 4558b814ffab..f827d4efcf8e 100644
> --- a/mm/hugetlb_vmemmap.c
> +++ b/mm/hugetlb_vmemmap.c
> @@ -480,6 +480,42 @@ int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head)
>   	return ret;
>   }
>   
> +/**
> + * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list.
> + * @h:			hstate.
> + * @folio_list:		list of folios.
> + * @non_op_list:	Output list of folio for which vmemmap exists.
> + *
> + * Return: %0 if vmemmap exists for all folios on the list and all entries have
> + *		been moved to non_op_list.  If an error is encountered restoring
> + *		vmemmap for ANY folio, an error code will be returned to the
> + *		caller.  Processing en entries stops when the first error is
> + *		encountered.  folios processed before the error with vmemmap
> + *		will reside on the non_op_list.  The folio that experienced the
> + *		error and all non-processed folios will remain on folio_list.
> + */
> +int hugetlb_vmemmap_restore_folios(const struct hstate *h,
> +					struct list_head *folio_list,
> +					struct list_head *non_op_list)

non_optimized_list or vmemmap_intact_list? The abbreviation is not 
straightforward.

> +{
> +	struct folio *folio, *t_folio;
> +	int ret = 0;
> +
> +	list_for_each_entry_safe(folio, t_folio, folio_list, lru) {
> +		if (folio_test_hugetlb_vmemmap_optimized(folio)) {

hugetlb_vmemmap_restore() has this check as well, so it is unnecessary here.

> +			ret = hugetlb_vmemmap_restore(h, &folio->page);
> +			if (ret)
> +				goto out;

Maybe we could drop the label ("out") and just breaking or returning from
here is enough.

> +		}
> +
> +		/* Add non-optimized folios to output list */
> +		list_move(&folio->lru, non_op_list);
> +	}
> +
> +out:
> +	return ret;
> +}
> +
>   /* Return true iff a HugeTLB whose vmemmap should and can be optimized. */
>   static bool vmemmap_should_optimize(const struct hstate *h, const struct page *head)
>   {
> diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
> index c512e388dbb4..e6378ae5c5b6 100644
> --- a/mm/hugetlb_vmemmap.h
> +++ b/mm/hugetlb_vmemmap.h
> @@ -19,6 +19,9 @@
>   
>   #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
>   int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head);
> +int hugetlb_vmemmap_restore_folios(const struct hstate *h,
> +					struct list_head *folio_list,
> +					struct list_head *non_op_folios);

It is better to keep 3rd name (non_op_folios) consistent with where it is
defined (it is non_op_list).

>   void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head);
>   void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list);
>   
> @@ -45,6 +48,13 @@ static inline int hugetlb_vmemmap_restore(const struct hstate *h, struct page *h
>   	return 0;
>   }
>   
> +static int hugetlb_vmemmap_restore_folios(const struct hstate *h,
> +					struct list_head *folio_list,
> +					struct list_head *non_op_folios)
> +{
> +	return 0;
> +}
> +
>   static inline void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
>   {
>   }
Mike Kravetz Sept. 22, 2023, 5:01 p.m. UTC | #10
On 09/22/23 16:19, Muchun Song wrote:
> 
> 
> On 2023/9/22 05:58, Mike Kravetz wrote:
> > On 09/21/23 17:31, Muchun Song wrote:
> > > 
> > > On 2023/9/21 09:12, Mike Kravetz wrote:
> > > > On 09/20/23 11:03, Muchun Song wrote:
> > > > > > On Sep 20, 2023, at 10:56, Muchun Song <muchun.song@linux.dev> wrote:
> > > > > > > On Sep 20, 2023, at 04:57, Mike Kravetz <mike.kravetz@oracle.com> wrote:
> > > > > > > On 09/19/23 17:52, Muchun Song wrote:
> > > > > > > > On 2023/9/19 07:01, Mike Kravetz wrote:
> > > > +/**
> > > > + * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list.
> > > > + * @h:		struct hstate.
> > > > + * @folio_list:	list of folios.
> > > > + * @restored:	Set to number of folios for which vmemmap was restored
> > > > + *		successfully if caller passes a non-NULL pointer.
> > > > + *
> > > > + * Return: %0 if vmemmap exists for all folios on the list.  If an error is
> > > > + *		encountered restoring vmemmap for ANY folio, an error code
> > > > + *		will be returned to the caller.  It is then the responsibility
> > > > + *		of the caller to check the hugetlb vmemmap optimized flag of
> > > > + *		each folio to determine if vmemmap was actually restored.
> > > > + *		Note that processing is stopped when first error is encountered.
> > > > + */
> > > > +int hugetlb_vmemmap_restore_folios(const struct hstate *h,
> > > > +					struct list_head *folio_list,
> > > > +					unsigned long *restored)
> > > How about changing parameter of @restored to a list_head type which
> > > returns the non-optimized (previously) or vmemmap-restored-sucessful huge
> > > pages?
> > > In which case, the caller could traverse this returned list to free
> > > them first like you have implemented in bulk_vmemmap_restore_enomem(),
> > > it will be more efficient. The meaning of returned value should also
> > > be changed accordingly since update_and_free_pages_bulk() wants to
> > > whether there is a vmemmap-optimized huge page being restored sucessfully
> > > to determine if it should clear hugetlb flag. So
> > > hugetlb_vmemmap_restore_folios()
> > > could return how many huge pages being restored successful, if a negative
> > > number is returned meaning there is some error in the process of restoring
> > > of vmemmap.
> > > 
> > I had similar thoughts.  An updated patch based on this approach is below.
> > When creating the patch, I discovered that using the function return code
> > for both number of vmemmap restored pages as well as error code was
> > unnecessary.  Just checking !list_empty() of non-optimized pages tells us
> > if any were restored or could be freed.
> 
> I also thought about this. But there is a little different. If HVO
> is disabled, we will always clear the hugetlb flag twice since the
> list couldn't be empty, I thought it is an optimization for HVO-disabled
> case.
> 

Ah!  Good point.  We will clear twice with with the patch below if
HVO-disabled.
The reason I did not initially want to have return code be both number
restored and error code is that type int is not sufficient.  Number of
pages is usually of type unsigned long, but we need a signed value for
error codes.  type long should be sufficient for this case.

I will change it and associated logic.

> >  From b79f6eeb7a11644830bddfc43d2219c149d26405 Mon Sep 17 00:00:00 2001
> > From: Mike Kravetz <mike.kravetz@oracle.com>
> > Date: Sun, 10 Sep 2023 16:14:50 -0700
> > Subject: [PATCH] hugetlb: perform vmemmap restoration on a list of pages
> > 
> > The routine update_and_free_pages_bulk already performs vmemmap
> > restoration on the list of hugetlb pages in a separate step.  In
> > preparation for more functionality to be added in this step, create a
> > new routine hugetlb_vmemmap_restore_folios() that will restore
> > vmemmap for a list of folios.
> > 
> > This new routine must provide sufficient feedback about errors and
> > actual restoration performed so that update_and_free_pages_bulk can
> > perform optimally.
> > 
> > Special care must be taken when encountering an error from
> > hugetlb_vmemmap_restore_folios.  We want to continue making as much
> > forward progress as possible.  A new routine bulk_vmemmap_restore_error
> > handles this specific situation.
> > 
> > Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
> > ---
> >   mm/hugetlb.c         | 94 +++++++++++++++++++++++++++++++-------------
> >   mm/hugetlb_vmemmap.c | 36 +++++++++++++++++
> >   mm/hugetlb_vmemmap.h | 10 +++++
> >   3 files changed, 112 insertions(+), 28 deletions(-)
> > 
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index 70fedf8682c4..11de3f885065 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -1834,50 +1834,88 @@ static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio,
> >   		schedule_work(&free_hpage_work);
> >   }
> > -static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
> > +static void bulk_vmemmap_restore_error(struct hstate *h,
> > +					struct list_head *list,
> > +					struct list_head *non_op_folios)
> >   {
> >   	struct folio *folio, *t_folio;
> > -	bool clear_dtor = false;
> > -	/*
> > -	 * First allocate required vmemmmap (if necessary) for all folios on
> > -	 * list.  If vmemmap can not be allocated, we can not free folio to
> > -	 * lower level allocator, so add back as hugetlb surplus page.
> > -	 * add_hugetlb_folio() removes the page from THIS list.
> > -	 * Use clear_dtor to note if vmemmap was successfully allocated for
> > -	 * ANY page on the list.
> > -	 */
> > -	list_for_each_entry_safe(folio, t_folio, list, lru) {
> > -		if (folio_test_hugetlb_vmemmap_optimized(folio)) {
> > +	if (!list_empty(non_op_folios)) {
> > +		/*
> > +		 * Free any restored hugetlb pages so that restore of the
> > +		 * entire list can be retried.
> > +		 * The idea is that in the common case of ENOMEM errors freeing
> > +		 * hugetlb pages with vmemmap we will free up memory so that we
> > +		 * can allocate vmemmap for more hugetlb pages.
> > +		 */
> > +		list_for_each_entry_safe(folio, t_folio, non_op_folios, lru) {
> > +			list_del(&folio->lru);
> > +			spin_lock_irq(&hugetlb_lock);
> > +			__clear_hugetlb_destructor(h, folio);
> > +			spin_unlock_irq(&hugetlb_lock);
> > +			update_and_free_hugetlb_folio(h, folio, false);
> > +			cond_resched();
> > +		}
> > +	} else {
> > +		/*
> > +		 * In the case where vmemmap was not restored for ANY folios,
> > +		 * we loop through them trying to restore individually in the
> > +		 * hope that someone elsewhere may have done something to cause
> > +		 * success (such as freeing some memory).
> > +		 * If unable to restore a hugetlb page, the hugetlb page is
> > +		 * made a surplus page and removed from the list.
> > +		 * If are able to restore vmemmap for one hugetlb page, we free
> > +		 * it and quit processing the list to retry the bulk operation.
> > +		 */
> > +		list_for_each_entry_safe(folio, t_folio, list, lru)
> >   			if (hugetlb_vmemmap_restore(h, &folio->page)) {
> >   				spin_lock_irq(&hugetlb_lock);
> >   				add_hugetlb_folio(h, folio, true);
> >   				spin_unlock_irq(&hugetlb_lock);
> > -			} else
> > -				clear_dtor = true;
> > -		}
> > +			} else {
> > +				list_del(&folio->lru);
> > +				spin_lock_irq(&hugetlb_lock);
> > +				__clear_hugetlb_destructor(h, folio);
> > +				spin_unlock_irq(&hugetlb_lock);
> > +				update_and_free_hugetlb_folio(h, folio, false);
> > +				cond_resched();
> > +				break;
> > +			}
> >   	}
> > +}
> > +
> > +static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
> > +{
> > +	int ret;
> > +	LIST_HEAD(non_op_folio);
> > +	struct folio *folio, *t_folio;
> >   	/*
> > -	 * If vmemmmap allocation was performed on any folio above, take lock
> > -	 * to clear destructor of all folios on list.  This avoids the need to
> > -	 * lock/unlock for each individual folio.
> > -	 * The assumption is vmemmap allocation was performed on all or none
> > -	 * of the folios on the list.  This is true expect in VERY rare cases.
> > +	 * First allocate required vmemmmap (if necessary) for all folios.
> > +	 * Carefully handle errors and free up any available hugetlb pages
> > +	 * in an effort to make forward progress.
> >   	 */
> > -	if (clear_dtor) {
> > +retry:
> > +	ret = hugetlb_vmemmap_restore_folios(h, list, &non_op_folio);
> > +	if (ret < 0) {
> > +		bulk_vmemmap_restore_error(h, list, &non_op_folio);
> > +		goto retry;
> > +	}
> > +
> > +	/*
> > +	 * At this point, list should be empty, and there should only be
> > +	 * pages on the non_op_folio list.  free those entries.  Do note
> > +	 * that the non_op_folio list could be empty.
> > +	 */
> > +	VM_WARN_ON(!list_empty(list));
> > +	if (!list_empty(&non_op_folio)) {
> >   		spin_lock_irq(&hugetlb_lock);
> > -		list_for_each_entry(folio, list, lru)
> > +		list_for_each_entry(folio, &non_op_folio, lru)
> >   			__clear_hugetlb_destructor(h, folio);
> >   		spin_unlock_irq(&hugetlb_lock);
> >   	}
> > -	/*
> > -	 * Free folios back to low level allocators.  vmemmap and destructors
> > -	 * were taken care of above, so update_and_free_hugetlb_folio will
> > -	 * not need to take hugetlb lock.
> > -	 */
> > -	list_for_each_entry_safe(folio, t_folio, list, lru) {
> > +	list_for_each_entry_safe(folio, t_folio, &non_op_folio, lru) {
> >   		update_and_free_hugetlb_folio(h, folio, false);
> >   		cond_resched();
> >   	}
> > diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> > index 4558b814ffab..f827d4efcf8e 100644
> > --- a/mm/hugetlb_vmemmap.c
> > +++ b/mm/hugetlb_vmemmap.c
> > @@ -480,6 +480,42 @@ int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head)
> >   	return ret;
> >   }
> > +/**
> > + * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list.
> > + * @h:			hstate.
> > + * @folio_list:		list of folios.
> > + * @non_op_list:	Output list of folio for which vmemmap exists.
> > + *
> > + * Return: %0 if vmemmap exists for all folios on the list and all entries have
> > + *		been moved to non_op_list.  If an error is encountered restoring
> > + *		vmemmap for ANY folio, an error code will be returned to the
> > + *		caller.  Processing en entries stops when the first error is
> > + *		encountered.  folios processed before the error with vmemmap
> > + *		will reside on the non_op_list.  The folio that experienced the
> > + *		error and all non-processed folios will remain on folio_list.
> > + */
> > +int hugetlb_vmemmap_restore_folios(const struct hstate *h,
> > +					struct list_head *folio_list,
> > +					struct list_head *non_op_list)
> 
> non_optimized_list or vmemmap_intact_list? The abbreviation is not
> straightforward.
> 

Ok, I will be more specific.  non_optimized_list is better.

> > +{
> > +	struct folio *folio, *t_folio;
> > +	int ret = 0;
> > +
> > +	list_for_each_entry_safe(folio, t_folio, folio_list, lru) {
> > +		if (folio_test_hugetlb_vmemmap_optimized(folio)) {
> 
> hugetlb_vmemmap_restore() has this check as well, so it is unnecessary here.
> 

Not necessary in this code, but if we want to know if restore operation
was actually performed to return 'number restored' as discussed above,
this test and an additional counter will be required.

> > +			ret = hugetlb_vmemmap_restore(h, &folio->page);
> > +			if (ret)
> > +				goto out;
> 
> Maybe we could drop the label ("out") and just breaking or returning from
> here is enough.

Yes

> > +		}
> > +
> > +		/* Add non-optimized folios to output list */
> > +		list_move(&folio->lru, non_op_list);
> > +	}
> > +
> > +out:
> > +	return ret;
> > +}
> > +
> >   /* Return true iff a HugeTLB whose vmemmap should and can be optimized. */
> >   static bool vmemmap_should_optimize(const struct hstate *h, const struct page *head)
> >   {
> > diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
> > index c512e388dbb4..e6378ae5c5b6 100644
> > --- a/mm/hugetlb_vmemmap.h
> > +++ b/mm/hugetlb_vmemmap.h
> > @@ -19,6 +19,9 @@
> >   #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
> >   int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head);
> > +int hugetlb_vmemmap_restore_folios(const struct hstate *h,
> > +					struct list_head *folio_list,
> > +					struct list_head *non_op_folios);
> 
> It is better to keep 3rd name (non_op_folios) consistent with where it is
> defined (it is non_op_list).
> 

I think using non_optimized_folios everywhere will be consistent and
more descriptive.

Thanks for all your comments and suggestions!
Mike Kravetz Sept. 22, 2023, 5:28 p.m. UTC | #11
On 09/22/23 10:01, Mike Kravetz wrote:
> On 09/22/23 16:19, Muchun Song wrote:
> > On 2023/9/22 05:58, Mike Kravetz wrote:
> > > On 09/21/23 17:31, Muchun Song wrote:
> > > > On 2023/9/21 09:12, Mike Kravetz wrote:
> > > > > On 09/20/23 11:03, Muchun Song wrote:
> > > + */
> > > +int hugetlb_vmemmap_restore_folios(const struct hstate *h,
> > > +					struct list_head *folio_list,
> > > +					struct list_head *non_op_list)
> > 
> > non_optimized_list or vmemmap_intact_list? The abbreviation is not
> > straightforward.
> > 
> 
> Ok, I will be more specific.  non_optimized_list is better.
> 

I changed my mind.
The longer name caused 80 column line wrap that I didn't like. :)

I will use non_hvo_folios.  The abbreviation hvo is pretty specific
in this context.
diff mbox series

Patch

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d6f3db3c1313..814bb1982274 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1836,36 +1836,36 @@  static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio,
 
 static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
 {
+	int ret;
+	unsigned long restored;
 	struct folio *folio, *t_folio;
-	bool clear_dtor = false;
 
 	/*
-	 * First allocate required vmemmmap (if necessary) for all folios on
-	 * list.  If vmemmap can not be allocated, we can not free folio to
-	 * lower level allocator, so add back as hugetlb surplus page.
-	 * add_hugetlb_folio() removes the page from THIS list.
-	 * Use clear_dtor to note if vmemmap was successfully allocated for
-	 * ANY page on the list.
+	 * First allocate required vmemmap (if necessary) for all folios.
 	 */
-	list_for_each_entry_safe(folio, t_folio, list, lru) {
-		if (folio_test_hugetlb_vmemmap_optimized(folio)) {
-			if (hugetlb_vmemmap_restore(h, &folio->page)) {
-				spin_lock_irq(&hugetlb_lock);
+	ret = hugetlb_vmemmap_restore_folios(h, list, &restored);
+
+	/*
+	 * If there was an error restoring vmemmap for ANY folios on the list,
+	 * add them back as surplus hugetlb pages.  add_hugetlb_folio() removes
+	 * the folio from THIS list.
+	 */
+	if (ret < 0) {
+		spin_lock_irq(&hugetlb_lock);
+		list_for_each_entry_safe(folio, t_folio, list, lru)
+			if (folio_test_hugetlb_vmemmap_optimized(folio))
 				add_hugetlb_folio(h, folio, true);
-				spin_unlock_irq(&hugetlb_lock);
-			} else
-				clear_dtor = true;
-		}
+		spin_unlock_irq(&hugetlb_lock);
 	}
 
 	/*
-	 * If vmemmmap allocation was performed on any folio above, take lock
-	 * to clear destructor of all folios on list.  This avoids the need to
+	 * If vmemmap allocation was performed on ANY folio, take lock to
+	 * clear destructor of all folios on list.  This avoids the need to
 	 * lock/unlock for each individual folio.
 	 * The assumption is vmemmap allocation was performed on all or none
 	 * of the folios on the list.  This is true expect in VERY rare cases.
 	 */
-	if (clear_dtor) {
+	if (restored) {
 		spin_lock_irq(&hugetlb_lock);
 		list_for_each_entry(folio, list, lru)
 			__clear_hugetlb_destructor(h, folio);
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 4558b814ffab..463a4037ec6e 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -480,6 +480,43 @@  int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head)
 	return ret;
 }
 
+/**
+ * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list.
+ * @h:		struct hstate.
+ * @folio_list:	list of folios.
+ * @restored:	Set to number of folios for which vmemmap was restored
+ *		successfully if caller passes a non-NULL pointer.
+ *
+ * Return: %0 if vmemmap exists for all folios on the list.  If an error is
+ *		encountered restoring vmemmap for ANY folio, an error code
+ *		will be returned to the caller.  It is then the responsibility
+ *		of the caller to check the hugetlb vmemmap optimized flag of
+ *		each folio to determine if vmemmap was actually restored.
+ */
+int hugetlb_vmemmap_restore_folios(const struct hstate *h,
+					struct list_head *folio_list,
+					unsigned long *restored)
+{
+	unsigned long num_restored;
+	struct folio *folio;
+	int ret = 0, t_ret;
+
+	num_restored = 0;
+	list_for_each_entry(folio, folio_list, lru) {
+		if (folio_test_hugetlb_vmemmap_optimized(folio)) {
+			t_ret = hugetlb_vmemmap_restore(h, &folio->page);
+			if (t_ret)
+				ret = t_ret;
+			else
+				num_restored++;
+		}
+	}
+
+	/* NULL @restored means the caller does not need the count */
+	if (restored)
+		*restored = num_restored;
+	return ret;
+}
+
 /* Return true iff a HugeTLB whose vmemmap should and can be optimized. */
 static bool vmemmap_should_optimize(const struct hstate *h, const struct page *head)
 {
diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
index c512e388dbb4..bb58453c3cc0 100644
--- a/mm/hugetlb_vmemmap.h
+++ b/mm/hugetlb_vmemmap.h
@@ -19,6 +19,8 @@ 
 
 #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head);
+int hugetlb_vmemmap_restore_folios(const struct hstate *h,
+			struct list_head *folio_list, unsigned long *restored);
 void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head);
 void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list);
 
@@ -45,6 +47,15 @@  static inline int hugetlb_vmemmap_restore(const struct hstate *h, struct page *h
 	return 0;
 }
 
+static inline int hugetlb_vmemmap_restore_folios(const struct hstate *h,
+					struct list_head *folio_list,
+					unsigned long *restored)
+{
+	if (restored)
+		*restored = 0;
+	return 0;
+}
+
 static inline void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
 {
 }