
[RFC,v1,2/2] mm: khugepaged: recover from poisoned file-backed memory

Message ID 20220323232929.3035443-3-jiaqiyan@google.com (mailing list archive)
State New
Series Memory poison recovery in khugepaged

Commit Message

Jiaqi Yan March 23, 2022, 11:29 p.m. UTC
Make collapse_file roll back when copying pages failed.
More concretely:
* extract copy operations into a separate loop
* postpone the updates for nr_none until both scan and copy succeeded
* postpone joining small xarray entries until both scan and copy
  succeeded
* as for update operations to NR_XXX_THPS
    * for SHMEM file, postpone until both scan and copy succeeded
    * for other file, roll back if scan succeeded but copy failed

Signed-off-by: Jiaqi Yan <jiaqiyan@google.com>
---
 include/linux/highmem.h | 18 ++++++++++
 mm/khugepaged.c         | 75 +++++++++++++++++++++++++++--------------
 2 files changed, 67 insertions(+), 26 deletions(-)

Comments

Yang Shi March 28, 2022, 11:37 p.m. UTC | #1
On Wed, Mar 23, 2022 at 4:29 PM Jiaqi Yan <jiaqiyan@google.com> wrote:
>
> Make collapse_file roll back when copying pages failed.
> More concretely:
> * extract copy operations into a separate loop
> * postpone the updates for nr_none until both scan and copy succeeded
> * postpone joining small xarray entries until both scan and copy
>   succeeded
> * as for update operations to NR_XXX_THPS
>     * for SHMEM file, postpone until both scan and copy succeeded
>     * for other file, roll back if scan succeeded but copy failed
>
> Signed-off-by: Jiaqi Yan <jiaqiyan@google.com>
> ---
>  include/linux/highmem.h | 18 ++++++++++
>  mm/khugepaged.c         | 75 +++++++++++++++++++++++++++--------------
>  2 files changed, 67 insertions(+), 26 deletions(-)
>
> diff --git a/include/linux/highmem.h b/include/linux/highmem.h
> index 15d0aa4d349c..fc5aa221bdb5 100644
> --- a/include/linux/highmem.h
> +++ b/include/linux/highmem.h
> @@ -315,6 +315,24 @@ static inline void copy_highpage(struct page *to, struct page *from)
>         kunmap_local(vfrom);
>  }
>
> +/*
> + * Machine check exception handled version of copy_highpage.
> + * Return true if copying page content failed; otherwise false.
> + */
> +static inline bool copy_highpage_mc(struct page *to, struct page *from)
> +{
> +       char *vfrom, *vto;
> +       unsigned long ret;
> +
> +       vfrom = kmap_local_page(from);
> +       vto = kmap_local_page(to);
> +       ret = copy_mc_to_kernel(vto, vfrom, PAGE_SIZE);
> +       kunmap_local(vto);
> +       kunmap_local(vfrom);
> +
> +       return ret > 0;
> +}
> +
>  #endif
>
>  static inline void memcpy_page(struct page *dst_page, size_t dst_off,
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 84ed177f56ff..ed2b1cd4bbc6 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1708,12 +1708,13 @@ static void collapse_file(struct mm_struct *mm,
>  {
>         struct address_space *mapping = file->f_mapping;
>         gfp_t gfp;
> -       struct page *new_page;
> +       struct page *new_page, *page, *tmp;

It seems you removed the "struct page *page" from "if (result ==
SCAN_SUCCEED)" but kept the "struct page *page" under "for (index =
start; index < end; index++)". I think the "struct page *page" in the
for loop could be removed too.
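
Something like this, roughly (just a sketch; the xas_next() line is the
existing one from the scan loop, only the local declaration goes away):

	for (index = start; index < end; index++) {
		page = xas_next(&xas);	/* reuse the function-scope "page" */
		...
	}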

>         pgoff_t index, end = start + HPAGE_PMD_NR;
>         LIST_HEAD(pagelist);
>         XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
>         int nr_none = 0, result = SCAN_SUCCEED;
>         bool is_shmem = shmem_file(file);
> +       bool copy_failed = false;
>         int nr;
>
>         VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
> @@ -1936,9 +1937,7 @@ static void collapse_file(struct mm_struct *mm,
>         }
>         nr = thp_nr_pages(new_page);
>
> -       if (is_shmem)
> -               __mod_lruvec_page_state(new_page, NR_SHMEM_THPS, nr);
> -       else {
> +       if (!is_shmem) {
>                 __mod_lruvec_page_state(new_page, NR_FILE_THPS, nr);
>                 filemap_nr_thps_inc(mapping);
>                 /*
> @@ -1956,34 +1955,39 @@ static void collapse_file(struct mm_struct *mm,
>                 }
>         }
>
> -       if (nr_none) {
> -               __mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none);
> -               if (is_shmem)
> -                       __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
> -       }
> -
> -       /* Join all the small entries into a single multi-index entry */
> -       xas_set_order(&xas, start, HPAGE_PMD_ORDER);
> -       xas_store(&xas, new_page);
>  xa_locked:
>         xas_unlock_irq(&xas);
>  xa_unlocked:
>
>         if (result == SCAN_SUCCEED) {
> -               struct page *page, *tmp;
> -
>                 /*
>                  * Replacing old pages with new one has succeeded, now we
> -                * need to copy the content and free the old pages.
> +                * attempt to copy the contents.
>                  */
>                 index = start;
> -               list_for_each_entry_safe(page, tmp, &pagelist, lru) {
> +               list_for_each_entry(page, &pagelist, lru) {
>                         while (index < page->index) {
>                                 clear_highpage(new_page + (index % HPAGE_PMD_NR));
>                                 index++;
>                         }
> -                       copy_highpage(new_page + (page->index % HPAGE_PMD_NR),
> -                                       page);
> +                       if (copy_highpage_mc(new_page + (page->index % HPAGE_PMD_NR), page)) {
> +                               copy_failed = true;
> +                               break;
> +                       }
> +                       index++;
> +               }
> +               while (!copy_failed && index < end) {
> +                       clear_highpage(new_page + (page->index % HPAGE_PMD_NR));
> +                       index++;
> +               }
> +       }
> +
> +       if (result == SCAN_SUCCEED && !copy_failed) {

I think you could set "result" to SCAN_COPY_MC (the same as the
anonymous one), then you could drop !copy_failed and use "result"
alone afterwards.
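
Rough sketch of what I mean (untested, and it assumes a SCAN_COPY_MC
value is added to enum scan_result as in the anonymous patch):

	if (result == SCAN_SUCCEED) {
		index = start;
		list_for_each_entry(page, &pagelist, lru) {
			while (index < page->index) {
				clear_highpage(new_page + (index % HPAGE_PMD_NR));
				index++;
			}
			if (copy_highpage_mc(new_page + (page->index % HPAGE_PMD_NR),
					     page)) {
				result = SCAN_COPY_MC;
				break;
			}
			index++;
		}
		while (result == SCAN_SUCCEED && index < end) {
			clear_highpage(new_page + (index % HPAGE_PMD_NR));
			index++;
		}
	}

	if (result == SCAN_SUCCEED) {
		/* free the old pages, update counters, store multi-index entry */
	} else {
		/* roll back page cache changes */
	}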

> +               /*
> +                * Copying old pages to huge one has succeeded, now we
> +                * need to free the old pages.
> +                */
> +               list_for_each_entry_safe(page, tmp, &pagelist, lru) {
>                         list_del(&page->lru);
>                         page->mapping = NULL;
>                         page_ref_unfreeze(page, 1);
> @@ -1991,12 +1995,20 @@ static void collapse_file(struct mm_struct *mm,
>                         ClearPageUnevictable(page);
>                         unlock_page(page);
>                         put_page(page);
> -                       index++;
>                 }
> -               while (index < end) {
> -                       clear_highpage(new_page + (index % HPAGE_PMD_NR));
> -                       index++;
> +
> +               xas_lock_irq(&xas);
> +               if (is_shmem)
> +                       __mod_lruvec_page_state(new_page, NR_SHMEM_THPS, nr);
> +               if (nr_none) {
> +                       __mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none);
> +                       if (is_shmem)
> +                               __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
>                 }
> +               /* Join all the small entries into a single multi-index entry. */
> +               xas_set_order(&xas, start, HPAGE_PMD_ORDER);
> +               xas_store(&xas, new_page);
> +               xas_unlock_irq(&xas);
>
>                 SetPageUptodate(new_page);
>                 page_ref_add(new_page, HPAGE_PMD_NR - 1);
> @@ -2012,9 +2024,11 @@ static void collapse_file(struct mm_struct *mm,
>
>                 khugepaged_pages_collapsed++;
>         } else {
> -               struct page *page;
> -
> -               /* Something went wrong: roll back page cache changes */
> +               /*
> +                * Something went wrong:
> +                * either result != SCAN_SUCCEED or copy_failed,
> +                * roll back page cache changes
> +                */
>                 xas_lock_irq(&xas);
>                 mapping->nrpages -= nr_none;
>
> @@ -2047,6 +2061,15 @@ static void collapse_file(struct mm_struct *mm,
>                         xas_lock_irq(&xas);
>                 }
>                 VM_BUG_ON(nr_none);
> +               /*
> +                * Undo the updates of thp_nr_pages(new_page) for non-SHMEM file,
> +                * which is not updated yet for SHMEM file.
> +                * These undos are not needed if result is not SCAN_SUCCEED.
> +                */
> +               if (!is_shmem && result == SCAN_SUCCEED) {

Handling the error case when "result == SCAN_SUCCEED" looks awkward.
With the above fixed (set result to SCAN_COPY_MC) we could avoid the
awkwardness.

> +                       __mod_lruvec_page_state(new_page, NR_FILE_THPS, -nr);

I'm wondering whether we may defer the NR_FILE_THPS update just like
SHMEM, because it doesn't have to be updated in advance, so that we
have the user-visible counter updates in a single place. Just
filemap_nr_thps needs to be updated in advance, since it is used to
sync with the file open path to truncate huge pages.
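
In other words, something along these lines (rough sketch, not tested):
keep only filemap_nr_thps_inc() before the copy, and fold the lruvec
counter update into the success path next to NR_SHMEM_THPS:

	/* before the copy: only the open-path synchronization */
	if (!is_shmem) {
		filemap_nr_thps_inc(mapping);
		...
	}
	...
	/* after both scan and copy succeeded */
	xas_lock_irq(&xas);
	if (is_shmem)
		__mod_lruvec_page_state(new_page, NR_SHMEM_THPS, nr);
	else
		__mod_lruvec_page_state(new_page, NR_FILE_THPS, nr);
	if (nr_none) {
		__mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none);
		if (is_shmem)
			__mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
	}
	...
	xas_unlock_irq(&xas);

Then only filemap_nr_thps_dec(mapping) would need to be undone on the
rollback path.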

> +                       filemap_nr_thps_dec(mapping);
> +               }
>                 xas_unlock_irq(&xas);
>
>                 new_page->mapping = NULL;
> --
> 2.35.1.894.gb6a874cedc-goog
>
Tong Tiangen March 29, 2022, 4:02 a.m. UTC | #2
On 2022/3/24 7:29, Jiaqi Yan wrote:
> Make collapse_file roll back when copying pages failed.
> More concretely:
> * extract copy operations into a separate loop
> * postpone the updates for nr_none until both scan and copy succeeded
> * postpone joining small xarray entries until both scan and copy
>    succeeded
> * as for update operations to NR_XXX_THPS
>      * for SHMEM file, postpone until both scan and copy succeeded
>      * for other file, roll back if scan succeeded but copy failed
> 
> Signed-off-by: Jiaqi Yan <jiaqiyan@google.com>
> ---
>   include/linux/highmem.h | 18 ++++++++++
>   mm/khugepaged.c         | 75 +++++++++++++++++++++++++++--------------
>   2 files changed, 67 insertions(+), 26 deletions(-)
> 
> diff --git a/include/linux/highmem.h b/include/linux/highmem.h
> index 15d0aa4d349c..fc5aa221bdb5 100644
> --- a/include/linux/highmem.h
> +++ b/include/linux/highmem.h
> @@ -315,6 +315,24 @@ static inline void copy_highpage(struct page *to, struct page *from)
>   	kunmap_local(vfrom);
>   }
>   
> +/*
> + * Machine check exception handled version of copy_highpage.
> + * Return true if copying page content failed; otherwise false.
> + */
> +static inline bool copy_highpage_mc(struct page *to, struct page *from)
> +{
> +	char *vfrom, *vto;
> +	unsigned long ret;
> +
> +	vfrom = kmap_local_page(from);
> +	vto = kmap_local_page(to);
> +	ret = copy_mc_to_kernel(vto, vfrom, PAGE_SIZE);
> +	kunmap_local(vto);
> +	kunmap_local(vfrom);
> +
> +	return ret > 0;
> +}
> +
>   #endif
>   
>   static inline void memcpy_page(struct page *dst_page, size_t dst_off,
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 84ed177f56ff..ed2b1cd4bbc6 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1708,12 +1708,13 @@ static void collapse_file(struct mm_struct *mm,
>   {
>   	struct address_space *mapping = file->f_mapping;
>   	gfp_t gfp;
> -	struct page *new_page;
> +	struct page *new_page, *page, *tmp;
>   	pgoff_t index, end = start + HPAGE_PMD_NR;
>   	LIST_HEAD(pagelist);
>   	XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
>   	int nr_none = 0, result = SCAN_SUCCEED;
>   	bool is_shmem = shmem_file(file);
> +	bool copy_failed = false;
>   	int nr;
>   
>   	VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
> @@ -1936,9 +1937,7 @@ static void collapse_file(struct mm_struct *mm,
>   	}
>   	nr = thp_nr_pages(new_page);
>   
> -	if (is_shmem)
> -		__mod_lruvec_page_state(new_page, NR_SHMEM_THPS, nr);
> -	else {
> +	if (!is_shmem) {
>   		__mod_lruvec_page_state(new_page, NR_FILE_THPS, nr);
>   		filemap_nr_thps_inc(mapping);
>   		/*
> @@ -1956,34 +1955,39 @@ static void collapse_file(struct mm_struct *mm,
>   		}
>   	}
>   
> -	if (nr_none) {
> -		__mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none);
> -		if (is_shmem)
> -			__mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
> -	}
> -
> -	/* Join all the small entries into a single multi-index entry */
> -	xas_set_order(&xas, start, HPAGE_PMD_ORDER);
> -	xas_store(&xas, new_page);
>   xa_locked:
>   	xas_unlock_irq(&xas);
>   xa_unlocked:
>   
>   	if (result == SCAN_SUCCEED) {
> -		struct page *page, *tmp;
> -
>   		/*
>   		 * Replacing old pages with new one has succeeded, now we
> -		 * need to copy the content and free the old pages.
> +		 * attempt to copy the contents.
>   		 */
>   		index = start;
> -		list_for_each_entry_safe(page, tmp, &pagelist, lru) {
> +		list_for_each_entry(page, &pagelist, lru) {
>   			while (index < page->index) {
>   				clear_highpage(new_page + (index % HPAGE_PMD_NR));
>   				index++;
>   			}
> -			copy_highpage(new_page + (page->index % HPAGE_PMD_NR),
> -					page);
> +			if (copy_highpage_mc(new_page + (page->index % HPAGE_PMD_NR), page)) {
> +				copy_failed = true;

The 1st patch here used "copy_succeed = false"; it would be best to
unify the logic in these two places.

> +				break;
> +			}
> +			index++;
> +		}
> +		while (!copy_failed && index < end) {
> +			clear_highpage(new_page + (page->index % HPAGE_PMD_NR));
> +			index++;
> +		}
> +	}
> +
> +	if (result == SCAN_SUCCEED && !copy_failed) {
> +		/*
> +		 * Copying old pages to huge one has succeeded, now we
> +		 * need to free the old pages.
> +		 */
> +		list_for_each_entry_safe(page, tmp, &pagelist, lru) {
>   			list_del(&page->lru);
>   			page->mapping = NULL;
>   			page_ref_unfreeze(page, 1);
> @@ -1991,12 +1995,20 @@ static void collapse_file(struct mm_struct *mm,
>   			ClearPageUnevictable(page);
>   			unlock_page(page);
>   			put_page(page);
> -			index++;
>   		}
> -		while (index < end) {
> -			clear_highpage(new_page + (index % HPAGE_PMD_NR));
> -			index++;
> +
> +		xas_lock_irq(&xas);
> +		if (is_shmem)
> +			__mod_lruvec_page_state(new_page, NR_SHMEM_THPS, nr);
> +		if (nr_none) {
> +			__mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none);
> +			if (is_shmem)
> +				__mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
>   		}
> +		/* Join all the small entries into a single multi-index entry. */
> +		xas_set_order(&xas, start, HPAGE_PMD_ORDER);
> +		xas_store(&xas, new_page);
> +		xas_unlock_irq(&xas);
>   
>   		SetPageUptodate(new_page);
>   		page_ref_add(new_page, HPAGE_PMD_NR - 1);
> @@ -2012,9 +2024,11 @@ static void collapse_file(struct mm_struct *mm,
>   
>   		khugepaged_pages_collapsed++;
>   	} else {
> -		struct page *page;
> -
> -		/* Something went wrong: roll back page cache changes */
> +		/*
> +		 * Something went wrong:
> +		 * either result != SCAN_SUCCEED or copy_failed,
> +		 * roll back page cache changes
> +		 */
>   		xas_lock_irq(&xas);
>   		mapping->nrpages -= nr_none;
>   
> @@ -2047,6 +2061,15 @@ static void collapse_file(struct mm_struct *mm,
>   			xas_lock_irq(&xas);
>   		}
>   		VM_BUG_ON(nr_none);
> +		/*
> +		 * Undo the updates of thp_nr_pages(new_page) for non-SHMEM file,
> +		 * which is not updated yet for SHMEM file.
> +		 * These undos are not needed if result is not SCAN_SUCCEED.
> +		 */
> +		if (!is_shmem && result == SCAN_SUCCEED) {
> +			__mod_lruvec_page_state(new_page, NR_FILE_THPS, -nr);
> +			filemap_nr_thps_dec(mapping);
> +		}
>   		xas_unlock_irq(&xas);
>   
>   		new_page->mapping = NULL;
Jiaqi Yan April 5, 2022, 8:46 p.m. UTC | #3
On Mon, Mar 28, 2022 at 9:02 PM Tong Tiangen <tongtiangen@huawei.com> wrote:
>
>
>
> On 2022/3/24 7:29, Jiaqi Yan wrote:
> > Make collapse_file roll back when copying pages failed.
> > More concretely:
> > * extract copy operations into a separate loop
> > * postpone the updates for nr_none until both scan and copy succeeded
> > * postpone joining small xarray entries until both scan and copy
> >    succeeded
> > * as for update operations to NR_XXX_THPS
> >      * for SHMEM file, postpone until both scan and copy succeeded
> >      * for other file, roll back if scan succeeded but copy failed
> >
> > Signed-off-by: Jiaqi Yan <jiaqiyan@google.com>
> > ---
> >   include/linux/highmem.h | 18 ++++++++++
> >   mm/khugepaged.c         | 75 +++++++++++++++++++++++++++--------------
> >   2 files changed, 67 insertions(+), 26 deletions(-)
> >
> > diff --git a/include/linux/highmem.h b/include/linux/highmem.h
> > index 15d0aa4d349c..fc5aa221bdb5 100644
> > --- a/include/linux/highmem.h
> > +++ b/include/linux/highmem.h
> > @@ -315,6 +315,24 @@ static inline void copy_highpage(struct page *to, struct page *from)
> >       kunmap_local(vfrom);
> >   }
> >
> > +/*
> > + * Machine check exception handled version of copy_highpage.
> > + * Return true if copying page content failed; otherwise false.
> > + */
> > +static inline bool copy_highpage_mc(struct page *to, struct page *from)
> > +{
> > +     char *vfrom, *vto;
> > +     unsigned long ret;
> > +
> > +     vfrom = kmap_local_page(from);
> > +     vto = kmap_local_page(to);
> > +     ret = copy_mc_to_kernel(vto, vfrom, PAGE_SIZE);
> > +     kunmap_local(vto);
> > +     kunmap_local(vfrom);
> > +
> > +     return ret > 0;
> > +}
> > +
> >   #endif
> >
> >   static inline void memcpy_page(struct page *dst_page, size_t dst_off,
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index 84ed177f56ff..ed2b1cd4bbc6 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -1708,12 +1708,13 @@ static void collapse_file(struct mm_struct *mm,
> >   {
> >       struct address_space *mapping = file->f_mapping;
> >       gfp_t gfp;
> > -     struct page *new_page;
> > +     struct page *new_page, *page, *tmp;
> >       pgoff_t index, end = start + HPAGE_PMD_NR;
> >       LIST_HEAD(pagelist);
> >       XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
> >       int nr_none = 0, result = SCAN_SUCCEED;
> >       bool is_shmem = shmem_file(file);
> > +     bool copy_failed = false;
> >       int nr;
> >
> >       VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
> > @@ -1936,9 +1937,7 @@ static void collapse_file(struct mm_struct *mm,
> >       }
> >       nr = thp_nr_pages(new_page);
> >
> > -     if (is_shmem)
> > -             __mod_lruvec_page_state(new_page, NR_SHMEM_THPS, nr);
> > -     else {
> > +     if (!is_shmem) {
> >               __mod_lruvec_page_state(new_page, NR_FILE_THPS, nr);
> >               filemap_nr_thps_inc(mapping);
> >               /*
> > @@ -1956,34 +1955,39 @@ static void collapse_file(struct mm_struct *mm,
> >               }
> >       }
> >
> > -     if (nr_none) {
> > -             __mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none);
> > -             if (is_shmem)
> > -                     __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
> > -     }
> > -
> > -     /* Join all the small entries into a single multi-index entry */
> > -     xas_set_order(&xas, start, HPAGE_PMD_ORDER);
> > -     xas_store(&xas, new_page);
> >   xa_locked:
> >       xas_unlock_irq(&xas);
> >   xa_unlocked:
> >
> >       if (result == SCAN_SUCCEED) {
> > -             struct page *page, *tmp;
> > -
> >               /*
> >                * Replacing old pages with new one has succeeded, now we
> > -              * need to copy the content and free the old pages.
> > +              * attempt to copy the contents.
> >                */
> >               index = start;
> > -             list_for_each_entry_safe(page, tmp, &pagelist, lru) {
> > +             list_for_each_entry(page, &pagelist, lru) {
> >                       while (index < page->index) {
> >                               clear_highpage(new_page + (index % HPAGE_PMD_NR));
> >                               index++;
> >                       }
> > -                     copy_highpage(new_page + (page->index % HPAGE_PMD_NR),
> > -                                     page);
> > +                     if (copy_highpage_mc(new_page + (page->index % HPAGE_PMD_NR), page)) {
> > +                             copy_failed = true;
>
> The 1st patch here used "copy_succeed = false"; it would be best to
> unify the logic in these two places.

copy_failed here will be eliminated once we have SCAN_COPY_MC
defined in version 2. Version 2 also renames "copy_succeeded"
in collapse_huge_page() to "copied", mimicking the "isolated" variable
for __collapse_huge_page_isolate().


>
> > +                             break;
> > +                     }
> > +                     index++;
> > +             }
> > +             while (!copy_failed && index < end) {
> > +                     clear_highpage(new_page + (page->index % HPAGE_PMD_NR));
> > +                     index++;
> > +             }
> > +     }
> > +
> > +     if (result == SCAN_SUCCEED && !copy_failed) {
> > +             /*
> > +              * Copying old pages to huge one has succeeded, now we
> > +              * need to free the old pages.
> > +              */
> > +             list_for_each_entry_safe(page, tmp, &pagelist, lru) {
> >                       list_del(&page->lru);
> >                       page->mapping = NULL;
> >                       page_ref_unfreeze(page, 1);
> > @@ -1991,12 +1995,20 @@ static void collapse_file(struct mm_struct *mm,
> >                       ClearPageUnevictable(page);
> >                       unlock_page(page);
> >                       put_page(page);
> > -                     index++;
> >               }
> > -             while (index < end) {
> > -                     clear_highpage(new_page + (index % HPAGE_PMD_NR));
> > -                     index++;
> > +
> > +             xas_lock_irq(&xas);
> > +             if (is_shmem)
> > +                     __mod_lruvec_page_state(new_page, NR_SHMEM_THPS, nr);
> > +             if (nr_none) {
> > +                     __mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none);
> > +                     if (is_shmem)
> > +                             __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
> >               }
> > +             /* Join all the small entries into a single multi-index entry. */
> > +             xas_set_order(&xas, start, HPAGE_PMD_ORDER);
> > +             xas_store(&xas, new_page);
> > +             xas_unlock_irq(&xas);
> >
> >               SetPageUptodate(new_page);
> >               page_ref_add(new_page, HPAGE_PMD_NR - 1);
> > @@ -2012,9 +2024,11 @@ static void collapse_file(struct mm_struct *mm,
> >
> >               khugepaged_pages_collapsed++;
> >       } else {
> > -             struct page *page;
> > -
> > -             /* Something went wrong: roll back page cache changes */
> > +             /*
> > +              * Something went wrong:
> > +              * either result != SCAN_SUCCEED or copy_failed,
> > +              * roll back page cache changes
> > +              */
> >               xas_lock_irq(&xas);
> >               mapping->nrpages -= nr_none;
> >
> > @@ -2047,6 +2061,15 @@ static void collapse_file(struct mm_struct *mm,
> >                       xas_lock_irq(&xas);
> >               }
> >               VM_BUG_ON(nr_none);
> > +             /*
> > +              * Undo the updates of thp_nr_pages(new_page) for non-SHMEM file,
> > +              * which is not updated yet for SHMEM file.
> > +              * These undos are not needed if result is not SCAN_SUCCEED.
> > +              */
> > +             if (!is_shmem && result == SCAN_SUCCEED) {
> > +                     __mod_lruvec_page_state(new_page, NR_FILE_THPS, -nr);
> > +                     filemap_nr_thps_dec(mapping);
> > +             }
> >               xas_unlock_irq(&xas);
> >
> >               new_page->mapping = NULL;
Jiaqi Yan April 5, 2022, 8:46 p.m. UTC | #4
On Mon, Mar 28, 2022 at 4:37 PM Yang Shi <shy828301@gmail.com> wrote:
>
> On Wed, Mar 23, 2022 at 4:29 PM Jiaqi Yan <jiaqiyan@google.com> wrote:
> >
> > Make collapse_file roll back when copying pages failed.
> > More concretely:
> > * extract copy operations into a separate loop
> > * postpone the updates for nr_none until both scan and copy succeeded
> > * postpone joining small xarray entries until both scan and copy
> >   succeeded
> > * as for update operations to NR_XXX_THPS
> >     * for SHMEM file, postpone until both scan and copy succeeded
> >     * for other file, roll back if scan succeeded but copy failed
> >
> > Signed-off-by: Jiaqi Yan <jiaqiyan@google.com>
> > ---
> >  include/linux/highmem.h | 18 ++++++++++
> >  mm/khugepaged.c         | 75 +++++++++++++++++++++++++++--------------
> >  2 files changed, 67 insertions(+), 26 deletions(-)
> >
> > diff --git a/include/linux/highmem.h b/include/linux/highmem.h
> > index 15d0aa4d349c..fc5aa221bdb5 100644
> > --- a/include/linux/highmem.h
> > +++ b/include/linux/highmem.h
> > @@ -315,6 +315,24 @@ static inline void copy_highpage(struct page *to, struct page *from)
> >         kunmap_local(vfrom);
> >  }
> >
> > +/*
> > + * Machine check exception handled version of copy_highpage.
> > + * Return true if copying page content failed; otherwise false.
> > + */
> > +static inline bool copy_highpage_mc(struct page *to, struct page *from)
> > +{
> > +       char *vfrom, *vto;
> > +       unsigned long ret;
> > +
> > +       vfrom = kmap_local_page(from);
> > +       vto = kmap_local_page(to);
> > +       ret = copy_mc_to_kernel(vto, vfrom, PAGE_SIZE);
> > +       kunmap_local(vto);
> > +       kunmap_local(vfrom);
> > +
> > +       return ret > 0;
> > +}
> > +
> >  #endif
> >
> >  static inline void memcpy_page(struct page *dst_page, size_t dst_off,
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index 84ed177f56ff..ed2b1cd4bbc6 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -1708,12 +1708,13 @@ static void collapse_file(struct mm_struct *mm,
> >  {
> >         struct address_space *mapping = file->f_mapping;
> >         gfp_t gfp;
> > -       struct page *new_page;
> > +       struct page *new_page, *page, *tmp;
>
> It seems you removed the "struct page *page" from "if (result ==
> SCAN_SUCCEED)" but kept the "struct page *page" under "for (index =
> start; index < end; index++)". I think the "struct page *page" in the
> for loop could be removed too.

I agree. Removed in the next version.

>
> >         pgoff_t index, end = start + HPAGE_PMD_NR;
> >         LIST_HEAD(pagelist);
> >         XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
> >         int nr_none = 0, result = SCAN_SUCCEED;
> >         bool is_shmem = shmem_file(file);
> > +       bool copy_failed = false;
> >         int nr;
> >
> >         VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
> > @@ -1936,9 +1937,7 @@ static void collapse_file(struct mm_struct *mm,
> >         }
> >         nr = thp_nr_pages(new_page);
> >
> > -       if (is_shmem)
> > -               __mod_lruvec_page_state(new_page, NR_SHMEM_THPS, nr);
> > -       else {
> > +       if (!is_shmem) {
> >                 __mod_lruvec_page_state(new_page, NR_FILE_THPS, nr);
> >                 filemap_nr_thps_inc(mapping);
> >                 /*
> > @@ -1956,34 +1955,39 @@ static void collapse_file(struct mm_struct *mm,
> >                 }
> >         }
> >
> > -       if (nr_none) {
> > -               __mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none);
> > -               if (is_shmem)
> > -                       __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
> > -       }
> > -
> > -       /* Join all the small entries into a single multi-index entry */
> > -       xas_set_order(&xas, start, HPAGE_PMD_ORDER);
> > -       xas_store(&xas, new_page);
> >  xa_locked:
> >         xas_unlock_irq(&xas);
> >  xa_unlocked:
> >
> >         if (result == SCAN_SUCCEED) {
> > -               struct page *page, *tmp;
> > -
> >                 /*
> >                  * Replacing old pages with new one has succeeded, now we
> > -                * need to copy the content and free the old pages.
> > +                * attempt to copy the contents.
> >                  */
> >                 index = start;
> > -               list_for_each_entry_safe(page, tmp, &pagelist, lru) {
> > +               list_for_each_entry(page, &pagelist, lru) {
> >                         while (index < page->index) {
> >                                 clear_highpage(new_page + (index % HPAGE_PMD_NR));
> >                                 index++;
> >                         }
> > -                       copy_highpage(new_page + (page->index % HPAGE_PMD_NR),
> > -                                       page);
> > +                       if (copy_highpage_mc(new_page + (page->index % HPAGE_PMD_NR), page)) {
> > +                               copy_failed = true;
> > +                               break;
> > +                       }
> > +                       index++;
> > +               }
> > +               while (!copy_failed && index < end) {
> > +                       clear_highpage(new_page + (page->index % HPAGE_PMD_NR));
> > +                       index++;
> > +               }
> > +       }
> > +
> > +       if (result == SCAN_SUCCEED && !copy_failed) {
>
> I think you could set "result" to SCAN_COPY_MC (the same as the
> anonymous one), then you could drop !copy_failed and use "result"
> alone afterwards.

Yes, the copy_failed variable will be eliminated in the next version.

>
> > +               /*
> > +                * Copying old pages to huge one has succeeded, now we
> > +                * need to free the old pages.
> > +                */
> > +               list_for_each_entry_safe(page, tmp, &pagelist, lru) {
> >                         list_del(&page->lru);
> >                         page->mapping = NULL;
> >                         page_ref_unfreeze(page, 1);
> > @@ -1991,12 +1995,20 @@ static void collapse_file(struct mm_struct *mm,
> >                         ClearPageUnevictable(page);
> >                         unlock_page(page);
> >                         put_page(page);
> > -                       index++;
> >                 }
> > -               while (index < end) {
> > -                       clear_highpage(new_page + (index % HPAGE_PMD_NR));
> > -                       index++;
> > +
> > +               xas_lock_irq(&xas);
> > +               if (is_shmem)
> > +                       __mod_lruvec_page_state(new_page, NR_SHMEM_THPS, nr);
> > +               if (nr_none) {
> > +                       __mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none);
> > +                       if (is_shmem)
> > +                               __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
> >                 }
> > +               /* Join all the small entries into a single multi-index entry. */
> > +               xas_set_order(&xas, start, HPAGE_PMD_ORDER);
> > +               xas_store(&xas, new_page);
> > +               xas_unlock_irq(&xas);
> >
> >                 SetPageUptodate(new_page);
> >                 page_ref_add(new_page, HPAGE_PMD_NR - 1);
> > @@ -2012,9 +2024,11 @@ static void collapse_file(struct mm_struct *mm,
> >
> >                 khugepaged_pages_collapsed++;
> >         } else {
> > -               struct page *page;
> > -
> > -               /* Something went wrong: roll back page cache changes */
> > +               /*
> > +                * Something went wrong:
> > +                * either result != SCAN_SUCCEED or copy_failed,
> > +                * roll back page cache changes
> > +                */
> >                 xas_lock_irq(&xas);
> >                 mapping->nrpages -= nr_none;
> >
> > @@ -2047,6 +2061,15 @@ static void collapse_file(struct mm_struct *mm,
> >                         xas_lock_irq(&xas);
> >                 }
> >                 VM_BUG_ON(nr_none);
> > +               /*
> > +                * Undo the updates of thp_nr_pages(new_page) for non-SHMEM file,
> > +                * which is not updated yet for SHMEM file.
> > +                * These undos are not needed if result is not SCAN_SUCCEED.
> > +                */
> > +               if (!is_shmem && result == SCAN_SUCCEED) {
>
> Handling the error case when "result == SCAN_SUCCEED" looks awkward.
> With the above fixed (set result to SCAN_COPY_MC) we could avoid the
> awkwardness.
>

Will look less awkward in the next version :)

> > +                       __mod_lruvec_page_state(new_page, NR_FILE_THPS, -nr);
>
> I'm wondering whether we may defer the NR_FILE_THPS update just like
> SHMEM, because it doesn't have to be updated in advance, so that we
> have the user-visible counter updates in a single place. Just
> filemap_nr_thps needs to be updated in advance, since it is used to
> sync with the file open path to truncate huge pages.

In version 2, I will postpone __mod_lruvec_page_state() until copying
has succeeded, and undo filemap_nr_thps_inc() if (!is_shmem && result ==
SCAN_COPY_MC).
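
Roughly like this (sketch of the plan only, the exact code will be in
the v2 posting):

	/* rollback path, after VM_BUG_ON(nr_none) */
	if (!is_shmem && result == SCAN_COPY_MC)
		filemap_nr_thps_dec(mapping);
	xas_unlock_irq(&xas);

with the NR_FILE_THPS / NR_SHMEM_THPS updates both moved into the
success branch, so no NR_FILE_THPS undo is needed there.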

>
> > +                       filemap_nr_thps_dec(mapping);
> > +               }
> >                 xas_unlock_irq(&xas);
> >
> >                 new_page->mapping = NULL;
> > --
> > 2.35.1.894.gb6a874cedc-goog
> >

Thanks for the valuable feedback. I will send out the v2 RFC separately;
please leave comments and replies there. Thanks!

Patch

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 15d0aa4d349c..fc5aa221bdb5 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -315,6 +315,24 @@  static inline void copy_highpage(struct page *to, struct page *from)
 	kunmap_local(vfrom);
 }
 
+/*
+ * Machine check exception handled version of copy_highpage.
+ * Return true if copying page content failed; otherwise false.
+ */
+static inline bool copy_highpage_mc(struct page *to, struct page *from)
+{
+	char *vfrom, *vto;
+	unsigned long ret;
+
+	vfrom = kmap_local_page(from);
+	vto = kmap_local_page(to);
+	ret = copy_mc_to_kernel(vto, vfrom, PAGE_SIZE);
+	kunmap_local(vto);
+	kunmap_local(vfrom);
+
+	return ret > 0;
+}
+
 #endif
 
 static inline void memcpy_page(struct page *dst_page, size_t dst_off,
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 84ed177f56ff..ed2b1cd4bbc6 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1708,12 +1708,13 @@  static void collapse_file(struct mm_struct *mm,
 {
 	struct address_space *mapping = file->f_mapping;
 	gfp_t gfp;
-	struct page *new_page;
+	struct page *new_page, *page, *tmp;
 	pgoff_t index, end = start + HPAGE_PMD_NR;
 	LIST_HEAD(pagelist);
 	XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
 	int nr_none = 0, result = SCAN_SUCCEED;
 	bool is_shmem = shmem_file(file);
+	bool copy_failed = false;
 	int nr;
 
 	VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
@@ -1936,9 +1937,7 @@  static void collapse_file(struct mm_struct *mm,
 	}
 	nr = thp_nr_pages(new_page);
 
-	if (is_shmem)
-		__mod_lruvec_page_state(new_page, NR_SHMEM_THPS, nr);
-	else {
+	if (!is_shmem) {
 		__mod_lruvec_page_state(new_page, NR_FILE_THPS, nr);
 		filemap_nr_thps_inc(mapping);
 		/*
@@ -1956,34 +1955,39 @@  static void collapse_file(struct mm_struct *mm,
 		}
 	}
 
-	if (nr_none) {
-		__mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none);
-		if (is_shmem)
-			__mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
-	}
-
-	/* Join all the small entries into a single multi-index entry */
-	xas_set_order(&xas, start, HPAGE_PMD_ORDER);
-	xas_store(&xas, new_page);
 xa_locked:
 	xas_unlock_irq(&xas);
 xa_unlocked:
 
 	if (result == SCAN_SUCCEED) {
-		struct page *page, *tmp;
-
 		/*
 		 * Replacing old pages with new one has succeeded, now we
-		 * need to copy the content and free the old pages.
+		 * attempt to copy the contents.
 		 */
 		index = start;
-		list_for_each_entry_safe(page, tmp, &pagelist, lru) {
+		list_for_each_entry(page, &pagelist, lru) {
 			while (index < page->index) {
 				clear_highpage(new_page + (index % HPAGE_PMD_NR));
 				index++;
 			}
-			copy_highpage(new_page + (page->index % HPAGE_PMD_NR),
-					page);
+			if (copy_highpage_mc(new_page + (page->index % HPAGE_PMD_NR), page)) {
+				copy_failed = true;
+				break;
+			}
+			index++;
+		}
+		while (!copy_failed && index < end) {
+			clear_highpage(new_page + (page->index % HPAGE_PMD_NR));
+			index++;
+		}
+	}
+
+	if (result == SCAN_SUCCEED && !copy_failed) {
+		/*
+		 * Copying old pages to huge one has succeeded, now we
+		 * need to free the old pages.
+		 */
+		list_for_each_entry_safe(page, tmp, &pagelist, lru) {
 			list_del(&page->lru);
 			page->mapping = NULL;
 			page_ref_unfreeze(page, 1);
@@ -1991,12 +1995,20 @@  static void collapse_file(struct mm_struct *mm,
 			ClearPageUnevictable(page);
 			unlock_page(page);
 			put_page(page);
-			index++;
 		}
-		while (index < end) {
-			clear_highpage(new_page + (index % HPAGE_PMD_NR));
-			index++;
+
+		xas_lock_irq(&xas);
+		if (is_shmem)
+			__mod_lruvec_page_state(new_page, NR_SHMEM_THPS, nr);
+		if (nr_none) {
+			__mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none);
+			if (is_shmem)
+				__mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
 		}
+		/* Join all the small entries into a single multi-index entry. */
+		xas_set_order(&xas, start, HPAGE_PMD_ORDER);
+		xas_store(&xas, new_page);
+		xas_unlock_irq(&xas);
 
 		SetPageUptodate(new_page);
 		page_ref_add(new_page, HPAGE_PMD_NR - 1);
@@ -2012,9 +2024,11 @@  static void collapse_file(struct mm_struct *mm,
 
 		khugepaged_pages_collapsed++;
 	} else {
-		struct page *page;
-
-		/* Something went wrong: roll back page cache changes */
+		/*
+		 * Something went wrong:
+		 * either result != SCAN_SUCCEED or copy_failed,
+		 * roll back page cache changes
+		 */
 		xas_lock_irq(&xas);
 		mapping->nrpages -= nr_none;
 
@@ -2047,6 +2061,15 @@  static void collapse_file(struct mm_struct *mm,
 			xas_lock_irq(&xas);
 		}
 		VM_BUG_ON(nr_none);
+		/*
+		 * Undo the updates of thp_nr_pages(new_page) for non-SHMEM file,
+		 * which is not updated yet for SHMEM file.
+		 * These undos are not needed if result is not SCAN_SUCCEED.
+		 */
+		if (!is_shmem && result == SCAN_SUCCEED) {
+			__mod_lruvec_page_state(new_page, NR_FILE_THPS, -nr);
+			filemap_nr_thps_dec(mapping);
+		}
 		xas_unlock_irq(&xas);
 
 		new_page->mapping = NULL;