
[v10,3/3] mm/khugepaged: recover from poisoned file-backed memory

Message ID 20230305065112.1932255-4-jiaqiyan@google.com (mailing list archive)
State New
Series Memory poison recovery in khugepaged collapsing

Commit Message

Jiaqi Yan March 5, 2023, 6:51 a.m. UTC
Make collapse_file roll back when copying pages fails. More concretely:
- extract the copying operations into a separate loop
- postpone the updates for nr_none until both scanning and copying
  have succeeded
- postpone joining small xarray entries until both scanning and copying
  have succeeded
- postpone the updates to the NR_XXX_THPS counters until both scanning
  and copying have succeeded
- for non-SHMEM files, roll back filemap_nr_thps_inc if scanning
  succeeded but copying failed

Tested manually:
0. Enable khugepaged on the system under test. Mount tmpfs at /mnt/ramdisk.
1. Start a two-thread application. Each thread allocates a chunk of
   non-huge memory buffer from /mnt/ramdisk.
2. Pick 4 random buffer addresses (2 in each thread) and inject
   uncorrectable memory errors at their physical addresses.
3. Signal both threads to make their memory buffers collapsible, i.e.
   by calling madvise(MADV_HUGEPAGE) (see the sketch after this list).
4. Wait and then check the kernel log: khugepaged is able to recover from
   the poisoned pages by skipping them.
5. Signal both threads to inspect their buffer contents and make sure
   there is no data corruption.
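
A minimal per-thread sketch of steps 1 and 3 is below (illustrative only:
the file name, buffer size, and pause-based waiting are placeholders, and
the error injection of step 2 is done externally and not shown):

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    #define BUF_SZ (2UL << 20)  /* one PMD-sized chunk; the real test uses a
                                   larger region so a PMD-aligned 2M extent
                                   is guaranteed to be covered */

    int main(void)
    {
        int fd = open("/mnt/ramdisk/buf", O_CREAT | O_RDWR, 0600);
        unsigned long i;
        char *buf;

        if (fd < 0 || ftruncate(fd, BUF_SZ))
            return 1;
        buf = mmap(NULL, BUF_SZ, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (buf == MAP_FAILED)
            return 1;

        /* Step 1: fault in the buffer so it is backed by small pages. */
        for (i = 0; i < BUF_SZ; i += 4096)
            buf[i] = 1;

        /* Step 2 happens here: poison a few of the backing pages. */

        /* Step 3: make the range eligible for khugepaged collapse. */
        if (madvise(buf, BUF_SZ, MADV_HUGEPAGE))
            perror("madvise");

        pause();  /* Step 4: wait while khugepaged scans and collapses. */
        return 0;
    }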

Signed-off-by: Jiaqi Yan <jiaqiyan@google.com>
---
 mm/khugepaged.c | 78 ++++++++++++++++++++++++++++++-------------------
 1 file changed, 48 insertions(+), 30 deletions(-)

Comments

Yang Shi March 24, 2023, 9:15 p.m. UTC | #1
On Sat, Mar 4, 2023 at 10:51 PM Jiaqi Yan <jiaqiyan@google.com> wrote:
>
> Make collapse_file roll back when copying pages failed. More concretely:
> - extract copying operations into a separate loop
> - postpone the updates for nr_none until both scanning and copying
>   succeeded
> - postpone joining small xarray entries until both scanning and copying
>   succeeded
> - postpone the update operations to NR_XXX_THPS until both scanning and
>   copying succeeded
> - for non-SHMEM file, roll back filemap_nr_thps_inc if scan succeeded but
>   copying failed
>
> Tested manually:
> 0. Enable khugepaged on system under test. Mount tmpfs at /mnt/ramdisk.
> 1. Start a two-thread application. Each thread allocates a chunk of
>    non-huge memory buffer from /mnt/ramdisk.
> 2. Pick 4 random buffer address (2 in each thread) and inject
>    uncorrectable memory errors at physical addresses.
> 3. Signal both threads to make their memory buffer collapsible, i.e.
>    calling madvise(MADV_HUGEPAGE).
> 4. Wait and then check kernel log: khugepaged is able to recover from
>    poisoned pages by skipping them.
> 5. Signal both threads to inspect their buffer contents and make sure no
>    data corruption.
>
> Signed-off-by: Jiaqi Yan <jiaqiyan@google.com>

Reviewed-by: Yang Shi <shy828301@gmail.com>

Just a nit below:

> ---
>  mm/khugepaged.c | 78 ++++++++++++++++++++++++++++++-------------------
>  1 file changed, 48 insertions(+), 30 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index c3c217f6ebc6e..3ea2aa55c2c52 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1890,6 +1890,9 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
>  {
>         struct address_space *mapping = file->f_mapping;
>         struct page *hpage;
> +       struct page *page;
> +       struct page *tmp;
> +       struct folio *folio;
>         pgoff_t index = 0, end = start + HPAGE_PMD_NR;
>         LIST_HEAD(pagelist);
>         XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
> @@ -1934,8 +1937,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
>
>         xas_set(&xas, start);
>         for (index = start; index < end; index++) {
> -               struct page *page = xas_next(&xas);
> -               struct folio *folio;
> +               page = xas_next(&xas);
>
>                 VM_BUG_ON(index != xas.xa_index);
>                 if (is_shmem) {
> @@ -2117,10 +2119,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
>         }
>         nr = thp_nr_pages(hpage);
>
> -       if (is_shmem)
> -               __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr);
> -       else {
> -               __mod_lruvec_page_state(hpage, NR_FILE_THPS, nr);
> +       if (!is_shmem) {
>                 filemap_nr_thps_inc(mapping);
>                 /*
>                  * Paired with smp_mb() in do_dentry_open() to ensure
> @@ -2131,21 +2130,10 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
>                 smp_mb();
>                 if (inode_is_open_for_write(mapping->host)) {
>                         result = SCAN_FAIL;
> -                       __mod_lruvec_page_state(hpage, NR_FILE_THPS, -nr);
>                         filemap_nr_thps_dec(mapping);
>                         goto xa_locked;
>                 }
>         }
> -
> -       if (nr_none) {
> -               __mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none);
> -               /* nr_none is always 0 for non-shmem. */
> -               __mod_lruvec_page_state(hpage, NR_SHMEM, nr_none);
> -       }
> -
> -       /* Join all the small entries into a single multi-index entry */
> -       xas_set_order(&xas, start, HPAGE_PMD_ORDER);
> -       xas_store(&xas, hpage);
>  xa_locked:
>         xas_unlock_irq(&xas);
>  xa_unlocked:
> @@ -2158,21 +2146,35 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
>         try_to_unmap_flush();
>
>         if (result == SCAN_SUCCEED) {
> -               struct page *page, *tmp;
> -               struct folio *folio;
> -
>                 /*
>                  * Replacing old pages with new one has succeeded, now we
> -                * need to copy the content and free the old pages.
> +                * attempt to copy the contents.
>                  */
>                 index = start;
> -               list_for_each_entry_safe(page, tmp, &pagelist, lru) {
> +               list_for_each_entry(page, &pagelist, lru) {
>                         while (index < page->index) {
>                                 clear_highpage(hpage + (index % HPAGE_PMD_NR));
>                                 index++;
>                         }
> -                       copy_highpage(hpage + (page->index % HPAGE_PMD_NR),
> -                                     page);
> +                       if (copy_mc_highpage(hpage + (page->index % HPAGE_PMD_NR),
> +                                            page) > 0) {
> +                               result = SCAN_COPY_MC;
> +                               break;
> +                       }
> +                       index++;
> +               }
> +               while (result == SCAN_SUCCEED && index < end) {
> +                       clear_highpage(hpage + (index % HPAGE_PMD_NR));
> +                       index++;
> +               }
> +       }
> +
> +       if (result == SCAN_SUCCEED) {
> +               /*
> +                * Copying old pages to huge one has succeeded, now we
> +                * need to free the old pages.
> +                */
> +               list_for_each_entry_safe(page, tmp, &pagelist, lru) {
>                         list_del(&page->lru);
>                         page->mapping = NULL;
>                         page_ref_unfreeze(page, 1);
> @@ -2180,12 +2182,23 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
>                         ClearPageUnevictable(page);
>                         unlock_page(page);
>                         put_page(page);
> -                       index++;
>                 }
> -               while (index < end) {
> -                       clear_highpage(hpage + (index % HPAGE_PMD_NR));
> -                       index++;
> +
> +               xas_lock_irq(&xas);
> +               if (is_shmem)
> +                       __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr);
> +               else
> +                       __mod_lruvec_page_state(hpage, NR_FILE_THPS, nr);
> +
> +               if (nr_none) {
> +                       __mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none);
> +                       /* nr_none is always 0 for non-shmem. */
> +                       __mod_lruvec_page_state(hpage, NR_SHMEM, nr_none);
>                 }
> +               /* Join all the small entries into a single multi-index entry. */
> +               xas_set_order(&xas, start, HPAGE_PMD_ORDER);
> +               xas_store(&xas, hpage);
> +               xas_unlock_irq(&xas);
>
>                 folio = page_folio(hpage);
>                 folio_mark_uptodate(folio);
> @@ -2203,8 +2216,6 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
>                 unlock_page(hpage);
>                 hpage = NULL;
>         } else {
> -               struct page *page;
> -
>                 /* Something went wrong: roll back page cache changes */
>                 xas_lock_irq(&xas);
>                 if (nr_none) {
> @@ -2238,6 +2249,13 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
>                         xas_lock_irq(&xas);
>                 }
>                 VM_BUG_ON(nr_none);
> +               /*
> +                * Undo the updates of filemap_nr_thps_inc for non-SHMEM file only.
> +                * This undo is not needed unless failure is due to SCAN_COPY_MC.
> +                */
> +               if (!is_shmem && result == SCAN_COPY_MC)
> +                       filemap_nr_thps_dec(mapping);

We may need a memory barrier here. Missing the memory barrier is not a
fatal issue though; the worst case is an unnecessary truncate from the
open path if it sees an obsolete nr_thps counter. It may be better to
handle this in a follow-up patch by moving the smp_mb() into the
filemap_nr_thps_* functions.
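
For reference, such a follow-up might look roughly like the sketch below
(based on the existing CONFIG_READ_ONLY_THP_FOR_FS-guarded helpers in
include/linux/pagemap.h; the exact barrier and its pairing would still
need review):

static inline void filemap_nr_thps_dec(struct address_space *mapping)
{
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
        atomic_dec(&mapping->nr_thps);
        /* Paired with smp_mb() in do_dentry_open(). */
        smp_mb();
#else
        WARN_ON_ONCE(1);
#endif
}

/* filemap_nr_thps_inc() would gain the same barrier. */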

> +
>                 xas_unlock_irq(&xas);
>
>                 hpage->mapping = NULL;
> --
> 2.40.0.rc0.216.gc4246ad0f0-goog
>
Jiaqi Yan March 24, 2023, 10:54 p.m. UTC | #2
On Fri, Mar 24, 2023 at 2:15 PM Yang Shi <shy828301@gmail.com> wrote:
>
> On Sat, Mar 4, 2023 at 10:51 PM Jiaqi Yan <jiaqiyan@google.com> wrote:
> >
> > Make collapse_file roll back when copying pages failed. More concretely:
> > - extract copying operations into a separate loop
> > - postpone the updates for nr_none until both scanning and copying
> >   succeeded
> > - postpone joining small xarray entries until both scanning and copying
> >   succeeded
> > - postpone the update operations to NR_XXX_THPS until both scanning and
> >   copying succeeded
> > - for non-SHMEM file, roll back filemap_nr_thps_inc if scan succeeded but
> >   copying failed
> >
> > Tested manually:
> > 0. Enable khugepaged on system under test. Mount tmpfs at /mnt/ramdisk.
> > 1. Start a two-thread application. Each thread allocates a chunk of
> >    non-huge memory buffer from /mnt/ramdisk.
> > 2. Pick 4 random buffer address (2 in each thread) and inject
> >    uncorrectable memory errors at physical addresses.
> > 3. Signal both threads to make their memory buffer collapsible, i.e.
> >    calling madvise(MADV_HUGEPAGE).
> > 4. Wait and then check kernel log: khugepaged is able to recover from
> >    poisoned pages by skipping them.
> > 5. Signal both threads to inspect their buffer contents and make sure no
> >    data corruption.
> >
> > Signed-off-by: Jiaqi Yan <jiaqiyan@google.com>
>
> Reviewed-by: Yang Shi <shy828301@gmail.com>
>
> Just a nit below:
>
> > ---
> >  mm/khugepaged.c | 78 ++++++++++++++++++++++++++++++-------------------
> >  1 file changed, 48 insertions(+), 30 deletions(-)
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index c3c217f6ebc6e..3ea2aa55c2c52 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -1890,6 +1890,9 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
> >  {
> >         struct address_space *mapping = file->f_mapping;
> >         struct page *hpage;
> > +       struct page *page;
> > +       struct page *tmp;
> > +       struct folio *folio;
> >         pgoff_t index = 0, end = start + HPAGE_PMD_NR;
> >         LIST_HEAD(pagelist);
> >         XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
> > @@ -1934,8 +1937,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
> >
> >         xas_set(&xas, start);
> >         for (index = start; index < end; index++) {
> > -               struct page *page = xas_next(&xas);
> > -               struct folio *folio;
> > +               page = xas_next(&xas);
> >
> >                 VM_BUG_ON(index != xas.xa_index);
> >                 if (is_shmem) {
> > @@ -2117,10 +2119,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
> >         }
> >         nr = thp_nr_pages(hpage);
> >
> > -       if (is_shmem)
> > -               __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr);
> > -       else {
> > -               __mod_lruvec_page_state(hpage, NR_FILE_THPS, nr);
> > +       if (!is_shmem) {
> >                 filemap_nr_thps_inc(mapping);
> >                 /*
> >                  * Paired with smp_mb() in do_dentry_open() to ensure
> > @@ -2131,21 +2130,10 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
> >                 smp_mb();
> >                 if (inode_is_open_for_write(mapping->host)) {
> >                         result = SCAN_FAIL;
> > -                       __mod_lruvec_page_state(hpage, NR_FILE_THPS, -nr);
> >                         filemap_nr_thps_dec(mapping);
> >                         goto xa_locked;

I notice the "goto xa_locked" statement can now be removed, since the
code between here and the xa_locked label is all gone.
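
i.e. the block could become something like the sketch below (control then
simply falls through to the xas_unlock_irq() at xa_locked; the label
itself may still have other users elsewhere in collapse_file):

                smp_mb();
                if (inode_is_open_for_write(mapping->host)) {
                        result = SCAN_FAIL;
                        filemap_nr_thps_dec(mapping);
                        /* fall through to xa_locked: xas_unlock_irq(&xas) */
                }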

> >                 }
> >         }
> > -
> > -       if (nr_none) {
> > -               __mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none);
> > -               /* nr_none is always 0 for non-shmem. */
> > -               __mod_lruvec_page_state(hpage, NR_SHMEM, nr_none);
> > -       }
> > -
> > -       /* Join all the small entries into a single multi-index entry */
> > -       xas_set_order(&xas, start, HPAGE_PMD_ORDER);
> > -       xas_store(&xas, hpage);
> >  xa_locked:
> >         xas_unlock_irq(&xas);
> >  xa_unlocked:
> > @@ -2158,21 +2146,35 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
> >         try_to_unmap_flush();
> >
> >         if (result == SCAN_SUCCEED) {
> > -               struct page *page, *tmp;
> > -               struct folio *folio;
> > -
> >                 /*
> >                  * Replacing old pages with new one has succeeded, now we
> > -                * need to copy the content and free the old pages.
> > +                * attempt to copy the contents.
> >                  */
> >                 index = start;
> > -               list_for_each_entry_safe(page, tmp, &pagelist, lru) {
> > +               list_for_each_entry(page, &pagelist, lru) {
> >                         while (index < page->index) {
> >                                 clear_highpage(hpage + (index % HPAGE_PMD_NR));
> >                                 index++;
> >                         }
> > -                       copy_highpage(hpage + (page->index % HPAGE_PMD_NR),
> > -                                     page);
> > +                       if (copy_mc_highpage(hpage + (page->index % HPAGE_PMD_NR),
> > +                                            page) > 0) {
> > +                               result = SCAN_COPY_MC;
> > +                               break;
> > +                       }
> > +                       index++;
> > +               }
> > +               while (result == SCAN_SUCCEED && index < end) {
> > +                       clear_highpage(hpage + (index % HPAGE_PMD_NR));
> > +                       index++;
> > +               }
> > +       }
> > +
> > +       if (result == SCAN_SUCCEED) {
> > +               /*
> > +                * Copying old pages to huge one has succeeded, now we
> > +                * need to free the old pages.
> > +                */
> > +               list_for_each_entry_safe(page, tmp, &pagelist, lru) {
> >                         list_del(&page->lru);
> >                         page->mapping = NULL;
> >                         page_ref_unfreeze(page, 1);
> > @@ -2180,12 +2182,23 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
> >                         ClearPageUnevictable(page);
> >                         unlock_page(page);
> >                         put_page(page);
> > -                       index++;
> >                 }
> > -               while (index < end) {
> > -                       clear_highpage(hpage + (index % HPAGE_PMD_NR));
> > -                       index++;
> > +
> > +               xas_lock_irq(&xas);
> > +               if (is_shmem)
> > +                       __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr);
> > +               else
> > +                       __mod_lruvec_page_state(hpage, NR_FILE_THPS, nr);
> > +
> > +               if (nr_none) {
> > +                       __mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none);
> > +                       /* nr_none is always 0 for non-shmem. */
> > +                       __mod_lruvec_page_state(hpage, NR_SHMEM, nr_none);
> >                 }
> > +               /* Join all the small entries into a single multi-index entry. */
> > +               xas_set_order(&xas, start, HPAGE_PMD_ORDER);
> > +               xas_store(&xas, hpage);
> > +               xas_unlock_irq(&xas);
> >
> >                 folio = page_folio(hpage);
> >                 folio_mark_uptodate(folio);
> > @@ -2203,8 +2216,6 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
> >                 unlock_page(hpage);
> >                 hpage = NULL;
> >         } else {
> > -               struct page *page;
> > -
> >                 /* Something went wrong: roll back page cache changes */
> >                 xas_lock_irq(&xas);
> >                 if (nr_none) {
> > @@ -2238,6 +2249,13 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
> >                         xas_lock_irq(&xas);
> >                 }
> >                 VM_BUG_ON(nr_none);
> > +               /*
> > +                * Undo the updates of filemap_nr_thps_inc for non-SHMEM file only.
> > +                * This undo is not needed unless failure is due to SCAN_COPY_MC.
> > +                */
> > +               if (!is_shmem && result == SCAN_COPY_MC)
> > +                       filemap_nr_thps_dec(mapping);
>
> We may need a memory barrier here. But missing the memory barrier is
> not a fatal issue either, the worst case is unnecessary truncate from
> open path if it sees obsolete nr_thps counter. And it may be better to
> handle it in a follow up patch by moving smp_mb() into
> filemap_nr_thp_xxx functions.

Ah, for the same reason as the comment above: "Paired with smp_mb() in
do_dentry_open() to ensure i_writecount is up to date and the update
to nr_thps is visible."?
In that case, let me add the smp_mb() like this:
    smp_mb();
    if (!is_shmem && result == SCAN_COPY_MC) {...}
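
i.e., in context, the tail of the rollback branch would look roughly like
this (sketch only; whether smp_mb() is the right barrier and exactly where
it should sit is still open):

                VM_BUG_ON(nr_none);
                /*
                 * Undo filemap_nr_thps_inc for non-SHMEM files; this is
                 * only needed when the failure came from the copy step.
                 */
                smp_mb();
                if (!is_shmem && result == SCAN_COPY_MC)
                        filemap_nr_thps_dec(mapping);

                xas_unlock_irq(&xas);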

>
> > +
> >                 xas_unlock_irq(&xas);
> >
> >                 hpage->mapping = NULL;
> > --
> > 2.40.0.rc0.216.gc4246ad0f0-goog
> >
Hugh Dickins March 25, 2023, 12:39 a.m. UTC | #3
On Fri, 24 Mar 2023, Jiaqi Yan wrote:
> On Fri, Mar 24, 2023 at 2:15 PM Yang Shi <shy828301@gmail.com> wrote:
> > On Sat, Mar 4, 2023 at 10:51 PM Jiaqi Yan <jiaqiyan@google.com> wrote:
> > >
> > > Make collapse_file roll back when copying pages failed. More concretely:
> > > - extract copying operations into a separate loop
> > > - postpone the updates for nr_none until both scanning and copying
> > >   succeeded
> > > - postpone joining small xarray entries until both scanning and copying
> > >   succeeded
> > > - postpone the update operations to NR_XXX_THPS until both scanning and
> > >   copying succeeded
> > > - for non-SHMEM file, roll back filemap_nr_thps_inc if scan succeeded but
> > >   copying failed
> > >
> > > Tested manually:
> > > 0. Enable khugepaged on system under test. Mount tmpfs at /mnt/ramdisk.
> > > 1. Start a two-thread application. Each thread allocates a chunk of
> > >    non-huge memory buffer from /mnt/ramdisk.
> > > 2. Pick 4 random buffer address (2 in each thread) and inject
> > >    uncorrectable memory errors at physical addresses.
> > > 3. Signal both threads to make their memory buffer collapsible, i.e.
> > >    calling madvise(MADV_HUGEPAGE).
> > > 4. Wait and then check kernel log: khugepaged is able to recover from
> > >    poisoned pages by skipping them.
> > > 5. Signal both threads to inspect their buffer contents and make sure no
> > >    data corruption.
> > >
> > > Signed-off-by: Jiaqi Yan <jiaqiyan@google.com>
> >
> > Reviewed-by: Yang Shi <shy828301@gmail.com>
> >
> > Just a nit below:

Acked-by: Hugh Dickins <hughd@google.com>

with a little nit from me below, if you are respinning:

> >
> > > ---
> > >  mm/khugepaged.c | 78 ++++++++++++++++++++++++++++++-------------------
> > >  1 file changed, 48 insertions(+), 30 deletions(-)
> > >
> > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > > index c3c217f6ebc6e..3ea2aa55c2c52 100644
> > > --- a/mm/khugepaged.c
> > > +++ b/mm/khugepaged.c
> > > @@ -1890,6 +1890,9 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
> > >  {
> > >         struct address_space *mapping = file->f_mapping;
> > >         struct page *hpage;
> > > +       struct page *page;
> > > +       struct page *tmp;
> > > +       struct folio *folio;
> > >         pgoff_t index = 0, end = start + HPAGE_PMD_NR;
> > >         LIST_HEAD(pagelist);
> > >         XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
> > > @@ -1934,8 +1937,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
> > >
> > >         xas_set(&xas, start);
> > >         for (index = start; index < end; index++) {
> > > -               struct page *page = xas_next(&xas);
> > > -               struct folio *folio;
> > > +               page = xas_next(&xas);
> > >
> > >                 VM_BUG_ON(index != xas.xa_index);
> > >                 if (is_shmem) {
> > > @@ -2117,10 +2119,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
> > >         }
> > >         nr = thp_nr_pages(hpage);
> > >
> > > -       if (is_shmem)
> > > -               __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr);
> > > -       else {
> > > -               __mod_lruvec_page_state(hpage, NR_FILE_THPS, nr);
> > > +       if (!is_shmem) {
> > >                 filemap_nr_thps_inc(mapping);
> > >                 /*
> > >                  * Paired with smp_mb() in do_dentry_open() to ensure

That "nr = thp_nr_pages(hpage);" above becomes stranded a long way away
from where "nr" is actually used for updating those statistics: please
move it down with them.  (I see "nr" is also reported in the tracepoint
at the end, FWIW, so maybe that will show "0" in more failure cases than
it used to, but that's okay - it has been decently initialized.)
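
Something along these lines, for example (a sketch of the suggested shape
only; exact placement is up to the author):

                xas_lock_irq(&xas);
                nr = thp_nr_pages(hpage);
                if (is_shmem)
                        __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr);
                else
                        __mod_lruvec_page_state(hpage, NR_FILE_THPS, nr);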

Thanks,
Hugh
Jiaqi Yan March 27, 2023, 9:15 p.m. UTC | #4
On Fri, Mar 24, 2023 at 5:39 PM Hugh Dickins <hughd@google.com> wrote:
>
> On Fri, 24 Mar 2023, Jiaqi Yan wrote:
> > On Fri, Mar 24, 2023 at 2:15 PM Yang Shi <shy828301@gmail.com> wrote:
> > > On Sat, Mar 4, 2023 at 10:51 PM Jiaqi Yan <jiaqiyan@google.com> wrote:
> > > >
> > > > Make collapse_file roll back when copying pages failed. More concretely:
> > > > - extract copying operations into a separate loop
> > > > - postpone the updates for nr_none until both scanning and copying
> > > >   succeeded
> > > > - postpone joining small xarray entries until both scanning and copying
> > > >   succeeded
> > > > - postpone the update operations to NR_XXX_THPS until both scanning and
> > > >   copying succeeded
> > > > - for non-SHMEM file, roll back filemap_nr_thps_inc if scan succeeded but
> > > >   copying failed
> > > >
> > > > Tested manually:
> > > > 0. Enable khugepaged on system under test. Mount tmpfs at /mnt/ramdisk.
> > > > 1. Start a two-thread application. Each thread allocates a chunk of
> > > >    non-huge memory buffer from /mnt/ramdisk.
> > > > 2. Pick 4 random buffer address (2 in each thread) and inject
> > > >    uncorrectable memory errors at physical addresses.
> > > > 3. Signal both threads to make their memory buffer collapsible, i.e.
> > > >    calling madvise(MADV_HUGEPAGE).
> > > > 4. Wait and then check kernel log: khugepaged is able to recover from
> > > >    poisoned pages by skipping them.
> > > > 5. Signal both threads to inspect their buffer contents and make sure no
> > > >    data corruption.
> > > >
> > > > Signed-off-by: Jiaqi Yan <jiaqiyan@google.com>
> > >
> > > Reviewed-by: Yang Shi <shy828301@gmail.com>
> > >
> > > Just a nit below:
>
> Acked-by: Hugh Dickins <hughd@google.com>
>
> with a little nit from me below, if you are respinning:
>
> > >
> > > > ---
> > > >  mm/khugepaged.c | 78 ++++++++++++++++++++++++++++++-------------------
> > > >  1 file changed, 48 insertions(+), 30 deletions(-)
> > > >
> > > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > > > index c3c217f6ebc6e..3ea2aa55c2c52 100644
> > > > --- a/mm/khugepaged.c
> > > > +++ b/mm/khugepaged.c
> > > > @@ -1890,6 +1890,9 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
> > > >  {
> > > >         struct address_space *mapping = file->f_mapping;
> > > >         struct page *hpage;
> > > > +       struct page *page;
> > > > +       struct page *tmp;
> > > > +       struct folio *folio;
> > > >         pgoff_t index = 0, end = start + HPAGE_PMD_NR;
> > > >         LIST_HEAD(pagelist);
> > > >         XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
> > > > @@ -1934,8 +1937,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
> > > >
> > > >         xas_set(&xas, start);
> > > >         for (index = start; index < end; index++) {
> > > > -               struct page *page = xas_next(&xas);
> > > > -               struct folio *folio;
> > > > +               page = xas_next(&xas);
> > > >
> > > >                 VM_BUG_ON(index != xas.xa_index);
> > > >                 if (is_shmem) {
> > > > @@ -2117,10 +2119,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
> > > >         }
> > > >         nr = thp_nr_pages(hpage);
> > > >
> > > > -       if (is_shmem)
> > > > -               __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr);
> > > > -       else {
> > > > -               __mod_lruvec_page_state(hpage, NR_FILE_THPS, nr);
> > > > +       if (!is_shmem) {
> > > >                 filemap_nr_thps_inc(mapping);
> > > >                 /*
> > > >                  * Paired with smp_mb() in do_dentry_open() to ensure
>
> That "nr = thp_nr_pages(hpage);" above becomes stranded a long way away
> from where "nr" is actually used for updating those statistics: please
> move it down with them.  (I see "nr" is also reported in the tracepoint
> at the end, FWIW, so maybe that will show "0" in more failure cases than
> it used to, but that's okay - it has been decently initialized.)
>

Thanks Hugh! I will make sure v11 moves "nr" closer to the place where it is used.

> Thanks,
> Hugh

Patch

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index c3c217f6ebc6e..3ea2aa55c2c52 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1890,6 +1890,9 @@  static int collapse_file(struct mm_struct *mm, unsigned long addr,
 {
 	struct address_space *mapping = file->f_mapping;
 	struct page *hpage;
+	struct page *page;
+	struct page *tmp;
+	struct folio *folio;
 	pgoff_t index = 0, end = start + HPAGE_PMD_NR;
 	LIST_HEAD(pagelist);
 	XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
@@ -1934,8 +1937,7 @@  static int collapse_file(struct mm_struct *mm, unsigned long addr,
 
 	xas_set(&xas, start);
 	for (index = start; index < end; index++) {
-		struct page *page = xas_next(&xas);
-		struct folio *folio;
+		page = xas_next(&xas);
 
 		VM_BUG_ON(index != xas.xa_index);
 		if (is_shmem) {
@@ -2117,10 +2119,7 @@  static int collapse_file(struct mm_struct *mm, unsigned long addr,
 	}
 	nr = thp_nr_pages(hpage);
 
-	if (is_shmem)
-		__mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr);
-	else {
-		__mod_lruvec_page_state(hpage, NR_FILE_THPS, nr);
+	if (!is_shmem) {
 		filemap_nr_thps_inc(mapping);
 		/*
 		 * Paired with smp_mb() in do_dentry_open() to ensure
@@ -2131,21 +2130,10 @@  static int collapse_file(struct mm_struct *mm, unsigned long addr,
 		smp_mb();
 		if (inode_is_open_for_write(mapping->host)) {
 			result = SCAN_FAIL;
-			__mod_lruvec_page_state(hpage, NR_FILE_THPS, -nr);
 			filemap_nr_thps_dec(mapping);
 			goto xa_locked;
 		}
 	}
-
-	if (nr_none) {
-		__mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none);
-		/* nr_none is always 0 for non-shmem. */
-		__mod_lruvec_page_state(hpage, NR_SHMEM, nr_none);
-	}
-
-	/* Join all the small entries into a single multi-index entry */
-	xas_set_order(&xas, start, HPAGE_PMD_ORDER);
-	xas_store(&xas, hpage);
 xa_locked:
 	xas_unlock_irq(&xas);
 xa_unlocked:
@@ -2158,21 +2146,35 @@  static int collapse_file(struct mm_struct *mm, unsigned long addr,
 	try_to_unmap_flush();
 
 	if (result == SCAN_SUCCEED) {
-		struct page *page, *tmp;
-		struct folio *folio;
-
 		/*
 		 * Replacing old pages with new one has succeeded, now we
-		 * need to copy the content and free the old pages.
+		 * attempt to copy the contents.
 		 */
 		index = start;
-		list_for_each_entry_safe(page, tmp, &pagelist, lru) {
+		list_for_each_entry(page, &pagelist, lru) {
 			while (index < page->index) {
 				clear_highpage(hpage + (index % HPAGE_PMD_NR));
 				index++;
 			}
-			copy_highpage(hpage + (page->index % HPAGE_PMD_NR),
-				      page);
+			if (copy_mc_highpage(hpage + (page->index % HPAGE_PMD_NR),
+					     page) > 0) {
+				result = SCAN_COPY_MC;
+				break;
+			}
+			index++;
+		}
+		while (result == SCAN_SUCCEED && index < end) {
+			clear_highpage(hpage + (index % HPAGE_PMD_NR));
+			index++;
+		}
+	}
+
+	if (result == SCAN_SUCCEED) {
+		/*
+		 * Copying old pages to huge one has succeeded, now we
+		 * need to free the old pages.
+		 */
+		list_for_each_entry_safe(page, tmp, &pagelist, lru) {
 			list_del(&page->lru);
 			page->mapping = NULL;
 			page_ref_unfreeze(page, 1);
@@ -2180,12 +2182,23 @@  static int collapse_file(struct mm_struct *mm, unsigned long addr,
 			ClearPageUnevictable(page);
 			unlock_page(page);
 			put_page(page);
-			index++;
 		}
-		while (index < end) {
-			clear_highpage(hpage + (index % HPAGE_PMD_NR));
-			index++;
+
+		xas_lock_irq(&xas);
+		if (is_shmem)
+			__mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr);
+		else
+			__mod_lruvec_page_state(hpage, NR_FILE_THPS, nr);
+
+		if (nr_none) {
+			__mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none);
+			/* nr_none is always 0 for non-shmem. */
+			__mod_lruvec_page_state(hpage, NR_SHMEM, nr_none);
 		}
+		/* Join all the small entries into a single multi-index entry. */
+		xas_set_order(&xas, start, HPAGE_PMD_ORDER);
+		xas_store(&xas, hpage);
+		xas_unlock_irq(&xas);
 
 		folio = page_folio(hpage);
 		folio_mark_uptodate(folio);
@@ -2203,8 +2216,6 @@  static int collapse_file(struct mm_struct *mm, unsigned long addr,
 		unlock_page(hpage);
 		hpage = NULL;
 	} else {
-		struct page *page;
-
 		/* Something went wrong: roll back page cache changes */
 		xas_lock_irq(&xas);
 		if (nr_none) {
@@ -2238,6 +2249,13 @@  static int collapse_file(struct mm_struct *mm, unsigned long addr,
 			xas_lock_irq(&xas);
 		}
 		VM_BUG_ON(nr_none);
+		/*
+		 * Undo the updates of filemap_nr_thps_inc for non-SHMEM file only.
+		 * This undo is not needed unless failure is due to SCAN_COPY_MC.
+		 */
+		if (!is_shmem && result == SCAN_COPY_MC)
+			filemap_nr_thps_dec(mapping);
+
 		xas_unlock_irq(&xas);
 
 		hpage->mapping = NULL;