Message ID | 20230327211548.462509-4-jiaqiyan@google.com (mailing list archive)
---|---
State | New
Series | Memory poison recovery in khugepaged collapsing
On Mon, Mar 27, 2023 at 2:16 PM Jiaqi Yan <jiaqiyan@google.com> wrote:
>
> Make collapse_file roll back when copying pages failed. More concretely:
> - extract copying operations into a separate loop
> - postpone the updates for nr_none until both scanning and copying
> succeeded
> - postpone joining small xarray entries until both scanning and copying
> succeeded
> - postpone the update operations to NR_XXX_THPS until both scanning and
> copying succeeded
> - for non-SHMEM file, roll back filemap_nr_thps_inc if scan succeeded but
> copying failed
>
> Tested manually:
> 0. Enable khugepaged on system under test. Mount tmpfs at /mnt/ramdisk.
> 1. Start a two-thread application. Each thread allocates a chunk of
> non-huge memory buffer from /mnt/ramdisk.
> 2. Pick 4 random buffer address (2 in each thread) and inject
> uncorrectable memory errors at physical addresses.
> 3. Signal both threads to make their memory buffer collapsible, i.e.
> calling madvise(MADV_HUGEPAGE).
> 4. Wait and then check kernel log: khugepaged is able to recover from
> poisoned pages by skipping them.
> 5. Signal both threads to inspect their buffer contents and make sure no
> data corruption.
>
> Signed-off-by: Jiaqi Yan <jiaqiyan@google.com>

Reviewed-by: Yang Shi <shy828301@gmail.com>

A nit below:

> ---
> mm/khugepaged.c | 86 +++++++++++++++++++++++++++++++------------------
> 1 file changed, 54 insertions(+), 32 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index bef68286345c8..38c1655ce0a9e 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1874,6 +1874,9 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
> {
> struct address_space *mapping = file->f_mapping;
> struct page *hpage;
> + struct page *page;
> + struct page *tmp;
> + struct folio *folio;
> pgoff_t index = 0, end = start + HPAGE_PMD_NR;
> LIST_HEAD(pagelist);
> XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
> @@ -1918,8 +1921,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
>
> xas_set(&xas, start);
> for (index = start; index < end; index++) {
> - struct page *page = xas_next(&xas);
> - struct folio *folio;
> + page = xas_next(&xas);
>
> VM_BUG_ON(index != xas.xa_index);
> if (is_shmem) {
> @@ -2099,12 +2101,8 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
> put_page(page);
> goto xa_unlocked;
> }
> - nr = thp_nr_pages(hpage);
>
> - if (is_shmem)
> - __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr);
> - else {
> - __mod_lruvec_page_state(hpage, NR_FILE_THPS, nr);
> + if (!is_shmem) {
> filemap_nr_thps_inc(mapping);
> /*
> * Paired with smp_mb() in do_dentry_open() to ensure
> @@ -2115,21 +2113,9 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
> smp_mb();
> if (inode_is_open_for_write(mapping->host)) {
> result = SCAN_FAIL;
> - __mod_lruvec_page_state(hpage, NR_FILE_THPS, -nr);
> filemap_nr_thps_dec(mapping);
> - goto xa_locked;
> }
> }
> -
> - if (nr_none) {
> - __mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none);
> - /* nr_none is always 0 for non-shmem. */
> - __mod_lruvec_page_state(hpage, NR_SHMEM, nr_none);
> - }
> -
> - /* Join all the small entries into a single multi-index entry */
> - xas_set_order(&xas, start, HPAGE_PMD_ORDER);
> - xas_store(&xas, hpage);
> xa_locked:
> xas_unlock_irq(&xas);
> xa_unlocked:
> @@ -2142,21 +2128,36 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
> try_to_unmap_flush();
>
> if (result == SCAN_SUCCEED) {
> - struct page *page, *tmp;
> - struct folio *folio;
> -
> /*
> * Replacing old pages with new one has succeeded, now we
> - * need to copy the content and free the old pages.
> + * attempt to copy the contents.
> */
> index = start;
> - list_for_each_entry_safe(page, tmp, &pagelist, lru) {
> + list_for_each_entry(page, &pagelist, lru) {
> while (index < page->index) {
> clear_highpage(hpage + (index % HPAGE_PMD_NR));
> index++;
> }
> - copy_highpage(hpage + (page->index % HPAGE_PMD_NR),
> - page);
> + if (copy_mc_highpage(hpage + (page->index % HPAGE_PMD_NR),
> + page) > 0) {
> + result = SCAN_COPY_MC;
> + break;
> + }
> + index++;
> + }
> + while (result == SCAN_SUCCEED && index < end) {
> + clear_highpage(hpage + (index % HPAGE_PMD_NR));
> + index++;
> + }
> + }
> +
> + nr = thp_nr_pages(hpage);
> + if (result == SCAN_SUCCEED) {
> + /*
> + * Copying old pages to huge one has succeeded, now we
> + * need to free the old pages.
> + */
> + list_for_each_entry_safe(page, tmp, &pagelist, lru) {
> list_del(&page->lru);
> page->mapping = NULL;
> page_ref_unfreeze(page, 1);
> @@ -2164,12 +2165,23 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
> ClearPageUnevictable(page);
> unlock_page(page);
> put_page(page);
> - index++;
> }
> - while (index < end) {
> - clear_highpage(hpage + (index % HPAGE_PMD_NR));
> - index++;
> +
> + xas_lock_irq(&xas);
> + if (is_shmem)
> + __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr);
> + else
> + __mod_lruvec_page_state(hpage, NR_FILE_THPS, nr);
> +
> + if (nr_none) {
> + __mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none);
> + /* nr_none is always 0 for non-shmem. */
> + __mod_lruvec_page_state(hpage, NR_SHMEM, nr_none);
> }
> + /* Join all the small entries into a single multi-index entry. */
> + xas_set_order(&xas, start, HPAGE_PMD_ORDER);
> + xas_store(&xas, hpage);
> + xas_unlock_irq(&xas);
>
> folio = page_folio(hpage);
> folio_mark_uptodate(folio);
> @@ -2187,8 +2199,6 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
> unlock_page(hpage);
> hpage = NULL;
> } else {
> - struct page *page;
> -
> /* Something went wrong: roll back page cache changes */
> xas_lock_irq(&xas);
> if (nr_none) {
> @@ -2222,6 +2232,18 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
> xas_lock_irq(&xas);
> }
> VM_BUG_ON(nr_none);
> + /*
> + * Undo the updates of filemap_nr_thps_inc for non-SHMEM
> + * file only. This undo is not needed unless failure is
> + * due to SCAN_COPY_MC.
> + *
> + * Paired with smp_mb() in do_dentry_open() to ensure the
> + * update to nr_thps is visible.
> + */
> + smp_mb();
> + if (!is_shmem && result == SCAN_COPY_MC)
> + filemap_nr_thps_dec(mapping);

I think the memory barrier should be after the dec.

> +
> xas_unlock_irq(&xas);
>
> hpage->mapping = NULL;
> --
> 2.40.0.348.gf938b09366-goog
>
On Tue, Mar 28, 2023 at 9:02 AM Yang Shi <shy828301@gmail.com> wrote:
>
> On Mon, Mar 27, 2023 at 2:16 PM Jiaqi Yan <jiaqiyan@google.com> wrote:
> >
> > Make collapse_file roll back when copying pages failed. More concretely:
> > [...]
> >
> > Signed-off-by: Jiaqi Yan <jiaqiyan@google.com>
>
> Reviewed-by: Yang Shi <shy828301@gmail.com>
>
> A nit below:
>
> > [...]
> > @@ -2222,6 +2232,18 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
> > xas_lock_irq(&xas);
> > }
> > VM_BUG_ON(nr_none);
> > + /*
> > + * Undo the updates of filemap_nr_thps_inc for non-SHMEM
> > + * file only. This undo is not needed unless failure is
> > + * due to SCAN_COPY_MC.
> > + *
> > + * Paired with smp_mb() in do_dentry_open() to ensure the
> > + * update to nr_thps is visible.
> > + */
> > + smp_mb();
> > + if (!is_shmem && result == SCAN_COPY_MC)
> > + filemap_nr_thps_dec(mapping);
>
> I think the memory barrier should be after the dec.

Ah, will move into the if block and put after filemap_nr_thps_dec.

> > +
> > xas_unlock_irq(&xas);
> >
> > hpage->mapping = NULL;
> > --
> > 2.40.0.348.gf938b09366-goog
> >
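Based on this exchange, the tail of the rollback path would presumably end up with filemap_nr_thps_dec() first and the barrier after it, both inside the conditional. The snippet below is only a sketch of that anticipated follow-up, not the posted revision itself:

                /*
                 * Undo the update of filemap_nr_thps_inc for non-SHMEM
                 * file only, needed when the failure was due to
                 * SCAN_COPY_MC.
                 */
                if (!is_shmem && result == SCAN_COPY_MC) {
                        filemap_nr_thps_dec(mapping);
                        /*
                         * Paired with smp_mb() in do_dentry_open() to
                         * ensure the decremented nr_thps is visible.
                         */
                        smp_mb();
                }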
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index bef68286345c8..38c1655ce0a9e 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1874,6 +1874,9 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
 {
         struct address_space *mapping = file->f_mapping;
         struct page *hpage;
+        struct page *page;
+        struct page *tmp;
+        struct folio *folio;
         pgoff_t index = 0, end = start + HPAGE_PMD_NR;
         LIST_HEAD(pagelist);
         XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
@@ -1918,8 +1921,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,

         xas_set(&xas, start);
         for (index = start; index < end; index++) {
-                struct page *page = xas_next(&xas);
-                struct folio *folio;
+                page = xas_next(&xas);

                 VM_BUG_ON(index != xas.xa_index);
                 if (is_shmem) {
@@ -2099,12 +2101,8 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
                 put_page(page);
                 goto xa_unlocked;
         }
-        nr = thp_nr_pages(hpage);

-        if (is_shmem)
-                __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr);
-        else {
-                __mod_lruvec_page_state(hpage, NR_FILE_THPS, nr);
+        if (!is_shmem) {
                 filemap_nr_thps_inc(mapping);
                 /*
                  * Paired with smp_mb() in do_dentry_open() to ensure
@@ -2115,21 +2113,9 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
                 smp_mb();
                 if (inode_is_open_for_write(mapping->host)) {
                         result = SCAN_FAIL;
-                        __mod_lruvec_page_state(hpage, NR_FILE_THPS, -nr);
                         filemap_nr_thps_dec(mapping);
-                        goto xa_locked;
                 }
         }
-
-        if (nr_none) {
-                __mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none);
-                /* nr_none is always 0 for non-shmem. */
-                __mod_lruvec_page_state(hpage, NR_SHMEM, nr_none);
-        }
-
-        /* Join all the small entries into a single multi-index entry */
-        xas_set_order(&xas, start, HPAGE_PMD_ORDER);
-        xas_store(&xas, hpage);
 xa_locked:
         xas_unlock_irq(&xas);
 xa_unlocked:
@@ -2142,21 +2128,36 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
         try_to_unmap_flush();

         if (result == SCAN_SUCCEED) {
-                struct page *page, *tmp;
-                struct folio *folio;
-
                 /*
                  * Replacing old pages with new one has succeeded, now we
-                 * need to copy the content and free the old pages.
+                 * attempt to copy the contents.
                  */
                 index = start;
-                list_for_each_entry_safe(page, tmp, &pagelist, lru) {
+                list_for_each_entry(page, &pagelist, lru) {
                         while (index < page->index) {
                                 clear_highpage(hpage + (index % HPAGE_PMD_NR));
                                 index++;
                         }
-                        copy_highpage(hpage + (page->index % HPAGE_PMD_NR),
-                                      page);
+                        if (copy_mc_highpage(hpage + (page->index % HPAGE_PMD_NR),
+                                             page) > 0) {
+                                result = SCAN_COPY_MC;
+                                break;
+                        }
+                        index++;
+                }
+                while (result == SCAN_SUCCEED && index < end) {
+                        clear_highpage(hpage + (index % HPAGE_PMD_NR));
+                        index++;
+                }
+        }
+
+        nr = thp_nr_pages(hpage);
+        if (result == SCAN_SUCCEED) {
+                /*
+                 * Copying old pages to huge one has succeeded, now we
+                 * need to free the old pages.
+                 */
+                list_for_each_entry_safe(page, tmp, &pagelist, lru) {
                         list_del(&page->lru);
                         page->mapping = NULL;
                         page_ref_unfreeze(page, 1);
@@ -2164,12 +2165,23 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
                         ClearPageUnevictable(page);
                         unlock_page(page);
                         put_page(page);
-                        index++;
                 }
-                while (index < end) {
-                        clear_highpage(hpage + (index % HPAGE_PMD_NR));
-                        index++;
+
+                xas_lock_irq(&xas);
+                if (is_shmem)
+                        __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr);
+                else
+                        __mod_lruvec_page_state(hpage, NR_FILE_THPS, nr);
+
+                if (nr_none) {
+                        __mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none);
+                        /* nr_none is always 0 for non-shmem. */
+                        __mod_lruvec_page_state(hpage, NR_SHMEM, nr_none);
                 }
+                /* Join all the small entries into a single multi-index entry. */
+                xas_set_order(&xas, start, HPAGE_PMD_ORDER);
+                xas_store(&xas, hpage);
+                xas_unlock_irq(&xas);

                 folio = page_folio(hpage);
                 folio_mark_uptodate(folio);
@@ -2187,8 +2199,6 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
                 unlock_page(hpage);
                 hpage = NULL;
         } else {
-                struct page *page;
-
                 /* Something went wrong: roll back page cache changes */
                 xas_lock_irq(&xas);
                 if (nr_none) {
@@ -2222,6 +2232,18 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
                         xas_lock_irq(&xas);
                 }
                 VM_BUG_ON(nr_none);
+                /*
+                 * Undo the updates of filemap_nr_thps_inc for non-SHMEM
+                 * file only. This undo is not needed unless failure is
+                 * due to SCAN_COPY_MC.
+                 *
+                 * Paired with smp_mb() in do_dentry_open() to ensure the
+                 * update to nr_thps is visible.
+                 */
+                smp_mb();
+                if (!is_shmem && result == SCAN_COPY_MC)
+                        filemap_nr_thps_dec(mapping);
+
                 xas_unlock_irq(&xas);

                 hpage->mapping = NULL;
Make collapse_file roll back when copying pages fails. More concretely:
- extract copying operations into a separate loop
- postpone the updates for nr_none until both scanning and copying
  succeeded
- postpone joining small xarray entries until both scanning and copying
  succeeded
- postpone the update operations to NR_XXX_THPS until both scanning and
  copying succeeded
- for non-SHMEM file, roll back filemap_nr_thps_inc if scanning succeeded
  but copying failed

Tested manually:
0. Enable khugepaged on the system under test. Mount tmpfs at /mnt/ramdisk.
1. Start a two-thread application. Each thread allocates a chunk of
   non-huge memory buffer from /mnt/ramdisk.
2. Pick 4 random buffer addresses (2 in each thread) and inject
   uncorrectable memory errors at their physical addresses.
3. Signal both threads to make their memory buffers collapsible, i.e. by
   calling madvise(MADV_HUGEPAGE).
4. Wait and then check the kernel log: khugepaged is able to recover from
   poisoned pages by skipping them.
5. Signal both threads to inspect their buffer contents and make sure there
   is no data corruption.

Signed-off-by: Jiaqi Yan <jiaqiyan@google.com>
---
 mm/khugepaged.c | 86 +++++++++++++++++++++++++++++++------------------
 1 file changed, 54 insertions(+), 32 deletions(-)
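For readers who want to reproduce something along the lines of the manual test above, the sketch below outlines a minimal two-thread harness for steps 0-5. The paths, buffer size, and the use of sleeps in place of signals are illustrative assumptions; the injection of uncorrectable errors at physical addresses (step 2) is done externally through a platform error-injection facility and is not shown.

/*
 * Illustrative two-thread harness for the manual test above (steps 0-5).
 * Buffer size, paths, and the use of sleeps instead of signals are
 * simplifications; injecting the uncorrectable errors (step 2) happens
 * externally while this program waits, and is not shown here.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define BUF_SZ (16UL << 20)     /* 16MB file-backed buffer per thread */

static void *worker(void *arg)
{
        char path[64];
        char *buf;
        size_t i;
        int fd;

        /* Step 1: allocate a non-huge, file-backed buffer on tmpfs. */
        snprintf(path, sizeof(path), "/mnt/ramdisk/buf-%ld", (long)arg);
        fd = open(path, O_RDWR | O_CREAT, 0600);
        if (fd < 0 || ftruncate(fd, BUF_SZ))
                exit(1);
        buf = mmap(NULL, BUF_SZ, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (buf == MAP_FAILED)
                exit(1);
        memset(buf, 0x5a, BUF_SZ);      /* populate the page cache */

        /* Step 2: operator injects UEs into two random pages meanwhile. */
        sleep(30);

        /* Step 3: make the buffer collapsible and give khugepaged time. */
        madvise(buf, BUF_SZ, MADV_HUGEPAGE);
        sleep(60);      /* step 4: check dmesg for khugepaged's recovery */

        /*
         * Step 5: verify buffer contents. Pages that were genuinely
         * poisoned still deliver SIGBUS on access, so a complete test
         * would skip the injected offsets; this sketch checks everything.
         */
        for (i = 0; i < BUF_SZ; i++) {
                if (buf[i] != 0x5a) {
                        fprintf(stderr, "corruption at offset %zu\n", i);
                        exit(1);
                }
        }
        munmap(buf, BUF_SZ);
        close(fd);
        return NULL;
}

int main(void)
{
        pthread_t t[2];
        long i;

        for (i = 0; i < 2; i++)
                pthread_create(&t[i], NULL, worker, (void *)i);
        for (i = 0; i < 2; i++)
                pthread_join(t[i], NULL);
        return 0;
}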