Message ID | 20240710192250.4114783-15-Liam.Howlett@oracle.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | Avoid MAP_FIXED gap exposure | expand |
On Wed, Jul 10, 2024 at 03:22:43PM GMT, Liam R. Howlett wrote: > From: "Liam R. Howlett" <Liam.Howlett@Oracle.com> > > Instead of zeroing the vma tree and then overwriting the area, let the > area be overwritten and then clean up the gathered vmas using > vms_complete_munmap_vmas(). > > If a driver is mapping over an existing vma, then clear the ptes before > the call_mmap() invocation. This is done using the vms_clear_ptes() > helper. > > Temporarily keep track of the number of pages that will be removed and > reduce the charged amount. > > This also drops the validate_mm() call in the vma_expand() function. > It is necessary to drop the validate as it would fail since the mm > map_count would be incorrect during a vma expansion, prior to the > cleanup from vms_complete_munmap_vmas(). > > Clean up the error handing of the vms_gather_munmap_vmas() by calling > the verification within the function. > > Signed-off-by: Liam R. Howlett <Liam.Howlett@Oracle.com> > --- > mm/internal.h | 1 + > mm/mmap.c | 80 +++++++++++++++++++++++++++------------------------ > 2 files changed, 44 insertions(+), 37 deletions(-) > > diff --git a/mm/internal.h b/mm/internal.h > index 11e90c6e5a3e..dd4eede1be0f 100644 > --- a/mm/internal.h > +++ b/mm/internal.h > @@ -1503,6 +1503,7 @@ struct vma_munmap_struct { > unsigned long stack_vm; > unsigned long data_vm; > bool unlock; /* Unlock after the munmap */ > + bool clear_ptes; /* If there are outstanding PTE to be cleared */ > }; > > void __meminit __init_single_page(struct page *page, unsigned long pfn, > diff --git a/mm/mmap.c b/mm/mmap.c > index 870c2d04ad6b..58cf42e22bfe 100644 > --- a/mm/mmap.c > +++ b/mm/mmap.c > @@ -401,17 +401,21 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) > } > > static unsigned long count_vma_pages_range(struct mm_struct *mm, > - unsigned long addr, unsigned long end) > + unsigned long addr, unsigned long end, > + unsigned long *nr_accounted) > { > VMA_ITERATOR(vmi, mm, addr); > struct 
vm_area_struct *vma; > unsigned long nr_pages = 0; > > + *nr_accounted = 0; > for_each_vma_range(vmi, vma, end) { > unsigned long vm_start = max(addr, vma->vm_start); > unsigned long vm_end = min(end, vma->vm_end); > > nr_pages += PHYS_PFN(vm_end - vm_start); > + if (vma->vm_flags & VM_ACCOUNT) > + *nr_accounted += PHYS_PFN(vm_end - vm_start); > } > > return nr_pages; > @@ -524,6 +528,7 @@ static inline void init_vma_munmap(struct vma_munmap_struct *vms, > vms->exec_vm = vms->stack_vm = vms->data_vm = 0; > vms->unmap_start = FIRST_USER_ADDRESS; > vms->unmap_end = USER_PGTABLES_CEILING; > + vms->clear_ptes = false; /* No PTEs to clear yet */ > } > > /* > @@ -732,7 +737,6 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, > vma_iter_store(vmi, vma); > > vma_complete(&vp, vmi, vma->vm_mm); > - validate_mm(vma->vm_mm); > return 0; > > nomem: > @@ -2606,11 +2610,14 @@ static inline void abort_munmap_vmas(struct ma_state *mas_detach) > } > > > -static void vms_complete_pte_clear(struct vma_munmap_struct *vms, > +static inline void vms_clear_ptes(struct vma_munmap_struct *vms, > struct ma_state *mas_detach, bool mm_wr_locked) > { > struct mmu_gather tlb; > > + if (!vms->clear_ptes) /* Nothing to do */ > + return; > + > /* > * We can free page tables without write-locking mmap_lock because VMAs > * were isolated before we downgraded mmap_lock. > @@ -2624,6 +2631,7 @@ static void vms_complete_pte_clear(struct vma_munmap_struct *vms, > /* start and end may be different if there is no prev or next vma. 
*/ > free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start, vms->unmap_end, mm_wr_locked); > tlb_finish_mmu(&tlb); > + vms->clear_ptes = false; > } > > /* > @@ -2647,7 +2655,7 @@ static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms, > if (vms->unlock) > mmap_write_downgrade(mm); > > - vms_complete_pte_clear(vms, mas_detach, !vms->unlock); > + vms_clear_ptes(vms, mas_detach, !vms->unlock); > /* Update high watermark before we lower total_vm */ > update_hiwater_vm(mm); > /* Stat accounting */ > @@ -2799,6 +2807,9 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, > while (vma_iter_addr(vms->vmi) > vms->start) > vma_iter_prev_range(vms->vmi); > > + /* There are now PTEs that need to be cleared */ > + vms->clear_ptes = true; > + > return 0; > > userfaultfd_error: > @@ -2807,6 +2818,7 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, > abort_munmap_vmas(mas_detach); > start_split_failed: > map_count_exceeded: > + validate_mm(vms->mm); I'm guessing here we know it's safe to validate? > return error; > } > > @@ -2851,8 +2863,8 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, > > clear_tree_failed: > abort_munmap_vmas(&mas_detach); > -gather_failed: > validate_mm(mm); Additionally I imagine the gathering failing results in the tree being unable to be validated? > +gather_failed: > return error; > } > > @@ -2940,24 +2952,19 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > unsigned long merge_start = addr, merge_end = end; > bool writable_file_mapping = false; > pgoff_t vm_pgoff; > - int error; > + int error = -ENOMEM; > VMA_ITERATOR(vmi, mm, addr); > + unsigned long nr_pages, nr_accounted; > > - /* Check against address space limit. */ > - if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { > - unsigned long nr_pages; > - > - /* > - * MAP_FIXED may remove pages of mappings that intersects with > - * requested mapping. Account for the pages it would unmap. 
> - */ > - nr_pages = count_vma_pages_range(mm, addr, end); > - > - if (!may_expand_vm(mm, vm_flags, > - (len >> PAGE_SHIFT) - nr_pages)) > - return -ENOMEM; > - } > + nr_pages = count_vma_pages_range(mm, addr, end, &nr_accounted); > > + /* > + * Check against address space limit. > + * MAP_FIXED may remove pages of mappings that intersects with requested > + * mapping. Account for the pages it would unmap. > + */ > + if (!may_expand_vm(mm, vm_flags, (len >> PAGE_SHIFT) - nr_pages)) > + return -ENOMEM; > > if (unlikely(!can_modify_mm(mm, addr, end))) > return -EPERM; > @@ -2974,18 +2981,13 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > mas_init(&mas_detach, &mt_detach, /* addr = */ 0); > /* Prepare to unmap any existing mapping in the area */ > if (vms_gather_munmap_vmas(&vms, &mas_detach)) > - goto gather_failed; > - > - /* Remove any existing mappings from the vma tree */ > - if (vma_iter_clear_gfp(&vmi, addr, end, GFP_KERNEL)) > - goto clear_tree_failed; > + return -ENOMEM; > > - /* Unmap any existing mapping in the area */ > - vms_complete_munmap_vmas(&vms, &mas_detach); > next = vms.next; > prev = vms.prev; > vma = NULL; > } else { > + /* Minimal setup of vms */ Nit, but is this valid now we use the init function unconditionally? > next = vma_next(&vmi); > prev = vma_prev(&vmi); > if (prev) > @@ -2997,8 +2999,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > */ > if (accountable_mapping(file, vm_flags)) { > charged = len >> PAGE_SHIFT; > + charged -= nr_accounted; > if (security_vm_enough_memory_mm(mm, charged)) > - return -ENOMEM; > + goto abort_munmap; > + vms.nr_accounted = 0; > vm_flags |= VM_ACCOUNT; > } > > @@ -3047,10 +3051,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > * not unmapped, but the maps are removed from the list. 
> */ > vma = vm_area_alloc(mm); > - if (!vma) { > - error = -ENOMEM; > + if (!vma) > goto unacct_error; > - } > > vma_iter_config(&vmi, addr, end); > vma_set_range(vma, addr, end, pgoff); > @@ -3059,6 +3061,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > > if (file) { > vma->vm_file = get_file(file); > + /* call_mmap() may map PTE, so ensure there are no existing PTEs */ > + vms_clear_ptes(&vms, &mas_detach, true); > error = call_mmap(file, vma); > if (error) > goto unmap_and_free_vma; > @@ -3149,6 +3153,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > expanded: > perf_event_mmap(vma); > > + /* Unmap any existing mapping in the area */ > + if (vms.nr_pages) > + vms_complete_munmap_vmas(&vms, &mas_detach); > + > vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT); > if (vm_flags & VM_LOCKED) { > if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || > @@ -3196,14 +3204,12 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > unacct_error: > if (charged) > vm_unacct_memory(charged); > - validate_mm(mm); > - return error; > > -clear_tree_failed: > - abort_munmap_vmas(&mas_detach); > -gather_failed: > +abort_munmap: > + if (vms.nr_pages) > + abort_munmap_vmas(&mas_detach); > validate_mm(mm); > - return -ENOMEM; > + return error; > } > > static int __vm_munmap(unsigned long start, size_t len, bool unlock) > -- > 2.43.0 > Other than nits/queries, LGTM: Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
* Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [240711 11:25]: > On Wed, Jul 10, 2024 at 03:22:43PM GMT, Liam R. Howlett wrote: > > From: "Liam R. Howlett" <Liam.Howlett@Oracle.com> > > > > Instead of zeroing the vma tree and then overwriting the area, let the > > area be overwritten and then clean up the gathered vmas using > > vms_complete_munmap_vmas(). > > > > If a driver is mapping over an existing vma, then clear the ptes before > > the call_mmap() invocation. This is done using the vms_clear_ptes() > > helper. > > > > Temporarily keep track of the number of pages that will be removed and > > reduce the charged amount. > > > > This also drops the validate_mm() call in the vma_expand() function. > > It is necessary to drop the validate as it would fail since the mm > > map_count would be incorrect during a vma expansion, prior to the > > cleanup from vms_complete_munmap_vmas(). > > > > Clean up the error handling of the vms_gather_munmap_vmas() by calling > > the verification within the function. > > > > Signed-off-by: Liam R. 
Howlett <Liam.Howlett@Oracle.com> > > --- > > mm/internal.h | 1 + > > mm/mmap.c | 80 +++++++++++++++++++++++++++------------------------ > > 2 files changed, 44 insertions(+), 37 deletions(-) > > > > diff --git a/mm/internal.h b/mm/internal.h > > index 11e90c6e5a3e..dd4eede1be0f 100644 > > --- a/mm/internal.h > > +++ b/mm/internal.h > > @@ -1503,6 +1503,7 @@ struct vma_munmap_struct { > > unsigned long stack_vm; > > unsigned long data_vm; > > bool unlock; /* Unlock after the munmap */ > > + bool clear_ptes; /* If there are outstanding PTE to be cleared */ > > }; > > > > void __meminit __init_single_page(struct page *page, unsigned long pfn, > > diff --git a/mm/mmap.c b/mm/mmap.c > > index 870c2d04ad6b..58cf42e22bfe 100644 > > --- a/mm/mmap.c > > +++ b/mm/mmap.c > > @@ -401,17 +401,21 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) > > } > > > > static unsigned long count_vma_pages_range(struct mm_struct *mm, > > - unsigned long addr, unsigned long end) > > + unsigned long addr, unsigned long end, > > + unsigned long *nr_accounted) > > { > > VMA_ITERATOR(vmi, mm, addr); > > struct vm_area_struct *vma; > > unsigned long nr_pages = 0; > > > > + *nr_accounted = 0; > > for_each_vma_range(vmi, vma, end) { > > unsigned long vm_start = max(addr, vma->vm_start); > > unsigned long vm_end = min(end, vma->vm_end); > > > > nr_pages += PHYS_PFN(vm_end - vm_start); > > + if (vma->vm_flags & VM_ACCOUNT) > > + *nr_accounted += PHYS_PFN(vm_end - vm_start); > > } > > > > return nr_pages; > > @@ -524,6 +528,7 @@ static inline void init_vma_munmap(struct vma_munmap_struct *vms, > > vms->exec_vm = vms->stack_vm = vms->data_vm = 0; > > vms->unmap_start = FIRST_USER_ADDRESS; > > vms->unmap_end = USER_PGTABLES_CEILING; > > + vms->clear_ptes = false; /* No PTEs to clear yet */ > > } > > > > /* > > @@ -732,7 +737,6 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, > > vma_iter_store(vmi, vma); > > > > vma_complete(&vp, vmi, vma->vm_mm); > > - 
validate_mm(vma->vm_mm); > > return 0; > > > > nomem: > > @@ -2606,11 +2610,14 @@ static inline void abort_munmap_vmas(struct ma_state *mas_detach) > > } > > > > > > -static void vms_complete_pte_clear(struct vma_munmap_struct *vms, > > +static inline void vms_clear_ptes(struct vma_munmap_struct *vms, > > struct ma_state *mas_detach, bool mm_wr_locked) > > { > > struct mmu_gather tlb; > > > > + if (!vms->clear_ptes) /* Nothing to do */ > > + return; > > + > > /* > > * We can free page tables without write-locking mmap_lock because VMAs > > * were isolated before we downgraded mmap_lock. > > @@ -2624,6 +2631,7 @@ static void vms_complete_pte_clear(struct vma_munmap_struct *vms, > > /* start and end may be different if there is no prev or next vma. */ > > free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start, vms->unmap_end, mm_wr_locked); > > tlb_finish_mmu(&tlb); > > + vms->clear_ptes = false; > > } > > > > /* > > @@ -2647,7 +2655,7 @@ static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms, > > if (vms->unlock) > > mmap_write_downgrade(mm); > > > > - vms_complete_pte_clear(vms, mas_detach, !vms->unlock); > > + vms_clear_ptes(vms, mas_detach, !vms->unlock); > > /* Update high watermark before we lower total_vm */ > > update_hiwater_vm(mm); > > /* Stat accounting */ > > @@ -2799,6 +2807,9 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, > > while (vma_iter_addr(vms->vmi) > vms->start) > > vma_iter_prev_range(vms->vmi); > > > > + /* There are now PTEs that need to be cleared */ > > + vms->clear_ptes = true; > > + > > return 0; > > > > userfaultfd_error: > > @@ -2807,6 +2818,7 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, > > abort_munmap_vmas(mas_detach); > > start_split_failed: > > map_count_exceeded: > > + validate_mm(vms->mm); > > I'm guessing here we know it's safe to validate? verification in the gather state is always safe - we haven't changed the tree or a vma yet. 
> > > return error; > > } > > > > @@ -2851,8 +2863,8 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, > > > > clear_tree_failed: > > abort_munmap_vmas(&mas_detach); > > -gather_failed: > > validate_mm(mm); > > Additionally I imagine the gathering failing results in the tree being unable to > be validated? It is safe, but if it's here then it doesn't need to be above > > > +gather_failed: > > return error; > > } > > > > @@ -2940,24 +2952,19 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > > unsigned long merge_start = addr, merge_end = end; > > bool writable_file_mapping = false; > > pgoff_t vm_pgoff; > > - int error; > > + int error = -ENOMEM; > > VMA_ITERATOR(vmi, mm, addr); > > + unsigned long nr_pages, nr_accounted; > > > > - /* Check against address space limit. */ > > - if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { > > - unsigned long nr_pages; > > - > > - /* > > - * MAP_FIXED may remove pages of mappings that intersects with > > - * requested mapping. Account for the pages it would unmap. > > - */ > > - nr_pages = count_vma_pages_range(mm, addr, end); > > - > > - if (!may_expand_vm(mm, vm_flags, > > - (len >> PAGE_SHIFT) - nr_pages)) > > - return -ENOMEM; > > - } > > + nr_pages = count_vma_pages_range(mm, addr, end, &nr_accounted); > > > > + /* > > + * Check against address space limit. > > + * MAP_FIXED may remove pages of mappings that intersects with requested > > + * mapping. Account for the pages it would unmap. 
> > + */ > > + if (!may_expand_vm(mm, vm_flags, (len >> PAGE_SHIFT) - nr_pages)) > > + return -ENOMEM; > > > > if (unlikely(!can_modify_mm(mm, addr, end))) > > return -EPERM; > > @@ -2974,18 +2981,13 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > > mas_init(&mas_detach, &mt_detach, /* addr = */ 0); > > /* Prepare to unmap any existing mapping in the area */ > > if (vms_gather_munmap_vmas(&vms, &mas_detach)) > > - goto gather_failed; > > - > > - /* Remove any existing mappings from the vma tree */ > > - if (vma_iter_clear_gfp(&vmi, addr, end, GFP_KERNEL)) > > - goto clear_tree_failed; > > + return -ENOMEM; > > > > - /* Unmap any existing mapping in the area */ > > - vms_complete_munmap_vmas(&vms, &mas_detach); > > next = vms.next; > > prev = vms.prev; > > vma = NULL; > > } else { > > + /* Minimal setup of vms */ > > Nit, but is this valid now we use the init function unconditionally? Yes, that needs to be dropped, thanks. > > > next = vma_next(&vmi); > > prev = vma_prev(&vmi); > > if (prev) > > @@ -2997,8 +2999,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > > */ > > if (accountable_mapping(file, vm_flags)) { > > charged = len >> PAGE_SHIFT; > > + charged -= nr_accounted; > > if (security_vm_enough_memory_mm(mm, charged)) > > - return -ENOMEM; > > + goto abort_munmap; > > + vms.nr_accounted = 0; > > vm_flags |= VM_ACCOUNT; > > } > > > > @@ -3047,10 +3051,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > > * not unmapped, but the maps are removed from the list. 
> > */ > > vma = vm_area_alloc(mm); > > - if (!vma) { > > - error = -ENOMEM; > > + if (!vma) > > goto unacct_error; > > - } > > > > vma_iter_config(&vmi, addr, end); > > vma_set_range(vma, addr, end, pgoff); > > @@ -3059,6 +3061,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > > > > if (file) { > > vma->vm_file = get_file(file); > > + /* call_mmap() may map PTE, so ensure there are no existing PTEs */ > > + vms_clear_ptes(&vms, &mas_detach, true); > > error = call_mmap(file, vma); > > if (error) > > goto unmap_and_free_vma; > > @@ -3149,6 +3153,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > > expanded: > > perf_event_mmap(vma); > > > > + /* Unmap any existing mapping in the area */ > > + if (vms.nr_pages) > > + vms_complete_munmap_vmas(&vms, &mas_detach); > > + > > vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT); > > if (vm_flags & VM_LOCKED) { > > if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || > > @@ -3196,14 +3204,12 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > > unacct_error: > > if (charged) > > vm_unacct_memory(charged); > > - validate_mm(mm); > > - return error; > > > > -clear_tree_failed: > > - abort_munmap_vmas(&mas_detach); > > -gather_failed: > > +abort_munmap: > > + if (vms.nr_pages) > > + abort_munmap_vmas(&mas_detach); > > validate_mm(mm); > > - return -ENOMEM; > > + return error; > > } > > > > static int __vm_munmap(unsigned long start, size_t len, bool unlock) > > -- > > 2.43.0 > > > > Other than nits/queries, LGTM: > > Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Hello, kernel test robot noticed "ltp.hugemmap06.fail" on: commit: d793398401db9fb81084bd4fe2f782342201df18 ("[PATCH v4 14/21] mm/mmap: Avoid zeroing vma tree in mmap_region()") url: https://github.com/intel-lab-lkp/linux/commits/Liam-R-Howlett/mm-mmap-Correctly-position-vma_iterator-in-__split_vma/20240711-075019 base: https://git.kernel.org/cgit/linux/kernel/git/akpm/mm.git mm-everything patch link: https://lore.kernel.org/all/20240710192250.4114783-15-Liam.Howlett@oracle.com/ patch subject: [PATCH v4 14/21] mm/mmap: Avoid zeroing vma tree in mmap_region() in testcase: ltp version: ltp-x86_64-14c1f76-1_20240706 with following parameters: test: hugetlb/hugemmap06 compiler: gcc-13 test machine: 8 threads 1 sockets Intel(R) Core(TM) i7-3770K CPU @ 3.50GHz (Ivy Bridge) with 16G memory (please refer to attached dmesg/kmsg for entire log/backtrace) If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot <oliver.sang@intel.com> | Closes: https://lore.kernel.org/oe-lkp/202407162022.5a730c37-oliver.sang@intel.com Running tests....... 
<<<test_start>>> tag=hugemmap06 stime=1721029963 cmdline="hugemmap06" contacts="" analysis=exit <<<test_output>>> tst_hugepage.c:84: TINFO: 255 hugepage(s) reserved tst_test.c:1803: TINFO: LTP version: 20240524-71-g361f6ad13 tst_test.c:1647: TINFO: Timeout per run is 0h 00m 30s hugemmap06.c:114: TPASS: No regression found hugemmap06.c:114: TPASS: No regression found hugemmap06.c:114: TPASS: No regression found hugemmap06.c:114: TPASS: No regression found hugemmap06.c:100: TFAIL: mmap failed: ENOMEM (12) HINT: You _MAY_ be missing kernel fixes: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f522c3ac00a4 https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9119a41e9091 https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=7b24d8616be3 https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=1406ec9ba6c6 Summary: passed 4 failed 1 broken 0 skipped 0 warnings 0 incrementing stop <<<execution_status>>> initiation_status="ok" duration=10 termination_type=exited termination_id=1 corefile=no cutime=2 cstime=629 <<<test_end>>> INFO: ltp-pan reported some tests FAIL LTP Version: 20240524-71-g361f6ad13 ############################################################### Done executing testcases. LTP Version: 20240524-71-g361f6ad13 ############################################################### The kernel config and materials to reproduce are available at: https://download.01.org/0day-ci/archive/20240716/202407162022.5a730c37-oliver.sang@intel.com
* kernel test robot <oliver.sang@intel.com> [240716 08:47]: > > > Hello, > > kernel test robot noticed "ltp.hugemmap06.fail" on: Hello Robot! Thank you for finding this; it will certainly help me improve the next revision of my series! > > commit: d793398401db9fb81084bd4fe2f782342201df18 ("[PATCH v4 14/21] mm/mmap: Avoid zeroing vma tree in mmap_region()") > url: https://github.com/intel-lab-lkp/linux/commits/Liam-R-Howlett/mm-mmap-Correctly-position-vma_iterator-in-__split_vma/20240711-075019 > base: https://git.kernel.org/cgit/linux/kernel/git/akpm/mm.git mm-everything > patch link: https://lore.kernel.org/all/20240710192250.4114783-15-Liam.Howlett@oracle.com/ > patch subject: [PATCH v4 14/21] mm/mmap: Avoid zeroing vma tree in mmap_region() > > in testcase: ltp > version: ltp-x86_64-14c1f76-1_20240706 > with following parameters: > > test: hugetlb/hugemmap06 > > > This fails because I am trying to set up the new MAP_FIXED huge page mapping before hugetlb_vm_op_close() has been called for the mapping being replaced, and it is that call which releases the reserved huge pages. I will address this in v5.
diff --git a/mm/internal.h b/mm/internal.h index 11e90c6e5a3e..dd4eede1be0f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1503,6 +1503,7 @@ struct vma_munmap_struct { unsigned long stack_vm; unsigned long data_vm; bool unlock; /* Unlock after the munmap */ + bool clear_ptes; /* If there are outstanding PTE to be cleared */ }; void __meminit __init_single_page(struct page *page, unsigned long pfn, diff --git a/mm/mmap.c b/mm/mmap.c index 870c2d04ad6b..58cf42e22bfe 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -401,17 +401,21 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) } static unsigned long count_vma_pages_range(struct mm_struct *mm, - unsigned long addr, unsigned long end) + unsigned long addr, unsigned long end, + unsigned long *nr_accounted) { VMA_ITERATOR(vmi, mm, addr); struct vm_area_struct *vma; unsigned long nr_pages = 0; + *nr_accounted = 0; for_each_vma_range(vmi, vma, end) { unsigned long vm_start = max(addr, vma->vm_start); unsigned long vm_end = min(end, vma->vm_end); nr_pages += PHYS_PFN(vm_end - vm_start); + if (vma->vm_flags & VM_ACCOUNT) + *nr_accounted += PHYS_PFN(vm_end - vm_start); } return nr_pages; @@ -524,6 +528,7 @@ static inline void init_vma_munmap(struct vma_munmap_struct *vms, vms->exec_vm = vms->stack_vm = vms->data_vm = 0; vms->unmap_start = FIRST_USER_ADDRESS; vms->unmap_end = USER_PGTABLES_CEILING; + vms->clear_ptes = false; /* No PTEs to clear yet */ } /* @@ -732,7 +737,6 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, vma_iter_store(vmi, vma); vma_complete(&vp, vmi, vma->vm_mm); - validate_mm(vma->vm_mm); return 0; nomem: @@ -2606,11 +2610,14 @@ static inline void abort_munmap_vmas(struct ma_state *mas_detach) } -static void vms_complete_pte_clear(struct vma_munmap_struct *vms, +static inline void vms_clear_ptes(struct vma_munmap_struct *vms, struct ma_state *mas_detach, bool mm_wr_locked) { struct mmu_gather tlb; + if (!vms->clear_ptes) /* Nothing to do */ + return; + /* * We can 
free page tables without write-locking mmap_lock because VMAs * were isolated before we downgraded mmap_lock. @@ -2624,6 +2631,7 @@ static void vms_complete_pte_clear(struct vma_munmap_struct *vms, /* start and end may be different if there is no prev or next vma. */ free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start, vms->unmap_end, mm_wr_locked); tlb_finish_mmu(&tlb); + vms->clear_ptes = false; } /* @@ -2647,7 +2655,7 @@ static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms, if (vms->unlock) mmap_write_downgrade(mm); - vms_complete_pte_clear(vms, mas_detach, !vms->unlock); + vms_clear_ptes(vms, mas_detach, !vms->unlock); /* Update high watermark before we lower total_vm */ update_hiwater_vm(mm); /* Stat accounting */ @@ -2799,6 +2807,9 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, while (vma_iter_addr(vms->vmi) > vms->start) vma_iter_prev_range(vms->vmi); + /* There are now PTEs that need to be cleared */ + vms->clear_ptes = true; + return 0; userfaultfd_error: @@ -2807,6 +2818,7 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, abort_munmap_vmas(mas_detach); start_split_failed: map_count_exceeded: + validate_mm(vms->mm); return error; } @@ -2851,8 +2863,8 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, clear_tree_failed: abort_munmap_vmas(&mas_detach); -gather_failed: validate_mm(mm); +gather_failed: return error; } @@ -2940,24 +2952,19 @@ unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long merge_start = addr, merge_end = end; bool writable_file_mapping = false; pgoff_t vm_pgoff; - int error; + int error = -ENOMEM; VMA_ITERATOR(vmi, mm, addr); + unsigned long nr_pages, nr_accounted; - /* Check against address space limit. */ - if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { - unsigned long nr_pages; - - /* - * MAP_FIXED may remove pages of mappings that intersects with - * requested mapping. Account for the pages it would unmap. 
- */ - nr_pages = count_vma_pages_range(mm, addr, end); - - if (!may_expand_vm(mm, vm_flags, - (len >> PAGE_SHIFT) - nr_pages)) - return -ENOMEM; - } + nr_pages = count_vma_pages_range(mm, addr, end, &nr_accounted); + /* + * Check against address space limit. + * MAP_FIXED may remove pages of mappings that intersects with requested + * mapping. Account for the pages it would unmap. + */ + if (!may_expand_vm(mm, vm_flags, (len >> PAGE_SHIFT) - nr_pages)) + return -ENOMEM; if (unlikely(!can_modify_mm(mm, addr, end))) return -EPERM; @@ -2974,18 +2981,13 @@ unsigned long mmap_region(struct file *file, unsigned long addr, mas_init(&mas_detach, &mt_detach, /* addr = */ 0); /* Prepare to unmap any existing mapping in the area */ if (vms_gather_munmap_vmas(&vms, &mas_detach)) - goto gather_failed; - - /* Remove any existing mappings from the vma tree */ - if (vma_iter_clear_gfp(&vmi, addr, end, GFP_KERNEL)) - goto clear_tree_failed; + return -ENOMEM; - /* Unmap any existing mapping in the area */ - vms_complete_munmap_vmas(&vms, &mas_detach); next = vms.next; prev = vms.prev; vma = NULL; } else { + /* Minimal setup of vms */ next = vma_next(&vmi); prev = vma_prev(&vmi); if (prev) @@ -2997,8 +2999,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr, */ if (accountable_mapping(file, vm_flags)) { charged = len >> PAGE_SHIFT; + charged -= nr_accounted; if (security_vm_enough_memory_mm(mm, charged)) - return -ENOMEM; + goto abort_munmap; + vms.nr_accounted = 0; vm_flags |= VM_ACCOUNT; } @@ -3047,10 +3051,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * not unmapped, but the maps are removed from the list. 
*/ vma = vm_area_alloc(mm); - if (!vma) { - error = -ENOMEM; + if (!vma) goto unacct_error; - } vma_iter_config(&vmi, addr, end); vma_set_range(vma, addr, end, pgoff); @@ -3059,6 +3061,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, if (file) { vma->vm_file = get_file(file); + /* call_mmap() may map PTE, so ensure there are no existing PTEs */ + vms_clear_ptes(&vms, &mas_detach, true); error = call_mmap(file, vma); if (error) goto unmap_and_free_vma; @@ -3149,6 +3153,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr, expanded: perf_event_mmap(vma); + /* Unmap any existing mapping in the area */ + if (vms.nr_pages) + vms_complete_munmap_vmas(&vms, &mas_detach); + vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT); if (vm_flags & VM_LOCKED) { if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || @@ -3196,14 +3204,12 @@ unsigned long mmap_region(struct file *file, unsigned long addr, unacct_error: if (charged) vm_unacct_memory(charged); - validate_mm(mm); - return error; -clear_tree_failed: - abort_munmap_vmas(&mas_detach); -gather_failed: +abort_munmap: + if (vms.nr_pages) + abort_munmap_vmas(&mas_detach); validate_mm(mm); - return -ENOMEM; + return error; } static int __vm_munmap(unsigned long start, size_t len, bool unlock)