Message ID | 20211201142918.921493-22-Liam.Howlett@oracle.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | Introducing the Maple Tree | expand |
On 12/1/21 15:29, Liam Howlett wrote: > From: "Liam R. Howlett" <Liam.Howlett@Oracle.com> > > Avoid allocating a new VMA when it a vma modification can occur. When a > brk() can expand or contract a VMA, then the single store operation will > only modify one index of the maple tree instead of causing a node to > split or coalesce. This avoids unnecessary allocations/frees of maple > tree nodes and VMAs. > > Use the advanced API for the maple tree to avoid unnecessary walks of > the tree. > > Signed-off-by: Liam R. Howlett <Liam.Howlett@Oracle.com> > --- > mm/mmap.c | 258 +++++++++++++++++++++++++++++++++++++++++++----------- > 1 file changed, 207 insertions(+), 51 deletions(-) > > +static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, > + unsigned long addr, unsigned long len, > + unsigned long flags) > { > struct mm_struct *mm = current->mm; > - struct vm_area_struct *vma, *prev; > - pgoff_t pgoff = addr >> PAGE_SHIFT; > + struct vm_area_struct *prev = NULL; > int error; > unsigned long mapped_addr; > validate_mm_mt(mm); > @@ -2740,11 +2854,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, > if (error) > return error; > > - /* Clear old maps, set up prev and uf */ > - if (munmap_vma_range(mm, addr, len, &prev, uf)) > - return -ENOMEM; > - > - /* Check against address space limits *after* clearing old maps... */ > + /* Check against address space limits by the changed size */ Can that cause spurious ENOMEM because now the check assumes 'len' worth of purely new pages and no reuse? > if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT)) > return -ENOMEM; > > @@ -2754,28 +2864,57 @@ static int do_brk_flags(unsigned long addr, unsigned long len, > if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) > return -ENOMEM; > > - /* Can we just expand an old private anonymous mapping? 
*/ > - vma = vma_merge(mm, prev, addr, addr + len, flags, > - NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX); > - if (vma) > - goto out; > + mas->last = addr + len - 1; > + if (vma) { > + /* Expand the existing vma if possible; almost never a singular > + * list, so this will almost always fail. */ > > - /* > - * create a vma struct for an anonymous mapping > - */ > - vma = vm_area_alloc(mm); > - if (!vma) { > - vm_unacct_memory(len >> PAGE_SHIFT); > - return -ENOMEM; > + if ((!vma->anon_vma || > + list_is_singular(&vma->anon_vma_chain)) && Hmm I feel uneasy about this part that mimics what vma_merge() does. Looks like something e.g. we can easily forget to adjust when changing vma_merge() itself. Is this optimization worth the trouble given the comment above "so this will almost always fail"? > + ((vma->vm_flags & ~VM_SOFTDIRTY) == flags)){ > + mas->index = vma->vm_start; > + > + vma_adjust_trans_huge(vma, addr, addr + len, 0); > + if (vma->anon_vma) { > + anon_vma_lock_write(vma->anon_vma); > + anon_vma_interval_tree_pre_update_vma(vma); > + } > + vma->vm_end = addr + len; > + vma->vm_flags |= VM_SOFTDIRTY; > + if (mas_store_gfp(mas, vma, GFP_KERNEL)) > + goto mas_mod_fail; > + > + if (vma->anon_vma) { > + anon_vma_interval_tree_post_update_vma(vma); > + anon_vma_unlock_write(vma->anon_vma); > + } > + khugepaged_enter_vma_merge(vma, flags); > + goto out; > + } > + prev = vma; > } > + mas->index = addr; > + mas_walk(mas); > + > + /* create a vma struct for an anonymous mapping */ > + vma = vm_area_alloc(mm); > + if (!vma) > + goto vma_alloc_fail; > > vma_set_anonymous(vma); > vma->vm_start = addr; > vma->vm_end = addr + len; > - vma->vm_pgoff = pgoff; > + vma->vm_pgoff = addr >> PAGE_SHIFT; > vma->vm_flags = flags; > vma->vm_page_prot = vm_get_page_prot(flags); > - vma_link(mm, vma, prev); > + if (vma_mas_store(vma, mas)) > + goto mas_store_fail; > + > + if (!prev) > + prev = mas_prev(mas, 0); > + > + __vma_link_list(mm, vma, prev); > + mm->map_count++; > out: > 
perf_event_mmap(vma); > mm->total_vm += len >> PAGE_SHIFT; > @@ -2785,15 +2924,31 @@ static int do_brk_flags(unsigned long addr, unsigned long len, > vma->vm_flags |= VM_SOFTDIRTY; > validate_mm_mt(mm); > return 0; > + > +mas_store_fail: > + vm_area_free(vma); > +vma_alloc_fail: > + vm_unacct_memory(len >> PAGE_SHIFT); > + return -ENOMEM; > + > +mas_mod_fail: > + vma->vm_end = addr; > + if (vma->anon_vma) { > + anon_vma_interval_tree_post_update_vma(vma); > + anon_vma_unlock_write(vma->anon_vma); > + } > + return -ENOMEM; > + > } > > int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) > { > struct mm_struct *mm = current->mm; > + struct vm_area_struct *vma = NULL; > unsigned long len; > int ret; > bool populate; > - LIST_HEAD(uf); > + MA_STATE(mas, &mm->mm_mt, addr, addr); > > len = PAGE_ALIGN(request); > if (len < request) > @@ -2804,10 +2959,11 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) > if (mmap_write_lock_killable(mm)) > return -EINTR; > > - ret = do_brk_flags(addr, len, flags, &uf); > + // This vma left intentionally blank. This comment using unintentionally bad syntax (// vs /* */) Also if we leave it blank it means this path won't ever expand an existing vma, while previously it could succeed the vma_merge, no? Or all callers of vm_brk_flags() in a scenario where there's no expand anyway? Maybe just have a more verbose comment... > + mas_walk(&mas); > + ret = do_brk_flags(&mas, vma, addr, len, flags); > populate = ((mm->def_flags & VM_LOCKED) != 0); > mmap_write_unlock(mm); > - userfaultfd_unmap_complete(mm, &uf); Looks like this part is removed completely from vm_brk_flags() paths? OK it seems the whole patch makes some asumption that vm_brk_flags() never has to unmap a pre-existing area, and in the brk() syscall this is now delegated to do_brk_munmap(), and do_brk_flags() loses the support. 
While it might be safe, it should be discussed in the patch that vm_brk_flags() didn't actually need to support the unmap part, because x y z. And best if there are some DEBUG_VM-based assertions supporting that. But then again, is the optimized scenario happening often enough to warrant it? > if (populate && !ret) > mm_populate(addr, len); > return ret;
On 12/1/21 15:29, Liam Howlett wrote: > @@ -1989,6 +2013,7 @@ EXPORT_SYMBOL(get_unmapped_area); > struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) > { > struct vm_area_struct *vma; > + MA_STATE(mas, &mm->mm_mt, addr, addr); > > mmap_assert_locked(mm); > /* Check the cache first. */ > @@ -1996,7 +2021,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) > if (likely(vma)) > return vma; > > - vma = mt_find(&mm->mm_mt, &addr, ULONG_MAX); > + vma = mas_find(&mas, -1); > if (vma) > vmacache_update(addr, vma); > return vma; Oh and this change to find_vma() was supposed to go to the next patch, no?
* Vlastimil Babka <vbabka@suse.cz> [220113 07:59]: > On 12/1/21 15:29, Liam Howlett wrote: > > From: "Liam R. Howlett" <Liam.Howlett@Oracle.com> > > > > Avoid allocating a new VMA when it a vma modification can occur. When a > > brk() can expand or contract a VMA, then the single store operation will > > only modify one index of the maple tree instead of causing a node to > > split or coalesce. This avoids unnecessary allocations/frees of maple > > tree nodes and VMAs. > > > > Use the advanced API for the maple tree to avoid unnecessary walks of > > the tree. > > > > Signed-off-by: Liam R. Howlett <Liam.Howlett@Oracle.com> > > --- > > mm/mmap.c | 258 +++++++++++++++++++++++++++++++++++++++++++----------- > > 1 file changed, 207 insertions(+), 51 deletions(-) > > > > +static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, > > + unsigned long addr, unsigned long len, > > + unsigned long flags) > > { > > struct mm_struct *mm = current->mm; > > - struct vm_area_struct *vma, *prev; > > - pgoff_t pgoff = addr >> PAGE_SHIFT; > > + struct vm_area_struct *prev = NULL; > > int error; > > unsigned long mapped_addr; > > validate_mm_mt(mm); > > @@ -2740,11 +2854,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, > > if (error) > > return error; > > > > - /* Clear old maps, set up prev and uf */ > > - if (munmap_vma_range(mm, addr, len, &prev, uf)) > > - return -ENOMEM; > > - > > - /* Check against address space limits *after* clearing old maps... */ > > + /* Check against address space limits by the changed size */ > > Can that cause spurious ENOMEM because now the check assumes 'len' worth of > purely new pages and no reuse? I don't think so? I must be missing how anything could exist in this range to begin with? The brk syscall checks to ensure there is enough room and the other two users are the elf and a.out loaders - could either of those two map over parts of themselves on load? 
This seemed to be there primarily to set up for an rb insert (set prev, rb_link, rb_parent) in the current code. Sort of like how get_unmapped_area() with MAP_FIXED appears to be used to sanitize the addr and len. > > > if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT)) > > return -ENOMEM; > > > > @@ -2754,28 +2864,57 @@ static int do_brk_flags(unsigned long addr, unsigned long len, > > if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) > > return -ENOMEM; > > > > - /* Can we just expand an old private anonymous mapping? */ > > - vma = vma_merge(mm, prev, addr, addr + len, flags, > > - NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX); > > - if (vma) > > - goto out; > > + mas->last = addr + len - 1; > > + if (vma) { > > + /* Expand the existing vma if possible; almost never a singular > > + * list, so this will almost always fail. */ > > > > - /* > > - * create a vma struct for an anonymous mapping > > - */ > > - vma = vm_area_alloc(mm); > > - if (!vma) { > > - vm_unacct_memory(len >> PAGE_SHIFT); > > - return -ENOMEM; > > + if ((!vma->anon_vma || > > + list_is_singular(&vma->anon_vma_chain)) && > > Hmm I feel uneasy about this part that mimics what vma_merge() does. Looks > like something e.g. we can easily forget to adjust when changing vma_merge() > itself. vma_merge() is overly heavy for what brk() is doing. I get what you are saying about it potentially being missed and I think brk is already in the 'potentially overlooked' category as it stands today. Honestly, one less user of vma_merge() (and thus less __vma_adjust() users) seems like a win to me. > Is this optimization worth the trouble given the comment above "so this will > almost always fail"? vma_merge() walks the tree for next and next->next and does a lot of extra checks before arriving at the conclusion that this will fail. Maybe 'almost always fail' is too strong wording; on boot of my VM, 63 expands happen out of 517, so 87.8% fail this test. 
> > > + ((vma->vm_flags & ~VM_SOFTDIRTY) == flags)){ > > + mas->index = vma->vm_start; > > + > > + vma_adjust_trans_huge(vma, addr, addr + len, 0); > > + if (vma->anon_vma) { > > + anon_vma_lock_write(vma->anon_vma); > > + anon_vma_interval_tree_pre_update_vma(vma); > > + } > > + vma->vm_end = addr + len; > > + vma->vm_flags |= VM_SOFTDIRTY; > > + if (mas_store_gfp(mas, vma, GFP_KERNEL)) > > + goto mas_mod_fail; > > + > > + if (vma->anon_vma) { > > + anon_vma_interval_tree_post_update_vma(vma); > > + anon_vma_unlock_write(vma->anon_vma); > > + } > > + khugepaged_enter_vma_merge(vma, flags); > > + goto out; > > + } > > + prev = vma; > > } > > + mas->index = addr; > > + mas_walk(mas); > > + > > + /* create a vma struct for an anonymous mapping */ > > + vma = vm_area_alloc(mm); > > + if (!vma) > > + goto vma_alloc_fail; > > > > vma_set_anonymous(vma); > > vma->vm_start = addr; > > vma->vm_end = addr + len; > > - vma->vm_pgoff = pgoff; > > + vma->vm_pgoff = addr >> PAGE_SHIFT; > > vma->vm_flags = flags; > > vma->vm_page_prot = vm_get_page_prot(flags); > > - vma_link(mm, vma, prev); > > + if (vma_mas_store(vma, mas)) > > + goto mas_store_fail; > > + > > + if (!prev) > > + prev = mas_prev(mas, 0); > > + > > + __vma_link_list(mm, vma, prev); > > + mm->map_count++; > > out: > > perf_event_mmap(vma); > > mm->total_vm += len >> PAGE_SHIFT; > > @@ -2785,15 +2924,31 @@ static int do_brk_flags(unsigned long addr, unsigned long len, > > vma->vm_flags |= VM_SOFTDIRTY; > > validate_mm_mt(mm); > > return 0; > > + > > +mas_store_fail: > > + vm_area_free(vma); > > +vma_alloc_fail: > > + vm_unacct_memory(len >> PAGE_SHIFT); > > + return -ENOMEM; > > + > > +mas_mod_fail: > > + vma->vm_end = addr; > > + if (vma->anon_vma) { > > + anon_vma_interval_tree_post_update_vma(vma); > > + anon_vma_unlock_write(vma->anon_vma); > > + } > > + return -ENOMEM; > > + > > } > > > > int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) > > { > > struct mm_struct *mm = 
current->mm; > > + struct vm_area_struct *vma = NULL; > > unsigned long len; > > int ret; > > bool populate; > > - LIST_HEAD(uf); > > + MA_STATE(mas, &mm->mm_mt, addr, addr); > > > > len = PAGE_ALIGN(request); > > if (len < request) > > @@ -2804,10 +2959,11 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) > > if (mmap_write_lock_killable(mm)) > > return -EINTR; > > > > - ret = do_brk_flags(addr, len, flags, &uf); > > + // This vma left intentionally blank. > > This comment using unintentionally bad syntax (// vs /* */) Ha! Thanks! > > Also if we leave it blank it means this path won't ever expand an existing > vma, while previously it could succeed the vma_merge, no? Or all callers of > vm_brk_flags() in a scenario where there's no expand anyway? Maybe just have > a more verbose comment... Is it possible that other code paths lead to a vma merge? From what I can tell the other entry points are for the BSS or after loading a binary anyways. I guess the next vma could be anon and have matching flags, but I think they will all have a vma->vm_file. In fact, if I change the do_brk_flags() to check !vma->vm_file and pass through the vma in the case of vma->vm_end == addr, then it works - but there are no merging from this code path that I can see on boot. If you think this is necessary, I can add it in, but I don't think it's needed. > > > + mas_walk(&mas); > > + ret = do_brk_flags(&mas, vma, addr, len, flags); > > populate = ((mm->def_flags & VM_LOCKED) != 0); > > mmap_write_unlock(mm); > > - userfaultfd_unmap_complete(mm, &uf); > > Looks like this part is removed completely from vm_brk_flags() paths? > When I removed the call to munmap_vma_range(), I dropped the userfaultfd here as there was no need. > OK it seems the whole patch makes some asumption that vm_brk_flags() never > has to unmap a pre-existing area, and in the brk() syscall this is now > delegated to do_brk_munmap(), and do_brk_flags() loses the support. 
While it > might be safe, it should be discussed in the patch that vm_brk_flags() > didn't actually need to support the unmap part, because x y z. And best if > there are some DEBUG_VM based assertions supporting that. Yes, I do believe this to be true. I don't think do_brk_flags() needs to unmap. If you look at brk() syscall, it actually ensures there is enough room for the expansion + vm_start_gap(next), so that one is probably safe. brk() already depends on do_brk_munmap() doing the unmap. The other callers are the elf and a.out loaders, which also don't appear to use the functionality. I thought the call was only for setting up for insertion into the rbtree. > > But then again, is the optimized scenario happening often enough to warrant it? well, 12.2% use the optimization to the fullest, the rest fail faster. I am really after the faster failure optimization here. I especially do not like the fact that vma_merge() gets the next vma and the next->next vma prior to seeing if it can be merged. I get why, but is there really going to be an anon vma with the right flags, no file, etc, etc, etc often enough to try this? In fact, it's not possible at all to need next->next when we unmap the area first. Out of the 8 cases in vma_merge, only 1, 2, and 3 are possible. Of the 3 possibilities, I am suggesting that 2 is really the only one we should check as 1 and 3 are so unlikely. > > > if (populate && !ret) > > mm_populate(addr, len); > > return ret; >
* Vlastimil Babka <vbabka@suse.cz> [220113 10:28]: > On 12/1/21 15:29, Liam Howlett wrote: > > @@ -1989,6 +2013,7 @@ EXPORT_SYMBOL(get_unmapped_area); > > struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) > > { > > struct vm_area_struct *vma; > > + MA_STATE(mas, &mm->mm_mt, addr, addr); > > > > mmap_assert_locked(mm); > > /* Check the cache first. */ > > @@ -1996,7 +2021,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) > > if (likely(vma)) > > return vma; > > > > - vma = mt_find(&mm->mm_mt, &addr, ULONG_MAX); > > + vma = mas_find(&mas, -1); > > if (vma) > > vmacache_update(addr, vma); > > return vma; > > Oh and this change to find_vma() was supposed to go to the next patch, no? Yes, thanks. I will relocate this change to the next patch.
On 1/19/22 04:03, Liam Howlett wrote: > * Vlastimil Babka <vbabka@suse.cz> [220113 07:59]: >> On 12/1/21 15:29, Liam Howlett wrote: >> > From: "Liam R. Howlett" <Liam.Howlett@Oracle.com> >> > >> > Avoid allocating a new VMA when it a vma modification can occur. When a >> > brk() can expand or contract a VMA, then the single store operation will >> > only modify one index of the maple tree instead of causing a node to >> > split or coalesce. This avoids unnecessary allocations/frees of maple >> > tree nodes and VMAs. >> > >> > Use the advanced API for the maple tree to avoid unnecessary walks of >> > the tree. >> > >> > Signed-off-by: Liam R. Howlett <Liam.Howlett@Oracle.com> >> > --- >> > mm/mmap.c | 258 +++++++++++++++++++++++++++++++++++++++++++----------- >> > 1 file changed, 207 insertions(+), 51 deletions(-) >> > >> > +static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, >> > + unsigned long addr, unsigned long len, >> > + unsigned long flags) >> > { >> > struct mm_struct *mm = current->mm; >> > - struct vm_area_struct *vma, *prev; >> > - pgoff_t pgoff = addr >> PAGE_SHIFT; >> > + struct vm_area_struct *prev = NULL; >> > int error; >> > unsigned long mapped_addr; >> > validate_mm_mt(mm); >> > @@ -2740,11 +2854,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, >> > if (error) >> > return error; >> > >> > - /* Clear old maps, set up prev and uf */ >> > - if (munmap_vma_range(mm, addr, len, &prev, uf)) >> > - return -ENOMEM; >> > - >> > - /* Check against address space limits *after* clearing old maps... */ >> > + /* Check against address space limits by the changed size */ >> >> Can that cause spurious ENOMEM because now the check assumes 'len' worth of >> purely new pages and no reuse? > > > I don't think so? I must be missing how anything could exist in this > range to begin with? Well the comment including "*after*" made it look like somebody was careful for a good reason. But it's possible that it's outdated, of course. 
So it's generally good to explain in such changes how it was evaluated that it's now ok. > The brk syscall checks to ensure there is enough > room and the other two users are the elf and a.out loaders - could > either of those two map over parts of themselves on load? IIRC some past changes between MAP_FIXED/MAP_FIXED_NOREPLACE made us realize that some loaders do, see e.g. 5f501d555653 ("binfmt_elf: reintroduce using MAP_FIXED_NOREPLACE") But I guess we can also assume that during the initial elf loading, we are not even close to the limits and it's unlikely to manifest as a problem somewhere... > This seemed > to be there primarily to set up for an rb insert (set prev, rb_link, > rb_parent) in the current code. Sort of like how get_unmapped_area() > with MAP_FIXED appears to be used to sanitize the addr and len. > > >> >> > if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT)) >> > return -ENOMEM; >> > >> > @@ -2754,28 +2864,57 @@ static int do_brk_flags(unsigned long addr, unsigned long len, >> > if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) >> > return -ENOMEM; >> > >> > - /* Can we just expand an old private anonymous mapping? */ >> > - vma = vma_merge(mm, prev, addr, addr + len, flags, >> > - NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX); >> > - if (vma) >> > - goto out; >> > + mas->last = addr + len - 1; >> > + if (vma) { >> > + /* Expand the existing vma if possible; almost never a singular >> > + * list, so this will almost always fail. */ >> > >> > - /* >> > - * create a vma struct for an anonymous mapping >> > - */ >> > - vma = vm_area_alloc(mm); >> > - if (!vma) { >> > - vm_unacct_memory(len >> PAGE_SHIFT); >> > - return -ENOMEM; >> > + if ((!vma->anon_vma || >> > + list_is_singular(&vma->anon_vma_chain)) && >> >> Hmm I feel uneasy about this part that mimics what vma_merge() does. Looks >> like something e.g. we can easily forget to adjust when changing vma_merge() >> itself. > > vma_merge() is overly heavy for what brk() is doing. 
I get what you are > saying about it potentially being missed and I think brk is already in > the 'potentially overlooked' category as it stands today. Honestly, one > less user of vma_merge() (and thus less __vma_adjust() users) seems like > a win to me. > >> Is this optimization worth the trouble given the comment above "so this will >> almost always fail"? > > vma_merge() walks the tree for next and next->next and does a lot of > extra checks before arriving at the conclusion that this will fail. > Maybe 'almost always fail' is too strong wording; on boot of my VM, 63 > expands happen out of 517, so 87.8% fail this test. OK guess we can live with it, I think you already mentioned there's plan for future cleanups in this area :) >> >> > + ((vma->vm_flags & ~VM_SOFTDIRTY) == flags)){ >> > + mas->index = vma->vm_start; >> > + >> > + vma_adjust_trans_huge(vma, addr, addr + len, 0); >> > + if (vma->anon_vma) { >> > + anon_vma_lock_write(vma->anon_vma); >> > + anon_vma_interval_tree_pre_update_vma(vma); >> > + } >> > + vma->vm_end = addr + len; >> > + vma->vm_flags |= VM_SOFTDIRTY; >> > + if (mas_store_gfp(mas, vma, GFP_KERNEL)) >> > + goto mas_mod_fail; >> > + >> > + if (vma->anon_vma) { >> > + anon_vma_interval_tree_post_update_vma(vma); >> > + anon_vma_unlock_write(vma->anon_vma); >> > + } >> > + khugepaged_enter_vma_merge(vma, flags); >> > + goto out; >> > + } >> > + prev = vma; >> > } >> > + mas->index = addr; >> > + mas_walk(mas); >> > + >> > + /* create a vma struct for an anonymous mapping */ >> > + vma = vm_area_alloc(mm); >> > + if (!vma) >> > + goto vma_alloc_fail; >> > >> > vma_set_anonymous(vma); >> > vma->vm_start = addr; >> > vma->vm_end = addr + len; >> > - vma->vm_pgoff = pgoff; >> > + vma->vm_pgoff = addr >> PAGE_SHIFT; >> > vma->vm_flags = flags; >> > vma->vm_page_prot = vm_get_page_prot(flags); >> > - vma_link(mm, vma, prev); >> > + if (vma_mas_store(vma, mas)) >> > + goto mas_store_fail; >> > + >> > + if (!prev) >> > + prev = mas_prev(mas, 0); >> 
> + >> > + __vma_link_list(mm, vma, prev); >> > + mm->map_count++; >> > out: >> > perf_event_mmap(vma); >> > mm->total_vm += len >> PAGE_SHIFT; >> > @@ -2785,15 +2924,31 @@ static int do_brk_flags(unsigned long addr, unsigned long len, >> > vma->vm_flags |= VM_SOFTDIRTY; >> > validate_mm_mt(mm); >> > return 0; >> > + >> > +mas_store_fail: >> > + vm_area_free(vma); >> > +vma_alloc_fail: >> > + vm_unacct_memory(len >> PAGE_SHIFT); >> > + return -ENOMEM; >> > + >> > +mas_mod_fail: >> > + vma->vm_end = addr; >> > + if (vma->anon_vma) { >> > + anon_vma_interval_tree_post_update_vma(vma); >> > + anon_vma_unlock_write(vma->anon_vma); >> > + } >> > + return -ENOMEM; >> > + >> > } >> > >> > int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) >> > { >> > struct mm_struct *mm = current->mm; >> > + struct vm_area_struct *vma = NULL; >> > unsigned long len; >> > int ret; >> > bool populate; >> > - LIST_HEAD(uf); >> > + MA_STATE(mas, &mm->mm_mt, addr, addr); >> > >> > len = PAGE_ALIGN(request); >> > if (len < request) >> > @@ -2804,10 +2959,11 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) >> > if (mmap_write_lock_killable(mm)) >> > return -EINTR; >> > >> > - ret = do_brk_flags(addr, len, flags, &uf); >> > + // This vma left intentionally blank. >> >> This comment using unintentionally bad syntax (// vs /* */) > > Ha! Thanks! > >> >> Also if we leave it blank it means this path won't ever expand an existing >> vma, while previously it could succeed the vma_merge, no? Or all callers of >> vm_brk_flags() in a scenario where there's no expand anyway? Maybe just have >> a more verbose comment... > > Is it possible that other code paths lead to a vma merge? From what I > can tell the other entry points are for the BSS or after loading a > binary anyways. I guess the next vma could be anon and have matching > flags, but I think they will all have a vma->vm_file. 
In fact, if I > change the do_brk_flags() to check !vma->vm_file and pass through the > vma in the case of vma->vm_end == addr, then it works - but there are no > merging from this code path that I can see on boot. If you think this > is necessary, I can add it in, but I don't think it's needed. Looks like this is all changed again towards the end of the series anyway... >> >> > + mas_walk(&mas); >> > + ret = do_brk_flags(&mas, vma, addr, len, flags); >> > populate = ((mm->def_flags & VM_LOCKED) != 0); >> > mmap_write_unlock(mm); >> > - userfaultfd_unmap_complete(mm, &uf); >> >> Looks like this part is removed completely from vm_brk_flags() paths? >> > > When I removed the call to munmap_vma_range(), I dropped the userfaultfd > here as there was no need. > > >> OK it seems the whole patch makes some asumption that vm_brk_flags() never >> has to unmap a pre-existing area, and in the brk() syscall this is now >> delegated to do_brk_munmap(), and do_brk_flags() loses the support. While it >> might be safe, it should be discussed in the patch that vm_brk_flags() >> didn't actually need to support the unmap part, because x y z. And best if >> there are some DEBUG_VM based assertions supporting that. > > Yes, I do believe this to be true. I don't think do_brk_flags() needs > to unmap. If you look at brk() syscall, it actually ensure there is > enough room for the expansion + vm_start_gap(next), so that one is > probably safe. brk() already depends on do_brk_munmap() doing the > unmap. The other callers are the elf and a.out loaders, which also > don't appear to use the functionality. Hopefully that doesn't rely on assuming no elf segments overlap, as discussed above. > I thought the call was only for > setting up for insertion into the rbtree. > >> >> But then again, is the optimized scenario happening often enough to warrant it? > > well, 12.2% use the optimization to the fullest, the rest fail faster. > I am really after the faster failure optimization here. 
I especially do > not like the fact that vma_merge() gets the next vma and the next->next > vma prior to seeing if it can be merged. I get why, but is there really > going to be an anon vma with the right flags, no file, etc, etc, etc > often enough to try this? In fact, it's not possible at all to need > next->next when we unmap the area first. Out of the 8 cases in > vma_merge, only 1, 2, and 3 are possible. Of the 3 possibilities, I am > suggesting that 2 is really the only one we should check as 1 and 3 are > so unlikely. > > >> >> > if (populate && !ret) >> > mm_populate(addr, len); >> > return ret; >>
diff --git a/mm/mmap.c b/mm/mmap.c index c5f92666d145..e4c8ce377f2c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -188,17 +188,22 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) return next; } -static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags, - struct list_head *uf); +static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma, + unsigned long newbrk, unsigned long oldbrk, + struct list_head *uf); +static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *brkvma, + unsigned long addr, unsigned long request, + unsigned long flags); SYSCALL_DEFINE1(brk, unsigned long, brk) { unsigned long newbrk, oldbrk, origbrk; struct mm_struct *mm = current->mm; - struct vm_area_struct *next; + struct vm_area_struct *brkvma, *next = NULL; unsigned long min_brk; bool populate; bool downgraded = false; LIST_HEAD(uf); + MA_STATE(mas, &mm->mm_mt, 0, 0); if (mmap_write_lock_killable(mm)) return -EINTR; @@ -238,37 +243,56 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) goto success; } - /* - * Always allow shrinking brk. - * __do_munmap() may downgrade mmap_lock to read. - */ - if (brk <= mm->brk) { + mas_set(&mas, newbrk); + brkvma = mas_walk(&mas); + if (brkvma) { // munmap necessary, there is something at newbrk. + /* + * Always allow shrinking brk. + * do_brk_munmap() may downgrade mmap_lock to read. + */ int ret; + if (brkvma->vm_start >= oldbrk) + goto out; // mapping intersects with an existing non-brk vma. /* - * mm->brk must to be protected by write mmap_lock so update it - * before downgrading mmap_lock. When __do_munmap() fails, - * mm->brk will be restored from origbrk. + * mm->brk must be protected by write mmap_lock. + * do_brk_munmap() may downgrade the lock, so update it + * before calling do_brk_munmap(). 
*/ mm->brk = brk; - ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true); - if (ret < 0) { - mm->brk = origbrk; - goto out; - } else if (ret == 1) { + mas.last = oldbrk - 1; + ret = do_brk_munmap(&mas, brkvma, newbrk, oldbrk, &uf); + if (ret == 1) { downgraded = true; - } - goto success; - } + goto success; + } else if (!ret) + goto success; + mm->brk = origbrk; + goto out; + } + /* Only check if the next VMA is within the stack_guard_gap of the + * expansion area */ + next = mas_next(&mas, newbrk + PAGE_SIZE + stack_guard_gap); /* Check against existing mmap mappings. */ - next = find_vma(mm, oldbrk); if (next && newbrk + PAGE_SIZE > vm_start_gap(next)) goto out; + brkvma = mas_prev(&mas, mm->start_brk); + if (brkvma) { + if (brkvma->vm_start >= oldbrk) + goto out; // Trying to map over another vma. + + if (brkvma->vm_end <= min_brk) { + brkvma = NULL; + mas_reset(&mas); + } + } + /* Ok, looks good - let it rip. */ - if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0) + if (do_brk_flags(&mas, brkvma, oldbrk, newbrk - oldbrk, 0) < 0) goto out; + mm->brk = brk; success: @@ -1989,6 +2013,7 @@ EXPORT_SYMBOL(get_unmapped_area); struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma; + MA_STATE(mas, &mm->mm_mt, addr, addr); mmap_assert_locked(mm); /* Check the cache first. */ @@ -1996,7 +2021,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) if (likely(vma)) return vma; - vma = mt_find(&mm->mm_mt, &addr, ULONG_MAX); + vma = mas_find(&mas, -1); if (vma) vmacache_update(addr, vma); return vma; @@ -2713,16 +2738,105 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, } /* - * this is really a simplified "do_mmap". it only handles - * anonymous maps. eventually we may be able to do some - * brk-specific accounting here. + * brk_munmap() - Unmap a parital vma. + * @mas: The maple tree state. 
+ * @vma: The vma to be modified + * @newbrk: the start of the address to unmap + * @oldbrk: The end of the address to unmap + * @uf: The userfaultfd list_head + * + * Returns: 1 on success. + * unmaps a partial VMA mapping. Does not handle alignment, downgrades lock if + * possible. + */ +static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma, + unsigned long newbrk, unsigned long oldbrk, + struct list_head *uf) +{ + struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct unmap; + unsigned long unmap_pages; + int ret = 1; + + arch_unmap(mm, newbrk, oldbrk); + + if (likely(vma->vm_start >= newbrk)) { // remove entire mapping(s) + mas_set(mas, newbrk); + if (vma->vm_start != newbrk) + mas_reset(mas); // cause a re-walk for the first overlap. + ret = __do_munmap(mm, newbrk, oldbrk - newbrk, uf, true); + goto munmap_full_vma; + } + + vma_init(&unmap, mm); + unmap.vm_start = newbrk; + unmap.vm_end = oldbrk; + ret = userfaultfd_unmap_prep(&unmap, newbrk, oldbrk, uf); + if (ret) + return ret; + ret = 1; + + // Change the oldbrk of vma to the newbrk of the munmap area + vma_adjust_trans_huge(vma, vma->vm_start, newbrk, 0); + if (vma->anon_vma) { + anon_vma_lock_write(vma->anon_vma); + anon_vma_interval_tree_pre_update_vma(vma); + } + + vma->vm_end = newbrk; + if (vma_mas_remove(&unmap, mas)) + goto mas_store_fail; + + vmacache_invalidate(vma->vm_mm); + if (vma->anon_vma) { + anon_vma_interval_tree_post_update_vma(vma); + anon_vma_unlock_write(vma->anon_vma); + } + + unmap_pages = vma_pages(&unmap); + if (unmap.vm_flags & VM_LOCKED) { + mm->locked_vm -= unmap_pages; + munlock_vma_pages_range(&unmap, newbrk, oldbrk); + } + + mmap_write_downgrade(mm); + unmap_region(mm, &unmap, vma, newbrk, oldbrk); + /* Statistics */ + vm_stat_account(mm, unmap.vm_flags, -unmap_pages); + if (unmap.vm_flags & VM_ACCOUNT) + vm_unacct_memory(unmap_pages); + +munmap_full_vma: + validate_mm_mt(mm); + return ret; + +mas_store_fail: + vma->vm_end = oldbrk; + if 
(vma->anon_vma) { + anon_vma_interval_tree_post_update_vma(vma); + anon_vma_unlock_write(vma->anon_vma); + } + return -ENOMEM; +} + +/* + * do_brk_flags() - Increase the brk vma if the flags match. + * @mas: The maple tree state. + * @vma: The brk vma to expand, or NULL when a new VMA must be created + * @addr: The start address + * @len: The length of the increase + * @flags: The VMA flags + * + * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags + * do not match then create a new anonymous VMA. Eventually we may be able to + * do some brk-specific accounting here. + * + * Return: 0 on success, -ENOMEM on failure. */ -static int do_brk_flags(unsigned long addr, unsigned long len, - unsigned long flags, struct list_head *uf) +static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, + unsigned long addr, unsigned long len, + unsigned long flags) { struct mm_struct *mm = current->mm; - struct vm_area_struct *vma, *prev; - pgoff_t pgoff = addr >> PAGE_SHIFT; + struct vm_area_struct *prev = NULL; int error; unsigned long mapped_addr; validate_mm_mt(mm); @@ -2740,11 +2854,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, if (error) return error; - /* Clear old maps, set up prev and uf */ - if (munmap_vma_range(mm, addr, len, &prev, uf)) - return -ENOMEM; - - /* Check against address space limits *after* clearing old maps... */ + /* Check against address space limits by the changed size */ if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT)) return -ENOMEM; @@ -2754,28 +2864,57 @@ static int do_brk_flags(unsigned long addr, unsigned long len, if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) return -ENOMEM; - /* Can we just expand an old private anonymous mapping? */ - vma = vma_merge(mm, prev, addr, addr + len, flags, - NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX); - if (vma) - goto out; + mas->last = addr + len - 1; + if (vma) { + /* Expand the existing vma if possible; almost never a singular + * list, so this will almost always fail.
*/ - /* - * create a vma struct for an anonymous mapping - */ - vma = vm_area_alloc(mm); - if (!vma) { - vm_unacct_memory(len >> PAGE_SHIFT); - return -ENOMEM; + if ((!vma->anon_vma || + list_is_singular(&vma->anon_vma_chain)) && + ((vma->vm_flags & ~VM_SOFTDIRTY) == flags)){ + mas->index = vma->vm_start; + + vma_adjust_trans_huge(vma, addr, addr + len, 0); + if (vma->anon_vma) { + anon_vma_lock_write(vma->anon_vma); + anon_vma_interval_tree_pre_update_vma(vma); + } + vma->vm_end = addr + len; + vma->vm_flags |= VM_SOFTDIRTY; + if (mas_store_gfp(mas, vma, GFP_KERNEL)) + goto mas_mod_fail; + + if (vma->anon_vma) { + anon_vma_interval_tree_post_update_vma(vma); + anon_vma_unlock_write(vma->anon_vma); + } + khugepaged_enter_vma_merge(vma, flags); + goto out; + } + prev = vma; } + mas->index = addr; + mas_walk(mas); + + /* create a vma struct for an anonymous mapping */ + vma = vm_area_alloc(mm); + if (!vma) + goto vma_alloc_fail; vma_set_anonymous(vma); vma->vm_start = addr; vma->vm_end = addr + len; - vma->vm_pgoff = pgoff; + vma->vm_pgoff = addr >> PAGE_SHIFT; vma->vm_flags = flags; vma->vm_page_prot = vm_get_page_prot(flags); - vma_link(mm, vma, prev); + if (vma_mas_store(vma, mas)) + goto mas_store_fail; + + if (!prev) + prev = mas_prev(mas, 0); + + __vma_link_list(mm, vma, prev); + mm->map_count++; out: perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; @@ -2785,15 +2924,31 @@ static int do_brk_flags(unsigned long addr, unsigned long len, vma->vm_flags |= VM_SOFTDIRTY; validate_mm_mt(mm); return 0; + +mas_store_fail: + vm_area_free(vma); +vma_alloc_fail: + vm_unacct_memory(len >> PAGE_SHIFT); + return -ENOMEM; + +mas_mod_fail: + vma->vm_end = addr; + if (vma->anon_vma) { + anon_vma_interval_tree_post_update_vma(vma); + anon_vma_unlock_write(vma->anon_vma); + } + return -ENOMEM; + } int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) { struct mm_struct *mm = current->mm; + struct vm_area_struct *vma = NULL; unsigned long len; int 
ret; bool populate; - LIST_HEAD(uf); + MA_STATE(mas, &mm->mm_mt, addr, addr); len = PAGE_ALIGN(request); if (len < request) @@ -2804,10 +2959,11 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) if (mmap_write_lock_killable(mm)) return -EINTR; - ret = do_brk_flags(addr, len, flags, &uf); + /* vma is intentionally NULL; do_brk_flags() creates a new anonymous VMA. */ + mas_walk(&mas); + ret = do_brk_flags(&mas, vma, addr, len, flags); populate = ((mm->def_flags & VM_LOCKED) != 0); mmap_write_unlock(mm); - userfaultfd_unmap_complete(mm, &uf); if (populate && !ret) mm_populate(addr, len); return ret;