Message ID | cf40652a2c3f6b987623f8f11a514618718546f7.1722849860.git.lorenzo.stoakes@oracle.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | mm: remove vma_merge() | expand |
On Mon, 5 Aug 2024 13:13:54 +0100 Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote: > In mmap_region() and do_brk_flags() we open code scenarios where we prefer > to use vma_expand() rather than invoke a full vma_merge() operation. > > Abstract this logic and eliminate all of the open-coding, and also use the > same logic for all cases where we add new VMAs so that, rather than > ultimately using vma_merge(), we use vma_expand(). > > We implement this by replacing vma_merge_new_vma() with this newly > abstracted logic. > > Doing so removes duplication and simplifies VMA merging in all such cases, > laying the groundwork for us to eliminate the merging of new VMAs in > vma_merge() altogether. > > This makes it far easier to understand what is happening in these cases, > avoiding confusion, bugs and allowing for future optimisation. > > As a result of this change we are also able to make vma_prepare(), > init_vma_prep(), vma_complete(), can_vma_merge_before() and > can_vma_merge_after() static and internal to vma.c.

This patch truly rocks. Let me just say: Wow!
Petr T > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > --- > mm/mmap.c | 79 ++--- > mm/vma.c | 482 +++++++++++++++++++------------ > mm/vma.h | 51 +--- > tools/testing/vma/vma_internal.h | 6 + > 4 files changed, 324 insertions(+), 294 deletions(-) > > diff --git a/mm/mmap.c b/mm/mmap.c > index f6593a81f73d..c03f50f46396 100644 > --- a/mm/mmap.c > +++ b/mm/mmap.c > @@ -1363,8 +1363,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > { > struct mm_struct *mm = current->mm; > struct vm_area_struct *vma = NULL; > - struct vm_area_struct *next, *prev, *merge; > - pgoff_t pglen = len >> PAGE_SHIFT; > + struct vm_area_struct *merge; > unsigned long charged = 0; > unsigned long end = addr + len; > bool writable_file_mapping = false; > @@ -1411,44 +1410,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > vm_flags |= VM_ACCOUNT; > } > > - next = vmg.next = vma_next(&vmi); > - prev = vmg.prev = vma_prev(&vmi); > - if (vm_flags & VM_SPECIAL) { > - if (prev) > - vma_iter_next_range(&vmi); > - goto cannot_expand; > - } > - > - /* Attempt to expand an old mapping */ > - /* Check next */ > - if (next && next->vm_start == end && can_vma_merge_before(&vmg)) { > - /* We can adjust this as can_vma_merge_after() doesn't touch */ > - vmg.end = next->vm_end; > - vma = vmg.vma = next; > - vmg.pgoff = next->vm_pgoff - pglen; > - > - /* We may merge our NULL anon_vma with non-NULL in next. 
*/ > - vmg.anon_vma = vma->anon_vma; > - } > - > - /* Check prev */ > - if (prev && prev->vm_end == addr && can_vma_merge_after(&vmg)) { > - vmg.start = prev->vm_start; > - vma = vmg.vma = prev; > - vmg.pgoff = prev->vm_pgoff; > - } else if (prev) { > - vma_iter_next_range(&vmi); > - } > - > - /* Actually expand, if possible */ > - if (vma && !vma_expand(&vmg)) { > - khugepaged_enter_vma(vma, vm_flags); > + vma = vma_merge_new_vma(&vmg); > + if (vma) > goto expanded; > - } > - > - if (vma == prev) > - vma_iter_set(&vmi, addr); > -cannot_expand: > > /* > * Determine the object being mapped and call the appropriate > @@ -1493,10 +1457,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > * If vm_flags changed after call_mmap(), we should try merge > * vma again as we may succeed this time. > */ > - if (unlikely(vm_flags != vma->vm_flags && prev)) { > - merge = vma_merge_new_vma_wrapper(&vmi, prev, vma, > - vma->vm_start, vma->vm_end, > - vma->vm_pgoff); > + if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) { > + merge = vma_merge_new_vma(&vmg); > + > if (merge) { > /* > * ->mmap() can change vma->vm_file and fput > @@ -1596,7 +1559,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > > vma_iter_set(&vmi, vma->vm_end); > /* Undo any partial mapping done by a device driver. 
*/ > - unmap_region(mm, &vmi.mas, vma, prev, next, vma->vm_start, > + unmap_region(mm, &vmi.mas, vma, vmg.prev, vmg.next, vma->vm_start, > vma->vm_end, vma->vm_end, true); > } > if (writable_file_mapping) > @@ -1773,7 +1736,6 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, > unsigned long addr, unsigned long len, unsigned long flags) > { > struct mm_struct *mm = current->mm; > - struct vma_prepare vp; > > /* > * Check against address space limits by the changed size > @@ -1795,29 +1757,22 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, > */ > if (vma && vma->vm_end == addr) { > struct vma_merge_struct vmg = { > + .vmi = vmi, > .prev = vma, > + .next = NULL, > + .start = addr, > + .end = addr + len, > .flags = flags, > .pgoff = addr >> PAGE_SHIFT, > + .file = vma->vm_file, > + .anon_vma = vma->anon_vma, > + .policy = vma_policy(vma), > + .uffd_ctx = vma->vm_userfaultfd_ctx, > + .anon_name = anon_vma_name(vma), > }; > > - if (can_vma_merge_after(&vmg)) { > - vma_iter_config(vmi, vma->vm_start, addr + len); > - if (vma_iter_prealloc(vmi, vma)) > - goto unacct_fail; > - > - vma_start_write(vma); > - > - init_vma_prep(&vp, vma); > - vma_prepare(&vp); > - vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0); > - vma->vm_end = addr + len; > - vm_flags_set(vma, VM_SOFTDIRTY); > - vma_iter_store(vmi, vma); > - > - vma_complete(&vp, vmi, mm); > - khugepaged_enter_vma(vma, flags); > + if (vma_merge_new_vma(&vmg)) > goto out; > - } > } > > if (vma) > diff --git a/mm/vma.c b/mm/vma.c > index 55615392e8d2..a404cf718f9e 100644 > --- a/mm/vma.c > +++ b/mm/vma.c > @@ -97,8 +97,7 @@ static void init_multi_vma_prep(struct vma_prepare *vp, > * > * We assume the vma may be removed as part of the merge. 
> */ > -bool > -can_vma_merge_before(struct vma_merge_struct *vmg) > +static bool can_vma_merge_before(struct vma_merge_struct *vmg) > { > pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start); > > @@ -120,7 +119,7 @@ can_vma_merge_before(struct vma_merge_struct *vmg) > * > * We assume that vma is not removed as part of the merge. > */ > -bool can_vma_merge_after(struct vma_merge_struct *vmg) > +static bool can_vma_merge_after(struct vma_merge_struct *vmg) > { > if (is_mergeable_vma(vmg, false) && > is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) { > @@ -130,6 +129,164 @@ bool can_vma_merge_after(struct vma_merge_struct *vmg) > return false; > } > > +static void __vma_link_file(struct vm_area_struct *vma, > + struct address_space *mapping) > +{ > + if (vma_is_shared_maywrite(vma)) > + mapping_allow_writable(mapping); > + > + flush_dcache_mmap_lock(mapping); > + vma_interval_tree_insert(vma, &mapping->i_mmap); > + flush_dcache_mmap_unlock(mapping); > +} > + > +/* > + * Requires inode->i_mapping->i_mmap_rwsem > + */ > +static void __remove_shared_vm_struct(struct vm_area_struct *vma, > + struct address_space *mapping) > +{ > + if (vma_is_shared_maywrite(vma)) > + mapping_unmap_writable(mapping); > + > + flush_dcache_mmap_lock(mapping); > + vma_interval_tree_remove(vma, &mapping->i_mmap); > + flush_dcache_mmap_unlock(mapping); > +} > + > +/* > + * vma_prepare() - Helper function for handling locking VMAs prior to altering > + * @vp: The initialized vma_prepare struct > + */ > +static void vma_prepare(struct vma_prepare *vp) > +{ > + if (vp->file) { > + uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end); > + > + if (vp->adj_next) > + uprobe_munmap(vp->adj_next, vp->adj_next->vm_start, > + vp->adj_next->vm_end); > + > + i_mmap_lock_write(vp->mapping); > + if (vp->insert && vp->insert->vm_file) { > + /* > + * Put into interval tree now, so instantiated pages > + * are visible to arm/parisc __flush_dcache_page > + * throughout; but we cannot insert 
into address > + * space until vma start or end is updated. > + */ > + __vma_link_file(vp->insert, > + vp->insert->vm_file->f_mapping); > + } > + } > + > + if (vp->anon_vma) { > + anon_vma_lock_write(vp->anon_vma); > + anon_vma_interval_tree_pre_update_vma(vp->vma); > + if (vp->adj_next) > + anon_vma_interval_tree_pre_update_vma(vp->adj_next); > + } > + > + if (vp->file) { > + flush_dcache_mmap_lock(vp->mapping); > + vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap); > + if (vp->adj_next) > + vma_interval_tree_remove(vp->adj_next, > + &vp->mapping->i_mmap); > + } > + > +} > + > +/* > + * vma_complete- Helper function for handling the unlocking after altering VMAs, > + * or for inserting a VMA. > + * > + * @vp: The vma_prepare struct > + * @vmi: The vma iterator > + * @mm: The mm_struct > + */ > +static void vma_complete(struct vma_prepare *vp, > + struct vma_iterator *vmi, struct mm_struct *mm) > +{ > + if (vp->file) { > + if (vp->adj_next) > + vma_interval_tree_insert(vp->adj_next, > + &vp->mapping->i_mmap); > + vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap); > + flush_dcache_mmap_unlock(vp->mapping); > + } > + > + if (vp->remove && vp->file) { > + __remove_shared_vm_struct(vp->remove, vp->mapping); > + if (vp->remove2) > + __remove_shared_vm_struct(vp->remove2, vp->mapping); > + } else if (vp->insert) { > + /* > + * split_vma has split insert from vma, and needs > + * us to insert it before dropping the locks > + * (it may either follow vma or precede it). 
> + */ > + vma_iter_store(vmi, vp->insert); > + mm->map_count++; > + } > + > + if (vp->anon_vma) { > + anon_vma_interval_tree_post_update_vma(vp->vma); > + if (vp->adj_next) > + anon_vma_interval_tree_post_update_vma(vp->adj_next); > + anon_vma_unlock_write(vp->anon_vma); > + } > + > + if (vp->file) { > + i_mmap_unlock_write(vp->mapping); > + uprobe_mmap(vp->vma); > + > + if (vp->adj_next) > + uprobe_mmap(vp->adj_next); > + } > + > + if (vp->remove) { > +again: > + vma_mark_detached(vp->remove, true); > + if (vp->file) { > + uprobe_munmap(vp->remove, vp->remove->vm_start, > + vp->remove->vm_end); > + fput(vp->file); > + } > + if (vp->remove->anon_vma) > + anon_vma_merge(vp->vma, vp->remove); > + mm->map_count--; > + mpol_put(vma_policy(vp->remove)); > + if (!vp->remove2) > + WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end); > + vm_area_free(vp->remove); > + > + /* > + * In mprotect's case 6 (see comments on vma_merge), > + * we are removing both mid and next vmas > + */ > + if (vp->remove2) { > + vp->remove = vp->remove2; > + vp->remove2 = NULL; > + goto again; > + } > + } > + if (vp->insert && vp->file) > + uprobe_mmap(vp->insert); > + validate_mm(mm); > +} > + > +/* > + * init_vma_prep() - Initializer wrapper for vma_prepare struct > + * @vp: The vma_prepare struct > + * @vma: The vma that will be altered once locked > + */ > +static void init_vma_prep(struct vma_prepare *vp, > + struct vm_area_struct *vma) > +{ > + init_multi_vma_prep(vp, vma, NULL, NULL, NULL); > +} > + > /* > * Close a vm structure and free it. 
> */ > @@ -292,31 +449,6 @@ static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas) > vm_unacct_memory(nr_accounted); > } > > -/* > - * init_vma_prep() - Initializer wrapper for vma_prepare struct > - * @vp: The vma_prepare struct > - * @vma: The vma that will be altered once locked > - */ > -void init_vma_prep(struct vma_prepare *vp, > - struct vm_area_struct *vma) > -{ > - init_multi_vma_prep(vp, vma, NULL, NULL, NULL); > -} > - > -/* > - * Requires inode->i_mapping->i_mmap_rwsem > - */ > -static void __remove_shared_vm_struct(struct vm_area_struct *vma, > - struct address_space *mapping) > -{ > - if (vma_is_shared_maywrite(vma)) > - mapping_unmap_writable(mapping); > - > - flush_dcache_mmap_lock(mapping); > - vma_interval_tree_remove(vma, &mapping->i_mmap); > - flush_dcache_mmap_unlock(mapping); > -} > - > /* > * vma has some anon_vma assigned, and is already inserted on that > * anon_vma's interval trees. > @@ -349,60 +481,6 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) > anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); > } > > -static void __vma_link_file(struct vm_area_struct *vma, > - struct address_space *mapping) > -{ > - if (vma_is_shared_maywrite(vma)) > - mapping_allow_writable(mapping); > - > - flush_dcache_mmap_lock(mapping); > - vma_interval_tree_insert(vma, &mapping->i_mmap); > - flush_dcache_mmap_unlock(mapping); > -} > - > -/* > - * vma_prepare() - Helper function for handling locking VMAs prior to altering > - * @vp: The initialized vma_prepare struct > - */ > -void vma_prepare(struct vma_prepare *vp) > -{ > - if (vp->file) { > - uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end); > - > - if (vp->adj_next) > - uprobe_munmap(vp->adj_next, vp->adj_next->vm_start, > - vp->adj_next->vm_end); > - > - i_mmap_lock_write(vp->mapping); > - if (vp->insert && vp->insert->vm_file) { > - /* > - * Put into interval tree now, so instantiated pages > - * are visible to arm/parisc __flush_dcache_page > 
- * throughout; but we cannot insert into address > - * space until vma start or end is updated. > - */ > - __vma_link_file(vp->insert, > - vp->insert->vm_file->f_mapping); > - } > - } > - > - if (vp->anon_vma) { > - anon_vma_lock_write(vp->anon_vma); > - anon_vma_interval_tree_pre_update_vma(vp->vma); > - if (vp->adj_next) > - anon_vma_interval_tree_pre_update_vma(vp->adj_next); > - } > - > - if (vp->file) { > - flush_dcache_mmap_lock(vp->mapping); > - vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap); > - if (vp->adj_next) > - vma_interval_tree_remove(vp->adj_next, > - &vp->mapping->i_mmap); > - } > - > -} > - > /* > * dup_anon_vma() - Helper function to duplicate anon_vma > * @dst: The destination VMA > @@ -486,6 +564,120 @@ void validate_mm(struct mm_struct *mm) > } > #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */ > > +/* > + * vma_merge_new_vma - Attempt to merge a new VMA into address space > + * > + * @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end > + * (exclusive), which we try to merge with any adjacent VMAs if possible. > + * > + * We are about to add a VMA to the address space starting at @vmg->start and > + * ending at @vmg->end. There are three different possible scenarios: > + * > + * 1. There is a VMA with identical properties immediately adjacent to the > + * proposed new VMA [@vmg->start, @vmg->end) either before or after it - > + * EXPAND that VMA: > + * > + * Proposed: |-----| or |-----| > + * Existing: |----| |----| > + * > + * 2. There are VMAs with identical properties immediately adjacent to the > + * proposed new VMA [@vmg->start, @vmg->end) both before AND after it - > + * EXPAND the former and REMOVE the latter: > + * > + * Proposed: |-----| > + * Existing: |----| |----| > + * > + * 3. There are no VMAs immediately adjacent to the proposed new VMA or those > + * VMAs do not have identical attributes - NO MERGE POSSIBLE. 
> + * > + * In instances where we can merge, this function returns the expanded VMA which > + * will have its range adjusted accordingly and the underlying maple tree also > + * adjusted. > + * > + * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer > + * to the VMA we expanded. > + * > + * This function also adjusts @vmg to provide @vmg->prev and @vmg->next if > + * neither already specified, and adjusts [@vmg->start, @vmg->end) to span the > + * expanded range. > + * > + * ASSUMPTIONS: > + * - The caller must hold a WRITE lock on the mm_struct->mmap_lock. > + * - The caller must have determined that [@vmg->start, @vmg->end) is empty. > + */ > +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg) > +{ > + bool is_special = vmg->flags & VM_SPECIAL; > + struct vm_area_struct *prev = vmg->prev; > + struct vm_area_struct *next = vmg->next; > + unsigned long start = vmg->start; > + unsigned long end = vmg->end; > + pgoff_t pgoff = vmg->pgoff; > + pgoff_t pglen = PHYS_PFN(end - start); > + > + VM_WARN_ON(vmg->vma); > + > + if (!prev && !next) { > + /* > + * Since the caller must have determined that the requested > + * range is empty, vmg->vmi will be left pointing at the VMA > + * immediately prior. > + */ > + next = vmg->next = vma_next(vmg->vmi); > + prev = vmg->prev = vma_prev(vmg->vmi); > + > + /* Avoid maple tree re-walk. */ > + if (is_special && prev) > + vma_iter_next_range(vmg->vmi); > + } > + > + /* If special mapping or no adjacent VMAs, nothing to merge. */ > + if (is_special || (!prev && !next)) > + return NULL; > + > + /* If we can merge with the following VMA, adjust vmg accordingly. */ > + if (next && next->vm_start == end && can_vma_merge_before(vmg)) { > + /* > + * We can adjust this here as can_vma_merge_after() doesn't > + * touch vmg->end. 
> + */ > + vmg->end = next->vm_end; > + vmg->vma = next; > + vmg->pgoff = next->vm_pgoff - pglen; > + > + vmg->anon_vma = next->anon_vma; > + } > + > + /* If we can merge with the previous VMA, adjust vmg accordingly. */ > + if (prev && prev->vm_end == start && can_vma_merge_after(vmg)) { > + vmg->start = prev->vm_start; > + vmg->vma = prev; > + vmg->pgoff = prev->vm_pgoff; > + } else if (prev) { > + vma_iter_next_range(vmg->vmi); > + } > + > + /* > + * Now try to expand adjacent VMA(s). This takes care of removing the > + * following VMA if we have VMAs on both sides. > + */ > + if (vmg->vma && !vma_expand(vmg)) { > + khugepaged_enter_vma(vmg->vma, vmg->flags); > + return vmg->vma; > + } > + > + /* If expansion failed, reset state. Allows us to retry merge later. */ > + vmg->vma = NULL; > + vmg->anon_vma = NULL; > + vmg->start = start; > + vmg->end = end; > + vmg->pgoff = pgoff; > + if (vmg->vma == prev) > + vma_iter_set(vmg->vmi, start); > + > + return NULL; > +} > + > /* > * vma_expand - Expand an existing VMA > * > @@ -496,7 +688,11 @@ void validate_mm(struct mm_struct *mm) > * vmg->next->vm_end. Checking if the vmg->vma can expand and merge with > * vmg->next needs to be handled by the caller. > * > - * Returns: 0 on success > + * Returns: 0 on success. > + * > + * ASSUMPTIONS: > + * - The caller must hold a WRITE lock on vmg->vma->mm->mmap_lock. > + * - The caller must have set @vmg->prev and @vmg->next. > */ > int vma_expand(struct vma_merge_struct *vmg) > { > @@ -576,85 +772,6 @@ int vma_shrink(struct vma_merge_struct *vmg) > return 0; > } > > -/* > - * vma_complete- Helper function for handling the unlocking after altering VMAs, > - * or for inserting a VMA. 
> - * > - * @vp: The vma_prepare struct > - * @vmi: The vma iterator > - * @mm: The mm_struct > - */ > -void vma_complete(struct vma_prepare *vp, > - struct vma_iterator *vmi, struct mm_struct *mm) > -{ > - if (vp->file) { > - if (vp->adj_next) > - vma_interval_tree_insert(vp->adj_next, > - &vp->mapping->i_mmap); > - vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap); > - flush_dcache_mmap_unlock(vp->mapping); > - } > - > - if (vp->remove && vp->file) { > - __remove_shared_vm_struct(vp->remove, vp->mapping); > - if (vp->remove2) > - __remove_shared_vm_struct(vp->remove2, vp->mapping); > - } else if (vp->insert) { > - /* > - * split_vma has split insert from vma, and needs > - * us to insert it before dropping the locks > - * (it may either follow vma or precede it). > - */ > - vma_iter_store(vmi, vp->insert); > - mm->map_count++; > - } > - > - if (vp->anon_vma) { > - anon_vma_interval_tree_post_update_vma(vp->vma); > - if (vp->adj_next) > - anon_vma_interval_tree_post_update_vma(vp->adj_next); > - anon_vma_unlock_write(vp->anon_vma); > - } > - > - if (vp->file) { > - i_mmap_unlock_write(vp->mapping); > - uprobe_mmap(vp->vma); > - > - if (vp->adj_next) > - uprobe_mmap(vp->adj_next); > - } > - > - if (vp->remove) { > -again: > - vma_mark_detached(vp->remove, true); > - if (vp->file) { > - uprobe_munmap(vp->remove, vp->remove->vm_start, > - vp->remove->vm_end); > - fput(vp->file); > - } > - if (vp->remove->anon_vma) > - anon_vma_merge(vp->vma, vp->remove); > - mm->map_count--; > - mpol_put(vma_policy(vp->remove)); > - if (!vp->remove2) > - WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end); > - vm_area_free(vp->remove); > - > - /* > - * In mprotect's case 6 (see comments on vma_merge), > - * we are removing both mid and next vmas > - */ > - if (vp->remove2) { > - vp->remove = vp->remove2; > - vp->remove2 = NULL; > - goto again; > - } > - } > - if (vp->insert && vp->file) > - uprobe_mmap(vp->insert); > - validate_mm(mm); > -} > - > /* > * do_vmi_align_munmap() - 
munmap the aligned region from @start to @end. > * @vmi: The vma iterator > @@ -1261,20 +1378,6 @@ struct vm_area_struct > return vma_modify(&vmg); > } > > -/* > - * Attempt to merge a newly mapped VMA with those adjacent to it. The caller > - * must ensure that [start, end) does not overlap any existing VMA. > - */ > -struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg) > -{ > - if (!vmg->prev) { > - vmg->prev = vma_prev(vmg->vmi); > - vma_iter_set(vmg->vmi, vmg->start); > - } > - > - return vma_merge(vmg); > -} > - > /* > * Expand vma by delta bytes, potentially merging with an immediately adjacent > * VMA with identical properties. > @@ -1297,8 +1400,7 @@ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, > .anon_name = anon_vma_name(vma), > }; > > - /* vma is specified as prev, so case 1 or 2 will apply. */ > - return vma_merge(&vmg); > + return vma_merge_new_vma(&vmg); > } > > void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb) > @@ -1399,24 +1501,40 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, > struct vm_area_struct *vma = *vmap; > unsigned long vma_start = vma->vm_start; > struct mm_struct *mm = vma->vm_mm; > - struct vm_area_struct *new_vma, *prev; > + struct vm_area_struct *new_vma; > bool faulted_in_anon_vma = true; > VMA_ITERATOR(vmi, mm, addr); > + struct vma_merge_struct vmg = { > + .vmi = &vmi, > + .start = addr, > + .end = addr + len, > + .flags = vma->vm_flags, > + .pgoff = pgoff, > + .file = vma->vm_file, > + .anon_vma = vma->anon_vma, > + .policy = vma_policy(vma), > + .uffd_ctx = vma->vm_userfaultfd_ctx, > + .anon_name = anon_vma_name(vma), > + }; > > /* > * If anonymous vma has not yet been faulted, update new pgoff > * to match new location, to increase its chance of merging. 
> */ > if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) { > - pgoff = addr >> PAGE_SHIFT; > + pgoff = vmg.pgoff = addr >> PAGE_SHIFT; > faulted_in_anon_vma = false; > } > > - new_vma = find_vma_prev(mm, addr, &prev); > + new_vma = find_vma_prev(mm, addr, &vmg.prev); > if (new_vma && new_vma->vm_start < addr + len) > return NULL; /* should never get here */ > > - new_vma = vma_merge_new_vma_wrapper(&vmi, prev, vma, addr, addr + len, pgoff); > + vmg.next = vma_next(&vmi); > + vma_prev(&vmi); > + > + new_vma = vma_merge_new_vma(&vmg); > + > if (new_vma) { > /* > * Source vma may have been merged into new_vma > diff --git a/mm/vma.h b/mm/vma.h > index 50459f9e4c7f..bbb173053f34 100644 > --- a/mm/vma.h > +++ b/mm/vma.h > @@ -55,17 +55,6 @@ void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma); > /* Required for expand_downwards(). */ > void anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma); > > -/* Required for do_brk_flags(). */ > -void vma_prepare(struct vma_prepare *vp); > - > -/* Required for do_brk_flags(). */ > -void init_vma_prep(struct vma_prepare *vp, > - struct vm_area_struct *vma); > - > -/* Required for do_brk_flags(). */ > -void vma_complete(struct vma_prepare *vp, > - struct vma_iterator *vmi, struct mm_struct *mm); > - > int vma_expand(struct vma_merge_struct *vmg); > int vma_shrink(struct vma_merge_struct *vmg); > > @@ -85,20 +74,6 @@ void unmap_region(struct mm_struct *mm, struct ma_state *mas, > struct vm_area_struct *next, unsigned long start, > unsigned long end, unsigned long tree_end, bool mm_wr_locked); > > -/* > - * Can we merge the VMA described by vmg into the following VMA vmg->next? > - * > - * Required by mmap_region(). > - */ > -bool can_vma_merge_before(struct vma_merge_struct *vmg); > - > -/* > - * Can we merge the VMA described by vmg into the preceding VMA vmg->prev? > - * > - * Required by mmap_region() and do_brk_flags(). 
> - */ > -bool can_vma_merge_after(struct vma_merge_struct *vmg); > - > /* We are about to modify the VMA's flags. */ > struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, > struct vm_area_struct *prev, > @@ -133,31 +108,7 @@ struct vm_area_struct > unsigned long new_flags, > struct vm_userfaultfd_ctx new_ctx); > > -struct vm_area_struct > -*vma_merge_new_vma(struct vma_merge_struct *vmg); > - > -/* Temporary convenience wrapper. */ > -static inline struct vm_area_struct > -*vma_merge_new_vma_wrapper(struct vma_iterator *vmi, struct vm_area_struct *prev, > - struct vm_area_struct *vma, unsigned long start, > - unsigned long end, pgoff_t pgoff) > -{ > - struct vma_merge_struct vmg = { > - .vmi = vmi, > - .prev = prev, > - .start = start, > - .end = end, > - .flags = vma->vm_flags, > - .file = vma->vm_file, > - .anon_vma = vma->anon_vma, > - .pgoff = pgoff, > - .policy = vma_policy(vma), > - .uffd_ctx = vma->vm_userfaultfd_ctx, > - .anon_name = anon_vma_name(vma), > - }; > - > - return vma_merge_new_vma(&vmg); > -} > +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg); > > /* > * Temporary wrapper around vma_merge() so we can have a common interface for > diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h > index 40797a819d3d..a39a734282d0 100644 > --- a/tools/testing/vma/vma_internal.h > +++ b/tools/testing/vma/vma_internal.h > @@ -709,6 +709,12 @@ static inline void vma_iter_free(struct vma_iterator *vmi) > mas_destroy(&vmi->mas); > } > > +static inline > +struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi) > +{ > + return mas_next_range(&vmi->mas, ULONG_MAX); > +} > + > static inline void vm_acct_memory(long pages) > { > }
On Tue, Aug 06, 2024 at 03:04:22PM GMT, Petr Tesařík wrote: > On Mon, 5 Aug 2024 13:13:54 +0100 > Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote: > > > In mmap_region() and do_brk_flags() we open code scenarios where we prefer > > to use vma_expand() rather than invoke a full vma_merge() operation. > > > > Abstract this logic and eliminate all of the open-coding, and also use the > > same logic for all cases where we add new VMAs to, rather than ultimately > > use vma_merge(), rather use vma_expand(). > > > > We implement this by replacing vma_merge_new_vma() with this newly > > abstracted logic. > > > > Doing so removes duplication and simplifies VMA merging in all such cases, > > laying the ground for us to eliminate the merging of new VMAs in > > vma_merge() altogether. > > > > This makes it far easier to understand what is happening in these cases > > avoiding confusion, bugs and allowing for future optimisation. > > > > As a result of this change we are also able to make vma_prepare(), > > init_vma_prep(), vma_complete(), can_vma_merge_before() and > > can_vma_merge_after() static and internal to vma.c. > > This patch truly rocks. Let me just say: Wow! Thanks! 
> > Petr T > > > > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > > --- > > mm/mmap.c | 79 ++--- > > mm/vma.c | 482 +++++++++++++++++++------------ > > mm/vma.h | 51 +--- > > tools/testing/vma/vma_internal.h | 6 + > > 4 files changed, 324 insertions(+), 294 deletions(-) > > > > diff --git a/mm/mmap.c b/mm/mmap.c > > index f6593a81f73d..c03f50f46396 100644 > > --- a/mm/mmap.c > > +++ b/mm/mmap.c > > @@ -1363,8 +1363,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > > { > > struct mm_struct *mm = current->mm; > > struct vm_area_struct *vma = NULL; > > - struct vm_area_struct *next, *prev, *merge; > > - pgoff_t pglen = len >> PAGE_SHIFT; > > + struct vm_area_struct *merge; > > unsigned long charged = 0; > > unsigned long end = addr + len; > > bool writable_file_mapping = false; > > @@ -1411,44 +1410,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > > vm_flags |= VM_ACCOUNT; > > } > > > > - next = vmg.next = vma_next(&vmi); > > - prev = vmg.prev = vma_prev(&vmi); > > - if (vm_flags & VM_SPECIAL) { > > - if (prev) > > - vma_iter_next_range(&vmi); > > - goto cannot_expand; > > - } > > - > > - /* Attempt to expand an old mapping */ > > - /* Check next */ > > - if (next && next->vm_start == end && can_vma_merge_before(&vmg)) { > > - /* We can adjust this as can_vma_merge_after() doesn't touch */ > > - vmg.end = next->vm_end; > > - vma = vmg.vma = next; > > - vmg.pgoff = next->vm_pgoff - pglen; > > - > > - /* We may merge our NULL anon_vma with non-NULL in next. 
*/ > > - vmg.anon_vma = vma->anon_vma; > > - } > > - > > - /* Check prev */ > > - if (prev && prev->vm_end == addr && can_vma_merge_after(&vmg)) { > > - vmg.start = prev->vm_start; > > - vma = vmg.vma = prev; > > - vmg.pgoff = prev->vm_pgoff; > > - } else if (prev) { > > - vma_iter_next_range(&vmi); > > - } > > - > > - /* Actually expand, if possible */ > > - if (vma && !vma_expand(&vmg)) { > > - khugepaged_enter_vma(vma, vm_flags); > > + vma = vma_merge_new_vma(&vmg); > > + if (vma) > > goto expanded; > > - } > > - > > - if (vma == prev) > > - vma_iter_set(&vmi, addr); > > -cannot_expand: > > > > /* > > * Determine the object being mapped and call the appropriate > > @@ -1493,10 +1457,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > > * If vm_flags changed after call_mmap(), we should try merge > > * vma again as we may succeed this time. > > */ > > - if (unlikely(vm_flags != vma->vm_flags && prev)) { > > - merge = vma_merge_new_vma_wrapper(&vmi, prev, vma, > > - vma->vm_start, vma->vm_end, > > - vma->vm_pgoff); > > + if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) { > > + merge = vma_merge_new_vma(&vmg); > > + > > if (merge) { > > /* > > * ->mmap() can change vma->vm_file and fput > > @@ -1596,7 +1559,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > > > > vma_iter_set(&vmi, vma->vm_end); > > /* Undo any partial mapping done by a device driver. 
*/ > > - unmap_region(mm, &vmi.mas, vma, prev, next, vma->vm_start, > > + unmap_region(mm, &vmi.mas, vma, vmg.prev, vmg.next, vma->vm_start, > > vma->vm_end, vma->vm_end, true); > > } > > if (writable_file_mapping) > > @@ -1773,7 +1736,6 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, > > unsigned long addr, unsigned long len, unsigned long flags) > > { > > struct mm_struct *mm = current->mm; > > - struct vma_prepare vp; > > > > /* > > * Check against address space limits by the changed size > > @@ -1795,29 +1757,22 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, > > */ > > if (vma && vma->vm_end == addr) { > > struct vma_merge_struct vmg = { > > + .vmi = vmi, > > .prev = vma, > > + .next = NULL, > > + .start = addr, > > + .end = addr + len, > > .flags = flags, > > .pgoff = addr >> PAGE_SHIFT, > > + .file = vma->vm_file, > > + .anon_vma = vma->anon_vma, > > + .policy = vma_policy(vma), > > + .uffd_ctx = vma->vm_userfaultfd_ctx, > > + .anon_name = anon_vma_name(vma), > > }; > > > > - if (can_vma_merge_after(&vmg)) { > > - vma_iter_config(vmi, vma->vm_start, addr + len); > > - if (vma_iter_prealloc(vmi, vma)) > > - goto unacct_fail; > > - > > - vma_start_write(vma); > > - > > - init_vma_prep(&vp, vma); > > - vma_prepare(&vp); > > - vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0); > > - vma->vm_end = addr + len; > > - vm_flags_set(vma, VM_SOFTDIRTY); > > - vma_iter_store(vmi, vma); > > - > > - vma_complete(&vp, vmi, mm); > > - khugepaged_enter_vma(vma, flags); > > + if (vma_merge_new_vma(&vmg)) > > goto out; > > - } > > } > > > > if (vma) > > diff --git a/mm/vma.c b/mm/vma.c > > index 55615392e8d2..a404cf718f9e 100644 > > --- a/mm/vma.c > > +++ b/mm/vma.c > > @@ -97,8 +97,7 @@ static void init_multi_vma_prep(struct vma_prepare *vp, > > * > > * We assume the vma may be removed as part of the merge. 
> > */ > > -bool > > -can_vma_merge_before(struct vma_merge_struct *vmg) > > +static bool can_vma_merge_before(struct vma_merge_struct *vmg) > > { > > pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start); > > > > @@ -120,7 +119,7 @@ can_vma_merge_before(struct vma_merge_struct *vmg) > > * > > * We assume that vma is not removed as part of the merge. > > */ > > -bool can_vma_merge_after(struct vma_merge_struct *vmg) > > +static bool can_vma_merge_after(struct vma_merge_struct *vmg) > > { > > if (is_mergeable_vma(vmg, false) && > > is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) { > > @@ -130,6 +129,164 @@ bool can_vma_merge_after(struct vma_merge_struct *vmg) > > return false; > > } > > > > +static void __vma_link_file(struct vm_area_struct *vma, > > + struct address_space *mapping) > > +{ > > + if (vma_is_shared_maywrite(vma)) > > + mapping_allow_writable(mapping); > > + > > + flush_dcache_mmap_lock(mapping); > > + vma_interval_tree_insert(vma, &mapping->i_mmap); > > + flush_dcache_mmap_unlock(mapping); > > +} > > + > > +/* > > + * Requires inode->i_mapping->i_mmap_rwsem > > + */ > > +static void __remove_shared_vm_struct(struct vm_area_struct *vma, > > + struct address_space *mapping) > > +{ > > + if (vma_is_shared_maywrite(vma)) > > + mapping_unmap_writable(mapping); > > + > > + flush_dcache_mmap_lock(mapping); > > + vma_interval_tree_remove(vma, &mapping->i_mmap); > > + flush_dcache_mmap_unlock(mapping); > > +} > > + > > +/* > > + * vma_prepare() - Helper function for handling locking VMAs prior to altering > > + * @vp: The initialized vma_prepare struct > > + */ > > +static void vma_prepare(struct vma_prepare *vp) > > +{ > > + if (vp->file) { > > + uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end); > > + > > + if (vp->adj_next) > > + uprobe_munmap(vp->adj_next, vp->adj_next->vm_start, > > + vp->adj_next->vm_end); > > + > > + i_mmap_lock_write(vp->mapping); > > + if (vp->insert && vp->insert->vm_file) { > > + /* > > + * Put into 
interval tree now, so instantiated pages > > + * are visible to arm/parisc __flush_dcache_page > > + * throughout; but we cannot insert into address > > + * space until vma start or end is updated. > > + */ > > + __vma_link_file(vp->insert, > > + vp->insert->vm_file->f_mapping); > > + } > > + } > > + > > + if (vp->anon_vma) { > > + anon_vma_lock_write(vp->anon_vma); > > + anon_vma_interval_tree_pre_update_vma(vp->vma); > > + if (vp->adj_next) > > + anon_vma_interval_tree_pre_update_vma(vp->adj_next); > > + } > > + > > + if (vp->file) { > > + flush_dcache_mmap_lock(vp->mapping); > > + vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap); > > + if (vp->adj_next) > > + vma_interval_tree_remove(vp->adj_next, > > + &vp->mapping->i_mmap); > > + } > > + > > +} > > + > > +/* > > + * vma_complete- Helper function for handling the unlocking after altering VMAs, > > + * or for inserting a VMA. > > + * > > + * @vp: The vma_prepare struct > > + * @vmi: The vma iterator > > + * @mm: The mm_struct > > + */ > > +static void vma_complete(struct vma_prepare *vp, > > + struct vma_iterator *vmi, struct mm_struct *mm) > > +{ > > + if (vp->file) { > > + if (vp->adj_next) > > + vma_interval_tree_insert(vp->adj_next, > > + &vp->mapping->i_mmap); > > + vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap); > > + flush_dcache_mmap_unlock(vp->mapping); > > + } > > + > > + if (vp->remove && vp->file) { > > + __remove_shared_vm_struct(vp->remove, vp->mapping); > > + if (vp->remove2) > > + __remove_shared_vm_struct(vp->remove2, vp->mapping); > > + } else if (vp->insert) { > > + /* > > + * split_vma has split insert from vma, and needs > > + * us to insert it before dropping the locks > > + * (it may either follow vma or precede it). 
> > + */ > > + vma_iter_store(vmi, vp->insert); > > + mm->map_count++; > > + } > > + > > + if (vp->anon_vma) { > > + anon_vma_interval_tree_post_update_vma(vp->vma); > > + if (vp->adj_next) > > + anon_vma_interval_tree_post_update_vma(vp->adj_next); > > + anon_vma_unlock_write(vp->anon_vma); > > + } > > + > > + if (vp->file) { > > + i_mmap_unlock_write(vp->mapping); > > + uprobe_mmap(vp->vma); > > + > > + if (vp->adj_next) > > + uprobe_mmap(vp->adj_next); > > + } > > + > > + if (vp->remove) { > > +again: > > + vma_mark_detached(vp->remove, true); > > + if (vp->file) { > > + uprobe_munmap(vp->remove, vp->remove->vm_start, > > + vp->remove->vm_end); > > + fput(vp->file); > > + } > > + if (vp->remove->anon_vma) > > + anon_vma_merge(vp->vma, vp->remove); > > + mm->map_count--; > > + mpol_put(vma_policy(vp->remove)); > > + if (!vp->remove2) > > + WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end); > > + vm_area_free(vp->remove); > > + > > + /* > > + * In mprotect's case 6 (see comments on vma_merge), > > + * we are removing both mid and next vmas > > + */ > > + if (vp->remove2) { > > + vp->remove = vp->remove2; > > + vp->remove2 = NULL; > > + goto again; > > + } > > + } > > + if (vp->insert && vp->file) > > + uprobe_mmap(vp->insert); > > + validate_mm(mm); > > +} > > + > > +/* > > + * init_vma_prep() - Initializer wrapper for vma_prepare struct > > + * @vp: The vma_prepare struct > > + * @vma: The vma that will be altered once locked > > + */ > > +static void init_vma_prep(struct vma_prepare *vp, > > + struct vm_area_struct *vma) > > +{ > > + init_multi_vma_prep(vp, vma, NULL, NULL, NULL); > > +} > > + > > /* > > * Close a vm structure and free it. 
> > */ > > @@ -292,31 +449,6 @@ static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas) > > vm_unacct_memory(nr_accounted); > > } > > > > -/* > > - * init_vma_prep() - Initializer wrapper for vma_prepare struct > > - * @vp: The vma_prepare struct > > - * @vma: The vma that will be altered once locked > > - */ > > -void init_vma_prep(struct vma_prepare *vp, > > - struct vm_area_struct *vma) > > -{ > > - init_multi_vma_prep(vp, vma, NULL, NULL, NULL); > > -} > > - > > -/* > > - * Requires inode->i_mapping->i_mmap_rwsem > > - */ > > -static void __remove_shared_vm_struct(struct vm_area_struct *vma, > > - struct address_space *mapping) > > -{ > > - if (vma_is_shared_maywrite(vma)) > > - mapping_unmap_writable(mapping); > > - > > - flush_dcache_mmap_lock(mapping); > > - vma_interval_tree_remove(vma, &mapping->i_mmap); > > - flush_dcache_mmap_unlock(mapping); > > -} > > - > > /* > > * vma has some anon_vma assigned, and is already inserted on that > > * anon_vma's interval trees. 
> > @@ -349,60 +481,6 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) > > anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); > > } > > > > -static void __vma_link_file(struct vm_area_struct *vma, > > - struct address_space *mapping) > > -{ > > - if (vma_is_shared_maywrite(vma)) > > - mapping_allow_writable(mapping); > > - > > - flush_dcache_mmap_lock(mapping); > > - vma_interval_tree_insert(vma, &mapping->i_mmap); > > - flush_dcache_mmap_unlock(mapping); > > -} > > - > > -/* > > - * vma_prepare() - Helper function for handling locking VMAs prior to altering > > - * @vp: The initialized vma_prepare struct > > - */ > > -void vma_prepare(struct vma_prepare *vp) > > -{ > > - if (vp->file) { > > - uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end); > > - > > - if (vp->adj_next) > > - uprobe_munmap(vp->adj_next, vp->adj_next->vm_start, > > - vp->adj_next->vm_end); > > - > > - i_mmap_lock_write(vp->mapping); > > - if (vp->insert && vp->insert->vm_file) { > > - /* > > - * Put into interval tree now, so instantiated pages > > - * are visible to arm/parisc __flush_dcache_page > > - * throughout; but we cannot insert into address > > - * space until vma start or end is updated. 
> > - */ > > - __vma_link_file(vp->insert, > > - vp->insert->vm_file->f_mapping); > > - } > > - } > > - > > - if (vp->anon_vma) { > > - anon_vma_lock_write(vp->anon_vma); > > - anon_vma_interval_tree_pre_update_vma(vp->vma); > > - if (vp->adj_next) > > - anon_vma_interval_tree_pre_update_vma(vp->adj_next); > > - } > > - > > - if (vp->file) { > > - flush_dcache_mmap_lock(vp->mapping); > > - vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap); > > - if (vp->adj_next) > > - vma_interval_tree_remove(vp->adj_next, > > - &vp->mapping->i_mmap); > > - } > > - > > -} > > - > > /* > > * dup_anon_vma() - Helper function to duplicate anon_vma > > * @dst: The destination VMA > > @@ -486,6 +564,120 @@ void validate_mm(struct mm_struct *mm) > > } > > #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */ > > > > +/* > > + * vma_merge_new_vma - Attempt to merge a new VMA into address space > > + * > > + * @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end > > + * (exclusive), which we try to merge with any adjacent VMAs if possible. > > + * > > + * We are about to add a VMA to the address space starting at @vmg->start and > > + * ending at @vmg->end. There are three different possible scenarios: > > + * > > + * 1. There is a VMA with identical properties immediately adjacent to the > > + * proposed new VMA [@vmg->start, @vmg->end) either before or after it - > > + * EXPAND that VMA: > > + * > > + * Proposed: |-----| or |-----| > > + * Existing: |----| |----| > > + * > > + * 2. There are VMAs with identical properties immediately adjacent to the > > + * proposed new VMA [@vmg->start, @vmg->end) both before AND after it - > > + * EXPAND the former and REMOVE the latter: > > + * > > + * Proposed: |-----| > > + * Existing: |----| |----| > > + * > > + * 3. There are no VMAs immediately adjacent to the proposed new VMA or those > > + * VMAs do not have identical attributes - NO MERGE POSSIBLE. 
> > + * > > + * In instances where we can merge, this function returns the expanded VMA which > > + * will have its range adjusted accordingly and the underlying maple tree also > > + * adjusted. > > + * > > + * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer > > + * to the VMA we expanded. > > + * > > + * This function also adjusts @vmg to provide @vmg->prev and @vmg->next if > > + * neither already specified, and adjusts [@vmg->start, @vmg->end) to span the > > + * expanded range. > > + * > > + * ASSUMPTIONS: > > + * - The caller must hold a WRITE lock on the mm_struct->mmap_lock. > > + * - The caller must have determined that [@vmg->start, @vmg->end) is empty. > > + */ > > +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg) > > +{ > > + bool is_special = vmg->flags & VM_SPECIAL; > > + struct vm_area_struct *prev = vmg->prev; > > + struct vm_area_struct *next = vmg->next; > > + unsigned long start = vmg->start; > > + unsigned long end = vmg->end; > > + pgoff_t pgoff = vmg->pgoff; > > + pgoff_t pglen = PHYS_PFN(end - start); > > + > > + VM_WARN_ON(vmg->vma); > > + > > + if (!prev && !next) { > > + /* > > + * Since the caller must have determined that the requested > > + * range is empty, vmg->vmi will be left pointing at the VMA > > + * immediately prior. > > + */ > > + next = vmg->next = vma_next(vmg->vmi); > > + prev = vmg->prev = vma_prev(vmg->vmi); > > + > > + /* Avoid maple tree re-walk. */ > > + if (is_special && prev) > > + vma_iter_next_range(vmg->vmi); > > + } > > + > > + /* If special mapping or no adjacent VMAs, nothing to merge. */ > > + if (is_special || (!prev && !next)) > > + return NULL; > > + > > + /* If we can merge with the following VMA, adjust vmg accordingly. */ > > + if (next && next->vm_start == end && can_vma_merge_before(vmg)) { > > + /* > > + * We can adjust this here as can_vma_merge_after() doesn't > > + * touch vmg->end. 
> > + */ > > + vmg->end = next->vm_end; > > + vmg->vma = next; > > + vmg->pgoff = next->vm_pgoff - pglen; > > + > > + vmg->anon_vma = next->anon_vma; > > + } > > + > > + /* If we can merge with the previous VMA, adjust vmg accordingly. */ > > + if (prev && prev->vm_end == start && can_vma_merge_after(vmg)) { > > + vmg->start = prev->vm_start; > > + vmg->vma = prev; > > + vmg->pgoff = prev->vm_pgoff; > > + } else if (prev) { > > + vma_iter_next_range(vmg->vmi); > > + } > > + > > + /* > > + * Now try to expand adjacent VMA(s). This takes care of removing the > > + * following VMA if we have VMAs on both sides. > > + */ > > + if (vmg->vma && !vma_expand(vmg)) { > > + khugepaged_enter_vma(vmg->vma, vmg->flags); > > + return vmg->vma; > > + } > > + > > + /* If expansion failed, reset state. Allows us to retry merge later. */ > > + vmg->vma = NULL; > > + vmg->anon_vma = NULL; > > + vmg->start = start; > > + vmg->end = end; > > + vmg->pgoff = pgoff; > > + if (vmg->vma == prev) > > + vma_iter_set(vmg->vmi, start); > > + > > + return NULL; > > +} > > + > > /* > > * vma_expand - Expand an existing VMA > > * > > @@ -496,7 +688,11 @@ void validate_mm(struct mm_struct *mm) > > * vmg->next->vm_end. Checking if the vmg->vma can expand and merge with > > * vmg->next needs to be handled by the caller. > > * > > - * Returns: 0 on success > > + * Returns: 0 on success. > > + * > > + * ASSUMPTIONS: > > + * - The caller must hold a WRITE lock on vmg->vma->mm->mmap_lock. > > + * - The caller must have set @vmg->prev and @vmg->next. > > */ > > int vma_expand(struct vma_merge_struct *vmg) > > { > > @@ -576,85 +772,6 @@ int vma_shrink(struct vma_merge_struct *vmg) > > return 0; > > } > > > > -/* > > - * vma_complete- Helper function for handling the unlocking after altering VMAs, > > - * or for inserting a VMA. 
> > - * > > - * @vp: The vma_prepare struct > > - * @vmi: The vma iterator > > - * @mm: The mm_struct > > - */ > > -void vma_complete(struct vma_prepare *vp, > > - struct vma_iterator *vmi, struct mm_struct *mm) > > -{ > > - if (vp->file) { > > - if (vp->adj_next) > > - vma_interval_tree_insert(vp->adj_next, > > - &vp->mapping->i_mmap); > > - vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap); > > - flush_dcache_mmap_unlock(vp->mapping); > > - } > > - > > - if (vp->remove && vp->file) { > > - __remove_shared_vm_struct(vp->remove, vp->mapping); > > - if (vp->remove2) > > - __remove_shared_vm_struct(vp->remove2, vp->mapping); > > - } else if (vp->insert) { > > - /* > > - * split_vma has split insert from vma, and needs > > - * us to insert it before dropping the locks > > - * (it may either follow vma or precede it). > > - */ > > - vma_iter_store(vmi, vp->insert); > > - mm->map_count++; > > - } > > - > > - if (vp->anon_vma) { > > - anon_vma_interval_tree_post_update_vma(vp->vma); > > - if (vp->adj_next) > > - anon_vma_interval_tree_post_update_vma(vp->adj_next); > > - anon_vma_unlock_write(vp->anon_vma); > > - } > > - > > - if (vp->file) { > > - i_mmap_unlock_write(vp->mapping); > > - uprobe_mmap(vp->vma); > > - > > - if (vp->adj_next) > > - uprobe_mmap(vp->adj_next); > > - } > > - > > - if (vp->remove) { > > -again: > > - vma_mark_detached(vp->remove, true); > > - if (vp->file) { > > - uprobe_munmap(vp->remove, vp->remove->vm_start, > > - vp->remove->vm_end); > > - fput(vp->file); > > - } > > - if (vp->remove->anon_vma) > > - anon_vma_merge(vp->vma, vp->remove); > > - mm->map_count--; > > - mpol_put(vma_policy(vp->remove)); > > - if (!vp->remove2) > > - WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end); > > - vm_area_free(vp->remove); > > - > > - /* > > - * In mprotect's case 6 (see comments on vma_merge), > > - * we are removing both mid and next vmas > > - */ > > - if (vp->remove2) { > > - vp->remove = vp->remove2; > > - vp->remove2 = NULL; > > - goto again; 
> > - } > > - } > > - if (vp->insert && vp->file) > > - uprobe_mmap(vp->insert); > > - validate_mm(mm); > > -} > > - > > /* > > * do_vmi_align_munmap() - munmap the aligned region from @start to @end. > > * @vmi: The vma iterator > > @@ -1261,20 +1378,6 @@ struct vm_area_struct > > return vma_modify(&vmg); > > } > > > > -/* > > - * Attempt to merge a newly mapped VMA with those adjacent to it. The caller > > - * must ensure that [start, end) does not overlap any existing VMA. > > - */ > > -struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg) > > -{ > > - if (!vmg->prev) { > > - vmg->prev = vma_prev(vmg->vmi); > > - vma_iter_set(vmg->vmi, vmg->start); > > - } > > - > > - return vma_merge(vmg); > > -} > > - > > /* > > * Expand vma by delta bytes, potentially merging with an immediately adjacent > > * VMA with identical properties. > > @@ -1297,8 +1400,7 @@ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, > > .anon_name = anon_vma_name(vma), > > }; > > > > - /* vma is specified as prev, so case 1 or 2 will apply. 
*/ > > - return vma_merge(&vmg); > > + return vma_merge_new_vma(&vmg); > > } > > > > void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb) > > @@ -1399,24 +1501,40 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, > > struct vm_area_struct *vma = *vmap; > > unsigned long vma_start = vma->vm_start; > > struct mm_struct *mm = vma->vm_mm; > > - struct vm_area_struct *new_vma, *prev; > > + struct vm_area_struct *new_vma; > > bool faulted_in_anon_vma = true; > > VMA_ITERATOR(vmi, mm, addr); > > + struct vma_merge_struct vmg = { > > + .vmi = &vmi, > > + .start = addr, > > + .end = addr + len, > > + .flags = vma->vm_flags, > > + .pgoff = pgoff, > > + .file = vma->vm_file, > > + .anon_vma = vma->anon_vma, > > + .policy = vma_policy(vma), > > + .uffd_ctx = vma->vm_userfaultfd_ctx, > > + .anon_name = anon_vma_name(vma), > > + }; > > > > /* > > * If anonymous vma has not yet been faulted, update new pgoff > > * to match new location, to increase its chance of merging. > > */ > > if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) { > > - pgoff = addr >> PAGE_SHIFT; > > + pgoff = vmg.pgoff = addr >> PAGE_SHIFT; > > faulted_in_anon_vma = false; > > } > > > > - new_vma = find_vma_prev(mm, addr, &prev); > > + new_vma = find_vma_prev(mm, addr, &vmg.prev); > > if (new_vma && new_vma->vm_start < addr + len) > > return NULL; /* should never get here */ > > > > - new_vma = vma_merge_new_vma_wrapper(&vmi, prev, vma, addr, addr + len, pgoff); > > + vmg.next = vma_next(&vmi); > > + vma_prev(&vmi); > > + > > + new_vma = vma_merge_new_vma(&vmg); > > + > > if (new_vma) { > > /* > > * Source vma may have been merged into new_vma > > diff --git a/mm/vma.h b/mm/vma.h > > index 50459f9e4c7f..bbb173053f34 100644 > > --- a/mm/vma.h > > +++ b/mm/vma.h > > @@ -55,17 +55,6 @@ void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma); > > /* Required for expand_downwards(). 
*/ > > void anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma); > > > > -/* Required for do_brk_flags(). */ > > -void vma_prepare(struct vma_prepare *vp); > > - > > -/* Required for do_brk_flags(). */ > > -void init_vma_prep(struct vma_prepare *vp, > > - struct vm_area_struct *vma); > > - > > -/* Required for do_brk_flags(). */ > > -void vma_complete(struct vma_prepare *vp, > > - struct vma_iterator *vmi, struct mm_struct *mm); > > - > > int vma_expand(struct vma_merge_struct *vmg); > > int vma_shrink(struct vma_merge_struct *vmg); > > > > @@ -85,20 +74,6 @@ void unmap_region(struct mm_struct *mm, struct ma_state *mas, > > struct vm_area_struct *next, unsigned long start, > > unsigned long end, unsigned long tree_end, bool mm_wr_locked); > > > > -/* > > - * Can we merge the VMA described by vmg into the following VMA vmg->next? > > - * > > - * Required by mmap_region(). > > - */ > > -bool can_vma_merge_before(struct vma_merge_struct *vmg); > > - > > -/* > > - * Can we merge the VMA described by vmg into the preceding VMA vmg->prev? > > - * > > - * Required by mmap_region() and do_brk_flags(). > > - */ > > -bool can_vma_merge_after(struct vma_merge_struct *vmg); > > - > > /* We are about to modify the VMA's flags. */ > > struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, > > struct vm_area_struct *prev, > > @@ -133,31 +108,7 @@ struct vm_area_struct > > unsigned long new_flags, > > struct vm_userfaultfd_ctx new_ctx); > > > > -struct vm_area_struct > > -*vma_merge_new_vma(struct vma_merge_struct *vmg); > > - > > -/* Temporary convenience wrapper. 
*/ > > -static inline struct vm_area_struct > > -*vma_merge_new_vma_wrapper(struct vma_iterator *vmi, struct vm_area_struct *prev, > > - struct vm_area_struct *vma, unsigned long start, > > - unsigned long end, pgoff_t pgoff) > > -{ > > - struct vma_merge_struct vmg = { > > - .vmi = vmi, > > - .prev = prev, > > - .start = start, > > - .end = end, > > - .flags = vma->vm_flags, > > - .file = vma->vm_file, > > - .anon_vma = vma->anon_vma, > > - .pgoff = pgoff, > > - .policy = vma_policy(vma), > > - .uffd_ctx = vma->vm_userfaultfd_ctx, > > - .anon_name = anon_vma_name(vma), > > - }; > > - > > - return vma_merge_new_vma(&vmg); > > -} > > +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg); > > > > /* > > * Temporary wrapper around vma_merge() so we can have a common interface for > > diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h > > index 40797a819d3d..a39a734282d0 100644 > > --- a/tools/testing/vma/vma_internal.h > > +++ b/tools/testing/vma/vma_internal.h > > @@ -709,6 +709,12 @@ static inline void vma_iter_free(struct vma_iterator *vmi) > > mas_destroy(&vmi->mas); > > } > > > > +static inline > > +struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi) > > +{ > > + return mas_next_range(&vmi->mas, ULONG_MAX); > > +} > > + > > static inline void vm_acct_memory(long pages) > > { > > } >
On 8/5/24 14:13, Lorenzo Stoakes wrote: > In mmap_region() and do_brk_flags() we open code scenarios where we prefer > to use vma_expand() rather than invoke a full vma_merge() operation. > > Abstract this logic and eliminate all of the open-coding, and also use the > same logic for all cases where we add new VMAs to, rather than ultimately > use vma_merge(), rather use vma_expand(). > > We implement this by replacing vma_merge_new_vma() with this newly > abstracted logic. > > Doing so removes duplication and simplifies VMA merging in all such cases, > laying the ground for us to eliminate the merging of new VMAs in > vma_merge() altogether. > > This makes it far easier to understand what is happening in these cases > avoiding confusion, bugs and allowing for future optimisation. > > As a result of this change we are also able to make vma_prepare(), > init_vma_prep(), vma_complete(), can_vma_merge_before() and > can_vma_merge_after() static and internal to vma.c. That's really great, but it would be even better if these code moves could be a separate patch as it would make reviewing so much easier. But with git diff's --color-moved to the rescue, let me try... 
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > --- > mm/mmap.c | 79 ++--- > mm/vma.c | 482 +++++++++++++++++++------------ > mm/vma.h | 51 +--- > tools/testing/vma/vma_internal.h | 6 + > 4 files changed, 324 insertions(+), 294 deletions(-) > > diff --git a/mm/mmap.c b/mm/mmap.c > index f6593a81f73d..c03f50f46396 100644 > --- a/mm/mmap.c > +++ b/mm/mmap.c > @@ -1363,8 +1363,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > { > struct mm_struct *mm = current->mm; > struct vm_area_struct *vma = NULL; > - struct vm_area_struct *next, *prev, *merge; > - pgoff_t pglen = len >> PAGE_SHIFT; > + struct vm_area_struct *merge; > unsigned long charged = 0; > unsigned long end = addr + len; > bool writable_file_mapping = false; > @@ -1411,44 +1410,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > vm_flags |= VM_ACCOUNT; > } > > - next = vmg.next = vma_next(&vmi); > - prev = vmg.prev = vma_prev(&vmi); > - if (vm_flags & VM_SPECIAL) { > - if (prev) > - vma_iter_next_range(&vmi); > - goto cannot_expand; > - } > - > - /* Attempt to expand an old mapping */ > - /* Check next */ > - if (next && next->vm_start == end && can_vma_merge_before(&vmg)) { > - /* We can adjust this as can_vma_merge_after() doesn't touch */ > - vmg.end = next->vm_end; > - vma = vmg.vma = next; > - vmg.pgoff = next->vm_pgoff - pglen; > - > - /* We may merge our NULL anon_vma with non-NULL in next. 
*/ > - vmg.anon_vma = vma->anon_vma; > - } > - > - /* Check prev */ > - if (prev && prev->vm_end == addr && can_vma_merge_after(&vmg)) { > - vmg.start = prev->vm_start; > - vma = vmg.vma = prev; > - vmg.pgoff = prev->vm_pgoff; > - } else if (prev) { > - vma_iter_next_range(&vmi); > - } > - > - /* Actually expand, if possible */ > - if (vma && !vma_expand(&vmg)) { > - khugepaged_enter_vma(vma, vm_flags); > + vma = vma_merge_new_vma(&vmg); > + if (vma) > goto expanded; > - } > - > - if (vma == prev) > - vma_iter_set(&vmi, addr); > -cannot_expand: > > /* > * Determine the object being mapped and call the appropriate > @@ -1493,10 +1457,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > * If vm_flags changed after call_mmap(), we should try merge > * vma again as we may succeed this time. > */ > - if (unlikely(vm_flags != vma->vm_flags && prev)) { > - merge = vma_merge_new_vma_wrapper(&vmi, prev, vma, > - vma->vm_start, vma->vm_end, > - vma->vm_pgoff); > + if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) { > + merge = vma_merge_new_vma(&vmg); Can this even succeed if we don't update vmg->vm_flags? Previously the wrapper would take them from vma. > + > if (merge) { > /* > * ->mmap() can change vma->vm_file and fput <snip> > +/* > + * vma_merge_new_vma - Attempt to merge a new VMA into address space > + * > + * @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end > + * (exclusive), which we try to merge with any adjacent VMAs if possible. > + * > + * We are about to add a VMA to the address space starting at @vmg->start and > + * ending at @vmg->end. There are three different possible scenarios: > + * > + * 1. There is a VMA with identical properties immediately adjacent to the > + * proposed new VMA [@vmg->start, @vmg->end) either before or after it - > + * EXPAND that VMA: > + * > + * Proposed: |-----| or |-----| > + * Existing: |----| |----| > + * > + * 2. 
There are VMAs with identical properties immediately adjacent to the > + * proposed new VMA [@vmg->start, @vmg->end) both before AND after it - > + * EXPAND the former and REMOVE the latter: > + * > + * Proposed: |-----| > + * Existing: |----| |----| > + * > + * 3. There are no VMAs immediately adjacent to the proposed new VMA or those > + * VMAs do not have identical attributes - NO MERGE POSSIBLE. > + * > + * In instances where we can merge, this function returns the expanded VMA which > + * will have its range adjusted accordingly and the underlying maple tree also > + * adjusted. > + * > + * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer > + * to the VMA we expanded. > + * > + * This function also adjusts @vmg to provide @vmg->prev and @vmg->next if > + * neither already specified, and adjusts [@vmg->start, @vmg->end) to span the > + * expanded range. > + * > + * ASSUMPTIONS: > + * - The caller must hold a WRITE lock on the mm_struct->mmap_lock. > + * - The caller must have determined that [@vmg->start, @vmg->end) is empty. Should we be paranoid and assert something? > + */ > +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg) > +{ > + bool is_special = vmg->flags & VM_SPECIAL; > + struct vm_area_struct *prev = vmg->prev; > + struct vm_area_struct *next = vmg->next; > + unsigned long start = vmg->start; > + unsigned long end = vmg->end; > + pgoff_t pgoff = vmg->pgoff; > + pgoff_t pglen = PHYS_PFN(end - start); > + > + VM_WARN_ON(vmg->vma); > + > + if (!prev && !next) { > + /* > + * Since the caller must have determined that the requested > + * range is empty, vmg->vmi will be left pointing at the VMA > + * immediately prior. > + */ OK that's perhaps not that obvious, as it seems copy_vma() is doing some special dance to ensure this. 
Should we add it to the ASSUMPTIONS and assert it, or is there a maple tree operation we can do to ensure it, ideally if it's very cheap if the iterator is already set the way we want it to be? > + next = vmg->next = vma_next(vmg->vmi); > + prev = vmg->prev = vma_prev(vmg->vmi); > + > + /* Avoid maple tree re-walk. */ > + if (is_special && prev) > + vma_iter_next_range(vmg->vmi); I wish I knew what this did but seems it's the same as the old code did so hopefully that's fine. > + } > + > + /* If special mapping or no adjacent VMAs, nothing to merge. */ > + if (is_special || (!prev && !next)) > + return NULL; > + > + /* If we can merge with the following VMA, adjust vmg accordingly. */ > + if (next && next->vm_start == end && can_vma_merge_before(vmg)) { > + /* > + * We can adjust this here as can_vma_merge_after() doesn't > + * touch vmg->end. > + */ > + vmg->end = next->vm_end; > + vmg->vma = next; > + vmg->pgoff = next->vm_pgoff - pglen; > + > + vmg->anon_vma = next->anon_vma; > + } > + > + /* If we can merge with the previous VMA, adjust vmg accordingly. */ > + if (prev && prev->vm_end == start && can_vma_merge_after(vmg)) { > + vmg->start = prev->vm_start; > + vmg->vma = prev; > + vmg->pgoff = prev->vm_pgoff; > + } else if (prev) { > + vma_iter_next_range(vmg->vmi); > + } Sigh... ditto.
On Thu, Aug 08, 2024 at 06:45:43PM GMT, Vlastimil Babka wrote: > On 8/5/24 14:13, Lorenzo Stoakes wrote: > > In mmap_region() and do_brk_flags() we open code scenarios where we prefer > > to use vma_expand() rather than invoke a full vma_merge() operation. > > > > Abstract this logic and eliminate all of the open-coding, and also use the > > same logic for all cases where we add new VMAs to, rather than ultimately > > use vma_merge(), rather use vma_expand(). > > > > We implement this by replacing vma_merge_new_vma() with this newly > > abstracted logic. > > > > Doing so removes duplication and simplifies VMA merging in all such cases, > > laying the ground for us to eliminate the merging of new VMAs in > > vma_merge() altogether. > > > > This makes it far easier to understand what is happening in these cases > > avoiding confusion, bugs and allowing for future optimisation. > > > > As a result of this change we are also able to make vma_prepare(), > > init_vma_prep(), vma_complete(), can_vma_merge_before() and > > can_vma_merge_after() static and internal to vma.c. > > That's really great, but it would be even better if these code moves could > be a separate patch as it would make reviewing so much easier. But with git > diff's --color-moved to the rescue, let me try... Will separate out on respin. 
> > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > > --- > > mm/mmap.c | 79 ++--- > > mm/vma.c | 482 +++++++++++++++++++------------ > > mm/vma.h | 51 +--- > > tools/testing/vma/vma_internal.h | 6 + > > 4 files changed, 324 insertions(+), 294 deletions(-) > > > > diff --git a/mm/mmap.c b/mm/mmap.c > > index f6593a81f73d..c03f50f46396 100644 > > --- a/mm/mmap.c > > +++ b/mm/mmap.c > > @@ -1363,8 +1363,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > > { > > struct mm_struct *mm = current->mm; > > struct vm_area_struct *vma = NULL; > > - struct vm_area_struct *next, *prev, *merge; > > - pgoff_t pglen = len >> PAGE_SHIFT; > > + struct vm_area_struct *merge; > > unsigned long charged = 0; > > unsigned long end = addr + len; > > bool writable_file_mapping = false; > > @@ -1411,44 +1410,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > > vm_flags |= VM_ACCOUNT; > > } > > > > - next = vmg.next = vma_next(&vmi); > > - prev = vmg.prev = vma_prev(&vmi); > > - if (vm_flags & VM_SPECIAL) { > > - if (prev) > > - vma_iter_next_range(&vmi); > > - goto cannot_expand; > > - } > > - > > - /* Attempt to expand an old mapping */ > > - /* Check next */ > > - if (next && next->vm_start == end && can_vma_merge_before(&vmg)) { > > - /* We can adjust this as can_vma_merge_after() doesn't touch */ > > - vmg.end = next->vm_end; > > - vma = vmg.vma = next; > > - vmg.pgoff = next->vm_pgoff - pglen; > > - > > - /* We may merge our NULL anon_vma with non-NULL in next. 
*/ > > - vmg.anon_vma = vma->anon_vma; > > - } > > - > > - /* Check prev */ > > - if (prev && prev->vm_end == addr && can_vma_merge_after(&vmg)) { > > - vmg.start = prev->vm_start; > > - vma = vmg.vma = prev; > > - vmg.pgoff = prev->vm_pgoff; > > - } else if (prev) { > > - vma_iter_next_range(&vmi); > > - } > > - > > - /* Actually expand, if possible */ > > - if (vma && !vma_expand(&vmg)) { > > - khugepaged_enter_vma(vma, vm_flags); > > + vma = vma_merge_new_vma(&vmg); > > + if (vma) > > goto expanded; > > - } > > - > > - if (vma == prev) > > - vma_iter_set(&vmi, addr); > > -cannot_expand: > > > > /* > > * Determine the object being mapped and call the appropriate > > @@ -1493,10 +1457,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > > * If vm_flags changed after call_mmap(), we should try merge > > * vma again as we may succeed this time. > > */ > > - if (unlikely(vm_flags != vma->vm_flags && prev)) { > > - merge = vma_merge_new_vma_wrapper(&vmi, prev, vma, > > - vma->vm_start, vma->vm_end, > > - vma->vm_pgoff); > > + if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) { > > + merge = vma_merge_new_vma(&vmg); > > Can this even succeed if we don't update vmg->vm_flags? Previously the > wrapper would take them from vma. You're right... ugh. Will fix. This is yet another example of how having this _not_ be under test is problematic, as that'd have picked this up. I will try to move at least VMA merge invocation logic over in a later series. > > > + > > if (merge) { > > /* > > * ->mmap() can change vma->vm_file and fput > > <snip> > > > +/* > > + * vma_merge_new_vma - Attempt to merge a new VMA into address space > > + * > > + * @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end > > + * (exclusive), which we try to merge with any adjacent VMAs if possible. > > + * > > + * We are about to add a VMA to the address space starting at @vmg->start and > > + * ending at @vmg->end. 
There are three different possible scenarios: > > + * > > + * 1. There is a VMA with identical properties immediately adjacent to the > > + * proposed new VMA [@vmg->start, @vmg->end) either before or after it - > > + * EXPAND that VMA: > > + * > > + * Proposed: |-----| or |-----| > > + * Existing: |----| |----| > > + * > > + * 2. There are VMAs with identical properties immediately adjacent to the > > + * proposed new VMA [@vmg->start, @vmg->end) both before AND after it - > > + * EXPAND the former and REMOVE the latter: > > + * > > + * Proposed: |-----| > > + * Existing: |----| |----| > > + * > > + * 3. There are no VMAs immediately adjacent to the proposed new VMA or those > > + * VMAs do not have identical attributes - NO MERGE POSSIBLE. > > + * > > + * In instances where we can merge, this function returns the expanded VMA which > > + * will have its range adjusted accordingly and the underlying maple tree also > > + * adjusted. > > + * > > + * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer > > + * to the VMA we expanded. > > + * > > + * This function also adjusts @vmg to provide @vmg->prev and @vmg->next if > > + * neither already specified, and adjusts [@vmg->start, @vmg->end) to span the > > + * expanded range. > > + * > > + * ASSUMPTIONS: > > + * - The caller must hold a WRITE lock on the mm_struct->mmap_lock. > > + * - The caller must have determined that [@vmg->start, @vmg->end) is empty. > > Should we be paranoid and assert something? This will have a performance impact; if we do that, we'll want something like an #ifdef CONFIG_DEBUG_VM around that. 
> > > + */ > > +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg) > > +{ > > + bool is_special = vmg->flags & VM_SPECIAL; > > + struct vm_area_struct *prev = vmg->prev; > > + struct vm_area_struct *next = vmg->next; > > + unsigned long start = vmg->start; > > + unsigned long end = vmg->end; > > + pgoff_t pgoff = vmg->pgoff; > > + pgoff_t pglen = PHYS_PFN(end - start); > > + > > + VM_WARN_ON(vmg->vma); > > + > > + if (!prev && !next) { > > + /* > > + * Since the caller must have determined that the requested > > + * range is empty, vmg->vmi will be left pointing at the VMA > > + * immediately prior. > > + */ > > OK that's perhaps not that obvious, as it seems copy_vma() is doing some > special dance to ensure this. Should we add it to the ASSUMPTIONS and assert > it, or is there a maple tree operation we can do to ensure it, ideally if > it's very cheap if the iterator is already set the way we want it to be? > To be fair this is something that was previously assumed, and I just added a comment. Will add to assumptions, and again I think any assert should be done in such a way that under non-CONFIG_DEBUG_VM nothing happens, maybe VM_WARN_ON()? Will try to come up with something. > > + next = vmg->next = vma_next(vmg->vmi); > > + prev = vmg->prev = vma_prev(vmg->vmi); > > + > > + /* Avoid maple tree re-walk. */ > > + if (is_special && prev) > > + vma_iter_next_range(vmg->vmi); > > I wish I knew what this did but seems it's the same as the old code did so > hopefully that's fine. I think point is that we are about to exit, so we'd be left pointing at prev. But since we're exiting in just a second, we want to be pointing at the next vma which will become the prev of the next merge attempt. Liam can maybe elucidate further. > > > + } > > + > > + /* If special mapping or no adjacent VMAs, nothing to merge. */ > > + if (is_special || (!prev && !next)) > > + return NULL; > > + > > + /* If we can merge with the following VMA, adjust vmg accordingly. 
*/ > > + if (next && next->vm_start == end && can_vma_merge_before(vmg)) { > > + /* > > + * We can adjust this here as can_vma_merge_after() doesn't > > + * touch vmg->end. > > + */ > > + vmg->end = next->vm_end; > > + vmg->vma = next; > > + vmg->pgoff = next->vm_pgoff - pglen; > > + > > + vmg->anon_vma = next->anon_vma; > > + } > > + > > + /* If we can merge with the previous VMA, adjust vmg accordingly. */ > > + if (prev && prev->vm_end == start && can_vma_merge_after(vmg)) { > > + vmg->start = prev->vm_start; > > + vmg->vma = prev; > > + vmg->pgoff = prev->vm_pgoff; > > + } else if (prev) { > > + vma_iter_next_range(vmg->vmi); > > + } > > Sigh... ditto. > (Liam can correct me) I think this is just setting up the vmi similar to the other case such that if expansion fails we can be positioned correctly for the next merge attempt. Yes it's fiddly, maybe needs a comment...
* Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [240808 14:02]: > On Thu, Aug 08, 2024 at 06:45:43PM GMT, Vlastimil Babka wrote: > > On 8/5/24 14:13, Lorenzo Stoakes wrote: > > > In mmap_region() and do_brk_flags() we open code scenarios where we prefer > > > to use vma_expand() rather than invoke a full vma_merge() operation. > > > > > > Abstract this logic and eliminate all of the open-coding, and also use the > > > same logic for all cases where we add new VMAs to, rather than ultimately > > > use vma_merge(), rather use vma_expand(). > > > > > > We implement this by replacing vma_merge_new_vma() with this newly > > > abstracted logic. > > > > > > Doing so removes duplication and simplifies VMA merging in all such cases, > > > laying the ground for us to eliminate the merging of new VMAs in > > > vma_merge() altogether. > > > > > > This makes it far easier to understand what is happening in these cases > > > avoiding confusion, bugs and allowing for future optimisation. > > > > > > As a result of this change we are also able to make vma_prepare(), > > > init_vma_prep(), vma_complete(), can_vma_merge_before() and > > > can_vma_merge_after() static and internal to vma.c. > > > > That's really great, but it would be even better if these code moves could > > be a separate patch as it would make reviewing so much easier. But with git > > diff's --color-moved to the rescue, let me try... > > Will separate out on respin. > > > > > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > > > --- > > > mm/mmap.c | 79 ++--- > > > mm/vma.c | 482 +++++++++++++++++++------------ > > > mm/vma.h | 51 +--- > > > tools/testing/vma/vma_internal.h | 6 + > > > 4 files changed, 324 insertions(+), 294 deletions(-) ... 
> > > + */ > > > +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg) > > > +{ > > > + bool is_special = vmg->flags & VM_SPECIAL; > > > + struct vm_area_struct *prev = vmg->prev; > > > + struct vm_area_struct *next = vmg->next; > > > + unsigned long start = vmg->start; > > > + unsigned long end = vmg->end; > > > + pgoff_t pgoff = vmg->pgoff; > > > + pgoff_t pglen = PHYS_PFN(end - start); > > > + > > > + VM_WARN_ON(vmg->vma); > > > + > > > + if (!prev && !next) { > > > + /* > > > + * Since the caller must have determined that the requested > > > + * range is empty, vmg->vmi will be left pointing at the VMA > > > + * immediately prior. > > > + */ > > > > OK that's perhaps not that obvious, as it seems copy_vma() is doing some > > special dance to ensure this. Should we add it to the ASSUMPTIONS and assert > > it, or is there a maple tree operation we can do to ensure it, ideally if > > it's very cheap if the iterator is already set the way we want it to be? > > > > To be fair this is something that was previously assumed, and I just added > a comment. > > Will add to assumptions, and again I think any assert should be done in > such a way that under non-CONFIG_DEBUG_VM nothing happens, maybe > VM_WARN_ON()? > > Will try to come up with something. > > > > + next = vmg->next = vma_next(vmg->vmi); > > > + prev = vmg->prev = vma_prev(vmg->vmi); > > > + > > > + /* Avoid maple tree re-walk. */ > > > + if (is_special && prev) > > > + vma_iter_next_range(vmg->vmi); > > > > I wish I knew what this did but seems it's the same as the old code did so > > hopefully that's fine. > > I think point is that we are about to exit, so we'd be left pointing at > prev. But since we're exiting in just a second, we want to be pointing at > the next vma which will become the prev of the next merge attempt. > > Liam can maybe elucidate further. 
What you have to remember is that the vma iterator (vmg->vmi above), contains (or, basically is) a maple state (usually written as mas). We keep state of the maple tree walker so that we don't have to keep re-walking to find the same thing. We move around the tree with this maple state because going prev/next is faster from leaves (almost always just the next thing in the nodes array of pointers). We use the maple state to write as well, so the maple state needs to point to the correct location in the tree for a write. The maple tree is a range-based tree, so each entry exists for a span of values. A write happens at the lowest index and can overwrite subsequent values. This means that the maple state needs to point to the range containing the lowest index for the write (if it's pointing to a node - it could walk from the top). A side effect of writing to the lowest index is that we need to point to the previous vma if we are going to 'expand' the vma. The range is essentially going to be from prev->start to "whatever we are expanding over". In the old code, the vm_flags & VM_SPECIAL code meant there was no way an expansion was going to happen, but we've moved the maple state to the wrong location for a write of a new vma - so this vma_iter_next_range() just moves it back. Then we "goto cannot_expand". > > > > > > + } > > > + > > > + /* If special mapping or no adjacent VMAs, nothing to merge. */ > > > + if (is_special || (!prev && !next)) > > > + return NULL; > > > + > > > + /* If we can merge with the following VMA, adjust vmg accordingly. */ > > > + if (next && next->vm_start == end && can_vma_merge_before(vmg)) { > > > + /* > > > + * We can adjust this here as can_vma_merge_after() doesn't > > > + * touch vmg->end. 
> > > + */ > > > + vmg->end = next->vm_end; > > > + vmg->vma = next; > > > + vmg->pgoff = next->vm_pgoff - pglen; > > > + > > > + vmg->anon_vma = next->anon_vma; > > > + } > > > + > > > + /* If we can merge with the previous VMA, adjust vmg accordingly. */ > > > + if (prev && prev->vm_end == start && can_vma_merge_after(vmg)) { > > > + vmg->start = prev->vm_start; > > > + vmg->vma = prev; > > > + vmg->pgoff = prev->vm_pgoff; > > > + } else if (prev) { > > > + vma_iter_next_range(vmg->vmi); > > > + } > > > > Sigh... ditto. > > > > (Liam can correct me) I think this is just setting up the vmi similar to > the other case such that if expansion fails we can positioned correctly for > the next merge attempt. > > Yes it's fiddly, maybe needs a comment... Yes, ditto.
* Liam R. Howlett <Liam.Howlett@oracle.com> [240808 14:34]: > * Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [240808 14:02]: > > On Thu, Aug 08, 2024 at 06:45:43PM GMT, Vlastimil Babka wrote: > > > On 8/5/24 14:13, Lorenzo Stoakes wrote: > > > > In mmap_region() and do_brk_flags() we open code scenarios where we prefer > > > > to use vma_expand() rather than invoke a full vma_merge() operation. > > > > > > > > Abstract this logic and eliminate all of the open-coding, and also use the > > > > same logic for all cases where we add new VMAs to, rather than ultimately > > > > use vma_merge(), rather use vma_expand(). > > > > > > > > We implement this by replacing vma_merge_new_vma() with this newly > > > > abstracted logic. > > > > > > > > Doing so removes duplication and simplifies VMA merging in all such cases, > > > > laying the ground for us to eliminate the merging of new VMAs in > > > > vma_merge() altogether. > > > > > > > > This makes it far easier to understand what is happening in these cases > > > > avoiding confusion, bugs and allowing for future optimisation. > > > > > > > > As a result of this change we are also able to make vma_prepare(), > > > > init_vma_prep(), vma_complete(), can_vma_merge_before() and > > > > can_vma_merge_after() static and internal to vma.c. > > > > > > That's really great, but it would be even better if these code moves could > > > be a separate patch as it would make reviewing so much easier. But with git > > > diff's --color-moved to the rescue, let me try... > > > > Will separate out on respin. > > > > > > > > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > > > > --- > > > > mm/mmap.c | 79 ++--- > > > > mm/vma.c | 482 +++++++++++++++++++------------ > > > > mm/vma.h | 51 +--- > > > > tools/testing/vma/vma_internal.h | 6 + > > > > 4 files changed, 324 insertions(+), 294 deletions(-) > > ... 
> > > > + */ > > > > +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg) > > > > +{ > > > > + bool is_special = vmg->flags & VM_SPECIAL; > > > > + struct vm_area_struct *prev = vmg->prev; > > > > + struct vm_area_struct *next = vmg->next; > > > > + unsigned long start = vmg->start; > > > > + unsigned long end = vmg->end; > > > > + pgoff_t pgoff = vmg->pgoff; > > > > + pgoff_t pglen = PHYS_PFN(end - start); > > > > + > > > > + VM_WARN_ON(vmg->vma); > > > > + > > > > + if (!prev && !next) { > > > > + /* > > > > + * Since the caller must have determined that the requested > > > > + * range is empty, vmg->vmi will be left pointing at the VMA > > > > + * immediately prior. > > > > + */ > > > > > > OK that's perhaps not that obvious, as it seems copy_vma() is doing some > > > special dance to ensure this. Should we add it to the ASSUMPTIONS and assert > > > it, or is there a maple tree operation we can do to ensure it, ideally if > > > it's very cheap if the iterator is already set the way we want it to be? > > > > > > > To be fair this is something that was previously assumed, and I just added > > a comment. > > > > Will add to assumptions, and again I think any assert should be done in > > such a way that under non-CONFIG_DEBUG_VM nothing happens, maybe > > VM_WARN_ON()? > > > > Will try to come up with something. Something like: VM_BUG_ON(vma_iter_end(vmg->vmi) > start); > > > > > > + next = vmg->next = vma_next(vmg->vmi); and: VM_BUG_ON(vma_iter_addr(vmg->vmi) < end); > > > > + prev = vmg->prev = vma_prev(vmg->vmi); > > > > + > > > > + /* Avoid maple tree re-walk. */ > > > > + if (is_special && prev) > > > > + vma_iter_next_range(vmg->vmi); > > > > > > I wish I knew what this did but seems it's the same as the old code did so > > > hopefully that's fine. > > > > I think point is that we are about to exit, so we'd be left pointing at > > prev. 
But since we're exiting in just a second, we want to be pointing at > > the next vma which will become the prev of the next merge attempt. > > > > Liam can maybe elucidate further. > > What you have to remember is that the vma iterator (vmg->vmi above), > contains (or, basically is) a maple state (usually written as mas). We > keep state of the maple tree walker so that we don't have to keep > re-walking to find the same thing. We move around the tree with this > maple state because going prev/next is faster from leaves (almost always > just the next thing in the nodes array of pointers). > > We use the maple state to write as well, so the maple state needs to > point to the correct location in the tree for a write. > > The maple tree is a range-based tree, so each entry exists for a span of > values. A write happens at the lowest index and can overwrite > subsequent values. This means that the maple state needs to point to > the range containing the lowest index for the write (if it's pointing to > a node - it could walk from the top). > > A side effect of writing to the lowest index is that we need to point to > the previous vma if we are going to 'expand' the vma. The range is > essentially going to be from prev->start to "whatever we are expanding > over". > > In the old code, the vm_flags & VM_SPECIAL code meant there was no way > an expansion was going to happen, but we've moved the maple state to the > wrong location for a write of a new vma - so this vma_iter_next_range() > just moves it back. Then we "goto cannot_expand". >
On Thu, Aug 08, 2024 at 03:06:14PM GMT, Liam R. Howlett wrote: > * Liam R. Howlett <Liam.Howlett@oracle.com> [240808 14:34]: > > * Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [240808 14:02]: > > > On Thu, Aug 08, 2024 at 06:45:43PM GMT, Vlastimil Babka wrote: > > > > On 8/5/24 14:13, Lorenzo Stoakes wrote: > > > > > In mmap_region() and do_brk_flags() we open code scenarios where we prefer > > > > > to use vma_expand() rather than invoke a full vma_merge() operation. > > > > > > > > > > Abstract this logic and eliminate all of the open-coding, and also use the > > > > > same logic for all cases where we add new VMAs to, rather than ultimately > > > > > use vma_merge(), rather use vma_expand(). > > > > > > > > > > We implement this by replacing vma_merge_new_vma() with this newly > > > > > abstracted logic. > > > > > > > > > > Doing so removes duplication and simplifies VMA merging in all such cases, > > > > > laying the ground for us to eliminate the merging of new VMAs in > > > > > vma_merge() altogether. > > > > > > > > > > This makes it far easier to understand what is happening in these cases > > > > > avoiding confusion, bugs and allowing for future optimisation. > > > > > > > > > > As a result of this change we are also able to make vma_prepare(), > > > > > init_vma_prep(), vma_complete(), can_vma_merge_before() and > > > > > can_vma_merge_after() static and internal to vma.c. > > > > > > > > That's really great, but it would be even better if these code moves could > > > > be a separate patch as it would make reviewing so much easier. But with git > > > > diff's --color-moved to the rescue, let me try... > > > > > > Will separate out on respin. 
> > > > > > > > > > > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > > > > > --- > > > > > mm/mmap.c | 79 ++--- > > > > > mm/vma.c | 482 +++++++++++++++++++------------ > > > > > mm/vma.h | 51 +--- > > > > > tools/testing/vma/vma_internal.h | 6 + > > > > > 4 files changed, 324 insertions(+), 294 deletions(-) > > > > ... > > > > > + */ > > > > > +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg) > > > > > +{ > > > > > + bool is_special = vmg->flags & VM_SPECIAL; > > > > > + struct vm_area_struct *prev = vmg->prev; > > > > > + struct vm_area_struct *next = vmg->next; > > > > > + unsigned long start = vmg->start; > > > > > + unsigned long end = vmg->end; > > > > > + pgoff_t pgoff = vmg->pgoff; > > > > > + pgoff_t pglen = PHYS_PFN(end - start); > > > > > + > > > > > + VM_WARN_ON(vmg->vma); > > > > > + > > > > > + if (!prev && !next) { > > > > > + /* > > > > > + * Since the caller must have determined that the requested > > > > > + * range is empty, vmg->vmi will be left pointing at the VMA > > > > > + * immediately prior. > > > > > + */ > > > > > > > > OK that's perhaps not that obvious, as it seems copy_vma() is doing some > > > > special dance to ensure this. Should we add it to the ASSUMPTIONS and assert > > > > it, or is there a maple tree operation we can do to ensure it, ideally if > > > > it's very cheap if the iterator is already set the way we want it to be? > > > > > > > > > > To be fair this is something that was previously assumed, and I just added > > > a comment. > > > > > > Will add to assumptions, and again I think any assert should be done in > > > such a way that under non-CONFIG_DEBUG_VM nothing happens, maybe > > > VM_WARN_ON()? > > > > > > Will try to come up with something. > > Something like: > > VM_BUG_ON(vma_iter_end(vmg->vmi) > start); > > > > > > > > > + next = vmg->next = vma_next(vmg->vmi); > > and: > > VM_BUG_ON(vma_iter_addr(vmg->vmi) < end); > Ack x2. 
Thought VM_BUG_ON() was 'not done' these days though... but checkpatch.pl has become rather hit or miss as to what should be given attention to or not. > > > > > + prev = vmg->prev = vma_prev(vmg->vmi); > > > > > + > > > > > + /* Avoid maple tree re-walk. */ > > > > > + if (is_special && prev) > > > > > + vma_iter_next_range(vmg->vmi); > > > > > > > > I wish I knew what this did but seems it's the same as the old code did so > > > > hopefully that's fine. > > > > > > I think point is that we are about to exit, so we'd be left pointing at > > > prev. But since we're exiting in just a second, we want to be pointing at > > > the next vma which will become the prev of the next merge attempt. > > > > > > Liam can maybe elucidate further. > > > > What you have to remember is that the vma iterator (vmg->vmi above), > > contains (or, basically is) a maple state (usually written as mas). We > > keep state of the maple tree walker so that we don't have to keep > > re-walking to find the same thing. We move around the tree with this > > maple state because going prev/next is faster from leaves (almost always > > just the next thing in the nodes array of pointers). > > > > We use the maple state to write as well, so the maple state needs to > > point to the correct location in the tree for a write. > > > > The maple tree is a range-based tree, so each entry exists for a span of > > values. A write happens at the lowest index and can overwrite > > subsequent values. This means that the maple state needs to point to > > the range containing the lowest index for the write (if it's pointing to > > a node - it could walk from the top). > > > > A side effect of writing to the lowest index is that we need to point to > > the previous vma if we are going to 'expand' the vma. The range is > > essentially going to be from prev->start to "whatever we are expanding > > over". 
> > > > In the old code, the vm_flags & VM_SPECIAL code meant there was no way > > an expansion was going to happen, but we've moved the maple state to the > > wrong location for a write of a new vma - so this vma_iter_next_range() > > just moves it back. Then we "goto cannot_expand". > >
* Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [240805 08:14]: > In mmap_region() and do_brk_flags() we open code scenarios where we prefer > to use vma_expand() rather than invoke a full vma_merge() operation. > > Abstract this logic and eliminate all of the open-coding, and also use the > same logic for all cases where we add new VMAs to, rather than ultimately > use vma_merge(), rather use vma_expand(). > > We implement this by replacing vma_merge_new_vma() with this newly > abstracted logic. > > Doing so removes duplication and simplifies VMA merging in all such cases, > laying the ground for us to eliminate the merging of new VMAs in > vma_merge() altogether. > > This makes it far easier to understand what is happening in these cases > avoiding confusion, bugs and allowing for future optimisation. > > As a result of this change we are also able to make vma_prepare(), > init_vma_prep(), vma_complete(), can_vma_merge_before() and > can_vma_merge_after() static and internal to vma.c. > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > --- > mm/mmap.c | 79 ++--- > mm/vma.c | 482 +++++++++++++++++++------------ > mm/vma.h | 51 +--- > tools/testing/vma/vma_internal.h | 6 + > 4 files changed, 324 insertions(+), 294 deletions(-) > > diff --git a/mm/mmap.c b/mm/mmap.c > index f6593a81f73d..c03f50f46396 100644 > --- a/mm/mmap.c > +++ b/mm/mmap.c > @@ -1363,8 +1363,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > { > struct mm_struct *mm = current->mm; > struct vm_area_struct *vma = NULL; > - struct vm_area_struct *next, *prev, *merge; > - pgoff_t pglen = len >> PAGE_SHIFT; > + struct vm_area_struct *merge; > unsigned long charged = 0; > unsigned long end = addr + len; > bool writable_file_mapping = false; > @@ -1411,44 +1410,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > vm_flags |= VM_ACCOUNT; > } > > - next = vmg.next = vma_next(&vmi); > - prev = vmg.prev = vma_prev(&vmi); > - if (vm_flags & VM_SPECIAL) { > 
- if (prev) > - vma_iter_next_range(&vmi); > - goto cannot_expand; > - } > - > - /* Attempt to expand an old mapping */ > - /* Check next */ > - if (next && next->vm_start == end && can_vma_merge_before(&vmg)) { > - /* We can adjust this as can_vma_merge_after() doesn't touch */ > - vmg.end = next->vm_end; > - vma = vmg.vma = next; > - vmg.pgoff = next->vm_pgoff - pglen; > - > - /* We may merge our NULL anon_vma with non-NULL in next. */ > - vmg.anon_vma = vma->anon_vma; > - } > - > - /* Check prev */ > - if (prev && prev->vm_end == addr && can_vma_merge_after(&vmg)) { > - vmg.start = prev->vm_start; > - vma = vmg.vma = prev; > - vmg.pgoff = prev->vm_pgoff; > - } else if (prev) { > - vma_iter_next_range(&vmi); > - } > - > - /* Actually expand, if possible */ > - if (vma && !vma_expand(&vmg)) { > - khugepaged_enter_vma(vma, vm_flags); > + vma = vma_merge_new_vma(&vmg); > + if (vma) > goto expanded; > - } > - > - if (vma == prev) > - vma_iter_set(&vmi, addr); > -cannot_expand: > > /* > * Determine the object being mapped and call the appropriate > @@ -1493,10 +1457,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > * If vm_flags changed after call_mmap(), we should try merge > * vma again as we may succeed this time. > */ > - if (unlikely(vm_flags != vma->vm_flags && prev)) { > - merge = vma_merge_new_vma_wrapper(&vmi, prev, vma, > - vma->vm_start, vma->vm_end, > - vma->vm_pgoff); > + if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) { > + merge = vma_merge_new_vma(&vmg); > + > if (merge) { > /* > * ->mmap() can change vma->vm_file and fput > @@ -1596,7 +1559,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > > vma_iter_set(&vmi, vma->vm_end); > /* Undo any partial mapping done by a device driver. 
*/ > - unmap_region(mm, &vmi.mas, vma, prev, next, vma->vm_start, > + unmap_region(mm, &vmi.mas, vma, vmg.prev, vmg.next, vma->vm_start, > vma->vm_end, vma->vm_end, true); > } > if (writable_file_mapping) > @@ -1773,7 +1736,6 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, > unsigned long addr, unsigned long len, unsigned long flags) > { > struct mm_struct *mm = current->mm; > - struct vma_prepare vp; > > /* > * Check against address space limits by the changed size > @@ -1795,29 +1757,22 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, > */ > if (vma && vma->vm_end == addr) { > struct vma_merge_struct vmg = { > + .vmi = vmi, > .prev = vma, > + .next = NULL, > + .start = addr, > + .end = addr + len, > .flags = flags, > .pgoff = addr >> PAGE_SHIFT, > + .file = vma->vm_file, > + .anon_vma = vma->anon_vma, > + .policy = vma_policy(vma), > + .uffd_ctx = vma->vm_userfaultfd_ctx, > + .anon_name = anon_vma_name(vma), > }; > > - if (can_vma_merge_after(&vmg)) { > - vma_iter_config(vmi, vma->vm_start, addr + len); > - if (vma_iter_prealloc(vmi, vma)) > - goto unacct_fail; > - > - vma_start_write(vma); > - > - init_vma_prep(&vp, vma); > - vma_prepare(&vp); > - vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0); > - vma->vm_end = addr + len; > - vm_flags_set(vma, VM_SOFTDIRTY); > - vma_iter_store(vmi, vma); > - > - vma_complete(&vp, vmi, mm); > - khugepaged_enter_vma(vma, flags); > + if (vma_merge_new_vma(&vmg)) > goto out; This is very convoluted to follow. It seems vma_merge_new_vma() will do what is necessary by finding out that it can merge after, then call vma_expand() which calls commit merge(), which sets the iterator to vmg->start, but vmg->start isn't set to vma->vm_start, it is set to addr here.. it's actually set to prev->vm_start in vma_merge_new_vma(). This is getting really hard to trace what happens. 
I'm also concerned that the overhead of following all these checks will cost performance on the brk system call? Maybe we can have a way to set up the vmg and call the right function to just make the above happen? We know with the can_vma_merge_after() that it is going to work, so could we just call vma_start_write() and commit_merge()? Also, vma_merge_new_vma() could fail because it's out of memory so it should goto unacct_fail.. but we now don't know if it's because the merge wasn't allowed or if we are out of memory.. > - } > } > > if (vma) > diff --git a/mm/vma.c b/mm/vma.c > index 55615392e8d2..a404cf718f9e 100644 > --- a/mm/vma.c > +++ b/mm/vma.c > @@ -97,8 +97,7 @@ static void init_multi_vma_prep(struct vma_prepare *vp, > * > * We assume the vma may be removed as part of the merge. > */ > -bool > -can_vma_merge_before(struct vma_merge_struct *vmg) > +static bool can_vma_merge_before(struct vma_merge_struct *vmg) > { > pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start); > > @@ -120,7 +119,7 @@ can_vma_merge_before(struct vma_merge_struct *vmg) > * > * We assume that vma is not removed as part of the merge. 
> */ > -bool can_vma_merge_after(struct vma_merge_struct *vmg) > +static bool can_vma_merge_after(struct vma_merge_struct *vmg) > { > if (is_mergeable_vma(vmg, false) && > is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) { > @@ -130,6 +129,164 @@ bool can_vma_merge_after(struct vma_merge_struct *vmg) > return false; > } > > +static void __vma_link_file(struct vm_area_struct *vma, > + struct address_space *mapping) > +{ > + if (vma_is_shared_maywrite(vma)) > + mapping_allow_writable(mapping); > + > + flush_dcache_mmap_lock(mapping); > + vma_interval_tree_insert(vma, &mapping->i_mmap); > + flush_dcache_mmap_unlock(mapping); > +} > + > +/* > + * Requires inode->i_mapping->i_mmap_rwsem > + */ > +static void __remove_shared_vm_struct(struct vm_area_struct *vma, > + struct address_space *mapping) > +{ > + if (vma_is_shared_maywrite(vma)) > + mapping_unmap_writable(mapping); > + > + flush_dcache_mmap_lock(mapping); > + vma_interval_tree_remove(vma, &mapping->i_mmap); > + flush_dcache_mmap_unlock(mapping); > +} > + > +/* > + * vma_prepare() - Helper function for handling locking VMAs prior to altering > + * @vp: The initialized vma_prepare struct > + */ > +static void vma_prepare(struct vma_prepare *vp) > +{ > + if (vp->file) { > + uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end); > + > + if (vp->adj_next) > + uprobe_munmap(vp->adj_next, vp->adj_next->vm_start, > + vp->adj_next->vm_end); > + > + i_mmap_lock_write(vp->mapping); > + if (vp->insert && vp->insert->vm_file) { > + /* > + * Put into interval tree now, so instantiated pages > + * are visible to arm/parisc __flush_dcache_page > + * throughout; but we cannot insert into address > + * space until vma start or end is updated. 
> + */ > + __vma_link_file(vp->insert, > + vp->insert->vm_file->f_mapping); > + } > + } > + > + if (vp->anon_vma) { > + anon_vma_lock_write(vp->anon_vma); > + anon_vma_interval_tree_pre_update_vma(vp->vma); > + if (vp->adj_next) > + anon_vma_interval_tree_pre_update_vma(vp->adj_next); > + } > + > + if (vp->file) { > + flush_dcache_mmap_lock(vp->mapping); > + vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap); > + if (vp->adj_next) > + vma_interval_tree_remove(vp->adj_next, > + &vp->mapping->i_mmap); > + } > + > +} > + > +/* > + * vma_complete- Helper function for handling the unlocking after altering VMAs, > + * or for inserting a VMA. > + * > + * @vp: The vma_prepare struct > + * @vmi: The vma iterator > + * @mm: The mm_struct > + */ > +static void vma_complete(struct vma_prepare *vp, > + struct vma_iterator *vmi, struct mm_struct *mm) > +{ > + if (vp->file) { > + if (vp->adj_next) > + vma_interval_tree_insert(vp->adj_next, > + &vp->mapping->i_mmap); > + vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap); > + flush_dcache_mmap_unlock(vp->mapping); > + } > + > + if (vp->remove && vp->file) { > + __remove_shared_vm_struct(vp->remove, vp->mapping); > + if (vp->remove2) > + __remove_shared_vm_struct(vp->remove2, vp->mapping); > + } else if (vp->insert) { > + /* > + * split_vma has split insert from vma, and needs > + * us to insert it before dropping the locks > + * (it may either follow vma or precede it). 
> + */ > + vma_iter_store(vmi, vp->insert); > + mm->map_count++; > + } > + > + if (vp->anon_vma) { > + anon_vma_interval_tree_post_update_vma(vp->vma); > + if (vp->adj_next) > + anon_vma_interval_tree_post_update_vma(vp->adj_next); > + anon_vma_unlock_write(vp->anon_vma); > + } > + > + if (vp->file) { > + i_mmap_unlock_write(vp->mapping); > + uprobe_mmap(vp->vma); > + > + if (vp->adj_next) > + uprobe_mmap(vp->adj_next); > + } > + > + if (vp->remove) { > +again: > + vma_mark_detached(vp->remove, true); > + if (vp->file) { > + uprobe_munmap(vp->remove, vp->remove->vm_start, > + vp->remove->vm_end); > + fput(vp->file); > + } > + if (vp->remove->anon_vma) > + anon_vma_merge(vp->vma, vp->remove); > + mm->map_count--; > + mpol_put(vma_policy(vp->remove)); > + if (!vp->remove2) > + WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end); > + vm_area_free(vp->remove); > + > + /* > + * In mprotect's case 6 (see comments on vma_merge), > + * we are removing both mid and next vmas > + */ > + if (vp->remove2) { > + vp->remove = vp->remove2; > + vp->remove2 = NULL; > + goto again; > + } > + } > + if (vp->insert && vp->file) > + uprobe_mmap(vp->insert); > + validate_mm(mm); > +} > + > +/* > + * init_vma_prep() - Initializer wrapper for vma_prepare struct > + * @vp: The vma_prepare struct > + * @vma: The vma that will be altered once locked > + */ > +static void init_vma_prep(struct vma_prepare *vp, > + struct vm_area_struct *vma) > +{ > + init_multi_vma_prep(vp, vma, NULL, NULL, NULL); > +} > + > /* > * Close a vm structure and free it. 
> */ > @@ -292,31 +449,6 @@ static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas) > vm_unacct_memory(nr_accounted); > } > > -/* > - * init_vma_prep() - Initializer wrapper for vma_prepare struct > - * @vp: The vma_prepare struct > - * @vma: The vma that will be altered once locked > - */ > -void init_vma_prep(struct vma_prepare *vp, > - struct vm_area_struct *vma) > -{ > - init_multi_vma_prep(vp, vma, NULL, NULL, NULL); > -} > - > -/* > - * Requires inode->i_mapping->i_mmap_rwsem > - */ > -static void __remove_shared_vm_struct(struct vm_area_struct *vma, > - struct address_space *mapping) > -{ > - if (vma_is_shared_maywrite(vma)) > - mapping_unmap_writable(mapping); > - > - flush_dcache_mmap_lock(mapping); > - vma_interval_tree_remove(vma, &mapping->i_mmap); > - flush_dcache_mmap_unlock(mapping); > -} > - > /* > * vma has some anon_vma assigned, and is already inserted on that > * anon_vma's interval trees. > @@ -349,60 +481,6 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) > anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); > } > > -static void __vma_link_file(struct vm_area_struct *vma, > - struct address_space *mapping) > -{ > - if (vma_is_shared_maywrite(vma)) > - mapping_allow_writable(mapping); > - > - flush_dcache_mmap_lock(mapping); > - vma_interval_tree_insert(vma, &mapping->i_mmap); > - flush_dcache_mmap_unlock(mapping); > -} > - > -/* > - * vma_prepare() - Helper function for handling locking VMAs prior to altering > - * @vp: The initialized vma_prepare struct > - */ > -void vma_prepare(struct vma_prepare *vp) > -{ > - if (vp->file) { > - uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end); > - > - if (vp->adj_next) > - uprobe_munmap(vp->adj_next, vp->adj_next->vm_start, > - vp->adj_next->vm_end); > - > - i_mmap_lock_write(vp->mapping); > - if (vp->insert && vp->insert->vm_file) { > - /* > - * Put into interval tree now, so instantiated pages > - * are visible to arm/parisc __flush_dcache_page > 
- * throughout; but we cannot insert into address > - * space until vma start or end is updated. > - */ > - __vma_link_file(vp->insert, > - vp->insert->vm_file->f_mapping); > - } > - } > - > - if (vp->anon_vma) { > - anon_vma_lock_write(vp->anon_vma); > - anon_vma_interval_tree_pre_update_vma(vp->vma); > - if (vp->adj_next) > - anon_vma_interval_tree_pre_update_vma(vp->adj_next); > - } > - > - if (vp->file) { > - flush_dcache_mmap_lock(vp->mapping); > - vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap); > - if (vp->adj_next) > - vma_interval_tree_remove(vp->adj_next, > - &vp->mapping->i_mmap); > - } > - > -} > - > /* > * dup_anon_vma() - Helper function to duplicate anon_vma > * @dst: The destination VMA > @@ -486,6 +564,120 @@ void validate_mm(struct mm_struct *mm) > } > #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */ > > +/* > + * vma_merge_new_vma - Attempt to merge a new VMA into address space > + * > + * @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end > + * (exclusive), which we try to merge with any adjacent VMAs if possible. > + * > + * We are about to add a VMA to the address space starting at @vmg->start and > + * ending at @vmg->end. There are three different possible scenarios: > + * > + * 1. There is a VMA with identical properties immediately adjacent to the > + * proposed new VMA [@vmg->start, @vmg->end) either before or after it - > + * EXPAND that VMA: > + * > + * Proposed: |-----| or |-----| > + * Existing: |----| |----| > + * > + * 2. There are VMAs with identical properties immediately adjacent to the > + * proposed new VMA [@vmg->start, @vmg->end) both before AND after it - > + * EXPAND the former and REMOVE the latter: > + * > + * Proposed: |-----| > + * Existing: |----| |----| > + * > + * 3. There are no VMAs immediately adjacent to the proposed new VMA or those > + * VMAs do not have identical attributes - NO MERGE POSSIBLE. We still have diagrams, that's too bad. 
> + * > + * In instances where we can merge, this function returns the expanded VMA which > + * will have its range adjusted accordingly and the underlying maple tree also > + * adjusted. > + * > + * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer > + * to the VMA we expanded. > + * > + * This function also adjusts @vmg to provide @vmg->prev and @vmg->next if > + * neither already specified, and adjusts [@vmg->start, @vmg->end) to span the > + * expanded range. > + * > + * ASSUMPTIONS: > + * - The caller must hold a WRITE lock on the mm_struct->mmap_lock. > + * - The caller must have determined that [@vmg->start, @vmg->end) is empty. > + */ > +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg) > +{ > + bool is_special = vmg->flags & VM_SPECIAL; > + struct vm_area_struct *prev = vmg->prev; > + struct vm_area_struct *next = vmg->next; > + unsigned long start = vmg->start; > + unsigned long end = vmg->end; > + pgoff_t pgoff = vmg->pgoff; > + pgoff_t pglen = PHYS_PFN(end - start); > + > + VM_WARN_ON(vmg->vma); > + > + if (!prev && !next) { > + /* > + * Since the caller must have determined that the requested > + * range is empty, vmg->vmi will be left pointing at the VMA > + * immediately prior. > + */ > + next = vmg->next = vma_next(vmg->vmi); > + prev = vmg->prev = vma_prev(vmg->vmi); > + > + /* Avoid maple tree re-walk. */ > + if (is_special && prev) > + vma_iter_next_range(vmg->vmi); > + } > + > + /* If special mapping or no adjacent VMAs, nothing to merge. */ > + if (is_special || (!prev && !next)) > + return NULL; > + > + /* If we can merge with the following VMA, adjust vmg accordingly. */ > + if (next && next->vm_start == end && can_vma_merge_before(vmg)) { > + /* > + * We can adjust this here as can_vma_merge_after() doesn't > + * touch vmg->end. 
> + */ > + vmg->end = next->vm_end; > + vmg->vma = next; > + vmg->pgoff = next->vm_pgoff - pglen; > + > + vmg->anon_vma = next->anon_vma; > + } > + > + /* If we can merge with the previous VMA, adjust vmg accordingly. */ > + if (prev && prev->vm_end == start && can_vma_merge_after(vmg)) { > + vmg->start = prev->vm_start; > + vmg->vma = prev; > + vmg->pgoff = prev->vm_pgoff; > + } else if (prev) { > + vma_iter_next_range(vmg->vmi); > + } > + > + /* > + * Now try to expand adjacent VMA(s). This takes care of removing the > + * following VMA if we have VMAs on both sides. > + */ > + if (vmg->vma && !vma_expand(vmg)) { > + khugepaged_enter_vma(vmg->vma, vmg->flags); > + return vmg->vma; > + } > + > + /* If expansion failed, reset state. Allows us to retry merge later. */ > + vmg->vma = NULL; > + vmg->anon_vma = NULL; > + vmg->start = start; > + vmg->end = end; > + vmg->pgoff = pgoff; > + if (vmg->vma == prev) > + vma_iter_set(vmg->vmi, start); > + > + return NULL; > +} Can we split this up a bit? I was thinking that, for the brk() case, we need to know if we can merge prev and if that merge fails. I was thinking something that you create a vmg with whatever, then call can_merge_prev, and that'd do the block above and change the vmg as required. We could have a can_merge_next that does the same, then we need to prepare the change (dup anon vma, preallocate for maple tree, locking, whatever), then commit. There could still be the function above, but with smaller widgets to do what we need so we gain flexibility in what we decide to check - prev only in brk(). I'm not sure if we'd need one for expanding vs existing or if we could check !vmg->vma to figure that out.. This would also have the effect of self-documenting what is going on. 
For brk, it would look like this: if (vmg_expand_prev()) { if (vmg_prepare()) goto no_mem; vmg_commit(); } I think this would change your exposed interface, at least for brk() - or a wrapper for this, but small widgets may gain us some self-documented code? If you really don't like the exposure of the interface, the vmg could have a return so we can see if we ran out of memory? > + > /* > * vma_expand - Expand an existing VMA > * > @@ -496,7 +688,11 @@ void validate_mm(struct mm_struct *mm) > * vmg->next->vm_end. Checking if the vmg->vma can expand and merge with > * vmg->next needs to be handled by the caller. > * > - * Returns: 0 on success > + * Returns: 0 on success. > + * > + * ASSUMPTIONS: > + * - The caller must hold a WRITE lock on vmg->vma->mm->mmap_lock. > + * - The caller must have set @vmg->prev and @vmg->next. > */ > int vma_expand(struct vma_merge_struct *vmg) > { > @@ -576,85 +772,6 @@ int vma_shrink(struct vma_merge_struct *vmg) > return 0; > } > > -/* > - * vma_complete- Helper function for handling the unlocking after altering VMAs, > - * or for inserting a VMA. > - * > - * @vp: The vma_prepare struct > - * @vmi: The vma iterator > - * @mm: The mm_struct > - */ > -void vma_complete(struct vma_prepare *vp, > - struct vma_iterator *vmi, struct mm_struct *mm) > -{ > - if (vp->file) { > - if (vp->adj_next) > - vma_interval_tree_insert(vp->adj_next, > - &vp->mapping->i_mmap); > - vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap); > - flush_dcache_mmap_unlock(vp->mapping); > - } > - > - if (vp->remove && vp->file) { > - __remove_shared_vm_struct(vp->remove, vp->mapping); > - if (vp->remove2) > - __remove_shared_vm_struct(vp->remove2, vp->mapping); > - } else if (vp->insert) { > - /* > - * split_vma has split insert from vma, and needs > - * us to insert it before dropping the locks > - * (it may either follow vma or precede it). 
> - */ > - vma_iter_store(vmi, vp->insert); > - mm->map_count++; > - } > - > - if (vp->anon_vma) { > - anon_vma_interval_tree_post_update_vma(vp->vma); > - if (vp->adj_next) > - anon_vma_interval_tree_post_update_vma(vp->adj_next); > - anon_vma_unlock_write(vp->anon_vma); > - } > - > - if (vp->file) { > - i_mmap_unlock_write(vp->mapping); > - uprobe_mmap(vp->vma); > - > - if (vp->adj_next) > - uprobe_mmap(vp->adj_next); > - } > - > - if (vp->remove) { > -again: > - vma_mark_detached(vp->remove, true); > - if (vp->file) { > - uprobe_munmap(vp->remove, vp->remove->vm_start, > - vp->remove->vm_end); > - fput(vp->file); > - } > - if (vp->remove->anon_vma) > - anon_vma_merge(vp->vma, vp->remove); > - mm->map_count--; > - mpol_put(vma_policy(vp->remove)); > - if (!vp->remove2) > - WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end); > - vm_area_free(vp->remove); > - > - /* > - * In mprotect's case 6 (see comments on vma_merge), > - * we are removing both mid and next vmas > - */ > - if (vp->remove2) { > - vp->remove = vp->remove2; > - vp->remove2 = NULL; > - goto again; > - } > - } > - if (vp->insert && vp->file) > - uprobe_mmap(vp->insert); > - validate_mm(mm); > -} > - > /* > * do_vmi_align_munmap() - munmap the aligned region from @start to @end. > * @vmi: The vma iterator > @@ -1261,20 +1378,6 @@ struct vm_area_struct > return vma_modify(&vmg); > } > > -/* > - * Attempt to merge a newly mapped VMA with those adjacent to it. The caller > - * must ensure that [start, end) does not overlap any existing VMA. > - */ > -struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg) > -{ > - if (!vmg->prev) { > - vmg->prev = vma_prev(vmg->vmi); > - vma_iter_set(vmg->vmi, vmg->start); > - } > - > - return vma_merge(vmg); > -} > - > /* > * Expand vma by delta bytes, potentially merging with an immediately adjacent > * VMA with identical properties. 
> @@ -1297,8 +1400,7 @@ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, > .anon_name = anon_vma_name(vma), > }; > > - /* vma is specified as prev, so case 1 or 2 will apply. */ > - return vma_merge(&vmg); > + return vma_merge_new_vma(&vmg); > } > > void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb) > @@ -1399,24 +1501,40 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, > struct vm_area_struct *vma = *vmap; > unsigned long vma_start = vma->vm_start; > struct mm_struct *mm = vma->vm_mm; > - struct vm_area_struct *new_vma, *prev; > + struct vm_area_struct *new_vma; > bool faulted_in_anon_vma = true; > VMA_ITERATOR(vmi, mm, addr); > + struct vma_merge_struct vmg = { > + .vmi = &vmi, > + .start = addr, > + .end = addr + len, > + .flags = vma->vm_flags, > + .pgoff = pgoff, > + .file = vma->vm_file, > + .anon_vma = vma->anon_vma, > + .policy = vma_policy(vma), > + .uffd_ctx = vma->vm_userfaultfd_ctx, > + .anon_name = anon_vma_name(vma), > + }; > > /* > * If anonymous vma has not yet been faulted, update new pgoff > * to match new location, to increase its chance of merging. > */ > if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) { > - pgoff = addr >> PAGE_SHIFT; > + pgoff = vmg.pgoff = addr >> PAGE_SHIFT; > faulted_in_anon_vma = false; > } > > - new_vma = find_vma_prev(mm, addr, &prev); > + new_vma = find_vma_prev(mm, addr, &vmg.prev); > if (new_vma && new_vma->vm_start < addr + len) > return NULL; /* should never get here */ > > - new_vma = vma_merge_new_vma_wrapper(&vmi, prev, vma, addr, addr + len, pgoff); > + vmg.next = vma_next(&vmi); > + vma_prev(&vmi); > + > + new_vma = vma_merge_new_vma(&vmg); > + > if (new_vma) { > /* > * Source vma may have been merged into new_vma > diff --git a/mm/vma.h b/mm/vma.h > index 50459f9e4c7f..bbb173053f34 100644 > --- a/mm/vma.h > +++ b/mm/vma.h > @@ -55,17 +55,6 @@ void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma); > /* Required for expand_downwards(). 
*/ > void anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma); > > -/* Required for do_brk_flags(). */ > -void vma_prepare(struct vma_prepare *vp); > - > -/* Required for do_brk_flags(). */ > -void init_vma_prep(struct vma_prepare *vp, > - struct vm_area_struct *vma); > - > -/* Required for do_brk_flags(). */ > -void vma_complete(struct vma_prepare *vp, > - struct vma_iterator *vmi, struct mm_struct *mm); > - > int vma_expand(struct vma_merge_struct *vmg); > int vma_shrink(struct vma_merge_struct *vmg); > > @@ -85,20 +74,6 @@ void unmap_region(struct mm_struct *mm, struct ma_state *mas, > struct vm_area_struct *next, unsigned long start, > unsigned long end, unsigned long tree_end, bool mm_wr_locked); > > -/* > - * Can we merge the VMA described by vmg into the following VMA vmg->next? > - * > - * Required by mmap_region(). > - */ > -bool can_vma_merge_before(struct vma_merge_struct *vmg); > - > -/* > - * Can we merge the VMA described by vmg into the preceding VMA vmg->prev? > - * > - * Required by mmap_region() and do_brk_flags(). > - */ > -bool can_vma_merge_after(struct vma_merge_struct *vmg); > - > /* We are about to modify the VMA's flags. */ > struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, > struct vm_area_struct *prev, > @@ -133,31 +108,7 @@ struct vm_area_struct > unsigned long new_flags, > struct vm_userfaultfd_ctx new_ctx); > > -struct vm_area_struct > -*vma_merge_new_vma(struct vma_merge_struct *vmg); > - > -/* Temporary convenience wrapper. 
*/ > -static inline struct vm_area_struct > -*vma_merge_new_vma_wrapper(struct vma_iterator *vmi, struct vm_area_struct *prev, > - struct vm_area_struct *vma, unsigned long start, > - unsigned long end, pgoff_t pgoff) > -{ > - struct vma_merge_struct vmg = { > - .vmi = vmi, > - .prev = prev, > - .start = start, > - .end = end, > - .flags = vma->vm_flags, > - .file = vma->vm_file, > - .anon_vma = vma->anon_vma, > - .pgoff = pgoff, > - .policy = vma_policy(vma), > - .uffd_ctx = vma->vm_userfaultfd_ctx, > - .anon_name = anon_vma_name(vma), > - }; > - > - return vma_merge_new_vma(&vmg); > -} > +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg); > > /* > * Temporary wrapper around vma_merge() so we can have a common interface for > diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h > index 40797a819d3d..a39a734282d0 100644 > --- a/tools/testing/vma/vma_internal.h > +++ b/tools/testing/vma/vma_internal.h > @@ -709,6 +709,12 @@ static inline void vma_iter_free(struct vma_iterator *vmi) > mas_destroy(&vmi->mas); > } > > +static inline > +struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi) > +{ > + return mas_next_range(&vmi->mas, ULONG_MAX); > +} > + > static inline void vm_acct_memory(long pages) > { > } > -- > 2.45.2 >
On Fri, Aug 09, 2024 at 11:23:30AM GMT, Liam R. Howlett wrote: > * Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [240805 08:14]: > > In mmap_region() and do_brk_flags() we open code scenarios where we prefer > > to use vma_expand() rather than invoke a full vma_merge() operation. > > > > Abstract this logic and eliminate all of the open-coding, and also use the > > same logic for all cases where we add new VMAs to, rather than ultimately > > use vma_merge(), rather use vma_expand(). > > > > We implement this by replacing vma_merge_new_vma() with this newly > > abstracted logic. > > > > Doing so removes duplication and simplifies VMA merging in all such cases, > > laying the ground for us to eliminate the merging of new VMAs in > > vma_merge() altogether. > > > > This makes it far easier to understand what is happening in these cases > > avoiding confusion, bugs and allowing for future optimisation. > > > > As a result of this change we are also able to make vma_prepare(), > > init_vma_prep(), vma_complete(), can_vma_merge_before() and > > can_vma_merge_after() static and internal to vma.c. 
> > > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > > --- > > mm/mmap.c | 79 ++--- > > mm/vma.c | 482 +++++++++++++++++++------------ > > mm/vma.h | 51 +--- > > tools/testing/vma/vma_internal.h | 6 + > > 4 files changed, 324 insertions(+), 294 deletions(-) > > > > diff --git a/mm/mmap.c b/mm/mmap.c > > index f6593a81f73d..c03f50f46396 100644 > > --- a/mm/mmap.c > > +++ b/mm/mmap.c > > @@ -1363,8 +1363,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > > { > > struct mm_struct *mm = current->mm; > > struct vm_area_struct *vma = NULL; > > - struct vm_area_struct *next, *prev, *merge; > > - pgoff_t pglen = len >> PAGE_SHIFT; > > + struct vm_area_struct *merge; > > unsigned long charged = 0; > > unsigned long end = addr + len; > > bool writable_file_mapping = false; > > @@ -1411,44 +1410,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > > vm_flags |= VM_ACCOUNT; > > } > > > > - next = vmg.next = vma_next(&vmi); > > - prev = vmg.prev = vma_prev(&vmi); > > - if (vm_flags & VM_SPECIAL) { > > - if (prev) > > - vma_iter_next_range(&vmi); > > - goto cannot_expand; > > - } > > - > > - /* Attempt to expand an old mapping */ > > - /* Check next */ > > - if (next && next->vm_start == end && can_vma_merge_before(&vmg)) { > > - /* We can adjust this as can_vma_merge_after() doesn't touch */ > > - vmg.end = next->vm_end; > > - vma = vmg.vma = next; > > - vmg.pgoff = next->vm_pgoff - pglen; > > - > > - /* We may merge our NULL anon_vma with non-NULL in next. 
*/ > > - vmg.anon_vma = vma->anon_vma; > > - } > > - > > - /* Check prev */ > > - if (prev && prev->vm_end == addr && can_vma_merge_after(&vmg)) { > > - vmg.start = prev->vm_start; > > - vma = vmg.vma = prev; > > - vmg.pgoff = prev->vm_pgoff; > > - } else if (prev) { > > - vma_iter_next_range(&vmi); > > - } > > - > > - /* Actually expand, if possible */ > > - if (vma && !vma_expand(&vmg)) { > > - khugepaged_enter_vma(vma, vm_flags); > > + vma = vma_merge_new_vma(&vmg); > > + if (vma) > > goto expanded; > > - } > > - > > - if (vma == prev) > > - vma_iter_set(&vmi, addr); > > -cannot_expand: > > > > /* > > * Determine the object being mapped and call the appropriate > > @@ -1493,10 +1457,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > > * If vm_flags changed after call_mmap(), we should try merge > > * vma again as we may succeed this time. > > */ > > - if (unlikely(vm_flags != vma->vm_flags && prev)) { > > - merge = vma_merge_new_vma_wrapper(&vmi, prev, vma, > > - vma->vm_start, vma->vm_end, > > - vma->vm_pgoff); > > + if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) { > > + merge = vma_merge_new_vma(&vmg); > > + > > if (merge) { > > /* > > * ->mmap() can change vma->vm_file and fput > > @@ -1596,7 +1559,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, > > > > vma_iter_set(&vmi, vma->vm_end); > > /* Undo any partial mapping done by a device driver. 
*/ > > - unmap_region(mm, &vmi.mas, vma, prev, next, vma->vm_start, > > + unmap_region(mm, &vmi.mas, vma, vmg.prev, vmg.next, vma->vm_start, > > vma->vm_end, vma->vm_end, true); > > } > > if (writable_file_mapping) > > @@ -1773,7 +1736,6 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, > > unsigned long addr, unsigned long len, unsigned long flags) > > { > > struct mm_struct *mm = current->mm; > > - struct vma_prepare vp; > > > > /* > > * Check against address space limits by the changed size > > @@ -1795,29 +1757,22 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, > > */ > > if (vma && vma->vm_end == addr) { > > struct vma_merge_struct vmg = { > > + .vmi = vmi, > > .prev = vma, > > + .next = NULL, > > + .start = addr, > > + .end = addr + len, > > .flags = flags, > > .pgoff = addr >> PAGE_SHIFT, > > + .file = vma->vm_file, > > + .anon_vma = vma->anon_vma, > > + .policy = vma_policy(vma), > > + .uffd_ctx = vma->vm_userfaultfd_ctx, > > + .anon_name = anon_vma_name(vma), > > }; > > > > - if (can_vma_merge_after(&vmg)) { > > - vma_iter_config(vmi, vma->vm_start, addr + len); > > - if (vma_iter_prealloc(vmi, vma)) > > - goto unacct_fail; > > - > > - vma_start_write(vma); > > - > > - init_vma_prep(&vp, vma); > > - vma_prepare(&vp); > > - vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0); > > - vma->vm_end = addr + len; > > - vm_flags_set(vma, VM_SOFTDIRTY); > > - vma_iter_store(vmi, vma); > > - > > - vma_complete(&vp, vmi, mm); > > - khugepaged_enter_vma(vma, flags); > > + if (vma_merge_new_vma(&vmg)) > > goto out; > > This is very convoluted to follow. It seems vma_merge_new_vma() will do > what is necessary by finding out that it can merge after, then call > vma_expand() which calls commit merge(), which sets the iterator to > vmg->start, but vmg->start isn't set to vma->vm_start, it is set to addr > here.. it's actually set to prev->vm_start in vma_merge_new_vma(). 
Sorry, it's kind of hard to make a change like this all that lovely to follow. The only extra checks before it checks mergeability are prev - which we set to vma, so is not NULL (except in the case of first vma, which is wasteful, but a one-off) and an is_special and next check. So this isn't _hugely_ terrible. As to the vmi positioning... I thought there might be some things that we could improve on this :) However, we set prev == vma here, so vmg->start = vma->vm_start, vmg->end = addr + len which is the same as before right? I do notice that we've incorrectly removed the vm_flags_set(VM_SOFTDIRTY) though... will add that back in. Ugh. Again, so frustrating to not have these functions testable. I'd like to find a way to move things around if possible at some point. But if we're worried about call stack maybe not feasible. > > This is getting really hard to trace what happens. I'm also concerned > that the overhead of following all these checks will cost performance on > the brk system call? I'll take a look at mm-tests results. > > Maybe we can have a way to set up the vmg and call the right function to > just make the above happen? We know with the can_vma_merge_after() that > it is going to work, so could we just call vma_start_write() and > commit_merge()? I'm happy to add an enum or something to set a specific mode if we want, but maybe worth looking at scalability results first to see if there's really a regression? I mean from our discussions on irc, it sounds like this is very possible so we could figure something out. > > Also, vma_merge_new_vma() could fail because it's out of memory so it > should goto unacct_fail.. but we now don't know if it's because the > merge wasn't allowed or if we are out of memory.. Yes this is a mistake, damn it. Will fix. Grumble about untestability of these functions x2. As per your comment below, I think the simplest way may be to have an error or outcome field or some such that we can check to see _why_ things failed. 
> > > - } > > } > > > > if (vma) > > diff --git a/mm/vma.c b/mm/vma.c > > index 55615392e8d2..a404cf718f9e 100644 > > --- a/mm/vma.c > > +++ b/mm/vma.c > > @@ -97,8 +97,7 @@ static void init_multi_vma_prep(struct vma_prepare *vp, > > * > > * We assume the vma may be removed as part of the merge. > > */ > > -bool > > -can_vma_merge_before(struct vma_merge_struct *vmg) > > +static bool can_vma_merge_before(struct vma_merge_struct *vmg) > > { > > pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start); > > > > @@ -120,7 +119,7 @@ can_vma_merge_before(struct vma_merge_struct *vmg) > > * > > * We assume that vma is not removed as part of the merge. > > */ > > -bool can_vma_merge_after(struct vma_merge_struct *vmg) > > +static bool can_vma_merge_after(struct vma_merge_struct *vmg) > > { > > if (is_mergeable_vma(vmg, false) && > > is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) { > > @@ -130,6 +129,164 @@ bool can_vma_merge_after(struct vma_merge_struct *vmg) > > return false; > > } > > > > +static void __vma_link_file(struct vm_area_struct *vma, > > + struct address_space *mapping) > > +{ > > + if (vma_is_shared_maywrite(vma)) > > + mapping_allow_writable(mapping); > > + > > + flush_dcache_mmap_lock(mapping); > > + vma_interval_tree_insert(vma, &mapping->i_mmap); > > + flush_dcache_mmap_unlock(mapping); > > +} > > + > > +/* > > + * Requires inode->i_mapping->i_mmap_rwsem > > + */ > > +static void __remove_shared_vm_struct(struct vm_area_struct *vma, > > + struct address_space *mapping) > > +{ > > + if (vma_is_shared_maywrite(vma)) > > + mapping_unmap_writable(mapping); > > + > > + flush_dcache_mmap_lock(mapping); > > + vma_interval_tree_remove(vma, &mapping->i_mmap); > > + flush_dcache_mmap_unlock(mapping); > > +} > > + > > +/* > > + * vma_prepare() - Helper function for handling locking VMAs prior to altering > > + * @vp: The initialized vma_prepare struct > > + */ > > +static void vma_prepare(struct vma_prepare *vp) > > +{ > > + if (vp->file) { > > + 
uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end); > > + > > + if (vp->adj_next) > > + uprobe_munmap(vp->adj_next, vp->adj_next->vm_start, > > + vp->adj_next->vm_end); > > + > > + i_mmap_lock_write(vp->mapping); > > + if (vp->insert && vp->insert->vm_file) { > > + /* > > + * Put into interval tree now, so instantiated pages > > + * are visible to arm/parisc __flush_dcache_page > > + * throughout; but we cannot insert into address > > + * space until vma start or end is updated. > > + */ > > + __vma_link_file(vp->insert, > > + vp->insert->vm_file->f_mapping); > > + } > > + } > > + > > + if (vp->anon_vma) { > > + anon_vma_lock_write(vp->anon_vma); > > + anon_vma_interval_tree_pre_update_vma(vp->vma); > > + if (vp->adj_next) > > + anon_vma_interval_tree_pre_update_vma(vp->adj_next); > > + } > > + > > + if (vp->file) { > > + flush_dcache_mmap_lock(vp->mapping); > > + vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap); > > + if (vp->adj_next) > > + vma_interval_tree_remove(vp->adj_next, > > + &vp->mapping->i_mmap); > > + } > > + > > +} > > + > > +/* > > + * vma_complete- Helper function for handling the unlocking after altering VMAs, > > + * or for inserting a VMA. 
> > + * > > + * @vp: The vma_prepare struct > > + * @vmi: The vma iterator > > + * @mm: The mm_struct > > + */ > > +static void vma_complete(struct vma_prepare *vp, > > + struct vma_iterator *vmi, struct mm_struct *mm) > > +{ > > + if (vp->file) { > > + if (vp->adj_next) > > + vma_interval_tree_insert(vp->adj_next, > > + &vp->mapping->i_mmap); > > + vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap); > > + flush_dcache_mmap_unlock(vp->mapping); > > + } > > + > > + if (vp->remove && vp->file) { > > + __remove_shared_vm_struct(vp->remove, vp->mapping); > > + if (vp->remove2) > > + __remove_shared_vm_struct(vp->remove2, vp->mapping); > > + } else if (vp->insert) { > > + /* > > + * split_vma has split insert from vma, and needs > > + * us to insert it before dropping the locks > > + * (it may either follow vma or precede it). > > + */ > > + vma_iter_store(vmi, vp->insert); > > + mm->map_count++; > > + } > > + > > + if (vp->anon_vma) { > > + anon_vma_interval_tree_post_update_vma(vp->vma); > > + if (vp->adj_next) > > + anon_vma_interval_tree_post_update_vma(vp->adj_next); > > + anon_vma_unlock_write(vp->anon_vma); > > + } > > + > > + if (vp->file) { > > + i_mmap_unlock_write(vp->mapping); > > + uprobe_mmap(vp->vma); > > + > > + if (vp->adj_next) > > + uprobe_mmap(vp->adj_next); > > + } > > + > > + if (vp->remove) { > > +again: > > + vma_mark_detached(vp->remove, true); > > + if (vp->file) { > > + uprobe_munmap(vp->remove, vp->remove->vm_start, > > + vp->remove->vm_end); > > + fput(vp->file); > > + } > > + if (vp->remove->anon_vma) > > + anon_vma_merge(vp->vma, vp->remove); > > + mm->map_count--; > > + mpol_put(vma_policy(vp->remove)); > > + if (!vp->remove2) > > + WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end); > > + vm_area_free(vp->remove); > > + > > + /* > > + * In mprotect's case 6 (see comments on vma_merge), > > + * we are removing both mid and next vmas > > + */ > > + if (vp->remove2) { > > + vp->remove = vp->remove2; > > + vp->remove2 = NULL; > > + goto 
again; > > + } > > + } > > + if (vp->insert && vp->file) > > + uprobe_mmap(vp->insert); > > + validate_mm(mm); > > +} > > + > > +/* > > + * init_vma_prep() - Initializer wrapper for vma_prepare struct > > + * @vp: The vma_prepare struct > > + * @vma: The vma that will be altered once locked > > + */ > > +static void init_vma_prep(struct vma_prepare *vp, > > + struct vm_area_struct *vma) > > +{ > > + init_multi_vma_prep(vp, vma, NULL, NULL, NULL); > > +} > > + > > /* > > * Close a vm structure and free it. > > */ > > @@ -292,31 +449,6 @@ static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas) > > vm_unacct_memory(nr_accounted); > > } > > > > -/* > > - * init_vma_prep() - Initializer wrapper for vma_prepare struct > > - * @vp: The vma_prepare struct > > - * @vma: The vma that will be altered once locked > > - */ > > -void init_vma_prep(struct vma_prepare *vp, > > - struct vm_area_struct *vma) > > -{ > > - init_multi_vma_prep(vp, vma, NULL, NULL, NULL); > > -} > > - > > -/* > > - * Requires inode->i_mapping->i_mmap_rwsem > > - */ > > -static void __remove_shared_vm_struct(struct vm_area_struct *vma, > > - struct address_space *mapping) > > -{ > > - if (vma_is_shared_maywrite(vma)) > > - mapping_unmap_writable(mapping); > > - > > - flush_dcache_mmap_lock(mapping); > > - vma_interval_tree_remove(vma, &mapping->i_mmap); > > - flush_dcache_mmap_unlock(mapping); > > -} > > - > > /* > > * vma has some anon_vma assigned, and is already inserted on that > > * anon_vma's interval trees. 
> > @@ -349,60 +481,6 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) > > anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); > > } > > > > -static void __vma_link_file(struct vm_area_struct *vma, > > - struct address_space *mapping) > > -{ > > - if (vma_is_shared_maywrite(vma)) > > - mapping_allow_writable(mapping); > > - > > - flush_dcache_mmap_lock(mapping); > > - vma_interval_tree_insert(vma, &mapping->i_mmap); > > - flush_dcache_mmap_unlock(mapping); > > -} > > - > > -/* > > - * vma_prepare() - Helper function for handling locking VMAs prior to altering > > - * @vp: The initialized vma_prepare struct > > - */ > > -void vma_prepare(struct vma_prepare *vp) > > -{ > > - if (vp->file) { > > - uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end); > > - > > - if (vp->adj_next) > > - uprobe_munmap(vp->adj_next, vp->adj_next->vm_start, > > - vp->adj_next->vm_end); > > - > > - i_mmap_lock_write(vp->mapping); > > - if (vp->insert && vp->insert->vm_file) { > > - /* > > - * Put into interval tree now, so instantiated pages > > - * are visible to arm/parisc __flush_dcache_page > > - * throughout; but we cannot insert into address > > - * space until vma start or end is updated. 
> > - */ > > - __vma_link_file(vp->insert, > > - vp->insert->vm_file->f_mapping); > > - } > > - } > > - > > - if (vp->anon_vma) { > > - anon_vma_lock_write(vp->anon_vma); > > - anon_vma_interval_tree_pre_update_vma(vp->vma); > > - if (vp->adj_next) > > - anon_vma_interval_tree_pre_update_vma(vp->adj_next); > > - } > > - > > - if (vp->file) { > > - flush_dcache_mmap_lock(vp->mapping); > > - vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap); > > - if (vp->adj_next) > > - vma_interval_tree_remove(vp->adj_next, > > - &vp->mapping->i_mmap); > > - } > > - > > -} > > - > > /* > > * dup_anon_vma() - Helper function to duplicate anon_vma > > * @dst: The destination VMA > > @@ -486,6 +564,120 @@ void validate_mm(struct mm_struct *mm) > > } > > #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */ > > > > +/* > > + * vma_merge_new_vma - Attempt to merge a new VMA into address space > > + * > > + * @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end > > + * (exclusive), which we try to merge with any adjacent VMAs if possible. > > + * > > + * We are about to add a VMA to the address space starting at @vmg->start and > > + * ending at @vmg->end. There are three different possible scenarios: > > + * > > + * 1. There is a VMA with identical properties immediately adjacent to the > > + * proposed new VMA [@vmg->start, @vmg->end) either before or after it - > > + * EXPAND that VMA: > > + * > > + * Proposed: |-----| or |-----| > > + * Existing: |----| |----| > > + * > > + * 2. There are VMAs with identical properties immediately adjacent to the > > + * proposed new VMA [@vmg->start, @vmg->end) both before AND after it - > > + * EXPAND the former and REMOVE the latter: > > + * > > + * Proposed: |-----| > > + * Existing: |----| |----| > > + * > > + * 3. There are no VMAs immediately adjacent to the proposed new VMA or those > > + * VMAs do not have identical attributes - NO MERGE POSSIBLE. > > We still have diagrams, that's too bad. But they're cute ones! 
Upgrade right? > > > + * > > + * In instances where we can merge, this function returns the expanded VMA which > > + * will have its range adjusted accordingly and the underlying maple tree also > > + * adjusted. > > + * > > + * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer > > + * to the VMA we expanded. > > + * > > + * This function also adjusts @vmg to provide @vmg->prev and @vmg->next if > > + * neither already specified, and adjusts [@vmg->start, @vmg->end) to span the > > + * expanded range. > > + * > > + * ASSUMPTIONS: > > + * - The caller must hold a WRITE lock on the mm_struct->mmap_lock. > > + * - The caller must have determined that [@vmg->start, @vmg->end) is empty. > > + */ > > +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg) > > +{ > > + bool is_special = vmg->flags & VM_SPECIAL; > > + struct vm_area_struct *prev = vmg->prev; > > + struct vm_area_struct *next = vmg->next; > > + unsigned long start = vmg->start; > > + unsigned long end = vmg->end; > > + pgoff_t pgoff = vmg->pgoff; > > + pgoff_t pglen = PHYS_PFN(end - start); > > + > > + VM_WARN_ON(vmg->vma); > > + > > + if (!prev && !next) { > > + /* > > + * Since the caller must have determined that the requested > > + * range is empty, vmg->vmi will be left pointing at the VMA > > + * immediately prior. > > + */ > > + next = vmg->next = vma_next(vmg->vmi); > > + prev = vmg->prev = vma_prev(vmg->vmi); > > + > > + /* Avoid maple tree re-walk. */ > > + if (is_special && prev) > > + vma_iter_next_range(vmg->vmi); > > + } > > + > > + /* If special mapping or no adjacent VMAs, nothing to merge. */ > > + if (is_special || (!prev && !next)) > > + return NULL; > > + > > + /* If we can merge with the following VMA, adjust vmg accordingly. */ > > + if (next && next->vm_start == end && can_vma_merge_before(vmg)) { > > + /* > > + * We can adjust this here as can_vma_merge_after() doesn't > > + * touch vmg->end. 
> > + */ > > + vmg->end = next->vm_end; > > + vmg->vma = next; > > + vmg->pgoff = next->vm_pgoff - pglen; > > + > > + vmg->anon_vma = next->anon_vma; > > + } > > + > > + /* If we can merge with the previous VMA, adjust vmg accordingly. */ > > + if (prev && prev->vm_end == start && can_vma_merge_after(vmg)) { > > + vmg->start = prev->vm_start; > > + vmg->vma = prev; > > + vmg->pgoff = prev->vm_pgoff; > > + } else if (prev) { > > + vma_iter_next_range(vmg->vmi); > > + } > > + > > + /* > > + * Now try to expand adjacent VMA(s). This takes care of removing the > > + * following VMA if we have VMAs on both sides. > > + */ > > + if (vmg->vma && !vma_expand(vmg)) { > > + khugepaged_enter_vma(vmg->vma, vmg->flags); > > + return vmg->vma; > > + } > > + > > + /* If expansion failed, reset state. Allows us to retry merge later. */ > > + vmg->vma = NULL; > > + vmg->anon_vma = NULL; > > + vmg->start = start; > > + vmg->end = end; > > + vmg->pgoff = pgoff; > > + if (vmg->vma == prev) > > + vma_iter_set(vmg->vmi, start); > > + > > + return NULL; > > +} > > Can we split this up a bit? I was thinking that, for the brk() case, we > need to know if we can merge prev and if that merge fails. I was > thinking something that you create a vmg with whatever, then call > can_merge_prev, and that'd do the block above and change the vmg as > required. We could have a can_merge_next that does the same, then we > need to prepare the change (dup anon vma, preallocate for maple tree, > locking, whatever), then commit. Yeah that's not a bad idea, that could actually really help clarify what's going on. Then could have a sort of state machine that indicates that we've already adjusted vmg parameters for the merge. I'm thinking though of a vma_merge_new_vma() / vma_merge_modified_vma() that invokes different code to figure out how to expand. I will have a fiddle around and see what I can figure out that makes sense. 
> > There could still be the function above, but with smaller widgets to do > what we need so we gain flexibility in what we decide to check - prev > only in brk(). > > I'm not sure if we'd need one for expanding vs existing or if we could > check !vmg->vma to figure that out.. > > This would also have the effect of self-documenting what is going on. > For brk, it would look like this: > > if (vmg_expand_prev()) { > if (vmg_prepare()) > goto no_mem; > vmg_commit(); > } > > I think this would change your exposed interface, at least for brk() - > or a wrapper for this, but small widgets may gain us some > self-documented code? > > If you really don't like the exposure of the interface, the vmg could > have a return so we can see if we ran out of memory? I really don't like can_vma_merge_xxx() being exposed, it's very clearly an internal interface. As mentioned above we can have some kind of way of passing back an error code. Obviously if testing indicates stack size/perf is a problem we can begrudgingly accept the interface leak :'(. Will check that. > > > + > > /* > > * vma_expand - Expand an existing VMA > > * > > @@ -496,7 +688,11 @@ void validate_mm(struct mm_struct *mm) > > * vmg->next->vm_end. Checking if the vmg->vma can expand and merge with > > * vmg->next needs to be handled by the caller. > > * > > - * Returns: 0 on success > > + * Returns: 0 on success. > > + * > > + * ASSUMPTIONS: > > + * - The caller must hold a WRITE lock on vmg->vma->mm->mmap_lock. > > + * - The caller must have set @vmg->prev and @vmg->next. > > */ > > int vma_expand(struct vma_merge_struct *vmg) > > { > > @@ -576,85 +772,6 @@ int vma_shrink(struct vma_merge_struct *vmg) > > return 0; > > } > > > > -/* > > - * vma_complete- Helper function for handling the unlocking after altering VMAs, > > - * or for inserting a VMA. 
> > - * > > - * @vp: The vma_prepare struct > > - * @vmi: The vma iterator > > - * @mm: The mm_struct > > - */ > > -void vma_complete(struct vma_prepare *vp, > > - struct vma_iterator *vmi, struct mm_struct *mm) > > -{ > > - if (vp->file) { > > - if (vp->adj_next) > > - vma_interval_tree_insert(vp->adj_next, > > - &vp->mapping->i_mmap); > > - vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap); > > - flush_dcache_mmap_unlock(vp->mapping); > > - } > > - > > - if (vp->remove && vp->file) { > > - __remove_shared_vm_struct(vp->remove, vp->mapping); > > - if (vp->remove2) > > - __remove_shared_vm_struct(vp->remove2, vp->mapping); > > - } else if (vp->insert) { > > - /* > > - * split_vma has split insert from vma, and needs > > - * us to insert it before dropping the locks > > - * (it may either follow vma or precede it). > > - */ > > - vma_iter_store(vmi, vp->insert); > > - mm->map_count++; > > - } > > - > > - if (vp->anon_vma) { > > - anon_vma_interval_tree_post_update_vma(vp->vma); > > - if (vp->adj_next) > > - anon_vma_interval_tree_post_update_vma(vp->adj_next); > > - anon_vma_unlock_write(vp->anon_vma); > > - } > > - > > - if (vp->file) { > > - i_mmap_unlock_write(vp->mapping); > > - uprobe_mmap(vp->vma); > > - > > - if (vp->adj_next) > > - uprobe_mmap(vp->adj_next); > > - } > > - > > - if (vp->remove) { > > -again: > > - vma_mark_detached(vp->remove, true); > > - if (vp->file) { > > - uprobe_munmap(vp->remove, vp->remove->vm_start, > > - vp->remove->vm_end); > > - fput(vp->file); > > - } > > - if (vp->remove->anon_vma) > > - anon_vma_merge(vp->vma, vp->remove); > > - mm->map_count--; > > - mpol_put(vma_policy(vp->remove)); > > - if (!vp->remove2) > > - WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end); > > - vm_area_free(vp->remove); > > - > > - /* > > - * In mprotect's case 6 (see comments on vma_merge), > > - * we are removing both mid and next vmas > > - */ > > - if (vp->remove2) { > > - vp->remove = vp->remove2; > > - vp->remove2 = NULL; > > - goto again; 
> > - } > > - } > > - if (vp->insert && vp->file) > > - uprobe_mmap(vp->insert); > > - validate_mm(mm); > > -} > > - > > /* > > * do_vmi_align_munmap() - munmap the aligned region from @start to @end. > > * @vmi: The vma iterator > > @@ -1261,20 +1378,6 @@ struct vm_area_struct > > return vma_modify(&vmg); > > } > > > > -/* > > - * Attempt to merge a newly mapped VMA with those adjacent to it. The caller > > - * must ensure that [start, end) does not overlap any existing VMA. > > - */ > > -struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg) > > -{ > > - if (!vmg->prev) { > > - vmg->prev = vma_prev(vmg->vmi); > > - vma_iter_set(vmg->vmi, vmg->start); > > - } > > - > > - return vma_merge(vmg); > > -} > > - > > /* > > * Expand vma by delta bytes, potentially merging with an immediately adjacent > > * VMA with identical properties. > > @@ -1297,8 +1400,7 @@ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, > > .anon_name = anon_vma_name(vma), > > }; > > > > - /* vma is specified as prev, so case 1 or 2 will apply. 
*/ > > - return vma_merge(&vmg); > > + return vma_merge_new_vma(&vmg); > > } > > > > void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb) > > @@ -1399,24 +1501,40 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, > > struct vm_area_struct *vma = *vmap; > > unsigned long vma_start = vma->vm_start; > > struct mm_struct *mm = vma->vm_mm; > > - struct vm_area_struct *new_vma, *prev; > > + struct vm_area_struct *new_vma; > > bool faulted_in_anon_vma = true; > > VMA_ITERATOR(vmi, mm, addr); > > + struct vma_merge_struct vmg = { > > + .vmi = &vmi, > > + .start = addr, > > + .end = addr + len, > > + .flags = vma->vm_flags, > > + .pgoff = pgoff, > > + .file = vma->vm_file, > > + .anon_vma = vma->anon_vma, > > + .policy = vma_policy(vma), > > + .uffd_ctx = vma->vm_userfaultfd_ctx, > > + .anon_name = anon_vma_name(vma), > > + }; > > > > /* > > * If anonymous vma has not yet been faulted, update new pgoff > > * to match new location, to increase its chance of merging. > > */ > > if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) { > > - pgoff = addr >> PAGE_SHIFT; > > + pgoff = vmg.pgoff = addr >> PAGE_SHIFT; > > faulted_in_anon_vma = false; > > } > > > > - new_vma = find_vma_prev(mm, addr, &prev); > > + new_vma = find_vma_prev(mm, addr, &vmg.prev); > > if (new_vma && new_vma->vm_start < addr + len) > > return NULL; /* should never get here */ > > > > - new_vma = vma_merge_new_vma_wrapper(&vmi, prev, vma, addr, addr + len, pgoff); > > + vmg.next = vma_next(&vmi); > > + vma_prev(&vmi); > > + > > + new_vma = vma_merge_new_vma(&vmg); > > + > > if (new_vma) { > > /* > > * Source vma may have been merged into new_vma > > diff --git a/mm/vma.h b/mm/vma.h > > index 50459f9e4c7f..bbb173053f34 100644 > > --- a/mm/vma.h > > +++ b/mm/vma.h > > @@ -55,17 +55,6 @@ void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma); > > /* Required for expand_downwards(). 
*/ > > void anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma); > > > > -/* Required for do_brk_flags(). */ > > -void vma_prepare(struct vma_prepare *vp); > > - > > -/* Required for do_brk_flags(). */ > > -void init_vma_prep(struct vma_prepare *vp, > > - struct vm_area_struct *vma); > > - > > -/* Required for do_brk_flags(). */ > > -void vma_complete(struct vma_prepare *vp, > > - struct vma_iterator *vmi, struct mm_struct *mm); > > - > > int vma_expand(struct vma_merge_struct *vmg); > > int vma_shrink(struct vma_merge_struct *vmg); > > > > @@ -85,20 +74,6 @@ void unmap_region(struct mm_struct *mm, struct ma_state *mas, > > struct vm_area_struct *next, unsigned long start, > > unsigned long end, unsigned long tree_end, bool mm_wr_locked); > > > > -/* > > - * Can we merge the VMA described by vmg into the following VMA vmg->next? > > - * > > - * Required by mmap_region(). > > - */ > > -bool can_vma_merge_before(struct vma_merge_struct *vmg); > > - > > -/* > > - * Can we merge the VMA described by vmg into the preceding VMA vmg->prev? > > - * > > - * Required by mmap_region() and do_brk_flags(). > > - */ > > -bool can_vma_merge_after(struct vma_merge_struct *vmg); > > - > > /* We are about to modify the VMA's flags. */ > > struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, > > struct vm_area_struct *prev, > > @@ -133,31 +108,7 @@ struct vm_area_struct > > unsigned long new_flags, > > struct vm_userfaultfd_ctx new_ctx); > > > > -struct vm_area_struct > > -*vma_merge_new_vma(struct vma_merge_struct *vmg); > > - > > -/* Temporary convenience wrapper. 
*/ > > -static inline struct vm_area_struct > > -*vma_merge_new_vma_wrapper(struct vma_iterator *vmi, struct vm_area_struct *prev, > > - struct vm_area_struct *vma, unsigned long start, > > - unsigned long end, pgoff_t pgoff) > > -{ > > - struct vma_merge_struct vmg = { > > - .vmi = vmi, > > - .prev = prev, > > - .start = start, > > - .end = end, > > - .flags = vma->vm_flags, > > - .file = vma->vm_file, > > - .anon_vma = vma->anon_vma, > > - .pgoff = pgoff, > > - .policy = vma_policy(vma), > > - .uffd_ctx = vma->vm_userfaultfd_ctx, > > - .anon_name = anon_vma_name(vma), > > - }; > > - > > - return vma_merge_new_vma(&vmg); > > -} > > +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg); > > > > /* > > * Temporary wrapper around vma_merge() so we can have a common interface for > > diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h > > index 40797a819d3d..a39a734282d0 100644 > > --- a/tools/testing/vma/vma_internal.h > > +++ b/tools/testing/vma/vma_internal.h > > @@ -709,6 +709,12 @@ static inline void vma_iter_free(struct vma_iterator *vmi) > > mas_destroy(&vmi->mas); > > } > > > > +static inline > > +struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi) > > +{ > > + return mas_next_range(&vmi->mas, ULONG_MAX); > > +} > > + > > static inline void vm_acct_memory(long pages) > > { > > } > > -- > > 2.45.2 > >
diff --git a/mm/mmap.c b/mm/mmap.c index f6593a81f73d..c03f50f46396 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1363,8 +1363,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, { struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; - struct vm_area_struct *next, *prev, *merge; - pgoff_t pglen = len >> PAGE_SHIFT; + struct vm_area_struct *merge; unsigned long charged = 0; unsigned long end = addr + len; bool writable_file_mapping = false; @@ -1411,44 +1410,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr, vm_flags |= VM_ACCOUNT; } - next = vmg.next = vma_next(&vmi); - prev = vmg.prev = vma_prev(&vmi); - if (vm_flags & VM_SPECIAL) { - if (prev) - vma_iter_next_range(&vmi); - goto cannot_expand; - } - - /* Attempt to expand an old mapping */ - /* Check next */ - if (next && next->vm_start == end && can_vma_merge_before(&vmg)) { - /* We can adjust this as can_vma_merge_after() doesn't touch */ - vmg.end = next->vm_end; - vma = vmg.vma = next; - vmg.pgoff = next->vm_pgoff - pglen; - - /* We may merge our NULL anon_vma with non-NULL in next. */ - vmg.anon_vma = vma->anon_vma; - } - - /* Check prev */ - if (prev && prev->vm_end == addr && can_vma_merge_after(&vmg)) { - vmg.start = prev->vm_start; - vma = vmg.vma = prev; - vmg.pgoff = prev->vm_pgoff; - } else if (prev) { - vma_iter_next_range(&vmi); - } - - /* Actually expand, if possible */ - if (vma && !vma_expand(&vmg)) { - khugepaged_enter_vma(vma, vm_flags); + vma = vma_merge_new_vma(&vmg); + if (vma) goto expanded; - } - - if (vma == prev) - vma_iter_set(&vmi, addr); -cannot_expand: /* * Determine the object being mapped and call the appropriate @@ -1493,10 +1457,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * If vm_flags changed after call_mmap(), we should try merge * vma again as we may succeed this time. 
*/ - if (unlikely(vm_flags != vma->vm_flags && prev)) { - merge = vma_merge_new_vma_wrapper(&vmi, prev, vma, - vma->vm_start, vma->vm_end, - vma->vm_pgoff); + if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) { + merge = vma_merge_new_vma(&vmg); + if (merge) { /* * ->mmap() can change vma->vm_file and fput @@ -1596,7 +1559,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, vma_iter_set(&vmi, vma->vm_end); /* Undo any partial mapping done by a device driver. */ - unmap_region(mm, &vmi.mas, vma, prev, next, vma->vm_start, + unmap_region(mm, &vmi.mas, vma, vmg.prev, vmg.next, vma->vm_start, vma->vm_end, vma->vm_end, true); } if (writable_file_mapping) @@ -1773,7 +1736,6 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, unsigned long len, unsigned long flags) { struct mm_struct *mm = current->mm; - struct vma_prepare vp; /* * Check against address space limits by the changed size @@ -1795,29 +1757,22 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, */ if (vma && vma->vm_end == addr) { struct vma_merge_struct vmg = { + .vmi = vmi, .prev = vma, + .next = NULL, + .start = addr, + .end = addr + len, .flags = flags, .pgoff = addr >> PAGE_SHIFT, + .file = vma->vm_file, + .anon_vma = vma->anon_vma, + .policy = vma_policy(vma), + .uffd_ctx = vma->vm_userfaultfd_ctx, + .anon_name = anon_vma_name(vma), }; - if (can_vma_merge_after(&vmg)) { - vma_iter_config(vmi, vma->vm_start, addr + len); - if (vma_iter_prealloc(vmi, vma)) - goto unacct_fail; - - vma_start_write(vma); - - init_vma_prep(&vp, vma); - vma_prepare(&vp); - vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0); - vma->vm_end = addr + len; - vm_flags_set(vma, VM_SOFTDIRTY); - vma_iter_store(vmi, vma); - - vma_complete(&vp, vmi, mm); - khugepaged_enter_vma(vma, flags); + if (vma_merge_new_vma(&vmg)) goto out; - } } if (vma) diff --git a/mm/vma.c b/mm/vma.c index 55615392e8d2..a404cf718f9e 100644 --- a/mm/vma.c 
+++ b/mm/vma.c @@ -97,8 +97,7 @@ static void init_multi_vma_prep(struct vma_prepare *vp, * * We assume the vma may be removed as part of the merge. */ -bool -can_vma_merge_before(struct vma_merge_struct *vmg) +static bool can_vma_merge_before(struct vma_merge_struct *vmg) { pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start); @@ -120,7 +119,7 @@ can_vma_merge_before(struct vma_merge_struct *vmg) * * We assume that vma is not removed as part of the merge. */ -bool can_vma_merge_after(struct vma_merge_struct *vmg) +static bool can_vma_merge_after(struct vma_merge_struct *vmg) { if (is_mergeable_vma(vmg, false) && is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) { @@ -130,6 +129,164 @@ bool can_vma_merge_after(struct vma_merge_struct *vmg) return false; } +static void __vma_link_file(struct vm_area_struct *vma, + struct address_space *mapping) +{ + if (vma_is_shared_maywrite(vma)) + mapping_allow_writable(mapping); + + flush_dcache_mmap_lock(mapping); + vma_interval_tree_insert(vma, &mapping->i_mmap); + flush_dcache_mmap_unlock(mapping); +} + +/* + * Requires inode->i_mapping->i_mmap_rwsem + */ +static void __remove_shared_vm_struct(struct vm_area_struct *vma, + struct address_space *mapping) +{ + if (vma_is_shared_maywrite(vma)) + mapping_unmap_writable(mapping); + + flush_dcache_mmap_lock(mapping); + vma_interval_tree_remove(vma, &mapping->i_mmap); + flush_dcache_mmap_unlock(mapping); +} + +/* + * vma_prepare() - Helper function for handling locking VMAs prior to altering + * @vp: The initialized vma_prepare struct + */ +static void vma_prepare(struct vma_prepare *vp) +{ + if (vp->file) { + uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end); + + if (vp->adj_next) + uprobe_munmap(vp->adj_next, vp->adj_next->vm_start, + vp->adj_next->vm_end); + + i_mmap_lock_write(vp->mapping); + if (vp->insert && vp->insert->vm_file) { + /* + * Put into interval tree now, so instantiated pages + * are visible to arm/parisc __flush_dcache_page + * throughout; 
but we cannot insert into address + * space until vma start or end is updated. + */ + __vma_link_file(vp->insert, + vp->insert->vm_file->f_mapping); + } + } + + if (vp->anon_vma) { + anon_vma_lock_write(vp->anon_vma); + anon_vma_interval_tree_pre_update_vma(vp->vma); + if (vp->adj_next) + anon_vma_interval_tree_pre_update_vma(vp->adj_next); + } + + if (vp->file) { + flush_dcache_mmap_lock(vp->mapping); + vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap); + if (vp->adj_next) + vma_interval_tree_remove(vp->adj_next, + &vp->mapping->i_mmap); + } + +} + +/* + * vma_complete- Helper function for handling the unlocking after altering VMAs, + * or for inserting a VMA. + * + * @vp: The vma_prepare struct + * @vmi: The vma iterator + * @mm: The mm_struct + */ +static void vma_complete(struct vma_prepare *vp, + struct vma_iterator *vmi, struct mm_struct *mm) +{ + if (vp->file) { + if (vp->adj_next) + vma_interval_tree_insert(vp->adj_next, + &vp->mapping->i_mmap); + vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap); + flush_dcache_mmap_unlock(vp->mapping); + } + + if (vp->remove && vp->file) { + __remove_shared_vm_struct(vp->remove, vp->mapping); + if (vp->remove2) + __remove_shared_vm_struct(vp->remove2, vp->mapping); + } else if (vp->insert) { + /* + * split_vma has split insert from vma, and needs + * us to insert it before dropping the locks + * (it may either follow vma or precede it). 
+ */ + vma_iter_store(vmi, vp->insert); + mm->map_count++; + } + + if (vp->anon_vma) { + anon_vma_interval_tree_post_update_vma(vp->vma); + if (vp->adj_next) + anon_vma_interval_tree_post_update_vma(vp->adj_next); + anon_vma_unlock_write(vp->anon_vma); + } + + if (vp->file) { + i_mmap_unlock_write(vp->mapping); + uprobe_mmap(vp->vma); + + if (vp->adj_next) + uprobe_mmap(vp->adj_next); + } + + if (vp->remove) { +again: + vma_mark_detached(vp->remove, true); + if (vp->file) { + uprobe_munmap(vp->remove, vp->remove->vm_start, + vp->remove->vm_end); + fput(vp->file); + } + if (vp->remove->anon_vma) + anon_vma_merge(vp->vma, vp->remove); + mm->map_count--; + mpol_put(vma_policy(vp->remove)); + if (!vp->remove2) + WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end); + vm_area_free(vp->remove); + + /* + * In mprotect's case 6 (see comments on vma_merge), + * we are removing both mid and next vmas + */ + if (vp->remove2) { + vp->remove = vp->remove2; + vp->remove2 = NULL; + goto again; + } + } + if (vp->insert && vp->file) + uprobe_mmap(vp->insert); + validate_mm(mm); +} + +/* + * init_vma_prep() - Initializer wrapper for vma_prepare struct + * @vp: The vma_prepare struct + * @vma: The vma that will be altered once locked + */ +static void init_vma_prep(struct vma_prepare *vp, + struct vm_area_struct *vma) +{ + init_multi_vma_prep(vp, vma, NULL, NULL, NULL); +} + /* * Close a vm structure and free it. 
*/ @@ -292,31 +449,6 @@ static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas) vm_unacct_memory(nr_accounted); } -/* - * init_vma_prep() - Initializer wrapper for vma_prepare struct - * @vp: The vma_prepare struct - * @vma: The vma that will be altered once locked - */ -void init_vma_prep(struct vma_prepare *vp, - struct vm_area_struct *vma) -{ - init_multi_vma_prep(vp, vma, NULL, NULL, NULL); -} - -/* - * Requires inode->i_mapping->i_mmap_rwsem - */ -static void __remove_shared_vm_struct(struct vm_area_struct *vma, - struct address_space *mapping) -{ - if (vma_is_shared_maywrite(vma)) - mapping_unmap_writable(mapping); - - flush_dcache_mmap_lock(mapping); - vma_interval_tree_remove(vma, &mapping->i_mmap); - flush_dcache_mmap_unlock(mapping); -} - /* * vma has some anon_vma assigned, and is already inserted on that * anon_vma's interval trees. @@ -349,60 +481,6 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); } -static void __vma_link_file(struct vm_area_struct *vma, - struct address_space *mapping) -{ - if (vma_is_shared_maywrite(vma)) - mapping_allow_writable(mapping); - - flush_dcache_mmap_lock(mapping); - vma_interval_tree_insert(vma, &mapping->i_mmap); - flush_dcache_mmap_unlock(mapping); -} - -/* - * vma_prepare() - Helper function for handling locking VMAs prior to altering - * @vp: The initialized vma_prepare struct - */ -void vma_prepare(struct vma_prepare *vp) -{ - if (vp->file) { - uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end); - - if (vp->adj_next) - uprobe_munmap(vp->adj_next, vp->adj_next->vm_start, - vp->adj_next->vm_end); - - i_mmap_lock_write(vp->mapping); - if (vp->insert && vp->insert->vm_file) { - /* - * Put into interval tree now, so instantiated pages - * are visible to arm/parisc __flush_dcache_page - * throughout; but we cannot insert into address - * space until vma start or end is updated. 
- */ - __vma_link_file(vp->insert, - vp->insert->vm_file->f_mapping); - } - } - - if (vp->anon_vma) { - anon_vma_lock_write(vp->anon_vma); - anon_vma_interval_tree_pre_update_vma(vp->vma); - if (vp->adj_next) - anon_vma_interval_tree_pre_update_vma(vp->adj_next); - } - - if (vp->file) { - flush_dcache_mmap_lock(vp->mapping); - vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap); - if (vp->adj_next) - vma_interval_tree_remove(vp->adj_next, - &vp->mapping->i_mmap); - } - -} - /* * dup_anon_vma() - Helper function to duplicate anon_vma * @dst: The destination VMA @@ -486,6 +564,120 @@ void validate_mm(struct mm_struct *mm) } #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */ +/* + * vma_merge_new_vma - Attempt to merge a new VMA into address space + * + * @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end + * (exclusive), which we try to merge with any adjacent VMAs if possible. + * + * We are about to add a VMA to the address space starting at @vmg->start and + * ending at @vmg->end. There are three different possible scenarios: + * + * 1. There is a VMA with identical properties immediately adjacent to the + * proposed new VMA [@vmg->start, @vmg->end) either before or after it - + * EXPAND that VMA: + * + * Proposed: |-----| or |-----| + * Existing: |----| |----| + * + * 2. There are VMAs with identical properties immediately adjacent to the + * proposed new VMA [@vmg->start, @vmg->end) both before AND after it - + * EXPAND the former and REMOVE the latter: + * + * Proposed: |-----| + * Existing: |----| |----| + * + * 3. There are no VMAs immediately adjacent to the proposed new VMA or those + * VMAs do not have identical attributes - NO MERGE POSSIBLE. + * + * In instances where we can merge, this function returns the expanded VMA which + * will have its range adjusted accordingly and the underlying maple tree also + * adjusted. + * + * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer + * to the VMA we expanded. 
+ * + * This function also adjusts @vmg to provide @vmg->prev and @vmg->next if + * neither already specified, and adjusts [@vmg->start, @vmg->end) to span the + * expanded range. + * + * ASSUMPTIONS: + * - The caller must hold a WRITE lock on the mm_struct->mmap_lock. + * - The caller must have determined that [@vmg->start, @vmg->end) is empty. + */ +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg) +{ + bool is_special = vmg->flags & VM_SPECIAL; + struct vm_area_struct *prev = vmg->prev; + struct vm_area_struct *next = vmg->next; + unsigned long start = vmg->start; + unsigned long end = vmg->end; + pgoff_t pgoff = vmg->pgoff; + pgoff_t pglen = PHYS_PFN(end - start); + + VM_WARN_ON(vmg->vma); + + if (!prev && !next) { + /* + * Since the caller must have determined that the requested + * range is empty, vmg->vmi will be left pointing at the VMA + * immediately prior. + */ + next = vmg->next = vma_next(vmg->vmi); + prev = vmg->prev = vma_prev(vmg->vmi); + + /* Avoid maple tree re-walk. */ + if (is_special && prev) + vma_iter_next_range(vmg->vmi); + } + + /* If special mapping or no adjacent VMAs, nothing to merge. */ + if (is_special || (!prev && !next)) + return NULL; + + /* If we can merge with the following VMA, adjust vmg accordingly. */ + if (next && next->vm_start == end && can_vma_merge_before(vmg)) { + /* + * We can adjust this here as can_vma_merge_after() doesn't + * touch vmg->end. + */ + vmg->end = next->vm_end; + vmg->vma = next; + vmg->pgoff = next->vm_pgoff - pglen; + + vmg->anon_vma = next->anon_vma; + } + + /* If we can merge with the previous VMA, adjust vmg accordingly. */ + if (prev && prev->vm_end == start && can_vma_merge_after(vmg)) { + vmg->start = prev->vm_start; + vmg->vma = prev; + vmg->pgoff = prev->vm_pgoff; + } else if (prev) { + vma_iter_next_range(vmg->vmi); + } + + /* + * Now try to expand adjacent VMA(s). This takes care of removing the + * following VMA if we have VMAs on both sides. 
+ */ + if (vmg->vma && !vma_expand(vmg)) { + khugepaged_enter_vma(vmg->vma, vmg->flags); + return vmg->vma; + } + + /* If expansion failed, reset state. Allows us to retry merge later. */ + vmg->vma = NULL; + vmg->anon_vma = NULL; + vmg->start = start; + vmg->end = end; + vmg->pgoff = pgoff; + if (vmg->vma == prev) + vma_iter_set(vmg->vmi, start); + + return NULL; +} + /* * vma_expand - Expand an existing VMA * @@ -496,7 +688,11 @@ void validate_mm(struct mm_struct *mm) * vmg->next->vm_end. Checking if the vmg->vma can expand and merge with * vmg->next needs to be handled by the caller. * - * Returns: 0 on success + * Returns: 0 on success. + * + * ASSUMPTIONS: + * - The caller must hold a WRITE lock on vmg->vma->mm->mmap_lock. + * - The caller must have set @vmg->prev and @vmg->next. */ int vma_expand(struct vma_merge_struct *vmg) { @@ -576,85 +772,6 @@ int vma_shrink(struct vma_merge_struct *vmg) return 0; } -/* - * vma_complete- Helper function for handling the unlocking after altering VMAs, - * or for inserting a VMA. - * - * @vp: The vma_prepare struct - * @vmi: The vma iterator - * @mm: The mm_struct - */ -void vma_complete(struct vma_prepare *vp, - struct vma_iterator *vmi, struct mm_struct *mm) -{ - if (vp->file) { - if (vp->adj_next) - vma_interval_tree_insert(vp->adj_next, - &vp->mapping->i_mmap); - vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap); - flush_dcache_mmap_unlock(vp->mapping); - } - - if (vp->remove && vp->file) { - __remove_shared_vm_struct(vp->remove, vp->mapping); - if (vp->remove2) - __remove_shared_vm_struct(vp->remove2, vp->mapping); - } else if (vp->insert) { - /* - * split_vma has split insert from vma, and needs - * us to insert it before dropping the locks - * (it may either follow vma or precede it). 
- */ - vma_iter_store(vmi, vp->insert); - mm->map_count++; - } - - if (vp->anon_vma) { - anon_vma_interval_tree_post_update_vma(vp->vma); - if (vp->adj_next) - anon_vma_interval_tree_post_update_vma(vp->adj_next); - anon_vma_unlock_write(vp->anon_vma); - } - - if (vp->file) { - i_mmap_unlock_write(vp->mapping); - uprobe_mmap(vp->vma); - - if (vp->adj_next) - uprobe_mmap(vp->adj_next); - } - - if (vp->remove) { -again: - vma_mark_detached(vp->remove, true); - if (vp->file) { - uprobe_munmap(vp->remove, vp->remove->vm_start, - vp->remove->vm_end); - fput(vp->file); - } - if (vp->remove->anon_vma) - anon_vma_merge(vp->vma, vp->remove); - mm->map_count--; - mpol_put(vma_policy(vp->remove)); - if (!vp->remove2) - WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end); - vm_area_free(vp->remove); - - /* - * In mprotect's case 6 (see comments on vma_merge), - * we are removing both mid and next vmas - */ - if (vp->remove2) { - vp->remove = vp->remove2; - vp->remove2 = NULL; - goto again; - } - } - if (vp->insert && vp->file) - uprobe_mmap(vp->insert); - validate_mm(mm); -} - /* * do_vmi_align_munmap() - munmap the aligned region from @start to @end. * @vmi: The vma iterator @@ -1261,20 +1378,6 @@ struct vm_area_struct return vma_modify(&vmg); } -/* - * Attempt to merge a newly mapped VMA with those adjacent to it. The caller - * must ensure that [start, end) does not overlap any existing VMA. - */ -struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg) -{ - if (!vmg->prev) { - vmg->prev = vma_prev(vmg->vmi); - vma_iter_set(vmg->vmi, vmg->start); - } - - return vma_merge(vmg); -} - /* * Expand vma by delta bytes, potentially merging with an immediately adjacent * VMA with identical properties. @@ -1297,8 +1400,7 @@ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, .anon_name = anon_vma_name(vma), }; - /* vma is specified as prev, so case 1 or 2 will apply. 
*/ - return vma_merge(&vmg); + return vma_merge_new_vma(&vmg); } void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb) @@ -1399,24 +1501,40 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, struct vm_area_struct *vma = *vmap; unsigned long vma_start = vma->vm_start; struct mm_struct *mm = vma->vm_mm; - struct vm_area_struct *new_vma, *prev; + struct vm_area_struct *new_vma; bool faulted_in_anon_vma = true; VMA_ITERATOR(vmi, mm, addr); + struct vma_merge_struct vmg = { + .vmi = &vmi, + .start = addr, + .end = addr + len, + .flags = vma->vm_flags, + .pgoff = pgoff, + .file = vma->vm_file, + .anon_vma = vma->anon_vma, + .policy = vma_policy(vma), + .uffd_ctx = vma->vm_userfaultfd_ctx, + .anon_name = anon_vma_name(vma), + }; /* * If anonymous vma has not yet been faulted, update new pgoff * to match new location, to increase its chance of merging. */ if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) { - pgoff = addr >> PAGE_SHIFT; + pgoff = vmg.pgoff = addr >> PAGE_SHIFT; faulted_in_anon_vma = false; } - new_vma = find_vma_prev(mm, addr, &prev); + new_vma = find_vma_prev(mm, addr, &vmg.prev); if (new_vma && new_vma->vm_start < addr + len) return NULL; /* should never get here */ - new_vma = vma_merge_new_vma_wrapper(&vmi, prev, vma, addr, addr + len, pgoff); + vmg.next = vma_next(&vmi); + vma_prev(&vmi); + + new_vma = vma_merge_new_vma(&vmg); + if (new_vma) { /* * Source vma may have been merged into new_vma diff --git a/mm/vma.h b/mm/vma.h index 50459f9e4c7f..bbb173053f34 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -55,17 +55,6 @@ void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma); /* Required for expand_downwards(). */ void anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma); -/* Required for do_brk_flags(). */ -void vma_prepare(struct vma_prepare *vp); - -/* Required for do_brk_flags(). */ -void init_vma_prep(struct vma_prepare *vp, - struct vm_area_struct *vma); - -/* Required for do_brk_flags(). 
*/ -void vma_complete(struct vma_prepare *vp, - struct vma_iterator *vmi, struct mm_struct *mm); - int vma_expand(struct vma_merge_struct *vmg); int vma_shrink(struct vma_merge_struct *vmg); @@ -85,20 +74,6 @@ void unmap_region(struct mm_struct *mm, struct ma_state *mas, struct vm_area_struct *next, unsigned long start, unsigned long end, unsigned long tree_end, bool mm_wr_locked); -/* - * Can we merge the VMA described by vmg into the following VMA vmg->next? - * - * Required by mmap_region(). - */ -bool can_vma_merge_before(struct vma_merge_struct *vmg); - -/* - * Can we merge the VMA described by vmg into the preceding VMA vmg->prev? - * - * Required by mmap_region() and do_brk_flags(). - */ -bool can_vma_merge_after(struct vma_merge_struct *vmg); - /* We are about to modify the VMA's flags. */ struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, struct vm_area_struct *prev, @@ -133,31 +108,7 @@ struct vm_area_struct unsigned long new_flags, struct vm_userfaultfd_ctx new_ctx); -struct vm_area_struct -*vma_merge_new_vma(struct vma_merge_struct *vmg); - -/* Temporary convenience wrapper. 
*/ -static inline struct vm_area_struct -*vma_merge_new_vma_wrapper(struct vma_iterator *vmi, struct vm_area_struct *prev, - struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgoff_t pgoff) -{ - struct vma_merge_struct vmg = { - .vmi = vmi, - .prev = prev, - .start = start, - .end = end, - .flags = vma->vm_flags, - .file = vma->vm_file, - .anon_vma = vma->anon_vma, - .pgoff = pgoff, - .policy = vma_policy(vma), - .uffd_ctx = vma->vm_userfaultfd_ctx, - .anon_name = anon_vma_name(vma), - }; - - return vma_merge_new_vma(&vmg); -} +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg); /* * Temporary wrapper around vma_merge() so we can have a common interface for diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 40797a819d3d..a39a734282d0 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -709,6 +709,12 @@ static inline void vma_iter_free(struct vma_iterator *vmi) mas_destroy(&vmi->mas); } +static inline +struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi) +{ + return mas_next_range(&vmi->mas, ULONG_MAX); +} + static inline void vm_acct_memory(long pages) { }
In mmap_region() and do_brk_flags() we open-code scenarios where we prefer to use vma_expand() rather than invoke a full vma_merge() operation. Abstract this logic and eliminate all of the open-coding, and also apply the same logic to all cases where we add new VMAs: rather than ultimately invoking vma_merge(), use vma_expand(). We implement this by replacing vma_merge_new_vma() with this newly abstracted logic. Doing so removes duplication and simplifies VMA merging in all such cases, laying the groundwork for us to eliminate the merging of new VMAs in vma_merge() altogether. This makes it far easier to understand what is happening in these cases, avoiding confusion and bugs and allowing for future optimisation. As a result of this change we are also able to make vma_prepare(), init_vma_prep(), vma_complete(), can_vma_merge_before() and can_vma_merge_after() static and internal to vma.c. Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> --- mm/mmap.c | 79 ++--- mm/vma.c | 482 +++++++++++++++++++------------ mm/vma.h | 51 +--- tools/testing/vma/vma_internal.h | 6 + 4 files changed, 324 insertions(+), 294 deletions(-)