@@ -542,29 +542,297 @@ void validate_mm(struct mm_struct *mm)
/* Actually perform the VMA merge operation. */
static int commit_merge(struct vma_merge_struct *vmg,
- struct vm_area_struct *remove)
+ struct vm_area_struct *adjust,
+ struct vm_area_struct *remove,
+ struct vm_area_struct *remove2,
+ long adj_start,
+ bool expanded)
{
struct vma_prepare vp;
- init_multi_vma_prep(&vp, vmg->vma, NULL, remove, NULL);
+ init_multi_vma_prep(&vp, vmg->vma, adjust, remove, remove2);
- /* Note: vma iterator must be pointing to 'start'. */
- vma_iter_config(vmg->vmi, vmg->start, vmg->end);
+ VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma &&
+ vp.anon_vma != adjust->anon_vma);
+
+ if (expanded) {
+ /* Note: vma iterator must be pointing to 'start'. */
+ vma_iter_config(vmg->vmi, vmg->start, vmg->end);
+ } else {
+ vma_iter_config(vmg->vmi, adjust->vm_start + adj_start,
+ adjust->vm_end);
+ }
if (vma_iter_prealloc(vmg->vmi, vmg->vma))
return -ENOMEM;
vma_prepare(&vp);
- vma_adjust_trans_huge(vmg->vma, vmg->start, vmg->end, 0);
+ vma_adjust_trans_huge(vmg->vma, vmg->start, vmg->end, adj_start);
vma_set_range(vmg->vma, vmg->start, vmg->end, vmg->pgoff);
- vma_iter_store(vmg->vmi, vmg->vma);
+ if (expanded)
+ vma_iter_store(vmg->vmi, vmg->vma);
+
+ if (adj_start) {
+ adjust->vm_start += adj_start;
+ adjust->vm_pgoff += PHYS_PFN(adj_start);
+ if (adj_start < 0) {
+ WARN_ON(expanded);
+ vma_iter_store(vmg->vmi, adjust);
+ }
+ }
vma_complete(&vp, vmg->vmi, vmg->vma->vm_mm);
return 0;
}
+/*
+ * vma_merge_existing_range - Attempt to merge VMAs based on a VMA having its
+ * attributes modified.
+ *
+ * @vmg: Describes the modifications being made to a VMA and associated
+ * metadata.
+ *
+ * When the attributes of a range within a VMA change, then it might be possible
+ * for immediately adjacent VMAs to be merged into that VMA due to having
+ * identical properties.
+ *
+ * This function checks for the existence of any such mergeable VMAs and updates
+ * the maple tree describing the @vmg->vma->vm_mm address space to account for
+ * this, as well as any VMAs shrunk/expanded/deleted as a result of this merge.
+ *
+ * As part of this operation, if a merge occurs, the @vmg object will have its
+ * vma, start, end, and pgoff fields modified to execute the merge. Subsequent
+ * calls to this function should reset these fields.
+ *
+ * Returns: The merged VMA if merge succeeds, or NULL otherwise.
+ *
+ * ASSUMPTIONS:
+ * - The caller must assign the VMA to be modifed to @vmg->vma.
+ * - The caller must have set @vmg->prev to the previous VMA, if there is one.
+ * - The caller must not set @vmg->next, as we determine this.
+ * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
+ * - vmi must be positioned within [@vmg->vma->vm_start, @vmg->vma->vm_end).
+ */
+static struct vm_area_struct *vma_merge_existing_range(struct vma_merge_struct *vmg)
+{
+ struct vm_area_struct *vma = vmg->vma;
+ struct vm_area_struct *prev = vmg->prev;
+ struct vm_area_struct *next, *res;
+ struct vm_area_struct *anon_dup = NULL;
+ struct vm_area_struct *adjust = NULL;
+ unsigned long start = vmg->start;
+ unsigned long end = vmg->end;
+ bool left_side = vma && start == vma->vm_start;
+ bool right_side = vma && end == vma->vm_end;
+ bool merge_both = false;
+ int err = 0;
+ long adj_start = 0;
+ bool merge_will_delete_vma, merge_will_delete_next;
+ bool merge_left, merge_right;
+ bool expanded;
+
+ mmap_assert_write_locked(vmg->mm);
+ VM_WARN_ON(!vma); /* We are modifying a VMA, so caller must specify. */
+ VM_WARN_ON(vmg->next); /* We set this. */
+ VM_WARN_ON(prev && start <= prev->vm_start);
+ VM_WARN_ON(start >= end);
+ /*
+ * If vma == prev, then we are offset into a VMA. Otherwise, if we are
+ * not, we must span a portion of the VMA.
+ */
+ VM_WARN_ON(vma && ((vma != prev && vmg->start != vma->vm_start) ||
+ vmg->end > vma->vm_end));
+ /* The vmi must be positioned within vmg->vma. */
+ VM_WARN_ON(vma && !(vma_iter_addr(vmg->vmi) >= vma->vm_start &&
+ vma_iter_addr(vmg->vmi) < vma->vm_end));
+
+ vmg->state = VMA_MERGE_NOMERGE;
+
+ /*
+ * If a special mapping or neither at the furthermost left or right side
+ * of the VMA, then we have no chance of merging and should abort.
+ *
+ * We later require that vma->vm_flags == vm_flags, so this tests
+ * vma->vm_flags & VM_SPECIAL, too.
+ */
+ if (vmg->flags & VM_SPECIAL || (!left_side && !right_side))
+ return NULL;
+
+ if (left_side && prev && prev->vm_end == start && can_vma_merge_after(vmg)) {
+ merge_left = true;
+ vma_prev(vmg->vmi);
+ } else {
+ merge_left = false;
+ }
+
+ if (right_side) {
+ next = vmg->next = vma_lookup(vma->vm_mm, end);
+
+ /*
+ * We can merge right if there is a subsequent VMA, if it is
+ * immediately adjacent, and if it is compatible with vma.
+ */
+ merge_right = next && end == next->vm_start &&
+ can_vma_merge_before(vmg);
+
+ /*
+ * We can only merge both if the anonymous VMA of the previous
+ * VMA is compatible with the anonymous VMA of the subsequent
+ * VMA.
+ *
+ * Otherwise, we default to merging only the left.
+ */
+ if (merge_left && merge_right)
+ merge_right = merge_both =
+ is_mergeable_anon_vma(prev->anon_vma,
+ next->anon_vma, NULL);
+ } else {
+ merge_right = false;
+ next = NULL;
+ }
+
+ /* If we have nothing to merge, abort. */
+ if (!merge_left && !merge_right)
+ return NULL;
+
+ /* If we span the entire VMA, a merge implies it will be deleted. */
+ merge_will_delete_vma = left_side && right_side;
+ /*
+ * If we merge both VMAs, then next is also deleted. This implies
+ * merge_will_delete_vma also.
+ */
+ merge_will_delete_next = merge_both;
+
+ /* No matter what happens, we will be adjusting vma. */
+ vma_start_write(vma);
+
+ if (merge_left)
+ vma_start_write(prev);
+
+ if (merge_right)
+ vma_start_write(next);
+
+ if (merge_both) {
+ /*
+ * |<----->|
+ * |-------*********-------|
+ * prev vma next
+ * extend delete delete
+ */
+
+ vmg->vma = prev;
+ vmg->start = prev->vm_start;
+ vmg->end = next->vm_end;
+ vmg->pgoff = prev->vm_pgoff;
+
+ /*
+ * We already ensured anon_vma compatibility above, so now it's
+ * simply a case of, if prev has no anon_vma object, which of
+ * next or vma contains the anon_vma we must duplicate.
+ */
+ err = dup_anon_vma(prev, next->anon_vma ? next : vma, &anon_dup);
+ } else if (merge_left) {
+ /*
+ * |<----->| OR
+ * |<--------->|
+ * |-------*************
+ * prev vma
+ * extend shrink/delete
+ */
+
+ vmg->vma = prev;
+ vmg->start = prev->vm_start;
+ vmg->pgoff = prev->vm_pgoff;
+
+ if (merge_will_delete_vma) {
+ /*
+ * can_vma_merge_after() assumed we would not be
+ * removing vma, so it skipped the check for
+ * vm_ops->close, but we are removing vma.
+ */
+ if (vma->vm_ops && vma->vm_ops->close)
+ err = -EINVAL;
+ } else {
+ adjust = vma;
+ adj_start = vmg->end - vma->vm_start;
+ }
+
+ if (!err)
+ err = dup_anon_vma(prev, vma, &anon_dup);
+ } else { /* merge_right */
+ /*
+ * |<----->| OR
+ * |<--------->|
+ * *************-------|
+ * vma next
+ * shrink/delete extend
+ */
+
+ pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
+
+ VM_WARN_ON(!merge_right);
+ /* If we are offset into a VMA, then prev must be vma. */
+ VM_WARN_ON(vmg->start > vma->vm_start && prev && vma != prev);
+
+ if (merge_will_delete_vma) {
+ vmg->vma = next;
+ vmg->end = next->vm_end;
+ vmg->pgoff = next->vm_pgoff - pglen;
+ } else {
+ /*
+ * We shrink vma and expand next.
+ *
+ * IMPORTANT: This is the ONLY case where the final
+ * merged VMA is NOT vmg->vma, but rather vmg->next.
+ */
+
+ vmg->start = vma->vm_start;
+ vmg->end = start;
+ vmg->pgoff = vma->vm_pgoff;
+
+ adjust = next;
+ adj_start = -(vma->vm_end - start);
+ }
+
+ err = dup_anon_vma(next, vma, &anon_dup);
+ }
+
+ if (err)
+ goto abort;
+
+ /*
+ * In nearly all cases, we expand vmg->vma. There is one exception -
+ * merge_right where we partially span the VMA. In this case we shrink
+ * the end of vmg->vma and adjust the start of vmg->next accordingly.
+ */
+ expanded = !merge_right || merge_will_delete_vma;
+
+ if (commit_merge(vmg, adjust,
+ merge_will_delete_vma ? vma : NULL,
+ merge_will_delete_next ? next : NULL,
+ adj_start, expanded)) {
+ if (anon_dup)
+ unlink_anon_vmas(anon_dup);
+
+ vmg->state = VMA_MERGE_ERROR_NOMEM;
+ return NULL;
+ }
+
+ res = merge_left ? prev : next;
+ khugepaged_enter_vma(res, vmg->flags);
+
+ vmg->state = VMA_MERGE_SUCCESS;
+ return res;
+
+abort:
+ vma_iter_set(vmg->vmi, start);
+ vma_iter_load(vmg->vmi);
+ vmg->state = VMA_MERGE_ERROR_NOMEM;
+ return NULL;
+}
+
/*
* vma_merge_new_range - Attempt to merge a new VMA into address space
*
@@ -717,7 +985,7 @@ int vma_expand(struct vma_merge_struct *vmg)
/* Only handles expanding */
VM_WARN_ON(vma->vm_start < vmg->start || vma->vm_end > vmg->end);
- if (commit_merge(vmg, remove_next ? next : NULL))
+ if (commit_merge(vmg, NULL, remove_next ? next : NULL, NULL, 0, true))
goto nomem;
return 0;
@@ -1092,249 +1360,6 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock);
}
-/*
- * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name),
- * figure out whether that can be merged with its predecessor or its
- * successor. Or both (it neatly fills a hole).
- *
- * In most cases - when called for mmap, brk or mremap - [addr,end) is
- * certain not to be mapped by the time vma_merge is called; but when
- * called for mprotect, it is certain to be already mapped (either at
- * an offset within prev, or at the start of next), and the flags of
- * this area are about to be changed to vm_flags - and the no-change
- * case has already been eliminated.
- *
- * The following mprotect cases have to be considered, where **** is
- * the area passed down from mprotect_fixup, never extending beyond one
- * vma, PPPP is the previous vma, CCCC is a concurrent vma that starts
- * at the same address as **** and is of the same or larger span, and
- * NNNN the next vma after ****:
- *
- * **** **** ****
- * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPCCCCCC
- * cannot merge might become might become
- * PPNNNNNNNNNN PPPPPPPPPPCC
- * mmap, brk or case 4 below case 5 below
- * mremap move:
- * **** ****
- * PPPP NNNN PPPPCCCCNNNN
- * might become might become
- * PPPPPPPPPPPP 1 or PPPPPPPPPPPP 6 or
- * PPPPPPPPNNNN 2 or PPPPPPPPNNNN 7 or
- * PPPPNNNNNNNN 3 PPPPNNNNNNNN 8
- *
- * It is important for case 8 that the vma CCCC overlapping the
- * region **** is never going to extended over NNNN. Instead NNNN must
- * be extended in region **** and CCCC must be removed. This way in
- * all cases where vma_merge succeeds, the moment vma_merge drops the
- * rmap_locks, the properties of the merged vma will be already
- * correct for the whole merged range. Some of those properties like
- * vm_page_prot/vm_flags may be accessed by rmap_walks and they must
- * be correct for the whole merged range immediately after the
- * rmap_locks are released. Otherwise if NNNN would be removed and
- * CCCC would be extended over the NNNN range, remove_migration_ptes
- * or other rmap walkers (if working on addresses beyond the "end"
- * parameter) may establish ptes with the wrong permissions of CCCC
- * instead of the right permissions of NNNN.
- *
- * In the code below:
- * PPPP is represented by *prev
- * CCCC is represented by *curr or not represented at all (NULL)
- * NNNN is represented by *next or not represented at all (NULL)
- * **** is not represented - it will be merged and the vma containing the
- * area is returned, or the function will return NULL
- */
-static struct vm_area_struct *vma_merge(struct vma_merge_struct *vmg)
-{
- struct mm_struct *mm = vmg->mm;
- struct vm_area_struct *prev = vmg->prev;
- struct vm_area_struct *curr, *next, *res;
- struct vm_area_struct *vma, *adjust, *remove, *remove2;
- struct vm_area_struct *anon_dup = NULL;
- struct vma_prepare vp;
- pgoff_t vma_pgoff;
- int err = 0;
- bool merge_prev = false;
- bool merge_next = false;
- bool vma_expanded = false;
- unsigned long addr = vmg->start;
- unsigned long end = vmg->end;
- unsigned long vma_start = addr;
- unsigned long vma_end = end;
- pgoff_t pglen = PHYS_PFN(end - addr);
- long adj_start = 0;
-
- vmg->state = VMA_MERGE_NOMERGE;
-
- /*
- * We later require that vma->vm_flags == vm_flags,
- * so this tests vma->vm_flags & VM_SPECIAL, too.
- */
- if (vmg->flags & VM_SPECIAL)
- return NULL;
-
- /* Does the input range span an existing VMA? (cases 5 - 8) */
- curr = find_vma_intersection(mm, prev ? prev->vm_end : 0, end);
-
- if (!curr || /* cases 1 - 4 */
- end == curr->vm_end) /* cases 6 - 8, adjacent VMA */
- next = vmg->next = vma_lookup(mm, end);
- else
- next = vmg->next = NULL; /* case 5 */
-
- if (prev) {
- vma_start = prev->vm_start;
- vma_pgoff = prev->vm_pgoff;
-
- /* Can we merge the predecessor? */
- if (addr == prev->vm_end && can_vma_merge_after(vmg)) {
- merge_prev = true;
- vma_prev(vmg->vmi);
- }
- }
-
- /* Can we merge the successor? */
- if (next && can_vma_merge_before(vmg)) {
- merge_next = true;
- }
-
- /* Verify some invariant that must be enforced by the caller. */
- VM_WARN_ON(prev && addr <= prev->vm_start);
- VM_WARN_ON(curr && (addr != curr->vm_start || end > curr->vm_end));
- VM_WARN_ON(addr >= end);
-
- if (!merge_prev && !merge_next)
- return NULL; /* Not mergeable. */
-
- if (merge_prev)
- vma_start_write(prev);
-
- res = vma = prev;
- remove = remove2 = adjust = NULL;
-
- /* Can we merge both the predecessor and the successor? */
- if (merge_prev && merge_next &&
- is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) {
- vma_start_write(next);
- remove = next; /* case 1 */
- vma_end = next->vm_end;
- err = dup_anon_vma(prev, next, &anon_dup);
- if (curr) { /* case 6 */
- vma_start_write(curr);
- remove = curr;
- remove2 = next;
- /*
- * Note that the dup_anon_vma below cannot overwrite err
- * since the first caller would do nothing unless next
- * has an anon_vma.
- */
- if (!next->anon_vma)
- err = dup_anon_vma(prev, curr, &anon_dup);
- }
- } else if (merge_prev) { /* case 2 */
- if (curr) {
- vma_start_write(curr);
- if (end == curr->vm_end) { /* case 7 */
- /*
- * can_vma_merge_after() assumed we would not be
- * removing prev vma, so it skipped the check
- * for vm_ops->close, but we are removing curr
- */
- if (curr->vm_ops && curr->vm_ops->close)
- err = -EINVAL;
- remove = curr;
- } else { /* case 5 */
- adjust = curr;
- adj_start = end - curr->vm_start;
- }
- if (!err)
- err = dup_anon_vma(prev, curr, &anon_dup);
- }
- } else { /* merge_next */
- vma_start_write(next);
- res = next;
- if (prev && addr < prev->vm_end) { /* case 4 */
- vma_start_write(prev);
- vma_end = addr;
- adjust = next;
- adj_start = -(prev->vm_end - addr);
- err = dup_anon_vma(next, prev, &anon_dup);
- } else {
- /*
- * Note that cases 3 and 8 are the ONLY ones where prev
- * is permitted to be (but is not necessarily) NULL.
- */
- vma = next; /* case 3 */
- vma_start = addr;
- vma_end = next->vm_end;
- vma_pgoff = next->vm_pgoff - pglen;
- if (curr) { /* case 8 */
- vma_pgoff = curr->vm_pgoff;
- vma_start_write(curr);
- remove = curr;
- err = dup_anon_vma(next, curr, &anon_dup);
- }
- }
- }
-
- /* Error in anon_vma clone. */
- if (err)
- goto anon_vma_fail;
-
- if (vma_start < vma->vm_start || vma_end > vma->vm_end)
- vma_expanded = true;
-
- if (vma_expanded) {
- vma_iter_config(vmg->vmi, vma_start, vma_end);
- } else {
- vma_iter_config(vmg->vmi, adjust->vm_start + adj_start,
- adjust->vm_end);
- }
-
- if (vma_iter_prealloc(vmg->vmi, vma))
- goto prealloc_fail;
-
- init_multi_vma_prep(&vp, vma, adjust, remove, remove2);
- VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma &&
- vp.anon_vma != adjust->anon_vma);
-
- vma_prepare(&vp);
- vma_adjust_trans_huge(vma, vma_start, vma_end, adj_start);
- vma_set_range(vma, vma_start, vma_end, vma_pgoff);
-
- if (vma_expanded)
- vma_iter_store(vmg->vmi, vma);
-
- if (adj_start) {
- adjust->vm_start += adj_start;
- adjust->vm_pgoff += adj_start >> PAGE_SHIFT;
- if (adj_start < 0) {
- WARN_ON(vma_expanded);
- vma_iter_store(vmg->vmi, next);
- }
- }
-
- vma_complete(&vp, vmg->vmi, mm);
- validate_mm(mm);
- khugepaged_enter_vma(res, vmg->flags);
-
- vmg->state = VMA_MERGE_SUCCESS;
- return res;
-
-prealloc_fail:
- vmg->state = VMA_MERGE_ERROR_NOMEM;
- if (anon_dup)
- unlink_anon_vmas(anon_dup);
-
-anon_vma_fail:
- if (err == -ENOMEM)
- vmg->state = VMA_MERGE_ERROR_NOMEM;
-
- vma_iter_set(vmg->vmi, addr);
- vma_iter_load(vmg->vmi);
- return NULL;
-}
-
/*
* We are about to modify one or multiple of a VMA's flags, policy, userfaultfd
* context and anonymous VMA name within the range [start, end).
@@ -1354,7 +1379,7 @@ static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg)
struct vm_area_struct *merged;
/* First, try to merge. */
- merged = vma_merge(vmg);
+ merged = vma_merge_existing_range(vmg);
if (merged)
return merged;
@@ -112,7 +112,7 @@ static struct vm_area_struct *merge_new(struct vma_merge_struct *vmg)
*/
static struct vm_area_struct *merge_existing(struct vma_merge_struct *vmg)
{
- return vma_merge(vmg);
+ return vma_merge_existing_range(vmg);
}
/*
@@ -752,7 +752,12 @@ static bool test_vma_merge_with_close(void)
vmg.vma = vma;
/* Make sure merge does not occur. */
ASSERT_EQ(merge_existing(&vmg), NULL);
- ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE);
+ /*
+ * Initially this is misapprehended as an out of memory report, as the
+ * close() check is handled in the same way as anon_vma duplication
+ * failures, however a subsequent patch resolves this.
+ */
+ ASSERT_EQ(vmg.state, VMA_MERGE_ERROR_NOMEM);
cleanup_mm(&mm, &vmi);
return true;
The existing vma_merge() function is no longer required to handle what were previously referred to as cases 1-3 (i.e. the merging of a new VMA), as this is now handled by vma_merge_new_vma(). Additionally, simplify the convoluted control flow of the original, maintaining identical logic only expressed more clearly and doing away with a complicated set of cases, rather logically examining each possible outcome - merging of both the previous and subsequent VMA, merging of the previous VMA and merging of the subsequent VMA alone. We now utilise the previously implemented commit_merge() function to share logic with vma_expand() de-duplicating code and providing less surface area for bugs and confusion. In order to do so, we adjust this function to accept parameters specific to merging existing ranges. Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> --- mm/vma.c | 527 +++++++++++++++++++++------------------- tools/testing/vma/vma.c | 9 +- 2 files changed, 283 insertions(+), 253 deletions(-)