@@ -24,6 +24,44 @@
struct folio_batch;
+/*
+ * Maintains state across a page table move. The operation assumes both source
+ * and destination VMAs already exist and are specified by the user.
+ *
+ * Partial moves are permitted, but the old and new ranges must each reside
+ * within their respective VMAs.
+ *
+ * The mmap lock must be held in write mode, and VMA write locks must be held
+ * on any VMA that is visible.
+ *
+ * Use the PAGETABLE_MOVE() macro to initialise this struct.
+ *
+ * NOTE: The page table move is performed by reading from [old_addr, old_end),
+ * and old_addr may be updated for better page table alignment, so len_in
+ * represents the length of the range being copied as specified by the user.
+ */
+struct pagetable_move_control {
+ struct vm_area_struct *old; /* Source VMA. */
+ struct vm_area_struct *new; /* Destination VMA. */
+ unsigned long old_addr; /* Address from which the move begins. */
+ unsigned long old_end; /* Exclusive address at which old range ends. */
+ unsigned long new_addr; /* Address to move page tables to. */
+ unsigned long len_in; /* Bytes to remap specified by user. */
+
+ bool need_rmap_locks; /* Do rmap locks need to be taken? */
+ bool for_stack; /* Is this an early temp stack being moved? */
+};
+
+#define PAGETABLE_MOVE(name, old_, new_, old_addr_, new_addr_, len_) \
+ struct pagetable_move_control name = { \
+ .old = old_, \
+ .new = new_, \
+ .old_addr = old_addr_, \
+ .old_end = (old_addr_) + (len_), \
+ .new_addr = new_addr_, \
+ .len_in = len_, \
+ }
+
/*
* The set of flags that only affect watermark checking and reclaim
* behaviour. This is used by the MM to obey the caller constraints
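For reference, a minimal sketch of how a caller is expected to drive the new interface (mirroring the relocate_vma_down() and copy_vma_and_data() hunks below); the wrapper name and arguments here are purely illustrative:

static unsigned long example_move(struct vm_area_struct *old_vma,
				  struct vm_area_struct *new_vma,
				  unsigned long old_addr,
				  unsigned long new_addr,
				  unsigned long len)
{
	/* Initialise the control structure; the bool flags default to false. */
	PAGETABLE_MOVE(pmc, old_vma, new_vma, old_addr, new_addr, len);

	/* Set the optional flags explicitly where required. */
	pmc.need_rmap_locks = false;
	pmc.for_stack = false;

	/* Returns how many bytes of the requested range were moved. */
	return move_page_tables(&pmc);
}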
@@ -1527,10 +1565,7 @@ extern struct list_lru shadow_nodes;
} while (0)
/* mremap.c */
-unsigned long move_page_tables(struct vm_area_struct *vma,
- unsigned long old_addr, struct vm_area_struct *new_vma,
- unsigned long new_addr, unsigned long len,
- bool need_rmap_locks, bool for_stack);
+unsigned long move_page_tables(struct pagetable_move_control *pmc);
#ifdef CONFIG_UNACCEPTED_MEMORY
void accept_page(struct page *page);
@@ -1694,6 +1694,7 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
VMG_STATE(vmg, mm, &vmi, new_start, old_end, 0, vma->vm_pgoff);
struct vm_area_struct *next;
struct mmu_gather tlb;
+ PAGETABLE_MOVE(pmc, vma, vma, old_start, new_start, length);
BUG_ON(new_start > new_end);
@@ -1716,8 +1717,8 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
* move the page tables downwards, on failure we rely on
* process cleanup to remove whatever mess we made.
*/
- if (length != move_page_tables(vma, old_start,
- vma, new_start, length, false, true))
+ pmc.for_stack = true;
+ if (length != move_page_tables(&pmc))
return -ENOMEM;
tlb_gather_mmu(&tlb, mm);
@@ -580,8 +580,9 @@ static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma,
* the VMA that is created to span the source and destination of the move,
* so we make an exception for it.
*/
-static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_align,
- unsigned long mask, bool for_stack)
+static bool can_align_down(struct pagetable_move_control *pmc,
+ struct vm_area_struct *vma, unsigned long addr_to_align,
+ unsigned long mask)
{
unsigned long addr_masked = addr_to_align & mask;
@@ -590,11 +591,11 @@ static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_ali
* of the corresponding VMA, we can't align down or we will destroy part
* of the current mapping.
*/
- if (!for_stack && vma->vm_start != addr_to_align)
+ if (!pmc->for_stack && vma->vm_start != addr_to_align)
return false;
/* In the stack case we explicitly permit in-VMA alignment. */
- if (for_stack && addr_masked >= vma->vm_start)
+ if (pmc->for_stack && addr_masked >= vma->vm_start)
return true;
/*
@@ -604,54 +605,131 @@ static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_ali
return find_vma_intersection(vma->vm_mm, addr_masked, vma->vm_start) == NULL;
}
-/* Opportunistically realign to specified boundary for faster copy. */
-static void try_realign_addr(unsigned long *old_addr, struct vm_area_struct *old_vma,
- unsigned long *new_addr, struct vm_area_struct *new_vma,
- unsigned long mask, bool for_stack)
+/*
+ * Determine if we are in fact able to realign for efficiency to a higher page
+ * table boundary.
+ */
+static bool can_realign_addr(struct pagetable_move_control *pmc,
+ unsigned long pagetable_mask)
{
+ unsigned long align_mask = ~pagetable_mask;
+ unsigned long old_align = pmc->old_addr & align_mask;
+ unsigned long new_align = pmc->new_addr & align_mask;
+ unsigned long pagetable_size = align_mask + 1;
+ unsigned long old_align_next = pagetable_size - old_align;
+
+ /*
+ * We don't want to have to go hunting for VMAs from the end of the old
+ * VMA to the next page table boundary; we also want to make sure the
+ * operation is worthwhile.
+ *
+ * So ensure that we only perform this realignment if the end of the
+ * range being copied reaches or crosses the page table boundary.
+ *
+ * boundary boundary
+ * .<- old_align -> .
+ * . |----------------.-----------|
+ * . | vma . |
+ * . |----------------.-----------|
+ * . <----------------.----------->
+ * . len_in
+ * <------------------------------->
+ * . pagetable_size .
+ * . <---------------->
+ * . old_align_next .
+ */
+ if (pmc->len_in < old_align_next)
+ return false;
+
/* Skip if the addresses are already aligned. */
- if ((*old_addr & ~mask) == 0)
- return;
+ if (old_align == 0)
+ return false;
/* Only realign if the new and old addresses are mutually aligned. */
- if ((*old_addr & ~mask) != (*new_addr & ~mask))
- return;
+ if (old_align != new_align)
+ return false;
/* Ensure realignment doesn't cause overlap with existing mappings. */
- if (!can_align_down(old_vma, *old_addr, mask, for_stack) ||
- !can_align_down(new_vma, *new_addr, mask, for_stack))
+ if (!can_align_down(pmc, pmc->old, pmc->old_addr, pagetable_mask) ||
+ !can_align_down(pmc, pmc->new, pmc->new_addr, pagetable_mask))
+ return false;
+
+ return true;
+}
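As a concrete illustration of the check above, here is a small, self-contained sketch of the same arithmetic using hypothetical addresses and a 2 MiB PMD span (the usual x86-64 configuration with 4 KiB pages); it is not part of the patch:

/* Illustrative only: mirrors the arithmetic performed by can_realign_addr(). */
static bool example_would_consider_realign(void)
{
	unsigned long pagetable_mask = ~0x1fffffUL;	/* stand-in for PMD_MASK */
	unsigned long align_mask = ~pagetable_mask;	/* 0x1fffff */
	unsigned long pagetable_size = align_mask + 1;	/* 0x200000 (2 MiB) */
	unsigned long old_addr = 0x7f0000180000UL;	/* 1.5 MiB into its PMD */
	unsigned long new_addr = 0x7f1000180000UL;	/* mutually aligned with old */
	unsigned long len_in = 0x100000UL;		/* 1 MiB requested */

	unsigned long old_align = old_addr & align_mask;	/* 0x180000 */
	unsigned long new_align = new_addr & align_mask;	/* 0x180000 */
	unsigned long old_align_next = pagetable_size - old_align; /* 0x80000 */

	/*
	 * 1 MiB >= 512 KiB, so the copy reaches the next PMD boundary; both
	 * offsets are non-zero and equal, so realignment is worth considering
	 * (can_align_down() must still approve both VMAs).
	 */
	return len_in >= old_align_next &&
	       old_align != 0 &&
	       old_align == new_align;
}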
+
+/*
+ * Opportunistically realign to specified boundary for faster copy.
+ *
+ * Consider an mremap() of a VMA with page table boundaries as below, and no
+ * preceding VMAs from the lower page table boundary to the start of the VMA,
+ * with the end of the range reaching or crossing the page table boundary.
+ *
+ * boundary boundary
+ * . |----------------.-----------|
+ * . | vma . |
+ * . |----------------.-----------|
+ * . pmc->old_addr . pmc->old_end
+ * . <---------------------------->
+ * . move these page tables
+ *
+ * If we proceed with moving page tables in this scenario, we will have a lot of
+ * work to do traversing old page tables and establishing new ones in the
+ * destination across multiple lower level page tables.
+ *
+ * The idea here is simply to align pmc->old_addr, pmc->new_addr down to the
+ * page table boundary, so we can copy a single page table entry for the
+ * aligned portion of the VMA instead:
+ *
+ * boundary boundary
+ * . |----------------.-----------|
+ * . | vma . |
+ * . |----------------.-----------|
+ * pmc->old_addr . pmc->old_end
+ * <------------------------------------------->
+ * . move these page tables
+ */
+static void try_realign_addr(struct pagetable_move_control *pmc,
+ unsigned long pagetable_mask)
+{
+ if (!can_realign_addr(pmc, pagetable_mask))
return;
- *old_addr = *old_addr & mask;
- *new_addr = *new_addr & mask;
+ /*
+ * Simply align to page table boundaries. Note that we do NOT update the
+ * pmc->old_end value, and since the move_page_tables() operation spans
+ * from [old_addr, old_end) (offsetting new_addr as it is performed),
+ * this changes only the start of the copy, not the end.
+ */
+ pmc->old_addr &= pagetable_mask;
+ pmc->new_addr &= pagetable_mask;
}
-unsigned long move_page_tables(struct vm_area_struct *vma,
- unsigned long old_addr, struct vm_area_struct *new_vma,
- unsigned long new_addr, unsigned long len,
- bool need_rmap_locks, bool for_stack)
+unsigned long move_page_tables(struct pagetable_move_control *pmc)
{
unsigned long extent, old_end;
struct mmu_notifier_range range;
pmd_t *old_pmd, *new_pmd;
pud_t *old_pud, *new_pud;
+ unsigned long old_addr, new_addr;
+ struct vm_area_struct *vma = pmc->old;
- if (!len)
+ if (!pmc->len_in)
return 0;
- old_end = old_addr + len;
-
if (is_vm_hugetlb_page(vma))
- return move_hugetlb_page_tables(vma, new_vma, old_addr,
- new_addr, len);
+ return move_hugetlb_page_tables(pmc->old, pmc->new, pmc->old_addr,
+ pmc->new_addr, pmc->len_in);
+ old_end = pmc->old_end;
/*
* If possible, realign addresses to PMD boundary for faster copy.
* Only realign if the mremap copying hits a PMD boundary.
*/
- if (len >= PMD_SIZE - (old_addr & ~PMD_MASK))
- try_realign_addr(&old_addr, vma, &new_addr, new_vma, PMD_MASK,
- for_stack);
+ try_realign_addr(pmc, PMD_MASK);
+ /* These may have been realigned by try_realign_addr(). */
+ old_addr = pmc->old_addr;
+ new_addr = pmc->new_addr;
flush_cache_range(vma, old_addr, old_end);
mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
@@ -675,12 +753,11 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
if (pud_trans_huge(*old_pud) || pud_devmap(*old_pud)) {
if (extent == HPAGE_PUD_SIZE) {
move_pgt_entry(HPAGE_PUD, vma, old_addr, new_addr,
- old_pud, new_pud, need_rmap_locks);
+ old_pud, new_pud, pmc->need_rmap_locks);
/* We ignore and continue on error? */
continue;
}
} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) {
-
if (move_pgt_entry(NORMAL_PUD, vma, old_addr, new_addr,
old_pud, new_pud, true))
continue;
@@ -698,7 +775,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
pmd_devmap(*old_pmd)) {
if (extent == HPAGE_PMD_SIZE &&
move_pgt_entry(HPAGE_PMD, vma, old_addr, new_addr,
- old_pmd, new_pmd, need_rmap_locks))
+ old_pmd, new_pmd, pmc->need_rmap_locks))
continue;
split_huge_pmd(vma, old_pmd, old_addr);
} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) &&
@@ -713,10 +790,10 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
}
if (pmd_none(*old_pmd))
continue;
- if (pte_alloc(new_vma->vm_mm, new_pmd))
+ if (pte_alloc(pmc->new->vm_mm, new_pmd))
break;
if (move_ptes(vma, old_pmd, old_addr, old_addr + extent,
- new_vma, new_pmd, new_addr, need_rmap_locks) < 0)
+ pmc->new, new_pmd, new_addr, pmc->need_rmap_locks) < 0)
goto again;
}
@@ -726,10 +803,10 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
* Prevent negative return values when {old,new}_addr was realigned
* but we broke out of the above loop for the first PMD itself.
*/
- if (old_addr < old_end - len)
+ if (old_addr < old_end - pmc->len_in)
return 0;
- return len + old_addr - old_end; /* how much done */
+ return pmc->len_in + old_addr - old_end; /* how much done */
}
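To make the return-value accounting concrete, here is a short worked example using the same hypothetical values as before (len_in = 0x100000, old_end = 0x7f0000280000, realigned old_addr = 0x7f0000000000):

/*
 * - Full success: the loop advances old_addr up to old_end, so
 *   len_in + old_addr - old_end == len_in, i.e. the entire user-requested
 *   range was moved.
 *
 * - Failure on the very first (realigned) PMD: old_addr may still be below
 *   old_end - len_in (0x7f0000180000 here), so len_in + old_addr - old_end
 *   would wrap below zero; the explicit "return 0" above guards against
 *   reporting such a bogus value.
 */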
/* Set vrm->delta to the difference in VMA size specified by user. */
@@ -1040,37 +1117,40 @@ static int copy_vma_and_data(struct vma_remap_struct *vrm,
unsigned long internal_pgoff = internal_offset >> PAGE_SHIFT;
unsigned long new_pgoff = vrm->vma->vm_pgoff + internal_pgoff;
unsigned long moved_len;
- bool need_rmap_locks;
- struct vm_area_struct *vma;
+ struct vm_area_struct *vma = vrm->vma;
struct vm_area_struct *new_vma;
int err = 0;
+ PAGETABLE_MOVE(pmc, NULL, NULL, vrm->addr, vrm->new_addr, vrm->old_len);
- new_vma = copy_vma(&vrm->vma, vrm->new_addr, vrm->new_len, new_pgoff,
- &need_rmap_locks);
+ new_vma = copy_vma(&vma, vrm->new_addr, vrm->new_len, new_pgoff,
+ &pmc.need_rmap_locks);
if (!new_vma) {
vrm_uncharge(vrm);
*new_vma_ptr = NULL;
return -ENOMEM;
}
- vma = vrm->vma;
+ vrm->vma = vma;
+ pmc.old = vma;
+ pmc.new = new_vma;
- moved_len = move_page_tables(vma, vrm->addr, new_vma,
- vrm->new_addr, vrm->old_len,
- need_rmap_locks, /* for_stack= */false);
+ moved_len = move_page_tables(&pmc);
if (moved_len < vrm->old_len)
err = -ENOMEM;
else if (vma->vm_ops && vma->vm_ops->mremap)
err = vma->vm_ops->mremap(new_vma);
if (unlikely(err)) {
+ PAGETABLE_MOVE(pmc_revert, new_vma, vma, vrm->new_addr,
+ vrm->addr, moved_len);
+
/*
* On error, move entries back from new area to old,
* which will succeed since page tables still there,
* and then proceed to unmap new area instead of old.
*/
- move_page_tables(new_vma, vrm->new_addr, vma, vrm->addr,
- moved_len, /* need_rmap_locks = */true,
- /* for_stack= */false);
+ pmc_revert.need_rmap_locks = true;
+ move_page_tables(&pmc_revert);
+
vrm->vma = new_vma;
vrm->old_len = vrm->new_len;
vrm->addr = vrm->new_addr;