@@ -1119,24 +1119,14 @@ static long madvise_guard_install(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end)
{
- long err;
+ long err = 0;
+ unsigned long nr_pages;
int i;
*prev = vma;
if (!is_valid_guard_vma(vma, /* allow_locked = */false))
return -EINVAL;
- /*
- * If we install guard markers, then the range is no longer
- * empty from a page table perspective and therefore it's
- * appropriate to have an anon_vma.
- *
- * This ensures that on fork, we copy page tables correctly.
- */
- err = anon_vma_prepare(vma);
- if (err)
- return err;
-
/*
* Optimistically try to install the guard marker pages first. If any
* non-guard pages are encountered, give up and zap the range before
@@ -1150,19 +1140,20 @@ static long madvise_guard_install(struct vm_area_struct *vma,
* with no zap or looping.
*/
for (i = 0; i < MAX_MADVISE_GUARD_RETRIES; i++) {
- unsigned long nr_pages = 0;
+ /* We also count existing guard region pages on each retry. */
+ nr_pages = 0;
/* Returns < 0 on error, == 0 if success, > 0 if zap needed. */
err = walk_page_range_mm(vma->vm_mm, start, end,
&guard_install_walk_ops, &nr_pages);
if (err < 0)
- return err;
+ break;
if (err == 0) {
unsigned long nr_expected_pages = PHYS_PFN(end - start);
VM_WARN_ON(nr_pages != nr_expected_pages);
- return 0;
+ break;
}
/*
@@ -1172,12 +1163,19 @@ static long madvise_guard_install(struct vm_area_struct *vma,
zap_page_range_single(vma, start, end - start, NULL);
}
+ /* Ensure that page tables are propagated on fork. */
+ if (nr_pages > 0)
+ vma_set_anon_vma_unfaulted(vma);
+
/*
* We were unable to install the guard pages due to being raced by page
* faults. This should not happen ordinarily. We return to userspace and
* immediately retry, relieving lock contention.
*/
- return restart_syscall();
+ if (err > 0)
+ return restart_syscall();
+
+ return err;
}
static int guard_remove_pud_entry(pud_t *pud, unsigned long addr,
@@ -1229,6 +1227,8 @@ static long madvise_guard_remove(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end)
{
+ long err;
+
*prev = vma;
/*
* We're ok with removing guards in mlock()'d ranges, as this is a
@@ -1237,8 +1237,21 @@ static long madvise_guard_remove(struct vm_area_struct *vma,
if (!is_valid_guard_vma(vma, /* allow_locked = */true))
return -EINVAL;
- return walk_page_range(vma->vm_mm, start, end,
- &guard_remove_walk_ops, NULL);
+ err = walk_page_range(vma->vm_mm, start, end,
+ &guard_remove_walk_ops, NULL);
+
+ /*
+ * If we have successfully cleared the guard flags, and we span the
+ * whole VMA, clear the unfaulted state so this VMA doesn't
+ * unnecessarily propagate page tables.
+ *
+ * The operation is protected via mm->page_table_lock, avoiding races
+ * with a guard install operation.
+ */
+ if (!err && start == vma->vm_start && end == vma->vm_end)
+ vma_clear_anon_vma_unfaulted(vma);
+
+ return err;
}
/*
We have introduced the ability to indicate that a VMA's anon_vma field is
'unfaulted', that is, that we wish for page tables to be propagated on fork,
but no anon_vma is yet initialised.

Utilise that on guard region installation (via MADV_GUARD_INSTALL) to ensure
that page table propagation on fork occurs, but without occupying one byte of
memory more than is required. Note that this is a no-op if a 'real' anon_vma
is already in place.

This also avoids any issue with THP inferring that it should not immediately
attempt huge page collapse. More importantly, for file-backed mappings, this
avoids otherwise unnecessary kernel memory allocation purely for the purposes
of indicating on-fork page table propagation requirements.

We adjust when we do this, so we do it only after a successful guard region
installation, and only one which installs a guard region of at least one page
in size. This means we only set the flag once guard regions are definitely
installed.

We are safe from a racing fork here, because we hold the mmap read lock, and
fork requires the write lock.

We also adjust MADV_GUARD_REMOVE to remove this flag if the range specified
spans the entire VMA (and no 'real' anon_vma has been installed yet), meaning
we do not cause unnecessary page table propagation.

This is protected from racing with guard region installation through use of
the mm->page_table_lock, which is used to prevent races between mmap
read-locked modifiers of vma->anon_vma.

Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
 mm/madvise.c | 49 +++++++++++++++++++++++++++++++------------------
 1 file changed, 31 insertions(+), 18 deletions(-)
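For reference, a minimal sketch of what the set/clear helpers used above might
look like. This is purely illustrative and not the series' actual
implementation: it assumes the 'unfaulted' state is encoded as a sentinel
value stored in vma->anon_vma, with mmap read-locked modifiers serialising on
mm->page_table_lock as described above:

	/* ASSUMED encoding and implementation, for illustration only. */
	#define ANON_VMA_UNFAULTED	((struct anon_vma *)1)

	static void vma_set_anon_vma_unfaulted(struct vm_area_struct *vma)
	{
		spin_lock(&vma->vm_mm->page_table_lock);
		/* A 'real' anon_vma already guarantees on-fork propagation. */
		if (!vma->anon_vma)
			vma->anon_vma = ANON_VMA_UNFAULTED;
		spin_unlock(&vma->vm_mm->page_table_lock);
	}

	static void vma_clear_anon_vma_unfaulted(struct vm_area_struct *vma)
	{
		spin_lock(&vma->vm_mm->page_table_lock);
		/* Only clear the sentinel, never a 'real' anon_vma. */
		if (vma->anon_vma == ANON_VMA_UNFAULTED)
			vma->anon_vma = NULL;
		spin_unlock(&vma->vm_mm->page_table_lock);
	}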
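For completeness, a minimal userspace illustration of the flow this patch
optimises: guard region installation, a fork (whose child must still observe
the guard markers), then a whole-VMA removal. The fallback #defines are only
for headers predating MADV_GUARD_INSTALL/MADV_GUARD_REMOVE:

	#include <sys/mman.h>
	#include <sys/wait.h>
	#include <unistd.h>

	#ifndef MADV_GUARD_INSTALL
	#define MADV_GUARD_INSTALL 102	/* fallback for older uapi headers */
	#define MADV_GUARD_REMOVE  103
	#endif

	int main(void)
	{
		long psz = sysconf(_SC_PAGESIZE);
		size_t len = 3 * psz;
		char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (p == MAP_FAILED)
			return 1;

		/* Guard the middle page: touching it now raises SIGSEGV. */
		if (madvise(p + psz, psz, MADV_GUARD_INSTALL))
			return 1;

		/*
		 * With this patch, the child still observes the guard
		 * markers, without a 'real' anon_vma having been allocated.
		 */
		pid_t pid = fork();
		if (pid == 0)
			_exit(0);
		waitpid(pid, NULL, 0);

		/* A whole-VMA remove also clears the unfaulted state. */
		madvise(p, len, MADV_GUARD_REMOVE);
		munmap(p, len);
		return 0;
	}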