[1/3] mm: add apply_to_existing_pages helper

Message ID 20191205140407.1874-1-dja@axtens.net (mailing list archive)
State New, archived
Series [1/3] mm: add apply_to_existing_pages helper

Commit Message

Daniel Axtens Dec. 5, 2019, 2:04 p.m. UTC
apply_to_page_range takes an address range, and if any parts of it
are not covered by the existing page table hierarchy, it allocates
memory to fill them in.

In some use cases, this is not what we want - we want to be able to
operate exclusively on PTEs that are already in the tables.

Add apply_to_existing_pages for this. Adjust the walker functions
for apply_to_page_range to take 'create', which switches them between
the old and new modes.

This will be used in KASAN vmalloc.

Signed-off-by: Daniel Axtens <dja@axtens.net>
---
 include/linux/mm.h |   3 ++
 mm/memory.c        | 131 +++++++++++++++++++++++++++++++++------------
 2 files changed, 99 insertions(+), 35 deletions(-)
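
For context, a minimal sketch of how a caller might use the new helper (renamed apply_to_existing_page_range later in this thread). The callback and the counting wrapper below are illustrative only and are not part of this patch; the real user is the KASAN vmalloc series:

/* Illustrative pte_fn_t callback; not part of this patch. */
static int count_present_pte(pte_t *pte, unsigned long addr, void *data)
{
	unsigned long *count = data;

	/* Only reached for PTEs already present in the tables. */
	(*count)++;
	return 0;
}

/* Illustrative wrapper, assuming a kernel address range in init_mm. */
static unsigned long count_mapped_ptes(unsigned long start, unsigned long size)
{
	unsigned long count = 0;

	/*
	 * Unlike apply_to_page_range(), this allocates no page tables
	 * for holes in [start, start + size).
	 */
	apply_to_existing_pages(&init_mm, start, size,
				count_present_pte, &count);
	return count;
}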

Comments

Andrey Ryabinin Dec. 6, 2019, 4:24 p.m. UTC | #1
On 12/5/19 5:04 PM, Daniel Axtens wrote:
> apply_to_page_range takes an address range, and if any parts of it
> are not covered by the existing page table hierarchy, it allocates
> memory to fill them in.
> 
> In some use cases, this is not what we want - we want to be able to
> operate exclusively on PTEs that are already in the tables.
> 
> Add apply_to_existing_pages for this. Adjust the walker functions
> for apply_to_page_range to take 'create', which switches them between
> the old and new modes.
> 
> This will be used in KASAN vmalloc.
> 
> Signed-off-by: Daniel Axtens <dja@axtens.net>

Reviewed-by: Andrey Ryabinin <aryabinin@virtuozzo.com>

Andrew Morton Dec. 7, 2019, 12:37 a.m. UTC | #2
On Fri,  6 Dec 2019 01:04:05 +1100 Daniel Axtens <dja@axtens.net> wrote:

> +/*
> + * Scan a region of virtual memory, calling a provided function on
> + * each leaf page table where it exists.
> + *
> + * Unlike apply_to_page_range, this does _not_ fill in page tables
> + * where they are absent.
> + */
> +int apply_to_existing_pages(struct mm_struct *mm, unsigned long addr,
> +			    unsigned long size, pte_fn_t fn, void *data)
> +{
> +	pgd_t *pgd;
> +	unsigned long next;
> +	unsigned long end = addr + size;
> +	int err = 0;
> +
> +	if (WARN_ON(addr >= end))
> +		return -EINVAL;
> +
> +	pgd = pgd_offset(mm, addr);
> +	do {
> +		next = pgd_addr_end(addr, end);
> +		if (pgd_none_or_clear_bad(pgd))
> +			continue;
> +		err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, false);
> +		if (err)
> +			break;
> +	} while (pgd++, addr = next, addr != end);
> +
> +	return err;
> +}
> +EXPORT_SYMBOL_GPL(apply_to_existing_pages);

This is almost identical to apply_to_page_range() and cries out for
some deduplication.  This?

--- a/mm/memory.c~mm-add-apply_to_existing_pages-helper-fix
+++ a/mm/memory.c
@@ -2141,12 +2141,9 @@ static int apply_to_p4d_range(struct mm_
 	return err;
 }
 
-/*
- * Scan a region of virtual memory, filling in page tables as necessary
- * and calling a provided function on each leaf page table.
- */
-int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
-			unsigned long size, pte_fn_t fn, void *data)
+static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
+				 unsigned long size, pte_fn_t fn,
+				 void *data, bool create)
 {
 	pgd_t *pgd;
 	unsigned long next;
@@ -2159,13 +2156,25 @@ int apply_to_page_range(struct mm_struct
 	pgd = pgd_offset(mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
-		err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, true);
+		if (!create && pgd_none_or_clear_bad(pgd))
+			continue;
+		err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create);
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
 
 	return err;
 }
+
+/*
+ * Scan a region of virtual memory, filling in page tables as necessary
+ * and calling a provided function on each leaf page table.
+ */
+int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
+			unsigned long size, pte_fn_t fn, void *data)
+{
+	return __apply_to_page_range(mm, addr, size, fn, data, true);
+}
 EXPORT_SYMBOL_GPL(apply_to_page_range);
 
 /*
@@ -2178,25 +2187,7 @@ EXPORT_SYMBOL_GPL(apply_to_page_range);
 int apply_to_existing_pages(struct mm_struct *mm, unsigned long addr,
 			    unsigned long size, pte_fn_t fn, void *data)
 {
-	pgd_t *pgd;
-	unsigned long next;
-	unsigned long end = addr + size;
-	int err = 0;
-
-	if (WARN_ON(addr >= end))
-		return -EINVAL;
-
-	pgd = pgd_offset(mm, addr);
-	do {
-		next = pgd_addr_end(addr, end);
-		if (pgd_none_or_clear_bad(pgd))
-			continue;
-		err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, false);
-		if (err)
-			break;
-	} while (pgd++, addr = next, addr != end);
-
-	return err;
+	return __apply_to_page_range(mm, addr, size, fn, data, false);
 }
 EXPORT_SYMBOL_GPL(apply_to_existing_pages);

Andrew Morton Dec. 7, 2019, 12:38 a.m. UTC | #3
On Fri,  6 Dec 2019 01:04:05 +1100 Daniel Axtens <dja@axtens.net> wrote:

> apply_to_page_range takes an address range, and if any parts of it
> are not covered by the existing page table hierarchy, it allocates
> memory to fill them in.
> 
> In some use cases, this is not what we want - we want to be able to
> operate exclusively on PTEs that are already in the tables.
> 
> Add apply_to_existing_pages for this. Adjust the walker functions
> for apply_to_page_range to take 'create', which switches them between
> the old and new modes.

Wouldn't apply_to_existing_page_range() be a better name?

--- a/include/linux/mm.h~mm-add-apply_to_existing_pages-helper-fix-fix
+++ a/include/linux/mm.h
@@ -2621,9 +2621,9 @@ static inline int vm_fault_to_errno(vm_f
 typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data);
 extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
 			       unsigned long size, pte_fn_t fn, void *data);
-extern int apply_to_existing_pages(struct mm_struct *mm, unsigned long address,
-				   unsigned long size, pte_fn_t fn,
-				   void *data);
+extern int apply_to_existing_page_range(struct mm_struct *mm,
+				   unsigned long address, unsigned long size,
+				   pte_fn_t fn, void *data);
 
 #ifdef CONFIG_PAGE_POISONING
 extern bool page_poisoning_enabled(void);
--- a/mm/memory.c~mm-add-apply_to_existing_pages-helper-fix-fix
+++ a/mm/memory.c
@@ -2184,12 +2184,12 @@ EXPORT_SYMBOL_GPL(apply_to_page_range);
  * Unlike apply_to_page_range, this does _not_ fill in page tables
  * where they are absent.
  */
-int apply_to_existing_pages(struct mm_struct *mm, unsigned long addr,
-			    unsigned long size, pte_fn_t fn, void *data)
+int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr,
+				 unsigned long size, pte_fn_t fn, void *data)
 {
 	return __apply_to_page_range(mm, addr, size, fn, data, false);
 }
-EXPORT_SYMBOL_GPL(apply_to_existing_pages);
+EXPORT_SYMBOL_GPL(apply_to_existing_page_range);
 
 /*
  * handle_pte_fault chooses page fault handler according to an entry which was

Daniel Axtens Dec. 7, 2019, 2:16 a.m. UTC | #4
>
> Wouldn't apply_to_existing_page_range() be a better name?

I agree with both of those fixups, thanks!

Regards,
Daniel

>
> --- a/include/linux/mm.h~mm-add-apply_to_existing_pages-helper-fix-fix
> +++ a/include/linux/mm.h
> @@ -2621,9 +2621,9 @@ static inline int vm_fault_to_errno(vm_f
>  typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data);
>  extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
>  			       unsigned long size, pte_fn_t fn, void *data);
> -extern int apply_to_existing_pages(struct mm_struct *mm, unsigned long address,
> -				   unsigned long size, pte_fn_t fn,
> -				   void *data);
> +extern int apply_to_existing_page_range(struct mm_struct *mm,
> +				   unsigned long address, unsigned long size,
> +				   pte_fn_t fn, void *data);
>  
>  #ifdef CONFIG_PAGE_POISONING
>  extern bool page_poisoning_enabled(void);
> --- a/mm/memory.c~mm-add-apply_to_existing_pages-helper-fix-fix
> +++ a/mm/memory.c
> @@ -2184,12 +2184,12 @@ EXPORT_SYMBOL_GPL(apply_to_page_range);
>   * Unlike apply_to_page_range, this does _not_ fill in page tables
>   * where they are absent.
>   */
> -int apply_to_existing_pages(struct mm_struct *mm, unsigned long addr,
> -			    unsigned long size, pte_fn_t fn, void *data)
> +int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr,
> +				 unsigned long size, pte_fn_t fn, void *data)
>  {
>  	return __apply_to_page_range(mm, addr, size, fn, data, false);
>  }
> -EXPORT_SYMBOL_GPL(apply_to_existing_pages);
> +EXPORT_SYMBOL_GPL(apply_to_existing_page_range);
>  
>  /*
>   * handle_pte_fault chooses page fault handler according to an entry which was
> _

Christoph Hellwig Dec. 9, 2019, 7:34 a.m. UTC | #5
Or a flags argument with descriptive flags to the existing function?
These magic bool arguments don't scale...

Andrew Morton Dec. 11, 2019, 1:31 a.m. UTC | #6
On Sun, 8 Dec 2019 23:34:58 -0800 Christoph Hellwig <hch@infradead.org> wrote:

> Or a flags argument with descriptive flags to the existing function?
> These magic bool arguments don't scale..

True.  But it's easy enough to do s/bool create/enum mode/ in the
future should the need arise.  For now, the code is clearer.
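
For reference, a rough sketch of what the enum-based interface alluded to here could look like; the enum and the APPLY_* names below are hypothetical and none of them appear in the series as posted:

/*
 * Hypothetical sketch only: __apply_to_page_range() and the per-level
 * walkers would take "enum apply_mode mode" in place of "bool create".
 */
enum apply_mode {
	APPLY_EXISTING,	/* touch only PTEs already present, skip holes */
	APPLY_CREATE,	/* allocate missing page table levels as needed */
};

int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
			unsigned long size, pte_fn_t fn, void *data)
{
	return __apply_to_page_range(mm, addr, size, fn, data, APPLY_CREATE);
}

int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr,
				 unsigned long size, pte_fn_t fn, void *data)
{
	return __apply_to_page_range(mm, addr, size, fn, data, APPLY_EXISTING);
}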

Patch

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c97ea3b694e6..f4dba827d76e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2621,6 +2621,9 @@  static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
 typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data);
 extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
 			       unsigned long size, pte_fn_t fn, void *data);
+extern int apply_to_existing_pages(struct mm_struct *mm, unsigned long address,
+				   unsigned long size, pte_fn_t fn,
+				   void *data);
 
 #ifdef CONFIG_PAGE_POISONING
 extern bool page_poisoning_enabled(void);
diff --git a/mm/memory.c b/mm/memory.c
index 606da187d1de..e508ba7e0a19 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2021,26 +2021,34 @@  EXPORT_SYMBOL(vm_iomap_memory);
 
 static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 				     unsigned long addr, unsigned long end,
-				     pte_fn_t fn, void *data)
+				     pte_fn_t fn, void *data, bool create)
 {
 	pte_t *pte;
-	int err;
+	int err = 0;
 	spinlock_t *uninitialized_var(ptl);
 
-	pte = (mm == &init_mm) ?
-		pte_alloc_kernel(pmd, addr) :
-		pte_alloc_map_lock(mm, pmd, addr, &ptl);
-	if (!pte)
-		return -ENOMEM;
+	if (create) {
+		pte = (mm == &init_mm) ?
+			pte_alloc_kernel(pmd, addr) :
+			pte_alloc_map_lock(mm, pmd, addr, &ptl);
+		if (!pte)
+			return -ENOMEM;
+	} else {
+		pte = (mm == &init_mm) ?
+			pte_offset_kernel(pmd, addr) :
+			pte_offset_map_lock(mm, pmd, addr, &ptl);
+	}
 
 	BUG_ON(pmd_huge(*pmd));
 
 	arch_enter_lazy_mmu_mode();
 
 	do {
-		err = fn(pte++, addr, data);
-		if (err)
-			break;
+		if (create || !pte_none(*pte)) {
+			err = fn(pte++, addr, data);
+			if (err)
+				break;
+		}
 	} while (addr += PAGE_SIZE, addr != end);
 
 	arch_leave_lazy_mmu_mode();
@@ -2052,62 +2060,83 @@  static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 
 static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
 				     unsigned long addr, unsigned long end,
-				     pte_fn_t fn, void *data)
+				     pte_fn_t fn, void *data, bool create)
 {
 	pmd_t *pmd;
 	unsigned long next;
-	int err;
+	int err = 0;
 
 	BUG_ON(pud_huge(*pud));
 
-	pmd = pmd_alloc(mm, pud, addr);
-	if (!pmd)
-		return -ENOMEM;
+	if (create) {
+		pmd = pmd_alloc(mm, pud, addr);
+		if (!pmd)
+			return -ENOMEM;
+	} else {
+		pmd = pmd_offset(pud, addr);
+	}
 	do {
 		next = pmd_addr_end(addr, end);
-		err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
-		if (err)
-			break;
+		if (create || !pmd_none_or_clear_bad(pmd)) {
+			err = apply_to_pte_range(mm, pmd, addr, next, fn, data,
+						 create);
+			if (err)
+				break;
+		}
 	} while (pmd++, addr = next, addr != end);
 	return err;
 }
 
 static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
 				     unsigned long addr, unsigned long end,
-				     pte_fn_t fn, void *data)
+				     pte_fn_t fn, void *data, bool create)
 {
 	pud_t *pud;
 	unsigned long next;
-	int err;
+	int err = 0;
 
-	pud = pud_alloc(mm, p4d, addr);
-	if (!pud)
-		return -ENOMEM;
+	if (create) {
+		pud = pud_alloc(mm, p4d, addr);
+		if (!pud)
+			return -ENOMEM;
+	} else {
+		pud = pud_offset(p4d, addr);
+	}
 	do {
 		next = pud_addr_end(addr, end);
-		err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
-		if (err)
-			break;
+		if (create || !pud_none_or_clear_bad(pud)) {
+			err = apply_to_pmd_range(mm, pud, addr, next, fn, data,
+						 create);
+			if (err)
+				break;
+		}
 	} while (pud++, addr = next, addr != end);
 	return err;
 }
 
 static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
 				     unsigned long addr, unsigned long end,
-				     pte_fn_t fn, void *data)
+				     pte_fn_t fn, void *data, bool create)
 {
 	p4d_t *p4d;
 	unsigned long next;
-	int err;
+	int err = 0;
 
-	p4d = p4d_alloc(mm, pgd, addr);
-	if (!p4d)
-		return -ENOMEM;
+	if (create) {
+		p4d = p4d_alloc(mm, pgd, addr);
+		if (!p4d)
+			return -ENOMEM;
+	} else {
+		p4d = p4d_offset(pgd, addr);
+	}
 	do {
 		next = p4d_addr_end(addr, end);
-		err = apply_to_pud_range(mm, p4d, addr, next, fn, data);
-		if (err)
-			break;
+		if (create || !p4d_none_or_clear_bad(p4d)) {
+			err = apply_to_pud_range(mm, p4d, addr, next, fn, data,
+						 create);
+			if (err)
+				break;
+		}
 	} while (p4d++, addr = next, addr != end);
 	return err;
 }
@@ -2130,7 +2159,7 @@  int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
 	pgd = pgd_offset(mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
-		err = apply_to_p4d_range(mm, pgd, addr, next, fn, data);
+		err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, true);
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
@@ -2139,6 +2168,38 @@  int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
 }
 EXPORT_SYMBOL_GPL(apply_to_page_range);
 
+/*
+ * Scan a region of virtual memory, calling a provided function on
+ * each leaf page table where it exists.
+ *
+ * Unlike apply_to_page_range, this does _not_ fill in page tables
+ * where they are absent.
+ */
+int apply_to_existing_pages(struct mm_struct *mm, unsigned long addr,
+			    unsigned long size, pte_fn_t fn, void *data)
+{
+	pgd_t *pgd;
+	unsigned long next;
+	unsigned long end = addr + size;
+	int err = 0;
+
+	if (WARN_ON(addr >= end))
+		return -EINVAL;
+
+	pgd = pgd_offset(mm, addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		if (pgd_none_or_clear_bad(pgd))
+			continue;
+		err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, false);
+		if (err)
+			break;
+	} while (pgd++, addr = next, addr != end);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(apply_to_existing_pages);
+
 /*
  * handle_pte_fault chooses page fault handler according to an entry which was
  * read non-atomically.  Before making any commitment, on those architectures