Message ID | 20190612064243.55340-3-thellstrom@vmwopensource.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Emulated coherent graphics memory | expand |
On Wed, Jun 12, 2019 at 08:42:36AM +0200, Thomas Hellström (VMware) wrote: > From: Thomas Hellstrom <thellstrom@vmware.com> > > This is basically apply_to_page_range with added functionality: > Allocating missing parts of the page table becomes optional, which > means that the function can be guaranteed not to error if allocation > is disabled. Also passing of the closure struct and callback function > becomes different and more in line with how things are done elsewhere. > > Finally we keep apply_to_page_range as a wrapper around apply_to_pfn_range > > The reason for not using the page-walk code is that we want to perform > the page-walk on vmas pointing to an address space without requiring the > mmap_sem to be held rather than on vmas belonging to a process with the > mmap_sem held. > > Notable changes since RFC: > Don't export apply_to_pfn range. > > Cc: Andrew Morton <akpm@linux-foundation.org> > Cc: Matthew Wilcox <willy@infradead.org> > Cc: Will Deacon <will.deacon@arm.com> > Cc: Peter Zijlstra <peterz@infradead.org> > Cc: Rik van Riel <riel@surriel.com> > Cc: Minchan Kim <minchan@kernel.org> > Cc: Michal Hocko <mhocko@suse.com> > Cc: Huang Ying <ying.huang@intel.com> > Cc: Souptick Joarder <jrdr.linux@gmail.com> > Cc: "Jérôme Glisse" <jglisse@redhat.com> > Cc: linux-mm@kvack.org > Cc: linux-kernel@vger.kernel.org > > Signed-off-by: Thomas Hellstrom <thellstrom@vmware.com> > Reviewed-by: Ralph Campbell <rcampbell@nvidia.com> #v1 > --- > include/linux/mm.h | 10 ++++ > mm/memory.c | 135 ++++++++++++++++++++++++++++++++++----------- > 2 files changed, 113 insertions(+), 32 deletions(-) > > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 0e8834ac32b7..3d06ce2a64af 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -2675,6 +2675,16 @@ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, > extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, > unsigned long size, pte_fn_t fn, void *data); > > +struct pfn_range_apply; > +typedef int (*pter_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, > + struct pfn_range_apply *closure); > +struct pfn_range_apply { > + struct mm_struct *mm; > + pter_fn_t ptefn; > + unsigned int alloc; > +}; > +extern int apply_to_pfn_range(struct pfn_range_apply *closure, > + unsigned long address, unsigned long size); > > #ifdef CONFIG_PAGE_POISONING > extern bool page_poisoning_enabled(void); > diff --git a/mm/memory.c b/mm/memory.c > index 168f546af1ad..462aa47f8878 100644 > --- a/mm/memory.c > +++ b/mm/memory.c > @@ -2032,18 +2032,17 @@ int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long > } > EXPORT_SYMBOL(vm_iomap_memory); > > -static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, > - unsigned long addr, unsigned long end, > - pte_fn_t fn, void *data) > +static int apply_to_pte_range(struct pfn_range_apply *closure, pmd_t *pmd, > + unsigned long addr, unsigned long end) > { > pte_t *pte; > int err; > pgtable_t token; > spinlock_t *uninitialized_var(ptl); > > - pte = (mm == &init_mm) ? > + pte = (closure->mm == &init_mm) ? > pte_alloc_kernel(pmd, addr) : > - pte_alloc_map_lock(mm, pmd, addr, &ptl); > + pte_alloc_map_lock(closure->mm, pmd, addr, &ptl); > if (!pte) > return -ENOMEM; > > @@ -2054,86 +2053,109 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, > token = pmd_pgtable(*pmd); > > do { > - err = fn(pte++, token, addr, data); > + err = closure->ptefn(pte++, token, addr, closure); > if (err) > break; > } while (addr += PAGE_SIZE, addr != end); > > arch_leave_lazy_mmu_mode(); > > - if (mm != &init_mm) > + if (closure->mm != &init_mm) > pte_unmap_unlock(pte-1, ptl); > return err; > } > > -static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, > - unsigned long addr, unsigned long end, > - pte_fn_t fn, void *data) > +static int apply_to_pmd_range(struct pfn_range_apply *closure, pud_t *pud, > + unsigned long addr, unsigned long end) > { > pmd_t *pmd; > unsigned long next; > - int err; > + int err = 0; > > BUG_ON(pud_huge(*pud)); > > - pmd = pmd_alloc(mm, pud, addr); > + pmd = pmd_alloc(closure->mm, pud, addr); > if (!pmd) > return -ENOMEM; > + > do { > next = pmd_addr_end(addr, end); > - err = apply_to_pte_range(mm, pmd, addr, next, fn, data); > + if (!closure->alloc && pmd_none_or_clear_bad(pmd)) > + continue; > + err = apply_to_pte_range(closure, pmd, addr, next); > if (err) > break; > } while (pmd++, addr = next, addr != end); > return err; > } > > -static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d, > - unsigned long addr, unsigned long end, > - pte_fn_t fn, void *data) > +static int apply_to_pud_range(struct pfn_range_apply *closure, p4d_t *p4d, > + unsigned long addr, unsigned long end) > { > pud_t *pud; > unsigned long next; > - int err; > + int err = 0; > > - pud = pud_alloc(mm, p4d, addr); > + pud = pud_alloc(closure->mm, p4d, addr); > if (!pud) > return -ENOMEM; > + > do { > next = pud_addr_end(addr, end); > - err = apply_to_pmd_range(mm, pud, addr, next, fn, data); > + if (!closure->alloc && pud_none_or_clear_bad(pud)) > + continue; > + err = apply_to_pmd_range(closure, pud, addr, next); > if (err) > break; > } while (pud++, addr = next, addr != end); > return err; > } > > -static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd, > - unsigned long addr, unsigned long end, > - pte_fn_t fn, void *data) > +static int apply_to_p4d_range(struct pfn_range_apply *closure, pgd_t *pgd, > + unsigned long addr, unsigned long end) > { > p4d_t *p4d; > unsigned long next; > - int err; > + int err = 0; > > - p4d = p4d_alloc(mm, pgd, addr); > + p4d = p4d_alloc(closure->mm, pgd, addr); > if (!p4d) > return -ENOMEM; > + > do { > next = p4d_addr_end(addr, end); > - err = apply_to_pud_range(mm, p4d, addr, next, fn, data); > + if (!closure->alloc && p4d_none_or_clear_bad(p4d)) > + continue; > + err = apply_to_pud_range(closure, p4d, addr, next); > if (err) > break; > } while (p4d++, addr = next, addr != end); > return err; > } > > -/* > - * Scan a region of virtual memory, filling in page tables as necessary > - * and calling a provided function on each leaf page table. > +/** > + * apply_to_pfn_range - Scan a region of virtual memory, calling a provided > + * function on each leaf page table entry > + * @closure: Details about how to scan and what function to apply > + * @addr: Start virtual address > + * @size: Size of the region > + * > + * If @closure->alloc is set to 1, the function will fill in the page table > + * as necessary. Otherwise it will skip non-present parts. > + * Note: The caller must ensure that the range does not contain huge pages. > + * The caller must also assure that the proper mmu_notifier functions are > + * called before and after the call to apply_to_pfn_range. > + * > + * WARNING: Do not use this function unless you know exactly what you are > + * doing. It is lacking support for huge pages and transparent huge pages. > + * > + * Return: Zero on success. If the provided function returns a non-zero status, > + * the page table walk will terminate and that status will be returned. > + * If @closure->alloc is set to 1, then this function may also return memory > + * allocation errors arising from allocating page table memory. > */ > -int apply_to_page_range(struct mm_struct *mm, unsigned long addr, > - unsigned long size, pte_fn_t fn, void *data) > +int apply_to_pfn_range(struct pfn_range_apply *closure, > + unsigned long addr, unsigned long size) > { > pgd_t *pgd; > unsigned long next; > @@ -2143,16 +2165,65 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, > if (WARN_ON(addr >= end)) > return -EINVAL; > > - pgd = pgd_offset(mm, addr); > + pgd = pgd_offset(closure->mm, addr); > do { > next = pgd_addr_end(addr, end); > - err = apply_to_p4d_range(mm, pgd, addr, next, fn, data); > + if (!closure->alloc && pgd_none_or_clear_bad(pgd)) > + continue; > + err = apply_to_p4d_range(closure, pgd, addr, next); > if (err) > break; > } while (pgd++, addr = next, addr != end); > > return err; > } > + > +/** > + * struct page_range_apply - Closure structure for apply_to_page_range() > + * @pter: The base closure structure we derive from > + * @fn: The leaf pte function to call > + * @data: The leaf pte function closure > + */ > +struct page_range_apply { > + struct pfn_range_apply pter; > + pte_fn_t fn; > + void *data; > +}; > + > +/* > + * Callback wrapper to enable use of apply_to_pfn_range for > + * the apply_to_page_range interface > + */ > +static int apply_to_page_range_wrapper(pte_t *pte, pgtable_t token, > + unsigned long addr, > + struct pfn_range_apply *pter) > +{ > + struct page_range_apply *pra = > + container_of(pter, typeof(*pra), pter); > + > + return pra->fn(pte, token, addr, pra->data); > +} > + > +/* > + * Scan a region of virtual memory, filling in page tables as necessary > + * and calling a provided function on each leaf page table. > + * > + * WARNING: Do not use this function unless you know exactly what you are > + * doing. It is lacking support for huge pages and transparent huge pages. > + */ > +int apply_to_page_range(struct mm_struct *mm, unsigned long addr, > + unsigned long size, pte_fn_t fn, void *data) > +{ > + struct page_range_apply pra = { > + .pter = {.mm = mm, > + .alloc = 1, > + .ptefn = apply_to_page_range_wrapper }, > + .fn = fn, > + .data = data > + }; > + > + return apply_to_pfn_range(&pra.pter, addr, size); > +} > > EXPORT_SYMBOL_GPL(apply_to_page_range); Actually - did you look into converting our two hand full of apply_to_page_range callers to your new scheme? It seems like that might actually not be to bad and avoid various layers of wrappers.
On 6/12/19 2:16 PM, Christoph Hellwig wrote: > On Wed, Jun 12, 2019 at 08:42:36AM +0200, Thomas Hellström (VMware) wrote: >> From: Thomas Hellstrom <thellstrom@vmware.com> >> >> This is basically apply_to_page_range with added functionality: >> Allocating missing parts of the page table becomes optional, which >> means that the function can be guaranteed not to error if allocation >> is disabled. Also passing of the closure struct and callback function >> becomes different and more in line with how things are done elsewhere. >> >> Finally we keep apply_to_page_range as a wrapper around apply_to_pfn_range >> >> The reason for not using the page-walk code is that we want to perform >> the page-walk on vmas pointing to an address space without requiring the >> mmap_sem to be held rather than on vmas belonging to a process with the >> mmap_sem held. >> >> Notable changes since RFC: >> Don't export apply_to_pfn range. >> >> Cc: Andrew Morton <akpm@linux-foundation.org> >> Cc: Matthew Wilcox <willy@infradead.org> >> Cc: Will Deacon <will.deacon@arm.com> >> Cc: Peter Zijlstra <peterz@infradead.org> >> Cc: Rik van Riel <riel@surriel.com> >> Cc: Minchan Kim <minchan@kernel.org> >> Cc: Michal Hocko <mhocko@suse.com> >> Cc: Huang Ying <ying.huang@intel.com> >> Cc: Souptick Joarder <jrdr.linux@gmail.com> >> Cc: "Jérôme Glisse" <jglisse@redhat.com> >> Cc: linux-mm@kvack.org >> Cc: linux-kernel@vger.kernel.org >> >> Signed-off-by: Thomas Hellstrom <thellstrom@vmware.com> >> Reviewed-by: Ralph Campbell <rcampbell@nvidia.com> #v1 >> --- >> include/linux/mm.h | 10 ++++ >> mm/memory.c | 135 ++++++++++++++++++++++++++++++++++----------- >> 2 files changed, 113 insertions(+), 32 deletions(-) >> >> diff --git a/include/linux/mm.h b/include/linux/mm.h >> index 0e8834ac32b7..3d06ce2a64af 100644 >> --- a/include/linux/mm.h >> +++ b/include/linux/mm.h >> @@ -2675,6 +2675,16 @@ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, >> extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, >> unsigned long size, pte_fn_t fn, void *data); >> >> +struct pfn_range_apply; >> +typedef int (*pter_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, >> + struct pfn_range_apply *closure); >> +struct pfn_range_apply { >> + struct mm_struct *mm; >> + pter_fn_t ptefn; >> + unsigned int alloc; >> +}; >> +extern int apply_to_pfn_range(struct pfn_range_apply *closure, >> + unsigned long address, unsigned long size); >> >> #ifdef CONFIG_PAGE_POISONING >> extern bool page_poisoning_enabled(void); >> diff --git a/mm/memory.c b/mm/memory.c >> index 168f546af1ad..462aa47f8878 100644 >> --- a/mm/memory.c >> +++ b/mm/memory.c >> @@ -2032,18 +2032,17 @@ int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long >> } >> EXPORT_SYMBOL(vm_iomap_memory); >> >> -static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, >> - unsigned long addr, unsigned long end, >> - pte_fn_t fn, void *data) >> +static int apply_to_pte_range(struct pfn_range_apply *closure, pmd_t *pmd, >> + unsigned long addr, unsigned long end) >> { >> pte_t *pte; >> int err; >> pgtable_t token; >> spinlock_t *uninitialized_var(ptl); >> >> - pte = (mm == &init_mm) ? >> + pte = (closure->mm == &init_mm) ? >> pte_alloc_kernel(pmd, addr) : >> - pte_alloc_map_lock(mm, pmd, addr, &ptl); >> + pte_alloc_map_lock(closure->mm, pmd, addr, &ptl); >> if (!pte) >> return -ENOMEM; >> >> @@ -2054,86 +2053,109 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, >> token = pmd_pgtable(*pmd); >> >> do { >> - err = fn(pte++, token, addr, data); >> + err = closure->ptefn(pte++, token, addr, closure); >> if (err) >> break; >> } while (addr += PAGE_SIZE, addr != end); >> >> arch_leave_lazy_mmu_mode(); >> >> - if (mm != &init_mm) >> + if (closure->mm != &init_mm) >> pte_unmap_unlock(pte-1, ptl); >> return err; >> } >> >> -static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, >> - unsigned long addr, unsigned long end, >> - pte_fn_t fn, void *data) >> +static int apply_to_pmd_range(struct pfn_range_apply *closure, pud_t *pud, >> + unsigned long addr, unsigned long end) >> { >> pmd_t *pmd; >> unsigned long next; >> - int err; >> + int err = 0; >> >> BUG_ON(pud_huge(*pud)); >> >> - pmd = pmd_alloc(mm, pud, addr); >> + pmd = pmd_alloc(closure->mm, pud, addr); >> if (!pmd) >> return -ENOMEM; >> + >> do { >> next = pmd_addr_end(addr, end); >> - err = apply_to_pte_range(mm, pmd, addr, next, fn, data); >> + if (!closure->alloc && pmd_none_or_clear_bad(pmd)) >> + continue; >> + err = apply_to_pte_range(closure, pmd, addr, next); >> if (err) >> break; >> } while (pmd++, addr = next, addr != end); >> return err; >> } >> >> -static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d, >> - unsigned long addr, unsigned long end, >> - pte_fn_t fn, void *data) >> +static int apply_to_pud_range(struct pfn_range_apply *closure, p4d_t *p4d, >> + unsigned long addr, unsigned long end) >> { >> pud_t *pud; >> unsigned long next; >> - int err; >> + int err = 0; >> >> - pud = pud_alloc(mm, p4d, addr); >> + pud = pud_alloc(closure->mm, p4d, addr); >> if (!pud) >> return -ENOMEM; >> + >> do { >> next = pud_addr_end(addr, end); >> - err = apply_to_pmd_range(mm, pud, addr, next, fn, data); >> + if (!closure->alloc && pud_none_or_clear_bad(pud)) >> + continue; >> + err = apply_to_pmd_range(closure, pud, addr, next); >> if (err) >> break; >> } while (pud++, addr = next, addr != end); >> return err; >> } >> >> -static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd, >> - unsigned long addr, unsigned long end, >> - pte_fn_t fn, void *data) >> +static int apply_to_p4d_range(struct pfn_range_apply *closure, pgd_t *pgd, >> + unsigned long addr, unsigned long end) >> { >> p4d_t *p4d; >> unsigned long next; >> - int err; >> + int err = 0; >> >> - p4d = p4d_alloc(mm, pgd, addr); >> + p4d = p4d_alloc(closure->mm, pgd, addr); >> if (!p4d) >> return -ENOMEM; >> + >> do { >> next = p4d_addr_end(addr, end); >> - err = apply_to_pud_range(mm, p4d, addr, next, fn, data); >> + if (!closure->alloc && p4d_none_or_clear_bad(p4d)) >> + continue; >> + err = apply_to_pud_range(closure, p4d, addr, next); >> if (err) >> break; >> } while (p4d++, addr = next, addr != end); >> return err; >> } >> >> -/* >> - * Scan a region of virtual memory, filling in page tables as necessary >> - * and calling a provided function on each leaf page table. >> +/** >> + * apply_to_pfn_range - Scan a region of virtual memory, calling a provided >> + * function on each leaf page table entry >> + * @closure: Details about how to scan and what function to apply >> + * @addr: Start virtual address >> + * @size: Size of the region >> + * >> + * If @closure->alloc is set to 1, the function will fill in the page table >> + * as necessary. Otherwise it will skip non-present parts. >> + * Note: The caller must ensure that the range does not contain huge pages. >> + * The caller must also assure that the proper mmu_notifier functions are >> + * called before and after the call to apply_to_pfn_range. >> + * >> + * WARNING: Do not use this function unless you know exactly what you are >> + * doing. It is lacking support for huge pages and transparent huge pages. >> + * >> + * Return: Zero on success. If the provided function returns a non-zero status, >> + * the page table walk will terminate and that status will be returned. >> + * If @closure->alloc is set to 1, then this function may also return memory >> + * allocation errors arising from allocating page table memory. >> */ >> -int apply_to_page_range(struct mm_struct *mm, unsigned long addr, >> - unsigned long size, pte_fn_t fn, void *data) >> +int apply_to_pfn_range(struct pfn_range_apply *closure, >> + unsigned long addr, unsigned long size) >> { >> pgd_t *pgd; >> unsigned long next; >> @@ -2143,16 +2165,65 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, >> if (WARN_ON(addr >= end)) >> return -EINVAL; >> >> - pgd = pgd_offset(mm, addr); >> + pgd = pgd_offset(closure->mm, addr); >> do { >> next = pgd_addr_end(addr, end); >> - err = apply_to_p4d_range(mm, pgd, addr, next, fn, data); >> + if (!closure->alloc && pgd_none_or_clear_bad(pgd)) >> + continue; >> + err = apply_to_p4d_range(closure, pgd, addr, next); >> if (err) >> break; >> } while (pgd++, addr = next, addr != end); >> >> return err; >> } >> + >> +/** >> + * struct page_range_apply - Closure structure for apply_to_page_range() >> + * @pter: The base closure structure we derive from >> + * @fn: The leaf pte function to call >> + * @data: The leaf pte function closure >> + */ >> +struct page_range_apply { >> + struct pfn_range_apply pter; >> + pte_fn_t fn; >> + void *data; >> +}; >> + >> +/* >> + * Callback wrapper to enable use of apply_to_pfn_range for >> + * the apply_to_page_range interface >> + */ >> +static int apply_to_page_range_wrapper(pte_t *pte, pgtable_t token, >> + unsigned long addr, >> + struct pfn_range_apply *pter) >> +{ >> + struct page_range_apply *pra = >> + container_of(pter, typeof(*pra), pter); >> + >> + return pra->fn(pte, token, addr, pra->data); >> +} >> + >> +/* >> + * Scan a region of virtual memory, filling in page tables as necessary >> + * and calling a provided function on each leaf page table. >> + * >> + * WARNING: Do not use this function unless you know exactly what you are >> + * doing. It is lacking support for huge pages and transparent huge pages. >> + */ >> +int apply_to_page_range(struct mm_struct *mm, unsigned long addr, >> + unsigned long size, pte_fn_t fn, void *data) >> +{ >> + struct page_range_apply pra = { >> + .pter = {.mm = mm, >> + .alloc = 1, >> + .ptefn = apply_to_page_range_wrapper }, >> + .fn = fn, >> + .data = data >> + }; >> + >> + return apply_to_pfn_range(&pra.pter, addr, size); >> +} >> >> EXPORT_SYMBOL_GPL(apply_to_page_range); > Actually - did you look into converting our two hand full of > apply_to_page_range callers to your new scheme? It seems like that > might actually not be to bad and avoid various layers of wrappers. Yes, I had that in mind once this landed and got some serious testing. /Thomas
diff --git a/include/linux/mm.h b/include/linux/mm.h index 0e8834ac32b7..3d06ce2a64af 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2675,6 +2675,16 @@ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); +struct pfn_range_apply; +typedef int (*pter_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, + struct pfn_range_apply *closure); +struct pfn_range_apply { + struct mm_struct *mm; + pter_fn_t ptefn; + unsigned int alloc; +}; +extern int apply_to_pfn_range(struct pfn_range_apply *closure, + unsigned long address, unsigned long size); #ifdef CONFIG_PAGE_POISONING extern bool page_poisoning_enabled(void); diff --git a/mm/memory.c b/mm/memory.c index 168f546af1ad..462aa47f8878 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2032,18 +2032,17 @@ int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long } EXPORT_SYMBOL(vm_iomap_memory); -static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, - unsigned long addr, unsigned long end, - pte_fn_t fn, void *data) +static int apply_to_pte_range(struct pfn_range_apply *closure, pmd_t *pmd, + unsigned long addr, unsigned long end) { pte_t *pte; int err; pgtable_t token; spinlock_t *uninitialized_var(ptl); - pte = (mm == &init_mm) ? + pte = (closure->mm == &init_mm) ? pte_alloc_kernel(pmd, addr) : - pte_alloc_map_lock(mm, pmd, addr, &ptl); + pte_alloc_map_lock(closure->mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; @@ -2054,86 +2053,109 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, token = pmd_pgtable(*pmd); do { - err = fn(pte++, token, addr, data); + err = closure->ptefn(pte++, token, addr, closure); if (err) break; } while (addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); - if (mm != &init_mm) + if (closure->mm != &init_mm) pte_unmap_unlock(pte-1, ptl); return err; } -static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, - unsigned long addr, unsigned long end, - pte_fn_t fn, void *data) +static int apply_to_pmd_range(struct pfn_range_apply *closure, pud_t *pud, + unsigned long addr, unsigned long end) { pmd_t *pmd; unsigned long next; - int err; + int err = 0; BUG_ON(pud_huge(*pud)); - pmd = pmd_alloc(mm, pud, addr); + pmd = pmd_alloc(closure->mm, pud, addr); if (!pmd) return -ENOMEM; + do { next = pmd_addr_end(addr, end); - err = apply_to_pte_range(mm, pmd, addr, next, fn, data); + if (!closure->alloc && pmd_none_or_clear_bad(pmd)) + continue; + err = apply_to_pte_range(closure, pmd, addr, next); if (err) break; } while (pmd++, addr = next, addr != end); return err; } -static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d, - unsigned long addr, unsigned long end, - pte_fn_t fn, void *data) +static int apply_to_pud_range(struct pfn_range_apply *closure, p4d_t *p4d, + unsigned long addr, unsigned long end) { pud_t *pud; unsigned long next; - int err; + int err = 0; - pud = pud_alloc(mm, p4d, addr); + pud = pud_alloc(closure->mm, p4d, addr); if (!pud) return -ENOMEM; + do { next = pud_addr_end(addr, end); - err = apply_to_pmd_range(mm, pud, addr, next, fn, data); + if (!closure->alloc && pud_none_or_clear_bad(pud)) + continue; + err = apply_to_pmd_range(closure, pud, addr, next); if (err) break; } while (pud++, addr = next, addr != end); return err; } -static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd, - unsigned long addr, unsigned long end, - pte_fn_t fn, void *data) +static int apply_to_p4d_range(struct pfn_range_apply *closure, pgd_t *pgd, + unsigned long addr, unsigned long end) { p4d_t *p4d; unsigned long next; - int err; + int err = 0; - p4d = p4d_alloc(mm, pgd, addr); + p4d = p4d_alloc(closure->mm, pgd, addr); if (!p4d) return -ENOMEM; + do { next = p4d_addr_end(addr, end); - err = apply_to_pud_range(mm, p4d, addr, next, fn, data); + if (!closure->alloc && p4d_none_or_clear_bad(p4d)) + continue; + err = apply_to_pud_range(closure, p4d, addr, next); if (err) break; } while (p4d++, addr = next, addr != end); return err; } -/* - * Scan a region of virtual memory, filling in page tables as necessary - * and calling a provided function on each leaf page table. +/** + * apply_to_pfn_range - Scan a region of virtual memory, calling a provided + * function on each leaf page table entry + * @closure: Details about how to scan and what function to apply + * @addr: Start virtual address + * @size: Size of the region + * + * If @closure->alloc is set to 1, the function will fill in the page table + * as necessary. Otherwise it will skip non-present parts. + * Note: The caller must ensure that the range does not contain huge pages. + * The caller must also assure that the proper mmu_notifier functions are + * called before and after the call to apply_to_pfn_range. + * + * WARNING: Do not use this function unless you know exactly what you are + * doing. It is lacking support for huge pages and transparent huge pages. + * + * Return: Zero on success. If the provided function returns a non-zero status, + * the page table walk will terminate and that status will be returned. + * If @closure->alloc is set to 1, then this function may also return memory + * allocation errors arising from allocating page table memory. */ -int apply_to_page_range(struct mm_struct *mm, unsigned long addr, - unsigned long size, pte_fn_t fn, void *data) +int apply_to_pfn_range(struct pfn_range_apply *closure, + unsigned long addr, unsigned long size) { pgd_t *pgd; unsigned long next; @@ -2143,16 +2165,65 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, if (WARN_ON(addr >= end)) return -EINVAL; - pgd = pgd_offset(mm, addr); + pgd = pgd_offset(closure->mm, addr); do { next = pgd_addr_end(addr, end); - err = apply_to_p4d_range(mm, pgd, addr, next, fn, data); + if (!closure->alloc && pgd_none_or_clear_bad(pgd)) + continue; + err = apply_to_p4d_range(closure, pgd, addr, next); if (err) break; } while (pgd++, addr = next, addr != end); return err; } + +/** + * struct page_range_apply - Closure structure for apply_to_page_range() + * @pter: The base closure structure we derive from + * @fn: The leaf pte function to call + * @data: The leaf pte function closure + */ +struct page_range_apply { + struct pfn_range_apply pter; + pte_fn_t fn; + void *data; +}; + +/* + * Callback wrapper to enable use of apply_to_pfn_range for + * the apply_to_page_range interface + */ +static int apply_to_page_range_wrapper(pte_t *pte, pgtable_t token, + unsigned long addr, + struct pfn_range_apply *pter) +{ + struct page_range_apply *pra = + container_of(pter, typeof(*pra), pter); + + return pra->fn(pte, token, addr, pra->data); +} + +/* + * Scan a region of virtual memory, filling in page tables as necessary + * and calling a provided function on each leaf page table. + * + * WARNING: Do not use this function unless you know exactly what you are + * doing. It is lacking support for huge pages and transparent huge pages. + */ +int apply_to_page_range(struct mm_struct *mm, unsigned long addr, + unsigned long size, pte_fn_t fn, void *data) +{ + struct page_range_apply pra = { + .pter = {.mm = mm, + .alloc = 1, + .ptefn = apply_to_page_range_wrapper }, + .fn = fn, + .data = data + }; + + return apply_to_pfn_range(&pra.pter, addr, size); +} EXPORT_SYMBOL_GPL(apply_to_page_range); /*