diff mbox series

[06/13] mm: pagewalk: Add 'depth' parameter to pte_hole

Message ID 20190215170235.23360-7-steven.price@arm.com (mailing list archive)
State New, archived
Headers show
Series Convert x86 & arm64 to use generic page walk | expand

Commit Message

Steven Price Feb. 15, 2019, 5:02 p.m. UTC
The pte_hole() callback is called at multiple levels of the page tables.
Code dumping the kernel page tables needs to know at what depth
the missing entry is. Add this as an extra parameter to pte_hole().
When the depth isn't known (e.g. processing a vma) then -1 is passed.

Note that depth starts at 0 for a PGD so that PUD/PMD/PTE retain their
natural numbers as levels 2/3/4.

Signed-off-by: Steven Price <steven.price@arm.com>
---
 fs/proc/task_mmu.c |  4 ++--
 include/linux/mm.h |  5 +++--
 mm/hmm.c           |  2 +-
 mm/migrate.c       |  1 +
 mm/mincore.c       |  1 +
 mm/pagewalk.c      | 16 ++++++++++------
 6 files changed, 18 insertions(+), 11 deletions(-)

Comments

Mark Rutland Feb. 18, 2019, 11:23 a.m. UTC | #1
On Fri, Feb 15, 2019 at 05:02:27PM +0000, Steven Price wrote:
> The pte_hole() callback is called at multiple levels of the page tables.
> Code dumping the kernel page tables needs to know at what depth
> the missing entry is. Add this as an extra parameter to pte_hole().
> When the depth isn't known (e.g. processing a vma) then -1 is passed.
>
> Note that depth starts at 0 for a PGD so that PUD/PMD/PTE retain their
> natural numbers as levels 2/3/4.
>
> Signed-off-by: Steven Price <steven.price@arm.com>
> ---
>  fs/proc/task_mmu.c |  4 ++--
>  include/linux/mm.h |  5 +++--
>  mm/hmm.c           |  2 +-
>  mm/migrate.c       |  1 +
>  mm/mincore.c       |  1 +
>  mm/pagewalk.c      | 16 ++++++++++------
>  6 files changed, 18 insertions(+), 11 deletions(-)
>
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index f0ec9edab2f3..91131cd4e9e0 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -474,7 +474,7 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
>
>  #ifdef CONFIG_SHMEM
>  static int smaps_pte_hole(unsigned long addr, unsigned long end,
> -struct mm_walk *walk)
> +  __always_unused int depth, struct mm_walk *walk)
>  {
>  struct mem_size_stats *mss = walk->private;
>
> @@ -1203,7 +1203,7 @@ static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
>  }
>
>  static int pagemap_pte_hole(unsigned long start, unsigned long end,
> -struct mm_walk *walk)
> +    __always_unused int depth, struct mm_walk *walk)
>  {
>  struct pagemapread *pm = walk->private;
>  unsigned long addr = start;
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 1a4b1615d012..0418a018d7b3 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1420,7 +1420,8 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
>   *       pmd_trans_huge() pmds.  They may simply choose to
>   *       split_huge_page() instead of handling it explicitly.
>   * @pte_entry: if set, called for each non-empty PTE (4th-level) entry
> - * @pte_hole: if set, called for each hole at all levels
> + * @pte_hole: if set, called for each hole at all levels,
> + *            depth is -1 if not known
>   * @hugetlb_entry: if set, called for each hugetlb entry
>   * @test_walk: caller specific callback function to determine whether
>   *             we walk over the current vma or not. Returning 0
> @@ -1445,7 +1446,7 @@ struct mm_walk {
>  int (*pte_entry)(pte_t *pte, unsigned long addr,
>   unsigned long next, struct mm_walk *walk);
>  int (*pte_hole)(unsigned long addr, unsigned long next,
> -struct mm_walk *walk);
> +int depth, struct mm_walk *walk);
>  int (*hugetlb_entry)(pte_t *pte, unsigned long hmask,
>       unsigned long addr, unsigned long next,
>       struct mm_walk *walk);
> diff --git a/mm/hmm.c b/mm/hmm.c
> index a04e4b810610..e3e6b8fda437 100644
> --- a/mm/hmm.c
> +++ b/mm/hmm.c
> @@ -440,7 +440,7 @@ static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
>  }
>
>  static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
> -     struct mm_walk *walk)
> +     __always_unused int depth, struct mm_walk *walk)
>  {
>  struct hmm_vma_walk *hmm_vma_walk = walk->private;
>  struct hmm_range *range = hmm_vma_walk->range;
> diff --git a/mm/migrate.c b/mm/migrate.c
> index d4fd680be3b0..8b62a9fecb5c 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -2121,6 +2121,7 @@ struct migrate_vma {
>
>  static int migrate_vma_collect_hole(unsigned long start,
>      unsigned long end,
> +    __always_unused int depth,
>      struct mm_walk *walk)
>  {
>  struct migrate_vma *migrate = walk->private;
> diff --git a/mm/mincore.c b/mm/mincore.c
> index 218099b5ed31..c4edbc688241 100644
> --- a/mm/mincore.c
> +++ b/mm/mincore.c
> @@ -104,6 +104,7 @@ static int __mincore_unmapped_range(unsigned long addr, unsigned long end,
>  }
>
>  static int mincore_unmapped_range(unsigned long addr, unsigned long end,
> +   __always_unused int depth,
>     struct mm_walk *walk)
>  {
>  walk->private += __mincore_unmapped_range(addr, end,
> diff --git a/mm/pagewalk.c b/mm/pagewalk.c
> index dac0c848b458..b8038f852f06 100644
> --- a/mm/pagewalk.c
> +++ b/mm/pagewalk.c
> @@ -38,7 +38,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
>  next = pmd_addr_end(addr, end);
>  if (pmd_none(*pmd)) {
>  if (walk->pte_hole)
> -err = walk->pte_hole(addr, next, walk);
> +err = walk->pte_hole(addr, next, 3, walk);
>  if (err)
>  break;
>  continue;
> @@ -88,7 +88,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
>  next = pud_addr_end(addr, end);
>  if (pud_none(*pud)) {
>  if (walk->pte_hole)
> -err = walk->pte_hole(addr, next, walk);
> +err = walk->pte_hole(addr, next, 2, walk);
>  if (err)
>  break;
>  continue;
> @@ -123,13 +123,17 @@ static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
>  p4d_t *p4d;
>  unsigned long next;
>  int err = 0;
> +/* If the p4ds are actually just pgds then we should report a depth
> + * of 0 not 1 (as a missing entry is really a missing pgd
> + */

Nit: comment style violation. This should be:

/*
 * If the p4ds are actually just pgds then we should report a depth
 * of 0 not 1 (as a missing entry is really a missing pgd)
 */

> +int depth = (PTRS_PER_P4D == 1)?0:1;

Nit: the ternary should have spacing.

We don't seem to do this at any other level that could be folded, so why
does p4d need special care?

For example, what happens on arm64 when using 64K pages and 3 level
paging, where puds are folded into pgds?

Thanks,
Mark.

>
>  p4d = p4d_offset(pgd, addr);
>  do {
>  next = p4d_addr_end(addr, end);
>  if (p4d_none_or_clear_bad(p4d)) {
>  if (walk->pte_hole)
> -err = walk->pte_hole(addr, next, walk);
> +err = walk->pte_hole(addr, next, depth, walk);
>  if (err)
>  break;
>  continue;
> @@ -160,7 +164,7 @@ static int walk_pgd_range(unsigned long addr, unsigned long end,
>  next = pgd_addr_end(addr, end);
>  if (pgd_none_or_clear_bad(pgd)) {
>  if (walk->pte_hole)
> -err = walk->pte_hole(addr, next, walk);
> +err = walk->pte_hole(addr, next, 0, walk);
>  if (err)
>  break;
>  continue;
> @@ -206,7 +210,7 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end,
>  if (pte)
>  err = walk->hugetlb_entry(pte, hmask, addr, next, walk);
>  else if (walk->pte_hole)
> -err = walk->pte_hole(addr, next, walk);
> +err = walk->pte_hole(addr, next, -1, walk);
>
>  if (err)
>  break;
> @@ -249,7 +253,7 @@ static int walk_page_test(unsigned long start, unsigned long end,
>  if (vma->vm_flags & VM_PFNMAP) {
>  int err = 1;
>  if (walk->pte_hole)
> -err = walk->pte_hole(start, end, walk);
> +err = walk->pte_hole(start, end, -1, walk);
>  return err ? err : 1;
>  }
>  return 0;
> --
> 2.20.1
>
IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you.
Steven Price Feb. 18, 2019, 3:23 p.m. UTC | #2
On 18/02/2019 11:23, Mark Rutland wrote:
> On Fri, Feb 15, 2019 at 05:02:27PM +0000, Steven Price wrote:
>> +/* If the p4ds are actually just pgds then we should report a depth
>> + * of 0 not 1 (as a missing entry is really a missing pgd
>> + */
> 
> Nit: comment style violation. This should look like:
> should be:
> 
> /*
>  * If the p4ds are actually just pgds then we should report a depth
>  * of 0 not 1 (as a missing entry is really a missing pgd
>  */
> 
>> +int depth = (PTRS_PER_P4D == 1)?0:1;
> 
> Nit: the ternary should have spacing.
> 
> We don't seem to do this at any other level that could be folded, so why
> does p4d need special care?
> 
> For example, what happens on arm64 when using 64K pages and 3 level
> paging, where puds are folded into pgds?
> 
> Thanks,
> Mark.

Yes, you are entirely correct; I've missed the other potential foldings.
I somehow imagined that p4d was special and was folded in the opposite
direction (I'm not sure why!).

The best solution I can come up with is a function which will convert
from the level the entry is found at, back to the 'real' level the entry
was missing at. This is needed to produce the correct output in the
debugfs file. Something like:

/*
 * Translate the level at which a hole was found into the level at which
 * the entry is really missing.
 *
 * A level whose PTRS_PER_* count is 1 is treated as folded into the level
 * above it, so a hole found there is reported one level up.  The check
 * cascades: a value that has just been moved up is tested again against
 * the next level, until a level with more than one entry (or depth 0) is
 * reached.  Values other than 1, 2 or 3 are returned unchanged.
 */
static int real_depth(int depth)
{
	switch (depth) {
	case 3:
		if (PTRS_PER_PMD != 1)
			break;
		depth = 2;
		/* fall through */
	case 2:
		if (PTRS_PER_PUD != 1)
			break;
		depth = 1;
		/* fall through */
	case 1:
		if (PTRS_PER_P4D != 1)
			break;
		depth = 0;
		break;
	default:
		break;
	}

	return depth;
}

This should of course get folded by the compiler and not actually
generate any code.

Thanks,

Steve
William Kucharski Feb. 20, 2019, 11:35 a.m. UTC | #3
> On Feb 15, 2019, at 10:02 AM, Steven Price <Steven.Price@arm.com> wrote:
> 
> The pte_hole() callback is called at multiple levels of the page tables.
> Code dumping the kernel page tables needs to know at what depth
> the missing entry is. Add this as an extra parameter to pte_hole().
> When the depth isn't known (e.g. processing a vma) then -1 is passed.
> 
> Note that depth starts at 0 for a PGD so that PUD/PMD/PTE retain their
> natural numbers as levels 2/3/4.

Nit: Could you add a comment noting this for anyone wondering how to
calculate the level numbers in the future?

Thanks!
Steven Price Feb. 20, 2019, 2:10 p.m. UTC | #4
On 20/02/2019 11:35, William Kucharski wrote:
> 
> 
>> On Feb 15, 2019, at 10:02 AM, Steven Price <Steven.Price@arm.com> wrote:
>>
>> The pte_hole() callback is called at multiple levels of the page tables.
>> Code dumping the kernel page tables needs to know at what depth
>> the missing entry is. Add this as an extra parameter to pte_hole().
>> When the depth isn't known (e.g. processing a vma) then -1 is passed.
>>
>> Note that depth starts at 0 for a PGD so that PUD/PMD/PTE retain their
>> natural numbers as levels 2/3/4.
> 
> Nit: Could you add a comment noting this for anyone wondering how to
> calculate the level numbers in the future?

Good point! I'll expand the comment in the header file.

Thanks,

Steve
diff mbox series

Patch

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index f0ec9edab2f3..91131cd4e9e0 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -474,7 +474,7 @@  static void smaps_account(struct mem_size_stats *mss, struct page *page,
 
 #ifdef CONFIG_SHMEM
 static int smaps_pte_hole(unsigned long addr, unsigned long end,
-		struct mm_walk *walk)
+			  __always_unused int depth, struct mm_walk *walk)
 {
 	struct mem_size_stats *mss = walk->private;
 
@@ -1203,7 +1203,7 @@  static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
 }
 
 static int pagemap_pte_hole(unsigned long start, unsigned long end,
-				struct mm_walk *walk)
+			    __always_unused int depth, struct mm_walk *walk)
 {
 	struct pagemapread *pm = walk->private;
 	unsigned long addr = start;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1a4b1615d012..0418a018d7b3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1420,7 +1420,8 @@  void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
  *	       pmd_trans_huge() pmds.  They may simply choose to
  *	       split_huge_page() instead of handling it explicitly.
  * @pte_entry: if set, called for each non-empty PTE (4th-level) entry
- * @pte_hole: if set, called for each hole at all levels
+ * @pte_hole: if set, called for each hole at all levels,
+ *            depth is -1 if not known
  * @hugetlb_entry: if set, called for each hugetlb entry
  * @test_walk: caller specific callback function to determine whether
  *             we walk over the current vma or not. Returning 0
@@ -1445,7 +1446,7 @@  struct mm_walk {
 	int (*pte_entry)(pte_t *pte, unsigned long addr,
 			 unsigned long next, struct mm_walk *walk);
 	int (*pte_hole)(unsigned long addr, unsigned long next,
-			struct mm_walk *walk);
+			int depth, struct mm_walk *walk);
 	int (*hugetlb_entry)(pte_t *pte, unsigned long hmask,
 			     unsigned long addr, unsigned long next,
 			     struct mm_walk *walk);
diff --git a/mm/hmm.c b/mm/hmm.c
index a04e4b810610..e3e6b8fda437 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -440,7 +440,7 @@  static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
 }
 
 static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
-			     struct mm_walk *walk)
+			     __always_unused int depth, struct mm_walk *walk)
 {
 	struct hmm_vma_walk *hmm_vma_walk = walk->private;
 	struct hmm_range *range = hmm_vma_walk->range;
diff --git a/mm/migrate.c b/mm/migrate.c
index d4fd680be3b0..8b62a9fecb5c 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2121,6 +2121,7 @@  struct migrate_vma {
 
 static int migrate_vma_collect_hole(unsigned long start,
 				    unsigned long end,
+				    __always_unused int depth,
 				    struct mm_walk *walk)
 {
 	struct migrate_vma *migrate = walk->private;
diff --git a/mm/mincore.c b/mm/mincore.c
index 218099b5ed31..c4edbc688241 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -104,6 +104,7 @@  static int __mincore_unmapped_range(unsigned long addr, unsigned long end,
 }
 
 static int mincore_unmapped_range(unsigned long addr, unsigned long end,
+				   __always_unused int depth,
 				   struct mm_walk *walk)
 {
 	walk->private += __mincore_unmapped_range(addr, end,
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index dac0c848b458..b8038f852f06 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -38,7 +38,7 @@  static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
 		next = pmd_addr_end(addr, end);
 		if (pmd_none(*pmd)) {
 			if (walk->pte_hole)
-				err = walk->pte_hole(addr, next, walk);
+				err = walk->pte_hole(addr, next, 3, walk);
 			if (err)
 				break;
 			continue;
@@ -88,7 +88,7 @@  static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
 		next = pud_addr_end(addr, end);
 		if (pud_none(*pud)) {
 			if (walk->pte_hole)
-				err = walk->pte_hole(addr, next, walk);
+				err = walk->pte_hole(addr, next, 2, walk);
 			if (err)
 				break;
 			continue;
@@ -123,13 +123,17 @@  static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
 	p4d_t *p4d;
 	unsigned long next;
 	int err = 0;
+	/* If the p4ds are actually just pgds then we should report a depth
+	 * of 0 not 1 (as a missing entry is really a missing pgd
+	 */
+	int depth = (PTRS_PER_P4D == 1)?0:1;
 
 	p4d = p4d_offset(pgd, addr);
 	do {
 		next = p4d_addr_end(addr, end);
 		if (p4d_none_or_clear_bad(p4d)) {
 			if (walk->pte_hole)
-				err = walk->pte_hole(addr, next, walk);
+				err = walk->pte_hole(addr, next, depth, walk);
 			if (err)
 				break;
 			continue;
@@ -160,7 +164,7 @@  static int walk_pgd_range(unsigned long addr, unsigned long end,
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd)) {
 			if (walk->pte_hole)
-				err = walk->pte_hole(addr, next, walk);
+				err = walk->pte_hole(addr, next, 0, walk);
 			if (err)
 				break;
 			continue;
@@ -206,7 +210,7 @@  static int walk_hugetlb_range(unsigned long addr, unsigned long end,
 		if (pte)
 			err = walk->hugetlb_entry(pte, hmask, addr, next, walk);
 		else if (walk->pte_hole)
-			err = walk->pte_hole(addr, next, walk);
+			err = walk->pte_hole(addr, next, -1, walk);
 
 		if (err)
 			break;
@@ -249,7 +253,7 @@  static int walk_page_test(unsigned long start, unsigned long end,
 	if (vma->vm_flags & VM_PFNMAP) {
 		int err = 1;
 		if (walk->pte_hole)
-			err = walk->pte_hole(start, end, walk);
+			err = walk->pte_hole(start, end, -1, walk);
 		return err ? err : 1;
 	}
 	return 0;