diff mbox series

[05/45] mm/pagewalk: Enable walk_pmd_range to handle cont-pmds

Message ID 20240704043132.28501-6-osalvador@suse.de (mailing list archive)
State New
Headers show
Series hugetlb pagewalk unification | expand

Commit Message

Oscar Salvador July 4, 2024, 4:30 a.m. UTC
HugeTLB pages can be cont-pmd mapped, so teach walk_pmd_range to
handle those.
This will save us some cycles, as we handle them in one shot instead of
calling in multiple times.

Signed-off-by: Oscar Salvador <osalvador@suse.de>
---
 include/linux/pgtable.h | 12 ++++++++++++
 mm/pagewalk.c           | 12 +++++++++---
 2 files changed, 21 insertions(+), 3 deletions(-)

Comments

David Hildenbrand July 4, 2024, 3:41 p.m. UTC | #1
On 04.07.24 06:30, Oscar Salvador wrote:
> HugeTLB pages can be cont-pmd mapped, so teach walk_pmd_range to
> handle those.
> This will save us some cycles as we do it in one-shot instead of
> calling in multiple times.
> 
> Signed-off-by: Oscar Salvador <osalvador@suse.de>
> ---
>   include/linux/pgtable.h | 12 ++++++++++++
>   mm/pagewalk.c           | 12 +++++++++---
>   2 files changed, 21 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> index 2a6a3cccfc36..3a7b8751747e 100644
> --- a/include/linux/pgtable.h
> +++ b/include/linux/pgtable.h
> @@ -1914,6 +1914,18 @@ typedef unsigned int pgtbl_mod_mask;
>   #define __pte_leaf_size(x,y) pte_leaf_size(y)
>   #endif
>   
> +#ifndef pmd_cont
> +#define pmd_cont(x) false
> +#endif
> +
> +#ifndef CONT_PMD_SIZE
> +#define CONT_PMD_SIZE 0
> +#endif
> +
> +#ifndef CONT_PMDS
> +#define CONT_PMDS 0
> +#endif
> +
>   /*
>    * We always define pmd_pfn for all archs as it's used in lots of generic
>    * code.  Now it happens too for pud_pfn (and can happen for larger
> diff --git a/mm/pagewalk.c b/mm/pagewalk.c
> index d93e77411482..a9c36f9e9820 100644
> --- a/mm/pagewalk.c
> +++ b/mm/pagewalk.c
> @@ -81,11 +81,18 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
>   	const struct mm_walk_ops *ops = walk->ops;
>   	int err = 0;
>   	int depth = real_depth(3);
> +	int cont_pmds;
>   
>   	pmd = pmd_offset(pud, addr);
>   	do {
>   again:
> -		next = pmd_addr_end(addr, end);
> +		if (pmd_cont(*pmd)) {
> +			cont_pmds = CONT_PMDS;
> +			next = pmd_cont_addr_end(addr, end);
> +		} else {
> +			cont_pmds = 1;
> +			next = pmd_addr_end(addr, end);
> +		}
>   		if (pmd_none(*pmd)) {
>   			if (ops->pte_hole)
>   				err = ops->pte_hole(addr, next, depth, walk);
> @@ -126,8 +133,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
>   
>   		if (walk->action == ACTION_AGAIN)
>   			goto again;
> -
> -	} while (pmd++, addr = next, addr != end);
> +	} while (pmd += cont_pmds, addr = next, addr != end);

Similar to my other comment regarding PTE batching, this is very 
specific to architectures that support cont-pmds.

Yes, right now we only have that on architectures that support 
cont-pmd-sized hugetlb, but Willy is interested in us supporting+mapping 
folios > PMD_SIZE, whereby we'd want to batch even without arch-specific 
cont-pmd bits.

Similar to the other (pte) case, having a way to generically batch 
folios will be more beneficial. Note that cont-pmd/cont-pte is only 
relevant for present entries (-> mapping folios).
kernel test robot July 5, 2024, 4:56 p.m. UTC | #2
Hi Oscar,

kernel test robot noticed the following build errors:

[auto build test ERROR on akpm-mm/mm-everything]
[also build test ERROR on sj/damon/next powerpc/next powerpc/fixes linus/master v6.10-rc6 next-20240703]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting a patch, we suggest using '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Oscar-Salvador/arch-x86-Drop-own-definition-of-pgd-p4d_leaf/20240705-042640
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/20240704043132.28501-6-osalvador%40suse.de
patch subject: [PATCH 05/45] mm/pagewalk: Enable walk_pmd_range to handle cont-pmds
config: um-allnoconfig (https://download.01.org/0day-ci/archive/20240706/202407060025.WIFWw7WY-lkp@intel.com/config)
compiler: clang version 17.0.6 (https://github.com/llvm/llvm-project 6009708b4367171ccdbf4b5905cb6a803753fe18)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240706/202407060025.WIFWw7WY-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add the following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202407060025.WIFWw7WY-lkp@intel.com/

All errors (new ones prefixed by >>):

   In file included from mm/pagewalk.c:3:
   In file included from include/linux/highmem.h:12:
   In file included from include/linux/hardirq.h:11:
   In file included from arch/um/include/asm/hardirq.h:5:
   In file included from include/asm-generic/hardirq.h:17:
   In file included from include/linux/irq.h:20:
   In file included from include/linux/io.h:14:
   In file included from arch/um/include/asm/io.h:24:
   include/asm-generic/io.h:548:31: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     548 |         val = __raw_readb(PCI_IOBASE + addr);
         |                           ~~~~~~~~~~ ^
   include/asm-generic/io.h:561:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     561 |         val = __le16_to_cpu((__le16 __force)__raw_readw(PCI_IOBASE + addr));
         |                                                         ~~~~~~~~~~ ^
   include/uapi/linux/byteorder/little_endian.h:37:51: note: expanded from macro '__le16_to_cpu'
      37 | #define __le16_to_cpu(x) ((__force __u16)(__le16)(x))
         |                                                   ^
   In file included from mm/pagewalk.c:3:
   In file included from include/linux/highmem.h:12:
   In file included from include/linux/hardirq.h:11:
   In file included from arch/um/include/asm/hardirq.h:5:
   In file included from include/asm-generic/hardirq.h:17:
   In file included from include/linux/irq.h:20:
   In file included from include/linux/io.h:14:
   In file included from arch/um/include/asm/io.h:24:
   include/asm-generic/io.h:574:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     574 |         val = __le32_to_cpu((__le32 __force)__raw_readl(PCI_IOBASE + addr));
         |                                                         ~~~~~~~~~~ ^
   include/uapi/linux/byteorder/little_endian.h:35:51: note: expanded from macro '__le32_to_cpu'
      35 | #define __le32_to_cpu(x) ((__force __u32)(__le32)(x))
         |                                                   ^
   In file included from mm/pagewalk.c:3:
   In file included from include/linux/highmem.h:12:
   In file included from include/linux/hardirq.h:11:
   In file included from arch/um/include/asm/hardirq.h:5:
   In file included from include/asm-generic/hardirq.h:17:
   In file included from include/linux/irq.h:20:
   In file included from include/linux/io.h:14:
   In file included from arch/um/include/asm/io.h:24:
   include/asm-generic/io.h:585:33: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     585 |         __raw_writeb(value, PCI_IOBASE + addr);
         |                             ~~~~~~~~~~ ^
   include/asm-generic/io.h:595:59: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     595 |         __raw_writew((u16 __force)cpu_to_le16(value), PCI_IOBASE + addr);
         |                                                       ~~~~~~~~~~ ^
   include/asm-generic/io.h:605:59: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     605 |         __raw_writel((u32 __force)cpu_to_le32(value), PCI_IOBASE + addr);
         |                                                       ~~~~~~~~~~ ^
   include/asm-generic/io.h:693:20: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     693 |         readsb(PCI_IOBASE + addr, buffer, count);
         |                ~~~~~~~~~~ ^
   include/asm-generic/io.h:701:20: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     701 |         readsw(PCI_IOBASE + addr, buffer, count);
         |                ~~~~~~~~~~ ^
   include/asm-generic/io.h:709:20: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     709 |         readsl(PCI_IOBASE + addr, buffer, count);
         |                ~~~~~~~~~~ ^
   include/asm-generic/io.h:718:21: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     718 |         writesb(PCI_IOBASE + addr, buffer, count);
         |                 ~~~~~~~~~~ ^
   include/asm-generic/io.h:727:21: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     727 |         writesw(PCI_IOBASE + addr, buffer, count);
         |                 ~~~~~~~~~~ ^
   include/asm-generic/io.h:736:21: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     736 |         writesl(PCI_IOBASE + addr, buffer, count);
         |                 ~~~~~~~~~~ ^
>> mm/pagewalk.c:91:11: error: call to undeclared function 'pmd_cont_addr_end'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
      91 |                         next = pmd_cont_addr_end(addr, end);
         |                                ^
   12 warnings and 1 error generated.


vim +/pmd_cont_addr_end +91 mm/pagewalk.c

    75	
    76	static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
    77				  struct mm_walk *walk)
    78	{
    79		pmd_t *pmd;
    80		unsigned long next;
    81		const struct mm_walk_ops *ops = walk->ops;
    82		int err = 0;
    83		int depth = real_depth(3);
    84		int cont_pmds;
    85	
    86		pmd = pmd_offset(pud, addr);
    87		do {
    88	again:
    89			if (pmd_cont(*pmd)) {
    90				cont_pmds = CONT_PMDS;
  > 91				next = pmd_cont_addr_end(addr, end);
    92			} else {
    93				cont_pmds = 1;
    94				next = pmd_addr_end(addr, end);
    95			}
    96			if (pmd_none(*pmd)) {
    97				if (ops->pte_hole)
    98					err = ops->pte_hole(addr, next, depth, walk);
    99				if (err)
   100					break;
   101				continue;
   102			}
   103	
   104			walk->action = ACTION_SUBTREE;
   105	
   106			/*
   107			 * This implies that each ->pmd_entry() handler
   108			 * needs to know about pmd_trans_huge() pmds
   109			 */
   110			if (ops->pmd_entry)
   111				err = ops->pmd_entry(pmd, addr, next, walk);
   112			if (err)
   113				break;
   114	
   115			if (walk->action == ACTION_AGAIN)
   116				goto again;
   117	
   118			/*
   119			 * Check this here so we only break down trans_huge
   120			 * pages when we _need_ to
   121			 */
   122			if ((!walk->vma && (pmd_leaf(*pmd) || !pmd_present(*pmd))) ||
   123			    walk->action == ACTION_CONTINUE ||
   124			    !(ops->pte_entry))
   125				continue;
   126	
   127			if (walk->vma)
   128				split_huge_pmd(walk->vma, pmd, addr);
   129	
   130			err = walk_pte_range(pmd, addr, next, walk);
   131			if (err)
   132				break;
   133	
   134			if (walk->action == ACTION_AGAIN)
   135				goto again;
   136		} while (pmd += cont_pmds, addr = next, addr != end);
   137	
   138		return err;
   139	}
   140
diff mbox series

Patch

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 2a6a3cccfc36..3a7b8751747e 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1914,6 +1914,18 @@  typedef unsigned int pgtbl_mod_mask;
 #define __pte_leaf_size(x,y) pte_leaf_size(y)
 #endif
 
+#ifndef pmd_cont
+#define pmd_cont(x) false
+#endif
+
+#ifndef CONT_PMD_SIZE
+#define CONT_PMD_SIZE 0
+#endif
+
+#ifndef CONT_PMDS
+#define CONT_PMDS 0
+#endif
+
 /*
  * We always define pmd_pfn for all archs as it's used in lots of generic
  * code.  Now it happens too for pud_pfn (and can happen for larger
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index d93e77411482..a9c36f9e9820 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -81,11 +81,18 @@  static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
 	const struct mm_walk_ops *ops = walk->ops;
 	int err = 0;
 	int depth = real_depth(3);
+	int cont_pmds;
 
 	pmd = pmd_offset(pud, addr);
 	do {
 again:
-		next = pmd_addr_end(addr, end);
+		if (pmd_cont(*pmd)) {
+			cont_pmds = CONT_PMDS;
+			next = pmd_cont_addr_end(addr, end);
+		} else {
+			cont_pmds = 1;
+			next = pmd_addr_end(addr, end);
+		}
 		if (pmd_none(*pmd)) {
 			if (ops->pte_hole)
 				err = ops->pte_hole(addr, next, depth, walk);
@@ -126,8 +133,7 @@  static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
 
 		if (walk->action == ACTION_AGAIN)
 			goto again;
-
-	} while (pmd++, addr = next, addr != end);
+	} while (pmd += cont_pmds, addr = next, addr != end);
 
 	return err;
 }