
[10/12] hugetlb: batch PMD split for bulk vmemmap dedup

Message ID 20230825190436.55045-11-mike.kravetz@oracle.com (mailing list archive)
State New
Series Batch hugetlb vmemmap modification operations

Commit Message

Mike Kravetz Aug. 25, 2023, 7:04 p.m. UTC
From: Joao Martins <joao.m.martins@oracle.com>

In an effort to minimize the number of TLB flushes, batch all PMD splits
belonging to a range of pages so that only one (global) TLB flush is
performed. This brings a 1T hugetlb allocation down from 14.2 seconds to
7.9 seconds.

Rebased by Mike Kravetz

Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
---
 mm/hugetlb_vmemmap.c | 94 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 90 insertions(+), 4 deletions(-)

Comments

kernel test robot Aug. 26, 2023, 5:56 a.m. UTC | #1
Hi Mike,

kernel test robot noticed the following build errors:

[auto build test ERROR on next-20230825]
[cannot apply to akpm-mm/mm-everything v6.5-rc7 v6.5-rc6 v6.5-rc5 linus/master v6.5-rc7]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Mike-Kravetz/hugetlb-clear-flags-in-tail-pages-that-will-be-freed-individually/20230826-030805
base:   next-20230825
patch link:    https://lore.kernel.org/r/20230825190436.55045-11-mike.kravetz%40oracle.com
patch subject: [PATCH 10/12] hugetlb: batch PMD split for bulk vmemmap dedup
config: s390-randconfig-001-20230826 (https://download.01.org/0day-ci/archive/20230826/202308261325.ipTttZHZ-lkp@intel.com/config)
compiler: clang version 17.0.0 (https://github.com/llvm/llvm-project.git 4a5ac14ee968ff0ad5d2cc1ffa0299048db4c88a)
reproduce: (https://download.01.org/0day-ci/archive/20230826/202308261325.ipTttZHZ-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202308261325.ipTttZHZ-lkp@intel.com/

All error/warnings (new ones prefixed by >>):

   mm/hugetlb_vmemmap.c:661:6: warning: no previous prototype for function 'hugetlb_vmemmap_optimize_bulk' [-Wmissing-prototypes]
     661 | void hugetlb_vmemmap_optimize_bulk(const struct hstate *h, struct page *head,
         |      ^
   mm/hugetlb_vmemmap.c:661:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
     661 | void hugetlb_vmemmap_optimize_bulk(const struct hstate *h, struct page *head,
         | ^
         | static 
>> mm/hugetlb_vmemmap.c:667:6: warning: no previous prototype for function 'hugetlb_vmemmap_split' [-Wmissing-prototypes]
     667 | void hugetlb_vmemmap_split(const struct hstate *h, struct page *head)
         |      ^
   mm/hugetlb_vmemmap.c:667:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
     667 | void hugetlb_vmemmap_split(const struct hstate *h, struct page *head)
         | ^
         | static 
>> mm/hugetlb_vmemmap.c:698:28: error: use of undeclared identifier 'TLB_FLUSH_ALL'
     698 |         flush_tlb_kernel_range(0, TLB_FLUSH_ALL);
         |                                   ^
   2 warnings and 1 error generated.


vim +/TLB_FLUSH_ALL +698 mm/hugetlb_vmemmap.c

   666	
 > 667	void hugetlb_vmemmap_split(const struct hstate *h, struct page *head)
   668	{
   669		unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
   670		unsigned long vmemmap_reuse;
   671	
   672		if (!vmemmap_should_optimize(h, head))
   673			return;
   674	
   675		static_branch_inc(&hugetlb_optimize_vmemmap_key);
   676	
   677		vmemmap_end     = vmemmap_start + hugetlb_vmemmap_size(h);
   678		vmemmap_reuse   = vmemmap_start;
   679		vmemmap_start   += HUGETLB_VMEMMAP_RESERVE_SIZE;
   680	
   681		/*
   682		 * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
   683		 * to the page which @vmemmap_reuse is mapped to, then free the pages
   684		 * which the range [@vmemmap_start, @vmemmap_end] is mapped to.
   685		 */
   686		if (vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse))
   687			static_branch_dec(&hugetlb_optimize_vmemmap_key);
   688	}
   689	
   690	void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
   691	{
   692		struct folio *folio;
   693		LIST_HEAD(vmemmap_pages);
   694	
   695		list_for_each_entry(folio, folio_list, lru)
   696			hugetlb_vmemmap_split(h, &folio->page);
   697	
 > 698		flush_tlb_kernel_range(0, TLB_FLUSH_ALL);
   699	
   700		list_for_each_entry(folio, folio_list, lru)
   701			hugetlb_vmemmap_optimize_bulk(h, &folio->page, &vmemmap_pages);
   702	
   703		free_vmemmap_page_list(&vmemmap_pages);
   704	}
   705
kernel test robot Aug. 26, 2023, 6:14 p.m. UTC | #2
Hi Mike,

kernel test robot noticed the following build warnings:

[auto build test WARNING on next-20230825]
[cannot apply to akpm-mm/mm-everything v6.5-rc7 v6.5-rc6 v6.5-rc5 linus/master v6.5-rc7]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Mike-Kravetz/hugetlb-clear-flags-in-tail-pages-that-will-be-freed-individually/20230826-030805
base:   next-20230825
patch link:    https://lore.kernel.org/r/20230825190436.55045-11-mike.kravetz%40oracle.com
patch subject: [PATCH 10/12] hugetlb: batch PMD split for bulk vmemmap dedup
config: x86_64-randconfig-r031-20230826 (https://download.01.org/0day-ci/archive/20230827/202308270248.eJT8UDG3-lkp@intel.com/config)
compiler: gcc-11 (Debian 11.3.0-12) 11.3.0
reproduce: (https://download.01.org/0day-ci/archive/20230827/202308270248.eJT8UDG3-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202308270248.eJT8UDG3-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> mm/hugetlb_vmemmap.c:40: warning: Function parameter or member 'flags' not described in 'vmemmap_remap_walk'


vim +40 mm/hugetlb_vmemmap.c

f41f2ed43ca525 Muchun Song  2021-06-30  19  
998a2997885f73 Muchun Song  2022-06-28  20  /**
998a2997885f73 Muchun Song  2022-06-28  21   * struct vmemmap_remap_walk - walk vmemmap page table
998a2997885f73 Muchun Song  2022-06-28  22   *
998a2997885f73 Muchun Song  2022-06-28  23   * @remap_pte:		called for each lowest-level entry (PTE).
998a2997885f73 Muchun Song  2022-06-28  24   * @nr_walked:		the number of walked pte.
998a2997885f73 Muchun Song  2022-06-28  25   * @reuse_page:		the page which is reused for the tail vmemmap pages.
998a2997885f73 Muchun Song  2022-06-28  26   * @reuse_addr:		the virtual address of the @reuse_page page.
998a2997885f73 Muchun Song  2022-06-28  27   * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
998a2997885f73 Muchun Song  2022-06-28  28   *			or is mapped from.
506a27a4627ab7 Joao Martins 2023-08-25  29   * @flags		used to modify behavior in bulk operations
998a2997885f73 Muchun Song  2022-06-28  30   */
998a2997885f73 Muchun Song  2022-06-28  31  struct vmemmap_remap_walk {
998a2997885f73 Muchun Song  2022-06-28  32  	void			(*remap_pte)(pte_t *pte, unsigned long addr,
998a2997885f73 Muchun Song  2022-06-28  33  					     struct vmemmap_remap_walk *walk);
998a2997885f73 Muchun Song  2022-06-28  34  	unsigned long		nr_walked;
998a2997885f73 Muchun Song  2022-06-28  35  	struct page		*reuse_page;
998a2997885f73 Muchun Song  2022-06-28  36  	unsigned long		reuse_addr;
998a2997885f73 Muchun Song  2022-06-28  37  	struct list_head	*vmemmap_pages;
506a27a4627ab7 Joao Martins 2023-08-25  38  #define VMEMMAP_REMAP_ONLY_SPLIT	BIT(0)
506a27a4627ab7 Joao Martins 2023-08-25  39  	unsigned long		flags;
998a2997885f73 Muchun Song  2022-06-28 @40  };
998a2997885f73 Muchun Song  2022-06-28  41
Joao Martins Aug. 28, 2023, 9:42 a.m. UTC | #3
On 26/08/2023 06:56, kernel test robot wrote:
> Hi Mike,
> 
> kernel test robot noticed the following build errors:
> 
> [auto build test ERROR on next-20230825]
> [cannot apply to akpm-mm/mm-everything v6.5-rc7 v6.5-rc6 v6.5-rc5 linus/master v6.5-rc7]
> [If your patch is applied to the wrong git tree, kindly drop us a note.
> And when submitting patch, we suggest to use '--base' as documented in
> https://git-scm.com/docs/git-format-patch#_base_tree_information]
> 
> url:    https://github.com/intel-lab-lkp/linux/commits/Mike-Kravetz/hugetlb-clear-flags-in-tail-pages-that-will-be-freed-individually/20230826-030805
> base:   next-20230825
> patch link:    https://lore.kernel.org/r/20230825190436.55045-11-mike.kravetz%40oracle.com
> patch subject: [PATCH 10/12] hugetlb: batch PMD split for bulk vmemmap dedup
> config: s390-randconfig-001-20230826 (https://download.01.org/0day-ci/archive/20230826/202308261325.ipTttZHZ-lkp@intel.com/config)
> compiler: clang version 17.0.0 (https://github.com/llvm/llvm-project.git 4a5ac14ee968ff0ad5d2cc1ffa0299048db4c88a)
> reproduce: (https://download.01.org/0day-ci/archive/20230826/202308261325.ipTttZHZ-lkp@intel.com/reproduce)
> 
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot <lkp@intel.com>
> | Closes: https://lore.kernel.org/oe-kbuild-all/202308261325.ipTttZHZ-lkp@intel.com/
> 
> All error/warnings (new ones prefixed by >>):
> 

[...]

>>> mm/hugetlb_vmemmap.c:698:28: error: use of undeclared identifier 'TLB_FLUSH_ALL'
>      698 |         flush_tlb_kernel_range(0, TLB_FLUSH_ALL);
>          |                                   ^
>    2 warnings and 1 error generated.
> 
> 

TLB_FLUSH_ALL is x86-only, so what I wrote above is wrong for code that should
be architecture independent. The way I should have written the global TLB flush
is with flush_tlb_all(), which every arch implements.

The alternative is to compose a start/end tuple in the top-level optimize-folios
function as we iterate over the folios to remap, and flush via
flush_tlb_kernel_range(). But this would likely only be relevant on x86, that
is, to optimize the flushing of 3 contiguous 2M hugetlb pages (~24 vmemmap
pages), as that is where the TLB flush ceiling (31 pages) is set for per-page VA
flushing before falling back to a global TLB flush. We weren't sure the added
complexity was worth the dubious benefit, so we kept the global TLB flush.
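
For reference, an untested sketch of the minimal arch-independent fix, only
swapping the flush call in hugetlb_vmemmap_optimize_folios() from this patch:

    void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
    {
        struct folio *folio;
        LIST_HEAD(vmemmap_pages);

        /* Pass 1: only split the vmemmap PMDs, deferring the TLB flush. */
        list_for_each_entry(folio, folio_list, lru)
            hugetlb_vmemmap_split(h, &folio->page);

        /* One global kernel TLB flush; flush_tlb_all() exists on every arch. */
        flush_tlb_all();

        /* Pass 2: remap and free the now PTE-mapped vmemmap pages. */
        list_for_each_entry(folio, folio_list, lru)
            hugetlb_vmemmap_optimize_bulk(h, &folio->page, &vmemmap_pages);

        free_vmemmap_page_list(&vmemmap_pages);
    }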

> vim +/TLB_FLUSH_ALL +698 mm/hugetlb_vmemmap.c
> 
>    666	
>  > 667	void hugetlb_vmemmap_split(const struct hstate *h, struct page *head)
>    668	{
>    669		unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
>    670		unsigned long vmemmap_reuse;
>    671	
>    672		if (!vmemmap_should_optimize(h, head))
>    673			return;
>    674	
>    675		static_branch_inc(&hugetlb_optimize_vmemmap_key);
>    676	
>    677		vmemmap_end     = vmemmap_start + hugetlb_vmemmap_size(h);
>    678		vmemmap_reuse   = vmemmap_start;
>    679		vmemmap_start   += HUGETLB_VMEMMAP_RESERVE_SIZE;
>    680	
>    681		/*
>    682		 * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
>    683		 * to the page which @vmemmap_reuse is mapped to, then free the pages
>    684		 * which the range [@vmemmap_start, @vmemmap_end] is mapped to.
>    685		 */
>    686		if (vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse))
>    687			static_branch_dec(&hugetlb_optimize_vmemmap_key);
>    688	}
>    689	
>    690	void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
>    691	{
>    692		struct folio *folio;
>    693		LIST_HEAD(vmemmap_pages);
>    694	
>    695		list_for_each_entry(folio, folio_list, lru)
>    696			hugetlb_vmemmap_split(h, &folio->page);
>    697	
>  > 698		flush_tlb_kernel_range(0, TLB_FLUSH_ALL);
>    699	
>    700		list_for_each_entry(folio, folio_list, lru)
>    701			hugetlb_vmemmap_optimize_bulk(h, &folio->page, &vmemmap_pages);
>    702	
>    703		free_vmemmap_page_list(&vmemmap_pages);
>    704	}
>    705	
>
Mike Kravetz Aug. 28, 2023, 4:44 p.m. UTC | #4
On 08/28/23 10:42, Joao Martins wrote:
> On 26/08/2023 06:56, kernel test robot wrote:
> > Hi Mike,
> > 
> > kernel test robot noticed the following build errors:
> > 
> > [auto build test ERROR on next-20230825]
> > [cannot apply to akpm-mm/mm-everything v6.5-rc7 v6.5-rc6 v6.5-rc5 linus/master v6.5-rc7]
> > [If your patch is applied to the wrong git tree, kindly drop us a note.
> > And when submitting patch, we suggest to use '--base' as documented in
> > https://git-scm.com/docs/git-format-patch#_base_tree_information]
> > 
> > url:    https://github.com/intel-lab-lkp/linux/commits/Mike-Kravetz/hugetlb-clear-flags-in-tail-pages-that-will-be-freed-individually/20230826-030805
> > base:   next-20230825
> > patch link:    https://lore.kernel.org/r/20230825190436.55045-11-mike.kravetz%40oracle.com
> > patch subject: [PATCH 10/12] hugetlb: batch PMD split for bulk vmemmap dedup
> > config: s390-randconfig-001-20230826 (https://download.01.org/0day-ci/archive/20230826/202308261325.ipTttZHZ-lkp@intel.com/config)
> > compiler: clang version 17.0.0 (https://github.com/llvm/llvm-project.git 4a5ac14ee968ff0ad5d2cc1ffa0299048db4c88a)
> > reproduce: (https://download.01.org/0day-ci/archive/20230826/202308261325.ipTttZHZ-lkp@intel.com/reproduce)
> > 
> > If you fix the issue in a separate patch/commit (i.e. not just a new version of
> > the same patch/commit), kindly add following tags
> > | Reported-by: kernel test robot <lkp@intel.com>
> > | Closes: https://lore.kernel.org/oe-kbuild-all/202308261325.ipTttZHZ-lkp@intel.com/
> > 
> > All error/warnings (new ones prefixed by >>):
> > 
> 
> [...]
> 
> >>> mm/hugetlb_vmemmap.c:698:28: error: use of undeclared identifier 'TLB_FLUSH_ALL'
> >      698 |         flush_tlb_kernel_range(0, TLB_FLUSH_ALL);
> >          |                                   ^
> >    2 warnings and 1 error generated.
> > 
> > 
> 
> TLB_FLUSH_ALL is x86 only so what I wrote above is wrong in what should be
> architecture independent. The way I should have written the global TLB flush is
> to use flush_tlb_all(), which is what is implemented by the arch.
> 
> The alternative is to compose a start/end tuple in the top-level optimize-folios
> function as we iterate over folios to remap, and flush via
> flush_tlb_kernel_range(). But this would likely only be relevant on x86 only,
> that is to optimize the flushing of 3 contiguous 2M hugetlb pages (~24 vmemmap
> pages) as that's where the TLB flush ceiling is put (31 pages) for per-page VA
> flush, before falling back to a global TLB flush. Weren't sure of the added
> complexity for dubious benefit thus kept it in global TLB flush.

Thanks Joao.

I added my share of build issues to this RFC as can be seen in the bot
responses to other patches.

My assumption is that these build issues will not prevent people from
looking into and commenting on the bigger performance issue that was the
reason for this series.  The build issues would of course be resolved if
there is some consensus that this is the way to move forward to address
this issue.  If the build issues are a stumbling block for anyone to
look at this bigger issue, let me know and I will fix them all ASAP.
Muchun Song Aug. 29, 2023, 3:47 a.m. UTC | #5
> On Aug 29, 2023, at 00:44, Mike Kravetz <mike.kravetz@oracle.com> wrote:
> 
> On 08/28/23 10:42, Joao Martins wrote:
>> On 26/08/2023 06:56, kernel test robot wrote:
>>> Hi Mike,
>>> 
>>> kernel test robot noticed the following build errors:
>>> 
>>> [auto build test ERROR on next-20230825]
>>> [cannot apply to akpm-mm/mm-everything v6.5-rc7 v6.5-rc6 v6.5-rc5 linus/master v6.5-rc7]
>>> [If your patch is applied to the wrong git tree, kindly drop us a note.
>>> And when submitting patch, we suggest to use '--base' as documented in
>>> https://git-scm.com/docs/git-format-patch#_base_tree_information]
>>> 
>>> url:    https://github.com/intel-lab-lkp/linux/commits/Mike-Kravetz/hugetlb-clear-flags-in-tail-pages-that-will-be-freed-individually/20230826-030805
>>> base:   next-20230825
>>> patch link:    https://lore.kernel.org/r/20230825190436.55045-11-mike.kravetz%40oracle.com
>>> patch subject: [PATCH 10/12] hugetlb: batch PMD split for bulk vmemmap dedup
>>> config: s390-randconfig-001-20230826 (https://download.01.org/0day-ci/archive/20230826/202308261325.ipTttZHZ-lkp@intel.com/config)
>>> compiler: clang version 17.0.0 (https://github.com/llvm/llvm-project.git 4a5ac14ee968ff0ad5d2cc1ffa0299048db4c88a)
>>> reproduce: (https://download.01.org/0day-ci/archive/20230826/202308261325.ipTttZHZ-lkp@intel.com/reproduce)
>>> 
>>> If you fix the issue in a separate patch/commit (i.e. not just a new version of
>>> the same patch/commit), kindly add following tags
>>> | Reported-by: kernel test robot <lkp@intel.com>
>>> | Closes: https://lore.kernel.org/oe-kbuild-all/202308261325.ipTttZHZ-lkp@intel.com/
>>> 
>>> All error/warnings (new ones prefixed by >>):
>>> 
>> 
>> [...]
>> 
>>>>> mm/hugetlb_vmemmap.c:698:28: error: use of undeclared identifier 'TLB_FLUSH_ALL'
>>>     698 |         flush_tlb_kernel_range(0, TLB_FLUSH_ALL);
>>>         |                                   ^
>>>   2 warnings and 1 error generated.
>>> 
>>> 
>> 
>> TLB_FLUSH_ALL is x86 only so what I wrote above is wrong in what should be
>> architecture independent. The way I should have written the global TLB flush is
>> to use flush_tlb_all(), which is what is implemented by the arch.
>> 
>> The alternative is to compose a start/end tuple in the top-level optimize-folios
>> function as we iterate over folios to remap, and flush via
>> flush_tlb_kernel_range(). But this would likely only be relevant on x86 only,
>> that is to optimize the flushing of 3 contiguous 2M hugetlb pages (~24 vmemmap
>> pages) as that's where the TLB flush ceiling is put (31 pages) for per-page VA
>> flush, before falling back to a global TLB flush. Weren't sure of the added
>> complexity for dubious benefit thus kept it in global TLB flush.
> 
> Thanks Joao.
> 
> I added my share of build issues to this RFC as can be seen in the bot
> responses to other patches.
> 
> My assumption is that these build issues will not prevent people from
> looking into and commenting on the bigger performance issue that was the
> reason for this series.  The build issues would of course be resolved if
> there is some concensus that this is the way to move forward to address
> this issue.  If the build issues are a stumbling block for anyone to
> look at this bigger issue, let me know and I will fix them all ASAP.

No need to update. But I need some time to take a look.

Thanks,
Muchun
Muchun Song Aug. 30, 2023, 8:09 a.m. UTC | #6
On 2023/8/26 03:04, Mike Kravetz wrote:
> From: Joao Martins <joao.m.martins@oracle.com>
>
> In an effort to minimize amount of TLB flushes, batch all PMD splits
> belonging to a range of pages in order to perform only 1 (global) TLB
> flush. This brings down from 14.2secs into 7.9secs a 1T hugetlb
> allocation.
>
> Rebased by Mike Kravetz
>
> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
> ---
>   mm/hugetlb_vmemmap.c | 94 ++++++++++++++++++++++++++++++++++++++++++--
>   1 file changed, 90 insertions(+), 4 deletions(-)
>
> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> index 500a118915ff..904a64fe5669 100644
> --- a/mm/hugetlb_vmemmap.c
> +++ b/mm/hugetlb_vmemmap.c
> @@ -26,6 +26,7 @@
>    * @reuse_addr:		the virtual address of the @reuse_page page.
>    * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
>    *			or is mapped from.
> + * @flags		used to modify behavior in bulk operations
>    */
>   struct vmemmap_remap_walk {
>   	void			(*remap_pte)(pte_t *pte, unsigned long addr,
> @@ -34,9 +35,11 @@ struct vmemmap_remap_walk {
>   	struct page		*reuse_page;
>   	unsigned long		reuse_addr;
>   	struct list_head	*vmemmap_pages;
> +#define VMEMMAP_REMAP_ONLY_SPLIT	BIT(0)
> +	unsigned long		flags;
>   };
>   
> -static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
> +static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, bool bulk)
>   {
>   	pmd_t __pmd;
>   	int i;
> @@ -79,7 +82,8 @@ static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
>   		/* Make pte visible before pmd. See comment in pmd_install(). */
>   		smp_wmb();
>   		pmd_populate_kernel(&init_mm, pmd, pgtable);
> -		flush_tlb_kernel_range(start, start + PMD_SIZE);
> +		if (!bulk)
> +			flush_tlb_kernel_range(start, start + PMD_SIZE);

This is a little weird to me. @bulk is used to indicate whether the TLB
should be flushed; however, the flag name is VMEMMAP_REMAP_ONLY_SPLIT, which
tells me @bulk (calculated from walk->flags & VMEMMAP_REMAP_ONLY_SPLIT)
is an indicator to only split the huge pmd entry. I think it would be
better to introduce another flag like VMEMMAP_SPLIT_WITHOUT_FLUSH to
indicate whether the TLB should be flushed.
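
Something along these lines, just as a sketch of the suggested naming (the
helper keeps taking a bool, the rest of the patch stays the same):

    /* in struct vmemmap_remap_walk */
    #define VMEMMAP_SPLIT_WITHOUT_FLUSH	BIT(0)
        unsigned long flags;

    /* in vmemmap_pmd_range() */
        bool no_flush = walk->flags & VMEMMAP_SPLIT_WITHOUT_FLUSH;

        ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK, no_flush);

    /* in split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, bool no_flush) */
        if (!no_flush)
            flush_tlb_kernel_range(start, start + PMD_SIZE);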

>   	} else {
>   		pte_free_kernel(&init_mm, pgtable);
>   	}
> @@ -119,18 +123,28 @@ static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
>   			     unsigned long end,
>   			     struct vmemmap_remap_walk *walk)
>   {
> +	bool bulk;
>   	pmd_t *pmd;
>   	unsigned long next;
>   
> +	bulk = walk->flags & VMEMMAP_REMAP_ONLY_SPLIT;
>   	pmd = pmd_offset(pud, addr);
>   	do {
>   		int ret;
>   
> -		ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
> +		ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK, bulk);
>   		if (ret)
>   			return ret;
>   
>   		next = pmd_addr_end(addr, end);
> +
> +		/*
> +		 * We are only splitting, not remapping the hugetlb vmemmap
> +		 * pages.
> +		 */
> +		if (bulk)
> +			continue;

Actually, we do not need a flag to detect this situation; you could
use "!@walk->remap_pte" to determine whether we should go into the
next-level traversal of the page table. ->remap_pte is used to traverse
the pte entries, so it makes sense to continue to the next pmd entry if
it is NULL.
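
That is, something like this on top of this patch (sketch only):

        next = pmd_addr_end(addr, end);

        /* A split-only walk has no PTE-level work to do. */
        if (!walk->remap_pte)
            continue;

        vmemmap_pte_range(pmd, addr, next, walk);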

> +
>   		vmemmap_pte_range(pmd, addr, next, walk);
>   	} while (pmd++, addr = next, addr != end);
>   
> @@ -197,7 +211,8 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end,
>   			return ret;
>   	} while (pgd++, addr = next, addr != end);
>   
> -	flush_tlb_kernel_range(start, end);
> +	if (!(walk->flags & VMEMMAP_REMAP_ONLY_SPLIT))
> +		flush_tlb_kernel_range(start, end);

This could be:

     if (walk->remap_pte)
         flush_tlb_kernel_range(start, end);

>   
>   	return 0;
>   }
> @@ -296,6 +311,48 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
>   	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
>   }
>   
> +/**
> + * vmemmap_remap_split - split the vmemmap virtual address range [@start, @end)
> + *                      backing PMDs of the directmap into PTEs
> + * @start:     start address of the vmemmap virtual address range that we want
> + *             to remap.
> + * @end:       end address of the vmemmap virtual address range that we want to
> + *             remap.
> + * @reuse:     reuse address.
> + *
> + * Return: %0 on success, negative error code otherwise.
> + */
> +static int vmemmap_remap_split(unsigned long start, unsigned long end,
> +				unsigned long reuse)
> +{
> +	int ret;
> +	LIST_HEAD(vmemmap_pages);

Unused variable?

> +	struct vmemmap_remap_walk walk = {
> +		.flags = VMEMMAP_REMAP_ONLY_SPLIT,
> +	};
> +
> +	/*
> +	 * In order to make remapping routine most efficient for the huge pages,
> +	 * the routine of vmemmap page table walking has the following rules
> +	 * (see more details from the vmemmap_pte_range()):
> +	 *
> +	 * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
> +	 *   should be continuous.
> +	 * - The @reuse address is part of the range [@reuse, @end) that we are
> +	 *   walking which is passed to vmemmap_remap_range().
> +	 * - The @reuse address is the first in the complete range.
> +	 *
> +	 * So we need to make sure that @start and @reuse meet the above rules.
> +	 */

The comments are duplicated, something like:

     /* See the comment in the vmemmap_remap_free(). */

is enough.

> +	BUG_ON(start - reuse != PAGE_SIZE);
> +
> +	mmap_read_lock(&init_mm);
> +	ret = vmemmap_remap_range(reuse, end, &walk);
> +	mmap_read_unlock(&init_mm);
> +
> +	return ret;
> +}
> +
>   /**
>    * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
>    *			to the page which @reuse is mapped to, then free vmemmap
> @@ -320,6 +377,7 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end,
>   		.remap_pte	= vmemmap_remap_pte,
>   		.reuse_addr	= reuse,
>   		.vmemmap_pages	= &vmemmap_pages,
> +		.flags		= 0,
>   	};
>   	int nid = page_to_nid((struct page *)start);
>   	gfp_t gfp_mask = GFP_KERNEL | __GFP_THISNODE | __GFP_NORETRY |
> @@ -606,11 +664,39 @@ void hugetlb_vmemmap_optimize_bulk(const struct hstate *h, struct page *head,
>   	__hugetlb_vmemmap_optimize(h, head, bulk_pages);
>   }
>   
> +void hugetlb_vmemmap_split(const struct hstate *h, struct page *head)
> +{
> +	unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
> +	unsigned long vmemmap_reuse;
> +
> +	if (!vmemmap_should_optimize(h, head))
> +		return;
> +
> +	static_branch_inc(&hugetlb_optimize_vmemmap_key);

Why? hugetlb_optimize_vmemmap_key is used as a switch to let
page_fixed_fake_head work properly for vmemmap-optimized
HugeTLB pages; however, this function only splits the huge pmd
entry without optimizing the vmemmap pages. So it is wrong to
increase the static key here.
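
IOW, the split-only helper does not need to touch the static key at all;
roughly (sketch only):

    void hugetlb_vmemmap_split(const struct hstate *h, struct page *head)
    {
        unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
        unsigned long vmemmap_reuse;

        if (!vmemmap_should_optimize(h, head))
            return;

        vmemmap_end   = vmemmap_start + hugetlb_vmemmap_size(h);
        vmemmap_reuse = vmemmap_start;
        vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE;

        /* Only split; the static key stays with the real optimize path. */
        vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
    }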

Thanks.

> +
> +	vmemmap_end     = vmemmap_start + hugetlb_vmemmap_size(h);
> +	vmemmap_reuse   = vmemmap_start;
> +	vmemmap_start   += HUGETLB_VMEMMAP_RESERVE_SIZE;
> +
> +	/*
> +	 * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
> +	 * to the page which @vmemmap_reuse is mapped to, then free the pages
> +	 * which the range [@vmemmap_start, @vmemmap_end] is mapped to.
> +	 */
> +	if (vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse))
> +		static_branch_dec(&hugetlb_optimize_vmemmap_key);
> +}
> +
>   void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
>   {
>   	struct folio *folio;
>   	LIST_HEAD(vmemmap_pages);
>   
> +	list_for_each_entry(folio, folio_list, lru)
> +		hugetlb_vmemmap_split(h, &folio->page);
> +
> +	flush_tlb_kernel_range(0, TLB_FLUSH_ALL);
> +
>   	list_for_each_entry(folio, folio_list, lru)
>   		hugetlb_vmemmap_optimize_bulk(h, &folio->page, &vmemmap_pages);
>
Joao Martins Aug. 30, 2023, 11:13 a.m. UTC | #7
On 30/08/2023 09:09, Muchun Song wrote:
> 
> 
> On 2023/8/26 03:04, Mike Kravetz wrote:
>> From: Joao Martins <joao.m.martins@oracle.com>
>>
>> In an effort to minimize amount of TLB flushes, batch all PMD splits
>> belonging to a range of pages in order to perform only 1 (global) TLB
>> flush. This brings down from 14.2secs into 7.9secs a 1T hugetlb
>> allocation.
>>
>> Rebased by Mike Kravetz
>>
>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
>> ---
>>   mm/hugetlb_vmemmap.c | 94 ++++++++++++++++++++++++++++++++++++++++++--
>>   1 file changed, 90 insertions(+), 4 deletions(-)
>>
>> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
>> index 500a118915ff..904a64fe5669 100644
>> --- a/mm/hugetlb_vmemmap.c
>> +++ b/mm/hugetlb_vmemmap.c
>> @@ -26,6 +26,7 @@
>>    * @reuse_addr:        the virtual address of the @reuse_page page.
>>    * @vmemmap_pages:    the list head of the vmemmap pages that can be freed
>>    *            or is mapped from.
>> + * @flags        used to modify behavior in bulk operations
>>    */
>>   struct vmemmap_remap_walk {
>>       void            (*remap_pte)(pte_t *pte, unsigned long addr,
>> @@ -34,9 +35,11 @@ struct vmemmap_remap_walk {
>>       struct page        *reuse_page;
>>       unsigned long        reuse_addr;
>>       struct list_head    *vmemmap_pages;
>> +#define VMEMMAP_REMAP_ONLY_SPLIT    BIT(0)
>> +    unsigned long        flags;
>>   };
>>   -static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
>> +static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, bool bulk)
>>   {
>>       pmd_t __pmd;
>>       int i;
>> @@ -79,7 +82,8 @@ static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long
>> start)
>>           /* Make pte visible before pmd. See comment in pmd_install(). */
>>           smp_wmb();
>>           pmd_populate_kernel(&init_mm, pmd, pgtable);
>> -        flush_tlb_kernel_range(start, start + PMD_SIZE);
>> +        if (!bulk)
>> +            flush_tlb_kernel_range(start, start + PMD_SIZE);
> 
> A little weird to me. @bulk is used to indicate whether the TLB
> should be flushed, however, the flag name is VMEMMAP_REMAP_ONLY_SPLIT,
> it seems to tell me @bulk (calculated from walk->flags & VMEMMAP_REMAP_ONLY_SPLIT)

bulk here means the PMDs are being split in bulk, not the bulk of the vmemmap
pages. But yeah, it's weird; I should clean this up.

> is a indicator to only split the huge pmd entry. For me, I think
> it is better to introduce another flag like VMEMMAP_SPLIT_WITHOUT_FLUSH
> to indicate whether TLB should be flushed.
> 
Based on the feedback on another patch, I think it's better, as you say, to
introduce a VMEMMAP_NO_TLB_FLUSH flag, and use walk::remap_pte to tell
whether it's a PMD split or a PTE remap.
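
Roughly the direction I have in mind for v2 (sketch only, exact naming still
to be decided):

    struct vmemmap_remap_walk {
        void (*remap_pte)(pte_t *pte, unsigned long addr,
                          struct vmemmap_remap_walk *walk);
        unsigned long nr_walked;
        struct page *reuse_page;
        unsigned long reuse_addr;
        struct list_head *vmemmap_pages;
    /* defer all TLB flushing to the caller */
    #define VMEMMAP_NO_TLB_FLUSH	BIT(0)
        unsigned long flags;
    };

with split vs. remap decided by whether walk->remap_pte is set, and the flag
only controlling whether the TLB flush is deferred to the caller.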

>>       } else {
>>           pte_free_kernel(&init_mm, pgtable);
>>       }
>> @@ -119,18 +123,28 @@ static int vmemmap_pmd_range(pud_t *pud, unsigned long
>> addr,
>>                    unsigned long end,
>>                    struct vmemmap_remap_walk *walk)
>>   {
>> +    bool bulk;
>>       pmd_t *pmd;
>>       unsigned long next;
>>   +    bulk = walk->flags & VMEMMAP_REMAP_ONLY_SPLIT;
>>       pmd = pmd_offset(pud, addr);
>>       do {
>>           int ret;
>>   -        ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
>> +        ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK, bulk);
>>           if (ret)
>>               return ret;
>>             next = pmd_addr_end(addr, end);
>> +
>> +        /*
>> +         * We are only splitting, not remapping the hugetlb vmemmap
>> +         * pages.
>> +         */
>> +        if (bulk)
>> +            continue;
> 
> Actually, we don not need a flag to detect this situation, you could
> use "!@walk->remap_pte" to determine whether we should go into the
> next level traversal of the page table. ->remap_pte is used to traverse
> the pte entry, so it make senses to continue to the next pmd entry if
> it is NULL.
> 

Yeap, great suggestion.

>> +
>>           vmemmap_pte_range(pmd, addr, next, walk);
>>       } while (pmd++, addr = next, addr != end);
>>   @@ -197,7 +211,8 @@ static int vmemmap_remap_range(unsigned long start,
>> unsigned long end,
>>               return ret;
>>       } while (pgd++, addr = next, addr != end);
>>   -    flush_tlb_kernel_range(start, end);
>> +    if (!(walk->flags & VMEMMAP_REMAP_ONLY_SPLIT))
>> +        flush_tlb_kernel_range(start, end);
> 
> This could be:
> 
>     if (walk->remap_pte)
>         flush_tlb_kernel_range(start, end);
> 
Yeap.

>>         return 0;
>>   }
>> @@ -296,6 +311,48 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long
>> addr,
>>       set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
>>   }
>>   +/**
>> + * vmemmap_remap_split - split the vmemmap virtual address range [@start, @end)
>> + *                      backing PMDs of the directmap into PTEs
>> + * @start:     start address of the vmemmap virtual address range that we want
>> + *             to remap.
>> + * @end:       end address of the vmemmap virtual address range that we want to
>> + *             remap.
>> + * @reuse:     reuse address.
>> + *
>> + * Return: %0 on success, negative error code otherwise.
>> + */
>> +static int vmemmap_remap_split(unsigned long start, unsigned long end,
>> +                unsigned long reuse)
>> +{
>> +    int ret;
>> +    LIST_HEAD(vmemmap_pages);
> 
> Unused variable?
> 
Yeap, a leftover from something else.

>> +    struct vmemmap_remap_walk walk = {
>> +        .flags = VMEMMAP_REMAP_ONLY_SPLIT,
>> +    };
>> +
>> +    /*
>> +     * In order to make remapping routine most efficient for the huge pages,
>> +     * the routine of vmemmap page table walking has the following rules
>> +     * (see more details from the vmemmap_pte_range()):
>> +     *
>> +     * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
>> +     *   should be continuous.
>> +     * - The @reuse address is part of the range [@reuse, @end) that we are
>> +     *   walking which is passed to vmemmap_remap_range().
>> +     * - The @reuse address is the first in the complete range.
>> +     *
>> +     * So we need to make sure that @start and @reuse meet the above rules.
>> +     */
> 
> The comments are duplicated, something like:
> 
>     /* See the comment in the vmemmap_remap_free(). */
> 
> is enough.
> 
OK.

>> +    BUG_ON(start - reuse != PAGE_SIZE);
>> +
>> +    mmap_read_lock(&init_mm);
>> +    ret = vmemmap_remap_range(reuse, end, &walk);
>> +    mmap_read_unlock(&init_mm);
>> +
>> +    return ret;
>> +}
>> +
>>   /**
>>    * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
>>    *            to the page which @reuse is mapped to, then free vmemmap
>> @@ -320,6 +377,7 @@ static int vmemmap_remap_free(unsigned long start,
>> unsigned long end,
>>           .remap_pte    = vmemmap_remap_pte,
>>           .reuse_addr    = reuse,
>>           .vmemmap_pages    = &vmemmap_pages,
>> +        .flags        = 0,
>>       };
>>       int nid = page_to_nid((struct page *)start);
>>       gfp_t gfp_mask = GFP_KERNEL | __GFP_THISNODE | __GFP_NORETRY |
>> @@ -606,11 +664,39 @@ void hugetlb_vmemmap_optimize_bulk(const struct hstate
>> *h, struct page *head,
>>       __hugetlb_vmemmap_optimize(h, head, bulk_pages);
>>   }
>>   +void hugetlb_vmemmap_split(const struct hstate *h, struct page *head)
>> +{
>> +    unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
>> +    unsigned long vmemmap_reuse;
>> +
>> +    if (!vmemmap_should_optimize(h, head))
>> +        return;
>> +
>> +    static_branch_inc(&hugetlb_optimize_vmemmap_key);
> 
> Why? hugetlb_optimize_vmemmap_key is used as a switch to let
> page_fixed_fake_head works properly for the vmemmap-optimized
> HugeTLB pages, however, this function only splits the huge pmd
> entry without optimizing the vmemmap pages. So it is wrong to
> increase the static_key.
> 
Yes, you're right. It's wrong, it was a copy-and-paste error from
remap_free and I failed to remove the non vmemmap_optimize specific
logic.

> Thanks.
> 
>> +
>> +    vmemmap_end     = vmemmap_start + hugetlb_vmemmap_size(h);
>> +    vmemmap_reuse   = vmemmap_start;
>> +    vmemmap_start   += HUGETLB_VMEMMAP_RESERVE_SIZE;
>> +
>> +    /*
>> +     * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
>> +     * to the page which @vmemmap_reuse is mapped to, then free the pages
>> +     * which the range [@vmemmap_start, @vmemmap_end] is mapped to.
>> +     */
>> +    if (vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse))
>> +        static_branch_dec(&hugetlb_optimize_vmemmap_key);
>> +}
>> +
>>   void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head
>> *folio_list)
>>   {
>>       struct folio *folio;
>>       LIST_HEAD(vmemmap_pages);
>>   +    list_for_each_entry(folio, folio_list, lru)
>> +        hugetlb_vmemmap_split(h, &folio->page);
>> +
>> +    flush_tlb_kernel_range(0, TLB_FLUSH_ALL);
>> +
>>       list_for_each_entry(folio, folio_list, lru)
>>           hugetlb_vmemmap_optimize_bulk(h, &folio->page, &vmemmap_pages);
>>   
>
Joao Martins Aug. 30, 2023, 4:03 p.m. UTC | #8
On 30/08/2023 12:13, Joao Martins wrote:
> On 30/08/2023 09:09, Muchun Song wrote:
>> On 2023/8/26 03:04, Mike Kravetz wrote:
>>> +
>>> +        /*
>>> +         * We are only splitting, not remapping the hugetlb vmemmap
>>> +         * pages.
>>> +         */
>>> +        if (bulk)
>>> +            continue;
>>
>> Actually, we don not need a flag to detect this situation, you could
>> use "!@walk->remap_pte" to determine whether we should go into the
>> next level traversal of the page table. ->remap_pte is used to traverse
>> the pte entry, so it make senses to continue to the next pmd entry if
>> it is NULL.
>>
> 
> Yeap, great suggestion.
> 
>>> +
>>>           vmemmap_pte_range(pmd, addr, next, walk);
>>>       } while (pmd++, addr = next, addr != end);
>>>   @@ -197,7 +211,8 @@ static int vmemmap_remap_range(unsigned long start,
>>> unsigned long end,
>>>               return ret;
>>>       } while (pgd++, addr = next, addr != end);
>>>   -    flush_tlb_kernel_range(start, end);
>>> +    if (!(walk->flags & VMEMMAP_REMAP_ONLY_SPLIT))
>>> +        flush_tlb_kernel_range(start, end);
>>
>> This could be:
>>
>>     if (walk->remap_pte)
>>         flush_tlb_kernel_range(start, end);
>>
> Yeap.
> 

Quick correction: this stays as is, except with a flag rename. That is because
this is the actual flush that we intend to batch in the next patch. And while
the PMD split could just use !walk->remap_pte, the next patch would still need
to test the NO_TLB_FLUSH flag. Meaning we end up just testing for this
to-be-consolidated flag anyway.
Muchun Song Aug. 31, 2023, 3:54 a.m. UTC | #9
> On Aug 31, 2023, at 00:03, Joao Martins <joao.m.martins@oracle.com> wrote:
> 
> On 30/08/2023 12:13, Joao Martins wrote:
>> On 30/08/2023 09:09, Muchun Song wrote:
>>> On 2023/8/26 03:04, Mike Kravetz wrote:
>>>> +
>>>> +        /*
>>>> +         * We are only splitting, not remapping the hugetlb vmemmap
>>>> +         * pages.
>>>> +         */
>>>> +        if (bulk)
>>>> +            continue;
>>> 
>>> Actually, we don not need a flag to detect this situation, you could
>>> use "!@walk->remap_pte" to determine whether we should go into the
>>> next level traversal of the page table. ->remap_pte is used to traverse
>>> the pte entry, so it make senses to continue to the next pmd entry if
>>> it is NULL.
>>> 
>> 
>> Yeap, great suggestion.
>> 
>>>> +
>>>>           vmemmap_pte_range(pmd, addr, next, walk);
>>>>       } while (pmd++, addr = next, addr != end);
>>>>   @@ -197,7 +211,8 @@ static int vmemmap_remap_range(unsigned long start,
>>>> unsigned long end,
>>>>               return ret;
>>>>       } while (pgd++, addr = next, addr != end);
>>>>   -    flush_tlb_kernel_range(start, end);
>>>> +    if (!(walk->flags & VMEMMAP_REMAP_ONLY_SPLIT))
>>>> +        flush_tlb_kernel_range(start, end);
>>> 
>>> This could be:
>>> 
>>>     if (walk->remap_pte)
>>>         flush_tlb_kernel_range(start, end);
>>> 
>> Yeap.
>> 
> 
> Quite correction: This stays as is, except with a flag rename. That is because
> this is actual flush that we intend to batch in the next patch. And while the
> PMD split could just use !walk->remap_pte, the next patch would just need to
> test NO_TLB_FLUSH flag. Meaning we endup anyways just testing for this
> to-be-consolidated flag

I think this really should be "if (walk->remap_pte && !(flag & VMEMMAP_NO_TLB_FLUSH))"
in your next patch. This TLB flush only makes sense when @walk->remap_pte is set.
I know the "if (!(flag & VMEMMAP_NO_TLB_FLUSH))" check is suitable for your
use case, but what if a user (even if one does not exist now, it may in the future)
passes a NULL @walk->remap_pte without specifying VMEMMAP_NO_TLB_FLUSH? Then we would
do a useless TLB flush. This is why I suggest you change this to "if (walk->remap_pte)"
in this patch and change it to "if (walk->remap_pte && !(flag & VMEMMAP_NO_TLB_FLUSH))"
in the next patch.
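
That is, the end state would be something like (sketch, using the walk->flags
field from this patch):

    /* this patch */
    if (walk->remap_pte)
        flush_tlb_kernel_range(start, end);

    /* next patch, once the batched no-flush flag exists */
    if (walk->remap_pte && !(walk->flags & VMEMMAP_NO_TLB_FLUSH))
        flush_tlb_kernel_range(start, end);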

Thanks.
Joao Martins Aug. 31, 2023, 9:26 a.m. UTC | #10
On 31/08/2023 04:54, Muchun Song wrote:
> 
> 
>> On Aug 31, 2023, at 00:03, Joao Martins <joao.m.martins@oracle.com> wrote:
>>
>> On 30/08/2023 12:13, Joao Martins wrote:
>>> On 30/08/2023 09:09, Muchun Song wrote:
>>>> On 2023/8/26 03:04, Mike Kravetz wrote:
>>>>> +
>>>>> +        /*
>>>>> +         * We are only splitting, not remapping the hugetlb vmemmap
>>>>> +         * pages.
>>>>> +         */
>>>>> +        if (bulk)
>>>>> +            continue;
>>>>
>>>> Actually, we don not need a flag to detect this situation, you could
>>>> use "!@walk->remap_pte" to determine whether we should go into the
>>>> next level traversal of the page table. ->remap_pte is used to traverse
>>>> the pte entry, so it make senses to continue to the next pmd entry if
>>>> it is NULL.
>>>>
>>>
>>> Yeap, great suggestion.
>>>
>>>>> +
>>>>>           vmemmap_pte_range(pmd, addr, next, walk);
>>>>>       } while (pmd++, addr = next, addr != end);
>>>>>   @@ -197,7 +211,8 @@ static int vmemmap_remap_range(unsigned long start,
>>>>> unsigned long end,
>>>>>               return ret;
>>>>>       } while (pgd++, addr = next, addr != end);
>>>>>   -    flush_tlb_kernel_range(start, end);
>>>>> +    if (!(walk->flags & VMEMMAP_REMAP_ONLY_SPLIT))
>>>>> +        flush_tlb_kernel_range(start, end);
>>>>
>>>> This could be:
>>>>
>>>>     if (walk->remap_pte)
>>>>         flush_tlb_kernel_range(start, end);
>>>>
>>> Yeap.
>>>
>>
>> Quite correction: This stays as is, except with a flag rename. That is because
>> this is actual flush that we intend to batch in the next patch. And while the
>> PMD split could just use !walk->remap_pte, the next patch would just need to
>> test NO_TLB_FLUSH flag. Meaning we endup anyways just testing for this
>> to-be-consolidated flag
> 
> I think this really should be "if (walk->remap_pte && !(flag & VMEMMAP_NO_TLB_FLUSH))"
> in your next patch. This TLB flushing only make sense for the case of existing of
> @walk->remap_pte. I know "if (!(flag & VMEMMAP_NO_TLB_FLUSH))" check is suitable for your
> use case, but what if a user (even if it does not exist now, but it may in the future)
> passing a NULL @walk->remap_pte and not specifying VMEMMAP_NO_TLB_FLUSH? Then we will
> do a useless TLB flushing. This is why I suggest you change this to "if (walk->remap_pte)"
> in this patch and change it to "if (walk->remap_pte && !(flag & VMEMMAP_NO_TLB_FLUSH))"
> in the next patch.

OK, fair enough.

Patch

diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 500a118915ff..904a64fe5669 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -26,6 +26,7 @@ 
  * @reuse_addr:		the virtual address of the @reuse_page page.
  * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
  *			or is mapped from.
+ * @flags		used to modify behavior in bulk operations
  */
 struct vmemmap_remap_walk {
 	void			(*remap_pte)(pte_t *pte, unsigned long addr,
@@ -34,9 +35,11 @@  struct vmemmap_remap_walk {
 	struct page		*reuse_page;
 	unsigned long		reuse_addr;
 	struct list_head	*vmemmap_pages;
+#define VMEMMAP_REMAP_ONLY_SPLIT	BIT(0)
+	unsigned long		flags;
 };
 
-static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
+static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, bool bulk)
 {
 	pmd_t __pmd;
 	int i;
@@ -79,7 +82,8 @@  static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
 		/* Make pte visible before pmd. See comment in pmd_install(). */
 		smp_wmb();
 		pmd_populate_kernel(&init_mm, pmd, pgtable);
-		flush_tlb_kernel_range(start, start + PMD_SIZE);
+		if (!bulk)
+			flush_tlb_kernel_range(start, start + PMD_SIZE);
 	} else {
 		pte_free_kernel(&init_mm, pgtable);
 	}
@@ -119,18 +123,28 @@  static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
 			     unsigned long end,
 			     struct vmemmap_remap_walk *walk)
 {
+	bool bulk;
 	pmd_t *pmd;
 	unsigned long next;
 
+	bulk = walk->flags & VMEMMAP_REMAP_ONLY_SPLIT;
 	pmd = pmd_offset(pud, addr);
 	do {
 		int ret;
 
-		ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
+		ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK, bulk);
 		if (ret)
 			return ret;
 
 		next = pmd_addr_end(addr, end);
+
+		/*
+		 * We are only splitting, not remapping the hugetlb vmemmap
+		 * pages.
+		 */
+		if (bulk)
+			continue;
+
 		vmemmap_pte_range(pmd, addr, next, walk);
 	} while (pmd++, addr = next, addr != end);
 
@@ -197,7 +211,8 @@  static int vmemmap_remap_range(unsigned long start, unsigned long end,
 			return ret;
 	} while (pgd++, addr = next, addr != end);
 
-	flush_tlb_kernel_range(start, end);
+	if (!(walk->flags & VMEMMAP_REMAP_ONLY_SPLIT))
+		flush_tlb_kernel_range(start, end);
 
 	return 0;
 }
@@ -296,6 +311,48 @@  static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
 	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
 }
 
+/**
+ * vmemmap_remap_split - split the vmemmap virtual address range [@start, @end)
+ *                      backing PMDs of the directmap into PTEs
+ * @start:     start address of the vmemmap virtual address range that we want
+ *             to remap.
+ * @end:       end address of the vmemmap virtual address range that we want to
+ *             remap.
+ * @reuse:     reuse address.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+static int vmemmap_remap_split(unsigned long start, unsigned long end,
+				unsigned long reuse)
+{
+	int ret;
+	LIST_HEAD(vmemmap_pages);
+	struct vmemmap_remap_walk walk = {
+		.flags = VMEMMAP_REMAP_ONLY_SPLIT,
+	};
+
+	/*
+	 * In order to make remapping routine most efficient for the huge pages,
+	 * the routine of vmemmap page table walking has the following rules
+	 * (see more details from the vmemmap_pte_range()):
+	 *
+	 * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
+	 *   should be continuous.
+	 * - The @reuse address is part of the range [@reuse, @end) that we are
+	 *   walking which is passed to vmemmap_remap_range().
+	 * - The @reuse address is the first in the complete range.
+	 *
+	 * So we need to make sure that @start and @reuse meet the above rules.
+	 */
+	BUG_ON(start - reuse != PAGE_SIZE);
+
+	mmap_read_lock(&init_mm);
+	ret = vmemmap_remap_range(reuse, end, &walk);
+	mmap_read_unlock(&init_mm);
+
+	return ret;
+}
+
 /**
  * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
  *			to the page which @reuse is mapped to, then free vmemmap
@@ -320,6 +377,7 @@  static int vmemmap_remap_free(unsigned long start, unsigned long end,
 		.remap_pte	= vmemmap_remap_pte,
 		.reuse_addr	= reuse,
 		.vmemmap_pages	= &vmemmap_pages,
+		.flags		= 0,
 	};
 	int nid = page_to_nid((struct page *)start);
 	gfp_t gfp_mask = GFP_KERNEL | __GFP_THISNODE | __GFP_NORETRY |
@@ -606,11 +664,39 @@  void hugetlb_vmemmap_optimize_bulk(const struct hstate *h, struct page *head,
 	__hugetlb_vmemmap_optimize(h, head, bulk_pages);
 }
 
+void hugetlb_vmemmap_split(const struct hstate *h, struct page *head)
+{
+	unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
+	unsigned long vmemmap_reuse;
+
+	if (!vmemmap_should_optimize(h, head))
+		return;
+
+	static_branch_inc(&hugetlb_optimize_vmemmap_key);
+
+	vmemmap_end     = vmemmap_start + hugetlb_vmemmap_size(h);
+	vmemmap_reuse   = vmemmap_start;
+	vmemmap_start   += HUGETLB_VMEMMAP_RESERVE_SIZE;
+
+	/*
+	 * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
+	 * to the page which @vmemmap_reuse is mapped to, then free the pages
+	 * which the range [@vmemmap_start, @vmemmap_end] is mapped to.
+	 */
+	if (vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse))
+		static_branch_dec(&hugetlb_optimize_vmemmap_key);
+}
+
 void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
 {
 	struct folio *folio;
 	LIST_HEAD(vmemmap_pages);
 
+	list_for_each_entry(folio, folio_list, lru)
+		hugetlb_vmemmap_split(h, &folio->page);
+
+	flush_tlb_kernel_range(0, TLB_FLUSH_ALL);
+
 	list_for_each_entry(folio, folio_list, lru)
 		hugetlb_vmemmap_optimize_bulk(h, &folio->page, &vmemmap_pages);