diff mbox series

[v3,01/12] mm/debug_vm_pgtable: Introduce struct pgtable_debug_args

Message ID 20210719130613.334901-2-gshan@redhat.com (mailing list archive)
State New
Headers show
Series mm/debug_vm_pgtable: Enhancements | expand

Commit Message

Gavin Shan July 19, 2021, 1:06 p.m. UTC
In debug_vm_pgtable(), there are many local variables introduced to
track the needed information and they are passed to the functions for
various test cases. It'd better to introduce a struct as place holder
for these information. With it, what the functions for various test
cases need is the struct, to simplify the code. It also makes code
easier to be maintained.

Besides, set_xxx_at() could access the data on the corresponding pages
in the page table modifying tests. So the accessed pages in the tests
should have been allocated from buddy. Otherwise, we're accessing pages
that aren't owned by us. This causes issues like page flag corruption.

This introduces "struct pgtable_debug_args". The struct is initialized
and destroyed, but the information in the struct isn't used yet. They
will be used in subsequent patches.

Signed-off-by: Gavin Shan <gshan@redhat.com>
---
 mm/debug_vm_pgtable.c | 197 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 196 insertions(+), 1 deletion(-)

Comments

Anshuman Khandual July 21, 2021, 5:44 a.m. UTC | #1
On 7/19/21 6:36 PM, Gavin Shan wrote:
> In debug_vm_pgtable(), there are many local variables introduced to
> track the needed information and they are passed to the functions for
> various test cases. It'd better to introduce a struct as place holder
> for these information. With it, what the functions for various test
> cases need is the struct, to simplify the code. It also makes code
> easier to be maintained.
> 
> Besides, set_xxx_at() could access the data on the corresponding pages
> in the page table modifying tests. So the accessed pages in the tests
> should have been allocated from buddy. Otherwise, we're accessing pages
> that aren't owned by us. This causes issues like page flag corruption.
> 
> This introduces "struct pgtable_debug_args". The struct is initialized
> and destroyed, but the information in the struct isn't used yet. They
> will be used in subsequent patches.
> 
> Signed-off-by: Gavin Shan <gshan@redhat.com>
> ---
>  mm/debug_vm_pgtable.c | 197 +++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 196 insertions(+), 1 deletion(-)
> 
> diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
> index 1c922691aa61..ea153ff40d23 100644
> --- a/mm/debug_vm_pgtable.c
> +++ b/mm/debug_vm_pgtable.c
> @@ -58,6 +58,36 @@
>  #define RANDOM_ORVALUE (GENMASK(BITS_PER_LONG - 1, 0) & ~ARCH_SKIP_MASK)
>  #define RANDOM_NZVALUE	GENMASK(7, 0)
>  
> +struct pgtable_debug_args {
> +	struct mm_struct	*mm;
> +	struct vm_area_struct	*vma;
> +
> +	pgd_t			*pgdp;
> +	p4d_t			*p4dp;
> +	pud_t			*pudp;
> +	pmd_t			*pmdp;
> +	pte_t			*ptep;
> +
> +	p4d_t			*start_p4dp;
> +	pud_t			*start_pudp;
> +	pmd_t			*start_pmdp;
> +	pgtable_t		start_ptep;
> +
> +	unsigned long		vaddr;
> +	pgprot_t		page_prot;
> +	pgprot_t		page_prot_none;
> +
> +	unsigned long		pud_pfn;
> +	unsigned long		pmd_pfn;
> +	unsigned long		pte_pfn;
> +
> +	unsigned long		fixed_pgd_pfn;
> +	unsigned long		fixed_p4d_pfn;
> +	unsigned long		fixed_pud_pfn;
> +	unsigned long		fixed_pmd_pfn;
> +	unsigned long		fixed_pte_pfn;
> +};
> +
>  static void __init pte_basic_tests(unsigned long pfn, int idx)
>  {
>  	pgprot_t prot = protection_map[idx];
> @@ -955,8 +985,167 @@ static unsigned long __init get_random_vaddr(void)
>  	return random_vaddr;
>  }
>  
> +static void __init destroy_args(struct pgtable_debug_args *args)
> +{
> +	struct page *page = NULL;
> +
> +	/* Free (huge) page */
> +	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
> +	    IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
> +	    has_transparent_hugepage() &&
> +	    args->pud_pfn != ULONG_MAX) {
> +		page = pfn_to_page(args->pud_pfn);
> +		__free_pages(page, HPAGE_PUD_SHIFT - PAGE_SHIFT);
> +	} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
> +		   has_transparent_hugepage() &&
> +		   args->pmd_pfn != ULONG_MAX) {
> +		page = pfn_to_page(args->pmd_pfn);
> +		__free_pages(page, HPAGE_PMD_ORDER);
> +	} else if (args->pte_pfn != ULONG_MAX) {
> +		page = pfn_to_page(args->pte_pfn);
> +		__free_pages(page, 0);
> +	}
> +
> +	/* Free page table */
> +	if (args->start_ptep) {
> +		pte_free(args->mm, args->start_ptep);
> +		mm_dec_nr_ptes(args->mm);
> +	}
> +
> +	if (args->start_pmdp) {
> +		pmd_free(args->mm, args->start_pmdp);
> +		mm_dec_nr_pmds(args->mm);
> +	}
> +
> +	if (args->start_pudp) {
> +		pud_free(args->mm, args->start_pudp);
> +		mm_dec_nr_puds(args->mm);
> +	}
> +
> +	if (args->start_p4dp)
> +		p4d_free(args->mm, args->p4dp);
> +
> +	/* Free vma and mm struct */
> +	if (args->vma)
> +		vm_area_free(args->vma);
> +	if (args->mm)
> +		mmdrop(args->mm);
> +}
> +
> +static int __init init_args(struct pgtable_debug_args *args)
> +{
> +	struct page *page = NULL;
> +	phys_addr_t phys;
> +	int ret = 0;
> +
> +	/* Initialize the debugging data */
> +	memset(args, 0, sizeof(*args));
> +	args->page_prot      = vm_get_page_prot(VMFLAGS);
> +	args->page_prot_none = __P000;

Please preserve the existing comments before this assignment.

        /*
         * __P000 (or even __S000) will help create page table entries with
         * PROT_NONE permission as required for pxx_protnone_tests().
         */

> +	args->pud_pfn        = ULONG_MAX;
> +	args->pmd_pfn        = ULONG_MAX;
> +	args->pte_pfn        = ULONG_MAX;
> +	args->fixed_pgd_pfn  = ULONG_MAX;
> +	args->fixed_p4d_pfn  = ULONG_MAX;
> +	args->fixed_pud_pfn  = ULONG_MAX;
> +	args->fixed_pmd_pfn  = ULONG_MAX;
> +	args->fixed_pte_pfn  = ULONG_MAX;
> +
> +	/* Allocate mm and vma */
> +	args->mm = mm_alloc();
> +	if (!args->mm) {
> +		pr_err("Failed to allocate mm struct\n");
> +		ret = -ENOMEM;
> +		goto error;
> +	}
> +
> +	args->vma = vm_area_alloc(args->mm);
> +	if (!args->vma) {
> +		pr_err("Failed to allocate vma\n");
> +		ret = -ENOMEM;
> +		goto error;
> +	}
> +
> +	/* Figure out the virtual address and allocate page table entries */
> +	args->vaddr = get_random_vaddr();

Please group args->vaddr's init with page_prot and page_prot_none above.

> +	args->pgdp = pgd_offset(args->mm, args->vaddr);
> +	args->p4dp = p4d_alloc(args->mm, args->pgdp, args->vaddr);
> +	args->pudp = args->p4dp ?
> +		     pud_alloc(args->mm, args->p4dp, args->vaddr) : NULL;
> +	args->pmdp = args->pudp ?
> +		     pmd_alloc(args->mm, args->pudp, args->vaddr) : NULL;
> +	args->ptep = args->pmdp ?
> +		     pte_alloc_map(args->mm, args->pmdp, args->vaddr) : NULL;
> +	if (!args->ptep) {
> +		pr_err("Failed to allocate page table\n");
> +		ret = -ENOMEM;
> +		goto error;
> +	}

Why not just assert that all page table level pointers are allocated
successfully, otherwise bail out the test completely. Something like
this at each level.

	if (!args->p4dp) {
		pr_err("Failed to allocate page table\n");
		ret = -ENOMEM;
		goto error;
	}

Is there any value in proceeding with the test when some page table
pointers have not been allocated. Also individual tests do not cross
check these pointers. Also asserting successful allocations will
make the freeing path simpler, as I had mentioned earlier.

> +
> +	/*
> +	 * The above page table entries will be modified. Lets save the
> +	 * page table entries so that they can be released when the tests
> +	 * are completed.
> +	 */
> +	args->start_p4dp = p4d_offset(args->pgdp, 0UL);
> +	args->start_pudp = pud_offset(args->p4dp, 0UL);
> +	args->start_pmdp = pmd_offset(args->pudp, 0UL);
> +	args->start_ptep = pmd_pgtable(READ_ONCE(*(args->pmdp)));

If the above page table pointers have been validated to be allocated
successfully, we could add these here.

	WARN_ON(!args->start_p4dp)
	WARN_ON(!args->start_pudp)
	WARN_ON(!args->start_pmdp)
	WARN_ON(!args->start_ptep)

Afterwards all those if (args->start_pxdp) checks in the freeing path
will not be required anymore.

> +
> +	/*
> +	 * Figure out the fixed addresses, which are all around the kernel
> +	 * symbol (@start_kernel). The corresponding PFNs might be invalid,
> +	 * but it's fine as the following tests won't access the pages.
> +	 */
> +	phys = __pa_symbol(&start_kernel);
> +	args->fixed_pgd_pfn = __phys_to_pfn(phys & PGDIR_MASK);
> +	args->fixed_p4d_pfn = __phys_to_pfn(phys & P4D_MASK);
> +	args->fixed_pud_pfn = __phys_to_pfn(phys & PUD_MASK);
> +	args->fixed_pmd_pfn = __phys_to_pfn(phys & PMD_MASK);
> +	args->fixed_pte_pfn = __phys_to_pfn(phys & PAGE_MASK);
> +
> +	/*
> +	 * Allocate (huge) pages because some of the tests need to access
> +	 * the data in the pages. The corresponding tests will be skipped
> +	 * if we fail to allocate (huge) pages.
> +	 */
> +	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
> +	    IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
> +	    has_transparent_hugepage()) {
> +		page = alloc_pages(GFP_KERNEL | __GFP_NOWARN,
> +				   HPAGE_PUD_SHIFT - PAGE_SHIFT);

Please drop __GFP_NOWARN and instead use something like alloc_contig_pages()
when required allocation order exceed (MAX_ORDER - 1). Else the test might
not be able to execute on platform configurations, where PUD THP is enabled.

> +		if (page) {
> +			args->pud_pfn = page_to_pfn(page);
> +			args->pmd_pfn = args->pud_pfn;
> +			args->pte_pfn = args->pud_pfn;
> +			return 0;
> +		}
> +	}
> +
> +	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
> +	    has_transparent_hugepage()) {
> +		page = alloc_pages(GFP_KERNEL | __GFP_NOWARN, HPAGE_PMD_ORDER);
> +		if (page) {
> +			args->pmd_pfn = page_to_pfn(page);
> +			args->pte_pfn = args->pmd_pfn;
> +			return 0;
> +		}
> +	}
> +
> +	page = alloc_pages(GFP_KERNEL, 0);
> +	if (page)
> +		args->pte_pfn = page_to_pfn(page);
> +
> +	return 0;
> +
> +error:
> +	destroy_args(args);
> +	return ret;
> +}
> +
>  static int __init debug_vm_pgtable(void)
>  {
> +	struct pgtable_debug_args args;>  	struct vm_area_struct *vma;
>  	struct mm_struct *mm;
>  	pgd_t *pgdp;
> @@ -970,9 +1159,13 @@ static int __init debug_vm_pgtable(void)
>  	unsigned long vaddr, pte_aligned, pmd_aligned;
>  	unsigned long pud_aligned, p4d_aligned, pgd_aligned;
>  	spinlock_t *ptl = NULL;
> -	int idx;
> +	int idx, ret;
>  
>  	pr_info("Validating architecture page table helpers\n");
> +	ret = init_args(&args);
> +	if (ret)
> +		return ret;
> +
>  	prot = vm_get_page_prot(VMFLAGS);
>  	vaddr = get_random_vaddr();
>  	mm = mm_alloc();
> @@ -1127,6 +1320,8 @@ static int __init debug_vm_pgtable(void)
>  	mm_dec_nr_pmds(mm);
>  	mm_dec_nr_ptes(mm);
>  	mmdrop(mm);
> +
> +	destroy_args(&args);
>  	return 0;
>  }
>  late_initcall(debug_vm_pgtable);
>
Gavin Shan July 21, 2021, 10:20 a.m. UTC | #2
Hi Anshuman,

On 7/21/21 3:44 PM, Anshuman Khandual wrote:
> On 7/19/21 6:36 PM, Gavin Shan wrote:
>> In debug_vm_pgtable(), there are many local variables introduced to
>> track the needed information and they are passed to the functions for
>> various test cases. It'd better to introduce a struct as place holder
>> for these information. With it, what the functions for various test
>> cases need is the struct, to simplify the code. It also makes code
>> easier to be maintained.
>>
>> Besides, set_xxx_at() could access the data on the corresponding pages
>> in the page table modifying tests. So the accessed pages in the tests
>> should have been allocated from buddy. Otherwise, we're accessing pages
>> that aren't owned by us. This causes issues like page flag corruption.
>>
>> This introduces "struct pgtable_debug_args". The struct is initialized
>> and destroyed, but the information in the struct isn't used yet. They
>> will be used in subsequent patches.
>>
>> Signed-off-by: Gavin Shan <gshan@redhat.com>
>> ---
>>   mm/debug_vm_pgtable.c | 197 +++++++++++++++++++++++++++++++++++++++++-
>>   1 file changed, 196 insertions(+), 1 deletion(-)
>>

I saw you've finished the review on PATCH[v3 01/12] and PATCH[v3 02/12].
I will wait to integrate your comments to v4 until you finish the review
on all patches in v3 series.

>> diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
>> index 1c922691aa61..ea153ff40d23 100644
>> --- a/mm/debug_vm_pgtable.c
>> +++ b/mm/debug_vm_pgtable.c
>> @@ -58,6 +58,36 @@
>>   #define RANDOM_ORVALUE (GENMASK(BITS_PER_LONG - 1, 0) & ~ARCH_SKIP_MASK)
>>   #define RANDOM_NZVALUE	GENMASK(7, 0)
>>   
>> +struct pgtable_debug_args {
>> +	struct mm_struct	*mm;
>> +	struct vm_area_struct	*vma;
>> +
>> +	pgd_t			*pgdp;
>> +	p4d_t			*p4dp;
>> +	pud_t			*pudp;
>> +	pmd_t			*pmdp;
>> +	pte_t			*ptep;
>> +
>> +	p4d_t			*start_p4dp;
>> +	pud_t			*start_pudp;
>> +	pmd_t			*start_pmdp;
>> +	pgtable_t		start_ptep;
>> +
>> +	unsigned long		vaddr;
>> +	pgprot_t		page_prot;
>> +	pgprot_t		page_prot_none;
>> +
>> +	unsigned long		pud_pfn;
>> +	unsigned long		pmd_pfn;
>> +	unsigned long		pte_pfn;
>> +
>> +	unsigned long		fixed_pgd_pfn;
>> +	unsigned long		fixed_p4d_pfn;
>> +	unsigned long		fixed_pud_pfn;
>> +	unsigned long		fixed_pmd_pfn;
>> +	unsigned long		fixed_pte_pfn;
>> +};
>> +
>>   static void __init pte_basic_tests(unsigned long pfn, int idx)
>>   {
>>   	pgprot_t prot = protection_map[idx];
>> @@ -955,8 +985,167 @@ static unsigned long __init get_random_vaddr(void)
>>   	return random_vaddr;
>>   }
>>   
>> +static void __init destroy_args(struct pgtable_debug_args *args)
>> +{
>> +	struct page *page = NULL;
>> +
>> +	/* Free (huge) page */
>> +	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
>> +	    IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
>> +	    has_transparent_hugepage() &&
>> +	    args->pud_pfn != ULONG_MAX) {
>> +		page = pfn_to_page(args->pud_pfn);
>> +		__free_pages(page, HPAGE_PUD_SHIFT - PAGE_SHIFT);
>> +	} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
>> +		   has_transparent_hugepage() &&
>> +		   args->pmd_pfn != ULONG_MAX) {
>> +		page = pfn_to_page(args->pmd_pfn);
>> +		__free_pages(page, HPAGE_PMD_ORDER);
>> +	} else if (args->pte_pfn != ULONG_MAX) {
>> +		page = pfn_to_page(args->pte_pfn);
>> +		__free_pages(page, 0);
>> +	}
>> +
>> +	/* Free page table */
>> +	if (args->start_ptep) {
>> +		pte_free(args->mm, args->start_ptep);
>> +		mm_dec_nr_ptes(args->mm);
>> +	}
>> +
>> +	if (args->start_pmdp) {
>> +		pmd_free(args->mm, args->start_pmdp);
>> +		mm_dec_nr_pmds(args->mm);
>> +	}
>> +
>> +	if (args->start_pudp) {
>> +		pud_free(args->mm, args->start_pudp);
>> +		mm_dec_nr_puds(args->mm);
>> +	}
>> +
>> +	if (args->start_p4dp)
>> +		p4d_free(args->mm, args->p4dp);
>> +
>> +	/* Free vma and mm struct */
>> +	if (args->vma)
>> +		vm_area_free(args->vma);
>> +	if (args->mm)
>> +		mmdrop(args->mm);
>> +}
>> +
>> +static int __init init_args(struct pgtable_debug_args *args)
>> +{
>> +	struct page *page = NULL;
>> +	phys_addr_t phys;
>> +	int ret = 0;
>> +
>> +	/* Initialize the debugging data */
>> +	memset(args, 0, sizeof(*args));
>> +	args->page_prot      = vm_get_page_prot(VMFLAGS);
>> +	args->page_prot_none = __P000;
> 
> Please preserve the existing comments before this assignment.
> 
>          /*
>           * __P000 (or even __S000) will help create page table entries with
>           * PROT_NONE permission as required for pxx_protnone_tests().
>           */
> 

Sure. I will combine the comments in v4 as below:

	/*
	 * Initialize the debugging arguments.
	 *
	 * __P000 (or even __S000) will help create page table entries with
          * PROT_NONE permission as required for pxx_protnone_tests().
          */


>> +	args->pud_pfn        = ULONG_MAX;
>> +	args->pmd_pfn        = ULONG_MAX;
>> +	args->pte_pfn        = ULONG_MAX;
>> +	args->fixed_pgd_pfn  = ULONG_MAX;
>> +	args->fixed_p4d_pfn  = ULONG_MAX;
>> +	args->fixed_pud_pfn  = ULONG_MAX;
>> +	args->fixed_pmd_pfn  = ULONG_MAX;
>> +	args->fixed_pte_pfn  = ULONG_MAX;
>> +
>> +	/* Allocate mm and vma */
>> +	args->mm = mm_alloc();
>> +	if (!args->mm) {
>> +		pr_err("Failed to allocate mm struct\n");
>> +		ret = -ENOMEM;
>> +		goto error;
>> +	}
>> +
>> +	args->vma = vm_area_alloc(args->mm);
>> +	if (!args->vma) {
>> +		pr_err("Failed to allocate vma\n");
>> +		ret = -ENOMEM;
>> +		goto error;
>> +	}
>> +
>> +	/* Figure out the virtual address and allocate page table entries */
>> +	args->vaddr = get_random_vaddr();
> 
> Please group args->vaddr's init with page_prot and page_prot_none above.
> 

Yes, It will make the code tidy. I'll move this line accordingly in v4,
but the related comments will be dropped as the code is self-explanatory.

         /* Allocate page table entries */

>> +	args->pgdp = pgd_offset(args->mm, args->vaddr);
>> +	args->p4dp = p4d_alloc(args->mm, args->pgdp, args->vaddr);
>> +	args->pudp = args->p4dp ?
>> +		     pud_alloc(args->mm, args->p4dp, args->vaddr) : NULL;
>> +	args->pmdp = args->pudp ?
>> +		     pmd_alloc(args->mm, args->pudp, args->vaddr) : NULL;
>> +	args->ptep = args->pmdp ?
>> +		     pte_alloc_map(args->mm, args->pmdp, args->vaddr) : NULL;
>> +	if (!args->ptep) {
>> +		pr_err("Failed to allocate page table\n");
>> +		ret = -ENOMEM;
>> +		goto error;
>> +	}
> 
> Why not just assert that all page table level pointers are allocated
> successfully, otherwise bail out the test completely. Something like
> this at each level.
> 
> 	if (!args->p4dp) {
> 		pr_err("Failed to allocate page table\n");
> 		ret = -ENOMEM;
> 		goto error;
> 	}
> 
> Is there any value in proceeding with the test when some page table
> pointers have not been allocated. Also individual tests do not cross
> check these pointers. Also asserting successful allocations will
> make the freeing path simpler, as I had mentioned earlier.
> 

There is no tests will be carried out if we fail to allocate any level
of page table entries. For other questions, please refer below response.
In summary, this snippet needs to be combined with next snippet, as below.

>> +
>> +	/*
>> +	 * The above page table entries will be modified. Lets save the
>> +	 * page table entries so that they can be released when the tests
>> +	 * are completed.
>> +	 */
>> +	args->start_p4dp = p4d_offset(args->pgdp, 0UL);
>> +	args->start_pudp = pud_offset(args->p4dp, 0UL);
>> +	args->start_pmdp = pmd_offset(args->pudp, 0UL);
>> +	args->start_ptep = pmd_pgtable(READ_ONCE(*(args->pmdp)));
> 
> If the above page table pointers have been validated to be allocated
> successfully, we could add these here.
> 
> 	WARN_ON(!args->start_p4dp)
> 	WARN_ON(!args->start_pudp)
> 	WARN_ON(!args->start_pmdp)
> 	WARN_ON(!args->start_ptep)
> 
> Afterwards all those if (args->start_pxdp) checks in the freeing path
> will not be required anymore.
> 

The check on @args->start_pxdp is still needed in destroy_args() for
couple of cases: (1) destroy_args() is called on failing to allocate
@args->mm or @args->vma. That time, no page table entries are allocated.
(2) It's possible to fail allocating current level of page table entries
even the previous levels of page table entries are allocated successfully.

So Lets change these (above) two snippets as below in v4:

	/*
	 * Allocate page table entries. The allocated page table entries
	 * will be modified in the tests. Lets save the page table entries
	 * so that they can be released when the tests are completed.
	 */
	args->pgdp = pgd_offset(args->mm, args->vaddr);
	args->p4dp = p4d_alloc(args->mm, args->pgdp, args->vaddr);
	if (!args->p4dp) {
	    pr_err("Failed to allocate p4d entries\n");
	    ret = -ENOMEM;
	    goto error;
	}

	args->start_p4dp = p4d_offset(args->pgdp, 0UL);
	args->pudp = pud_alloc(args->mm, args->p4dp, args->vaddr);
	if (!args->pudp) {
	    pr_err("Failed to allocate pud entries\n");
	    ret = -ENOMEM;
	    goto error;
	}

	args->pmdp = pmd_alloc(args->mm, args->pudp, args->vaddr);
	if (!args->pmdp) {
	    pr_err("Failed to allocate PMD entries\n");
	    ret = -ENOMEM;
	    goto error;
	}

	args->start_pmdp = pmd_offset(args->pudp, 0UL);
	args->ptep = pte_alloc_map(args->mm, args->pmdp, args->vaddr);
	if (!args->ptep) {
	    pr_err("Failed to allocate page table\n");
	    ret = -ENOMEM;
	    goto error;
	}

	args->start_ptep = pmd_pgtable(READ_ONCE(*(args->pmdp)));

>> +
>> +	/*
>> +	 * Figure out the fixed addresses, which are all around the kernel
>> +	 * symbol (@start_kernel). The corresponding PFNs might be invalid,
>> +	 * but it's fine as the following tests won't access the pages.
>> +	 */
>> +	phys = __pa_symbol(&start_kernel);
>> +	args->fixed_pgd_pfn = __phys_to_pfn(phys & PGDIR_MASK);
>> +	args->fixed_p4d_pfn = __phys_to_pfn(phys & P4D_MASK);
>> +	args->fixed_pud_pfn = __phys_to_pfn(phys & PUD_MASK);
>> +	args->fixed_pmd_pfn = __phys_to_pfn(phys & PMD_MASK);
>> +	args->fixed_pte_pfn = __phys_to_pfn(phys & PAGE_MASK);
>> +
>> +	/*
>> +	 * Allocate (huge) pages because some of the tests need to access
>> +	 * the data in the pages. The corresponding tests will be skipped
>> +	 * if we fail to allocate (huge) pages.
>> +	 */
>> +	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
>> +	    IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
>> +	    has_transparent_hugepage()) {
>> +		page = alloc_pages(GFP_KERNEL | __GFP_NOWARN,
>> +				   HPAGE_PUD_SHIFT - PAGE_SHIFT);
> 
> Please drop __GFP_NOWARN and instead use something like alloc_contig_pages()
> when required allocation order exceed (MAX_ORDER - 1). Else the test might
> not be able to execute on platform configurations, where PUD THP is enabled.
> 

Yes, It's correct that alloc_contig_pages() should be used here, depending
on CONFIG_CONTIG_ALLOC. Otherwise, alloc_pages(...__GFP_NOWARN...) is still
used as we're doing. This snippet will be changed like below in v4:

	/*
	 * Allocate (huge) pages because some of the tests need to access
	 * the data in the pages. The corresponding tests will be skipped
	 * if we fail to allocate (huge) pages.
	 */
	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
	    IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
	    IS_ENABLED(CONFIG_CONTIG_ALLOC)) &&
	    has_transparent_hugepage()) {
		page = alloc_contig_pages((1 << (HPAGE_PUD_SHIFT - PAGE_SHIFT)),
					  GFP_KERNEL | __GFP_NOWARN,
					  first_online_node, NULL);
		if (page) {
			args->is_contiguous_pud_page = true;
			args->pud_pfn = page_to_pfn(page);
			args->pmd_pfn = args->pud_pfn;
			args->pte_pfn = args->pud_pfn;
			return 0;
		}
	}

	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
	    IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
	    has_transparent_hugepage()) {
		page = alloc_pages(GFP_KERNEL | __GFP_NOWARN,
				   HPAGE_PUD_SHIFT - PAGE_SHIFT);
		if (page) {
			args->is_contiguous_pud_page = false;
			args->pud_pfn = page_to_pfn(page);
			args->pmd_pfn = args->pud_pfn;
			args->pte_pfn = args->pud_pfn;
			return 0;
		}
	}

	[... The logic to allocate PMD huge page or page is kept as of being]
         [... The code to release the PUD huge page needs changes based on @args->is_contiguous_pud_page]


>> +		page = alloc_pages(GFP_KERNEL | __GFP_NOWARN,
>> +				   HPAGE_PUD_SHIFT - PAGE_SHIFT);
>> +		if (page) {
>> +			args->pud_pfn = page_to_pfn(page);
>> +			args->pmd_pfn = args->pud_pfn;
>> +			args->pte_pfn = args->pud_pfn;
>> +			return 0;
>> +		}
>> +	}
>> +
>> +	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
>> +	    has_transparent_hugepage()) {
>> +		page = alloc_pages(GFP_KERNEL | __GFP_NOWARN, HPAGE_PMD_ORDER);
>> +		if (page) {
>> +			args->pmd_pfn = page_to_pfn(page);
>> +			args->pte_pfn = args->pmd_pfn;
>> +			return 0;
>> +		}
>> +	}
>> +
>> +	page = alloc_pages(GFP_KERNEL, 0);
>> +	if (page)
>> +		args->pte_pfn = page_to_pfn(page);
>> +
>> +	return 0;
>> +
>> +error:
>> +	destroy_args(args);
>> +	return ret;
>> +}
>> +
>>   static int __init debug_vm_pgtable(void)
>>   {
>> +	struct pgtable_debug_args args;>  	struct vm_area_struct *vma;
>>   	struct mm_struct *mm;
>>   	pgd_t *pgdp;
>> @@ -970,9 +1159,13 @@ static int __init debug_vm_pgtable(void)
>>   	unsigned long vaddr, pte_aligned, pmd_aligned;
>>   	unsigned long pud_aligned, p4d_aligned, pgd_aligned;
>>   	spinlock_t *ptl = NULL;
>> -	int idx;
>> +	int idx, ret;
>>   
>>   	pr_info("Validating architecture page table helpers\n");
>> +	ret = init_args(&args);
>> +	if (ret)
>> +		return ret;
>> +
>>   	prot = vm_get_page_prot(VMFLAGS);
>>   	vaddr = get_random_vaddr();
>>   	mm = mm_alloc();
>> @@ -1127,6 +1320,8 @@ static int __init debug_vm_pgtable(void)
>>   	mm_dec_nr_pmds(mm);
>>   	mm_dec_nr_ptes(mm);
>>   	mmdrop(mm);
>> +
>> +	destroy_args(&args);
>>   	return 0;
>>   }
>>   late_initcall(debug_vm_pgtable);
>>

Thanks,
Gavin
Anshuman Khandual July 21, 2021, 10:39 a.m. UTC | #3
On 7/21/21 3:50 PM, Gavin Shan wrote:
> Hi Anshuman,
> 
> On 7/21/21 3:44 PM, Anshuman Khandual wrote:
>> On 7/19/21 6:36 PM, Gavin Shan wrote:
>>> In debug_vm_pgtable(), there are many local variables introduced to
>>> track the needed information and they are passed to the functions for
>>> various test cases. It'd better to introduce a struct as place holder
>>> for these information. With it, what the functions for various test
>>> cases need is the struct, to simplify the code. It also makes code
>>> easier to be maintained.
>>>
>>> Besides, set_xxx_at() could access the data on the corresponding pages
>>> in the page table modifying tests. So the accessed pages in the tests
>>> should have been allocated from buddy. Otherwise, we're accessing pages
>>> that aren't owned by us. This causes issues like page flag corruption.
>>>
>>> This introduces "struct pgtable_debug_args". The struct is initialized
>>> and destroyed, but the information in the struct isn't used yet. They
>>> will be used in subsequent patches.
>>>
>>> Signed-off-by: Gavin Shan <gshan@redhat.com>
>>> ---
>>>   mm/debug_vm_pgtable.c | 197 +++++++++++++++++++++++++++++++++++++++++-
>>>   1 file changed, 196 insertions(+), 1 deletion(-)
>>>
> 
> I saw you've finished the review on PATCH[v3 01/12] and PATCH[v3 02/12].
> I will wait to integrate your comments to v4 until you finish the review
> on all patches in v3 series

Yes, please do wait for the complete review and test before going for V4.
Also please add the following emails on copy next time, so that we might
have some more reviews here. Thank you.

+ Christophe Leroy <christophe.leroy@csgroup.eu>
+ Gerald Schaefer <gerald.schaefer@de.ibm.com>
+ Qian Cai <cai@lca.pw>
+ Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Anshuman Khandual July 21, 2021, 10:59 a.m. UTC | #4
On 7/21/21 4:09 PM, Anshuman Khandual wrote:
> 
> On 7/21/21 3:50 PM, Gavin Shan wrote:
>> Hi Anshuman,
>>
>> On 7/21/21 3:44 PM, Anshuman Khandual wrote:
>>> On 7/19/21 6:36 PM, Gavin Shan wrote:
>>>> In debug_vm_pgtable(), there are many local variables introduced to
>>>> track the needed information and they are passed to the functions for
>>>> various test cases. It'd better to introduce a struct as place holder
>>>> for these information. With it, what the functions for various test
>>>> cases need is the struct, to simplify the code. It also makes code
>>>> easier to be maintained.
>>>>
>>>> Besides, set_xxx_at() could access the data on the corresponding pages
>>>> in the page table modifying tests. So the accessed pages in the tests
>>>> should have been allocated from buddy. Otherwise, we're accessing pages
>>>> that aren't owned by us. This causes issues like page flag corruption.
>>>>
>>>> This introduces "struct pgtable_debug_args". The struct is initialized
>>>> and destroyed, but the information in the struct isn't used yet. They
>>>> will be used in subsequent patches.
>>>>
>>>> Signed-off-by: Gavin Shan <gshan@redhat.com>
>>>> ---
>>>>   mm/debug_vm_pgtable.c | 197 +++++++++++++++++++++++++++++++++++++++++-
>>>>   1 file changed, 196 insertions(+), 1 deletion(-)
>>>>
>> I saw you've finished the review on PATCH[v3 01/12] and PATCH[v3 02/12].
>> I will wait to integrate your comments to v4 until you finish the review
>> on all patches in v3 series
> Yes, please do wait for the complete review and test before going for V4.
> Also please add the following emails on copy next time, so that we might
> have some more reviews here. Thank you.
> 
> + Christophe Leroy <christophe.leroy@csgroup.eu>
> + Gerald Schaefer <gerald.schaefer@de.ibm.com>

This one instead.

Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Gavin Shan July 21, 2021, 11:59 a.m. UTC | #5
On 7/21/21 8:59 PM, Anshuman Khandual wrote:
> On 7/21/21 4:09 PM, Anshuman Khandual wrote:
>> On 7/21/21 3:50 PM, Gavin Shan wrote:
>>> On 7/21/21 3:44 PM, Anshuman Khandual wrote:
>>>> On 7/19/21 6:36 PM, Gavin Shan wrote:
>>>>> In debug_vm_pgtable(), there are many local variables introduced to
>>>>> track the needed information and they are passed to the functions for
>>>>> various test cases. It'd better to introduce a struct as place holder
>>>>> for these information. With it, what the functions for various test
>>>>> cases need is the struct, to simplify the code. It also makes code
>>>>> easier to be maintained.
>>>>>
>>>>> Besides, set_xxx_at() could access the data on the corresponding pages
>>>>> in the page table modifying tests. So the accessed pages in the tests
>>>>> should have been allocated from buddy. Otherwise, we're accessing pages
>>>>> that aren't owned by us. This causes issues like page flag corruption.
>>>>>
>>>>> This introduces "struct pgtable_debug_args". The struct is initialized
>>>>> and destroyed, but the information in the struct isn't used yet. They
>>>>> will be used in subsequent patches.
>>>>>
>>>>> Signed-off-by: Gavin Shan <gshan@redhat.com>
>>>>> ---
>>>>>    mm/debug_vm_pgtable.c | 197 +++++++++++++++++++++++++++++++++++++++++-
>>>>>    1 file changed, 196 insertions(+), 1 deletion(-)
>>>>>
>>> I saw you've finished the review on PATCH[v3 01/12] and PATCH[v3 02/12].
>>> I will wait to integrate your comments to v4 until you finish the review
>>> on all patches in v3 series
>> Yes, please do wait for the complete review and test before going for V4.
>> Also please add the following emails on copy next time, so that we might
>> have some more reviews here. Thank you.
>>
>> + Christophe Leroy <christophe.leroy@csgroup.eu>
>> + Gerald Schaefer <gerald.schaefer@de.ibm.com>
> 
> This one instead.
> 
> Gerald Schaefer <gerald.schaefer@linux.ibm.com>
> 

Sure, It's always nice to have more reviews from the experts. I will
include them when I post v4. Thanks again for your time to review :)

Thanks,
Gavin
Anshuman Khandual July 22, 2021, 4:41 a.m. UTC | #6
On 7/21/21 3:50 PM, Gavin Shan wrote:
> Hi Anshuman,
> 
> On 7/21/21 3:44 PM, Anshuman Khandual wrote:
>> On 7/19/21 6:36 PM, Gavin Shan wrote:
>>> In debug_vm_pgtable(), there are many local variables introduced to
>>> track the needed information and they are passed to the functions for
>>> various test cases. It'd better to introduce a struct as place holder
>>> for these information. With it, what the functions for various test
>>> cases need is the struct, to simplify the code. It also makes code
>>> easier to be maintained.
>>>
>>> Besides, set_xxx_at() could access the data on the corresponding pages
>>> in the page table modifying tests. So the accessed pages in the tests
>>> should have been allocated from buddy. Otherwise, we're accessing pages
>>> that aren't owned by us. This causes issues like page flag corruption.
>>>
>>> This introduces "struct pgtable_debug_args". The struct is initialized
>>> and destroyed, but the information in the struct isn't used yet. They
>>> will be used in subsequent patches.
>>>
>>> Signed-off-by: Gavin Shan <gshan@redhat.com>
>>> ---
>>>   mm/debug_vm_pgtable.c | 197 +++++++++++++++++++++++++++++++++++++++++-
>>>   1 file changed, 196 insertions(+), 1 deletion(-)
>>>
> 
> I saw you've finished the review on PATCH[v3 01/12] and PATCH[v3 02/12].
> I will wait to integrate your comments to v4 until you finish the review
> on all patches in v3 series.
> 
>>> diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
>>> index 1c922691aa61..ea153ff40d23 100644
>>> --- a/mm/debug_vm_pgtable.c
>>> +++ b/mm/debug_vm_pgtable.c
>>> @@ -58,6 +58,36 @@
>>>   #define RANDOM_ORVALUE (GENMASK(BITS_PER_LONG - 1, 0) & ~ARCH_SKIP_MASK)
>>>   #define RANDOM_NZVALUE    GENMASK(7, 0)
>>>   +struct pgtable_debug_args {
>>> +    struct mm_struct    *mm;
>>> +    struct vm_area_struct    *vma;
>>> +
>>> +    pgd_t            *pgdp;
>>> +    p4d_t            *p4dp;
>>> +    pud_t            *pudp;
>>> +    pmd_t            *pmdp;
>>> +    pte_t            *ptep;
>>> +
>>> +    p4d_t            *start_p4dp;
>>> +    pud_t            *start_pudp;
>>> +    pmd_t            *start_pmdp;
>>> +    pgtable_t        start_ptep;
>>> +
>>> +    unsigned long        vaddr;
>>> +    pgprot_t        page_prot;
>>> +    pgprot_t        page_prot_none;
>>> +
>>> +    unsigned long        pud_pfn;
>>> +    unsigned long        pmd_pfn;
>>> +    unsigned long        pte_pfn;
>>> +
>>> +    unsigned long        fixed_pgd_pfn;
>>> +    unsigned long        fixed_p4d_pfn;
>>> +    unsigned long        fixed_pud_pfn;
>>> +    unsigned long        fixed_pmd_pfn;
>>> +    unsigned long        fixed_pte_pfn;
>>> +};
>>> +
>>>   static void __init pte_basic_tests(unsigned long pfn, int idx)
>>>   {
>>>       pgprot_t prot = protection_map[idx];
>>> @@ -955,8 +985,167 @@ static unsigned long __init get_random_vaddr(void)
>>>       return random_vaddr;
>>>   }
>>>   +static void __init destroy_args(struct pgtable_debug_args *args)
>>> +{
>>> +    struct page *page = NULL;
>>> +
>>> +    /* Free (huge) page */
>>> +    if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
>>> +        IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
>>> +        has_transparent_hugepage() &&
>>> +        args->pud_pfn != ULONG_MAX) {
>>> +        page = pfn_to_page(args->pud_pfn);
>>> +        __free_pages(page, HPAGE_PUD_SHIFT - PAGE_SHIFT);
>>> +    } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
>>> +           has_transparent_hugepage() &&
>>> +           args->pmd_pfn != ULONG_MAX) {
>>> +        page = pfn_to_page(args->pmd_pfn);
>>> +        __free_pages(page, HPAGE_PMD_ORDER);
>>> +    } else if (args->pte_pfn != ULONG_MAX) {
>>> +        page = pfn_to_page(args->pte_pfn);
>>> +        __free_pages(page, 0);
>>> +    }
>>> +
>>> +    /* Free page table */
>>> +    if (args->start_ptep) {
>>> +        pte_free(args->mm, args->start_ptep);
>>> +        mm_dec_nr_ptes(args->mm);
>>> +    }
>>> +
>>> +    if (args->start_pmdp) {
>>> +        pmd_free(args->mm, args->start_pmdp);
>>> +        mm_dec_nr_pmds(args->mm);
>>> +    }
>>> +
>>> +    if (args->start_pudp) {
>>> +        pud_free(args->mm, args->start_pudp);
>>> +        mm_dec_nr_puds(args->mm);
>>> +    }
>>> +
>>> +    if (args->start_p4dp)
>>> +        p4d_free(args->mm, args->p4dp);
>>> +
>>> +    /* Free vma and mm struct */
>>> +    if (args->vma)
>>> +        vm_area_free(args->vma);
>>> +    if (args->mm)
>>> +        mmdrop(args->mm);
>>> +}
>>> +
>>> +static int __init init_args(struct pgtable_debug_args *args)
>>> +{
>>> +    struct page *page = NULL;
>>> +    phys_addr_t phys;
>>> +    int ret = 0;
>>> +
>>> +    /* Initialize the debugging data */
>>> +    memset(args, 0, sizeof(*args));
>>> +    args->page_prot      = vm_get_page_prot(VMFLAGS);
>>> +    args->page_prot_none = __P000;
>>
>> Please preserve the existing comments before this assignment.
>>
>>          /*
>>           * __P000 (or even __S000) will help create page table entries with
>>           * PROT_NONE permission as required for pxx_protnone_tests().
>>           */
>>
> 
> Sure. I will combine the comments in v4 as below:
> 
>     /*
>      * Initialize the debugging arguments.
>      *
>      * __P000 (or even __S000) will help create page table entries with
>          * PROT_NONE permission as required for pxx_protnone_tests().
>          */
> 
> 
>>> +    args->pud_pfn        = ULONG_MAX;
>>> +    args->pmd_pfn        = ULONG_MAX;
>>> +    args->pte_pfn        = ULONG_MAX;
>>> +    args->fixed_pgd_pfn  = ULONG_MAX;
>>> +    args->fixed_p4d_pfn  = ULONG_MAX;
>>> +    args->fixed_pud_pfn  = ULONG_MAX;
>>> +    args->fixed_pmd_pfn  = ULONG_MAX;
>>> +    args->fixed_pte_pfn  = ULONG_MAX;
>>> +
>>> +    /* Allocate mm and vma */
>>> +    args->mm = mm_alloc();
>>> +    if (!args->mm) {
>>> +        pr_err("Failed to allocate mm struct\n");
>>> +        ret = -ENOMEM;
>>> +        goto error;
>>> +    }
>>> +
>>> +    args->vma = vm_area_alloc(args->mm);
>>> +    if (!args->vma) {
>>> +        pr_err("Failed to allocate vma\n");
>>> +        ret = -ENOMEM;
>>> +        goto error;
>>> +    }
>>> +
>>> +    /* Figure out the virtual address and allocate page table entries */
>>> +    args->vaddr = get_random_vaddr();
>>
>> Please group args->vaddr's init with page_prot and page_prot_none above.
>>
> 
> Yes, It will make the code tidy. I'll move this line accordingly in v4,
> but the related comments will be dropped as the code is self-explanatory.
> 
>         /* Allocate page table entries */
> 
>>> +    args->pgdp = pgd_offset(args->mm, args->vaddr);
>>> +    args->p4dp = p4d_alloc(args->mm, args->pgdp, args->vaddr);
>>> +    args->pudp = args->p4dp ?
>>> +             pud_alloc(args->mm, args->p4dp, args->vaddr) : NULL;
>>> +    args->pmdp = args->pudp ?
>>> +             pmd_alloc(args->mm, args->pudp, args->vaddr) : NULL;
>>> +    args->ptep = args->pmdp ?
>>> +             pte_alloc_map(args->mm, args->pmdp, args->vaddr) : NULL;
>>> +    if (!args->ptep) {
>>> +        pr_err("Failed to allocate page table\n");
>>> +        ret = -ENOMEM;
>>> +        goto error;
>>> +    }
>>
>> Why not just assert that all page table level pointers are allocated
>> successfully, otherwise bail out the test completely. Something like
>> this at each level.
>>
>>     if (!args->p4dp) {
>>         pr_err("Failed to allocate page table\n");
>>         ret = -ENOMEM;
>>         goto error;
>>     }
>>
>> Is there any value in proceeding with the test when some page table
>> pointers have not been allocated. Also individual tests do not cross
>> check these pointers. Also asserting successful allocations will
>> make the freeing path simpler, as I had mentioned earlier.
>>
> 
> There is no tests will be carried out if we fail to allocate any level
> of page table entries. For other questions, please refer below response.
> In summary, this snippet needs to be combined with next snippet, as below.
> 
>>> +
>>> +    /*
>>> +     * The above page table entries will be modified. Lets save the
>>> +     * page table entries so that they can be released when the tests
>>> +     * are completed.
>>> +     */
>>> +    args->start_p4dp = p4d_offset(args->pgdp, 0UL);
>>> +    args->start_pudp = pud_offset(args->p4dp, 0UL);
>>> +    args->start_pmdp = pmd_offset(args->pudp, 0UL);
>>> +    args->start_ptep = pmd_pgtable(READ_ONCE(*(args->pmdp)));
>>
>> If the above page table pointers have been validated to be allocated
>> successfully, we could add these here.
>>
>>     WARN_ON(!args->start_p4dp)
>>     WARN_ON(!args->start_pudp)
>>     WARN_ON(!args->start_pmdp)
>>     WARN_ON(!args->start_ptep)
>>
>> Afterwards all those if (args->start_pxdp) checks in the freeing path
>> will not be required anymore.
>>
> 
> The check on @args->start_pxdp is still needed in destroy_args() for
> couple of cases: (1) destroy_args() is called on failing to allocate
> @args->mm or @args->vma. That time, no page table entries are allocated.
> (2) It's possible to fail allocating current level of page table entries
> even the previous levels of page table entries are allocated successfully.

This makes sense as destroy_args() is getting called if any of these
allocations fails during init_args(). Did not realize that earlier.

> 
> So Lets change these (above) two snippets as below in v4:
> 
>     /*
>      * Allocate page table entries. The allocated page table entries
>      * will be modified in the tests. Lets save the page table entries
>      * so that they can be released when the tests are completed.
>      */
>     args->pgdp = pgd_offset(args->mm, args->vaddr);
>     args->p4dp = p4d_alloc(args->mm, args->pgdp, args->vaddr);
>     if (!args->p4dp) {
>         pr_err("Failed to allocate p4d entries\n");
>         ret = -ENOMEM;
>         goto error;
>     }
> 
>     args->start_p4dp = p4d_offset(args->pgdp, 0UL);

Dont bring the arg->start_pxdp assignments here. If all page table level
pointer allocations succeed, they all get assigned together like we have
right now. Although a sanity check afterwards like the following, might
still be better.

WARN_ON(!args->start_p4dp)
WARN_ON(!args->start_pudp)
WARN_ON(!args->start_pmdp)
WARN_ON(!args->start_ptep)

>     args->pudp = pud_alloc(args->mm, args->p4dp, args->vaddr);
>     if (!args->pudp) {
>         pr_err("Failed to allocate pud entries\n");
>         ret = -ENOMEM;
>         goto error;
>     }
> 
>     args->pmdp = pmd_alloc(args->mm, args->pudp, args->vaddr);
>     if (!args->pmdp) {
>         pr_err("Failed to allocate PMD entries\n");
>         ret = -ENOMEM;
>         goto error;
>     }
> 
>     args->start_pmdp = pmd_offset(args->pudp, 0UL);
>     args->ptep = pte_alloc_map(args->mm, args->pmdp, args->vaddr);
>     if (!args->ptep) {
>         pr_err("Failed to allocate page table\n");
>         ret = -ENOMEM;
>         goto error;
>     }
> 
>     args->start_ptep = pmd_pgtable(READ_ONCE(*(args->pmdp)));
> 
>>> +
>>> +    /*
>>> +     * Figure out the fixed addresses, which are all around the kernel
>>> +     * symbol (@start_kernel). The corresponding PFNs might be invalid,
>>> +     * but it's fine as the following tests won't access the pages.
>>> +     */
>>> +    phys = __pa_symbol(&start_kernel);
>>> +    args->fixed_pgd_pfn = __phys_to_pfn(phys & PGDIR_MASK);
>>> +    args->fixed_p4d_pfn = __phys_to_pfn(phys & P4D_MASK);
>>> +    args->fixed_pud_pfn = __phys_to_pfn(phys & PUD_MASK);
>>> +    args->fixed_pmd_pfn = __phys_to_pfn(phys & PMD_MASK);
>>> +    args->fixed_pte_pfn = __phys_to_pfn(phys & PAGE_MASK);
>>> +
>>> +    /*
>>> +     * Allocate (huge) pages because some of the tests need to access
>>> +     * the data in the pages. The corresponding tests will be skipped
>>> +     * if we fail to allocate (huge) pages.
>>> +     */
>>> +    if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
>>> +        IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
>>> +        has_transparent_hugepage()) {
>>> +        page = alloc_pages(GFP_KERNEL | __GFP_NOWARN,
>>> +                   HPAGE_PUD_SHIFT - PAGE_SHIFT);
>>
>> Please drop __GFP_NOWARN and instead use something like alloc_contig_pages()
>> when required allocation order exceed (MAX_ORDER - 1). Else the test might
>> not be able to execute on platform configurations, where PUD THP is enabled.
>>
> 
> Yes, It's correct that alloc_contig_pages() should be used here, depending
> on CONFIG_CONTIG_ALLOC. Otherwise, alloc_pages(...__GFP_NOWARN...) is still
> used as we're doing. This snippet will be changed like below in v4:

First 'order > (MAX_ORDER - 1)' needs to be established before calling into
alloc_contig_pages() without __GFP_NOWARN and set a new flag indicating that
there is contig page allocated. But if 'order <= (MAX_ORDER - 1)', then call
alloc_pages(..) without  __GFP_NOWARN. There is no need to add  __GFP_NOWARN
in any case. In case CONFIG_CONTIG_ALLOC is not available, directly return a
NULL as that would have been the case with alloc_pages(...__GFP_NOWARN...) as
well.

Symbol alloc_contig_pages() is not available outside CONFIG_CONTIG_ALLOC. So
IS_ENABLED() construct will not work, unless there is an empty stub added in
the header. Otherwise #ifdef CONFIG_CONTIG_ALLOC needs to be used instead.

Regardless please do test this on a x86 platform with PUD based THP in order
to make sure every thing works as expected.

> 
>     /*
>      * Allocate (huge) pages because some of the tests need to access
>      * the data in the pages. The corresponding tests will be skipped
>      * if we fail to allocate (huge) pages.
>      */
>     if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
>         IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
>         IS_ENABLED(CONFIG_CONTIG_ALLOC)) &&
>         has_transparent_hugepage()) {
>         page = alloc_contig_pages((1 << (HPAGE_PUD_SHIFT - PAGE_SHIFT)),
>                       GFP_KERNEL | __GFP_NOWARN,
>                       first_online_node, NULL);
>         if (page) {
>             args->is_contiguous_pud_page = true;
>             args->pud_pfn = page_to_pfn(page);
>             args->pmd_pfn = args->pud_pfn;
>             args->pte_pfn = args->pud_pfn;
>             return 0;
>         }
>     }
> 
>     if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
>         IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
>         has_transparent_hugepage()) {
>         page = alloc_pages(GFP_KERNEL | __GFP_NOWARN,
>                    HPAGE_PUD_SHIFT - PAGE_SHIFT);
>         if (page) {
>             args->is_contiguous_pud_page = false;
>             args->pud_pfn = page_to_pfn(page);
>             args->pmd_pfn = args->pud_pfn;
>             args->pte_pfn = args->pud_pfn;
>             return 0;
>         }
>     }
> 
>     [... The logic to allocate PMD huge page or page is kept as of being]

IIRC it is also not guaranteed that PMD_SHIFT <= (MAX_ORDER - 1). Hence
this same scheme should be followed for PMD level allocation as well.

>     [... The code to release the PUD huge page needs changes based on @args->is_contiguous_pud_page]

Right, a flag would be needed to call the appropriate free function.
Gavin Shan July 22, 2021, 6:23 a.m. UTC | #7
Hi Anshuman,

On 7/22/21 2:41 PM, Anshuman Khandual wrote:
> On 7/21/21 3:50 PM, Gavin Shan wrote:
>> On 7/21/21 3:44 PM, Anshuman Khandual wrote:
>>> On 7/19/21 6:36 PM, Gavin Shan wrote:
>>>> In debug_vm_pgtable(), there are many local variables introduced to
>>>> track the needed information and they are passed to the functions for
>>>> various test cases. It'd better to introduce a struct as place holder
>>>> for these information. With it, what the functions for various test
>>>> cases need is the struct, to simplify the code. It also makes code
>>>> easier to be maintained.
>>>>
>>>> Besides, set_xxx_at() could access the data on the corresponding pages
>>>> in the page table modifying tests. So the accessed pages in the tests
>>>> should have been allocated from buddy. Otherwise, we're accessing pages
>>>> that aren't owned by us. This causes issues like page flag corruption.
>>>>
>>>> This introduces "struct pgtable_debug_args". The struct is initialized
>>>> and destroyed, but the information in the struct isn't used yet. They
>>>> will be used in subsequent patches.
>>>>
>>>> Signed-off-by: Gavin Shan <gshan@redhat.com>
>>>> ---
>>>>    mm/debug_vm_pgtable.c | 197 +++++++++++++++++++++++++++++++++++++++++-
>>>>    1 file changed, 196 insertions(+), 1 deletion(-)
>>>>
>>
>> I saw you've finished the review on PATCH[v3 01/12] and PATCH[v3 02/12].
>> I will wait to integrate your comments to v4 until you finish the review
>> on all patches in v3 series.
>>
>>>> diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
>>>> index 1c922691aa61..ea153ff40d23 100644
>>>> --- a/mm/debug_vm_pgtable.c
>>>> +++ b/mm/debug_vm_pgtable.c
>>>> @@ -58,6 +58,36 @@
>>>>    #define RANDOM_ORVALUE (GENMASK(BITS_PER_LONG - 1, 0) & ~ARCH_SKIP_MASK)
>>>>    #define RANDOM_NZVALUE    GENMASK(7, 0)
>>>>    +struct pgtable_debug_args {
>>>> +    struct mm_struct    *mm;
>>>> +    struct vm_area_struct    *vma;
>>>> +
>>>> +    pgd_t            *pgdp;
>>>> +    p4d_t            *p4dp;
>>>> +    pud_t            *pudp;
>>>> +    pmd_t            *pmdp;
>>>> +    pte_t            *ptep;
>>>> +
>>>> +    p4d_t            *start_p4dp;
>>>> +    pud_t            *start_pudp;
>>>> +    pmd_t            *start_pmdp;
>>>> +    pgtable_t        start_ptep;
>>>> +
>>>> +    unsigned long        vaddr;
>>>> +    pgprot_t        page_prot;
>>>> +    pgprot_t        page_prot_none;
>>>> +
>>>> +    unsigned long        pud_pfn;
>>>> +    unsigned long        pmd_pfn;
>>>> +    unsigned long        pte_pfn;
>>>> +
>>>> +    unsigned long        fixed_pgd_pfn;
>>>> +    unsigned long        fixed_p4d_pfn;
>>>> +    unsigned long        fixed_pud_pfn;
>>>> +    unsigned long        fixed_pmd_pfn;
>>>> +    unsigned long        fixed_pte_pfn;
>>>> +};
>>>> +
>>>>    static void __init pte_basic_tests(unsigned long pfn, int idx)
>>>>    {
>>>>        pgprot_t prot = protection_map[idx];
>>>> @@ -955,8 +985,167 @@ static unsigned long __init get_random_vaddr(void)
>>>>        return random_vaddr;
>>>>    }
>>>>    +static void __init destroy_args(struct pgtable_debug_args *args)
>>>> +{
>>>> +    struct page *page = NULL;
>>>> +
>>>> +    /* Free (huge) page */
>>>> +    if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
>>>> +        IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
>>>> +        has_transparent_hugepage() &&
>>>> +        args->pud_pfn != ULONG_MAX) {
>>>> +        page = pfn_to_page(args->pud_pfn);
>>>> +        __free_pages(page, HPAGE_PUD_SHIFT - PAGE_SHIFT);
>>>> +    } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
>>>> +           has_transparent_hugepage() &&
>>>> +           args->pmd_pfn != ULONG_MAX) {
>>>> +        page = pfn_to_page(args->pmd_pfn);
>>>> +        __free_pages(page, HPAGE_PMD_ORDER);
>>>> +    } else if (args->pte_pfn != ULONG_MAX) {
>>>> +        page = pfn_to_page(args->pte_pfn);
>>>> +        __free_pages(page, 0);
>>>> +    }
>>>> +
>>>> +    /* Free page table */
>>>> +    if (args->start_ptep) {
>>>> +        pte_free(args->mm, args->start_ptep);
>>>> +        mm_dec_nr_ptes(args->mm);
>>>> +    }
>>>> +
>>>> +    if (args->start_pmdp) {
>>>> +        pmd_free(args->mm, args->start_pmdp);
>>>> +        mm_dec_nr_pmds(args->mm);
>>>> +    }
>>>> +
>>>> +    if (args->start_pudp) {
>>>> +        pud_free(args->mm, args->start_pudp);
>>>> +        mm_dec_nr_puds(args->mm);
>>>> +    }
>>>> +
>>>> +    if (args->start_p4dp)
>>>> +        p4d_free(args->mm, args->p4dp);
>>>> +
>>>> +    /* Free vma and mm struct */
>>>> +    if (args->vma)
>>>> +        vm_area_free(args->vma);
>>>> +    if (args->mm)
>>>> +        mmdrop(args->mm);
>>>> +}
>>>> +
>>>> +static int __init init_args(struct pgtable_debug_args *args)
>>>> +{
>>>> +    struct page *page = NULL;
>>>> +    phys_addr_t phys;
>>>> +    int ret = 0;
>>>> +
>>>> +    /* Initialize the debugging data */
>>>> +    memset(args, 0, sizeof(*args));
>>>> +    args->page_prot      = vm_get_page_prot(VMFLAGS);
>>>> +    args->page_prot_none = __P000;
>>>
>>> Please preserve the existing comments before this assignment.
>>>
>>>           /*
>>>            * __P000 (or even __S000) will help create page table entries with
>>>            * PROT_NONE permission as required for pxx_protnone_tests().
>>>            */
>>>
>>
>> Sure. I will combine the comments in v4 as below:
>>
>>      /*
>>       * Initialize the debugging arguments.
>>       *
>>       * __P000 (or even __S000) will help create page table entries with
>>           * PROT_NONE permission as required for pxx_protnone_tests().
>>           */
>>
>>
>>>> +    args->pud_pfn        = ULONG_MAX;
>>>> +    args->pmd_pfn        = ULONG_MAX;
>>>> +    args->pte_pfn        = ULONG_MAX;
>>>> +    args->fixed_pgd_pfn  = ULONG_MAX;
>>>> +    args->fixed_p4d_pfn  = ULONG_MAX;
>>>> +    args->fixed_pud_pfn  = ULONG_MAX;
>>>> +    args->fixed_pmd_pfn  = ULONG_MAX;
>>>> +    args->fixed_pte_pfn  = ULONG_MAX;
>>>> +
>>>> +    /* Allocate mm and vma */
>>>> +    args->mm = mm_alloc();
>>>> +    if (!args->mm) {
>>>> +        pr_err("Failed to allocate mm struct\n");
>>>> +        ret = -ENOMEM;
>>>> +        goto error;
>>>> +    }
>>>> +
>>>> +    args->vma = vm_area_alloc(args->mm);
>>>> +    if (!args->vma) {
>>>> +        pr_err("Failed to allocate vma\n");
>>>> +        ret = -ENOMEM;
>>>> +        goto error;
>>>> +    }
>>>> +
>>>> +    /* Figure out the virtual address and allocate page table entries */
>>>> +    args->vaddr = get_random_vaddr();
>>>
>>> Please group args->vaddr's init with page_prot and page_prot_none above.
>>>
>>
>> Yes, It will make the code tidy. I'll move this line accordingly in v4,
>> but the related comments will be dropped as the code is self-explanatory.
>>
>>          /* Allocate page table entries */
>>
>>>> +    args->pgdp = pgd_offset(args->mm, args->vaddr);
>>>> +    args->p4dp = p4d_alloc(args->mm, args->pgdp, args->vaddr);
>>>> +    args->pudp = args->p4dp ?
>>>> +             pud_alloc(args->mm, args->p4dp, args->vaddr) : NULL;
>>>> +    args->pmdp = args->pudp ?
>>>> +             pmd_alloc(args->mm, args->pudp, args->vaddr) : NULL;
>>>> +    args->ptep = args->pmdp ?
>>>> +             pte_alloc_map(args->mm, args->pmdp, args->vaddr) : NULL;
>>>> +    if (!args->ptep) {
>>>> +        pr_err("Failed to allocate page table\n");
>>>> +        ret = -ENOMEM;
>>>> +        goto error;
>>>> +    }
>>>
>>> Why not just assert that all page table level pointers are allocated
>>> successfully, otherwise bail out the test completely. Something like
>>> this at each level.
>>>
>>>      if (!args->p4dp) {
>>>          pr_err("Failed to allocate page table\n");
>>>          ret = -ENOMEM;
>>>          goto error;
>>>      }
>>>
>>> Is there any value in proceeding with the test when some page table
>>> pointers have not been allocated. Also individual tests do not cross
>>> check these pointers. Also asserting successful allocations will
>>> make the freeing path simpler, as I had mentioned earlier.
>>>
>>
>> There is no tests will be carried out if we fail to allocate any level
>> of page table entries. For other questions, please refer below response.
>> In summary, this snippet needs to be combined with next snippet, as below.
>>
>>>> +
>>>> +    /*
>>>> +     * The above page table entries will be modified. Lets save the
>>>> +     * page table entries so that they can be released when the tests
>>>> +     * are completed.
>>>> +     */
>>>> +    args->start_p4dp = p4d_offset(args->pgdp, 0UL);
>>>> +    args->start_pudp = pud_offset(args->p4dp, 0UL);
>>>> +    args->start_pmdp = pmd_offset(args->pudp, 0UL);
>>>> +    args->start_ptep = pmd_pgtable(READ_ONCE(*(args->pmdp)));
>>>
>>> If the above page table pointers have been validated to be allocated
>>> successfully, we could add these here.
>>>
>>>      WARN_ON(!args->start_p4dp)
>>>      WARN_ON(!args->start_pudp)
>>>      WARN_ON(!args->start_pmdp)
>>>      WARN_ON(!args->start_ptep)
>>>
>>> Afterwards all those if (args->start_pxdp) checks in the freeing path
>>> will not be required anymore.
>>>
>>
>> The check on @args->start_pxdp is still needed in destroy_args() for
>> couple of cases: (1) destroy_args() is called on failing to allocate
>> @args->mm or @args->vma. That time, no page table entries are allocated.
>> (2) It's possible to fail allocating current level of page table entries
>> even the previous levels of page table entries are allocated successfully.
> 
> This makes sense as destroy_args() is getting called if any of these
> allocations fails during init_args(). Did not realize that earlier.
> 
>>
>> So Lets change these (above) two snippets as below in v4:
>>
>>      /*
>>       * Allocate page table entries. The allocated page table entries
>>       * will be modified in the tests. Lets save the page table entries
>>       * so that they can be released when the tests are completed.
>>       */
>>      args->pgdp = pgd_offset(args->mm, args->vaddr);
>>      args->p4dp = p4d_alloc(args->mm, args->pgdp, args->vaddr);
>>      if (!args->p4dp) {
>>          pr_err("Failed to allocate p4d entries\n");
>>          ret = -ENOMEM;
>>          goto error;
>>      }
>>
>>      args->start_p4dp = p4d_offset(args->pgdp, 0UL);
> 
> Dont bring the arg->start_pxdp assignments here. If all page table level
> pointer allocations succeed, they all get assigned together like we have
> right now. Although a sanity check afterwards like the following, might
> still be better.
> 
> WARN_ON(!args->start_p4dp)
> WARN_ON(!args->start_pudp)
> WARN_ON(!args->start_pmdp)
> WARN_ON(!args->start_ptep)
> 

We have to assign arg->start_pxdp here because destroy_args() relies
it to release the corresponding page tables in failing path. For example,
the args->start_p4dp is going to be release if we fail to populate
args->start_pudp.

Ok. I will add WARN_ON() for each level of page table entries right after
they are assigned in v4.

>>      args->pudp = pud_alloc(args->mm, args->p4dp, args->vaddr);
>>      if (!args->pudp) {
>>          pr_err("Failed to allocate pud entries\n");
>>          ret = -ENOMEM;
>>          goto error;
>>      }
>>
>>      args->pmdp = pmd_alloc(args->mm, args->pudp, args->vaddr);
>>      if (!args->pmdp) {
>>          pr_err("Failed to allocate PMD entries\n");
>>          ret = -ENOMEM;
>>          goto error;
>>      }
>>
>>      args->start_pmdp = pmd_offset(args->pudp, 0UL);
>>      args->ptep = pte_alloc_map(args->mm, args->pmdp, args->vaddr);
>>      if (!args->ptep) {
>>          pr_err("Failed to allocate page table\n");
>>          ret = -ENOMEM;
>>          goto error;
>>      }
>>
>>      args->start_ptep = pmd_pgtable(READ_ONCE(*(args->pmdp)));
>>
>>>> +
>>>> +    /*
>>>> +     * Figure out the fixed addresses, which are all around the kernel
>>>> +     * symbol (@start_kernel). The corresponding PFNs might be invalid,
>>>> +     * but it's fine as the following tests won't access the pages.
>>>> +     */
>>>> +    phys = __pa_symbol(&start_kernel);
>>>> +    args->fixed_pgd_pfn = __phys_to_pfn(phys & PGDIR_MASK);
>>>> +    args->fixed_p4d_pfn = __phys_to_pfn(phys & P4D_MASK);
>>>> +    args->fixed_pud_pfn = __phys_to_pfn(phys & PUD_MASK);
>>>> +    args->fixed_pmd_pfn = __phys_to_pfn(phys & PMD_MASK);
>>>> +    args->fixed_pte_pfn = __phys_to_pfn(phys & PAGE_MASK);
>>>> +
>>>> +    /*
>>>> +     * Allocate (huge) pages because some of the tests need to access
>>>> +     * the data in the pages. The corresponding tests will be skipped
>>>> +     * if we fail to allocate (huge) pages.
>>>> +     */
>>>> +    if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
>>>> +        IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
>>>> +        has_transparent_hugepage()) {
>>>> +        page = alloc_pages(GFP_KERNEL | __GFP_NOWARN,
>>>> +                   HPAGE_PUD_SHIFT - PAGE_SHIFT);
>>>
>>> Please drop __GFP_NOWARN and instead use something like alloc_contig_pages()
>>> when required allocation order exceed (MAX_ORDER - 1). Else the test might
>>> not be able to execute on platform configurations, where PUD THP is enabled.
>>>
>>
>> Yes, It's correct that alloc_contig_pages() should be used here, depending
>> on CONFIG_CONTIG_ALLOC. Otherwise, alloc_pages(...__GFP_NOWARN...) is still
>> used as we're doing. This snippet will be changed like below in v4:
> 
> First 'order > (MAX_ORDER - 1)' needs to be established before calling into
> alloc_contig_pages() without __GFP_NOWARN and set a new flag indicating that
> there is contig page allocated. But if 'order <= (MAX_ORDER - 1)', then call
> alloc_pages(..) without  __GFP_NOWARN. There is no need to add  __GFP_NOWARN
> in any case. In case CONFIG_CONTIG_ALLOC is not available, directly return a
> NULL as that would have been the case with alloc_pages(...__GFP_NOWARN...) as
> well.
> 
> Symbol alloc_contig_pages() is not available outside CONFIG_CONTIG_ALLOC. So
> IS_ENABLED() construct will not work, unless there is an empty stub added in
> the header. Otherwise #ifdef CONFIG_CONTIG_ALLOC needs to be used instead.
> 
> Regardless please do test this on a x86 platform with PUD based THP in order
> to make sure every thing works as expected.
> 

Thanks, I will change the code accordingly in v4 and test it on x86
before posting it.

>>
>>      /*
>>       * Allocate (huge) pages because some of the tests need to access
>>       * the data in the pages. The corresponding tests will be skipped
>>       * if we fail to allocate (huge) pages.
>>       */
>>      if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
>>          IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
>>          IS_ENABLED(CONFIG_CONTIG_ALLOC)) &&
>>          has_transparent_hugepage()) {
>>          page = alloc_contig_pages((1 << (HPAGE_PUD_SHIFT - PAGE_SHIFT)),
>>                        GFP_KERNEL | __GFP_NOWARN,
>>                        first_online_node, NULL);
>>          if (page) {
>>              args->is_contiguous_pud_page = true;
>>              args->pud_pfn = page_to_pfn(page);
>>              args->pmd_pfn = args->pud_pfn;
>>              args->pte_pfn = args->pud_pfn;
>>              return 0;
>>          }
>>      }
>>
>>      if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
>>          IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
>>          has_transparent_hugepage()) {
>>          page = alloc_pages(GFP_KERNEL | __GFP_NOWARN,
>>                     HPAGE_PUD_SHIFT - PAGE_SHIFT);
>>          if (page) {
>>              args->is_contiguous_pud_page = false;
>>              args->pud_pfn = page_to_pfn(page);
>>              args->pmd_pfn = args->pud_pfn;
>>              args->pte_pfn = args->pud_pfn;
>>              return 0;
>>          }
>>      }
>>
>>      [... The logic to allocate PMD huge page or page is kept as of being]
> 
> IIRC it is also not guaranteed that PMD_SHIFT <= (MAX_ORDER - 1). Hence
> this same scheme should be followed for PMD level allocation as well.
> 

In theory, it's possible to have PMD_SHIFT <= (MAX_ORDER - 1) with misconfigured
kernel. I will apply the similar logic to PMD huge page in v4.

>>      [... The code to release the PUD huge page needs changes based on @args->is_contiguous_pud_page]
> 
> Right, a flag would be needed to call the appropriate free function.
> 

Yes. We need two falgs for PUD and PMD huge pages separately.

Thanks,
Gavin
Anshuman Khandual July 22, 2021, 7:08 a.m. UTC | #8
On 7/22/21 11:53 AM, Gavin Shan wrote:
> Hi Anshuman,
> 
> On 7/22/21 2:41 PM, Anshuman Khandual wrote:
>> On 7/21/21 3:50 PM, Gavin Shan wrote:
>>> On 7/21/21 3:44 PM, Anshuman Khandual wrote:
>>>> On 7/19/21 6:36 PM, Gavin Shan wrote:
>>>>> In debug_vm_pgtable(), there are many local variables introduced to
>>>>> track the needed information and they are passed to the functions for
>>>>> various test cases. It'd better to introduce a struct as place holder
>>>>> for these information. With it, what the functions for various test
>>>>> cases need is the struct, to simplify the code. It also makes code
>>>>> easier to be maintained.
>>>>>
>>>>> Besides, set_xxx_at() could access the data on the corresponding pages
>>>>> in the page table modifying tests. So the accessed pages in the tests
>>>>> should have been allocated from buddy. Otherwise, we're accessing pages
>>>>> that aren't owned by us. This causes issues like page flag corruption.
>>>>>
>>>>> This introduces "struct pgtable_debug_args". The struct is initialized
>>>>> and destroyed, but the information in the struct isn't used yet. They
>>>>> will be used in subsequent patches.
>>>>>
>>>>> Signed-off-by: Gavin Shan <gshan@redhat.com>
>>>>> ---
>>>>>    mm/debug_vm_pgtable.c | 197 +++++++++++++++++++++++++++++++++++++++++-
>>>>>    1 file changed, 196 insertions(+), 1 deletion(-)
>>>>>
>>>
>>> I saw you've finished the review on PATCH[v3 01/12] and PATCH[v3 02/12].
>>> I will wait to integrate your comments to v4 until you finish the review
>>> on all patches in v3 series.
>>>
>>>>> diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
>>>>> index 1c922691aa61..ea153ff40d23 100644
>>>>> --- a/mm/debug_vm_pgtable.c
>>>>> +++ b/mm/debug_vm_pgtable.c
>>>>> @@ -58,6 +58,36 @@
>>>>>    #define RANDOM_ORVALUE (GENMASK(BITS_PER_LONG - 1, 0) & ~ARCH_SKIP_MASK)
>>>>>    #define RANDOM_NZVALUE    GENMASK(7, 0)
>>>>>    +struct pgtable_debug_args {
>>>>> +    struct mm_struct    *mm;
>>>>> +    struct vm_area_struct    *vma;
>>>>> +
>>>>> +    pgd_t            *pgdp;
>>>>> +    p4d_t            *p4dp;
>>>>> +    pud_t            *pudp;
>>>>> +    pmd_t            *pmdp;
>>>>> +    pte_t            *ptep;
>>>>> +
>>>>> +    p4d_t            *start_p4dp;
>>>>> +    pud_t            *start_pudp;
>>>>> +    pmd_t            *start_pmdp;
>>>>> +    pgtable_t        start_ptep;
>>>>> +
>>>>> +    unsigned long        vaddr;
>>>>> +    pgprot_t        page_prot;
>>>>> +    pgprot_t        page_prot_none;
>>>>> +
>>>>> +    unsigned long        pud_pfn;
>>>>> +    unsigned long        pmd_pfn;
>>>>> +    unsigned long        pte_pfn;
>>>>> +
>>>>> +    unsigned long        fixed_pgd_pfn;
>>>>> +    unsigned long        fixed_p4d_pfn;
>>>>> +    unsigned long        fixed_pud_pfn;
>>>>> +    unsigned long        fixed_pmd_pfn;
>>>>> +    unsigned long        fixed_pte_pfn;
>>>>> +};
>>>>> +
>>>>>    static void __init pte_basic_tests(unsigned long pfn, int idx)
>>>>>    {
>>>>>        pgprot_t prot = protection_map[idx];
>>>>> @@ -955,8 +985,167 @@ static unsigned long __init get_random_vaddr(void)
>>>>>        return random_vaddr;
>>>>>    }
>>>>>    +static void __init destroy_args(struct pgtable_debug_args *args)
>>>>> +{
>>>>> +    struct page *page = NULL;
>>>>> +
>>>>> +    /* Free (huge) page */
>>>>> +    if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
>>>>> +        IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
>>>>> +        has_transparent_hugepage() &&
>>>>> +        args->pud_pfn != ULONG_MAX) {
>>>>> +        page = pfn_to_page(args->pud_pfn);
>>>>> +        __free_pages(page, HPAGE_PUD_SHIFT - PAGE_SHIFT);
>>>>> +    } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
>>>>> +           has_transparent_hugepage() &&
>>>>> +           args->pmd_pfn != ULONG_MAX) {
>>>>> +        page = pfn_to_page(args->pmd_pfn);
>>>>> +        __free_pages(page, HPAGE_PMD_ORDER);
>>>>> +    } else if (args->pte_pfn != ULONG_MAX) {
>>>>> +        page = pfn_to_page(args->pte_pfn);
>>>>> +        __free_pages(page, 0);
>>>>> +    }
>>>>> +
>>>>> +    /* Free page table */
>>>>> +    if (args->start_ptep) {
>>>>> +        pte_free(args->mm, args->start_ptep);
>>>>> +        mm_dec_nr_ptes(args->mm);
>>>>> +    }
>>>>> +
>>>>> +    if (args->start_pmdp) {
>>>>> +        pmd_free(args->mm, args->start_pmdp);
>>>>> +        mm_dec_nr_pmds(args->mm);
>>>>> +    }
>>>>> +
>>>>> +    if (args->start_pudp) {
>>>>> +        pud_free(args->mm, args->start_pudp);
>>>>> +        mm_dec_nr_puds(args->mm);
>>>>> +    }
>>>>> +
>>>>> +    if (args->start_p4dp)
>>>>> +        p4d_free(args->mm, args->p4dp);
>>>>> +
>>>>> +    /* Free vma and mm struct */
>>>>> +    if (args->vma)
>>>>> +        vm_area_free(args->vma);
>>>>> +    if (args->mm)
>>>>> +        mmdrop(args->mm);
>>>>> +}
>>>>> +
>>>>> +static int __init init_args(struct pgtable_debug_args *args)
>>>>> +{
>>>>> +    struct page *page = NULL;
>>>>> +    phys_addr_t phys;
>>>>> +    int ret = 0;
>>>>> +
>>>>> +    /* Initialize the debugging data */
>>>>> +    memset(args, 0, sizeof(*args));
>>>>> +    args->page_prot      = vm_get_page_prot(VMFLAGS);
>>>>> +    args->page_prot_none = __P000;
>>>>
>>>> Please preserve the existing comments before this assignment.
>>>>
>>>>           /*
>>>>            * __P000 (or even __S000) will help create page table entries with
>>>>            * PROT_NONE permission as required for pxx_protnone_tests().
>>>>            */
>>>>
>>>
>>> Sure. I will combine the comments in v4 as below:
>>>
>>>      /*
>>>       * Initialize the debugging arguments.
>>>       *
>>>       * __P000 (or even __S000) will help create page table entries with
>>>           * PROT_NONE permission as required for pxx_protnone_tests().
>>>           */
>>>
>>>
>>>>> +    args->pud_pfn        = ULONG_MAX;
>>>>> +    args->pmd_pfn        = ULONG_MAX;
>>>>> +    args->pte_pfn        = ULONG_MAX;
>>>>> +    args->fixed_pgd_pfn  = ULONG_MAX;
>>>>> +    args->fixed_p4d_pfn  = ULONG_MAX;
>>>>> +    args->fixed_pud_pfn  = ULONG_MAX;
>>>>> +    args->fixed_pmd_pfn  = ULONG_MAX;
>>>>> +    args->fixed_pte_pfn  = ULONG_MAX;
>>>>> +
>>>>> +    /* Allocate mm and vma */
>>>>> +    args->mm = mm_alloc();
>>>>> +    if (!args->mm) {
>>>>> +        pr_err("Failed to allocate mm struct\n");
>>>>> +        ret = -ENOMEM;
>>>>> +        goto error;
>>>>> +    }
>>>>> +
>>>>> +    args->vma = vm_area_alloc(args->mm);
>>>>> +    if (!args->vma) {
>>>>> +        pr_err("Failed to allocate vma\n");
>>>>> +        ret = -ENOMEM;
>>>>> +        goto error;
>>>>> +    }
>>>>> +
>>>>> +    /* Figure out the virtual address and allocate page table entries */
>>>>> +    args->vaddr = get_random_vaddr();
>>>>
>>>> Please group args->vaddr's init with page_prot and page_prot_none above.
>>>>
>>>
>>> Yes, It will make the code tidy. I'll move this line accordingly in v4,
>>> but the related comments will be dropped as the code is self-explanatory.
>>>
>>>          /* Allocate page table entries */
>>>
>>>>> +    args->pgdp = pgd_offset(args->mm, args->vaddr);
>>>>> +    args->p4dp = p4d_alloc(args->mm, args->pgdp, args->vaddr);
>>>>> +    args->pudp = args->p4dp ?
>>>>> +             pud_alloc(args->mm, args->p4dp, args->vaddr) : NULL;
>>>>> +    args->pmdp = args->pudp ?
>>>>> +             pmd_alloc(args->mm, args->pudp, args->vaddr) : NULL;
>>>>> +    args->ptep = args->pmdp ?
>>>>> +             pte_alloc_map(args->mm, args->pmdp, args->vaddr) : NULL;
>>>>> +    if (!args->ptep) {
>>>>> +        pr_err("Failed to allocate page table\n");
>>>>> +        ret = -ENOMEM;
>>>>> +        goto error;
>>>>> +    }
>>>>
>>>> Why not just assert that all page table level pointers are allocated
>>>> successfully, otherwise bail out the test completely. Something like
>>>> this at each level.
>>>>
>>>>      if (!args->p4dp) {
>>>>          pr_err("Failed to allocate page table\n");
>>>>          ret = -ENOMEM;
>>>>          goto error;
>>>>      }
>>>>
>>>> Is there any value in proceeding with the test when some page table
>>>> pointers have not been allocated. Also individual tests do not cross
>>>> check these pointers. Also asserting successful allocations will
>>>> make the freeing path simpler, as I had mentioned earlier.
>>>>
>>>
>>> There is no tests will be carried out if we fail to allocate any level
>>> of page table entries. For other questions, please refer below response.
>>> In summary, this snippet needs to be combined with next snippet, as below.
>>>
>>>>> +
>>>>> +    /*
>>>>> +     * The above page table entries will be modified. Lets save the
>>>>> +     * page table entries so that they can be released when the tests
>>>>> +     * are completed.
>>>>> +     */
>>>>> +    args->start_p4dp = p4d_offset(args->pgdp, 0UL);
>>>>> +    args->start_pudp = pud_offset(args->p4dp, 0UL);
>>>>> +    args->start_pmdp = pmd_offset(args->pudp, 0UL);
>>>>> +    args->start_ptep = pmd_pgtable(READ_ONCE(*(args->pmdp)));
>>>>
>>>> If the above page table pointers have been validated to be allocated
>>>> successfully, we could add these here.
>>>>
>>>>      WARN_ON(!args->start_p4dp)
>>>>      WARN_ON(!args->start_pudp)
>>>>      WARN_ON(!args->start_pmdp)
>>>>      WARN_ON(!args->start_ptep)
>>>>
>>>> Afterwards all those if (args->start_pxdp) checks in the freeing path
>>>> will not be required anymore.
>>>>
>>>
>>> The check on @args->start_pxdp is still needed in destroy_args() for
>>> couple of cases: (1) destroy_args() is called on failing to allocate
>>> @args->mm or @args->vma. That time, no page table entries are allocated.
>>> (2) It's possible to fail allocating current level of page table entries
>>> even the previous levels of page table entries are allocated successfully.
>>
>> This makes sense as destroy_args() is getting called if any of these
>> allocations fails during init_args(). Did not realize that earlier.
>>
>>>
>>> So Lets change these (above) two snippets as below in v4:
>>>
>>>      /*
>>>       * Allocate page table entries. The allocated page table entries
>>>       * will be modified in the tests. Lets save the page table entries
>>>       * so that they can be released when the tests are completed.
>>>       */
>>>      args->pgdp = pgd_offset(args->mm, args->vaddr);
>>>      args->p4dp = p4d_alloc(args->mm, args->pgdp, args->vaddr);
>>>      if (!args->p4dp) {
>>>          pr_err("Failed to allocate p4d entries\n");
>>>          ret = -ENOMEM;
>>>          goto error;
>>>      }
>>>
>>>      args->start_p4dp = p4d_offset(args->pgdp, 0UL);
>>
>> Dont bring the arg->start_pxdp assignments here. If all page table level
>> pointer allocations succeed, they all get assigned together like we have
>> right now. Although a sanity check afterwards like the following, might
>> still be better.
>>
>> WARN_ON(!args->start_p4dp)
>> WARN_ON(!args->start_pudp)
>> WARN_ON(!args->start_pmdp)
>> WARN_ON(!args->start_ptep)
>>
> 
> We have to assign arg->start_pxdp here because destroy_args() relies
> it to release the corresponding page tables in failing path. For example,
> the args->start_p4dp is going to be release if we fail to populate
> args->start_pudp.

Okay.

> 
> Ok. I will add WARN_ON() for each level of page table entries right after
> they are assigned in v4.
> 
>>>      args->pudp = pud_alloc(args->mm, args->p4dp, args->vaddr);
>>>      if (!args->pudp) {
>>>          pr_err("Failed to allocate pud entries\n");
>>>          ret = -ENOMEM;
>>>          goto error;
>>>      }
>>>
>>>      args->pmdp = pmd_alloc(args->mm, args->pudp, args->vaddr);
>>>      if (!args->pmdp) {
>>>          pr_err("Failed to allocate PMD entries\n");
>>>          ret = -ENOMEM;
>>>          goto error;
>>>      }
>>>
>>>      args->start_pmdp = pmd_offset(args->pudp, 0UL);
>>>      args->ptep = pte_alloc_map(args->mm, args->pmdp, args->vaddr);
>>>      if (!args->ptep) {
>>>          pr_err("Failed to allocate page table\n");
>>>          ret = -ENOMEM;
>>>          goto error;
>>>      }
>>>
>>>      args->start_ptep = pmd_pgtable(READ_ONCE(*(args->pmdp)));
>>>
>>>>> +
>>>>> +    /*
>>>>> +     * Figure out the fixed addresses, which are all around the kernel
>>>>> +     * symbol (@start_kernel). The corresponding PFNs might be invalid,
>>>>> +     * but it's fine as the following tests won't access the pages.
>>>>> +     */
>>>>> +    phys = __pa_symbol(&start_kernel);
>>>>> +    args->fixed_pgd_pfn = __phys_to_pfn(phys & PGDIR_MASK);
>>>>> +    args->fixed_p4d_pfn = __phys_to_pfn(phys & P4D_MASK);
>>>>> +    args->fixed_pud_pfn = __phys_to_pfn(phys & PUD_MASK);
>>>>> +    args->fixed_pmd_pfn = __phys_to_pfn(phys & PMD_MASK);
>>>>> +    args->fixed_pte_pfn = __phys_to_pfn(phys & PAGE_MASK);
>>>>> +
>>>>> +    /*
>>>>> +     * Allocate (huge) pages because some of the tests need to access
>>>>> +     * the data in the pages. The corresponding tests will be skipped
>>>>> +     * if we fail to allocate (huge) pages.
>>>>> +     */
>>>>> +    if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
>>>>> +        IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
>>>>> +        has_transparent_hugepage()) {
>>>>> +        page = alloc_pages(GFP_KERNEL | __GFP_NOWARN,
>>>>> +                   HPAGE_PUD_SHIFT - PAGE_SHIFT);
>>>>
>>>> Please drop __GFP_NOWARN and instead use something like alloc_contig_pages()
>>>> when required allocation order exceed (MAX_ORDER - 1). Else the test might
>>>> not be able to execute on platform configurations, where PUD THP is enabled.
>>>>
>>>
>>> Yes, It's correct that alloc_contig_pages() should be used here, depending
>>> on CONFIG_CONTIG_ALLOC. Otherwise, alloc_pages(...__GFP_NOWARN...) is still
>>> used as we're doing. This snippet will be changed like below in v4:
>>
>> First 'order > (MAX_ORDER - 1)' needs to be established before calling into
>> alloc_contig_pages() without __GFP_NOWARN and set a new flag indicating that
>> there is contig page allocated. But if 'order <= (MAX_ORDER - 1)', then call
>> alloc_pages(..) without  __GFP_NOWARN. There is no need to add  __GFP_NOWARN
>> in any case. In case CONFIG_CONTIG_ALLOC is not available, directly return a
>> NULL as that would have been the case with alloc_pages(...__GFP_NOWARN...) as
>> well.
>>
>> Symbol alloc_contig_pages() is not available outside CONFIG_CONTIG_ALLOC. So
>> IS_ENABLED() construct will not work, unless there is an empty stub added in
>> the header. Otherwise #ifdef CONFIG_CONTIG_ALLOC needs to be used instead.
>>
>> Regardless please do test this on a x86 platform with PUD based THP in order
>> to make sure every thing works as expected.
>>
> 
> Thanks, I will change the code accordingly in v4 and test it on x86
> before posting it.
> 
>>>
>>>      /*
>>>       * Allocate (huge) pages because some of the tests need to access
>>>       * the data in the pages. The corresponding tests will be skipped
>>>       * if we fail to allocate (huge) pages.
>>>       */
>>>      if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
>>>          IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
>>>          IS_ENABLED(CONFIG_CONTIG_ALLOC)) &&
>>>          has_transparent_hugepage()) {
>>>          page = alloc_contig_pages((1 << (HPAGE_PUD_SHIFT - PAGE_SHIFT)),
>>>                        GFP_KERNEL | __GFP_NOWARN,
>>>                        first_online_node, NULL);
>>>          if (page) {
>>>              args->is_contiguous_pud_page = true;
>>>              args->pud_pfn = page_to_pfn(page);
>>>              args->pmd_pfn = args->pud_pfn;
>>>              args->pte_pfn = args->pud_pfn;
>>>              return 0;
>>>          }
>>>      }
>>>
>>>      if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
>>>          IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
>>>          has_transparent_hugepage()) {
>>>          page = alloc_pages(GFP_KERNEL | __GFP_NOWARN,
>>>                     HPAGE_PUD_SHIFT - PAGE_SHIFT);
>>>          if (page) {
>>>              args->is_contiguous_pud_page = false;
>>>              args->pud_pfn = page_to_pfn(page);
>>>              args->pmd_pfn = args->pud_pfn;
>>>              args->pte_pfn = args->pud_pfn;
>>>              return 0;
>>>          }
>>>      }
>>>
>>>      [... The logic to allocate PMD huge page or page is kept as of being]
>>
>> IIRC it is also not guaranteed that PMD_SHIFT <= (MAX_ORDER - 1). Hence
>> this same scheme should be followed for PMD level allocation as well.
>>
> 
> In theory, it's possible to have PMD_SHIFT <= (MAX_ORDER - 1) with misconfigured
> kernel. I will apply the similar logic to PMD huge page in v4.
> 
>>>      [... The code to release the PUD huge page needs changes based on @args->is_contiguous_pud_page]
>>
>> Right, a flag would be needed to call the appropriate free function.
>>
> 
> Yes. We need two falgs for PUD and PMD huge pages separately.

A single flag should be enough, the order would be dependent on
whether args->pud_pfn or args->pmd_pfn is valid.
Gavin Shan July 23, 2021, 12:43 a.m. UTC | #9
Hi Anshuman,

On 7/22/21 5:08 PM, Anshuman Khandual wrote:
> On 7/22/21 11:53 AM, Gavin Shan wrote:
>> On 7/22/21 2:41 PM, Anshuman Khandual wrote:
>>> On 7/21/21 3:50 PM, Gavin Shan wrote:
>>>> On 7/21/21 3:44 PM, Anshuman Khandual wrote:
>>>>> On 7/19/21 6:36 PM, Gavin Shan wrote:
>>>>>> In debug_vm_pgtable(), there are many local variables introduced to
>>>>>> track the needed information and they are passed to the functions for
>>>>>> various test cases. It'd better to introduce a struct as place holder
>>>>>> for these information. With it, what the functions for various test
>>>>>> cases need is the struct, to simplify the code. It also makes code
>>>>>> easier to be maintained.
>>>>>>
>>>>>> Besides, set_xxx_at() could access the data on the corresponding pages
>>>>>> in the page table modifying tests. So the accessed pages in the tests
>>>>>> should have been allocated from buddy. Otherwise, we're accessing pages
>>>>>> that aren't owned by us. This causes issues like page flag corruption.
>>>>>>
>>>>>> This introduces "struct pgtable_debug_args". The struct is initialized
>>>>>> and destroyed, but the information in the struct isn't used yet. They
>>>>>> will be used in subsequent patches.
>>>>>>
>>>>>> Signed-off-by: Gavin Shan <gshan@redhat.com>
>>>>>> ---
>>>>>>     mm/debug_vm_pgtable.c | 197 +++++++++++++++++++++++++++++++++++++++++-
>>>>>>     1 file changed, 196 insertions(+), 1 deletion(-)
>>>>>>

[...]

>>>
>>> IIRC it is also not guaranteed that PMD_SHIFT <= (MAX_ORDER - 1). Hence
>>> this same scheme should be followed for PMD level allocation as well.
>>>
>>
>> In theory, it's possible to have PMD_SHIFT <= (MAX_ORDER - 1) with misconfigured
>> kernel. I will apply the similar logic to PMD huge page in v4.
>>
>>>>       [... The code to release the PUD huge page needs changes based on @args->is_contiguous_pud_page]
>>>
>>> Right, a flag would be needed to call the appropriate free function.
>>>
>>
>> Yes. We need two falgs for PUD and PMD huge pages separately.
> 
> A single flag should be enough, the order would be dependent on
> whether args->pud_pfn or args->pmd_pfn is valid.
> 

Yes, it's correct that one flag is enough as we're sharing the PUD
or PMD huge page.

Thanks,
Gavin
diff mbox series

Patch

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 1c922691aa61..ea153ff40d23 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -58,6 +58,36 @@ 
 #define RANDOM_ORVALUE (GENMASK(BITS_PER_LONG - 1, 0) & ~ARCH_SKIP_MASK)
 #define RANDOM_NZVALUE	GENMASK(7, 0)
 
+struct pgtable_debug_args {
+	struct mm_struct	*mm;
+	struct vm_area_struct	*vma;
+
+	pgd_t			*pgdp;
+	p4d_t			*p4dp;
+	pud_t			*pudp;
+	pmd_t			*pmdp;
+	pte_t			*ptep;
+
+	p4d_t			*start_p4dp;
+	pud_t			*start_pudp;
+	pmd_t			*start_pmdp;
+	pgtable_t		start_ptep;
+
+	unsigned long		vaddr;
+	pgprot_t		page_prot;
+	pgprot_t		page_prot_none;
+
+	unsigned long		pud_pfn;
+	unsigned long		pmd_pfn;
+	unsigned long		pte_pfn;
+
+	unsigned long		fixed_pgd_pfn;
+	unsigned long		fixed_p4d_pfn;
+	unsigned long		fixed_pud_pfn;
+	unsigned long		fixed_pmd_pfn;
+	unsigned long		fixed_pte_pfn;
+};
+
 static void __init pte_basic_tests(unsigned long pfn, int idx)
 {
 	pgprot_t prot = protection_map[idx];
@@ -955,8 +985,167 @@  static unsigned long __init get_random_vaddr(void)
 	return random_vaddr;
 }
 
+static void __init destroy_args(struct pgtable_debug_args *args)
+{
+	struct page *page = NULL;
+
+	/* Free (huge) page */
+	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+	    IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
+	    has_transparent_hugepage() &&
+	    args->pud_pfn != ULONG_MAX) {
+		page = pfn_to_page(args->pud_pfn);
+		__free_pages(page, HPAGE_PUD_SHIFT - PAGE_SHIFT);
+	} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+		   has_transparent_hugepage() &&
+		   args->pmd_pfn != ULONG_MAX) {
+		page = pfn_to_page(args->pmd_pfn);
+		__free_pages(page, HPAGE_PMD_ORDER);
+	} else if (args->pte_pfn != ULONG_MAX) {
+		page = pfn_to_page(args->pte_pfn);
+		__free_pages(page, 0);
+	}
+
+	/* Free page table */
+	if (args->start_ptep) {
+		pte_free(args->mm, args->start_ptep);
+		mm_dec_nr_ptes(args->mm);
+	}
+
+	if (args->start_pmdp) {
+		pmd_free(args->mm, args->start_pmdp);
+		mm_dec_nr_pmds(args->mm);
+	}
+
+	if (args->start_pudp) {
+		pud_free(args->mm, args->start_pudp);
+		mm_dec_nr_puds(args->mm);
+	}
+
+	if (args->start_p4dp)
+		p4d_free(args->mm, args->p4dp);
+
+	/* Free vma and mm struct */
+	if (args->vma)
+		vm_area_free(args->vma);
+	if (args->mm)
+		mmdrop(args->mm);
+}
+
+static int __init init_args(struct pgtable_debug_args *args)
+{
+	struct page *page = NULL;
+	phys_addr_t phys;
+	int ret = 0;
+
+	/* Initialize the debugging data */
+	memset(args, 0, sizeof(*args));
+	args->page_prot      = vm_get_page_prot(VMFLAGS);
+	args->page_prot_none = __P000;
+	args->pud_pfn        = ULONG_MAX;
+	args->pmd_pfn        = ULONG_MAX;
+	args->pte_pfn        = ULONG_MAX;
+	args->fixed_pgd_pfn  = ULONG_MAX;
+	args->fixed_p4d_pfn  = ULONG_MAX;
+	args->fixed_pud_pfn  = ULONG_MAX;
+	args->fixed_pmd_pfn  = ULONG_MAX;
+	args->fixed_pte_pfn  = ULONG_MAX;
+
+	/* Allocate mm and vma */
+	args->mm = mm_alloc();
+	if (!args->mm) {
+		pr_err("Failed to allocate mm struct\n");
+		ret = -ENOMEM;
+		goto error;
+	}
+
+	args->vma = vm_area_alloc(args->mm);
+	if (!args->vma) {
+		pr_err("Failed to allocate vma\n");
+		ret = -ENOMEM;
+		goto error;
+	}
+
+	/* Figure out the virtual address and allocate page table entries */
+	args->vaddr = get_random_vaddr();
+	args->pgdp = pgd_offset(args->mm, args->vaddr);
+	args->p4dp = p4d_alloc(args->mm, args->pgdp, args->vaddr);
+	args->pudp = args->p4dp ?
+		     pud_alloc(args->mm, args->p4dp, args->vaddr) : NULL;
+	args->pmdp = args->pudp ?
+		     pmd_alloc(args->mm, args->pudp, args->vaddr) : NULL;
+	args->ptep = args->pmdp ?
+		     pte_alloc_map(args->mm, args->pmdp, args->vaddr) : NULL;
+	if (!args->ptep) {
+		pr_err("Failed to allocate page table\n");
+		ret = -ENOMEM;
+		goto error;
+	}
+
+	/*
+	 * The above page table entries will be modified. Lets save the
+	 * page table entries so that they can be released when the tests
+	 * are completed.
+	 */
+	args->start_p4dp = p4d_offset(args->pgdp, 0UL);
+	args->start_pudp = pud_offset(args->p4dp, 0UL);
+	args->start_pmdp = pmd_offset(args->pudp, 0UL);
+	args->start_ptep = pmd_pgtable(READ_ONCE(*(args->pmdp)));
+
+	/*
+	 * Figure out the fixed addresses, which are all around the kernel
+	 * symbol (@start_kernel). The corresponding PFNs might be invalid,
+	 * but it's fine as the following tests won't access the pages.
+	 */
+	phys = __pa_symbol(&start_kernel);
+	args->fixed_pgd_pfn = __phys_to_pfn(phys & PGDIR_MASK);
+	args->fixed_p4d_pfn = __phys_to_pfn(phys & P4D_MASK);
+	args->fixed_pud_pfn = __phys_to_pfn(phys & PUD_MASK);
+	args->fixed_pmd_pfn = __phys_to_pfn(phys & PMD_MASK);
+	args->fixed_pte_pfn = __phys_to_pfn(phys & PAGE_MASK);
+
+	/*
+	 * Allocate (huge) pages because some of the tests need to access
+	 * the data in the pages. The corresponding tests will be skipped
+	 * if we fail to allocate (huge) pages.
+	 */
+	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+	    IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
+	    has_transparent_hugepage()) {
+		page = alloc_pages(GFP_KERNEL | __GFP_NOWARN,
+				   HPAGE_PUD_SHIFT - PAGE_SHIFT);
+		if (page) {
+			args->pud_pfn = page_to_pfn(page);
+			args->pmd_pfn = args->pud_pfn;
+			args->pte_pfn = args->pud_pfn;
+			return 0;
+		}
+	}
+
+	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+	    has_transparent_hugepage()) {
+		page = alloc_pages(GFP_KERNEL | __GFP_NOWARN, HPAGE_PMD_ORDER);
+		if (page) {
+			args->pmd_pfn = page_to_pfn(page);
+			args->pte_pfn = args->pmd_pfn;
+			return 0;
+		}
+	}
+
+	page = alloc_pages(GFP_KERNEL, 0);
+	if (page)
+		args->pte_pfn = page_to_pfn(page);
+
+	return 0;
+
+error:
+	destroy_args(args);
+	return ret;
+}
+
 static int __init debug_vm_pgtable(void)
 {
+	struct pgtable_debug_args args;
 	struct vm_area_struct *vma;
 	struct mm_struct *mm;
 	pgd_t *pgdp;
@@ -970,9 +1159,13 @@  static int __init debug_vm_pgtable(void)
 	unsigned long vaddr, pte_aligned, pmd_aligned;
 	unsigned long pud_aligned, p4d_aligned, pgd_aligned;
 	spinlock_t *ptl = NULL;
-	int idx;
+	int idx, ret;
 
 	pr_info("Validating architecture page table helpers\n");
+	ret = init_args(&args);
+	if (ret)
+		return ret;
+
 	prot = vm_get_page_prot(VMFLAGS);
 	vaddr = get_random_vaddr();
 	mm = mm_alloc();
@@ -1127,6 +1320,8 @@  static int __init debug_vm_pgtable(void)
 	mm_dec_nr_pmds(mm);
 	mm_dec_nr_ptes(mm);
 	mmdrop(mm);
+
+	destroy_args(&args);
 	return 0;
 }
 late_initcall(debug_vm_pgtable);