diff mbox series

[v3,3/6] arm64: mm: make __create_pgd_mapping() and helpers non-void

Message ID 20250304222018.615808-4-yang@os.amperecomputing.com (mailing list archive)
State New, archived
Headers show
Series arm64: support FEAT_BBM level 2 and large block mapping when rodata=full | expand

Commit Message

Yang Shi March 4, 2025, 10:19 p.m. UTC
A later patch will enhance __create_pgd_mapping() and related helpers
to split the kernel linear mapping; this requires them to have a return
value.  So make __create_pgd_mapping() and its helpers non-void functions.

And move the BUG_ON() out of the page table alloc helper since failing
to split the kernel linear mapping is not fatal and can be handled by the
callers in a later patch.  Have a BUG_ON() after
__create_pgd_mapping_locked() returns to keep the current callers'
behavior intact.

Suggested-by: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Yang Shi <yang@os.amperecomputing.com>
---
 arch/arm64/mm/mmu.c | 127 ++++++++++++++++++++++++++++++--------------
 1 file changed, 86 insertions(+), 41 deletions(-)

Comments

Ryan Roberts March 14, 2025, 11:51 a.m. UTC | #1
On 04/03/2025 22:19, Yang Shi wrote:
> The later patch will enhance __create_pgd_mapping() and related helpers
> to split kernel linear mapping, it requires have return value.  So make
> __create_pgd_mapping() and helpers non-void functions.
> 
> And move the BUG_ON() out of page table alloc helper since failing
> splitting kernel linear mapping is not fatal and can be handled by the
> callers in the later patch.  Have BUG_ON() after
> __create_pgd_mapping_locked() returns to keep the current callers behavior
> intact.
> 
> Suggested-by: Ryan Roberts <ryan.roberts@arm.com>
> Signed-off-by: Yang Shi <yang@os.amperecomputing.com>
> ---
>  arch/arm64/mm/mmu.c | 127 ++++++++++++++++++++++++++++++--------------
>  1 file changed, 86 insertions(+), 41 deletions(-)
> 
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index b4df5bc5b1b8..dccf0877285b 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -189,11 +189,11 @@ static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end,
>  	} while (ptep++, addr += PAGE_SIZE, addr != end);
>  }
>  
> -static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
> -				unsigned long end, phys_addr_t phys,
> -				pgprot_t prot,
> -				phys_addr_t (*pgtable_alloc)(int),
> -				int flags)
> +static int alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
> +			       unsigned long end, phys_addr_t phys,
> +			       pgprot_t prot,
> +			       phys_addr_t (*pgtable_alloc)(int),
> +			       int flags)
>  {
>  	unsigned long next;
>  	pmd_t pmd = READ_ONCE(*pmdp);
> @@ -208,6 +208,8 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
>  			pmdval |= PMD_TABLE_PXN;
>  		BUG_ON(!pgtable_alloc);
>  		pte_phys = pgtable_alloc(PAGE_SHIFT);
> +		if (!pte_phys)
> +			return -ENOMEM;

nit: personally I'd prefer to see a "goto out" and funnel all to a single return
statement. You do that in some functions (via loop break), but would be cleaner
if consistent.

If pgtable_alloc() is modified to return int (see my comment at the bottom),
this becomes:

ret = pgtable_alloc(PAGE_SHIFT, &pte_phys);
if (ret)
	goto out;


>  		ptep = pte_set_fixmap(pte_phys);
>  		init_clear_pgtable(ptep);
>  		ptep += pte_index(addr);
> @@ -239,13 +241,16 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
>  	 * walker.
>  	 */
>  	pte_clear_fixmap();
> +
> +	return 0;
>  }
>  
> -static void init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
> -		     phys_addr_t phys, pgprot_t prot,
> -		     phys_addr_t (*pgtable_alloc)(int), int flags)
> +static int init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
> +		    phys_addr_t phys, pgprot_t prot,
> +		    phys_addr_t (*pgtable_alloc)(int), int flags)
>  {
>  	unsigned long next;
> +	int ret = 0;
>  
>  	do {
>  		pmd_t old_pmd = READ_ONCE(*pmdp);
> @@ -264,22 +269,27 @@ static void init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
>  			BUG_ON(!pgattr_change_is_safe(pmd_val(old_pmd),
>  						      READ_ONCE(pmd_val(*pmdp))));
>  		} else {
> -			alloc_init_cont_pte(pmdp, addr, next, phys, prot,
> +			ret = alloc_init_cont_pte(pmdp, addr, next, phys, prot,
>  					    pgtable_alloc, flags);
> +			if (ret)
> +				break;
>  
>  			BUG_ON(pmd_val(old_pmd) != 0 &&
>  			       pmd_val(old_pmd) != READ_ONCE(pmd_val(*pmdp)));
>  		}
>  		phys += next - addr;
>  	} while (pmdp++, addr = next, addr != end);
> +
> +	return ret;
>  }
>  
> -static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
> -				unsigned long end, phys_addr_t phys,
> -				pgprot_t prot,
> -				phys_addr_t (*pgtable_alloc)(int), int flags)
> +static int alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
> +			       unsigned long end, phys_addr_t phys,
> +			       pgprot_t prot,
> +			       phys_addr_t (*pgtable_alloc)(int), int flags)
>  {
>  	unsigned long next;
> +	int ret = 0;
>  	pud_t pud = READ_ONCE(*pudp);
>  	pmd_t *pmdp;
>  
> @@ -295,6 +305,8 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
>  			pudval |= PUD_TABLE_PXN;
>  		BUG_ON(!pgtable_alloc);
>  		pmd_phys = pgtable_alloc(PMD_SHIFT);
> +		if (!pmd_phys)
> +			return -ENOMEM;
>  		pmdp = pmd_set_fixmap(pmd_phys);
>  		init_clear_pgtable(pmdp);
>  		pmdp += pmd_index(addr);
> @@ -314,21 +326,26 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
>  		    (flags & NO_CONT_MAPPINGS) == 0)
>  			__prot = __pgprot(pgprot_val(prot) | PTE_CONT);
>  
> -		init_pmd(pmdp, addr, next, phys, __prot, pgtable_alloc, flags);
> +		ret = init_pmd(pmdp, addr, next, phys, __prot, pgtable_alloc, flags);
> +		if (ret)
> +			break;
>  
>  		pmdp += pmd_index(next) - pmd_index(addr);
>  		phys += next - addr;
>  	} while (addr = next, addr != end);
>  
>  	pmd_clear_fixmap();
> +
> +	return ret;
>  }
>  
> -static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
> -			   phys_addr_t phys, pgprot_t prot,
> -			   phys_addr_t (*pgtable_alloc)(int),
> -			   int flags)
> +static int alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
> +			  phys_addr_t phys, pgprot_t prot,
> +			  phys_addr_t (*pgtable_alloc)(int),
> +			  int flags)
>  {
>  	unsigned long next;
> +	int ret = 0;
>  	p4d_t p4d = READ_ONCE(*p4dp);
>  	pud_t *pudp;
>  
> @@ -340,6 +357,8 @@ static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
>  			p4dval |= P4D_TABLE_PXN;
>  		BUG_ON(!pgtable_alloc);
>  		pud_phys = pgtable_alloc(PUD_SHIFT);
> +		if (!pud_phys)
> +			return -ENOMEM;
>  		pudp = pud_set_fixmap(pud_phys);
>  		init_clear_pgtable(pudp);
>  		pudp += pud_index(addr);
> @@ -369,8 +388,10 @@ static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
>  			BUG_ON(!pgattr_change_is_safe(pud_val(old_pud),
>  						      READ_ONCE(pud_val(*pudp))));
>  		} else {
> -			alloc_init_cont_pmd(pudp, addr, next, phys, prot,
> +			ret = alloc_init_cont_pmd(pudp, addr, next, phys, prot,
>  					    pgtable_alloc, flags);
> +			if (ret)
> +				break;
>  
>  			BUG_ON(pud_val(old_pud) != 0 &&
>  			       pud_val(old_pud) != READ_ONCE(pud_val(*pudp)));
> @@ -379,14 +400,17 @@ static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
>  	} while (pudp++, addr = next, addr != end);
>  
>  	pud_clear_fixmap();
> +
> +	return ret;
>  }
>  
> -static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
> -			   phys_addr_t phys, pgprot_t prot,
> -			   phys_addr_t (*pgtable_alloc)(int),
> -			   int flags)
> +static int alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
> +			  phys_addr_t phys, pgprot_t prot,
> +			  phys_addr_t (*pgtable_alloc)(int),
> +			  int flags)
>  {
>  	unsigned long next;
> +	int ret = 0;
>  	pgd_t pgd = READ_ONCE(*pgdp);
>  	p4d_t *p4dp;
>  
> @@ -398,6 +422,8 @@ static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
>  			pgdval |= PGD_TABLE_PXN;
>  		BUG_ON(!pgtable_alloc);
>  		p4d_phys = pgtable_alloc(P4D_SHIFT);
> +		if (!p4d_phys)
> +			return -ENOMEM;
>  		p4dp = p4d_set_fixmap(p4d_phys);
>  		init_clear_pgtable(p4dp);
>  		p4dp += p4d_index(addr);
> @@ -412,8 +438,10 @@ static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
>  
>  		next = p4d_addr_end(addr, end);
>  
> -		alloc_init_pud(p4dp, addr, next, phys, prot,
> +		ret = alloc_init_pud(p4dp, addr, next, phys, prot,
>  			       pgtable_alloc, flags);
> +		if (ret)
> +			break;
>  
>  		BUG_ON(p4d_val(old_p4d) != 0 &&
>  		       p4d_val(old_p4d) != READ_ONCE(p4d_val(*p4dp)));
> @@ -422,23 +450,26 @@ static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
>  	} while (p4dp++, addr = next, addr != end);
>  
>  	p4d_clear_fixmap();
> +
> +	return ret;
>  }
>  
> -static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
> -					unsigned long virt, phys_addr_t size,
> -					pgprot_t prot,
> -					phys_addr_t (*pgtable_alloc)(int),
> -					int flags)
> +static int __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
> +				       unsigned long virt, phys_addr_t size,
> +				       pgprot_t prot,
> +				       phys_addr_t (*pgtable_alloc)(int),
> +				       int flags)
>  {
>  	unsigned long addr, end, next;
>  	pgd_t *pgdp = pgd_offset_pgd(pgdir, virt);
> +	int ret = 0;
>  
>  	/*
>  	 * If the virtual and physical address don't have the same offset
>  	 * within a page, we cannot map the region as the caller expects.
>  	 */
>  	if (WARN_ON((phys ^ virt) & ~PAGE_MASK))
> -		return;
> +		return -EINVAL;
>  
>  	phys &= PAGE_MASK;
>  	addr = virt & PAGE_MASK;
> @@ -446,29 +477,38 @@ static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
>  
>  	do {
>  		next = pgd_addr_end(addr, end);
> -		alloc_init_p4d(pgdp, addr, next, phys, prot, pgtable_alloc,
> +		ret = alloc_init_p4d(pgdp, addr, next, phys, prot, pgtable_alloc,
>  			       flags);
> +		if (ret)
> +			break;
>  		phys += next - addr;
>  	} while (pgdp++, addr = next, addr != end);
> +
> +	return ret;
>  }
>  
> -static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
> -				 unsigned long virt, phys_addr_t size,
> -				 pgprot_t prot,
> -				 phys_addr_t (*pgtable_alloc)(int),
> -				 int flags)
> +static int __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
> +				unsigned long virt, phys_addr_t size,
> +				pgprot_t prot,
> +				phys_addr_t (*pgtable_alloc)(int),
> +				int flags)
>  {
> +	int ret;
> +
>  	mutex_lock(&fixmap_lock);
> -	__create_pgd_mapping_locked(pgdir, phys, virt, size, prot,
> +	ret = __create_pgd_mapping_locked(pgdir, phys, virt, size, prot,
>  				    pgtable_alloc, flags);
> +	BUG_ON(ret);

This function now returns an error, but also BUGs on ret!=0. For this patch, I'd
suggest keeping this function as void.

But I believe there is a pre-existing bug in arch_add_memory(). That's called at
runtime so if __create_pgd_mapping() fails and BUGs, it will take down a running
system.

With this foundational patch, we can fix that with an additional patch to pass
along the error code instead of BUGing in that case. arch_add_memory() would
need to unwind whatever __create_pgd_mapping() managed to do before the memory
allocation failure (presumably unmapping and freeing any allocated tables). I'm
happy to do this as a follow up patch.

>  	mutex_unlock(&fixmap_lock);
> +
> +	return ret;
>  }
>  
>  #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
>  extern __alias(__create_pgd_mapping_locked)
> -void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,
> -			     phys_addr_t size, pgprot_t prot,
> -			     phys_addr_t (*pgtable_alloc)(int), int flags);
> +int create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,
> +			    phys_addr_t size, pgprot_t prot,
> +			    phys_addr_t (*pgtable_alloc)(int), int flags);

create_kpti_ng_temp_pgd() now returns error instead of BUGing on allocation
failure, but I don't see a change to handle that error. You'll want to update
__kpti_install_ng_mappings() to BUG on error.

>  #endif
>  
>  static phys_addr_t __pgd_pgtable_alloc(int shift)
> @@ -476,13 +516,17 @@ static phys_addr_t __pgd_pgtable_alloc(int shift)
>  	/* Page is zeroed by init_clear_pgtable() so don't duplicate effort. */
>  	void *ptr = (void *)__get_free_page(GFP_PGTABLE_KERNEL & ~__GFP_ZERO);
>  
> -	BUG_ON(!ptr);
> +	if (!ptr)
> +		return 0;

0 is a valid (though unlikely) physical address. I guess you could technically
encode like ERR_PTR(), but since you are returning phys_addr_t and not a
pointer, then perhaps it will be clearer to make this return int and accept a
pointer to a phys_addr_t, which it will populate on success?

> +
>  	return __pa(ptr);
>  }
>  
>  static phys_addr_t pgd_pgtable_alloc(int shift)
>  {
>  	phys_addr_t pa = __pgd_pgtable_alloc(shift);
> +	if (!pa)
> +		goto out;

This would obviously need to be fixed up as per above.

>  	struct ptdesc *ptdesc = page_ptdesc(phys_to_page(pa));
>  
>  	/*
> @@ -498,6 +542,7 @@ static phys_addr_t pgd_pgtable_alloc(int shift)
>  	else if (shift == PMD_SHIFT)
>  		BUG_ON(!pagetable_pmd_ctor(ptdesc));
>  
> +out:
>  	return pa;
>  }
>  

You have left early_pgtable_alloc() to panic() on allocation failure. Given we
can now unwind the stack with error code, I think it would be more consistent to
also allow early_pgtable_alloc() to return error.

Thanks,
Ryan
Yang Shi March 17, 2025, 5:53 p.m. UTC | #2
On 3/14/25 4:51 AM, Ryan Roberts wrote:
> On 04/03/2025 22:19, Yang Shi wrote:
>> The later patch will enhance __create_pgd_mapping() and related helpers
>> to split kernel linear mapping, it requires have return value.  So make
>> __create_pgd_mapping() and helpers non-void functions.
>>
>> And move the BUG_ON() out of page table alloc helper since failing
>> splitting kernel linear mapping is not fatal and can be handled by the
>> callers in the later patch.  Have BUG_ON() after
>> __create_pgd_mapping_locked() returns to keep the current callers behavior
>> intact.
>>
>> Suggested-by: Ryan Roberts<ryan.roberts@arm.com>
>> Signed-off-by: Yang Shi<yang@os.amperecomputing.com>
>> ---
>>   arch/arm64/mm/mmu.c | 127 ++++++++++++++++++++++++++++++--------------
>>   1 file changed, 86 insertions(+), 41 deletions(-)
>>
>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>> index b4df5bc5b1b8..dccf0877285b 100644
>> --- a/arch/arm64/mm/mmu.c
>> +++ b/arch/arm64/mm/mmu.c
>> @@ -189,11 +189,11 @@ static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end,
>>   	} while (ptep++, addr += PAGE_SIZE, addr != end);
>>   }
>>   
>> -static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
>> -				unsigned long end, phys_addr_t phys,
>> -				pgprot_t prot,
>> -				phys_addr_t (*pgtable_alloc)(int),
>> -				int flags)
>> +static int alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
>> +			       unsigned long end, phys_addr_t phys,
>> +			       pgprot_t prot,
>> +			       phys_addr_t (*pgtable_alloc)(int),
>> +			       int flags)
>>   {
>>   	unsigned long next;
>>   	pmd_t pmd = READ_ONCE(*pmdp);
>> @@ -208,6 +208,8 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
>>   			pmdval |= PMD_TABLE_PXN;
>>   		BUG_ON(!pgtable_alloc);
>>   		pte_phys = pgtable_alloc(PAGE_SHIFT);
>> +		if (!pte_phys)
>> +			return -ENOMEM;
> nit: personally I'd prefer to see a "goto out" and funnel all to a single return
> statement. You do that in some functions (via loop break), but would be cleaner
> if consistent.
>
> If pgtable_alloc() is modified to return int (see my comment at the bottom),
> this becomes:
>
> ret = pgtable_alloc(PAGE_SHIFT, &pte_phys);
> if (ret)
> 	goto out;

OK

>>   		ptep = pte_set_fixmap(pte_phys);
>>   		init_clear_pgtable(ptep);
>>   		ptep += pte_index(addr);
>> @@ -239,13 +241,16 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
>>   	 * walker.
>>   	 */
>>   	pte_clear_fixmap();
>> +
>> +	return 0;
>>   }
>>   
>> -static void init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
>> -		     phys_addr_t phys, pgprot_t prot,
>> -		     phys_addr_t (*pgtable_alloc)(int), int flags)
>> +static int init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
>> +		    phys_addr_t phys, pgprot_t prot,
>> +		    phys_addr_t (*pgtable_alloc)(int), int flags)
>>   {
>>   	unsigned long next;
>> +	int ret = 0;
>>   
>>   	do {
>>   		pmd_t old_pmd = READ_ONCE(*pmdp);
>> @@ -264,22 +269,27 @@ static void init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
>>   			BUG_ON(!pgattr_change_is_safe(pmd_val(old_pmd),
>>   						      READ_ONCE(pmd_val(*pmdp))));
>>   		} else {
>> -			alloc_init_cont_pte(pmdp, addr, next, phys, prot,
>> +			ret = alloc_init_cont_pte(pmdp, addr, next, phys, prot,
>>   					    pgtable_alloc, flags);
>> +			if (ret)
>> +				break;
>>   
>>   			BUG_ON(pmd_val(old_pmd) != 0 &&
>>   			       pmd_val(old_pmd) != READ_ONCE(pmd_val(*pmdp)));
>>   		}
>>   		phys += next - addr;
>>   	} while (pmdp++, addr = next, addr != end);
>> +
>> +	return ret;
>>   }
>>   
>> -static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
>> -				unsigned long end, phys_addr_t phys,
>> -				pgprot_t prot,
>> -				phys_addr_t (*pgtable_alloc)(int), int flags)
>> +static int alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
>> +			       unsigned long end, phys_addr_t phys,
>> +			       pgprot_t prot,
>> +			       phys_addr_t (*pgtable_alloc)(int), int flags)
>>   {
>>   	unsigned long next;
>> +	int ret = 0;
>>   	pud_t pud = READ_ONCE(*pudp);
>>   	pmd_t *pmdp;
>>   
>> @@ -295,6 +305,8 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
>>   			pudval |= PUD_TABLE_PXN;
>>   		BUG_ON(!pgtable_alloc);
>>   		pmd_phys = pgtable_alloc(PMD_SHIFT);
>> +		if (!pmd_phys)
>> +			return -ENOMEM;
>>   		pmdp = pmd_set_fixmap(pmd_phys);
>>   		init_clear_pgtable(pmdp);
>>   		pmdp += pmd_index(addr);
>> @@ -314,21 +326,26 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
>>   		    (flags & NO_CONT_MAPPINGS) == 0)
>>   			__prot = __pgprot(pgprot_val(prot) | PTE_CONT);
>>   
>> -		init_pmd(pmdp, addr, next, phys, __prot, pgtable_alloc, flags);
>> +		ret = init_pmd(pmdp, addr, next, phys, __prot, pgtable_alloc, flags);
>> +		if (ret)
>> +			break;
>>   
>>   		pmdp += pmd_index(next) - pmd_index(addr);
>>   		phys += next - addr;
>>   	} while (addr = next, addr != end);
>>   
>>   	pmd_clear_fixmap();
>> +
>> +	return ret;
>>   }
>>   
>> -static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
>> -			   phys_addr_t phys, pgprot_t prot,
>> -			   phys_addr_t (*pgtable_alloc)(int),
>> -			   int flags)
>> +static int alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
>> +			  phys_addr_t phys, pgprot_t prot,
>> +			  phys_addr_t (*pgtable_alloc)(int),
>> +			  int flags)
>>   {
>>   	unsigned long next;
>> +	int ret = 0;
>>   	p4d_t p4d = READ_ONCE(*p4dp);
>>   	pud_t *pudp;
>>   
>> @@ -340,6 +357,8 @@ static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
>>   			p4dval |= P4D_TABLE_PXN;
>>   		BUG_ON(!pgtable_alloc);
>>   		pud_phys = pgtable_alloc(PUD_SHIFT);
>> +		if (!pud_phys)
>> +			return -ENOMEM;
>>   		pudp = pud_set_fixmap(pud_phys);
>>   		init_clear_pgtable(pudp);
>>   		pudp += pud_index(addr);
>> @@ -369,8 +388,10 @@ static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
>>   			BUG_ON(!pgattr_change_is_safe(pud_val(old_pud),
>>   						      READ_ONCE(pud_val(*pudp))));
>>   		} else {
>> -			alloc_init_cont_pmd(pudp, addr, next, phys, prot,
>> +			ret = alloc_init_cont_pmd(pudp, addr, next, phys, prot,
>>   					    pgtable_alloc, flags);
>> +			if (ret)
>> +				break;
>>   
>>   			BUG_ON(pud_val(old_pud) != 0 &&
>>   			       pud_val(old_pud) != READ_ONCE(pud_val(*pudp)));
>> @@ -379,14 +400,17 @@ static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
>>   	} while (pudp++, addr = next, addr != end);
>>   
>>   	pud_clear_fixmap();
>> +
>> +	return ret;
>>   }
>>   
>> -static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
>> -			   phys_addr_t phys, pgprot_t prot,
>> -			   phys_addr_t (*pgtable_alloc)(int),
>> -			   int flags)
>> +static int alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
>> +			  phys_addr_t phys, pgprot_t prot,
>> +			  phys_addr_t (*pgtable_alloc)(int),
>> +			  int flags)
>>   {
>>   	unsigned long next;
>> +	int ret = 0;
>>   	pgd_t pgd = READ_ONCE(*pgdp);
>>   	p4d_t *p4dp;
>>   
>> @@ -398,6 +422,8 @@ static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
>>   			pgdval |= PGD_TABLE_PXN;
>>   		BUG_ON(!pgtable_alloc);
>>   		p4d_phys = pgtable_alloc(P4D_SHIFT);
>> +		if (!p4d_phys)
>> +			return -ENOMEM;
>>   		p4dp = p4d_set_fixmap(p4d_phys);
>>   		init_clear_pgtable(p4dp);
>>   		p4dp += p4d_index(addr);
>> @@ -412,8 +438,10 @@ static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
>>   
>>   		next = p4d_addr_end(addr, end);
>>   
>> -		alloc_init_pud(p4dp, addr, next, phys, prot,
>> +		ret = alloc_init_pud(p4dp, addr, next, phys, prot,
>>   			       pgtable_alloc, flags);
>> +		if (ret)
>> +			break;
>>   
>>   		BUG_ON(p4d_val(old_p4d) != 0 &&
>>   		       p4d_val(old_p4d) != READ_ONCE(p4d_val(*p4dp)));
>> @@ -422,23 +450,26 @@ static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
>>   	} while (p4dp++, addr = next, addr != end);
>>   
>>   	p4d_clear_fixmap();
>> +
>> +	return ret;
>>   }
>>   
>> -static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
>> -					unsigned long virt, phys_addr_t size,
>> -					pgprot_t prot,
>> -					phys_addr_t (*pgtable_alloc)(int),
>> -					int flags)
>> +static int __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
>> +				       unsigned long virt, phys_addr_t size,
>> +				       pgprot_t prot,
>> +				       phys_addr_t (*pgtable_alloc)(int),
>> +				       int flags)
>>   {
>>   	unsigned long addr, end, next;
>>   	pgd_t *pgdp = pgd_offset_pgd(pgdir, virt);
>> +	int ret = 0;
>>   
>>   	/*
>>   	 * If the virtual and physical address don't have the same offset
>>   	 * within a page, we cannot map the region as the caller expects.
>>   	 */
>>   	if (WARN_ON((phys ^ virt) & ~PAGE_MASK))
>> -		return;
>> +		return -EINVAL;
>>   
>>   	phys &= PAGE_MASK;
>>   	addr = virt & PAGE_MASK;
>> @@ -446,29 +477,38 @@ static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
>>   
>>   	do {
>>   		next = pgd_addr_end(addr, end);
>> -		alloc_init_p4d(pgdp, addr, next, phys, prot, pgtable_alloc,
>> +		ret = alloc_init_p4d(pgdp, addr, next, phys, prot, pgtable_alloc,
>>   			       flags);
>> +		if (ret)
>> +			break;
>>   		phys += next - addr;
>>   	} while (pgdp++, addr = next, addr != end);
>> +
>> +	return ret;
>>   }
>>   
>> -static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
>> -				 unsigned long virt, phys_addr_t size,
>> -				 pgprot_t prot,
>> -				 phys_addr_t (*pgtable_alloc)(int),
>> -				 int flags)
>> +static int __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
>> +				unsigned long virt, phys_addr_t size,
>> +				pgprot_t prot,
>> +				phys_addr_t (*pgtable_alloc)(int),
>> +				int flags)
>>   {
>> +	int ret;
>> +
>>   	mutex_lock(&fixmap_lock);
>> -	__create_pgd_mapping_locked(pgdir, phys, virt, size, prot,
>> +	ret = __create_pgd_mapping_locked(pgdir, phys, virt, size, prot,
>>   				    pgtable_alloc, flags);
>> +	BUG_ON(ret);
> This function now returns an error, but also BUGs on ret!=0. For this patch, I'd
> suggest keeping this function as void.

You mean __create_pgd_mapping(), right?

> But I believe there is a pre-existing bug in arch_add_memory(). That's called at
> runtime so if __create_pgd_mapping() fails and BUGs, it will take down a running
> system.

Yes, it is the current behavior.

> With this foundational patch, we can fix that with an additional patch to pass
> along the error code instead of BUGing in that case. arch_add_memory() would
> need to unwind whatever __create_pgd_mapping() managed to do before the memory
> allocation failure (presumably unmapping and freeing any allocated tables). I'm
> happy to do this as a follow up patch.

Yes, the allocated page tables need to be freed. Thank you for taking it.

>>   	mutex_unlock(&fixmap_lock);
>> +
>> +	return ret;
>>   }
>>   
>>   #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
>>   extern __alias(__create_pgd_mapping_locked)
>> -void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,
>> -			     phys_addr_t size, pgprot_t prot,
>> -			     phys_addr_t (*pgtable_alloc)(int), int flags);
>> +int create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,
>> +			    phys_addr_t size, pgprot_t prot,
>> +			    phys_addr_t (*pgtable_alloc)(int), int flags);
> create_kpti_ng_temp_pgd() now returns error instead of BUGing on allocation
> failure, but I don't see a change to handle that error. You'll want to update
> __kpti_install_ng_mappings() to BUG on error.

Yes, I missed that. It should BUG on error.

>>   #endif
>>   
>>   static phys_addr_t __pgd_pgtable_alloc(int shift)
>> @@ -476,13 +516,17 @@ static phys_addr_t __pgd_pgtable_alloc(int shift)
>>   	/* Page is zeroed by init_clear_pgtable() so don't duplicate effort. */
>>   	void *ptr = (void *)__get_free_page(GFP_PGTABLE_KERNEL & ~__GFP_ZERO);
>>   
>> -	BUG_ON(!ptr);
>> +	if (!ptr)
>> +		return 0;
> 0 is a valid (though unlikely) physical address. I guess you could technically
> encode like ERR_PTR(), but since you are returning phys_addr_t and not a
> pointer, then perhaps it will be clearer to make this return int and accept a
> pointer to a phys_addr_t, which it will populate on success?

Actually I did something similar in the first place, but just returned 
the virt address, then did something if it returned NULL. That made the 
code a little messier since we needed to convert the virt address to a 
phys address (because __create_pgd_mapping() and the helpers require a 
phys address), and it changed the functions' definitions.

But I noticed 0 should not be a valid phys address, if I remember 
correctly. I also noticed early_pgtable_alloc() calls 
memblock_phys_alloc_range(), which returns 0 on failure. If 0 were a 
valid phys address, then it should not do that, right? And I also 
noticed the memblock range 0 - memstart_addr is actually removed from 
memblock (see arm64_memblock_init()), so IIUC 0 should not be a valid 
phys address. So the patch ended up being as is.

If this assumption doesn't stand, I think your suggestion makes sense.

>> +
>>   	return __pa(ptr);
>>   }
>>   
>>   static phys_addr_t pgd_pgtable_alloc(int shift)
>>   {
>>   	phys_addr_t pa = __pgd_pgtable_alloc(shift);
>> +	if (!pa)
>> +		goto out;
> This would obviously need to be fixed up as per above.
>
>>   	struct ptdesc *ptdesc = page_ptdesc(phys_to_page(pa));
>>   
>>   	/*
>> @@ -498,6 +542,7 @@ static phys_addr_t pgd_pgtable_alloc(int shift)
>>   	else if (shift == PMD_SHIFT)
>>   		BUG_ON(!pagetable_pmd_ctor(ptdesc));
>>   
>> +out:
>>   	return pa;
>>   }
>>   
> You have left early_pgtable_alloc() to panic() on allocation failure. Given we
> can now unwind the stack with error code, I think it would be more consistent to
> also allow early_pgtable_alloc() to return error.

The early_pgtable_alloc() is just used for painting the linear mapping 
at the early boot stage; if it fails, I don't think unwinding the stack 
is feasible or worth it. Did I miss something?

Thanks,
Yang

> Thanks,
> Ryan
>
diff mbox series

Patch

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index b4df5bc5b1b8..dccf0877285b 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -189,11 +189,11 @@  static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end,
 	} while (ptep++, addr += PAGE_SIZE, addr != end);
 }
 
-static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
-				unsigned long end, phys_addr_t phys,
-				pgprot_t prot,
-				phys_addr_t (*pgtable_alloc)(int),
-				int flags)
+static int alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
+			       unsigned long end, phys_addr_t phys,
+			       pgprot_t prot,
+			       phys_addr_t (*pgtable_alloc)(int),
+			       int flags)
 {
 	unsigned long next;
 	pmd_t pmd = READ_ONCE(*pmdp);
@@ -208,6 +208,8 @@  static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
 			pmdval |= PMD_TABLE_PXN;
 		BUG_ON(!pgtable_alloc);
 		pte_phys = pgtable_alloc(PAGE_SHIFT);
+		if (!pte_phys)
+			return -ENOMEM;
 		ptep = pte_set_fixmap(pte_phys);
 		init_clear_pgtable(ptep);
 		ptep += pte_index(addr);
@@ -239,13 +241,16 @@  static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
 	 * walker.
 	 */
 	pte_clear_fixmap();
+
+	return 0;
 }
 
-static void init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
-		     phys_addr_t phys, pgprot_t prot,
-		     phys_addr_t (*pgtable_alloc)(int), int flags)
+static int init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
+		    phys_addr_t phys, pgprot_t prot,
+		    phys_addr_t (*pgtable_alloc)(int), int flags)
 {
 	unsigned long next;
+	int ret = 0;
 
 	do {
 		pmd_t old_pmd = READ_ONCE(*pmdp);
@@ -264,22 +269,27 @@  static void init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
 			BUG_ON(!pgattr_change_is_safe(pmd_val(old_pmd),
 						      READ_ONCE(pmd_val(*pmdp))));
 		} else {
-			alloc_init_cont_pte(pmdp, addr, next, phys, prot,
+			ret = alloc_init_cont_pte(pmdp, addr, next, phys, prot,
 					    pgtable_alloc, flags);
+			if (ret)
+				break;
 
 			BUG_ON(pmd_val(old_pmd) != 0 &&
 			       pmd_val(old_pmd) != READ_ONCE(pmd_val(*pmdp)));
 		}
 		phys += next - addr;
 	} while (pmdp++, addr = next, addr != end);
+
+	return ret;
 }
 
-static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
-				unsigned long end, phys_addr_t phys,
-				pgprot_t prot,
-				phys_addr_t (*pgtable_alloc)(int), int flags)
+static int alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
+			       unsigned long end, phys_addr_t phys,
+			       pgprot_t prot,
+			       phys_addr_t (*pgtable_alloc)(int), int flags)
 {
 	unsigned long next;
+	int ret = 0;
 	pud_t pud = READ_ONCE(*pudp);
 	pmd_t *pmdp;
 
@@ -295,6 +305,8 @@  static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
 			pudval |= PUD_TABLE_PXN;
 		BUG_ON(!pgtable_alloc);
 		pmd_phys = pgtable_alloc(PMD_SHIFT);
+		if (!pmd_phys)
+			return -ENOMEM;
 		pmdp = pmd_set_fixmap(pmd_phys);
 		init_clear_pgtable(pmdp);
 		pmdp += pmd_index(addr);
@@ -314,21 +326,26 @@  static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
 		    (flags & NO_CONT_MAPPINGS) == 0)
 			__prot = __pgprot(pgprot_val(prot) | PTE_CONT);
 
-		init_pmd(pmdp, addr, next, phys, __prot, pgtable_alloc, flags);
+		ret = init_pmd(pmdp, addr, next, phys, __prot, pgtable_alloc, flags);
+		if (ret)
+			break;
 
 		pmdp += pmd_index(next) - pmd_index(addr);
 		phys += next - addr;
 	} while (addr = next, addr != end);
 
 	pmd_clear_fixmap();
+
+	return ret;
 }
 
-static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
-			   phys_addr_t phys, pgprot_t prot,
-			   phys_addr_t (*pgtable_alloc)(int),
-			   int flags)
+static int alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
+			  phys_addr_t phys, pgprot_t prot,
+			  phys_addr_t (*pgtable_alloc)(int),
+			  int flags)
 {
 	unsigned long next;
+	int ret = 0;
 	p4d_t p4d = READ_ONCE(*p4dp);
 	pud_t *pudp;
 
@@ -340,6 +357,8 @@  static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
 			p4dval |= P4D_TABLE_PXN;
 		BUG_ON(!pgtable_alloc);
 		pud_phys = pgtable_alloc(PUD_SHIFT);
+		if (!pud_phys)
+			return -ENOMEM;
 		pudp = pud_set_fixmap(pud_phys);
 		init_clear_pgtable(pudp);
 		pudp += pud_index(addr);
@@ -369,8 +388,10 @@  static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
 			BUG_ON(!pgattr_change_is_safe(pud_val(old_pud),
 						      READ_ONCE(pud_val(*pudp))));
 		} else {
-			alloc_init_cont_pmd(pudp, addr, next, phys, prot,
-					    pgtable_alloc, flags);
+			ret = alloc_init_cont_pmd(pudp, addr, next, phys, prot,
+						  pgtable_alloc, flags);
+			if (ret)
+				break;
 
 			BUG_ON(pud_val(old_pud) != 0 &&
 			       pud_val(old_pud) != READ_ONCE(pud_val(*pudp)));
@@ -379,14 +400,17 @@  static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
 	} while (pudp++, addr = next, addr != end);
 
 	pud_clear_fixmap();
+
+	return ret;
 }
 
-static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
-			   phys_addr_t phys, pgprot_t prot,
-			   phys_addr_t (*pgtable_alloc)(int),
-			   int flags)
+static int alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
+			  phys_addr_t phys, pgprot_t prot,
+			  phys_addr_t (*pgtable_alloc)(int),
+			  int flags)
 {
 	unsigned long next;
+	int ret = 0;
 	pgd_t pgd = READ_ONCE(*pgdp);
 	p4d_t *p4dp;
 
@@ -398,6 +422,8 @@  static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
 			pgdval |= PGD_TABLE_PXN;
 		BUG_ON(!pgtable_alloc);
 		p4d_phys = pgtable_alloc(P4D_SHIFT);
+		if (!p4d_phys)
+			return -ENOMEM;
 		p4dp = p4d_set_fixmap(p4d_phys);
 		init_clear_pgtable(p4dp);
 		p4dp += p4d_index(addr);
@@ -412,8 +438,10 @@  static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
 
 		next = p4d_addr_end(addr, end);
 
-		alloc_init_pud(p4dp, addr, next, phys, prot,
-			       pgtable_alloc, flags);
+		ret = alloc_init_pud(p4dp, addr, next, phys, prot,
+				     pgtable_alloc, flags);
+		if (ret)
+			break;
 
 		BUG_ON(p4d_val(old_p4d) != 0 &&
 		       p4d_val(old_p4d) != READ_ONCE(p4d_val(*p4dp)));
@@ -422,23 +450,26 @@  static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
 	} while (p4dp++, addr = next, addr != end);
 
 	p4d_clear_fixmap();
+
+	return ret;
 }
 
-static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
-					unsigned long virt, phys_addr_t size,
-					pgprot_t prot,
-					phys_addr_t (*pgtable_alloc)(int),
-					int flags)
+static int __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
+				       unsigned long virt, phys_addr_t size,
+				       pgprot_t prot,
+				       phys_addr_t (*pgtable_alloc)(int),
+				       int flags)
 {
 	unsigned long addr, end, next;
 	pgd_t *pgdp = pgd_offset_pgd(pgdir, virt);
+	int ret = 0;
 
 	/*
 	 * If the virtual and physical address don't have the same offset
 	 * within a page, we cannot map the region as the caller expects.
 	 */
 	if (WARN_ON((phys ^ virt) & ~PAGE_MASK))
-		return;
+		return -EINVAL;
 
 	phys &= PAGE_MASK;
 	addr = virt & PAGE_MASK;
@@ -446,29 +477,38 @@  static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
 
 	do {
 		next = pgd_addr_end(addr, end);
-		alloc_init_p4d(pgdp, addr, next, phys, prot, pgtable_alloc,
-			       flags);
+		ret = alloc_init_p4d(pgdp, addr, next, phys, prot,
+				     pgtable_alloc, flags);
+		if (ret)
+			break;
 		phys += next - addr;
 	} while (pgdp++, addr = next, addr != end);
+
+	return ret;
 }
 
-static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
-				 unsigned long virt, phys_addr_t size,
-				 pgprot_t prot,
-				 phys_addr_t (*pgtable_alloc)(int),
-				 int flags)
+static int __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
+				unsigned long virt, phys_addr_t size,
+				pgprot_t prot,
+				phys_addr_t (*pgtable_alloc)(int),
+				int flags)
 {
+	int ret;
+
 	mutex_lock(&fixmap_lock);
-	__create_pgd_mapping_locked(pgdir, phys, virt, size, prot,
-				    pgtable_alloc, flags);
+	ret = __create_pgd_mapping_locked(pgdir, phys, virt, size, prot,
+					  pgtable_alloc, flags);
+	BUG_ON(ret);
 	mutex_unlock(&fixmap_lock);
+
+	return ret;
 }
 
 #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
 extern __alias(__create_pgd_mapping_locked)
-void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,
-			     phys_addr_t size, pgprot_t prot,
-			     phys_addr_t (*pgtable_alloc)(int), int flags);
+int create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,
+			    phys_addr_t size, pgprot_t prot,
+			    phys_addr_t (*pgtable_alloc)(int), int flags);
 #endif
 
 static phys_addr_t __pgd_pgtable_alloc(int shift)
@@ -476,13 +516,20 @@  static phys_addr_t __pgd_pgtable_alloc(int shift)
 	/* Page is zeroed by init_clear_pgtable() so don't duplicate effort. */
 	void *ptr = (void *)__get_free_page(GFP_PGTABLE_KERNEL & ~__GFP_ZERO);
 
-	BUG_ON(!ptr);
+	if (!ptr)
+		return 0;
+
 	return __pa(ptr);
 }
 
 static phys_addr_t pgd_pgtable_alloc(int shift)
 {
 	phys_addr_t pa = __pgd_pgtable_alloc(shift);
-	struct ptdesc *ptdesc = page_ptdesc(phys_to_page(pa));
+	struct ptdesc *ptdesc;
+
+	if (!pa)
+		goto out;
+
+	ptdesc = page_ptdesc(phys_to_page(pa));
 
 	/*
@@ -498,6 +545,7 @@  static phys_addr_t pgd_pgtable_alloc(int shift)
 	else if (shift == PMD_SHIFT)
 		BUG_ON(!pagetable_pmd_ctor(ptdesc));
 
+out:
 	return pa;
 }