[v2,2/4] ioremap: Implement TLB_INV before huge mapping

Message ID 1521117906-20107-3-git-send-email-cpandya@codeaurora.org (mailing list archive)
State New, archived

Commit Message

Chintan Pandya March 15, 2018, 12:45 p.m. UTC
Huge mapping changes PMD/PUD which could have
valid previous entries. This requires proper
TLB maintenance on some architectures, like
ARM64.

Implement BBM (break-before-make) safe TLB
invalidation.

Here, I've used flush_tlb_pgtable() instead
of flush_kernel_range() because invalidating
intermediate page_table entries could have
been optimized for specific arch. That's the
case with ARM64 at least.

Signed-off-by: Chintan Pandya <cpandya@codeaurora.org>
---
 lib/ioremap.c | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)
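
For context: flush_tlb_pgtable() is introduced earlier in this series
(patch 1/4, not shown here). To illustrate why it can be cheaper than a
full range flush, the arm64 helper of this era invalidates only the
cached table-walk entry for one address with a single TLBI. The sketch
below is illustrative and may not match the series' exact code:

	/*
	 * Sketch of an arm64-style flush_tlb_pgtable(): invalidate the
	 * cached intermediate walk entry for @uaddr with one TLBI,
	 * instead of flushing every page in the mapped range.
	 */
	static inline void flush_tlb_pgtable(struct mm_struct *mm,
					     unsigned long uaddr)
	{
		unsigned long addr = uaddr >> 12 | (ASID(mm) << 48);

		__tlbi(vae1is, addr);	/* invalidate by VA, inner shareable */
		dsb(ish);		/* wait for completion */
	}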

Comments

Mark Rutland March 15, 2018, 1:13 p.m. UTC | #1
Hi,

As a general note, please wrap commit text to 72 characters.

On Thu, Mar 15, 2018 at 06:15:04PM +0530, Chintan Pandya wrote:
> Huge mapping changes PMD/PUD which could have
> valid previous entries. This requires proper
> TLB maintenance on some architectures, like
> ARM64.

Just to check, I take it that you mean we could have a valid table
entry, but all the entries in that next level table must be invalid,
right?

> 
> Implement BBM (break-before-make) safe TLB
> invalidation.
> 
> Here, I've used flush_tlb_pgtable() instead
> of flush_kernel_range() because invalidating
> intermediate page_table entries could have
> been optimized for specific arch. That's the
> case with ARM64 at least.

... because if there are valid entries in the next level table,
__flush_tlb_pgtable() is not sufficient to ensure all of these are
removed from the TLB.

Assuming that all entries in the next level table are invalid, this
looks ok to me.

Thanks,
Mark.

> Signed-off-by: Chintan Pandya <cpandya@codeaurora.org>
> ---
>  lib/ioremap.c | 25 +++++++++++++++++++------
>  1 file changed, 19 insertions(+), 6 deletions(-)
> 
> diff --git a/lib/ioremap.c b/lib/ioremap.c
> index 54e5bba..55f8648 100644
> --- a/lib/ioremap.c
> +++ b/lib/ioremap.c
> @@ -13,6 +13,7 @@
>  #include <linux/export.h>
>  #include <asm/cacheflush.h>
>  #include <asm/pgtable.h>
> +#include <asm-generic/tlb.h>
>  
>  #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
>  static int __read_mostly ioremap_p4d_capable;
> @@ -80,6 +81,7 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
>  		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
>  {
>  	pmd_t *pmd;
> +	pmd_t old_pmd;
>  	unsigned long next;
>  
>  	phys_addr -= addr;
> @@ -91,10 +93,15 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
>  
>  		if (ioremap_pmd_enabled() &&
>  		    ((next - addr) == PMD_SIZE) &&
> -		    IS_ALIGNED(phys_addr + addr, PMD_SIZE) &&
> -		    pmd_free_pte_page(pmd)) {
> -			if (pmd_set_huge(pmd, phys_addr + addr, prot))
> +		    IS_ALIGNED(phys_addr + addr, PMD_SIZE)) {
> +			old_pmd = *pmd;
> +			pmd_clear(pmd);
> +			flush_tlb_pgtable(&init_mm, addr);
> +			if (pmd_set_huge(pmd, phys_addr + addr, prot)) {
> +				pmd_free_pte_page(&old_pmd);
>  				continue;
> +			} else
> +				set_pmd(pmd, old_pmd);
>  		}
>  
>  		if (ioremap_pte_range(pmd, addr, next, phys_addr + addr, prot))
> @@ -107,6 +114,7 @@ static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,
>  		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
>  {
>  	pud_t *pud;
> +	pud_t old_pud;
>  	unsigned long next;
>  
>  	phys_addr -= addr;
> @@ -118,10 +126,15 @@ static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,
>  
>  		if (ioremap_pud_enabled() &&
>  		    ((next - addr) == PUD_SIZE) &&
> -		    IS_ALIGNED(phys_addr + addr, PUD_SIZE) &&
> -		    pud_free_pmd_page(pud)) {
> -			if (pud_set_huge(pud, phys_addr + addr, prot))
> +		    IS_ALIGNED(phys_addr + addr, PUD_SIZE)) {
> +			old_pud = *pud;
> +			pud_clear(pud);
> +			flush_tlb_pgtable(&init_mm, addr);
> +			if (pud_set_huge(pud, phys_addr + addr, prot)) {
> +				pud_free_pmd_page(&old_pud);
>  				continue;
> +			} else
> +				set_pud(pud, old_pud);
>  		}
>  
>  		if (ioremap_pmd_range(pud, addr, next, phys_addr + addr, prot))
> -- 
> Qualcomm India Private Limited, on behalf of Qualcomm Innovation
> Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
> Collaborative Project
>
Chintan Pandya March 15, 2018, 1:25 p.m. UTC | #2
On 3/15/2018 6:43 PM, Mark Rutland wrote:
> Hi,
> 
> As a general note, please wrap commit text to 72 characters.
> 
> On Thu, Mar 15, 2018 at 06:15:04PM +0530, Chintan Pandya wrote:
>> Huge mapping changes PMD/PUD which could have
>> valid previous entries. This requires proper
>> TLB maintenance on some architectures, like
>> ARM64.
> 
> Just to check, I take it that you mean we could have a valid table
> entry, but all the entries in that next level table must be invalid,
> right?

That was my assumption, but it can be wrong: if any VA gets a 1G
block mapping directly (instead of the 2M cases we discussed so
far), this would go for a toss.

> 
>>
>> Implement BBM (break-before-make) safe TLB
>> invalidation.
>>
>> Here, I've used flush_tlb_pgtable() instead
>> of flush_kernel_range() because invalidating
>> intermediate page_table entries could have
>> been optimized for specific arch. That's the
>> case with ARM64 at least.
> 
> ... because if there are valid entries in the next level table,
> __flush_tlb_pgtable() is not sufficient to ensure all of these are
> removed from the TLB.

Oh! In the case of a huge PGD, the next-level PMD may or may not be
valid. So, I had better use flush_kernel_range().

I will upload v3, but will wait for other comments...

> 
> Assuming that all entries in the next level table are invalid, this
> looks ok to me.
> 
> Thanks,
> Mark.
> 
>> Signed-off-by: Chintan Pandya <cpandya@codeaurora.org>
>> ---
>>   lib/ioremap.c | 25 +++++++++++++++++++------
>>   1 file changed, 19 insertions(+), 6 deletions(-)
>>
>> diff --git a/lib/ioremap.c b/lib/ioremap.c
>> index 54e5bba..55f8648 100644
>> --- a/lib/ioremap.c
>> +++ b/lib/ioremap.c
>> @@ -13,6 +13,7 @@
>>   #include <linux/export.h>
>>   #include <asm/cacheflush.h>
>>   #include <asm/pgtable.h>
>> +#include <asm-generic/tlb.h>
>>   
>>   #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
>>   static int __read_mostly ioremap_p4d_capable;
>> @@ -80,6 +81,7 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
>>   		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
>>   {
>>   	pmd_t *pmd;
>> +	pmd_t old_pmd;
>>   	unsigned long next;
>>   
>>   	phys_addr -= addr;
>> @@ -91,10 +93,15 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
>>   
>>   		if (ioremap_pmd_enabled() &&
>>   		    ((next - addr) == PMD_SIZE) &&
>> -		    IS_ALIGNED(phys_addr + addr, PMD_SIZE) &&
>> -		    pmd_free_pte_page(pmd)) {
>> -			if (pmd_set_huge(pmd, phys_addr + addr, prot))
>> +		    IS_ALIGNED(phys_addr + addr, PMD_SIZE)) {
>> +			old_pmd = *pmd;
>> +			pmd_clear(pmd);
>> +			flush_tlb_pgtable(&init_mm, addr);
>> +			if (pmd_set_huge(pmd, phys_addr + addr, prot)) {
>> +				pmd_free_pte_page(&old_pmd);
>>   				continue;
>> +			} else
>> +				set_pmd(pmd, old_pmd);
>>   		}
>>   
>>   		if (ioremap_pte_range(pmd, addr, next, phys_addr + addr, prot))
>> @@ -107,6 +114,7 @@ static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,
>>   		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
>>   {
>>   	pud_t *pud;
>> +	pud_t old_pud;
>>   	unsigned long next;
>>   
>>   	phys_addr -= addr;
>> @@ -118,10 +126,15 @@ static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,
>>   
>>   		if (ioremap_pud_enabled() &&
>>   		    ((next - addr) == PUD_SIZE) &&
>> -		    IS_ALIGNED(phys_addr + addr, PUD_SIZE) &&
>> -		    pud_free_pmd_page(pud)) {
>> -			if (pud_set_huge(pud, phys_addr + addr, prot))
>> +		    IS_ALIGNED(phys_addr + addr, PUD_SIZE)) {
>> +			old_pud = *pud;
>> +			pud_clear(pud);
>> +			flush_tlb_pgtable(&init_mm, addr);
>> +			if (pud_set_huge(pud, phys_addr + addr, prot)) {
>> +				pud_free_pmd_page(&old_pud);
>>   				continue;
>> +			} else
>> +				set_pud(pud, old_pud);
>>   		}
>>   
>>   		if (ioremap_pmd_range(pud, addr, next, phys_addr + addr, prot))
>> -- 
>> Qualcomm India Private Limited, on behalf of Qualcomm Innovation
>> Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
>> Collaborative Project
>>

Chintan
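
A sketch of the direction v3 takes from the exchange above: since
next-level entries may be valid in the huge-PGD case, invalidate the
whole range with the kernel's flush_tlb_kernel_range() (what the thread
calls flush_kernel_range()) instead of just the table entry. This is
illustrative only, not the actual v3 hunk:

	if (ioremap_pud_enabled() &&
	    ((next - addr) == PUD_SIZE) &&
	    IS_ALIGNED(phys_addr + addr, PUD_SIZE)) {
		old_pud = *pud;
		pud_clear(pud);
		/* also covers any valid leaf entries under the old table */
		flush_tlb_kernel_range(addr, addr + PUD_SIZE);
		if (pud_set_huge(pud, phys_addr + addr, prot)) {
			pud_free_pmd_page(&old_pud);
			continue;
		} else
			set_pud(pud, old_pud);
	}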
Mark Rutland March 15, 2018, 1:31 p.m. UTC | #3
On Thu, Mar 15, 2018 at 06:15:04PM +0530, Chintan Pandya wrote:
> @@ -91,10 +93,15 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
>  
>  		if (ioremap_pmd_enabled() &&
>  		    ((next - addr) == PMD_SIZE) &&
> -		    IS_ALIGNED(phys_addr + addr, PMD_SIZE) &&
> -		    pmd_free_pte_page(pmd)) {
> -			if (pmd_set_huge(pmd, phys_addr + addr, prot))
> +		    IS_ALIGNED(phys_addr + addr, PMD_SIZE)) {
> +			old_pmd = *pmd;
> +			pmd_clear(pmd);
> +			flush_tlb_pgtable(&init_mm, addr);
> +			if (pmd_set_huge(pmd, phys_addr + addr, prot)) {
> +				pmd_free_pte_page(&old_pmd);
>  				continue;
> +			} else
> +				set_pmd(pmd, old_pmd);
>  		}
>  

Can we have something like a pmd_can_set_huge() helper? Then we could
avoid pointless modification and TLB invalidation work when
pmd_set_huge() will fail.

	if (ioremap_pmd_enabled() &&
	    ((next - addr) == PMD_SIZE) &&
	    IS_ALIGNED(phys_addr + addr, PMD_SIZE) &&
	    pmd_can_set_huge(pmd, phys_addr + addr, prot)) {
	    	// clear entries, invalidate TLBs, and free tables
		...
		continue;

	}

Thanks,
Mark.
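
One possible shape for the helper suggested above, factored out of the
MTRR check in the x86 pmd_set_huge() quoted later in this thread;
pmd_can_set_huge() is hypothetical here, not an existing kernel
function:

	/* Hypothetical: mirrors the failure condition of x86 pmd_set_huge() */
	static bool pmd_can_set_huge(unsigned long addr)
	{
		u8 mtrr, uniform;

		mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform);
		if ((mtrr != MTRR_TYPE_INVALID) && !uniform &&
		    (mtrr != MTRR_TYPE_WRBACK))
			return false;	/* MTRR override forbids a huge mapping */
		return true;
	}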
Chintan Pandya March 15, 2018, 2:19 p.m. UTC | #4
On 3/15/2018 7:01 PM, Mark Rutland wrote:
> On Thu, Mar 15, 2018 at 06:15:04PM +0530, Chintan Pandya wrote:
>> @@ -91,10 +93,15 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
>>   
>>   		if (ioremap_pmd_enabled() &&
>>   		    ((next - addr) == PMD_SIZE) &&
>> -		    IS_ALIGNED(phys_addr + addr, PMD_SIZE) &&
>> -		    pmd_free_pte_page(pmd)) {
>> -			if (pmd_set_huge(pmd, phys_addr + addr, prot))
>> +		    IS_ALIGNED(phys_addr + addr, PMD_SIZE)) {
>> +			old_pmd = *pmd;
>> +			pmd_clear(pmd);
>> +			flush_tlb_pgtable(&init_mm, addr);
>> +			if (pmd_set_huge(pmd, phys_addr + addr, prot)) {
>> +				pmd_free_pte_page(&old_pmd);
>>   				continue;
>> +			} else
>> +				set_pmd(pmd, old_pmd);
>>   		}
>>   
> 
> Can we have something like a pmd_can_set_huge() helper? Then we could
> avoid pointless modification and TLB invalidation work when
> pmd_set_huge() will fail.

Actually, pmd_set_huge() will never fail: if
CONFIG_HAVE_ARCH_HUGE_VMAP is disabled, ioremap_pmd_enabled()
fails, and when it is enabled (i.e. on ARM64 & x86), the
implementations don't fail. So, rather, we can do the following.

-                       if (pmd_set_huge(pmd, phys_addr + addr, prot)) {
-                               pmd_free_pte_page(&old_pmd);
-                               continue;
-                       } else
-                               set_pmd(pmd, old_pmd);
+                       pmd_set_huge(pmd, phys_addr + addr, prot);
+                       pmd_free_pte_page(&old_pmd);
+                       continue;

> 
> 	if (ioremap_pmd_enabled() &&
> 	    ((next - addr) == PMD_SIZE) &&
> 	    IS_ALIGNED(phys_addr + addr, PMD_SIZE) &&
> 	    pmd_can_set_huge(pmd, phys_addr + addr, prot)) {
> 	    	// clear entries, invalidate TLBs, and free tables
> 		...
> 		continue;
> 
> 	}
> 
> Thanks,
> Mark.
> 

Chintan
Mark Rutland March 15, 2018, 3:16 p.m. UTC | #5
On Thu, Mar 15, 2018 at 06:55:32PM +0530, Chintan Pandya wrote:
> On 3/15/2018 6:43 PM, Mark Rutland wrote:
> > On Thu, Mar 15, 2018 at 06:15:04PM +0530, Chintan Pandya wrote:
> > > Huge mapping changes PMD/PUD which could have
> > > valid previous entries. This requires proper
> > > TLB maintenance on some architectures, like
> > > ARM64.
> > 
> > Just to check, I take it that you mean we could have a valid table
> > entry, but all the entries in that next level table must be invalid,
> > right?
> 
> That was my assumption, but it can be wrong: if any VA gets a 1G
> block mapping directly (instead of the 2M cases we discussed so
> far), this would go for a toss.

Ok. Just considering the 4K -> 2M case, is that an assumption, or a
guarantee?

Thanks,
Mark.
Mark Rutland March 15, 2018, 3:20 p.m. UTC | #6
On Thu, Mar 15, 2018 at 07:49:01PM +0530, Chintan Pandya wrote:
> On 3/15/2018 7:01 PM, Mark Rutland wrote:
> > On Thu, Mar 15, 2018 at 06:15:04PM +0530, Chintan Pandya wrote:
> > > @@ -91,10 +93,15 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
> > >   		if (ioremap_pmd_enabled() &&
> > >   		    ((next - addr) == PMD_SIZE) &&
> > > -		    IS_ALIGNED(phys_addr + addr, PMD_SIZE) &&
> > > -		    pmd_free_pte_page(pmd)) {
> > > -			if (pmd_set_huge(pmd, phys_addr + addr, prot))
> > > +		    IS_ALIGNED(phys_addr + addr, PMD_SIZE)) {
> > > +			old_pmd = *pmd;
> > > +			pmd_clear(pmd);
> > > +			flush_tlb_pgtable(&init_mm, addr);
> > > +			if (pmd_set_huge(pmd, phys_addr + addr, prot)) {
> > > +				pmd_free_pte_page(&old_pmd);
> > >   				continue;
> > > +			} else
> > > +				set_pmd(pmd, old_pmd);
> > >   		}
> > 
> > Can we have something like a pmd_can_set_huge() helper? Then we could
> > avoid pointless modification and TLB invalidation work when
> > pmd_set_huge() will fail.
> 
> Actually, pmd_set_huge() will never fail: if
> CONFIG_HAVE_ARCH_HUGE_VMAP is disabled, ioremap_pmd_enabled()
> fails, and when it is enabled (i.e. on ARM64 & x86), the
> implementations don't fail. So, rather, we can do the following.

AFAICT, that's not true. The x86 pmd_set_huge() can fail under certain
conditions:

int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
{
	u8 mtrr, uniform;

	mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform);
	if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) &&
	    (mtrr != MTRR_TYPE_WRBACK)) {
		pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n",
			     __func__, addr, addr + PMD_SIZE);
		return 0;
	}

	prot = pgprot_4k_2_large(prot);

	set_pte((pte_t *)pmd, pfn_pte(
		(u64)addr >> PAGE_SHIFT,
		__pgprot(pgprot_val(prot) | _PAGE_PSE)));

	return 1;
}

... perhaps that can never happen in this particular case, but that's
not clear to me.

Thanks,
Mark.
Kani, Toshi March 15, 2018, 4:12 p.m. UTC | #7
On Thu, 2018-03-15 at 18:15 +0530, Chintan Pandya wrote:
> Huge mapping changes PMD/PUD which could have
> valid previous entries. This requires proper
> TLB maintenance on some architectures, like
> ARM64.
> 
> Implement BBM (break-before-make) safe TLB
> invalidation.
> 
> Here, I've used flush_tlb_pgtable() instead
> of flush_kernel_range() because invalidating
> intermediate page_table entries could have
> been optimized for specific arch. That's the
> case with ARM64 at least.
> 
> Signed-off-by: Chintan Pandya <cpandya@codeaurora.org>
> ---
>  lib/ioremap.c | 25 +++++++++++++++++++------
>  1 file changed, 19 insertions(+), 6 deletions(-)
> 
> diff --git a/lib/ioremap.c b/lib/ioremap.c
> index 54e5bba..55f8648 100644
> --- a/lib/ioremap.c
> +++ b/lib/ioremap.c
> @@ -13,6 +13,7 @@
>  #include <linux/export.h>
>  #include <asm/cacheflush.h>
>  #include <asm/pgtable.h>
> +#include <asm-generic/tlb.h>
>  
>  #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
>  static int __read_mostly ioremap_p4d_capable;
> @@ -80,6 +81,7 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
>  		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
>  {
>  	pmd_t *pmd;
> +	pmd_t old_pmd;
>  	unsigned long next;
>  
>  	phys_addr -= addr;
> @@ -91,10 +93,15 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
>  
>  		if (ioremap_pmd_enabled() &&
>  		    ((next - addr) == PMD_SIZE) &&
> -		    IS_ALIGNED(phys_addr + addr, PMD_SIZE) &&
> -		    pmd_free_pte_page(pmd)) {
> -			if (pmd_set_huge(pmd, phys_addr + addr, prot))
> +		    IS_ALIGNED(phys_addr + addr, PMD_SIZE)) {
> +			old_pmd = *pmd;
> +			pmd_clear(pmd);

pmd_clear() is one of the operations pmd_free_pte_page() needs to do.
See the x86 version.

> +			flush_tlb_pgtable(&init_mm, addr);

You can call it in pmd_free_pte_page() on arm64 as well. 

> +			if (pmd_set_huge(pmd, phys_addr + addr, prot)) {
> +				pmd_free_pte_page(&old_pmd);
>  				continue;
> +			} else
> +				set_pmd(pmd, old_pmd);

I do not understand why you needed to make this change. 
pmd_free_pte_page() is defined as an arch-specific function so that you
can additionally perform TLB purges on arm64.  Please try to make a proper
arm64 implementation of this interface.  And if you find any issue in
this interface, please let me know.

Same for pud.

Thanks,
-Toshi
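
For reference, the x86 pmd_free_pte_page() Toshi points to (the version
under discussion at the time, reproduced from memory, so treat it as a
sketch) already folds pmd_clear() into the interface:

	int pmd_free_pte_page(pmd_t *pmd)
	{
		pte_t *pte;

		if (pmd_none(*pmd))
			return 1;	/* no table here: safe to set a huge entry */

		pte = (pte_t *)pmd_page_vaddr(*pmd);
		pmd_clear(pmd);
		free_page((unsigned long)pte);

		return 1;
	}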
Chintan Pandya March 16, 2018, 7:14 a.m. UTC | #8
On 3/15/2018 8:46 PM, Mark Rutland wrote:
> On Thu, Mar 15, 2018 at 06:55:32PM +0530, Chintan Pandya wrote:
>> On 3/15/2018 6:43 PM, Mark Rutland wrote:
>>> On Thu, Mar 15, 2018 at 06:15:04PM +0530, Chintan Pandya wrote:
>>>> Huge mapping changes PMD/PUD which could have
>>>> valid previous entries. This requires proper
>>>> TLB maintenance on some architectures, like
>>>> ARM64.
>>>
>>> Just to check, I take it that you mean we could have a valid table
>>> entry, but all the entries in that next level table must be invalid,
>>> right?
>>
>> That was my assumption, but it can be wrong: if any VA gets a 1G
>> block mapping directly (instead of the 2M cases we discussed so
>> far), this would go for a toss.
> 
> Ok. Just considering the 4K -> 2M case, is that an assumption, or a
> guarantee?
For the 4K -> 2M case, that's confirmed. I mean, while mapping 2M, all
the next-level entries will be unmapped and cleared. That gets ensured
before we land in the page table code. But if someone calls these page
table APIs directly without respecting previous mappings, we will not
hit a BUG_ON() anywhere, just a crash later in unfamiliar situations.
That's the wrong thing to do.

> 
> Thanks,
> Mark.
> 

Chintan
Chintan Pandya March 16, 2018, 7:40 a.m. UTC | #9
On 3/15/2018 9:42 PM, Kani, Toshi wrote:
> On Thu, 2018-03-15 at 18:15 +0530, Chintan Pandya wrote:
>> Huge mapping changes PMD/PUD which could have
>> valid previous entries. This requires proper
>> TLB maintenance on some architectures, like
>> ARM64.
>>
>> Implement BBM (break-before-make) safe TLB
>> invalidation.
>>
>> Here, I've used flush_tlb_pgtable() instead
>> of flush_kernel_range() because invalidating
>> intermediate page_table entries could have
>> been optimized for specific arch. That's the
>> case with ARM64 at least.
>>
>> Signed-off-by: Chintan Pandya <cpandya@codeaurora.org>
>> ---
>>   lib/ioremap.c | 25 +++++++++++++++++++------
>>   1 file changed, 19 insertions(+), 6 deletions(-)
>>
>> diff --git a/lib/ioremap.c b/lib/ioremap.c
>> index 54e5bba..55f8648 100644
>> --- a/lib/ioremap.c
>> +++ b/lib/ioremap.c
>> @@ -13,6 +13,7 @@
>>   #include <linux/export.h>
>>   #include <asm/cacheflush.h>
>>   #include <asm/pgtable.h>
>> +#include <asm-generic/tlb.h>
>>   
>>   #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
>>   static int __read_mostly ioremap_p4d_capable;
>> @@ -80,6 +81,7 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
>>   		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
>>   {
>>   	pmd_t *pmd;
>> +	pmd_t old_pmd;
>>   	unsigned long next;
>>   
>>   	phys_addr -= addr;
>> @@ -91,10 +93,15 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
>>   
>>   		if (ioremap_pmd_enabled() &&
>>   		    ((next - addr) == PMD_SIZE) &&
>> -		    IS_ALIGNED(phys_addr + addr, PMD_SIZE) &&
>> -		    pmd_free_pte_page(pmd)) {
>> -			if (pmd_set_huge(pmd, phys_addr + addr, prot))
>> +		    IS_ALIGNED(phys_addr + addr, PMD_SIZE)) {
>> +			old_pmd = *pmd;
>> +			pmd_clear(pmd);
> 
> pmd_clear() is one of the operations pmd_free_pte_page() needs to do.
> See the x86 version.
> 
>> +			flush_tlb_pgtable(&init_mm, addr);
> 
> You can call it in pmd_free_pte_page() on arm64 as well.
> 
>> +			if (pmd_set_huge(pmd, phys_addr + addr, prot)) {
>> +				pmd_free_pte_page(&old_pmd);
>>   				continue;
>> +			} else
>> +				set_pmd(pmd, old_pmd);
> 
> I do not understand why you needed to make this change.
> pmd_free_pte_page() is defined as an arch-specific function so that you
> can additionally perform TLB purges on arm64.  Please try to make a proper
> arm64 implementation of this interface.  And if you find any issue in
> this interface, please let me know.
TLB ops require a VA at least, and this interface passes just the PMD/PUD.

Second, if we clear the previous table entry inside the arch-specific
code and then fail in pmd/pud_set_huge(), we can't fall back (the x86 case).

So, we can do something like this (following Mark's suggestion),

	if (ioremap_pmd_enabled() &&
         	((next - addr) == PMD_SIZE) &&
		IS_ALIGNED(phys_addr + addr, PMD_SIZE) &&
		pmd_can_set_huge(pmd, phys_addr + addr, prot)) {
			/*
			 * Clear existing table entry,
			 * Invalidate,
			 * Free the page table
			 * inside this code
			 */
			pmd_free_pte_page(pmd, addr, addr + PMD_SIZE);
			pmd_set_huge(...) //without fail
			continue;
	}


> 
> Same for pud.
> 
> Thanks,
> -Toshi
> 

Chintan
Kani, Toshi March 16, 2018, 2:50 p.m. UTC | #10
On Fri, 2018-03-16 at 13:10 +0530, Chintan Pandya wrote:
> 
> On 3/15/2018 9:42 PM, Kani, Toshi wrote:
> > On Thu, 2018-03-15 at 18:15 +0530, Chintan Pandya wrote:
> > > Huge mapping changes PMD/PUD which could have
> > > valid previous entries. This requires proper
> > > TLB maintenance on some architectures, like
> > > ARM64.
> > > 
> > > Implement BBM (break-before-make) safe TLB
> > > invalidation.
> > > 
> > > Here, I've used flush_tlb_pgtable() instead
> > > of flush_kernel_range() because invalidating
> > > intermediate page_table entries could have
> > > been optimized for specific arch. That's the
> > > case with ARM64 at least.
> > > 
> > > Signed-off-by: Chintan Pandya <cpandya@codeaurora.org>
> > > ---
> > >   lib/ioremap.c | 25 +++++++++++++++++++------
> > >   1 file changed, 19 insertions(+), 6 deletions(-)
> > > 
> > > diff --git a/lib/ioremap.c b/lib/ioremap.c
> > > index 54e5bba..55f8648 100644
> > > --- a/lib/ioremap.c
> > > +++ b/lib/ioremap.c
> > > @@ -13,6 +13,7 @@
> > >   #include <linux/export.h>
> > >   #include <asm/cacheflush.h>
> > >   #include <asm/pgtable.h>
> > > +#include <asm-generic/tlb.h>
> > >   
> > >   #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
> > >   static int __read_mostly ioremap_p4d_capable;
> > > @@ -80,6 +81,7 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
> > >   		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
> > >   {
> > >   	pmd_t *pmd;
> > > +	pmd_t old_pmd;
> > >   	unsigned long next;
> > >   
> > >   	phys_addr -= addr;
> > > @@ -91,10 +93,15 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
> > >   
> > >   		if (ioremap_pmd_enabled() &&
> > >   		    ((next - addr) == PMD_SIZE) &&
> > > -		    IS_ALIGNED(phys_addr + addr, PMD_SIZE) &&
> > > -		    pmd_free_pte_page(pmd)) {
> > > -			if (pmd_set_huge(pmd, phys_addr + addr, prot))
> > > +		    IS_ALIGNED(phys_addr + addr, PMD_SIZE)) {
> > > +			old_pmd = *pmd;
> > > +			pmd_clear(pmd);
> > 
> > pmd_clear() is one of the operations pmd_free_pte_page() needs to do.
> > See the x86 version.
> > 
> > > +			flush_tlb_pgtable(&init_mm, addr);
> > 
> > You can call it in pmd_free_pte_page() on arm64 as well.
> > 
> > > +			if (pmd_set_huge(pmd, phys_addr + addr, prot)) {
> > > +				pmd_free_pte_page(&old_pmd);
> > >   				continue;
> > > +			} else
> > > +				set_pmd(pmd, old_pmd);
> > 
> > I do not understand why you needed to make this change.
> > pmd_free_pte_page() is defined as an arch-specific function so that you
> > can additionally perform TLB purges on arm64.  Please try to make a proper
> > arm64 implementation of this interface.  And if you find any issue in
> > this interface, please let me know.
> 
> TLB ops require a VA at least, and this interface passes just the PMD/PUD.

You can add 'addr' as the 2nd arg.  Such a minor tweak is expected when
implementing on multiple arches.

> Second, if we clear the previous table entry inside the arch-specific
> code and then fail in pmd/pud_set_huge(), we can't fall back (the x86 case).
> 
> So, we can do something like this (following Mark's suggestion),
> 
> 	if (ioremap_pmd_enabled() &&
>          	((next - addr) == PMD_SIZE) &&
> 		IS_ALIGNED(phys_addr + addr, PMD_SIZE) &&
> 		pmd_can_set_huge(pmd, phys_addr + addr, prot)) {
> 			/*
> 			 * Clear existing table entry,
> 			 * Invalidate,
> 			 * Free the page table
> 			 * inside this code
> 			 */
> 			pmd_free_pte_page(pmd, addr, addr + PMD_SIZE);
> 			pmd_set_huge(...) //without fail
> 			continue;
> 	}

That's not necessary.  pmd being none is a legitimate state.  In fact,
it is the case when pmd_alloc() allocated and populated a new pmd.

Thanks,
-Toshi
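
Combining the two suggestions ('addr' as a second argument, and the
clear/invalidate/free sequence moved inside the arch hook), an arm64
pmd_free_pte_page() could look roughly like the sketch below. This is
the direction the thread converges on, not the final merged code:

	int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr)
	{
		pte_t *table;

		if (pmd_none(*pmdp))
			return 1;	/* already empty: nothing to free */

		table = pte_offset_kernel(pmdp, addr);
		pmd_clear(pmdp);
		/* BBM: invalidate before the caller installs the huge entry */
		flush_tlb_kernel_range(addr, addr + PMD_SIZE);
		pte_free_kernel(NULL, table);

		return 1;
	}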
Chintan Pandya March 19, 2018, 4:26 a.m. UTC | #11
On 3/16/2018 8:20 PM, Kani, Toshi wrote:
> On Fri, 2018-03-16 at 13:10 +0530, Chintan Pandya wrote:
>>
>> On 3/15/2018 9:42 PM, Kani, Toshi wrote:
>>> On Thu, 2018-03-15 at 18:15 +0530, Chintan Pandya wrote:
>>>> Huge mapping changes PMD/PUD which could have
>>>> valid previous entries. This requires proper
>>>> TLB maintenance on some architectures, like
>>>> ARM64.
>>>>
>>>> Implement BBM (break-before-make) safe TLB
>>>> invalidation.
>>>>
>>>> Here, I've used flush_tlb_pgtable() instead
>>>> of flush_kernel_range() because invalidating
>>>> intermediate page_table entries could have
>>>> been optimized for specific arch. That's the
>>>> case with ARM64 at least.
>>>>
>>>> Signed-off-by: Chintan Pandya <cpandya@codeaurora.org>
>>>> ---
>>>>    lib/ioremap.c | 25 +++++++++++++++++++------
>>>>    1 file changed, 19 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/lib/ioremap.c b/lib/ioremap.c
>>>> index 54e5bba..55f8648 100644
>>>> --- a/lib/ioremap.c
>>>> +++ b/lib/ioremap.c
>>>> @@ -13,6 +13,7 @@
>>>>    #include <linux/export.h>
>>>>    #include <asm/cacheflush.h>
>>>>    #include <asm/pgtable.h>
>>>> +#include <asm-generic/tlb.h>
>>>>    
>>>>    #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
>>>>    static int __read_mostly ioremap_p4d_capable;
>>>> @@ -80,6 +81,7 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
>>>>    		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
>>>>    {
>>>>    	pmd_t *pmd;
>>>> +	pmd_t old_pmd;
>>>>    	unsigned long next;
>>>>    
>>>>    	phys_addr -= addr;
>>>> @@ -91,10 +93,15 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
>>>>    
>>>>    		if (ioremap_pmd_enabled() &&
>>>>    		    ((next - addr) == PMD_SIZE) &&
>>>> -		    IS_ALIGNED(phys_addr + addr, PMD_SIZE) &&
>>>> -		    pmd_free_pte_page(pmd)) {
>>>> -			if (pmd_set_huge(pmd, phys_addr + addr, prot))
>>>> +		    IS_ALIGNED(phys_addr + addr, PMD_SIZE)) {
>>>> +			old_pmd = *pmd;
>>>> +			pmd_clear(pmd);
>>>
>>> pmd_clear() is one of the operations pmd_free_pte_page() needs to do.
>>> See the x86 version.
>>>
>>>> +			flush_tlb_pgtable(&init_mm, addr);
>>>
>>> You can call it in pmd_free_pte_page() on arm64 as well.
>>>
>>>> +			if (pmd_set_huge(pmd, phys_addr + addr, prot)) {
>>>> +				pmd_free_pte_page(&old_pmd);
>>>>    				continue;
>>>> +			} else
>>>> +				set_pmd(pmd, old_pmd);
>>>
>>> I do not understand why you needed to make this change.
>>> pmd_free_pte_page() is defined as an arch-specific function so that you
>>> can additionally perform TLB purges on arm64.  Please try to make a proper
>>> arm64 implementation of this interface.  And if you find any issue in
>>> this interface, please let me know.
>>
>> TLB ops require a VA at least, and this interface passes just the PMD/PUD.
> 
> You can add 'addr' as the 2nd arg.  Such a minor tweak is expected when
> implementing on multiple arches.
> 
>> Second, if we clear the previous table entry inside the arch-specific
>> code and then fail in pmd/pud_set_huge(), we can't fall back (the x86 case).
>>
>> So, we can do something like this (following Mark's suggestion),
>>
>> 	if (ioremap_pmd_enabled() &&
>>           	((next - addr) == PMD_SIZE) &&
>> 		IS_ALIGNED(phys_addr + addr, PMD_SIZE) &&
>> 		pmd_can_set_huge(pmd, phys_addr + addr, prot)) {
>> 			/*
>> 			 * Clear existing table entry,
>> 			 * Invalidate,
>> 			 * Free the page table
>> 			 * inside this code
>> 			 */
>> 			pmd_free_pte_page(pmd, addr, addr + PMD_SIZE);
>> 			pmd_set_huge(...) //without fail
>> 			continue;
>> 	}
> 
> That's not necessary.  pmd being none is a legitimate state.  In fact,
> it is the case when pmd_alloc() allocated and populated a new pmd.

Alright. I'll send v3 today.

> 
> Thanks,
> -Toshi
> 

Chintan

Patch

diff --git a/lib/ioremap.c b/lib/ioremap.c
index 54e5bba..55f8648 100644
--- a/lib/ioremap.c
+++ b/lib/ioremap.c
@@ -13,6 +13,7 @@ 
 #include <linux/export.h>
 #include <asm/cacheflush.h>
 #include <asm/pgtable.h>
+#include <asm-generic/tlb.h>
 
 #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
 static int __read_mostly ioremap_p4d_capable;
@@ -80,6 +81,7 @@  static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
 		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
 {
 	pmd_t *pmd;
+	pmd_t old_pmd;
 	unsigned long next;
 
 	phys_addr -= addr;
@@ -91,10 +93,15 @@  static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
 
 		if (ioremap_pmd_enabled() &&
 		    ((next - addr) == PMD_SIZE) &&
-		    IS_ALIGNED(phys_addr + addr, PMD_SIZE) &&
-		    pmd_free_pte_page(pmd)) {
-			if (pmd_set_huge(pmd, phys_addr + addr, prot))
+		    IS_ALIGNED(phys_addr + addr, PMD_SIZE)) {
+			old_pmd = *pmd;
+			pmd_clear(pmd);
+			flush_tlb_pgtable(&init_mm, addr);
+			if (pmd_set_huge(pmd, phys_addr + addr, prot)) {
+				pmd_free_pte_page(&old_pmd);
 				continue;
+			} else
+				set_pmd(pmd, old_pmd);
 		}
 
 		if (ioremap_pte_range(pmd, addr, next, phys_addr + addr, prot))
@@ -107,6 +114,7 @@  static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,
 		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
 {
 	pud_t *pud;
+	pud_t old_pud;
 	unsigned long next;
 
 	phys_addr -= addr;
@@ -118,10 +126,15 @@  static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,
 
 		if (ioremap_pud_enabled() &&
 		    ((next - addr) == PUD_SIZE) &&
-		    IS_ALIGNED(phys_addr + addr, PUD_SIZE) &&
-		    pud_free_pmd_page(pud)) {
-			if (pud_set_huge(pud, phys_addr + addr, prot))
+		    IS_ALIGNED(phys_addr + addr, PUD_SIZE)) {
+			old_pud = *pud;
+			pud_clear(pud);
+			flush_tlb_pgtable(&init_mm, addr);
+			if (pud_set_huge(pud, phys_addr + addr, prot)) {
+				pud_free_pmd_page(&old_pud);
 				continue;
+			} else
+				set_pud(pud, old_pud);
 		}
 
 		if (ioremap_pmd_range(pud, addr, next, phys_addr + addr, prot))