[RFC,v4,2/2] arm64: tlb: Use the TLBI RANGE feature in arm64
diff mbox series

Message ID 20200601144713.2222-3-yezhenyu2@huawei.com
State New
Headers show
Series
  • [RFC,v4,1/2] arm64: tlb: Detect the ARMv8.4 TLBI RANGE feature
Related show

Commit Message

Zhenyu Ye June 1, 2020, 2:47 p.m. UTC
Add __TLBI_VADDR_RANGE macro and rewrite __flush_tlb_range().

In this patch, we only use the TLBI RANGE feature if the stride == PAGE_SIZE,
because when stride > PAGE_SIZE, usually only a small number of pages need
to be flushed and classic tlbi intructions are more effective.

We can also use 'end - start < threshold number' to decide which way
to go, however, different hardware may have different thresholds, so
I'm not sure if this is feasible.

Signed-off-by: Zhenyu Ye <yezhenyu2@huawei.com>
---
 arch/arm64/include/asm/tlbflush.h | 98 +++++++++++++++++++++++++++----
 1 file changed, 86 insertions(+), 12 deletions(-)

Comments

Zhenyu Ye June 2, 2020, 12:06 p.m. UTC | #1
Hi all,

Some optimizations to the codes:

On 2020/6/1 22:47, Zhenyu Ye wrote:
> -	start = __TLBI_VADDR(start, asid);
> -	end = __TLBI_VADDR(end, asid);
> +	/*
> +	 * The minimum size of TLB RANGE is 2 pages;
> +	 * Use normal TLB instruction to handle odd pages.
> +	 * If the stride != PAGE_SIZE, this will never happen.
> +	 */
> +	if (range_pages % 2 == 1) {
> +		addr = __TLBI_VADDR(start, asid);
> +		__tlbi_last_level(vale1is, vae1is, addr, last_level);
> +		start += 1 << PAGE_SHIFT;
> +		range_pages >>= 1;
> +	}
>  

We flush a single page here, and below loop does the same thing
if cpu not support TLB RANGE feature.  So may we use a goto statement
to simplify the code.

> +	while (range_pages > 0) {
> +		if (cpus_have_const_cap(ARM64_HAS_TLBI_RANGE) &&
> +		    stride == PAGE_SIZE) {
> +			num = (range_pages & TLB_RANGE_MASK) - 1;
> +			if (num >= 0) {
> +				addr = __TLBI_VADDR_RANGE(start, asid, scale,
> +							  num, 0);
> +				__tlbi_last_level(rvale1is, rvae1is, addr,
> +						  last_level);
> +				start += __TLBI_RANGE_SIZES(num, scale);
> +			}
> +			scale++;
> +			range_pages >>= TLB_RANGE_MASK_SHIFT;
> +			continue;
>  		}
> +
> +		addr = __TLBI_VADDR(start, asid);
> +		__tlbi_last_level(vale1is, vae1is, addr, last_level);
> +		start += stride;
> +		range_pages -= stride >> 12;
>  	}
>  	dsb(ish);
>  }
> 

Just like:

--8<---
	if (range_pages %2 == 1)
		goto flush_single_tlb;

	while (range_pages > 0) {
		if (cpus_have_const_cap(ARM64_HAS_TLBI_RANGE) &&
		    stride == PAGE_SIZE) {
			num = ((range_pages >> 1) & TLB_RANGE_MASK) - 1;
			if (num >= 0) {
				addr = __TLBI_VADDR_RANGE(start, asid, scale,
							  num, 0);
				__tlbi_last_level(rvale1is, rvae1is, addr,
						  last_level);
				start += __TLBI_RANGE_SIZES(num, scale);
			}
			scale++;
			range_pages >>= TLB_RANGE_MASK_SHIFT;
			continue;
		}

flush_single_tlb:
		addr = __TLBI_VADDR(start, asid);
		__tlbi_last_level(vale1is, vae1is, addr, last_level);
		start += stride;
		range_pages -= stride >> PAGE_SHIFT;
	}
--8<---
Catalin Marinas July 7, 2020, 5:36 p.m. UTC | #2
On Mon, Jun 01, 2020 at 10:47:13PM +0800, Zhenyu Ye wrote:
> @@ -59,6 +69,47 @@
>  		__ta;						\
>  	})
>  
> +/*
> + * __TG defines translation granule of the system, which is decided by
> + * PAGE_SHIFT.  Used by TTL.
> + *  - 4KB	: 1
> + *  - 16KB	: 2
> + *  - 64KB	: 3
> + */
> +#define __TG	((PAGE_SHIFT - 12) / 2 + 1)

Nitpick: maybe something like __TLBI_TG to avoid clashes in case someone
else defines a __TG macro.

> @@ -181,32 +232,55 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma,
>  				     unsigned long start, unsigned long end,
>  				     unsigned long stride, bool last_level)
>  {
> +	int num = 0;
> +	int scale = 0;
>  	unsigned long asid = ASID(vma->vm_mm);
>  	unsigned long addr;
> +	unsigned long range_pages;
>  
>  	start = round_down(start, stride);
>  	end = round_up(end, stride);
> +	range_pages = (end - start) >> PAGE_SHIFT;
>  
>  	if ((end - start) >= (MAX_TLBI_OPS * stride)) {
>  		flush_tlb_mm(vma->vm_mm);
>  		return;
>  	}
>  
> -	/* Convert the stride into units of 4k */
> -	stride >>= 12;
> +	dsb(ishst);
>  
> -	start = __TLBI_VADDR(start, asid);
> -	end = __TLBI_VADDR(end, asid);
> +	/*
> +	 * The minimum size of TLB RANGE is 2 pages;
> +	 * Use normal TLB instruction to handle odd pages.
> +	 * If the stride != PAGE_SIZE, this will never happen.
> +	 */
> +	if (range_pages % 2 == 1) {
> +		addr = __TLBI_VADDR(start, asid);
> +		__tlbi_last_level(vale1is, vae1is, addr, last_level);
> +		start += 1 << PAGE_SHIFT;
> +		range_pages >>= 1;
> +	}

Shouldn't this be range_pages-- or -= stride >> 12? Your goto follow-up
fixes this, though I'm not a big fan of gotos jumping in the middle of a
loop.

> -	dsb(ishst);
> -	for (addr = start; addr < end; addr += stride) {
> -		if (last_level) {
> -			__tlbi(vale1is, addr);
> -			__tlbi_user(vale1is, addr);
> -		} else {
> -			__tlbi(vae1is, addr);
> -			__tlbi_user(vae1is, addr);
> +	while (range_pages > 0) {
> +		if (cpus_have_const_cap(ARM64_HAS_TLBI_RANGE) &&
> +		    stride == PAGE_SIZE) {

I think we could have the odd range_pages check here:

		if (cpus_have_const_cap(ARM64_HAS_TLBI_RANGE) &&
		    stride == PAGE_SIZE && range_pages % 2 == 0) {

and avoid the one outside the loop.

> +			num = (range_pages & TLB_RANGE_MASK) - 1;
> +			if (num >= 0) {
> +				addr = __TLBI_VADDR_RANGE(start, asid, scale,
> +							  num, 0);
> +				__tlbi_last_level(rvale1is, rvae1is, addr,
> +						  last_level);
> +				start += __TLBI_RANGE_SIZES(num, scale);
> +			}
> +			scale++;
> +			range_pages >>= TLB_RANGE_MASK_SHIFT;
> +			continue;
>  		}
> +
> +		addr = __TLBI_VADDR(start, asid);
> +		__tlbi_last_level(vale1is, vae1is, addr, last_level);
> +		start += stride;
> +		range_pages -= stride >> 12;
>  	}
>  	dsb(ish);
>  }
> -- 
> 2.19.1
Marc Zyngier July 7, 2020, 5:43 p.m. UTC | #3
On 2020-07-07 18:36, Catalin Marinas wrote:
> On Mon, Jun 01, 2020 at 10:47:13PM +0800, Zhenyu Ye wrote:
>> @@ -59,6 +69,47 @@
>>  		__ta;						\
>>  	})
>> 
>> +/*
>> + * __TG defines translation granule of the system, which is decided 
>> by
>> + * PAGE_SHIFT.  Used by TTL.
>> + *  - 4KB	: 1
>> + *  - 16KB	: 2
>> + *  - 64KB	: 3
>> + */
>> +#define __TG	((PAGE_SHIFT - 12) / 2 + 1)
> 
> Nitpick: maybe something like __TLBI_TG to avoid clashes in case 
> someone
> else defines a __TG macro.

I have commented on this in the past, and still maintain that this
would be better served by a switch statement similar to what is used
for TTL already (I don't think this magic formula exists in the
ARM ARM).

         M.
Catalin Marinas July 7, 2020, 5:46 p.m. UTC | #4
On Tue, Jul 07, 2020 at 06:43:35PM +0100, Marc Zyngier wrote:
> On 2020-07-07 18:36, Catalin Marinas wrote:
> > On Mon, Jun 01, 2020 at 10:47:13PM +0800, Zhenyu Ye wrote:
> > > @@ -59,6 +69,47 @@
> > >  		__ta;						\
> > >  	})
> > > 
> > > +/*
> > > + * __TG defines translation granule of the system, which is decided
> > > by
> > > + * PAGE_SHIFT.  Used by TTL.
> > > + *  - 4KB	: 1
> > > + *  - 16KB	: 2
> > > + *  - 64KB	: 3
> > > + */
> > > +#define __TG	((PAGE_SHIFT - 12) / 2 + 1)
> > 
> > Nitpick: maybe something like __TLBI_TG to avoid clashes in case someone
> > else defines a __TG macro.
> 
> I have commented on this in the past, and still maintain that this
> would be better served by a switch statement similar to what is used
> for TTL already (I don't think this magic formula exists in the
> ARM ARM).

Good point, it would be cleaner indeed.
Robin Murphy July 7, 2020, 6:12 p.m. UTC | #5
On 2020-07-07 18:46, Catalin Marinas wrote:
> On Tue, Jul 07, 2020 at 06:43:35PM +0100, Marc Zyngier wrote:
>> On 2020-07-07 18:36, Catalin Marinas wrote:
>>> On Mon, Jun 01, 2020 at 10:47:13PM +0800, Zhenyu Ye wrote:
>>>> @@ -59,6 +69,47 @@
>>>>   		__ta;						\
>>>>   	})
>>>>
>>>> +/*
>>>> + * __TG defines translation granule of the system, which is decided
>>>> by
>>>> + * PAGE_SHIFT.  Used by TTL.
>>>> + *  - 4KB	: 1
>>>> + *  - 16KB	: 2
>>>> + *  - 64KB	: 3
>>>> + */
>>>> +#define __TG	((PAGE_SHIFT - 12) / 2 + 1)
>>>
>>> Nitpick: maybe something like __TLBI_TG to avoid clashes in case someone
>>> else defines a __TG macro.
>>
>> I have commented on this in the past, and still maintain that this
>> would be better served by a switch statement similar to what is used
>> for TTL already (I don't think this magic formula exists in the
>> ARM ARM).
> 
> Good point, it would be cleaner indeed.

FWIW, we use the somewhat obtuse "(shift - 10) / 2" in the SMMUv3 
driver, but that's because we support multiple different granules at 
runtime and want to generate efficient code. Anything based on 
PAGE_SHIFT that resolves to a compile-time constant has no excuse for 
not being written in a clear and obvious manner ;)

Robin.
Zhenyu Ye July 8, 2020, 9:01 a.m. UTC | #6
Hi Catalin,

On 2020/7/8 1:36, Catalin Marinas wrote:
> On Mon, Jun 01, 2020 at 10:47:13PM +0800, Zhenyu Ye wrote:
>> @@ -59,6 +69,47 @@
>>  		__ta;						\
>>  	})
>>  
>> +/*
>> + * __TG defines translation granule of the system, which is decided by
>> + * PAGE_SHIFT.  Used by TTL.
>> + *  - 4KB	: 1
>> + *  - 16KB	: 2
>> + *  - 64KB	: 3
>> + */
>> +#define __TG	((PAGE_SHIFT - 12) / 2 + 1)
> 
> Nitpick: maybe something like __TLBI_TG to avoid clashes in case someone
> else defines a __TG macro.
> 

Thanks for your review. According to Marc and Robin's suggestion, I will remove this
macro.
I'd like implement this in a function beacause it's used in both TTL and TLB RANGE.

>> -	for (addr = start; addr < end; addr += stride) {
>> -		if (last_level) {
>> -			__tlbi(vale1is, addr);
>> -			__tlbi_user(vale1is, addr);
>> -		} else {
>> -			__tlbi(vae1is, addr);
>> -			__tlbi_user(vae1is, addr);
>> +	while (range_pages > 0) {
>> +		if (cpus_have_const_cap(ARM64_HAS_TLBI_RANGE) &&
>> +		    stride == PAGE_SIZE) {
> 
> I think we could have the odd range_pages check here:
> 
> 		if (cpus_have_const_cap(ARM64_HAS_TLBI_RANGE) &&
> 		    stride == PAGE_SIZE && range_pages % 2 == 0) {
> 
> and avoid the one outside the loop.
> 

This may need some other necessary changes to do this. See in next version series.

Thanks,
Zhenyu

Patch
diff mbox series

diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index bc3949064725..818f27c82024 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -50,6 +50,16 @@ 
 		__tlbi(op, (arg) | USER_ASID_FLAG);				\
 } while (0)
 
+#define __tlbi_last_level(op1, op2, arg, last_level) do {		\
+	if (last_level)	{						\
+		__tlbi(op1, arg);					\
+		__tlbi_user(op1, arg);					\
+	} else {							\
+		__tlbi(op2, arg);					\
+		__tlbi_user(op2, arg);					\
+	}								\
+} while (0)
+
 /* This macro creates a properly formatted VA operand for the TLBI */
 #define __TLBI_VADDR(addr, asid)				\
 	({							\
@@ -59,6 +69,47 @@ 
 		__ta;						\
 	})
 
+/*
+ * __TG defines translation granule of the system, which is decided by
+ * PAGE_SHIFT.  Used by TTL.
+ *  - 4KB	: 1
+ *  - 16KB	: 2
+ *  - 64KB	: 3
+ */
+#define __TG	((PAGE_SHIFT - 12) / 2 + 1)
+
+/*
+ * This macro creates a properly formatted VA operand for the TLBI RANGE.
+ * The value bit assignments are:
+ *
+ * +----------+------+-------+-------+-------+----------------------+
+ * |   ASID   |  TG  | SCALE |  NUM  |  TTL  |        BADDR         |
+ * +-----------------+-------+-------+-------+----------------------+
+ * |63      48|47  46|45   44|43   39|38   37|36                   0|
+ *
+ * The address range is determined by below formula:
+ * [BADDR, BADDR + (NUM + 1) * 2^(5*SCALE + 1) * PAGESIZE)
+ *
+ */
+#define __TLBI_VADDR_RANGE(addr, asid, scale, num, ttl)		\
+	({							\
+		unsigned long __ta = (addr) >> PAGE_SHIFT;	\
+		__ta &= GENMASK_ULL(36, 0);			\
+		__ta |= (unsigned long)(ttl) << 37;		\
+		__ta |= (unsigned long)(num) << 39;		\
+		__ta |= (unsigned long)(scale) << 44;		\
+		__ta |= (unsigned long)(__TG) << 46;		\
+		__ta |= (unsigned long)(asid) << 48;		\
+		__ta;						\
+	})
+
+/* This macro defines the range pages of the TLBI RANGE. */
+#define __TLBI_RANGE_SIZES(num, scale)	((num + 1) << (5 * scale + 1) << PAGE_SHIFT)
+
+#define TLB_RANGE_MASK_SHIFT 5
+#define TLB_RANGE_MASK GENMASK_ULL(TLB_RANGE_MASK_SHIFT - 1, 0)
+
+
 /*
  *	TLB Invalidation
  *	================
@@ -181,32 +232,55 @@  static inline void __flush_tlb_range(struct vm_area_struct *vma,
 				     unsigned long start, unsigned long end,
 				     unsigned long stride, bool last_level)
 {
+	int num = 0;
+	int scale = 0;
 	unsigned long asid = ASID(vma->vm_mm);
 	unsigned long addr;
+	unsigned long range_pages;
 
 	start = round_down(start, stride);
 	end = round_up(end, stride);
+	range_pages = (end - start) >> PAGE_SHIFT;
 
 	if ((end - start) >= (MAX_TLBI_OPS * stride)) {
 		flush_tlb_mm(vma->vm_mm);
 		return;
 	}
 
-	/* Convert the stride into units of 4k */
-	stride >>= 12;
+	dsb(ishst);
 
-	start = __TLBI_VADDR(start, asid);
-	end = __TLBI_VADDR(end, asid);
+	/*
+	 * The minimum size of TLB RANGE is 2 pages;
+	 * Use normal TLB instruction to handle odd pages.
+	 * If the stride != PAGE_SIZE, this will never happen.
+	 */
+	if (range_pages % 2 == 1) {
+		addr = __TLBI_VADDR(start, asid);
+		__tlbi_last_level(vale1is, vae1is, addr, last_level);
+		start += 1 << PAGE_SHIFT;
+		range_pages >>= 1;
+	}
 
-	dsb(ishst);
-	for (addr = start; addr < end; addr += stride) {
-		if (last_level) {
-			__tlbi(vale1is, addr);
-			__tlbi_user(vale1is, addr);
-		} else {
-			__tlbi(vae1is, addr);
-			__tlbi_user(vae1is, addr);
+	while (range_pages > 0) {
+		if (cpus_have_const_cap(ARM64_HAS_TLBI_RANGE) &&
+		    stride == PAGE_SIZE) {
+			num = (range_pages & TLB_RANGE_MASK) - 1;
+			if (num >= 0) {
+				addr = __TLBI_VADDR_RANGE(start, asid, scale,
+							  num, 0);
+				__tlbi_last_level(rvale1is, rvae1is, addr,
+						  last_level);
+				start += __TLBI_RANGE_SIZES(num, scale);
+			}
+			scale++;
+			range_pages >>= TLB_RANGE_MASK_SHIFT;
+			continue;
 		}
+
+		addr = __TLBI_VADDR(start, asid);
+		__tlbi_last_level(vale1is, vae1is, addr, last_level);
+		start += stride;
+		range_pages -= stride >> 12;
 	}
 	dsb(ish);
 }