[v2,2/2] arm64: tlb: Use the TLBI RANGE feature in arm64

Message ID 20200710094420.517-3-yezhenyu2@huawei.com (mailing list archive)
State New, archived
Series arm64: tlb: add support for TLBI RANGE instructions

Commit Message

Zhenyu Ye July 10, 2020, 9:44 a.m. UTC
Add __TLBI_VADDR_RANGE macro and rewrite __flush_tlb_range().

When the CPU supports the TLBI RANGE feature, the minimum range
granularity is determined by 'scale', so we cannot always flush all
pages with one instruction.

For example, when pages = 0xe81a, let's start 'scale' from the
maximum and find the right 'num' for each 'scale':

1. scale = 3, we can flush no pages because the minimum range is
   2^(5*3 + 1) = 0x10000.
2. scale = 2, the minimum range is 2^(5*2 + 1) = 0x800, so we can
   flush 0xe800 pages this time; num = 0xe800/0x800 - 1 = 0x1c.
   The remaining pages are 0x1a.
3. scale = 1, the minimum range is 2^(5*1 + 1) = 0x40, so no page
   can be flushed.
4. scale = 0, we flush the remaining 0x1a pages; num =
   0x1a/0x2 - 1 = 0xc.

However, in most scenarios, pages = 1 when flush_tlb_range() is
called. Starting from scale = 3, or from some other value (such as
scale = ilog2(pages)), would incur extra overhead.
So we increase 'scale' from 0 to the maximum instead; the flush order
is exactly the opposite of the example above.
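
For illustration (not part of the patch), the decomposition above can be
reproduced with a small stand-alone sketch, assuming stride == PAGE_SIZE:

#include <stdio.h>

#define TLBI_RANGE_MASK			0x1fUL
#define __TLBI_RANGE_NUM(pages, scale)	\
	(((pages) >> (5 * (scale) + 1)) & TLBI_RANGE_MASK)
#define __TLBI_RANGE_PAGES(num, scale)	(((num) + 1UL) << (5 * (scale) + 1))

int main(void)
{
	unsigned long pages = 0xe81a;
	int scale = 0;

	while (pages > 0) {
		/* odd number of pages: flush one page without RANGE */
		if (pages % 2 == 1) {
			printf("flush 1 page\n");
			pages -= 1;
			continue;
		}

		long num = (long)__TLBI_RANGE_NUM(pages, scale) - 1;
		if (num >= 0) {
			printf("scale = %d, num = %#lx -> %#lx pages\n",
			       scale, (unsigned long)num,
			       __TLBI_RANGE_PAGES(num, scale));
			pages -= __TLBI_RANGE_PAGES(num, scale);
		}
		scale++;
	}
	return 0;
}

Running it prints the scale = 0 (num = 0xc, 0x1a pages) and scale = 2
(num = 0x1c, 0xe800 pages) steps, i.e. the same ranges as in the example,
only in the opposite order.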

Signed-off-by: Zhenyu Ye <yezhenyu2@huawei.com>
---
 arch/arm64/include/asm/tlbflush.h | 138 +++++++++++++++++++++++-------
 1 file changed, 109 insertions(+), 29 deletions(-)

Comments

Catalin Marinas July 10, 2020, 6:31 p.m. UTC | #1
On Fri, Jul 10, 2020 at 05:44:20PM +0800, Zhenyu Ye wrote:
> Add __TLBI_VADDR_RANGE macro and rewrite __flush_tlb_range().
> 
> When cpu supports TLBI feature, the minimum range granularity is
> decided by 'scale', so we can not flush all pages by one instruction
> in some cases.
> 
> For example, when the pages = 0xe81a, let's start 'scale' from
> maximum, and find right 'num' for each 'scale':
> 
> 1. scale = 3, we can flush no pages because the minimum range is
>    2^(5*3 + 1) = 0x10000.
> 2. scale = 2, the minimum range is 2^(5*2 + 1) = 0x800, we can
>    flush 0xe800 pages this time, the num = 0xe800/0x800 - 1 = 0x1c.
>    Remaining pages is 0x1a;
> 3. scale = 1, the minimum range is 2^(5*1 + 1) = 0x40, no page
>    can be flushed.
> 4. scale = 0, we flush the remaining 0x1a pages, the num =
>    0x1a/0x2 - 1 = 0xd.
> 
> However, in most scenarios, the pages = 1 when flush_tlb_range() is
> called. Start from scale = 3 or other proper value (such as scale =
> ilog2(pages)), will incur extra overhead.
> So increase 'scale' from 0 to maximum, the flush order is exactly
> opposite to the example.
> 
> Signed-off-by: Zhenyu Ye <yezhenyu2@huawei.com>
> ---
>  arch/arm64/include/asm/tlbflush.h | 138 +++++++++++++++++++++++-------
>  1 file changed, 109 insertions(+), 29 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
> index 39aed2efd21b..edfec8139ef8 100644
> --- a/arch/arm64/include/asm/tlbflush.h
> +++ b/arch/arm64/include/asm/tlbflush.h
> @@ -60,6 +60,31 @@
>  		__ta;						\
>  	})
>  
> +/*
> + * Get translation granule of the system, which is decided by
> + * PAGE_SIZE.  Used by TTL.
> + *  - 4KB	: 1
> + *  - 16KB	: 2
> + *  - 64KB	: 3
> + */
> +#define TLBI_TTL_TG_4K		1
> +#define TLBI_TTL_TG_16K		2
> +#define TLBI_TTL_TG_64K		3
> +
> +static inline unsigned long get_trans_granule(void)
> +{
> +	switch (PAGE_SIZE) {
> +	case SZ_4K:
> +		return TLBI_TTL_TG_4K;
> +	case SZ_16K:
> +		return TLBI_TTL_TG_16K;
> +	case SZ_64K:
> +		return TLBI_TTL_TG_64K;
> +	default:
> +		return 0;
> +	}
> +}
> +
>  /*
>   * Level-based TLBI operations.
>   *
> @@ -73,9 +98,6 @@
>   * in asm/stage2_pgtable.h.
>   */
>  #define TLBI_TTL_MASK		GENMASK_ULL(47, 44)
> -#define TLBI_TTL_TG_4K		1
> -#define TLBI_TTL_TG_16K		2
> -#define TLBI_TTL_TG_64K		3
>  
>  #define __tlbi_level(op, addr, level) do {				\
>  	u64 arg = addr;							\
> @@ -83,19 +105,7 @@
>  	if (cpus_have_const_cap(ARM64_HAS_ARMv8_4_TTL) &&		\
>  	    level) {							\
>  		u64 ttl = level & 3;					\
> -									\
> -		switch (PAGE_SIZE) {					\
> -		case SZ_4K:						\
> -			ttl |= TLBI_TTL_TG_4K << 2;			\
> -			break;						\
> -		case SZ_16K:						\
> -			ttl |= TLBI_TTL_TG_16K << 2;			\
> -			break;						\
> -		case SZ_64K:						\
> -			ttl |= TLBI_TTL_TG_64K << 2;			\
> -			break;						\
> -		}							\
> -									\
> +		ttl |= get_trans_granule() << 2;			\
>  		arg &= ~TLBI_TTL_MASK;					\
>  		arg |= FIELD_PREP(TLBI_TTL_MASK, ttl);			\
>  	}								\
> @@ -108,6 +118,39 @@
>  		__tlbi_level(op, (arg | USER_ASID_FLAG), level);	\
>  } while (0)
>  
> +/*
> + * This macro creates a properly formatted VA operand for the TLBI RANGE.
> + * The value bit assignments are:
> + *
> + * +----------+------+-------+-------+-------+----------------------+
> + * |   ASID   |  TG  | SCALE |  NUM  |  TTL  |        BADDR         |
> + * +-----------------+-------+-------+-------+----------------------+
> + * |63      48|47  46|45   44|43   39|38   37|36                   0|
> + *
> + * The address range is determined by below formula:
> + * [BADDR, BADDR + (NUM + 1) * 2^(5*SCALE + 1) * PAGESIZE)
> + *
> + */
> +#define __TLBI_VADDR_RANGE(addr, asid, scale, num, ttl)		\
> +	({							\
> +		unsigned long __ta = (addr) >> PAGE_SHIFT;	\
> +		__ta &= GENMASK_ULL(36, 0);			\
> +		__ta |= (unsigned long)(ttl & 3) << 37;		\
> +		__ta |= (unsigned long)(num & 31) << 39;	\
> +		__ta |= (unsigned long)(scale & 3) << 44;	\
> +		__ta |= (get_trans_granule() & 3) << 46;	\
> +		__ta |= (unsigned long)(asid) << 48;		\
> +		__ta;						\
> +	})

Nitpick: we don't need the additional masking here (e.g. ttl & 3) since
the values are capped anyway.

> +
> +/* These macros are used by the TLBI RANGE feature. */
> +#define __TLBI_RANGE_PAGES(num, scale)	(((num) + 1) << (5 * (scale) + 1))
> +#define MAX_TLBI_RANGE_PAGES		__TLBI_RANGE_PAGES(31, 3)
> +
> +#define TLBI_RANGE_MASK			GENMASK_ULL(4, 0)
> +#define __TLBI_RANGE_NUM(range, scale)	\
> +	(((range) >> (5 * (scale) + 1)) & TLBI_RANGE_MASK)
> +
>  /*
>   *	TLB Invalidation
>   *	================
> @@ -232,32 +275,69 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma,
>  				     unsigned long stride, bool last_level,
>  				     int tlb_level)
>  {
> +	int num = 0;
> +	int scale = 0;
>  	unsigned long asid = ASID(vma->vm_mm);
>  	unsigned long addr;
> +	unsigned long pages;
>  
>  	start = round_down(start, stride);
>  	end = round_up(end, stride);
> +	pages = (end - start) >> PAGE_SHIFT;
>  
> -	if ((end - start) >= (MAX_TLBI_OPS * stride)) {
> +	if ((!cpus_have_const_cap(ARM64_HAS_TLBI_RANGE) &&
> +	    (end - start) >= (MAX_TLBI_OPS * stride)) ||
> +	    pages >= MAX_TLBI_RANGE_PAGES) {
>  		flush_tlb_mm(vma->vm_mm);
>  		return;
>  	}

I think we can use strictly greater here rather than greater or equal.
MAX_TLBI_RANGE_PAGES can be encoded as num 31, scale 3.

>  
> -	/* Convert the stride into units of 4k */
> -	stride >>= 12;
> +	dsb(ishst);
>  
> -	start = __TLBI_VADDR(start, asid);
> -	end = __TLBI_VADDR(end, asid);
> +	/*
> +	 * When cpu does not support TLBI RANGE feature, we flush the tlb
> +	 * entries one by one at the granularity of 'stride'.
> +	 * When cpu supports the TLBI RANGE feature, then:
> +	 * 1. If pages is odd, flush the first page through non-RANGE
> +	 *    instruction;
> +	 * 2. For remaining pages: The minimum range granularity is decided
> +	 *    by 'scale', so we can not flush all pages by one instruction
> +	 *    in some cases.
> +	 *    Here, we start from scale = 0, flush corresponding pages
> +	 *    (from 2^(5*scale + 1) to 2^(5*(scale + 1) + 1)), and increase
> +	 *    it until no pages left.
> +	 */
> +	while (pages > 0) {

I did some simple checks on ((end - start) % stride) and it never
triggered. I had a slight worry that pages could become negative (and
we'd loop forever since it's unsigned long) for some mismatched stride
and flush size, but that doesn't seem to be the case.

> +		if (!cpus_have_const_cap(ARM64_HAS_TLBI_RANGE) ||
> +		    pages % 2 == 1) {
> +			addr = __TLBI_VADDR(start, asid);
> +			if (last_level) {
> +				__tlbi_level(vale1is, addr, tlb_level);
> +				__tlbi_user_level(vale1is, addr, tlb_level);
> +			} else {
> +				__tlbi_level(vae1is, addr, tlb_level);
> +				__tlbi_user_level(vae1is, addr, tlb_level);
> +			}
> +			start += stride;
> +			pages -= stride >> PAGE_SHIFT;
> +			continue;
> +		}
>  
> -	dsb(ishst);
> -	for (addr = start; addr < end; addr += stride) {
> -		if (last_level) {
> -			__tlbi_level(vale1is, addr, tlb_level);
> -			__tlbi_user_level(vale1is, addr, tlb_level);
> -		} else {
> -			__tlbi_level(vae1is, addr, tlb_level);
> -			__tlbi_user_level(vae1is, addr, tlb_level);
> +		num = __TLBI_RANGE_NUM(pages, scale) - 1;
> +		if (num >= 0) {
> +			addr = __TLBI_VADDR_RANGE(start, asid, scale,
> +						  num, tlb_level);
> +			if (last_level) {
> +				__tlbi(rvale1is, addr);
> +				__tlbi_user(rvale1is, addr);
> +			} else {
> +				__tlbi(rvae1is, addr);
> +				__tlbi_user(rvae1is, addr);
> +			}
> +			start += __TLBI_RANGE_PAGES(num, scale) << PAGE_SHIFT;
> +			pages -= __TLBI_RANGE_PAGES(num, scale);
>  		}
> +		scale++;
>  	}
>  	dsb(ish);

The logic looks fine to me now. I can fix the above nitpicks myself and
maybe adjust the comment a bit. I plan to push them into next to see if
anything explodes.

Thanks.
Zhenyu Ye July 11, 2020, 6:50 a.m. UTC | #2
Hi Catalin,

On 2020/7/11 2:31, Catalin Marinas wrote:
> On Fri, Jul 10, 2020 at 05:44:20PM +0800, Zhenyu Ye wrote:
>> -	if ((end - start) >= (MAX_TLBI_OPS * stride)) {
>> +	if ((!cpus_have_const_cap(ARM64_HAS_TLBI_RANGE) &&
>> +	    (end - start) >= (MAX_TLBI_OPS * stride)) ||
>> +	    pages >= MAX_TLBI_RANGE_PAGES) {
>>  		flush_tlb_mm(vma->vm_mm);
>>  		return;
>>  	}
> 
> I think we can use strictly greater here rather than greater or equal.
> MAX_TLBI_RANGE_PAGES can be encoded as num 31, scale 3.

Sorry, we can't.
For a boundary value (such as 2^6), there are two ways to express it
in the TLBI RANGE operations:
1. scale = 0, num = 31.
2. scale = 1, num = 0.

I used the second way in the following implementation.  However, for
MAX_TLBI_RANGE_PAGES, we can only use scale = 3, num = 31.
So if we use strictly greater here, an error will happen when the number
of pages is equal to MAX_TLBI_RANGE_PAGES.
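
A quick stand-alone check of the two encodings, using the macros from
this patch (illustrative only):

#include <assert.h>

#define __TLBI_RANGE_PAGES(num, scale)	(((num) + 1) << (5 * (scale) + 1))
#define MAX_TLBI_RANGE_PAGES		__TLBI_RANGE_PAGES(31, 3)

int main(void)
{
	/* 2^6 pages can be encoded either way ... */
	assert(__TLBI_RANGE_PAGES(31, 0) == 0x40);
	assert(__TLBI_RANGE_PAGES(0, 1) == 0x40);
	/* ... but the maximum (2^21 pages) only fits num = 31, scale = 3. */
	assert(MAX_TLBI_RANGE_PAGES == 0x200000);
	return 0;
}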

There are two ways to avoid this bug:
1. Just keep 'greater or equal' here.  The ARM64 specification does
not specify how we should flush TLB entries in this case; flush_tlb_mm()
is also a good choice for such a wide range of pages.
2. Add a check in the loop, like below (this may make the code a bit ugly):

	num = __TLBI_RANGE_NUM(pages, scale) - 1;

	/* scale = 4, num = 0 is equal to scale = 3, num = 31. */
	if (scale == 4 && num == 0) {
		scale = 3;
		num = 31;
	}

	if (num >= 0) {
	...

Which one do you prefer, and how do you want to fix this error? Should
I just send a fix patch again?

> 
>>  
>> -	/* Convert the stride into units of 4k */
>> -	stride >>= 12;
>> +	dsb(ishst);
>>  
>> -	start = __TLBI_VADDR(start, asid);
>> -	end = __TLBI_VADDR(end, asid);
>> +	/*
>> +	 * When cpu does not support TLBI RANGE feature, we flush the tlb
>> +	 * entries one by one at the granularity of 'stride'.
>> +	 * When cpu supports the TLBI RANGE feature, then:
>> +	 * 1. If pages is odd, flush the first page through non-RANGE
>> +	 *    instruction;
>> +	 * 2. For remaining pages: The minimum range granularity is decided
>> +	 *    by 'scale', so we can not flush all pages by one instruction
>> +	 *    in some cases.
>> +	 *    Here, we start from scale = 0, flush corresponding pages
>> +	 *    (from 2^(5*scale + 1) to 2^(5*(scale + 1) + 1)), and increase
>> +	 *    it until no pages left.
>> +	 */
>> +	while (pages > 0) {
> 
> I did some simple checks on ((end - start) % stride) and never
> triggered. I had a slight worry that pages could become negative (and
> we'd loop forever since it's unsigned long) for some mismatched stride
> and flush size. It doesn't seem like.
> 

The start and end are rounded down/up to the stride in the function:

	start = round_down(start, stride);
	end = round_up(end, stride);

So the flush size and stride will never mismatch.

Thanks,
Zhenyu
Catalin Marinas July 12, 2020, 12:03 p.m. UTC | #3
On Sat, Jul 11, 2020 at 02:50:46PM +0800, Zhenyu Ye wrote:
> On 2020/7/11 2:31, Catalin Marinas wrote:
> > On Fri, Jul 10, 2020 at 05:44:20PM +0800, Zhenyu Ye wrote:
> >> -	if ((end - start) >= (MAX_TLBI_OPS * stride)) {
> >> +	if ((!cpus_have_const_cap(ARM64_HAS_TLBI_RANGE) &&
> >> +	    (end - start) >= (MAX_TLBI_OPS * stride)) ||
> >> +	    pages >= MAX_TLBI_RANGE_PAGES) {
> >>  		flush_tlb_mm(vma->vm_mm);
> >>  		return;
> >>  	}
> > 
> > I think we can use strictly greater here rather than greater or equal.
> > MAX_TLBI_RANGE_PAGES can be encoded as num 31, scale 3.
> 
> Sorry, we can't.
> For a boundary value (such as 2^6), we have two way to express it
> in TLBI RANGE operations:
> 1. scale = 0, num = 31.
> 2. scale = 1, num = 0.
> 
> I used the second way in following implementation.  However, for the
> MAX_TLBI_RANGE_PAGES, we can only use scale = 3, num = 31.
> So if use strictly greater here, ERROR will happen when range pages
> equal to MAX_TLBI_RANGE_PAGES.

You are right, I got confused by the __TLBI_RANGE_NUM() macro which
doesn't return the actual 'num' for the TLBI argument as it would go
from 0 to 31. After subtracting 1, num ends up going from -1 to 30, so
we never get the maximum range. I think for scale 3 and num 31, this
would be 8GB with 4K pages, so the maximum we'd cover is 8GB - 64K * 4K.

> There are two ways to avoid this bug:
> 1. Just keep 'greater or equal' here.  The ARM64 specification does
> not specify how we flush tlb entries in this case, flush_tlb_mm()
> is also a good choice for such a wide range of pages.

I'll go for this option; I don't think it would make much difference in
practice if we stop at an 8GB - 256MB range.

> 2. Add check in the loop, just like: (this may cause the codes a bit ugly)
> 
> 	num = __TLBI_RANGE_NUM(pages, scale) - 1;
> 
> 	/* scale = 4, num = 0 is equal to scale = 3, num = 31. */
> 	if (scale == 4 && num == 0) {
> 		scale = 3;
> 		num = 31;
> 	}
> 
> 	if (num >= 0) {
> 	...
> 
> Which one do you prefer and how do you want to fix this error? Just
> a fix patch again?

I'll fold the diff below and refresh the patch:

diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index 1eb0588718fb..0300e433ffe6 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -147,9 +147,13 @@ static inline unsigned long get_trans_granule(void)
 #define __TLBI_RANGE_PAGES(num, scale)	(((num) + 1) << (5 * (scale) + 1))
 #define MAX_TLBI_RANGE_PAGES		__TLBI_RANGE_PAGES(31, 3)
 
+/*
+ * Generate 'num' values from -1 to 30 with -1 rejected by the
+ * __flush_tlb_range() loop below.
+ */
 #define TLBI_RANGE_MASK			GENMASK_ULL(4, 0)
 #define __TLBI_RANGE_NUM(range, scale)	\
-	(((range) >> (5 * (scale) + 1)) & TLBI_RANGE_MASK)
+	((((range) >> (5 * (scale) + 1)) & TLBI_RANGE_MASK) - 1)
 
 /*
  *	TLB Invalidation
@@ -285,8 +289,8 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma,
 	pages = (end - start) >> PAGE_SHIFT;
 
 	if ((!cpus_have_const_cap(ARM64_HAS_TLB_RANGE) &&
-	     (end - start) > (MAX_TLBI_OPS * stride)) ||
-	    pages > MAX_TLBI_RANGE_PAGES) {
+	     (end - start) >= (MAX_TLBI_OPS * stride)) ||
+	    pages >= MAX_TLBI_RANGE_PAGES) {
 		flush_tlb_mm(vma->vm_mm);
 		return;
 	}
@@ -306,6 +310,10 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma,
 	 *    Start from scale = 0, flush the corresponding number of pages
 	 *    ((num+1)*2^(5*scale+1) starting from 'addr'), then increase it
 	 *    until no pages left.
+	 *
+	 * Note that certain ranges can be represented by either num = 31 and
+	 * scale or num = 0 and scale + 1. The loop below favours the latter
+	 * since num is limited to 30 by the __TLBI_RANGE_NUM() macro.
 	 */
 	while (pages > 0) {
 		if (!cpus_have_const_cap(ARM64_HAS_TLB_RANGE) ||
@@ -323,7 +331,7 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma,
 			continue;
 		}
 
-		num = __TLBI_RANGE_NUM(pages, scale) - 1;
+		num = __TLBI_RANGE_NUM(pages, scale);
 		if (num >= 0) {
 			addr = __TLBI_VADDR_RANGE(start, asid, scale,
 						  num, tlb_level);

> >> -	/* Convert the stride into units of 4k */
> >> -	stride >>= 12;
> >> +	dsb(ishst);
> >>  
> >> -	start = __TLBI_VADDR(start, asid);
> >> -	end = __TLBI_VADDR(end, asid);
> >> +	/*
> >> +	 * When cpu does not support TLBI RANGE feature, we flush the tlb
> >> +	 * entries one by one at the granularity of 'stride'.
> >> +	 * When cpu supports the TLBI RANGE feature, then:
> >> +	 * 1. If pages is odd, flush the first page through non-RANGE
> >> +	 *    instruction;
> >> +	 * 2. For remaining pages: The minimum range granularity is decided
> >> +	 *    by 'scale', so we can not flush all pages by one instruction
> >> +	 *    in some cases.
> >> +	 *    Here, we start from scale = 0, flush corresponding pages
> >> +	 *    (from 2^(5*scale + 1) to 2^(5*(scale + 1) + 1)), and increase
> >> +	 *    it until no pages left.
> >> +	 */
> >> +	while (pages > 0) {
> > 
> > I did some simple checks on ((end - start) % stride) and never
> > triggered. I had a slight worry that pages could become negative (and
> > we'd loop forever since it's unsigned long) for some mismatched stride
> > and flush size. It doesn't seem like.
> 
> The start and end are round_down/up in the function:
> 
> 	start = round_down(start, stride);
>  	end = round_up(end, stride);
> 
> So the flush size and stride will never mismatch.

Right.

To make sure we don't miss any corner cases, I'll try to throw the
algorithm above at CBMC (a model checker; hopefully next week if I find
some time).
Jon Hunter July 13, 2020, 2:27 p.m. UTC | #4
On 10/07/2020 10:44, Zhenyu Ye wrote:
> Add __TLBI_VADDR_RANGE macro and rewrite __flush_tlb_range().
> 
> When cpu supports TLBI feature, the minimum range granularity is
> decided by 'scale', so we can not flush all pages by one instruction
> in some cases.
> 
> For example, when the pages = 0xe81a, let's start 'scale' from
> maximum, and find right 'num' for each 'scale':
> 
> 1. scale = 3, we can flush no pages because the minimum range is
>    2^(5*3 + 1) = 0x10000.
> 2. scale = 2, the minimum range is 2^(5*2 + 1) = 0x800, we can
>    flush 0xe800 pages this time, the num = 0xe800/0x800 - 1 = 0x1c.
>    Remaining pages is 0x1a;
> 3. scale = 1, the minimum range is 2^(5*1 + 1) = 0x40, no page
>    can be flushed.
> 4. scale = 0, we flush the remaining 0x1a pages, the num =
>    0x1a/0x2 - 1 = 0xd.
> 
> However, in most scenarios, the pages = 1 when flush_tlb_range() is
> called. Start from scale = 3 or other proper value (such as scale =
> ilog2(pages)), will incur extra overhead.
> So increase 'scale' from 0 to maximum, the flush order is exactly
> opposite to the example.
> 
> Signed-off-by: Zhenyu Ye <yezhenyu2@huawei.com>


After this change I am seeing the following build errors ...

/tmp/cckzq3FT.s: Assembler messages:
/tmp/cckzq3FT.s:854: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x7'
/tmp/cckzq3FT.s:870: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x7'
/tmp/cckzq3FT.s:1095: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x7'
/tmp/cckzq3FT.s:1111: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x7'
/tmp/cckzq3FT.s:1964: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x7'
/tmp/cckzq3FT.s:1980: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x7'
/tmp/cckzq3FT.s:2286: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x7'
/tmp/cckzq3FT.s:2302: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x7'
/tmp/cckzq3FT.s:4833: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x6'
/tmp/cckzq3FT.s:4849: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x6'
/tmp/cckzq3FT.s:5090: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x6'
/tmp/cckzq3FT.s:5106: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x6'
/tmp/cckzq3FT.s:874: Error: attempt to move .org backwards
/tmp/cckzq3FT.s:1115: Error: attempt to move .org backwards
/tmp/cckzq3FT.s:1984: Error: attempt to move .org backwards
/tmp/cckzq3FT.s:2306: Error: attempt to move .org backwards
/tmp/cckzq3FT.s:4853: Error: attempt to move .org backwards
/tmp/cckzq3FT.s:5110: Error: attempt to move .org backwards
scripts/Makefile.build:280: recipe for target 'arch/arm64/mm/hugetlbpage.o' failed
make[3]: *** [arch/arm64/mm/hugetlbpage.o] Error 1
scripts/Makefile.build:497: recipe for target 'arch/arm64/mm' failed
make[2]: *** [arch/arm64/mm] Error 2

Cheers
Jon
Zhenyu Ye July 13, 2020, 2:39 p.m. UTC | #5
Hi Jon,

On 2020/7/13 22:27, Jon Hunter wrote:
> After this change I am seeing the following build errors ...
> 
> /tmp/cckzq3FT.s: Assembler messages:
> /tmp/cckzq3FT.s:854: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x7'
> /tmp/cckzq3FT.s:870: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x7'
> /tmp/cckzq3FT.s:1095: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x7'
> /tmp/cckzq3FT.s:1111: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x7'
> /tmp/cckzq3FT.s:1964: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x7'
> /tmp/cckzq3FT.s:1980: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x7'
> /tmp/cckzq3FT.s:2286: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x7'
> /tmp/cckzq3FT.s:2302: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x7'
> /tmp/cckzq3FT.s:4833: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x6'
> /tmp/cckzq3FT.s:4849: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x6'
> /tmp/cckzq3FT.s:5090: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x6'
> /tmp/cckzq3FT.s:5106: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x6'
> /tmp/cckzq3FT.s:874: Error: attempt to move .org backwards
> /tmp/cckzq3FT.s:1115: Error: attempt to move .org backwards
> /tmp/cckzq3FT.s:1984: Error: attempt to move .org backwards
> /tmp/cckzq3FT.s:2306: Error: attempt to move .org backwards
> /tmp/cckzq3FT.s:4853: Error: attempt to move .org backwards
> /tmp/cckzq3FT.s:5110: Error: attempt to move .org backwards
> scripts/Makefile.build:280: recipe for target 'arch/arm64/mm/hugetlbpage.o' failed
> make[3]: *** [arch/arm64/mm/hugetlbpage.o] Error 1
> scripts/Makefile.build:497: recipe for target 'arch/arm64/mm' failed
> make[2]: *** [arch/arm64/mm] Error 2
> 
> Cheers
> Jon
> 

The code must be built with binutils >= 2.30.
Maybe I should add a check on whether binutils supports ARMv8.4-a instructions...

Thanks,
Zhenyu
Jon Hunter July 13, 2020, 2:44 p.m. UTC | #6
On 13/07/2020 15:39, Zhenyu Ye wrote:
> Hi Jon,
> 
> On 2020/7/13 22:27, Jon Hunter wrote:
>> After this change I am seeing the following build errors ...
>>
>> /tmp/cckzq3FT.s: Assembler messages:
>> /tmp/cckzq3FT.s:854: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x7'
>> /tmp/cckzq3FT.s:870: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x7'
>> /tmp/cckzq3FT.s:1095: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x7'
>> /tmp/cckzq3FT.s:1111: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x7'
>> /tmp/cckzq3FT.s:1964: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x7'
>> /tmp/cckzq3FT.s:1980: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x7'
>> /tmp/cckzq3FT.s:2286: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x7'
>> /tmp/cckzq3FT.s:2302: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x7'
>> /tmp/cckzq3FT.s:4833: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x6'
>> /tmp/cckzq3FT.s:4849: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x6'
>> /tmp/cckzq3FT.s:5090: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x6'
>> /tmp/cckzq3FT.s:5106: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x6'
>> /tmp/cckzq3FT.s:874: Error: attempt to move .org backwards
>> /tmp/cckzq3FT.s:1115: Error: attempt to move .org backwards
>> /tmp/cckzq3FT.s:1984: Error: attempt to move .org backwards
>> /tmp/cckzq3FT.s:2306: Error: attempt to move .org backwards
>> /tmp/cckzq3FT.s:4853: Error: attempt to move .org backwards
>> /tmp/cckzq3FT.s:5110: Error: attempt to move .org backwards
>> scripts/Makefile.build:280: recipe for target 'arch/arm64/mm/hugetlbpage.o' failed
>> make[3]: *** [arch/arm64/mm/hugetlbpage.o] Error 1
>> scripts/Makefile.build:497: recipe for target 'arch/arm64/mm' failed
>> make[2]: *** [arch/arm64/mm] Error 2
>>
>> Cheers
>> Jon
>>
> 
> The code must be built with binutils >= 2.30.
> Maybe I should add  a check on whether binutils supports ARMv8.4-a instructions...

Yes I believe so.

Cheers
Jon
Catalin Marinas July 13, 2020, 5:21 p.m. UTC | #7
On Mon, Jul 13, 2020 at 03:44:16PM +0100, Jon Hunter wrote:
> On 13/07/2020 15:39, Zhenyu Ye wrote:
> > On 2020/7/13 22:27, Jon Hunter wrote:
> >> After this change I am seeing the following build errors ...
> >>
> >> /tmp/cckzq3FT.s: Assembler messages:
> >> /tmp/cckzq3FT.s:854: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x7'
> >> /tmp/cckzq3FT.s:870: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x7'
> >> /tmp/cckzq3FT.s:1095: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x7'
> >> /tmp/cckzq3FT.s:1111: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x7'
> >> /tmp/cckzq3FT.s:1964: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x7'
> >> /tmp/cckzq3FT.s:1980: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x7'
> >> /tmp/cckzq3FT.s:2286: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x7'
> >> /tmp/cckzq3FT.s:2302: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x7'
> >> /tmp/cckzq3FT.s:4833: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x6'
> >> /tmp/cckzq3FT.s:4849: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x6'
> >> /tmp/cckzq3FT.s:5090: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x6'
> >> /tmp/cckzq3FT.s:5106: Error: unknown or missing operation name at operand 1 -- `tlbi rvae1is,x6'
> >> /tmp/cckzq3FT.s:874: Error: attempt to move .org backwards
> >> /tmp/cckzq3FT.s:1115: Error: attempt to move .org backwards
> >> /tmp/cckzq3FT.s:1984: Error: attempt to move .org backwards
> >> /tmp/cckzq3FT.s:2306: Error: attempt to move .org backwards
> >> /tmp/cckzq3FT.s:4853: Error: attempt to move .org backwards
> >> /tmp/cckzq3FT.s:5110: Error: attempt to move .org backwards
> >> scripts/Makefile.build:280: recipe for target 'arch/arm64/mm/hugetlbpage.o' failed
> >> make[3]: *** [arch/arm64/mm/hugetlbpage.o] Error 1
> >> scripts/Makefile.build:497: recipe for target 'arch/arm64/mm' failed
> >> make[2]: *** [arch/arm64/mm] Error 2
> > 
> > The code must be built with binutils >= 2.30.
> > Maybe I should add  a check on whether binutils supports ARMv8.4-a instructions...
> 
> Yes I believe so.

The binutils guys in Arm confirmed that assembling "tlbi rvae1is"
without -march=armv8.4-a is a bug. When it gets fixed, checking for the
binutils version is not sufficient without passing -march. I think we
are better off with a manual encoding of the instruction.
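
A sketch of what such a manual encoding could look like (illustrative
only, not the fix that was eventually applied): every TLBI operation is
an alias of the generic SYS instruction, which assemblers accept without
-march=armv8.4-a. The op1/CRn/CRm/op2 values shown for TLBI RVAE1IS need
to be verified against the Arm ARM:

/*
 * Sketch only: TLBI RVAE1IS via the generic SYS alias, so the assembler
 * does not need to know the ARMv8.4 mnemonic.  The encoding used here
 * (op1 = 0, CRn = C8, CRm = C2, op2 = 1) must be double-checked against
 * the Arm ARM before any real use.
 */
static inline void __tlbi_rvae1is_raw(unsigned long arg)
{
	asm volatile("sys #0, c8, c2, #1, %0" : : "r" (arg) : "memory");
}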
Catalin Marinas July 14, 2020, 10:36 a.m. UTC | #8
On Fri, Jul 10, 2020 at 05:44:20PM +0800, Zhenyu Ye wrote:
> +#define __TLBI_RANGE_PAGES(num, scale)	(((num) + 1) << (5 * (scale) + 1))
> +#define MAX_TLBI_RANGE_PAGES		__TLBI_RANGE_PAGES(31, 3)
> +
> +#define TLBI_RANGE_MASK			GENMASK_ULL(4, 0)
> +#define __TLBI_RANGE_NUM(range, scale)	\
> +	(((range) >> (5 * (scale) + 1)) & TLBI_RANGE_MASK)
[...]
> +	int num = 0;
> +	int scale = 0;
[...]
> +			start += __TLBI_RANGE_PAGES(num, scale) << PAGE_SHIFT;
[...]

Since num is an int, __TLBI_RANGE_PAGES is still an int. Shifting it by
PAGE_SHIFT can overflow as the maximum would be 8GB for 4K pages (or
128GB for 64K pages). I think we probably get away with this because of
some implicit type conversion but I'd rather make __TLBI_RANGE_PAGES an
explicit unsigned long:

#define __TLBI_RANGE_PAGES(num, scale)	((unsigned long)((num) + 1) << (5 * (scale) + 1))
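
As a stand-alone illustration of the widths involved (assuming 4K pages;
not part of the kernel code):

#include <stdio.h>

#define PAGE_SHIFT 12
#define __TLBI_RANGE_PAGES(num, scale)	((unsigned long)((num) + 1) << (5 * (scale) + 1))

int main(void)
{
	int num = 31, scale = 3;

	/*
	 * num = 31, scale = 3 gives 0x200000 pages; shifted by PAGE_SHIFT
	 * the result is 0x200000000, which needs 34 bits.  Without the
	 * explicit (unsigned long) cast the shift would be done in 32-bit
	 * int and overflow.
	 */
	printf("%#lx\n", __TLBI_RANGE_PAGES(num, scale) << PAGE_SHIFT);
	return 0;
}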

Without this change, the CBMC check fails (see below for the test). In
the kernel, we don't have this problem as we encode the address via
__TLBI_VADDR_RANGE and it doesn't overflow.

The good part is that CBMC reckons the algorithm is correct ;).

---------------8<------tlbinval.c---------------------------
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Check with:
 *   cbmc --unwind 6 tlbinval.c
 */

#define PAGE_SHIFT	(12)
#define PAGE_SIZE	(1 << PAGE_SHIFT)
#define VA_RANGE	(1UL << 48)

#define __round_mask(x, y) ((__typeof__(x))((y)-1))
#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
#define round_down(x, y) ((x) & ~__round_mask(x, y))

#define __TLBI_RANGE_PAGES(num, scale)	((unsigned long)((num) + 1) << (5 * (scale) + 1))
#define MAX_TLBI_RANGE_PAGES		__TLBI_RANGE_PAGES(31, 3)

#define TLBI_RANGE_MASK			0x1fUL
#define __TLBI_RANGE_NUM(pages, scale)	\
	((((pages) >> (5 * (scale) + 1)) & TLBI_RANGE_MASK) - 1)

static unsigned long inval_start;
static unsigned long inval_end;

static void tlbi(unsigned long start, unsigned long size)
{
	unsigned long end = start + size;

	if (inval_end == 0) {
		inval_start = start;
		inval_end = end;
		return;
	}

	/* contiguous ranges in ascending order only */
	__CPROVER_assert(start == inval_end, "Contiguous TLBI ranges");

	inval_end = end;
}

static void __flush_tlb_range(unsigned long start, unsigned long end,
			      unsigned long stride)
{
	int num = 0;
	int scale = 0;
	unsigned long pages;

	start = round_down(start, stride);
	end = round_up(end, stride);
	pages = (end - start) >> PAGE_SHIFT;

	if (pages >= MAX_TLBI_RANGE_PAGES) {
		tlbi(0, VA_RANGE);
		return;
	}

	while (pages > 0) {
		__CPROVER_assert(scale <= 3, "Scale in range");
		if (pages % 2 == 1) {
			tlbi(start, stride);
			start += stride;
			pages -= stride >> PAGE_SHIFT;
			continue;
		}

		num = __TLBI_RANGE_NUM(pages, scale);
		__CPROVER_assert(num <= 30, "Num in range");
		if (num >= 0) {
			tlbi(start, __TLBI_RANGE_PAGES(num, scale) << PAGE_SHIFT);
			start += __TLBI_RANGE_PAGES(num, scale) << PAGE_SHIFT;
			pages -= __TLBI_RANGE_PAGES(num, scale);
		}
		scale++;
	}
}

static unsigned long nondet_ulong(void);

int main(void)
{
	unsigned long stride = nondet_ulong();
	unsigned long start = round_down(nondet_ulong(), stride);
	unsigned long end = round_up(nondet_ulong(), stride);

	__CPROVER_assume(stride == PAGE_SIZE ||
			 stride == PAGE_SIZE << (PAGE_SHIFT - 3) ||
			 stride == PAGE_SIZE << (2 * (PAGE_SHIFT - 3)));
	__CPROVER_assume(start < end);
	__CPROVER_assume(end <= VA_RANGE);

	__flush_tlb_range(start, end, stride);

	__CPROVER_assert((inval_start == 0 && inval_end == VA_RANGE) ||
			 (inval_start == start && inval_end == end),
			 "Correct invalidation");

	return 0;
}
Zhenyu Ye July 14, 2020, 1:51 p.m. UTC | #9
On 2020/7/14 18:36, Catalin Marinas wrote:
> On Fri, Jul 10, 2020 at 05:44:20PM +0800, Zhenyu Ye wrote:
>> +#define __TLBI_RANGE_PAGES(num, scale)	(((num) + 1) << (5 * (scale) + 1))
>> +#define MAX_TLBI_RANGE_PAGES		__TLBI_RANGE_PAGES(31, 3)
>> +
>> +#define TLBI_RANGE_MASK			GENMASK_ULL(4, 0)
>> +#define __TLBI_RANGE_NUM(range, scale)	\
>> +	(((range) >> (5 * (scale) + 1)) & TLBI_RANGE_MASK)
> [...]
>> +	int num = 0;
>> +	int scale = 0;
> [...]
>> +			start += __TLBI_RANGE_PAGES(num, scale) << PAGE_SHIFT;
> [...]
> 
> Since num is an int, __TLBI_RANGE_PAGES is still an int. Shifting it by
> PAGE_SHIFT can overflow as the maximum would be 8GB for 4K pages (or
> 128GB for 64K pages). I think we probably get away with this because of
> some implicit type conversion but I'd rather make __TLBI_RANGE_PAGES an
> explicit unsigned long:
> 
> #define __TLBI_RANGE_PAGES(num, scale)	((unsigned long)((num) + 1) << (5 * (scale) + 1))
> 

This is valuable and I will update this in the next series, together with
the check for binutils (or encoding the instructions by hand), as soon as
possible.

> Without this change, the CBMC check fails (see below for the test). In
> the kernel, we don't have this problem as we encode the address via
> __TLBI_VADDR_RANGE and it doesn't overflow.
> 
> The good part is that CBMC reckons the algorithm is correct ;).

Thanks for your test!

Zhenyu

Patch

diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index 39aed2efd21b..edfec8139ef8 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -60,6 +60,31 @@ 
 		__ta;						\
 	})
 
+/*
+ * Get translation granule of the system, which is decided by
+ * PAGE_SIZE.  Used by TTL.
+ *  - 4KB	: 1
+ *  - 16KB	: 2
+ *  - 64KB	: 3
+ */
+#define TLBI_TTL_TG_4K		1
+#define TLBI_TTL_TG_16K		2
+#define TLBI_TTL_TG_64K		3
+
+static inline unsigned long get_trans_granule(void)
+{
+	switch (PAGE_SIZE) {
+	case SZ_4K:
+		return TLBI_TTL_TG_4K;
+	case SZ_16K:
+		return TLBI_TTL_TG_16K;
+	case SZ_64K:
+		return TLBI_TTL_TG_64K;
+	default:
+		return 0;
+	}
+}
+
 /*
  * Level-based TLBI operations.
  *
@@ -73,9 +98,6 @@ 
  * in asm/stage2_pgtable.h.
  */
 #define TLBI_TTL_MASK		GENMASK_ULL(47, 44)
-#define TLBI_TTL_TG_4K		1
-#define TLBI_TTL_TG_16K		2
-#define TLBI_TTL_TG_64K		3
 
 #define __tlbi_level(op, addr, level) do {				\
 	u64 arg = addr;							\
@@ -83,19 +105,7 @@ 
 	if (cpus_have_const_cap(ARM64_HAS_ARMv8_4_TTL) &&		\
 	    level) {							\
 		u64 ttl = level & 3;					\
-									\
-		switch (PAGE_SIZE) {					\
-		case SZ_4K:						\
-			ttl |= TLBI_TTL_TG_4K << 2;			\
-			break;						\
-		case SZ_16K:						\
-			ttl |= TLBI_TTL_TG_16K << 2;			\
-			break;						\
-		case SZ_64K:						\
-			ttl |= TLBI_TTL_TG_64K << 2;			\
-			break;						\
-		}							\
-									\
+		ttl |= get_trans_granule() << 2;			\
 		arg &= ~TLBI_TTL_MASK;					\
 		arg |= FIELD_PREP(TLBI_TTL_MASK, ttl);			\
 	}								\
@@ -108,6 +118,39 @@ 
 		__tlbi_level(op, (arg | USER_ASID_FLAG), level);	\
 } while (0)
 
+/*
+ * This macro creates a properly formatted VA operand for the TLBI RANGE.
+ * The value bit assignments are:
+ *
+ * +----------+------+-------+-------+-------+----------------------+
+ * |   ASID   |  TG  | SCALE |  NUM  |  TTL  |        BADDR         |
+ * +-----------------+-------+-------+-------+----------------------+
+ * |63      48|47  46|45   44|43   39|38   37|36                   0|
+ *
+ * The address range is determined by below formula:
+ * [BADDR, BADDR + (NUM + 1) * 2^(5*SCALE + 1) * PAGESIZE)
+ *
+ */
+#define __TLBI_VADDR_RANGE(addr, asid, scale, num, ttl)		\
+	({							\
+		unsigned long __ta = (addr) >> PAGE_SHIFT;	\
+		__ta &= GENMASK_ULL(36, 0);			\
+		__ta |= (unsigned long)(ttl & 3) << 37;		\
+		__ta |= (unsigned long)(num & 31) << 39;	\
+		__ta |= (unsigned long)(scale & 3) << 44;	\
+		__ta |= (get_trans_granule() & 3) << 46;	\
+		__ta |= (unsigned long)(asid) << 48;		\
+		__ta;						\
+	})
+
+/* These macros are used by the TLBI RANGE feature. */
+#define __TLBI_RANGE_PAGES(num, scale)	(((num) + 1) << (5 * (scale) + 1))
+#define MAX_TLBI_RANGE_PAGES		__TLBI_RANGE_PAGES(31, 3)
+
+#define TLBI_RANGE_MASK			GENMASK_ULL(4, 0)
+#define __TLBI_RANGE_NUM(range, scale)	\
+	(((range) >> (5 * (scale) + 1)) & TLBI_RANGE_MASK)
+
 /*
  *	TLB Invalidation
  *	================
@@ -232,32 +275,69 @@  static inline void __flush_tlb_range(struct vm_area_struct *vma,
 				     unsigned long stride, bool last_level,
 				     int tlb_level)
 {
+	int num = 0;
+	int scale = 0;
 	unsigned long asid = ASID(vma->vm_mm);
 	unsigned long addr;
+	unsigned long pages;
 
 	start = round_down(start, stride);
 	end = round_up(end, stride);
+	pages = (end - start) >> PAGE_SHIFT;
 
-	if ((end - start) >= (MAX_TLBI_OPS * stride)) {
+	if ((!cpus_have_const_cap(ARM64_HAS_TLBI_RANGE) &&
+	    (end - start) >= (MAX_TLBI_OPS * stride)) ||
+	    pages >= MAX_TLBI_RANGE_PAGES) {
 		flush_tlb_mm(vma->vm_mm);
 		return;
 	}
 
-	/* Convert the stride into units of 4k */
-	stride >>= 12;
+	dsb(ishst);
 
-	start = __TLBI_VADDR(start, asid);
-	end = __TLBI_VADDR(end, asid);
+	/*
+	 * When cpu does not support TLBI RANGE feature, we flush the tlb
+	 * entries one by one at the granularity of 'stride'.
+	 * When cpu supports the TLBI RANGE feature, then:
+	 * 1. If pages is odd, flush the first page through non-RANGE
+	 *    instruction;
+	 * 2. For remaining pages: The minimum range granularity is decided
+	 *    by 'scale', so we can not flush all pages by one instruction
+	 *    in some cases.
+	 *    Here, we start from scale = 0, flush corresponding pages
+	 *    (from 2^(5*scale + 1) to 2^(5*(scale + 1) + 1)), and increase
+	 *    it until no pages left.
+	 */
+	while (pages > 0) {
+		if (!cpus_have_const_cap(ARM64_HAS_TLBI_RANGE) ||
+		    pages % 2 == 1) {
+			addr = __TLBI_VADDR(start, asid);
+			if (last_level) {
+				__tlbi_level(vale1is, addr, tlb_level);
+				__tlbi_user_level(vale1is, addr, tlb_level);
+			} else {
+				__tlbi_level(vae1is, addr, tlb_level);
+				__tlbi_user_level(vae1is, addr, tlb_level);
+			}
+			start += stride;
+			pages -= stride >> PAGE_SHIFT;
+			continue;
+		}
 
-	dsb(ishst);
-	for (addr = start; addr < end; addr += stride) {
-		if (last_level) {
-			__tlbi_level(vale1is, addr, tlb_level);
-			__tlbi_user_level(vale1is, addr, tlb_level);
-		} else {
-			__tlbi_level(vae1is, addr, tlb_level);
-			__tlbi_user_level(vae1is, addr, tlb_level);
+		num = __TLBI_RANGE_NUM(pages, scale) - 1;
+		if (num >= 0) {
+			addr = __TLBI_VADDR_RANGE(start, asid, scale,
+						  num, tlb_level);
+			if (last_level) {
+				__tlbi(rvale1is, addr);
+				__tlbi_user(rvale1is, addr);
+			} else {
+				__tlbi(rvae1is, addr);
+				__tlbi_user(rvae1is, addr);
+			}
+			start += __TLBI_RANGE_PAGES(num, scale) << PAGE_SHIFT;
+			pages -= __TLBI_RANGE_PAGES(num, scale);
 		}
+		scale++;
 	}
 	dsb(ish);
 }