diff mbox series

[v3,4/4] arm64: mte: Optimize mte_assign_mem_tag_range()

Message ID 20210115120043.50023-5-vincenzo.frascino@arm.com (mailing list archive)
State New, archived
Headers show
Series arm64: ARMv8.5-A: MTE: Add async mode support | expand

Commit Message

Vincenzo Frascino Jan. 15, 2021, noon UTC
mte_assign_mem_tag_range() is called on production KASAN HW hot
paths. It makes sense to optimize it in an attempt to reduce the
overhead.

Optimize mte_assign_mem_tag_range() based on the indications provided at
[1].

[1] https://lore.kernel.org/r/CAAeHK+wCO+J7D1_T89DG+jJrPLk3X9RsGFKxJGd0ZcUFjQT-9Q@mail.gmail.com/

Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Vincenzo Frascino <vincenzo.frascino@arm.com>
---
 arch/arm64/include/asm/mte.h | 26 +++++++++++++++++++++++++-
 arch/arm64/lib/mte.S         | 15 ---------------
 2 files changed, 25 insertions(+), 16 deletions(-)

Comments

Mark Rutland Jan. 15, 2021, 3:45 p.m. UTC | #1
On Fri, Jan 15, 2021 at 12:00:43PM +0000, Vincenzo Frascino wrote:
> mte_assign_mem_tag_range() is called on production KASAN HW hot
> paths. It makes sense to optimize it in an attempt to reduce the
> overhead.
> 
> Optimize mte_assign_mem_tag_range() based on the indications provided at
> [1].

... what exactly is the optimization?

I /think/ you're just trying to have it inlined, but you should mention
that explicitly.

> 
> [1] https://lore.kernel.org/r/CAAeHK+wCO+J7D1_T89DG+jJrPLk3X9RsGFKxJGd0ZcUFjQT-9Q@mail.gmail.com/
> 
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.org>
> Signed-off-by: Vincenzo Frascino <vincenzo.frascino@arm.com>
> ---
>  arch/arm64/include/asm/mte.h | 26 +++++++++++++++++++++++++-
>  arch/arm64/lib/mte.S         | 15 ---------------
>  2 files changed, 25 insertions(+), 16 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h
> index 1a715963d909..9730f2b07b79 100644
> --- a/arch/arm64/include/asm/mte.h
> +++ b/arch/arm64/include/asm/mte.h
> @@ -49,7 +49,31 @@ long get_mte_ctrl(struct task_struct *task);
>  int mte_ptrace_copy_tags(struct task_struct *child, long request,
>  			 unsigned long addr, unsigned long data);
>  
> -void mte_assign_mem_tag_range(void *addr, size_t size);
> +static inline void mte_assign_mem_tag_range(void *addr, size_t size)
> +{
> +	u64 _addr = (u64)addr;
> +	u64 _end = _addr + size;
> +
> +	/*
> +	 * This function must be invoked from an MTE enabled context.
> +	 *
> +	 * Note: The address must be non-NULL and MTE_GRANULE_SIZE aligned and
> +	 * size must be non-zero and MTE_GRANULE_SIZE aligned.
> +	 */
> +	do {
> +		/*
> +		 * 'asm volatile' is required to prevent the compiler to move
> +		 * the statement outside of the loop.
> +		 */
> +		asm volatile(__MTE_PREAMBLE "stg %0, [%0]"
> +			     :
> +			     : "r" (_addr)
> +			     : "memory");
> +
> +		_addr += MTE_GRANULE_SIZE;
> +	} while (_addr < _end);

Is there any chance that this can be used for the last bytes of the
virtual address space? This might need to change to `_addr == _end` if
that is possible, otherwise it'll terminate early in that case.

> +}

What does the code generation look like for this, relative to the
assembly version?

Thanks,
Mark.

> +
>  
>  #else /* CONFIG_ARM64_MTE */
>  
> diff --git a/arch/arm64/lib/mte.S b/arch/arm64/lib/mte.S
> index 9e1a12e10053..a0a650451510 100644
> --- a/arch/arm64/lib/mte.S
> +++ b/arch/arm64/lib/mte.S
> @@ -150,18 +150,3 @@ SYM_FUNC_START(mte_restore_page_tags)
>  	ret
>  SYM_FUNC_END(mte_restore_page_tags)
>  
> -/*
> - * Assign allocation tags for a region of memory based on the pointer tag
> - *   x0 - source pointer
> - *   x1 - size
> - *
> - * Note: The address must be non-NULL and MTE_GRANULE_SIZE aligned and
> - * size must be non-zero and MTE_GRANULE_SIZE aligned.
> - */
> -SYM_FUNC_START(mte_assign_mem_tag_range)
> -1:	stg	x0, [x0]
> -	add	x0, x0, #MTE_GRANULE_SIZE
> -	subs	x1, x1, #MTE_GRANULE_SIZE
> -	b.gt	1b
> -	ret
> -SYM_FUNC_END(mte_assign_mem_tag_range)
> -- 
> 2.30.0
>
Vincenzo Frascino Jan. 16, 2021, 2:22 p.m. UTC | #2
Hi Mark,

On 1/15/21 3:45 PM, Mark Rutland wrote:
> On Fri, Jan 15, 2021 at 12:00:43PM +0000, Vincenzo Frascino wrote:
>> mte_assign_mem_tag_range() is called on production KASAN HW hot
>> paths. It makes sense to optimize it in an attempt to reduce the
>> overhead.
>>
>> Optimize mte_assign_mem_tag_range() based on the indications provided at
>> [1].
> 
> ... what exactly is the optimization?
> 
> I /think/ you're just trying to have it inlined, but you should mention
> that explicitly.
> 

Good point, I will change it in the next version. I used "Optimize" as a
continuation of the topic in the previous thread but you are right it is not
immediately obvious.

>>
>> [1] https://lore.kernel.org/r/CAAeHK+wCO+J7D1_T89DG+jJrPLk3X9RsGFKxJGd0ZcUFjQT-9Q@mail.gmail.com/
>>
>> Cc: Catalin Marinas <catalin.marinas@arm.com>
>> Cc: Will Deacon <will@kernel.org>
>> Signed-off-by: Vincenzo Frascino <vincenzo.frascino@arm.com>
>> ---
>>  arch/arm64/include/asm/mte.h | 26 +++++++++++++++++++++++++-
>>  arch/arm64/lib/mte.S         | 15 ---------------
>>  2 files changed, 25 insertions(+), 16 deletions(-)
>>
>> diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h
>> index 1a715963d909..9730f2b07b79 100644
>> --- a/arch/arm64/include/asm/mte.h
>> +++ b/arch/arm64/include/asm/mte.h
>> @@ -49,7 +49,31 @@ long get_mte_ctrl(struct task_struct *task);
>>  int mte_ptrace_copy_tags(struct task_struct *child, long request,
>>  			 unsigned long addr, unsigned long data);
>>  
>> -void mte_assign_mem_tag_range(void *addr, size_t size);
>> +static inline void mte_assign_mem_tag_range(void *addr, size_t size)
>> +{
>> +	u64 _addr = (u64)addr;
>> +	u64 _end = _addr + size;
>> +
>> +	/*
>> +	 * This function must be invoked from an MTE enabled context.
>> +	 *
>> +	 * Note: The address must be non-NULL and MTE_GRANULE_SIZE aligned and
>> +	 * size must be non-zero and MTE_GRANULE_SIZE aligned.
>> +	 */
>> +	do {
>> +		/*
>> +		 * 'asm volatile' is required to prevent the compiler to move
>> +		 * the statement outside of the loop.
>> +		 */
>> +		asm volatile(__MTE_PREAMBLE "stg %0, [%0]"
>> +			     :
>> +			     : "r" (_addr)
>> +			     : "memory");
>> +
>> +		_addr += MTE_GRANULE_SIZE;
>> +	} while (_addr < _end);
> 
> Is there any chance that this can be used for the last bytes of the
> virtual address space? This might need to change to `_addr == _end` if
> that is possible, otherwise it'll terminate early in that case.
> 

Theoretically it is a possibility. I will change the condition and add a note
for that.

>> +}
> 
> What does the code generation look like for this, relative to the
> assembly version?
> 

The assembly looks like this:

 390:   8b000022        add     x2, x1, x0
 394:   aa0003e1        mov     x1, x0
 398:   d9200821        stg     x1, [x1]
 39c:   91004021        add     x1, x1, #0x10
 3a0:   eb01005f        cmp     x2, x1
 3a4:   54ffffa8        b.hi    398 <mte_set_mem_tag_range+0x48>

You can see the handcrafted one below.

> Thanks,
> Mark.
> 
>> +
>>  
>>  #else /* CONFIG_ARM64_MTE */
>>  
>> diff --git a/arch/arm64/lib/mte.S b/arch/arm64/lib/mte.S
>> index 9e1a12e10053..a0a650451510 100644
>> --- a/arch/arm64/lib/mte.S
>> +++ b/arch/arm64/lib/mte.S
>> @@ -150,18 +150,3 @@ SYM_FUNC_START(mte_restore_page_tags)
>>  	ret
>>  SYM_FUNC_END(mte_restore_page_tags)
>>  
>> -/*
>> - * Assign allocation tags for a region of memory based on the pointer tag
>> - *   x0 - source pointer
>> - *   x1 - size
>> - *
>> - * Note: The address must be non-NULL and MTE_GRANULE_SIZE aligned and
>> - * size must be non-zero and MTE_GRANULE_SIZE aligned.
>> - */
>> -SYM_FUNC_START(mte_assign_mem_tag_range)
>> -1:	stg	x0, [x0]
>> -	add	x0, x0, #MTE_GRANULE_SIZE
>> -	subs	x1, x1, #MTE_GRANULE_SIZE
>> -	b.gt	1b
>> -	ret
>> -SYM_FUNC_END(mte_assign_mem_tag_range)
>> -- 
>> 2.30.0
>>
Vincenzo Frascino Jan. 17, 2021, 12:27 p.m. UTC | #3
Hi Mark,

On 1/16/21 2:22 PM, Vincenzo Frascino wrote:
>> Is there any chance that this can be used for the last bytes of the
>> virtual address space? This might need to change to `_addr == _end` if
>> that is possible, otherwise it'll terminate early in that case.
>>
> Theoretically it is a possibility. I will change the condition and add a note
> for that.
> 

I was thinking to the end of the virtual address space scenario and I forgot
that if I use a condition like `_addr == _end` the tagging operation overflows
to the first granule of the next allocation. This disrupts tagging accesses for
that memory area hence I think that `_addr < _end` is the way to go.
Mark Rutland Jan. 18, 2021, 10:41 a.m. UTC | #4
On Sun, Jan 17, 2021 at 12:27:08PM +0000, Vincenzo Frascino wrote:
> Hi Mark,
> 
> On 1/16/21 2:22 PM, Vincenzo Frascino wrote:
> >> Is there any chance that this can be used for the last bytes of the
> >> virtual address space? This might need to change to `_addr == _end` if
> >> that is possible, otherwise it'll terminate early in that case.
> >>
> > Theoretically it is a possibility. I will change the condition and add a note
> > for that.
> > 
> 
> I was thinking to the end of the virtual address space scenario and I forgot
> that if I use a condition like `_addr == _end` the tagging operation overflows
> to the first granule of the next allocation. This disrupts tagging accesses for
> that memory area hence I think that `_addr < _end` is the way to go.

I think it implies `_addr != _end` is necessary. Otherwise, if `addr` is
PAGE_SIZE from the end of memory, and `size` is PAGE_SIZE, `_end` will
be 0, so using `_addr < _end` will mean the loop will terminate after a
single MTE tag granule rather than the whole page.

Generally, for some addr/increment/size combination (where all are
suitably aligned), you need a pattern like:

| do {
|       thing(addr);
|       addr += increment;
| } while (addr != end);

... or:

| for (addr = start; addr != end; addr += increment) {
|       thing(addr);
| }

... to correctly handle working at the very end of the VA space.

We do similar for page tables, e.g. when we use pmd_addr_end().

Thanks,
Mark.
Vincenzo Frascino Jan. 18, 2021, 11 a.m. UTC | #5
On 1/18/21 10:41 AM, Mark Rutland wrote:
> On Sun, Jan 17, 2021 at 12:27:08PM +0000, Vincenzo Frascino wrote:
>> Hi Mark,
>>
>> On 1/16/21 2:22 PM, Vincenzo Frascino wrote:
>>>> Is there any chance that this can be used for the last bytes of the
>>>> virtual address space? This might need to change to `_addr == _end` if
>>>> that is possible, otherwise it'll terminate early in that case.
>>>>
>>> Theoretically it is a possibility. I will change the condition and add a note
>>> for that.
>>>
>>
>> I was thinking to the end of the virtual address space scenario and I forgot
>> that if I use a condition like `_addr == _end` the tagging operation overflows
>> to the first granule of the next allocation. This disrupts tagging accesses for
>> that memory area hence I think that `_addr < _end` is the way to go.
> 
> I think it implies `_addr != _end` is necessary. Otherwise, if `addr` is
> PAGE_SIZE from the end of memory, and `size` is PAGE_SIZE, `_end` will
> be 0, so using `_addr < _end` will mean the loop will terminate after a
> single MTE tag granule rather than the whole page.
> 
> Generally, for some addr/increment/size combination (where all are
> suitably aligned), you need a pattern like:
> 
> | do {
> |       thing(addr);
> |       addr += increment;
> | } while (addr != end);
> 
> ... or:
> 
> | for (addr = start; addr != end; addr += increment) {
> |       thing(addr);
> | }
> 
> ... to correctly handle working at the very end of the VA space.
> 
> We do similar for page tables, e.g. when we use pmd_addr_end().
>

Good point! I agree it wraps around otherwise. I will change it accordingly.

Thanks!

> Thanks,
> Mark.
>
diff mbox series

Patch

diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h
index 1a715963d909..9730f2b07b79 100644
--- a/arch/arm64/include/asm/mte.h
+++ b/arch/arm64/include/asm/mte.h
@@ -49,7 +49,31 @@  long get_mte_ctrl(struct task_struct *task);
 int mte_ptrace_copy_tags(struct task_struct *child, long request,
 			 unsigned long addr, unsigned long data);
 
-void mte_assign_mem_tag_range(void *addr, size_t size);
+static inline void mte_assign_mem_tag_range(void *addr, size_t size)
+{
+	u64 _addr = (u64)addr;
+	u64 _end = _addr + size;
+
+	/*
+	 * This function must be invoked from an MTE enabled context.
+	 *
+	 * Note: The address must be non-NULL and MTE_GRANULE_SIZE aligned and
+	 * size must be non-zero and MTE_GRANULE_SIZE aligned.
+	 */
+	do {
+		/*
+		 * 'asm volatile' is required to prevent the compiler to move
+		 * the statement outside of the loop.
+		 */
+		asm volatile(__MTE_PREAMBLE "stg %0, [%0]"
+			     :
+			     : "r" (_addr)
+			     : "memory");
+
+		_addr += MTE_GRANULE_SIZE;
+	} while (_addr < _end);
+}
+
 
 #else /* CONFIG_ARM64_MTE */
 
diff --git a/arch/arm64/lib/mte.S b/arch/arm64/lib/mte.S
index 9e1a12e10053..a0a650451510 100644
--- a/arch/arm64/lib/mte.S
+++ b/arch/arm64/lib/mte.S
@@ -150,18 +150,3 @@  SYM_FUNC_START(mte_restore_page_tags)
 	ret
 SYM_FUNC_END(mte_restore_page_tags)
 
-/*
- * Assign allocation tags for a region of memory based on the pointer tag
- *   x0 - source pointer
- *   x1 - size
- *
- * Note: The address must be non-NULL and MTE_GRANULE_SIZE aligned and
- * size must be non-zero and MTE_GRANULE_SIZE aligned.
- */
-SYM_FUNC_START(mte_assign_mem_tag_range)
-1:	stg	x0, [x0]
-	add	x0, x0, #MTE_GRANULE_SIZE
-	subs	x1, x1, #MTE_GRANULE_SIZE
-	b.gt	1b
-	ret
-SYM_FUNC_END(mte_assign_mem_tag_range)