
arm64: optimize flush tlb kernel range

Message ID 20240914141441.2340469-1-wangkefeng.wang@huawei.com (mailing list archive)
State New, archived
Series arm64: optimize flush tlb kernel range

Commit Message

Kefeng Wang Sept. 14, 2024, 2:14 p.m. UTC
Currently the kernel TLB is flushed page by page if the target
VA range is less than MAX_DVM_OPS * PAGE_SIZE; otherwise we
brutally issue a TLBI ALL.

But we can optimize this when the CPU supports TLB range
operations: convert flush_tlb_kernel_range() to use
__flush_tlb_range_nosync(), like the other TLB range flushes,
to improve performance.

Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
---
 arch/arm64/include/asm/tlbflush.h | 43 +++++++++++++------------------
 arch/arm64/mm/contpte.c           |  3 ++-
 2 files changed, 20 insertions(+), 26 deletions(-)
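
For a sense of the payoff: per the Arm ARM, a single range-based TLBI
(FEAT_TLBIRANGE) invalidates (num + 1) << (5 * scale + 1) pages, so a
2MB kernel range of 4K pages can be covered by one broadcast TLBI
RVAALE1IS instead of 512 individual TLBI VAALE1IS operations. The
user-space sketch below (an illustration only, not kernel code; the
real decomposition lives in __flush_tlb_range_op()) mirrors the greedy
scale/num idea, assuming the range does not exceed
MAX_TLBI_RANGE_PAGES:

#include <stdio.h>

/*
 * How many broadcast TLBIs cover 'pages' 4K pages?  One range op per
 * scale is enough as long as pages <= 32 << 16 (MAX_TLBI_RANGE_PAGES
 * for a 4K granule).
 */
static unsigned long range_tlbis_needed(unsigned long pages)
{
	unsigned long ops = 0;
	int scale;

	for (scale = 3; scale >= 0; scale--) {
		/* one range op at this scale covers up to 32 spans */
		unsigned long span = 1UL << (5 * scale + 1);
		unsigned long num = pages / span;

		if (num) {
			pages -= num * span;
			ops++;
		}
	}
	return ops + pages;	/* at most one leftover page remains */
}

int main(void)
{
	unsigned long pages = 512;	/* 2MB / 4K */

	printf("per-page TLBIs: %lu, with range TLBIs: %lu\n",
	       pages, range_tlbis_needed(pages));
	return 0;
}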

Comments

Catalin Marinas Sept. 16, 2024, 3:08 p.m. UTC | #1
On Sat, Sep 14, 2024 at 10:14:41PM +0800, Kefeng Wang wrote:
> Currently the kernel TLB is flushed page by page if the target
> VA range is less than MAX_DVM_OPS * PAGE_SIZE; otherwise we
> brutally issue a TLBI ALL.
> 
> But we can optimize this when the CPU supports TLB range
> operations: convert flush_tlb_kernel_range() to use
> __flush_tlb_range_nosync(), like the other TLB range flushes,
> to improve performance.
> 
> Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
> Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>

Nit: you need a Co-developed-by tag here for Yicong.

> ---
>  arch/arm64/include/asm/tlbflush.h | 43 +++++++++++++------------------
>  arch/arm64/mm/contpte.c           |  3 ++-
>  2 files changed, 20 insertions(+), 26 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
> index 95fbc8c05607..8537fad83999 100644
> --- a/arch/arm64/include/asm/tlbflush.h
> +++ b/arch/arm64/include/asm/tlbflush.h
> @@ -431,12 +431,12 @@ do {									\
>  #define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \
>  	__flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false, kvm_lpa2_is_enabled());
>  
> -static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
> -				     unsigned long start, unsigned long end,
> -				     unsigned long stride, bool last_level,
> -				     int tlb_level)
> +static __always_inline void __flush_tlb_range_nosync(struct mm_struct *mm,
> +		unsigned long asid, unsigned long start, unsigned long end,
> +		unsigned long stride, bool last_level, int tlb_level)
>  {
> -	unsigned long asid, pages;
> +	bool tlbi_user = !!asid;
> +	unsigned long pages;
>  
>  	start = round_down(start, stride);
>  	end = round_up(end, stride);
> @@ -451,21 +451,24 @@ static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
>  	if ((!system_supports_tlb_range() &&
>  	     (end - start) >= (MAX_DVM_OPS * stride)) ||
>  	    pages > MAX_TLBI_RANGE_PAGES) {
> -		flush_tlb_mm(vma->vm_mm);
> +		if (asid)
> +			flush_tlb_mm(mm);
> +		else
> +			flush_tlb_all();
>  		return;
>  	}
>  
>  	dsb(ishst);
> -	asid = ASID(vma->vm_mm);
>  
>  	if (last_level)
>  		__flush_tlb_range_op(vale1is, start, pages, stride, asid,
> -				     tlb_level, true, lpa2_is_enabled());
> +				     tlb_level, tlbi_user, lpa2_is_enabled());
>  	else
> -		__flush_tlb_range_op(vae1is, start, pages, stride, asid,
> -				     tlb_level, true, lpa2_is_enabled());
> +		__flush_tlb_range_op(vae1is, start, pages, stride, tlbi_user,
> +				     tlb_level, tlbi_user, lpa2_is_enabled());
>  
> -	mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end);
> +	if (asid)
> +		mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
>  }

This is not correct. The flush_tlb_kernel_range() uses the TLBI VAALE1IS
operation while the above would use VALE1IS for the kernel mapping, with
ASID 0.

I also don't like overriding the meaning of asid here to guess whether
it's a user or kernel mapping; it just complicates this function
unnecessarily.

>  static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end)
>  {
> -	unsigned long addr;
>  
> -	if ((end - start) > (MAX_DVM_OPS * PAGE_SIZE)) {
> -		flush_tlb_all();
> -		return;
> -	}
> -
> -	start = __TLBI_VADDR(start, 0);
> -	end = __TLBI_VADDR(end, 0);
> -
> -	dsb(ishst);
> -	for (addr = start; addr < end; addr += 1 << (PAGE_SHIFT - 12))
> -		__tlbi(vaale1is, addr);
> +	__flush_tlb_range_nosync(&init_mm, 0, start, end, PAGE_SIZE, false,
> +				 TLBI_TTL_UNKNOWN);
>  	dsb(ish);
>  	isb();
>  }

Just call __flush_tlb_range_op(vaale1is, ...) directly here.
Kefeng Wang Sept. 18, 2024, 12:57 a.m. UTC | #2
On 2024/9/16 23:08, Catalin Marinas wrote:
> On Sat, Sep 14, 2024 at 10:14:41PM +0800, Kefeng Wang wrote:
>> Currently the kernel TLB is flushed page by page if the target
>> VA range is less than MAX_DVM_OPS * PAGE_SIZE; otherwise we
>> brutally issue a TLBI ALL.
>>
>> But we can optimize this when the CPU supports TLB range
>> operations: convert flush_tlb_kernel_range() to use
>> __flush_tlb_range_nosync(), like the other TLB range flushes,
>> to improve performance.
>>
>> Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
>> Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
> 
> Nit: you need a Co-developed-by tag here for Yicong.

OK.

> 
>> ---
>>   arch/arm64/include/asm/tlbflush.h | 43 +++++++++++++------------------
>>   arch/arm64/mm/contpte.c           |  3 ++-
>>   2 files changed, 20 insertions(+), 26 deletions(-)
>>
>> diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
>> index 95fbc8c05607..8537fad83999 100644
>> --- a/arch/arm64/include/asm/tlbflush.h
>> +++ b/arch/arm64/include/asm/tlbflush.h
>> @@ -431,12 +431,12 @@ do {									\
>>   #define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \
>>   	__flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false, kvm_lpa2_is_enabled());
>>   
>> -static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
>> -				     unsigned long start, unsigned long end,
>> -				     unsigned long stride, bool last_level,
>> -				     int tlb_level)
>> +static __always_inline void __flush_tlb_range_nosync(struct mm_struct *mm,
>> +		unsigned long asid, unsigned long start, unsigned long end,
>> +		unsigned long stride, bool last_level, int tlb_level)
>>   {
>> -	unsigned long asid, pages;
>> +	bool tlbi_user = !!asid;
>> +	unsigned long pages;
>>   
>>   	start = round_down(start, stride);
>>   	end = round_up(end, stride);
>> @@ -451,21 +451,24 @@ static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
>>   	if ((!system_supports_tlb_range() &&
>>   	     (end - start) >= (MAX_DVM_OPS * stride)) ||
>>   	    pages > MAX_TLBI_RANGE_PAGES) {
>> -		flush_tlb_mm(vma->vm_mm);
>> +		if (asid)
>> +			flush_tlb_mm(mm);
>> +		else
>> +			flush_tlb_all();
>>   		return;
>>   	}
>>   
>>   	dsb(ishst);
>> -	asid = ASID(vma->vm_mm);
>>   
>>   	if (last_level)
>>   		__flush_tlb_range_op(vale1is, start, pages, stride, asid,
>> -				     tlb_level, true, lpa2_is_enabled());
>> +				     tlb_level, tlbi_user, lpa2_is_enabled());
>>   	else
>> -		__flush_tlb_range_op(vae1is, start, pages, stride, asid,
>> -				     tlb_level, true, lpa2_is_enabled());
>> +		__flush_tlb_range_op(vae1is, start, pages, stride, tlbi_user,
>> +				     tlb_level, tlbi_user, lpa2_is_enabled());
>>   
>> -	mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end);
>> +	if (asid)
>> +		mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
>>   }
> 
> This is not correct. The flush_tlb_kernel_range() uses the TLBI VAALE1IS
> operation while the above would use VALE1IS for the kernel mapping, with
> ASID 0.

Right, I missed that when refactoring the code.
> 
> I also don't like overriding the meaning of asid here to guess whether
> it's a user or kernel mapping; it just complicates this function
> unnecessarily.
> 
>>   static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end)
>>   {
>> -	unsigned long addr;
>>   
>> -	if ((end - start) > (MAX_DVM_OPS * PAGE_SIZE)) {
>> -		flush_tlb_all();
>> -		return;
>> -	}
>> -
>> -	start = __TLBI_VADDR(start, 0);
>> -	end = __TLBI_VADDR(end, 0);
>> -
>> -	dsb(ishst);
>> -	for (addr = start; addr < end; addr += 1 << (PAGE_SHIFT - 12))
>> -		__tlbi(vaale1is, addr);
>> +	__flush_tlb_range_nosync(&init_mm, 0, start, end, PAGE_SIZE, false,
>> +				 TLBI_TTL_UNKNOWN);
>>   	dsb(ish);
>>   	isb();
>>   }
> 
> Just call __flush_tlb_range_op(vaale1is, ...) directly here.

Our first internal version used __flush_tlb_range_op(), but I wanted to
avoid some duplicate code between flush_tlb_kernel_range() and
__flush_tlb_range_nosync(). I will switch back to using
__flush_tlb_range_op(), thanks.
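
For reference, a rough sketch of the direction agreed above, i.e.
flush_tlb_kernel_range() calling __flush_tlb_range_op(vaale1is, ...)
directly while keeping the large-range fallback and the barriers from
the current code (a sketch only, not necessarily what the next version
of the patch will look like):

static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
	const unsigned long stride = PAGE_SIZE;
	unsigned long pages;

	start = round_down(start, stride);
	end = round_up(end, stride);
	pages = (end - start) >> PAGE_SHIFT;

	/* Too many ops without FEAT_TLBIRANGE, or range too large: flush all. */
	if ((!system_supports_tlb_range() &&
	     (end - start) >= (MAX_DVM_OPS * stride)) ||
	    pages > MAX_TLBI_RANGE_PAGES) {
		flush_tlb_all();
		return;
	}

	dsb(ishst);
	/* Kernel mappings are global (no ASID), so use the all-ASID op. */
	__flush_tlb_range_op(vaale1is, start, pages, stride, 0,
			     TLBI_TTL_UNKNOWN, false, lpa2_is_enabled());
	dsb(ish);
	isb();
}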

Patch

diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index 95fbc8c05607..8537fad83999 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -431,12 +431,12 @@  do {									\
 #define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \
 	__flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false, kvm_lpa2_is_enabled());
 
-static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
-				     unsigned long start, unsigned long end,
-				     unsigned long stride, bool last_level,
-				     int tlb_level)
+static __always_inline void __flush_tlb_range_nosync(struct mm_struct *mm,
+		unsigned long asid, unsigned long start, unsigned long end,
+		unsigned long stride, bool last_level, int tlb_level)
 {
-	unsigned long asid, pages;
+	bool tlbi_user = !!asid;
+	unsigned long pages;
 
 	start = round_down(start, stride);
 	end = round_up(end, stride);
@@ -451,21 +451,24 @@  static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
 	if ((!system_supports_tlb_range() &&
 	     (end - start) >= (MAX_DVM_OPS * stride)) ||
 	    pages > MAX_TLBI_RANGE_PAGES) {
-		flush_tlb_mm(vma->vm_mm);
+		if (asid)
+			flush_tlb_mm(mm);
+		else
+			flush_tlb_all();
 		return;
 	}
 
 	dsb(ishst);
-	asid = ASID(vma->vm_mm);
 
 	if (last_level)
 		__flush_tlb_range_op(vale1is, start, pages, stride, asid,
-				     tlb_level, true, lpa2_is_enabled());
+				     tlb_level, tlbi_user, lpa2_is_enabled());
 	else
-		__flush_tlb_range_op(vae1is, start, pages, stride, asid,
-				     tlb_level, true, lpa2_is_enabled());
+		__flush_tlb_range_op(vae1is, start, pages, stride, tlbi_user,
+				     tlb_level, tlbi_user, lpa2_is_enabled());
 
-	mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end);
+	if (asid)
+		mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
 }
 
 static inline void __flush_tlb_range(struct vm_area_struct *vma,
@@ -473,8 +476,8 @@  static inline void __flush_tlb_range(struct vm_area_struct *vma,
 				     unsigned long stride, bool last_level,
 				     int tlb_level)
 {
-	__flush_tlb_range_nosync(vma, start, end, stride,
-				 last_level, tlb_level);
+	__flush_tlb_range_nosync(vma->vm_mm, ASID(vma->vm_mm), start, end,
+				 stride, last_level, tlb_level);
 	dsb(ish);
 }
 
@@ -492,19 +495,9 @@  static inline void flush_tlb_range(struct vm_area_struct *vma,
 
 static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 {
-	unsigned long addr;
 
-	if ((end - start) > (MAX_DVM_OPS * PAGE_SIZE)) {
-		flush_tlb_all();
-		return;
-	}
-
-	start = __TLBI_VADDR(start, 0);
-	end = __TLBI_VADDR(end, 0);
-
-	dsb(ishst);
-	for (addr = start; addr < end; addr += 1 << (PAGE_SHIFT - 12))
-		__tlbi(vaale1is, addr);
+	__flush_tlb_range_nosync(&init_mm, 0, start, end, PAGE_SIZE, false,
+				 TLBI_TTL_UNKNOWN);
 	dsb(ish);
 	isb();
 }
diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c
index 55107d27d3f8..7f93f19dc50b 100644
--- a/arch/arm64/mm/contpte.c
+++ b/arch/arm64/mm/contpte.c
@@ -335,7 +335,8 @@  int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
 		 * eliding the trailing DSB applies here.
 		 */
 		addr = ALIGN_DOWN(addr, CONT_PTE_SIZE);
-		__flush_tlb_range_nosync(vma, addr, addr + CONT_PTE_SIZE,
+		__flush_tlb_range_nosync(vma->vm_mm, ASID(vma->vm_mm),
+					 addr, addr + CONT_PTE_SIZE,
 					 PAGE_SIZE, true, 3);
 	}