
[v1,6/6] arm64: tlb: Set the TTL field in flush_tlb_range

Message ID 20200403090048.938-7-yezhenyu2@huawei.com
State New, archived
Series arm64: tlb: add support for TTL feature

Commit Message

Zhenyu Ye April 3, 2020, 9 a.m. UTC
This patch uses the cleared_* fields in struct mmu_gather to set the
TTL field in flush_tlb_range().

Signed-off-by: Zhenyu Ye <yezhenyu2@huawei.com>
---
 arch/arm64/include/asm/tlb.h      | 26 +++++++++++++++++++++++++-
 arch/arm64/include/asm/tlbflush.h | 14 ++++++++------
 2 files changed, 33 insertions(+), 7 deletions(-)

Comments

Peter Zijlstra April 20, 2020, 12:10 p.m. UTC | #1
On Fri, Apr 03, 2020 at 05:00:48PM +0800, Zhenyu Ye wrote:
> This patch uses the cleared_* fields in struct mmu_gather to set the
> TTL field in flush_tlb_range().
> 
> Signed-off-by: Zhenyu Ye <yezhenyu2@huawei.com>
> ---
>  arch/arm64/include/asm/tlb.h      | 26 +++++++++++++++++++++++++-
>  arch/arm64/include/asm/tlbflush.h | 14 ++++++++------
>  2 files changed, 33 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h
> index b76df828e6b7..d5ab72eccff4 100644
> --- a/arch/arm64/include/asm/tlb.h
> +++ b/arch/arm64/include/asm/tlb.h
> @@ -21,11 +21,34 @@ static void tlb_flush(struct mmu_gather *tlb);
>  
>  #include <asm-generic/tlb.h>
>  
> +/*
> + * Get the TLBI level for arm64.  The default value is 0 if more
> + * than one of the cleared_* fields is set, or if none is set.
> + * arm64 doesn't support p4ds yet.
> + */
> +static inline int tlb_get_level(struct mmu_gather *tlb)
> +{
> +	int sum = tlb->cleared_ptes + tlb->cleared_pmds +
> +		  tlb->cleared_puds + tlb->cleared_p4ds;
> +
> +	if (sum != 1)
> +		return 0;
> +	else if (tlb->cleared_ptes)
> +		return 3;
> +	else if (tlb->cleared_pmds)
> +		return 2;
> +	else if (tlb->cleared_puds)
> +		return 1;
> +
> +	return 0;
> +}

That's some mighty wonky code. Please look at the generated asm.
Steven Rostedt April 21, 2020, 12:06 a.m. UTC | #2
On Mon, 20 Apr 2020 14:10:55 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> On Fri, Apr 03, 2020 at 05:00:48PM +0800, Zhenyu Ye wrote:
> > This patch uses the cleared_* fields in struct mmu_gather to set the
> > TTL field in flush_tlb_range().
> > 
> > Signed-off-by: Zhenyu Ye <yezhenyu2@huawei.com>
> > ---
> >  arch/arm64/include/asm/tlb.h      | 26 +++++++++++++++++++++++++-
> >  arch/arm64/include/asm/tlbflush.h | 14 ++++++++------
> >  2 files changed, 33 insertions(+), 7 deletions(-)
> > 
> > diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h
> > index b76df828e6b7..d5ab72eccff4 100644
> > --- a/arch/arm64/include/asm/tlb.h
> > +++ b/arch/arm64/include/asm/tlb.h
> > @@ -21,11 +21,34 @@ static void tlb_flush(struct mmu_gather *tlb);
> >  
> >  #include <asm-generic/tlb.h>
> >  
> > +/*
> > + * Get the TLBI level for arm64.  The default value is 0 if more
> > + * than one of the cleared_* fields is set, or if none is set.
> > + * arm64 doesn't support p4ds yet.
> > + */
> > +static inline int tlb_get_level(struct mmu_gather *tlb)
> > +{
> > +	int sum = tlb->cleared_ptes + tlb->cleared_pmds +
> > +		  tlb->cleared_puds + tlb->cleared_p4ds;
> > +
> > +	if (sum != 1)
> > +		return 0;
> > +	else if (tlb->cleared_ptes)
> > +		return 3;
> > +	else if (tlb->cleared_pmds)
> > +		return 2;
> > +	else if (tlb->cleared_puds)
> > +		return 1;
> > +
> > +	return 0;
> > +}  
> 
> That's some mighty wonky code. Please look at the generated asm.

Without even looking at the generated asm, if a condition returns,
there's no reason to add an else for that condition.

	if (x)
		return 1;
	else if (y)
		return 2;
	else
		return 3;

Is exactly the same as:

	if (x)
		return 1;
	if (y)
		return 2;
	return 3;

-- Steve
Peter Zijlstra April 21, 2020, 8:30 a.m. UTC | #3
On Mon, Apr 20, 2020 at 08:06:16PM -0400, Steven Rostedt wrote:
> Peter Zijlstra <peterz@infradead.org> wrote:
> > On Fri, Apr 03, 2020 at 05:00:48PM +0800, Zhenyu Ye wrote:

> > > +static inline int tlb_get_level(struct mmu_gather *tlb)
> > > +{
> > > +	int sum = tlb->cleared_ptes + tlb->cleared_pmds +
> > > +		  tlb->cleared_puds + tlb->cleared_p4ds;
> > > +
> > > +	if (sum != 1)
> > > +		return 0;
> > > +	else if (tlb->cleared_ptes)
> > > +		return 3;
> > > +	else if (tlb->cleared_pmds)
> > > +		return 2;
> > > +	else if (tlb->cleared_puds)
> > > +		return 1;
> > > +
> > > +	return 0;
> > > +}  
> > 
> > That's some mighty wonky code. Please look at the generated asm.
> 
> Without even looking at the generated asm, if a condition returns,
> there's no reason to add an else for that condition.

Not really the point; he wants to guarantee he only returns >0 when
there's a single bit set. But the thing is, cleared_* is a bitfield, and
I'm afraid that the above will result in some terrible code-gen.

Maybe something like:

	if (tlb->cleared_ptes && !(tlb->cleared_pmds ||
				   tlb->cleared_puds ||
				   tlb->cleared_p4ds))
		return 3;

	if (tlb->cleared_pmds && !(tlb->cleared_ptes ||
				   tlb->cleared_puds ||
				   tlb->cleared_p4ds))
		return 2;

	if (tlb->cleared_puds && !(tlb->cleared_ptes ||
				   tlb->cleared_pmds ||
				   tlb->cleared_p4ds))
		return 1;

	return 0;

Which I admit is far too much typing, but I suspect it generates far
saner code (just a few masks and branches).

But maybe the compiler surprises us, what do I know.
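
One way to get that "few masks and branches" code-gen is to collapse the
four single-bit fields into one mask and switch on it. A minimal sketch,
assuming the asm-generic/tlb.h definition of struct mmu_gather; the
mask/switch formulation is illustrative only, not code posted in this
series:

	static inline int tlb_get_level(struct mmu_gather *tlb)
	{
		/* Collapse the four 1-bit fields into one small mask. */
		unsigned int mask = (tlb->cleared_ptes << 0) |
				    (tlb->cleared_pmds << 1) |
				    (tlb->cleared_puds << 2) |
				    (tlb->cleared_p4ds << 3);

		/* Only a single cleared level gives a usable TTL hint. */
		switch (mask) {
		case 1: return 3;	/* only ptes cleared */
		case 2: return 2;	/* only pmds cleared */
		case 4: return 1;	/* only puds cleared */
		default: return 0;	/* none, p4ds, or multiple levels */
		}
	}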
Zhenyu Ye April 21, 2020, 12:22 p.m. UTC | #4
On 2020/4/21 16:30, Peter Zijlstra wrote:
> On Mon, Apr 20, 2020 at 08:06:16PM -0400, Steven Rostedt wrote:
>> Peter Zijlstra <peterz@infradead.org> wrote:
>>> On Fri, Apr 03, 2020 at 05:00:48PM +0800, Zhenyu Ye wrote:
> 
>>>> +static inline int tlb_get_level(struct mmu_gather *tlb)
>>>> +{
>>>> +	int sum = tlb->cleared_ptes + tlb->cleared_pmds +
>>>> +		  tlb->cleared_puds + tlb->cleared_p4ds;
>>>> +
>>>> +	if (sum != 1)
>>>> +		return 0;
>>>> +	else if (tlb->cleared_ptes)
>>>> +		return 3;
>>>> +	else if (tlb->cleared_pmds)
>>>> +		return 2;
>>>> +	else if (tlb->cleared_puds)
>>>> +		return 1;
>>>> +
>>>> +	return 0;
>>>> +}  
>>>
>>> That's some mighty wonky code. Please look at the generated asm.
>>
>> Without even looking at the generated asm, if a condition returns,
>> there's no reason to add an else for that condition.
> 
> Not really the point; he wants to guarantee he only returns >0 when
> there's a single bit set. But the thing is, cleared_* is a bitfield, and
> I'm afraid that the above will result in some terrible code-gen.
> 
> Maybe something like:
> 
> 	if (tlb->cleared_ptes && !(tlb->cleared_pmds ||
> 				   tlb->cleared_puds ||
> 				   tlb->cleared_p4ds))
> 		return 3;
> 
> 	if (tlb->cleared_pmds && !(tlb->cleared_ptes ||
> 				   tlb->cleared_puds ||
> 				   tlb->cleared_p4ds))
> 		return 2;
> 
> 	if (tlb->cleared_puds && !(tlb->cleared_ptes ||
> 				   tlb->cleared_pmds ||
> 				   tlb->cleared_p4ds))
> 		return 1;
> 
> 	return 0;
> 
> Which I admit is far too much typing, but I suspect it generates far
> saner code (just a few masks and branches).
> 
> But maybe the compiler surprises us, what do I know.

Thanks for your review.  In my view, the generated asm should behave the
same as the C code even though cleared_* are bitfields (under -O2
optimization).

Below is the generated asm of my code (gcc version is 7.3.0):

<tlb_flush_mmu_tlbonly.part.110>:
	...
	ubfx	x5, x2, #3, #1		// x2 stores the values of cleared_*
	ubfx	x1, x2, #4, #1
	add	w1, w1, w5
	ubfx	x5, x2, #5, #1
	add	w1, w1, w5
	ubfx	x2, x2, #6, #1
	add	w1, w1, w2		// then the w1 = sum of cleared_*
	tbnz	w3, #3, 001030f8b8
	tbz	w3, #4, 001030fac0
	cmp	w1, #0x1		// cmp the w1 to select branch
	mov	w5, #0x2
	...				// do the if-else below...


Then with your code above, the generated asm is:

<tlb_flush_mmu_tlbonly.part.110>:
	...
	tbnz    w1, #3, 001030f8a0	// w1 stores the values of cleared_*
	tbz     w1, #4, 001030fac0
	and     w2, w1, #0x78		// mask the cleared_* to w2
	mov     x4, #0x200000
	mov     w7, #0x15
	mov     w6, #0x3
	cmp     w2, #0x8		// cmp the w2 to 0x8, 0x10, 0x20 to
					// select branch
	b.ne    ffff80001030f8b8
	...				// do the if-else below...

So at the generated-asm level, both versions are OK.  But your code is
really saner than mine.

Thanks for the suggestion; I will send a new patch series soon.

Zhenyu

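For background: the cleared_* fields discussed above are single-bit
bitfields in the generic struct mmu_gather, set when the corresponding
page-table level is torn down. Roughly, paraphrased from
include/asm-generic/tlb.h of this era (trimmed, not quoted verbatim):

	/* in struct mmu_gather, trimmed to the relevant fields: */
	unsigned int		cleared_ptes : 1;
	unsigned int		cleared_pmds : 1;
	unsigned int		cleared_puds : 1;
	unsigned int		cleared_p4ds : 1;

	/* e.g. the generic pte-teardown path marks the pte level: */
	#define tlb_remove_tlb_entry(tlb, ptep, address)		\
		do {							\
			__tlb_adjust_range(tlb, address, PAGE_SIZE);	\
			tlb->cleared_ptes = 1;				\
			__tlb_remove_tlb_entry(tlb, ptep, address);	\
		} while (0)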

Patch

diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h
index b76df828e6b7..d5ab72eccff4 100644
--- a/arch/arm64/include/asm/tlb.h
+++ b/arch/arm64/include/asm/tlb.h
@@ -21,11 +21,34 @@  static void tlb_flush(struct mmu_gather *tlb);
 
 #include <asm-generic/tlb.h>
 
+/*
+ * Get the TLBI level for arm64.  The default value is 0 if more
+ * than one of the cleared_* fields is set, or if none is set.
+ * arm64 doesn't support p4ds yet.
+ */
+static inline int tlb_get_level(struct mmu_gather *tlb)
+{
+	int sum = tlb->cleared_ptes + tlb->cleared_pmds +
+		  tlb->cleared_puds + tlb->cleared_p4ds;
+
+	if (sum != 1)
+		return 0;
+	else if (tlb->cleared_ptes)
+		return 3;
+	else if (tlb->cleared_pmds)
+		return 2;
+	else if (tlb->cleared_puds)
+		return 1;
+
+	return 0;
+}
+
 static inline void tlb_flush(struct mmu_gather *tlb)
 {
 	struct vm_area_struct vma = TLB_FLUSH_VMA(tlb->mm, 0);
 	bool last_level = !tlb->freed_tables;
 	unsigned long stride = tlb_get_unmap_size(tlb);
+	int tlb_level = tlb_get_level(tlb);
 
 	/*
 	 * If we're tearing down the address space then we only care about
@@ -38,7 +61,8 @@  static inline void tlb_flush(struct mmu_gather *tlb)
 		return;
 	}
 
-	__flush_tlb_range(&vma, tlb->start, tlb->end, stride, last_level);
+	__flush_tlb_range(&vma, tlb->start, tlb->end, stride,
+			  last_level, tlb_level);
 }
 
 static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index 892f33235dc7..3cc705755a2d 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -215,7 +215,8 @@  static inline void flush_tlb_page(struct vm_area_struct *vma,
 
 static inline void __flush_tlb_range(struct vm_area_struct *vma,
 				     unsigned long start, unsigned long end,
-				     unsigned long stride, bool last_level)
+				     unsigned long stride, bool last_level,
+				     int tlb_level)
 {
 	unsigned long asid = ASID(vma->vm_mm);
 	unsigned long addr;
@@ -237,11 +238,11 @@  static inline void __flush_tlb_range(struct vm_area_struct *vma,
 	dsb(ishst);
 	for (addr = start; addr < end; addr += stride) {
 		if (last_level) {
-			__tlbi_level(vale1is, addr, 0);
-			__tlbi_user_level(vale1is, addr, 0);
+			__tlbi_level(vale1is, addr, tlb_level);
+			__tlbi_user_level(vale1is, addr, tlb_level);
 		} else {
-			__tlbi_level(vae1is, addr, 0);
-			__tlbi_user_level(vae1is, addr, 0);
+			__tlbi_level(vae1is, addr, tlb_level);
+			__tlbi_user_level(vae1is, addr, tlb_level);
 		}
 	}
 	dsb(ish);
@@ -253,8 +254,9 @@  static inline void flush_tlb_range(struct vm_area_struct *vma,
 	/*
 	 * We cannot use leaf-only invalidation here, since we may be invalidating
 	 * table entries as part of collapsing hugepages or moving page tables.
+	 * Set the tlb_level to 0 because we cannot get enough information here.
 	 */
-	__flush_tlb_range(vma, start, end, PAGE_SIZE, false);
+	__flush_tlb_range(vma, start, end, PAGE_SIZE, false, 0);
 }
 
 static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end)
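
Note: __tlbi_level() and __tlbi_user_level() are introduced earlier in
this series (patch 2/6) and are not shown here. The idea behind them,
per ARMv8.4-TTL, is that bits [47:44] of the TLBI VA argument carry a
4-bit TTL hint: the upper two bits encode the translation granule and
the lower two the level that holds the leaf entry, with level 0 meaning
"no hint". A simplified sketch of such a macro, assuming a 4KB granule
and omitting the CPU-feature check the real series needs:

	#define TLBI_TTL_SHIFT	44

	#define __tlbi_level(op, addr, level) do {			\
		u64 arg = (u64)(addr);					\
		if (level) {						\
			/* TTL[3:2] = 0b01 (4KB), TTL[1:0] = level */	\
			u64 ttl = (1UL << 2) | ((level) & 3);		\
			arg |= ttl << TLBI_TTL_SHIFT;			\
		}							\
		__tlbi(op, arg);					\
	} while (0)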