diff mbox series

[RFC,v5,4/8] mm: tlb: Pass struct mmu_gather to flush_pmd_tlb_range

Message ID 20200331142927.1237-5-yezhenyu2@huawei.com (mailing list archive)
State New, archived
Headers show
Series arm64: tlb: add support for TTL feature | expand

Commit Message

Zhenyu Ye March 31, 2020, 2:29 p.m. UTC
Preparations to support passing struct mmu_gather to
flush_tlb_range().  See future patches.

Signed-off-by: Zhenyu Ye <yezhenyu2@huawei.com>
---
 arch/arc/include/asm/hugepage.h               |  4 +--
 arch/arc/include/asm/tlbflush.h               |  5 +--
 arch/arc/mm/tlb.c                             |  4 +--
 arch/powerpc/include/asm/book3s/64/tlbflush.h |  3 +-
 arch/powerpc/mm/book3s64/pgtable.c            |  8 ++++-
 include/asm-generic/pgtable.h                 |  4 +--
 mm/pgtable-generic.c                          | 35 ++++++++++++++++---
 7 files changed, 48 insertions(+), 15 deletions(-)

Comments

Peter Zijlstra March 31, 2020, 3:13 p.m. UTC | #1
On Tue, Mar 31, 2020 at 10:29:23PM +0800, Zhenyu Ye wrote:
> diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
> index e2e2bef07dd2..32d4661e5a56 100644
> --- a/include/asm-generic/pgtable.h
> +++ b/include/asm-generic/pgtable.h
> @@ -1160,10 +1160,10 @@ static inline int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
>   * invalidate the entire TLB which is not desitable.
>   * e.g. see arch/arc: flush_pmd_tlb_range
>   */
> -#define flush_pmd_tlb_range(vma, addr, end)	flush_tlb_range(vma, addr, end)
> +#define flush_pmd_tlb_range(tlb, vma, addr, end)	flush_tlb_range(vma, addr, end)
>  #define flush_pud_tlb_range(vma, addr, end)	flush_tlb_range(vma, addr, end)
>  #else
> -#define flush_pmd_tlb_range(vma, addr, end)	BUILD_BUG()
> +#define flush_pmd_tlb_range(tlb, vma, addr, end)	BUILD_BUG()
>  #define flush_pud_tlb_range(vma, addr, end)	BUILD_BUG()
>  #endif
>  #endif
> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
> index 3d7c01e76efc..96c9cf77bfb5 100644
> --- a/mm/pgtable-generic.c
> +++ b/mm/pgtable-generic.c
> @@ -109,8 +109,14 @@ int pmdp_set_access_flags(struct vm_area_struct *vma,
>  	int changed = !pmd_same(*pmdp, entry);
>  	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
>  	if (changed) {
> +		struct mmu_gather tlb;
> +		unsigned long tlb_start = address;
> +		unsigned long tlb_end = address + HPAGE_PMD_SIZE;
>  		set_pmd_at(vma->vm_mm, address, pmdp, entry);
> -		flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
> +		tlb_gather_mmu(&tlb, vma->vm_mm, tlb_start, tlb_end);
> +		tlb.cleared_pmds = 1;
> +		flush_pmd_tlb_range(&tlb, vma, tlb_start, tlb_end);
> +		tlb_finish_mmu(&tlb, tlb_start, tlb_end);
>  	}
>  	return changed;
>  }

This is madness. Please, carefully consider what you just wrote and what
it will do in the generic case.

Instead of trying to retro-fit flush_*tlb_range() to take an mmu_gather
parameter, please replace them out-right.
Zhenyu Ye April 1, 2020, 8:51 a.m. UTC | #2
Hi Peter,

On 2020/3/31 23:13, Peter Zijlstra wrote:
> On Tue, Mar 31, 2020 at 10:29:23PM +0800, Zhenyu Ye wrote:
>> diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
>> index e2e2bef07dd2..32d4661e5a56 100644
>> --- a/include/asm-generic/pgtable.h
>> +++ b/include/asm-generic/pgtable.h
>> @@ -1160,10 +1160,10 @@ static inline int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
>>   * invalidate the entire TLB which is not desitable.
>>   * e.g. see arch/arc: flush_pmd_tlb_range
>>   */
>> -#define flush_pmd_tlb_range(vma, addr, end)	flush_tlb_range(vma, addr, end)
>> +#define flush_pmd_tlb_range(tlb, vma, addr, end)	flush_tlb_range(vma, addr, end)
>>  #define flush_pud_tlb_range(vma, addr, end)	flush_tlb_range(vma, addr, end)
>>  #else
>> -#define flush_pmd_tlb_range(vma, addr, end)	BUILD_BUG()
>> +#define flush_pmd_tlb_range(tlb, vma, addr, end)	BUILD_BUG()
>>  #define flush_pud_tlb_range(vma, addr, end)	BUILD_BUG()
>>  #endif
>>  #endif
>> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
>> index 3d7c01e76efc..96c9cf77bfb5 100644
>> --- a/mm/pgtable-generic.c
>> +++ b/mm/pgtable-generic.c
>> @@ -109,8 +109,14 @@ int pmdp_set_access_flags(struct vm_area_struct *vma,
>>  	int changed = !pmd_same(*pmdp, entry);
>>  	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
>>  	if (changed) {
>> +		struct mmu_gather tlb;
>> +		unsigned long tlb_start = address;
>> +		unsigned long tlb_end = address + HPAGE_PMD_SIZE;
>>  		set_pmd_at(vma->vm_mm, address, pmdp, entry);
>> -		flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
>> +		tlb_gather_mmu(&tlb, vma->vm_mm, tlb_start, tlb_end);
>> +		tlb.cleared_pmds = 1;
>> +		flush_pmd_tlb_range(&tlb, vma, tlb_start, tlb_end);
>> +		tlb_finish_mmu(&tlb, tlb_start, tlb_end);
>>  	}
>>  	return changed;
>>  }
> 
> This is madness. Please, carefully consider what you just wrote and what
> it will do in the generic case.
> 
> Instead of trying to retro-fit flush_*tlb_range() to take an mmu_gather
> parameter, please replace them out-right.
> 

I'm sorry that I'm not sure what "replace them out-right" means.  Do you
mean that I should define flush_*_tlb_range like this?

#define flush_pmd_tlb_range(vma, addr, end)				\
	do {								\
		struct mmu_gather tlb;					\
		tlb_gather_mmu(&tlb, (vma)->vm_mm, addr, end);		\
		tlb.cleared_pmds = 1;					\
		flush_tlb_range(&tlb, vma, addr, end);			\
		tlb_finish_mmu(&tlb, addr, end);			\
	} while (0)


Thanks,
Zhenyu
Peter Zijlstra April 1, 2020, 12:20 p.m. UTC | #3
On Wed, Apr 01, 2020 at 04:51:15PM +0800, Zhenyu Ye wrote:
> On 2020/3/31 23:13, Peter Zijlstra wrote:

> > Instead of trying to retro-fit flush_*tlb_range() to take an mmu_gather
> > parameter, please replace them out-right.
> > 
> 
> I'm sorry that I'm not sure what "replace them out-right" means.  Do you
> mean that I should define flush_*_tlb_range like this?
> 
> #define flush_pmd_tlb_range(vma, addr, end)				\
> 	do {								\
> 		struct mmu_gather tlb;					\
> 		tlb_gather_mmu(&tlb, (vma)->vm_mm, addr, end);		\
> 	tlb.cleared_pmds = 1;					\
> 		flush_tlb_range(&tlb, vma, addr, end);			\
> 		tlb_finish_mmu(&tlb, addr, end);			\
> 	} while (0)
> 

I was thinking to remove flush_*tlb_range() entirely (from generic
code).

And specifically to not use them like the above; instead extend the
mmu_gather API.

Specifically, if you wanted to express flush_pmd_tlb_range() in mmu
gather, you'd write it like:

static inline void flush_pmd_tlb_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end)
{
	struct mmu_gather tlb;

	tlb_gather_mmu(&tlb, vma->vm_mm, addr, end);
	tlb_start_vma(&tlb, vma);
	tlb.cleared_pmds = 1;
	__tlb_adjust_range(&tlb, addr, end - addr);
	tlb_end_vma(&tlb, vma);
	tlb_finish_mmu(&tlb, addr, end);
}

Except of course, that the code between start_vma and end_vma is not a
proper mmu_gather API.

So maybe add:

  tlb_flush_{pte,pmd,pud,p4d}_range()

Then we can write:

static inline void flush_XXX_tlb_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end)
{
	struct mmu_gather tlb;

	tlb_gather_mmu(&tlb, vma->vm_mm, addr, end);
	tlb_start_vma(&tlb, vma);
	tlb_flush_XXX_range(&tlb, addr, end - addr);
	tlb_end_vma(&tlb, vma);
	tlb_finish_mmu(&tlb, addr, end);
}

But when I look at the output of:

  git grep flush_.*tlb_range -- :^arch/

I doubt it makes sense to provide wrappers like the above.

( Also, we should probably remove the (addr, end) arguments from
tlb_finish_mmu(), Will? )

---
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index f391f6b500b4..be5452a8efaa 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -511,6 +511,34 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm
 }
 #endif
 
+static inline void tlb_flush_pte_range(struct mmu_gather *tlb,
+				       unsigned long address, unsigned long size)
+{
+	__tlb_adjust_range(tlb, address, size);
+	tlb->cleared_ptes = 1;
+}
+
+static inline void tlb_flush_pmd_range(struct mmu_gather *tlb,
+				       unsigned long address, unsigned long size)
+{
+	__tlb_adjust_range(tlb, address, size);
+	tlb->cleared_pmds = 1;
+}
+
+static inline void tlb_flush_pud_range(struct mmu_gather *tlb,
+				       unsigned long address, unsigned long size)
+{
+	__tlb_adjust_range(tlb, address, size);
+	tlb->cleared_puds = 1;
+}
+
+static inline void tlb_flush_p4d_range(struct mmu_gather *tlb,
+				       unsigned long address, unsigned long size)
+{
+	__tlb_adjust_range(tlb, address, size);
+	tlb->cleared_p4ds = 1;
+}
+
 #ifndef __tlb_remove_tlb_entry
 #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
 #endif
@@ -524,8 +552,7 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm
  */
 #define tlb_remove_tlb_entry(tlb, ptep, address)		\
 	do {							\
-		__tlb_adjust_range(tlb, address, PAGE_SIZE);	\
-		tlb->cleared_ptes = 1;				\
+		tlb_flush_pte_range(tlb, address, PAGE_SIZE);	\
 		__tlb_remove_tlb_entry(tlb, ptep, address);	\
 	} while (0)
 
@@ -550,8 +577,7 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm
 
 #define tlb_remove_pmd_tlb_entry(tlb, pmdp, address)			\
 	do {								\
-		__tlb_adjust_range(tlb, address, HPAGE_PMD_SIZE);	\
-		tlb->cleared_pmds = 1;					\
+		tlb_flush_pmd_range(tlb, address, HPAGE_PMD_SIZE);	\
 		__tlb_remove_pmd_tlb_entry(tlb, pmdp, address);		\
 	} while (0)
 
@@ -565,8 +591,7 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm
 
 #define tlb_remove_pud_tlb_entry(tlb, pudp, address)			\
 	do {								\
-		__tlb_adjust_range(tlb, address, HPAGE_PUD_SIZE);	\
-		tlb->cleared_puds = 1;					\
+		tlb_flush_pud_range(tlb, address, HPAGE_PUD_SIZE);	\
 		__tlb_remove_pud_tlb_entry(tlb, pudp, address);		\
 	} while (0)
Zhenyu Ye April 2, 2020, 11:24 a.m. UTC | #4
Hi Peter,

On 2020/4/1 20:20, Peter Zijlstra wrote:
> On Wed, Apr 01, 2020 at 04:51:15PM +0800, Zhenyu Ye wrote:
>> On 2020/3/31 23:13, Peter Zijlstra wrote:
> 
>>> Instead of trying to retro-fit flush_*tlb_range() to take an mmu_gather
>>> parameter, please replace them out-right.
>>>
>>
>> I'm sorry that I'm not sure what "replace them out-right" means.  Do you
>> mean that I should define flush_*_tlb_range like this?
>>
>> #define flush_pmd_tlb_range(vma, addr, end)				\
>> 	do {								\
>> 		struct mmu_gather tlb;					\
>> 		tlb_gather_mmu(&tlb, (vma)->vm_mm, addr, end);		\
>> 	tlb.cleared_pmds = 1;					\
>> 		flush_tlb_range(&tlb, vma, addr, end);			\
>> 		tlb_finish_mmu(&tlb, addr, end);			\
>> 	} while (0)
>>
> 
> I was thinking to remove flush_*tlb_range() entirely (from generic
> code).
> 
> And specifically to not use them like the above; instead extend the
> mmu_gather API.
> 
> Specifically, if you wanted to express flush_pmd_tlb_range() in mmu
> gather, you'd write it like:
> 
> static inline void flush_pmd_tlb_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end)
> {
> 	struct mmu_gather tlb;
> 
> 	tlb_gather_mmu(&tlb, vma->vm_mm, addr, end);
> 	tlb_start_vma(&tlb, vma);
> 	tlb.cleared_pmds = 1;
> 	__tlb_adjust_range(&tlb, addr, end - addr);
> 	tlb_end_vma(&tlb, vma);
> 	tlb_finish_mmu(&tlb, addr, end);
> }
> 
> Except of course, that the code between start_vma and end_vma is not a
> proper mmu_gather API.
> 
> So maybe add:
> 
>   tlb_flush_{pte,pmd,pud,p4d}_range()
> 
> Then we can write:
> 
> static inline void flush_XXX_tlb_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end)
> {
> 	struct mmu_gather tlb;
> 
> 	tlb_gather_mmu(&tlb, vma->vm_mm, addr, end);
> 	tlb_start_vma(&tlb, vma);
> 	tlb_flush_XXX_range(&tlb, addr, end - addr);
> 	tlb_end_vma(&tlb, vma);
> 	tlb_finish_mmu(&tlb, addr, end);
> }
> 
> But when I look at the output of:
> 
>   git grep flush_.*tlb_range -- :^arch/
> 
> I doubt it makes sense to provide wrappers like the above.
> 

Thanks for your detailed explanation.  I notice that you used
`tlb_end_vma` to replace `flush_tlb_range`, which will call `tlb_flush`,
then finally call `flush_tlb_range` in generic code.  However, some
architectures define tlb_end_vma|tlb_flush|flush_tlb_range themselves,
so this may cause problems.

For example, in s390, it defines:

#define tlb_end_vma(tlb, vma)			do { } while (0)

And it doesn't define its own flush_pmd_tlb_range().  So there will be
a mistake if we changed flush_pmd_tlb_range() using tlb_end_vma().

Is this really a problem or something I understand wrong ?



If true, I think there are three ways to solve this problem:

1. use `flush_tlb_range` rather than `tlb_end_vma` in flush_XXX_tlb_range;
   In this way, we still need retro-fit `flush_tlb_range` to take an mmu_gather
parameter.

2. use `tlb_flush` rather than `tlb_end_vma`.
   There is a constraint such like:

	#ifndef tlb_flush
	#if defined(tlb_start_vma) || defined(tlb_end_vma)
	#error Default tlb_flush() relies on default tlb_start_vma() and tlb_end_vma()
	#endif

   So all architectures that define tlb_{start|end}_vma have defined tlb_flush.
Also, we can add a constraint to flush_XXX_tlb_range such like:

	#ifndef flush_XXX_tlb_range
	#if defined(tlb_start_vma) || defined(tlb_end_vma)
	#error Default flush_XXX_tlb_range() relies on default tlb_start/end_vma()
	#endif

3. Define flush_XXX_tlb_range() architecture-self, and keep original define in
generic code, such as:

In arm64:
	#define flush_XXX_tlb_range flush_XXX_tlb_range

In generic:
	#ifndef flush_XXX_tlb_range
	#define flush_XXX_tlb_range flush_tlb_range


Which do you think is more appropriate?


> ( Also, we should probably remove the (addr, end) arguments from
> tlb_finish_mmu(), Will? )
> 

This can be changed quickly. If you want I can do this with a
separate patch.

> ---
> diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
> index f391f6b500b4..be5452a8efaa 100644
> --- a/include/asm-generic/tlb.h
> +++ b/include/asm-generic/tlb.h
> @@ -511,6 +511,34 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm
>  }
>  #endif
>  
> +static inline void tlb_flush_pte_range(struct mmu_gather *tlb,
> +				       unsigned long address, unsigned long size)
> +{
> +	__tlb_adjust_range(tlb, address, size);
> +	tlb->cleared_ptes = 1;
> +}
> +
> +static inline void tlb_flush_pmd_range(struct mmu_gather *tlb,
> +				       unsigned long address, unsigned long size)
> +{
> +	__tlb_adjust_range(tlb, address, size);
> +	tlb->cleared_pmds = 1;
> +}
> +
> +static inline void tlb_flush_pud_range(struct mmu_gather *tlb,
> +				       unsigned long address, unsigned long size)
> +{
> +	__tlb_adjust_range(tlb, address, size);
> +	tlb->cleared_puds = 1;
> +}
> +
> +static inline void tlb_flush_p4d_range(struct mmu_gather *tlb,
> +				       unsigned long address, unsigned long size)
> +{
> +	__tlb_adjust_range(tlb, address, size);
> +	tlb->cleared_p4ds = 1;
> +}
> +

By the way, I think the name of tlb_set_XXX_range() is more suitable, because
we don't do actual flush there.

>  #ifndef __tlb_remove_tlb_entry
>  #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
>  #endif
> @@ -524,8 +552,7 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm
>   */
>  #define tlb_remove_tlb_entry(tlb, ptep, address)		\
>  	do {							\
> -		__tlb_adjust_range(tlb, address, PAGE_SIZE);	\
> -		tlb->cleared_ptes = 1;				\
> +		tlb_flush_pte_range(tlb, address, PAGE_SIZE);	\
>  		__tlb_remove_tlb_entry(tlb, ptep, address);	\
>  	} while (0)
>  
> @@ -550,8 +577,7 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm
>  
>  #define tlb_remove_pmd_tlb_entry(tlb, pmdp, address)			\
>  	do {								\
> -		__tlb_adjust_range(tlb, address, HPAGE_PMD_SIZE);	\
> -		tlb->cleared_pmds = 1;					\
> +		tlb_flush_pmd_range(tlb, address, HPAGE_PMD_SIZE);	\
>  		__tlb_remove_pmd_tlb_entry(tlb, pmdp, address);		\
>  	} while (0)
>  
> @@ -565,8 +591,7 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm
>  
>  #define tlb_remove_pud_tlb_entry(tlb, pudp, address)			\
>  	do {								\
> -		__tlb_adjust_range(tlb, address, HPAGE_PUD_SIZE);	\
> -		tlb->cleared_puds = 1;					\
> +		tlb_flush_pud_range(tlb, address, HPAGE_PUD_SIZE);	\
>  		__tlb_remove_pud_tlb_entry(tlb, pudp, address);		\
>  	} while (0)
>  
> 
> .
>
Peter Zijlstra April 2, 2020, 4:38 p.m. UTC | #5
On Thu, Apr 02, 2020 at 07:24:04PM +0800, Zhenyu Ye wrote:
> Thanks for your detailed explanation.  I notice that you used
> `tlb_end_vma` to replace `flush_tlb_range`, which will call `tlb_flush`,
> then finally call `flush_tlb_range` in generic code.  However, some
> architectures define tlb_end_vma|tlb_flush|flush_tlb_range themselves,
> so this may cause problems.
> 
> For example, in s390, it defines:
> 
> #define tlb_end_vma(tlb, vma)			do { } while (0)
> 
> And it doesn't define its own flush_pmd_tlb_range().  So there will be
> a mistake if we changed flush_pmd_tlb_range() using tlb_end_vma().
> 
> Is this really a problem or something I understand wrong ?

If tlb_end_vma() is a no-op, then tlb_finish_mmu() will do:
tlb_flush_mmu() -> tlb_flush_mmu_tlbonly() -> tlb_flush()

And s390 has tlb_flush().

If tlb_end_vma() is not a no-op and it calls tlb_flush_mmu_tlbonly(),
then tlb_finish_mmu()'s invocation of tlb_flush_mmu_tlbonly() will
terminate early due to no flags set.

IOW, it should all just work.


FYI the whole tlb_{start,end}_vma() thing is only needed when the
architecture doesn't implement tlb_flush() and instead default to using
flush_tlb_range(), at which point we need to provide a 'fake' vma.

At the time I audited all architectures and they only look at VM_EXEC
(to do $I invalidation) and VM_HUGETLB (for pmd level invalidations),
but I forgot which architectures that were.

But that is all legacy code; eventually we'll get all archs a native
tlb_flush() and this can go away.
Zhenyu Ye April 3, 2020, 5:14 a.m. UTC | #6
Hi Peter,

On 2020/4/3 0:38, Peter Zijlstra wrote:
> On Thu, Apr 02, 2020 at 07:24:04PM +0800, Zhenyu Ye wrote:
>> Thanks for your detailed explanation.  I notice that you used
>> `tlb_end_vma` to replace `flush_tlb_range`, which will call `tlb_flush`,
>> then finally call `flush_tlb_range` in generic code.  However, some
>> architectures define tlb_end_vma|tlb_flush|flush_tlb_range themselves,
>> so this may cause problems.
>>
>> For example, in s390, it defines:
>>
>> #define tlb_end_vma(tlb, vma)			do { } while (0)
>>
>> And it doesn't define its own flush_pmd_tlb_range().  So there will be
>> a mistake if we changed flush_pmd_tlb_range() using tlb_end_vma().
>>
>> Is this really a problem or something I understand wrong ?
> 
> If tlb_end_vma() is a no-op, then tlb_finish_mmu() will do:
> tlb_flush_mmu() -> tlb_flush_mmu_tlbonly() -> tlb_flush()
> 
> And s390 has tlb_flush().
> 
> If tlb_end_vma() is not a no-op and it calls tlb_flush_mmu_tlbonly(),
> then tlb_finish_mmu()'s invocation of tlb_flush_mmu_tlbonly() will
> terminate early due to no flags set.
> 
> IOW, it should all just work.
> 
> 
> FYI the whole tlb_{start,end}_vma() thing is only needed when the
> architecture doesn't implement tlb_flush() and instead default to using
> flush_tlb_range(), at which point we need to provide a 'fake' vma.
> 
> At the time I audited all architectures and they only look at VM_EXEC
> (to do $I invalidation) and VM_HUGETLB (for pmd level invalidations),
> but I forgot which architectures that were.

Many architectures, such as alpha, arc, arm and so on.
I really understand why you hate making vma->vm_flags more important for
tlbi :).

> But that is all legacy code; eventually we'll get all archs a native
> tlb_flush() and this can go away.
> 

Thanks for your reply.  Currently, to enable the TTL feature, extending
the flush_*tlb_range() may be more convenient.
I will send a formal PATCH soon.

Thanks,
Zhenyu
Zhenyu Ye April 8, 2020, 9 a.m. UTC | #7
Hi Peter,

On 2020/4/3 13:14, Zhenyu Ye wrote:
> Hi Peter,
> 
> On 2020/4/3 0:38, Peter Zijlstra wrote:
>> On Thu, Apr 02, 2020 at 07:24:04PM +0800, Zhenyu Ye wrote:
>>> Thanks for your detailed explanation.  I notice that you used
>>> `tlb_end_vma` to replace `flush_tlb_range`, which will call `tlb_flush`,
>>> then finally call `flush_tlb_range` in generic code.  However, some
>>> architectures define tlb_end_vma|tlb_flush|flush_tlb_range themselves,
>>> so this may cause problems.
>>>
>>> For example, in s390, it defines:
>>>
>>> #define tlb_end_vma(tlb, vma)			do { } while (0)
>>>
>>> And it doesn't define its own flush_pmd_tlb_range().  So there will be
>>> a mistake if we changed flush_pmd_tlb_range() using tlb_end_vma().
>>>
>>> Is this really a problem or something I understand wrong ?
>>
>> If tlb_end_vma() is a no-op, then tlb_finish_mmu() will do:
>> tlb_flush_mmu() -> tlb_flush_mmu_tlbonly() -> tlb_flush()
>>
>> And s390 has tlb_flush().
>>
>> If tlb_end_vma() is not a no-op and it calls tlb_flush_mmu_tlbonly(),
>> then tlb_finish_mmu()'s invocation of tlb_flush_mmu_tlbonly() will
>> terminate early due to no flags set.
>>
>> IOW, it should all just work.
>>
>>
>> FYI the whole tlb_{start,end}_vma() thing is only needed when the
>> architecture doesn't implement tlb_flush() and instead default to using
>> flush_tlb_range(), at which point we need to provide a 'fake' vma.
>>
>> At the time I audited all architectures and they only look at VM_EXEC
>> (to do $I invalidation) and VM_HUGETLB (for pmd level invalidations),
>> but I forgot which architectures that were.
> 
> Many architectures, such as alpha, arc, arm and so on.
> I really understand why you hate making vma->vm_flags more important for
> tlbi :).
> 
>> But that is all legacy code; eventually we'll get all archs a native
>> tlb_flush() and this can go away.
>>
> 
> Thanks for your reply.  Currently, to enable the TTL feature, extending
> the flush_*tlb_range() may be more convenient.
> I will send a formal PATCH soon.
> 
> Thanks,
> Zhenyu
> 

I had sent [PATCH v1] a few days ago[1].  Do you have time to review
my changes?  Are those changes appropriate?

Waiting for your suggestion.

[1] https://lore.kernel.org/linux-arm-kernel/20200403090048.938-1-yezhenyu2@huawei.com/

Thanks,
Zhenyu
diff mbox series

Patch

diff --git a/arch/arc/include/asm/hugepage.h b/arch/arc/include/asm/hugepage.h
index 30ac40fed2c5..c2b325dd47f2 100644
--- a/arch/arc/include/asm/hugepage.h
+++ b/arch/arc/include/asm/hugepage.h
@@ -67,8 +67,8 @@  extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
 extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
 
 #define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
-extern void flush_pmd_tlb_range(struct vm_area_struct *vma, unsigned long start,
-				unsigned long end);
+extern void flush_pmd_tlb_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
+				unsigned long start, unsigned long end);
 
 /* We don't have hardware dirty/accessed bits, generic_pmdp_establish is fine.*/
 #define pmdp_establish generic_pmdp_establish
diff --git a/arch/arc/include/asm/tlbflush.h b/arch/arc/include/asm/tlbflush.h
index 992a2837a53f..49e4e5b59bb2 100644
--- a/arch/arc/include/asm/tlbflush.h
+++ b/arch/arc/include/asm/tlbflush.h
@@ -26,7 +26,7 @@  void local_flush_pmd_tlb_range(struct vm_area_struct *vma, unsigned long start,
 #define flush_tlb_all()			local_flush_tlb_all()
 #define flush_tlb_mm(mm)		local_flush_tlb_mm(mm)
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define flush_pmd_tlb_range(vma, s, e)	local_flush_pmd_tlb_range(vma, s, e)
+#define flush_pmd_tlb_range(tlb, vma, s, e)	local_flush_pmd_tlb_range(vma, s, e)
 #endif
 #else
 extern void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
@@ -36,7 +36,8 @@  extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
 extern void flush_tlb_all(void);
 extern void flush_tlb_mm(struct mm_struct *mm);
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-extern void flush_pmd_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end);
+extern void flush_pmd_tlb_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
+				unsigned long start, unsigned long end);
 #endif
 #endif /* CONFIG_SMP */
 #endif
diff --git a/arch/arc/mm/tlb.c b/arch/arc/mm/tlb.c
index c340acd989a0..10b2a2373dc0 100644
--- a/arch/arc/mm/tlb.c
+++ b/arch/arc/mm/tlb.c
@@ -464,8 +464,8 @@  void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-void flush_pmd_tlb_range(struct vm_area_struct *vma, unsigned long start,
-			 unsigned long end)
+void flush_pmd_tlb_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
+			 unsigned long start, unsigned long end)
 {
 	struct tlb_args ta = {
 		.ta_vma = vma,
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h
index dcb5c3839d2f..6445d179ac15 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h
@@ -47,7 +47,8 @@  static inline void tlbiel_all_lpid(bool radix)
 
 
 #define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
-static inline void flush_pmd_tlb_range(struct vm_area_struct *vma,
+static inline void flush_pmd_tlb_range(struct mmu_gather *tlb,
+				       struct vm_area_struct *vma,
 				       unsigned long start, unsigned long end)
 {
 	if (radix_enabled())
diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
index 2bf7e1b4fd82..0a9c7ad7ee81 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -106,9 +106,15 @@  pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
 		     pmd_t *pmdp)
 {
 	unsigned long old_pmd;
+	struct mmu_gather tlb;
+	unsigned long tlb_start = address;
+	unsigned long tlb_end = address + HPAGE_PMD_SIZE;
 
 	old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, _PAGE_INVALID);
-	flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+	tlb_gather_mmu(&tlb, vma->vm_mm, tlb_start, tlb_end);
+	tlb.cleared_pmds = 1;
+	flush_pmd_tlb_range(&tlb, vma, tlb_start, tlb_end);
+	tlb_finish_mmu(&tlb, tlb_start, tlb_end);
 	/*
 	 * This ensures that generic code that rely on IRQ disabling
 	 * to prevent a parallel THP split work as expected.
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index e2e2bef07dd2..32d4661e5a56 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -1160,10 +1160,10 @@  static inline int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
  * invalidate the entire TLB which is not desitable.
  * e.g. see arch/arc: flush_pmd_tlb_range
  */
-#define flush_pmd_tlb_range(vma, addr, end)	flush_tlb_range(vma, addr, end)
+#define flush_pmd_tlb_range(tlb, vma, addr, end)	flush_tlb_range(vma, addr, end)
 #define flush_pud_tlb_range(vma, addr, end)	flush_tlb_range(vma, addr, end)
 #else
-#define flush_pmd_tlb_range(vma, addr, end)	BUILD_BUG()
+#define flush_pmd_tlb_range(tlb, vma, addr, end)	BUILD_BUG()
 #define flush_pud_tlb_range(vma, addr, end)	BUILD_BUG()
 #endif
 #endif
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 3d7c01e76efc..96c9cf77bfb5 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -109,8 +109,14 @@  int pmdp_set_access_flags(struct vm_area_struct *vma,
 	int changed = !pmd_same(*pmdp, entry);
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 	if (changed) {
+		struct mmu_gather tlb;
+		unsigned long tlb_start = address;
+		unsigned long tlb_end = address + HPAGE_PMD_SIZE;
 		set_pmd_at(vma->vm_mm, address, pmdp, entry);
-		flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+		tlb_gather_mmu(&tlb, vma->vm_mm, tlb_start, tlb_end);
+		tlb.cleared_pmds = 1;
+		flush_pmd_tlb_range(&tlb, vma, tlb_start, tlb_end);
+		tlb_finish_mmu(&tlb, tlb_start, tlb_end);
 	}
 	return changed;
 }
@@ -123,8 +129,15 @@  int pmdp_clear_flush_young(struct vm_area_struct *vma,
 	int young;
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 	young = pmdp_test_and_clear_young(vma, address, pmdp);
-	if (young)
-		flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+	if (young) {
+		struct mmu_gather tlb;
+		unsigned long tlb_start = address;
+		unsigned long tlb_end = address + HPAGE_PMD_SIZE;
+		tlb_gather_mmu(&tlb, vma->vm_mm, tlb_start, tlb_end);
+		tlb.cleared_pmds = 1;
+		flush_pmd_tlb_range(&tlb, vma, tlb_start, tlb_end);
+		tlb_finish_mmu(&tlb, tlb_start, tlb_end);
+	}
 	return young;
 }
 #endif
@@ -134,11 +147,17 @@  pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
 			    pmd_t *pmdp)
 {
 	pmd_t pmd;
+	struct mmu_gather tlb;
+	unsigned long tlb_start = address;
+	unsigned long tlb_end = address + HPAGE_PMD_SIZE;
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 	VM_BUG_ON((pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) &&
 			   !pmd_devmap(*pmdp)) || !pmd_present(*pmdp));
 	pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
-	flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+	tlb_gather_mmu(&tlb, vma->vm_mm, tlb_start, tlb_end);
+	tlb.cleared_pmds = 1;
+	flush_pmd_tlb_range(&tlb, vma, tlb_start, tlb_end);
+	tlb_finish_mmu(&tlb, tlb_start, tlb_end);
 	return pmd;
 }
 
@@ -195,7 +214,13 @@  pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
 		     pmd_t *pmdp)
 {
 	pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mknotpresent(*pmdp));
-	flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+	struct mmu_gather tlb;
+	unsigned long tlb_start = address;
+	unsigned long tlb_end = address + HPAGE_PMD_SIZE;
+	tlb_gather_mmu(&tlb, vma->vm_mm, tlb_start, tlb_end);
+	tlb.cleared_pmds = 1;
+	flush_pmd_tlb_range(&tlb, vma, tlb_start, tlb_end);
+	tlb_finish_mmu(&tlb, tlb_start, tlb_end);
 	return old;
 }
 #endif