diff mbox

[v2] x86: enable RCU based table free

Message ID 20170824092258.12375-1-vkuznets@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Vitaly Kuznetsov Aug. 24, 2017, 9:22 a.m. UTC
On x86 software page-table walkers depend on the fact that remote TLB flush
does an IPI: walk is performed lockless but with interrupts disabled and in
case the pagetable is freed the freeing CPU will get blocked as remote TLB
flush is required. On other architectures which don't require an IPI to do
remote TLB flush we have an RCU-based mechanism (see
include/asm-generic/tlb.h for more details).

In virtualized environments we may want to override .flush_tlb_others hook
in pv_mmu_ops and use a hypercall asking the hypervisor to do remote TLB
flush for us. This breaks the assumption about IPI. Xen PV does this for
years and the upcoming remote TLB flush for Hyper-V will do it too. This
is not safe, software pagetable walkers may step on an already freed page.

Solve the issue by enabling RCU-based table free mechanism. Testing with
kernbench and mmap/munmap microbenchmarks didn't show any notable
performance impact.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Acked-by: Juergen Gross <jgross@suse.com>
---
Changes since v1:
- Enable HAVE_RCU_TABLE_FREE unconditionally to avoid different code paths
  for no reason [Linus Torvalds]
---
 arch/x86/Kconfig           | 1 +
 arch/x86/include/asm/tlb.h | 5 +++++
 arch/x86/mm/pgtable.c      | 8 ++++----
 3 files changed, 10 insertions(+), 4 deletions(-)

Comments

Kirill A. Shutemov Aug. 24, 2017, 9:49 a.m. UTC | #1
On Thu, Aug 24, 2017 at 11:22:58AM +0200, Vitaly Kuznetsov wrote:
> On x86 software page-table walkers depend on the fact that remote TLB flush
> does an IPI: walk is performed lockless but with interrupts disabled and in
> case the pagetable is freed the freeing CPU will get blocked as remote TLB
> flush is required. On other architectures which don't require an IPI to do
> remote TLB flush we have an RCU-based mechanism (see
> include/asm-generic/tlb.h for more details).
> 
> In virtualized environments we may want to override .flush_tlb_others hook
> in pv_mmu_ops and use a hypercall asking the hypervisor to do remote TLB
> flush for us. This breaks the assumption about IPI. Xen PV does this for
> years and the upcoming remote TLB flush for Hyper-V will do it too. This
> is not safe, software pagetable walkers may step on an already freed page.
> 
> Solve the issue by enabling RCU-based table free mechanism. Testing with
> kernbench and mmap/munmap microbenchmarks didn't show any notable
> performance impact.
> 
> Suggested-by: Peter Zijlstra <peterz@infradead.org>
> Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
> Acked-by: Juergen Gross <jgross@suse.com>
> ---
> Changes since v1:
> - Enable HAVE_RCU_TABLE_FREE unconditionally to avoid different code paths
>   for no reason [Linus Torvalds]

Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Peter Zijlstra Aug. 24, 2017, 2:34 p.m. UTC | #2
On Thu, Aug 24, 2017 at 11:22:58AM +0200, Vitaly Kuznetsov wrote:

> diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
> index c7797307fc2b..d43a7fcafee9 100644
> --- a/arch/x86/include/asm/tlb.h
> +++ b/arch/x86/include/asm/tlb.h
> @@ -15,4 +15,9 @@
>  
>  #include <asm-generic/tlb.h>
>  
> +static inline void __tlb_remove_table(void *table)
> +{
> +	free_page_and_swap_cache(table);
> +}

Most other archs have this in pgtable.h, only ARM* has it in tlb.h.

And should we put a comment on explaining _why_ we have RCU_TABLE_FREE
enabled?

>  #endif /* _ASM_X86_TLB_H */
> diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
> index 508a708eb9a6..218834a3e9ad 100644
> --- a/arch/x86/mm/pgtable.c
> +++ b/arch/x86/mm/pgtable.c
> @@ -56,7 +56,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
>  {
>  	pgtable_page_dtor(pte);
>  	paravirt_release_pte(page_to_pfn(pte));
> -	tlb_remove_page(tlb, pte);
> +	tlb_remove_table(tlb, pte);
>  }
>  
>  #if CONFIG_PGTABLE_LEVELS > 2
> @@ -72,21 +72,21 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
>  	tlb->need_flush_all = 1;
>  #endif
>  	pgtable_pmd_page_dtor(page);
> -	tlb_remove_page(tlb, page);
> +	tlb_remove_table(tlb, page);
>  }
>  
>  #if CONFIG_PGTABLE_LEVELS > 3
>  void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
>  {
>  	paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
> -	tlb_remove_page(tlb, virt_to_page(pud));
> +	tlb_remove_table(tlb, virt_to_page(pud));
>  }
>  
>  #if CONFIG_PGTABLE_LEVELS > 4
>  void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
>  {
>  	paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
> -	tlb_remove_page(tlb, virt_to_page(p4d));
> +	tlb_remove_table(tlb, virt_to_page(p4d));
>  }
>  #endif	/* CONFIG_PGTABLE_LEVELS > 4 */
>  #endif	/* CONFIG_PGTABLE_LEVELS > 3 */
> -- 
> 2.13.5
>
Vitaly Kuznetsov Aug. 24, 2017, 3:27 p.m. UTC | #3
Peter Zijlstra <peterz@infradead.org> writes:

> On Thu, Aug 24, 2017 at 11:22:58AM +0200, Vitaly Kuznetsov wrote:
>
>> diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
>> index c7797307fc2b..d43a7fcafee9 100644
>> --- a/arch/x86/include/asm/tlb.h
>> +++ b/arch/x86/include/asm/tlb.h
>> @@ -15,4 +15,9 @@
>>  
>>  #include <asm-generic/tlb.h>
>>  
>> +static inline void __tlb_remove_table(void *table)
>> +{
>> +	free_page_and_swap_cache(table);
>> +}
>
> Most other archs have this in pgtable.h, only ARM* has it in tlb.h.
>

Sure, I can move it in v3 if nobody objects.

> And should we put a comment on explaining _why_ we have RCU_TABLE_FREE
> enabled?

Do you think adding something like

/*
 * While x86 architecture in general requires an IPI to perform TLB
 * shootdown, enablement code for several hypervisors overrides
 * .flush_tlb_others hook in pv_mmu_ops and implements it by issuing
 * a hypercall. To keep software pagetable walkers safe in this case we 
 * switch to RCU based table free (HAVE_RCU_TABLE_FREE). See the comment
 * below 'ifdef CONFIG_HAVE_RCU_TABLE_FREE' in include/asm-generic/tlb.h
 * for more details.
 */

before __tlb_remove_table would suffice? Or do you see a better place
for such comment?

Actually, after enabling HAVE_RCU_TABLE_FREE on x86 we may consider
switching to this mechanism globally: it seems to have negligible effect
on performace (and all major arches will already have it). One step at a
time, though.
Peter Zijlstra Aug. 24, 2017, 4:06 p.m. UTC | #4
On Thu, Aug 24, 2017 at 05:27:21PM +0200, Vitaly Kuznetsov wrote:
> Do you think adding something like
> 
> /*
>  * While x86 architecture in general requires an IPI to perform TLB
>  * shootdown, enablement code for several hypervisors overrides
>  * .flush_tlb_others hook in pv_mmu_ops and implements it by issuing
>  * a hypercall. To keep software pagetable walkers safe in this case we 
>  * switch to RCU based table free (HAVE_RCU_TABLE_FREE). See the comment
>  * below 'ifdef CONFIG_HAVE_RCU_TABLE_FREE' in include/asm-generic/tlb.h
>  * for more details.
>  */
> 
> before __tlb_remove_table would suffice? Or do you see a better place
> for such comment?

Yes, that seems fine. Thanks!
Vitaly Kuznetsov Aug. 25, 2017, 7:10 a.m. UTC | #5
Vitaly Kuznetsov <vkuznets@redhat.com> writes:

> Peter Zijlstra <peterz@infradead.org> writes:
>
>> On Thu, Aug 24, 2017 at 11:22:58AM +0200, Vitaly Kuznetsov wrote:
>>
>>> diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
>>> index c7797307fc2b..d43a7fcafee9 100644
>>> --- a/arch/x86/include/asm/tlb.h
>>> +++ b/arch/x86/include/asm/tlb.h
>>> @@ -15,4 +15,9 @@
>>>  
>>>  #include <asm-generic/tlb.h>
>>>  
>>> +static inline void __tlb_remove_table(void *table)
>>> +{
>>> +	free_page_and_swap_cache(table);
>>> +}
>>
>> Most other archs have this in pgtable.h, only ARM* has it in tlb.h.
>>
>
> Sure, I can move it in v3 if nobody objects.
>

Well, turns out it is going to be a bit tricky. 

free_page_and_swap_cache() is defined in linux/swap.h but we can't just
include it from arch/x86/include/asm/pgtable.h as pgtable.h itself is
included from swap.h:

...
In file included from ./include/linux/mm.h:70:0,
                 from ./include/linux/memcontrol.h:29,
                 from ./include/linux/swap.h:8,
                 from ./include/linux/suspend.h:4,
                 from arch/x86/kernel/asm-offsets.c:12:
./arch/x86/include/asm/pgtable.h: In function ‘__tlb_remove_table’:
./arch/x86/include/asm/pgtable.h:1252:2: error: implicit declaration of function ‘free_page_and_swap_cache’; did you mean ‘file_write_and_wait_range’? [-Werror=implicit-function-declaration]
  free_page_and_swap_cache(table);
  ^~~~~~~~~~~~~~~~~~~~~~~~
...

An easy solution would be to make __tlb_remove_table() a define instead
of inline but personally I'd rather prefer to follow ARM and leave it in
tlb.h.
diff mbox

Patch

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 323cb065be5e..b0bfc27d05a2 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -167,6 +167,7 @@  config X86
 	select HAVE_HARDLOCKUP_DETECTOR_PERF	if PERF_EVENTS && HAVE_PERF_EVENTS_NMI
 	select HAVE_PERF_REGS
 	select HAVE_PERF_USER_STACK_DUMP
+	select HAVE_RCU_TABLE_FREE
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_RELIABLE_STACKTRACE		if X86_64 && FRAME_POINTER && STACK_VALIDATION
 	select HAVE_STACK_VALIDATION		if X86_64
diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
index c7797307fc2b..d43a7fcafee9 100644
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
@@ -15,4 +15,9 @@ 
 
 #include <asm-generic/tlb.h>
 
+static inline void __tlb_remove_table(void *table)
+{
+	free_page_and_swap_cache(table);
+}
+
 #endif /* _ASM_X86_TLB_H */
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 508a708eb9a6..218834a3e9ad 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -56,7 +56,7 @@  void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
 {
 	pgtable_page_dtor(pte);
 	paravirt_release_pte(page_to_pfn(pte));
-	tlb_remove_page(tlb, pte);
+	tlb_remove_table(tlb, pte);
 }
 
 #if CONFIG_PGTABLE_LEVELS > 2
@@ -72,21 +72,21 @@  void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
 	tlb->need_flush_all = 1;
 #endif
 	pgtable_pmd_page_dtor(page);
-	tlb_remove_page(tlb, page);
+	tlb_remove_table(tlb, page);
 }
 
 #if CONFIG_PGTABLE_LEVELS > 3
 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
 {
 	paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
-	tlb_remove_page(tlb, virt_to_page(pud));
+	tlb_remove_table(tlb, virt_to_page(pud));
 }
 
 #if CONFIG_PGTABLE_LEVELS > 4
 void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
 {
 	paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
-	tlb_remove_page(tlb, virt_to_page(p4d));
+	tlb_remove_table(tlb, virt_to_page(p4d));
 }
 #endif	/* CONFIG_PGTABLE_LEVELS > 4 */
 #endif	/* CONFIG_PGTABLE_LEVELS > 3 */