Message ID | 20240108193640.344929-1-alexghiti@rivosinc.com (mailing list archive) |
---|---|
State | Accepted |
Commit | 54d7431af73e2fa53b73cfeb2bec559c6664a4e4 |
Series | [v2] riscv: Add support for BATCHED_UNMAP_TLB_FLUSH |
On Mon, 8 Jan 2024 20:36:40 +0100 Alexandre Ghiti <alexghiti@rivosinc.com> wrote:
> Allow deferring the flushing of the TLB when unmapping pages, which
> reduces the number of IPIs and the number of sfence.vma instructions
> executed.
>
> The ubenchmark used in commit 43b3dfdd0455 ("arm64: support
> batched/deferred tlb shootdown during page reclamation/migration"),
> made multithreaded here to force the use of IPIs, shows a good
> performance improvement on all platforms:
>
> * Unmatched: ~34%
> * TH1520   : ~78%
> * Qemu     : ~81%
>
> In addition, perf on qemu reports a large decrease in the time spent
> dealing with IPIs:
>
> Before: 68.17%  main  [kernel.kallsyms]  [k] __sbi_rfence_v02_call
> After :  8.64%  main  [kernel.kallsyms]  [k] __sbi_rfence_v02_call
>
> * Benchmark:
>
> int stick_this_thread_to_core(int core_id) {
>         int num_cores = sysconf(_SC_NPROCESSORS_ONLN);
>         if (core_id < 0 || core_id >= num_cores)
>                 return EINVAL;
>
>         cpu_set_t cpuset;
>         CPU_ZERO(&cpuset);
>         CPU_SET(core_id, &cpuset);
>
>         pthread_t current_thread = pthread_self();
>         return pthread_setaffinity_np(current_thread,
>                                       sizeof(cpu_set_t), &cpuset);
> }
>
> static void *fn_thread(void *p_data)
> {
>         int ret;
>         pthread_t thread;
>
>         stick_this_thread_to_core((int)p_data);
>
>         while (1) {
>                 sleep(1);
>         }
>
>         return NULL;
> }
>
> int main()
> {
>         volatile unsigned char *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
>                                          MAP_SHARED | MAP_ANONYMOUS, -1, 0);
>         pthread_t threads[4];
>         int ret;
>
>         for (int i = 0; i < 4; ++i) {
>                 ret = pthread_create(&threads[i], NULL, fn_thread, (void *)i);
>                 if (ret)
>                 {
>                         printf("%s", strerror(ret));
>                 }
>         }
>
>         memset(p, 0x88, SIZE);
>
>         for (int k = 0; k < 10000; k++) {
>                 /* swap in */
>                 for (int i = 0; i < SIZE; i += 4096) {
>                         (void)p[i];
>                 }
>
>                 /* swap out */
>                 madvise(p, SIZE, MADV_PAGEOUT);
>         }
>
>         for (int i = 0; i < 4; i++)
>         {
>                 pthread_cancel(threads[i]);
>         }
>
>         for (int i = 0; i < 4; i++)
>         {
>                 pthread_join(threads[i], NULL);
>         }
>
>         return 0;
> }
>
> Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
> Reviewed-by: Jisheng Zhang <jszhang@kernel.org>
> Tested-by: Jisheng Zhang <jszhang@kernel.org> # Tested on TH1520

Before:
real    0m36.674s
user    0m0.173s
sys     0m36.493s

After:
real    0m18.016s
user    0m0.125s
sys     0m17.885s

Tested-by: Nam Cao <namcao@linutronix.de>

Best regards,
Nam
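The benchmark above is quoted as a fragment: the message omits the #include lines and the definition of SIZE. A plausible preamble that makes it build is sketched below; the SIZE value and the build command are assumptions, not taken from the patch or from this thread.

/*
 * Hypothetical preamble for the benchmark quoted above; the original
 * message omits the headers and the SIZE definition, so the value below
 * is an assumption.
 *
 * Example build: gcc -O2 -pthread bench.c -o bench
 * Note: MADV_PAGEOUT only pages the anonymous buffer out when swap is
 * available, so enable swap before timing the run (e.g. with `time`).
 */
#define _GNU_SOURCE		/* CPU_ZERO/CPU_SET, pthread_setaffinity_np */
#include <errno.h>
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>

#define SIZE	(256UL * 1024 * 1024)	/* 256 MiB of anonymous memory; assumed value */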
On Mon, 8 Jan 2024 21:43:46 +0100 Nam Cao <namcao@linutronix.de> wrote:
> On Mon, 8 Jan 2024 20:36:40 +0100 Alexandre Ghiti <alexghiti@rivosinc.com> wrote:
> > Allow deferring the flushing of the TLB when unmapping pages, which
> > reduces the number of IPIs and the number of sfence.vma instructions
> > executed.
> [...]
>
> Before:
> real    0m36.674s
> user    0m0.173s
> sys     0m36.493s
>
> After:
> real    0m18.016s
> user    0m0.125s
> sys     0m17.885s
>
> Tested-by: Nam Cao <namcao@linutronix.de>

I forgot to mention: this is for Starfive's Visionfive 2 board.

Best regards,
Nam
Hello:

This patch was applied to riscv/linux.git (for-next)
by Palmer Dabbelt <palmer@rivosinc.com>:

On Mon, 8 Jan 2024 20:36:40 +0100 you wrote:
> Allow deferring the flushing of the TLB when unmapping pages, which
> reduces the number of IPIs and the number of sfence.vma instructions
> executed.
>
> The ubenchmark used in commit 43b3dfdd0455 ("arm64: support
> batched/deferred tlb shootdown during page reclamation/migration"),
> made multithreaded here to force the use of IPIs, shows a good
> performance improvement on all platforms:
>
> [...]

Here is the summary with links:
  - [v2] riscv: Add support for BATCHED_UNMAP_TLB_FLUSH
    https://git.kernel.org/riscv/c/54d7431af73e

You are awesome, thank you!
diff --git a/Documentation/features/vm/TLB/arch-support.txt b/Documentation/features/vm/TLB/arch-support.txt
index 8fd22073a847..d222bd3ee749 100644
--- a/Documentation/features/vm/TLB/arch-support.txt
+++ b/Documentation/features/vm/TLB/arch-support.txt
@@ -20,7 +20,7 @@
     |       openrisc: |  ..  |
     |         parisc: | TODO |
     |        powerpc: | TODO |
-    |          riscv: | TODO |
+    |          riscv: |  ok  |
     |           s390: | TODO |
     |             sh: | TODO |
     |          sparc: | TODO |
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 060c2a4fa639..49a94a4f2f58 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -53,6 +53,7 @@ config RISCV
 	select ARCH_USE_MEMTEST
 	select ARCH_USE_QUEUED_RWLOCKS
 	select ARCH_USES_CFI_TRAPS if CFI_CLANG
+	select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP && MMU
 	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
 	select ARCH_WANT_FRAME_POINTERS
 	select ARCH_WANT_GENERAL_HUGETLB if !RISCV_ISA_SVNAPOT
diff --git a/arch/riscv/include/asm/tlbbatch.h b/arch/riscv/include/asm/tlbbatch.h
new file mode 100644
index 000000000000..46014f70b9da
--- /dev/null
+++ b/arch/riscv/include/asm/tlbbatch.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2023 Rivos Inc.
+ */
+
+#ifndef _ASM_RISCV_TLBBATCH_H
+#define _ASM_RISCV_TLBBATCH_H
+
+#include <linux/cpumask.h>
+
+struct arch_tlbflush_unmap_batch {
+	struct cpumask cpumask;
+};
+
+#endif /* _ASM_RISCV_TLBBATCH_H */
diff --git a/arch/riscv/include/asm/tlbflush.h b/arch/riscv/include/asm/tlbflush.h
index 8f3418c5f172..9c8a67b1285e 100644
--- a/arch/riscv/include/asm/tlbflush.h
+++ b/arch/riscv/include/asm/tlbflush.h
@@ -46,6 +46,14 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end);
 void flush_pmd_tlb_range(struct vm_area_struct *vma, unsigned long start,
 			unsigned long end);
 #endif
+
+bool arch_tlbbatch_should_defer(struct mm_struct *mm);
+void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
+			       struct mm_struct *mm,
+			       unsigned long uaddr);
+void arch_flush_tlb_batched_pending(struct mm_struct *mm);
+void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
+
 #else /* CONFIG_SMP && CONFIG_MMU */
 
 #define flush_tlb_all() local_flush_tlb_all()
diff --git a/arch/riscv/mm/tlbflush.c b/arch/riscv/mm/tlbflush.c
index e6659d7368b3..f0190f5fdd05 100644
--- a/arch/riscv/mm/tlbflush.c
+++ b/arch/riscv/mm/tlbflush.c
@@ -93,29 +93,23 @@ static void __ipi_flush_tlb_range_asid(void *info)
 	local_flush_tlb_range_asid(d->start, d->size, d->stride, d->asid);
 }
 
-static void __flush_tlb_range(struct mm_struct *mm, unsigned long start,
-			      unsigned long size, unsigned long stride)
+static void __flush_tlb_range(struct cpumask *cmask, unsigned long asid,
+			      unsigned long start, unsigned long size,
+			      unsigned long stride)
 {
 	struct flush_tlb_range_data ftd;
-	const struct cpumask *cmask;
-	unsigned long asid = FLUSH_TLB_NO_ASID;
 	bool broadcast;
 
-	if (mm) {
-		unsigned int cpuid;
+	if (cpumask_empty(cmask))
+		return;
 
-		cmask = mm_cpumask(mm);
-		if (cpumask_empty(cmask))
-			return;
+	if (cmask != cpu_online_mask) {
+		unsigned int cpuid;
 
 		cpuid = get_cpu();
 		/* check if the tlbflush needs to be sent to other CPUs */
 		broadcast = cpumask_any_but(cmask, cpuid) < nr_cpu_ids;
-
-		if (static_branch_unlikely(&use_asid_allocator))
-			asid = atomic_long_read(&mm->context.id) & asid_mask;
 	} else {
-		cmask = cpu_online_mask;
 		broadcast = true;
 	}
 
@@ -135,25 +129,34 @@ static void __flush_tlb_range(struct mm_struct *mm, unsigned long start,
 		local_flush_tlb_range_asid(start, size, stride, asid);
 	}
 
-	if (mm)
+	if (cmask != cpu_online_mask)
 		put_cpu();
 }
 
+static inline unsigned long get_mm_asid(struct mm_struct *mm)
+{
+	return static_branch_unlikely(&use_asid_allocator) ?
+			atomic_long_read(&mm->context.id) & asid_mask : FLUSH_TLB_NO_ASID;
+}
+
 void flush_tlb_mm(struct mm_struct *mm)
 {
-	__flush_tlb_range(mm, 0, FLUSH_TLB_MAX_SIZE, PAGE_SIZE);
+	__flush_tlb_range(mm_cpumask(mm), get_mm_asid(mm),
+			  0, FLUSH_TLB_MAX_SIZE, PAGE_SIZE);
 }
 
 void flush_tlb_mm_range(struct mm_struct *mm,
 			unsigned long start, unsigned long end,
 			unsigned int page_size)
 {
-	__flush_tlb_range(mm, start, end - start, page_size);
+	__flush_tlb_range(mm_cpumask(mm), get_mm_asid(mm),
+			  start, end - start, page_size);
 }
 
 void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr)
 {
-	__flush_tlb_range(vma->vm_mm, addr, PAGE_SIZE, PAGE_SIZE);
+	__flush_tlb_range(mm_cpumask(vma->vm_mm), get_mm_asid(vma->vm_mm),
+			  addr, PAGE_SIZE, PAGE_SIZE);
 }
 
 void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
@@ -185,18 +188,44 @@ void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
 		}
 	}
 
-	__flush_tlb_range(vma->vm_mm, start, end - start, stride_size);
+	__flush_tlb_range(mm_cpumask(vma->vm_mm), get_mm_asid(vma->vm_mm),
+			  start, end - start, stride_size);
 }
 
 void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 {
-	__flush_tlb_range(NULL, start, end - start, PAGE_SIZE);
+	__flush_tlb_range((struct cpumask *)cpu_online_mask, FLUSH_TLB_NO_ASID,
+			  start, end - start, PAGE_SIZE);
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 void flush_pmd_tlb_range(struct vm_area_struct *vma, unsigned long start,
 			unsigned long end)
 {
-	__flush_tlb_range(vma->vm_mm, start, end - start, PMD_SIZE);
+	__flush_tlb_range(mm_cpumask(vma->vm_mm), get_mm_asid(vma->vm_mm),
+			  start, end - start, PMD_SIZE);
 }
 #endif
+
+bool arch_tlbbatch_should_defer(struct mm_struct *mm)
+{
+	return true;
+}
+
+void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
+			       struct mm_struct *mm,
+			       unsigned long uaddr)
+{
+	cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
+}
+
+void arch_flush_tlb_batched_pending(struct mm_struct *mm)
+{
+	flush_tlb_mm(mm);
+}
+
+void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
+{
+	__flush_tlb_range(&batch->cpumask, FLUSH_TLB_NO_ASID, 0,
+			  FLUSH_TLB_MAX_SIZE, PAGE_SIZE);
+}
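To make the new hooks concrete, below is an illustrative sketch (not part of the patch) of the call sequence the generic unmap path performs with them: harts are only accumulated into a cpumask while pages are unmapped, and the expensive remote fence happens once at the end. In the kernel proper this logic lives in mm/rmap.c behind ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH and the batch is kept per task, so the standalone unmap_pages_batched() helper below is hypothetical. arch_flush_tlb_batched_pending() is the hook core mm calls when it must complete any still-pending deferred flushes for an mm before touching its page tables; with this patch it simply falls back to flush_tlb_mm().

#include <linux/mm.h>
#include <asm/tlbbatch.h>
#include <asm/tlbflush.h>

/* Hypothetical helper showing how the batch API added above fits together. */
static void unmap_pages_batched(struct mm_struct *mm,
				unsigned long *uaddrs, int nr)
{
	/* In the kernel proper the batch lives in the reclaiming task, not on the stack. */
	struct arch_tlbflush_unmap_batch batch = { };
	int i;

	/* Ask the architecture whether deferring is worthwhile; riscv always says yes. */
	if (!arch_tlbbatch_should_defer(mm))
		return;		/* the caller would flush page by page instead */

	for (i = 0; i < nr; i++) {
		/* ... the PTE for uaddrs[i] is cleared here ... */

		/*
		 * Only record which harts may still hold stale translations
		 * (mm_cpumask(mm) is OR-ed into the batch); no sfence.vma or
		 * IPI is issued at this point.
		 */
		arch_tlbbatch_add_pending(&batch, mm, uaddrs[i]);
	}

	/*
	 * One full flush of the accumulated cpumask: a single IPI or SBI
	 * remote fence per hart instead of one per unmapped page.
	 */
	arch_tlbbatch_flush(&batch);
}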