Message ID | 20200414112835.1121-3-yezhenyu2@huawei.com
---|---
State | New, archived
Series | arm64: tlb: add support for TLBI RANGE instructions
Hi Zhenyu, On Tue, Apr 14, 2020 at 07:28:35PM +0800, Zhenyu Ye wrote: > diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h > index b76df828e6b7..3a1816770bd1 100644 > --- a/arch/arm64/include/asm/tlb.h > +++ b/arch/arm64/include/asm/tlb.h > @@ -38,7 +38,12 @@ static inline void tlb_flush(struct mmu_gather *tlb) > return; > } > > - __flush_tlb_range(&vma, tlb->start, tlb->end, stride, last_level); > + if (cpus_have_const_cap(ARM64_HAS_TLBI_RANGE)) > + __flush_tlb_range_directly(&vma, tlb->start, tlb->end, > + stride, last_level); > + else > + __flush_tlb_range(&vma, tlb->start, tlb->end, > + stride, last_level); I think you could move such check in __flush_tlb_range() and avoid cpus_have_const_cap() in two places. More on this below. > diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h > index bc3949064725..a482188ea563 100644 > --- a/arch/arm64/include/asm/tlbflush.h > +++ b/arch/arm64/include/asm/tlbflush.h > @@ -59,6 +59,44 @@ > __ta; \ > }) > > +/* > + * This macro creates a properly formatted VA operand for the TLBI RANGE. > + * The value bit assignments are: > + * > + * +----------+------+-------+-------+-------+----------------------+ > + * | ASID | TG | SCALE | NUM | TTL | BADDR | > + * +-----------------+-------+-------+-------+----------------------+ > + * |63 48|47 46|45 44|43 39|38 37|36 0| > + * > + * The address range is determined by below formula: > + * [BADDR, BADDR + (NUM + 1) * 2^(5*SCALE + 1) * PAGESIZE) > + * > + */ > +#define __TLBI_VADDR_RANGE(addr, asid, tg, scale, num, ttl) \ > + ({ \ > + unsigned long __ta = (addr) >> PAGE_SHIFT; \ > + __ta &= GENMASK_ULL(36, 0); \ > + __ta |= (unsigned long)(ttl) << 37; \ > + __ta |= (unsigned long)(num) << 39; \ > + __ta |= (unsigned long)(scale) << 44; \ > + __ta |= (unsigned long)(tg) << 46; \ > + __ta |= (unsigned long)(asid) << 48; \ > + __ta; \ > + }) > + > +#define TLB_RANGE_MASK_SHIFT 5 > +#define TLB_RANGE_MASK GENMASK_ULL(TLB_RANGE_MASK_SHIFT - 1, 0) > + > +/* > + * __TG defines translation granule of the system, which is defined by > + * PAGE_SHIFT. Used by TTL. > + * - 4KB : 1 > + * - 16KB : 2 > + * - 64KB : 3 > + */ > +#define __TG ((PAGE_SHIFT - 12) / 2 + 1) I don't think we need __TLBI_VADDR_RANGE to take a tg argument since it's always the same. > + > + > /* > * TLB Invalidation > * ================ > @@ -171,12 +209,83 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, > dsb(ish); > } > > +/* The maximum range size of one TLBI-RANGE instruction */ > +#define MAX_TLBI_RANGE_SIZE (1UL << 21) Nitpick: call this MAX_TLBI_RANGE_PAGES as that's not an address range. It may be useful to have a macro for the range here, something like: #define __TLBI_PAGES(num, scale) ((num + 1) << (5 * scale + 1)) and define MAX_TLBI_RANGE_PAGES in terms of this macro as __TLBI_PAGES(31, 3). > + > +/* > + * This interface uses the *rvale1is* instruction to flush TLBs > + * in [start, end) directly. > + * This instruction is supported from ARM v8.4. 
> + */ > +static inline void __flush_tlb_range_directly(struct vm_area_struct *vma, > + unsigned long start, unsigned long end, > + unsigned long stride, bool last_level) > +{ > + int num = 0; > + int scale = 0; > + unsigned long asid = ASID(vma->vm_mm); > + unsigned long addr = 0; > + unsigned long range_size; > + > + start = round_down(start, stride); > + end = round_up(end, stride); > + range_size = (end - start) >> PAGE_SHIFT; > + > + if (range_size > MAX_TLBI_RANGE_SIZE) { > + flush_tlb_mm(vma->vm_mm); > + return; > + } > + > + dsb(ishst); > + > + /* > + * The minimum size of TLB RANGE is 2 PAGE; > + * Use normal TLB instruction to handle odd PAGEs Nitpick: no need to capitalise PAGE. > + */ > + if (range_size % 2 == 1) { > + addr = __TLBI_VADDR(start, asid); > + if (last_level) { > + __tlbi(vale1is, addr); > + __tlbi_user(vale1is, addr); > + } else { > + __tlbi(vae1is, addr); > + __tlbi_user(vae1is, addr); > + } > + start += 1 << PAGE_SHIFT; > + range_size -= 1; > + } > + > + range_size >>= 1; > + while (range_size > 0) { > + num = (range_size & TLB_RANGE_MASK) - 1; > + if (num >= 0) { > + addr = __TLBI_VADDR_RANGE(start, asid, __TG, > + scale, num, 0); > + if (last_level) { > + __tlbi(rvale1is, addr); > + __tlbi_user(rvale1is, addr); > + } else { > + __tlbi(rvae1is, addr); > + __tlbi_user(rvae1is, addr); > + } > + start += (num + 1) << (5 * scale + 1) << PAGE_SHIFT; You could use the __TLBI_PAGES macro I proposed above. > + } > + scale++; > + range_size >>= TLB_RANGE_MASK_SHIFT; > + } So, you start from scale 0 and increment it until you reach the maximum. I think (haven't done the maths on paper) you could also start from the top with something like scale = ilog2(range_size) / 5. Not sure it's significantly better though, maybe avoiding the loop 3 times if your range is 2MB (which happens with huge pages). Anyway, I think it would be more efficient if we combine the __flush_tlb_range() and the _directly one into the same function with a single loop for both. For example, if the stride is 2MB already, we can handle this with a single classic TLBI without all the calculations for the range operation. The hardware may also handle this better since the software already told it there can be only one entry in that 2MB range. So each loop iteration could figure which operation to use based on cpucaps, TLBI range ops, stride and reduce range_size accordingly.
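For reference, a quick userspace check (not kernel code, and not part of the patch) of the page-count arithmetic behind the __TLBI_PAGES(num, scale) macro Catalin proposes above; the extra parentheses and the 1UL are additions made only for this sketch:

#include <stdio.h>

#define __TLBI_PAGES(num, scale)        (((num) + 1UL) << (5 * (scale) + 1))
#define MAX_TLBI_RANGE_PAGES            __TLBI_PAGES(31, 3)

int main(void)
{
        /* Pages covered by a single TLBI RANGE operation for each scale. */
        for (int scale = 0; scale <= 3; scale++)
                printf("scale %d: %lu .. %lu pages per operation\n", scale,
                       __TLBI_PAGES(0, scale), __TLBI_PAGES(31, scale));

        /* __TLBI_PAGES(31, 3) == 2^21, matching MAX_TLBI_RANGE_SIZE (1UL << 21). */
        printf("MAX_TLBI_RANGE_PAGES = %lu (1UL << 21 = %lu)\n",
               MAX_TLBI_RANGE_PAGES, 1UL << 21);
        return 0;
}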
Hi Catalin, Thanks for your review. On 2020/5/14 23:28, Catalin Marinas wrote: > Hi Zhenyu, > > On Tue, Apr 14, 2020 at 07:28:35PM +0800, Zhenyu Ye wrote: >> diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h >> index b76df828e6b7..3a1816770bd1 100644 >> --- a/arch/arm64/include/asm/tlb.h >> +++ b/arch/arm64/include/asm/tlb.h >> @@ -38,7 +38,12 @@ static inline void tlb_flush(struct mmu_gather *tlb) >> return; >> } >> >> - __flush_tlb_range(&vma, tlb->start, tlb->end, stride, last_level); >> + if (cpus_have_const_cap(ARM64_HAS_TLBI_RANGE)) >> + __flush_tlb_range_directly(&vma, tlb->start, tlb->end, >> + stride, last_level); >> + else >> + __flush_tlb_range(&vma, tlb->start, tlb->end, >> + stride, last_level); > > I think you could move such check in __flush_tlb_range() and avoid > cpus_have_const_cap() in two places. More on this below. > Then we must mix the __flush_tlb_range() and the _directly one together. I'm worried this will make the code very complicated. See the end for details. >> diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h >> index bc3949064725..a482188ea563 100644 >> --- a/arch/arm64/include/asm/tlbflush.h >> +++ b/arch/arm64/include/asm/tlbflush.h >> @@ -59,6 +59,44 @@ >> __ta; \ >> }) >> >> +/* >> + * This macro creates a properly formatted VA operand for the TLBI RANGE. >> + * The value bit assignments are: >> + * >> + * +----------+------+-------+-------+-------+----------------------+ >> + * | ASID | TG | SCALE | NUM | TTL | BADDR | >> + * +-----------------+-------+-------+-------+----------------------+ >> + * |63 48|47 46|45 44|43 39|38 37|36 0| >> + * >> + * The address range is determined by below formula: >> + * [BADDR, BADDR + (NUM + 1) * 2^(5*SCALE + 1) * PAGESIZE) >> + * >> + */ >> +#define __TLBI_VADDR_RANGE(addr, asid, tg, scale, num, ttl) \ >> + ({ \ >> + unsigned long __ta = (addr) >> PAGE_SHIFT; \ >> + __ta &= GENMASK_ULL(36, 0); \ >> + __ta |= (unsigned long)(ttl) << 37; \ >> + __ta |= (unsigned long)(num) << 39; \ >> + __ta |= (unsigned long)(scale) << 44; \ >> + __ta |= (unsigned long)(tg) << 46; \ >> + __ta |= (unsigned long)(asid) << 48; \ >> + __ta; \ >> + }) >> + >> +#define TLB_RANGE_MASK_SHIFT 5 >> +#define TLB_RANGE_MASK GENMASK_ULL(TLB_RANGE_MASK_SHIFT - 1, 0) >> + >> +/* >> + * __TG defines translation granule of the system, which is defined by >> + * PAGE_SHIFT. Used by TTL. >> + * - 4KB : 1 >> + * - 16KB : 2 >> + * - 64KB : 3 >> + */ >> +#define __TG ((PAGE_SHIFT - 12) / 2 + 1) > > I don't think we need __TLBI_VADDR_RANGE to take a tg argument since > it's always the same. > OK. >> + >> + >> /* >> * TLB Invalidation >> * ================ >> @@ -171,12 +209,83 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, >> dsb(ish); >> } >> >> +/* The maximum range size of one TLBI-RANGE instruction */ >> +#define MAX_TLBI_RANGE_SIZE (1UL << 21) > > Nitpick: call this MAX_TLBI_RANGE_PAGES as that's not an address range. > > It may be useful to have a macro for the range here, something like: > > #define __TLBI_PAGES(num, scale) ((num + 1) << (5 * scale + 1)) > > and define MAX_TLBI_RANGE_PAGES in terms of this macro as > __TLBI_PAGES(31, 3). > OK, thanks for your great suggestion. >> + >> +/* >> + * This interface uses the *rvale1is* instruction to flush TLBs >> + * in [start, end) directly. >> + * This instruction is supported from ARM v8.4. 
>> + */ >> +static inline void __flush_tlb_range_directly(struct vm_area_struct *vma, >> + unsigned long start, unsigned long end, >> + unsigned long stride, bool last_level) >> +{ >> + int num = 0; >> + int scale = 0; >> + unsigned long asid = ASID(vma->vm_mm); >> + unsigned long addr = 0; >> + unsigned long range_size; >> + >> + start = round_down(start, stride); >> + end = round_up(end, stride); >> + range_size = (end - start) >> PAGE_SHIFT; >> + >> + if (range_size > MAX_TLBI_RANGE_SIZE) { >> + flush_tlb_mm(vma->vm_mm); >> + return; >> + } >> + >> + dsb(ishst); >> + >> + /* >> + * The minimum size of TLB RANGE is 2 PAGE; >> + * Use normal TLB instruction to handle odd PAGEs > > Nitpick: no need to capitalise PAGE. > OK. >> + */ >> + if (range_size % 2 == 1) { >> + addr = __TLBI_VADDR(start, asid); >> + if (last_level) { >> + __tlbi(vale1is, addr); >> + __tlbi_user(vale1is, addr); >> + } else { >> + __tlbi(vae1is, addr); >> + __tlbi_user(vae1is, addr); >> + } >> + start += 1 << PAGE_SHIFT; >> + range_size -= 1; >> + } >> + >> + range_size >>= 1; >> + while (range_size > 0) { >> + num = (range_size & TLB_RANGE_MASK) - 1; >> + if (num >= 0) { >> + addr = __TLBI_VADDR_RANGE(start, asid, __TG, >> + scale, num, 0); >> + if (last_level) { >> + __tlbi(rvale1is, addr); >> + __tlbi_user(rvale1is, addr); >> + } else { >> + __tlbi(rvae1is, addr); >> + __tlbi_user(rvae1is, addr); >> + } >> + start += (num + 1) << (5 * scale + 1) << PAGE_SHIFT; > > You could use the __TLBI_PAGES macro I proposed above. > OK. >> + } >> + scale++; >> + range_size >>= TLB_RANGE_MASK_SHIFT; >> + } > > So, you start from scale 0 and increment it until you reach the maximum. > I think (haven't done the maths on paper) you could also start from the > top with something like scale = ilog2(range_size) / 5. Not sure it's > significantly better though, maybe avoiding the loop 3 times if your > range is 2MB (which happens with huge pages). > This optimization is only effective when the range is a multiple of 256KB (when the page size is 4KB), and I'm worried about the performance of ilog2(). I traced the __flush_tlb_range() last year and found that in most cases the range is less than 256K (see details in [1]). I will test the performance of your suggestion and then reply you again here. > Anyway, I think it would be more efficient if we combine the > __flush_tlb_range() and the _directly one into the same function with a > single loop for both. For example, if the stride is 2MB already, we can > handle this with a single classic TLBI without all the calculations for > the range operation. The hardware may also handle this better since the > software already told it there can be only one entry in that 2MB range. > So each loop iteration could figure which operation to use based on > cpucaps, TLBI range ops, stride and reduce range_size accordingly. > Summarize your suggestion in one sentence: use 'stride' to optimize the preformance of TLBI. This can also be done by dividing into two functions, and this should indeed be taken into account in the TLBI RANGE feature. But if we figure which operation to use based on cpucaps in each loop iteration, then cpus_have_const_cap() will be called frequently, which may affect performance of TLBI. In my opinion, we should do as few judgments as possible in the loop, so judge the cpucaps outside the loop maybe a good choice. [1] https://lkml.org/lkml/2019/11/11/593 Thanks, Zhenyu
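To visualise the loop being discussed, here is a userspace model of just the scale/num arithmetic in __flush_tlb_range_directly() (no real TLB maintenance is performed; purely illustrative):

#include <stdio.h>

#define TLB_RANGE_MASK_SHIFT    5
#define TLB_RANGE_MASK          ((1UL << TLB_RANGE_MASK_SHIFT) - 1)

/* Print how a flush of 'pages' pages decomposes, scanning from scale 0 up. */
static void decompose(unsigned long pages)
{
        int scale = 0;

        printf("%4lu pages:", pages);
        if (pages & 1) {                /* odd page handled by a classic TLBI */
                printf(" 1 classic TLBI +");
                pages--;
        }
        pages >>= 1;
        while (pages > 0) {
                long num = (long)(pages & TLB_RANGE_MASK) - 1;

                /* one range op covers (num + 1) * 2^(5*scale + 1) pages */
                if (num >= 0)
                        printf(" [scale=%d num=%ld]", scale, num);
                scale++;
                pages >>= TLB_RANGE_MASK_SHIFT;
        }
        printf("\n");
}

int main(void)
{
        decompose(7);   /* 1 classic TLBI + one 6-page range op at scale 0 */
        decompose(512); /* 2MB with 4KB pages: scale 0 is skipped, one op at scale 1 */
        decompose(100); /* not a multiple of 64 pages, so scale 0 is still needed */
        return 0;
}

This also matches the observation above that starting from a higher scale only skips iterations when the page count is a multiple of 64 pages (256KB with 4KB pages).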
On Mon, May 18, 2020 at 08:21:02PM +0800, Zhenyu Ye wrote: > On 2020/5/14 23:28, Catalin Marinas wrote: > > On Tue, Apr 14, 2020 at 07:28:35PM +0800, Zhenyu Ye wrote: > >> + } > >> + scale++; > >> + range_size >>= TLB_RANGE_MASK_SHIFT; > >> + } > > > > So, you start from scale 0 and increment it until you reach the maximum. > > I think (haven't done the maths on paper) you could also start from the > > top with something like scale = ilog2(range_size) / 5. Not sure it's > > significantly better though, maybe avoiding the loop 3 times if your > > range is 2MB (which happens with huge pages). > > This optimization is only effective when the range is a multiple of 256KB > (when the page size is 4KB), and I'm worried about the performance > of ilog2(). I traced the __flush_tlb_range() last year and found that in > most cases the range is less than 256K (see details in [1]). THP or hugetlbfs would exercise bigger strides but I guess it depends on the use-case. ilog2() should be reduced to a few instructions on arm64 AFAICT (haven't tried but it should use the CLZ instruction). > > Anyway, I think it would be more efficient if we combine the > > __flush_tlb_range() and the _directly one into the same function with a > > single loop for both. For example, if the stride is 2MB already, we can > > handle this with a single classic TLBI without all the calculations for > > the range operation. The hardware may also handle this better since the > > software already told it there can be only one entry in that 2MB range. > > So each loop iteration could figure which operation to use based on > > cpucaps, TLBI range ops, stride and reduce range_size accordingly. > > Summarize your suggestion in one sentence: use 'stride' to optimize the > preformance of TLBI. This can also be done by dividing into two functions, > and this should indeed be taken into account in the TLBI RANGE feature. > > But if we figure which operation to use based on cpucaps in each loop > iteration, then cpus_have_const_cap() will be called frequently, which > may affect performance of TLBI. In my opinion, we should do as few > judgments as possible in the loop, so judge the cpucaps outside the > loop maybe a good choice. cpus_have_const_cap() is a static label, so should be patched with a branch or nop. My point was that in the classic __flush_tlb_range() loop, instead of an addr += stride we could have something more dynamic depending on whether the CPU supports range TLBI ops or not. But we would indeed have more (static) branches in the loop, so possibly some performance degradation. If the code looks ok, I'd favour this and we can look at the optimisation later. But I can't really tell how the code would look without attempting to merge the two. Anyway, a first step would be to to add the the range and stride to the decision (i.e. (end-start)/stride > 1) before jumping to the range operations. You can avoid the additional checks in the new TLBI functions since we know we have at least two (huge)pages.
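A rough userspace sketch of the first-step decision Catalin describes in his last paragraph; the enum and helper name here are invented for illustration and are not kernel API:

#include <stdbool.h>
#include <stdio.h>

enum flush_path { CLASSIC_LOOP, SINGLE_CLASSIC, RANGE_OPS };

static enum flush_path pick_path(unsigned long start, unsigned long end,
                                 unsigned long stride, bool has_range_tlbi)
{
        unsigned long entries = (end - start) / stride;

        if (!has_range_tlbi)
                return CLASSIC_LOOP;    /* pre-ARMv8.4-TLBI behaviour */
        if (entries <= 1)
                return SINGLE_CLASSIC;  /* one entry: a classic TLBI is enough */
        return RANGE_OPS;               /* at least two (huge)pages: use the range ops */
}

int main(void)
{
        const char *names[] = { "classic loop", "single classic TLBI", "range ops" };

        /* 2MB flush with a 2MB stride (one huge page) vs a 4KB stride. */
        printf("%s\n", names[pick_path(0, 0x200000, 0x200000, true)]);
        printf("%s\n", names[pick_path(0, 0x200000, 0x1000, true)]);
        printf("%s\n", names[pick_path(0, 0x200000, 0x1000, false)]);
        return 0;
}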
Hi Catalin, I have sent the v4 of this series [1] and combine the two function with a single loop. See codes for details. [1] https://lore.kernel.org/linux-arm-kernel/20200601144713.2222-1-yezhenyu2@huawei.com/ On 2020/5/21 1:08, Catalin Marinas wrote: >> This optimization is only effective when the range is a multiple of 256KB >> (when the page size is 4KB), and I'm worried about the performance >> of ilog2(). I traced the __flush_tlb_range() last year and found that in >> most cases the range is less than 256K (see details in [1]). > > THP or hugetlbfs would exercise bigger strides but I guess it depends on > the use-case. ilog2() should be reduced to a few instructions on arm64 > AFAICT (haven't tried but it should use the CLZ instruction). > Not bigger than 256K, but the range must be a integer multiple of 256KB, so I still start from scale 0. Thanks, Zhenyu
Hi Catalin, On 2020/5/18 20:21, Zhenyu Ye wrote: > I will test the performance of your suggestion and then reply you again > here. > I have sent the v4 of this series [1], and compared the performance of these two different implement. The test code is in the attachment (directly call the __flush_tlb_range()). First, I tested the v4 on a machine whose cpus do not support tlb range. Fortunately, the newly added judgment in loop has very little effect on performance. When page nums are 256 (loop 256 times), the impact is less than 0.5%: [page num] [before change] [v4 change] 1 1457 1491 2 1911 1957 3 2382 2377 4 2827 2852 5 3282 3349 6 3763 3781 7 4295 4252 8 4716 4716 9 5186 5218 10 5618 5648 16 8427 8454 32 15938 15951 64 30890 30977 128 60802 60863 256 120826 121395 512 1508 1555 Then I tested them on a FPGA machine whose cpus support the tlb range feature (this machine is not the same as above). Below is the test data when the stride = PTE: [page num] [before change] [v3 change] [v4 change] 1 16051 15094 13524 2 11366 11270 11146 3 11582 11536 12171 4 11694 11199 11101 5 12138 11506 12267 6 12290 11214 11105 7 12400 11448 12002 8 12837 11225 11097 9 14791 11529 12140 10 15461 11218 11087 16 18233 11192 11094 32 26983 11224 11079 64 43840 11237 11092 128 77754 11247 11098 256 145514 11223 11089 512 280932 11197 11111 We can see the v3 and v4 are very similar in this scene, and both of them performance improved very much compared to current implementation. When the page nums are 256, the performance is improved by more than 10 times. And the TLBI RANGE instruction cost less time than classic TLBI in all secenes on this machine, even if the page num is small. (but this may be different on different machines) Everything performs will util now, but I added a new judgment of stride in the v4: if (cpus_have_const_cap(ARM64_HAS_TLBI_RANGE) && stride == PAGE_SIZE) use tlbi range here... So when the stride != PTE, then there will use the classic tlbi instruction and flush the tlbs one by one, where the performance becomes worse than v3: [page num] [before change] [v3 change] [v4 change] 1 14047 11332 11611 2 11568 11255 11701 3 11664 11231 11759 4 12097 11204 12173 5 12229 11236 12374 6 12399 11203 12497 7 12802 11266 12914 8 14764 17098 14907 9 15370 17106 15551 10 16130 17103 16137 16 19029 17175 19194 32 27300 17097 27604 64 44172 17075 44609 128 77878 17176 78548 256 145185 12022 146063 512 279822 12029 279922 And as we can see, "handle the 2MB with a single classic TLBI" costs the same time as a single TLBI RANGE instruction. So should I remove the judgment of stride and only figure which to use based on cpucaps in the loop? But if removes the judgment, the logic will be the same as v3.(both of them only judge cpucaps) Waiting for your suggestions... 
Thanks,
Zhenyu

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <asm/tlb.h>
#include <linux/time.h>
#include <asm/current.h>
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/mm.h>

#define TESTTIMES 10000

void testRangePerf(void);

static int __init test_init(void)
{
        printk("BEGIN TEST\n");
        testRangePerf();
        printk("END TEST\n");
        return 0;
}

static void __exit test_exit(void)
{
        return;
}

void testRangePerf(void)
{
        int i, j;
        struct timespec64 start, end;
        struct task_struct *ts;
        struct vm_area_struct *vma;

        printk("BEGIN testRangePerf\n");

        ts = current;
        vma = ts->mm->mmap;
        printk("vma->start: %lx, vma->end: %lx, ttl = 0, PAGE_SIZE = 0x%lx\n",
               vma->vm_start, vma->vm_end, PAGE_SIZE);

        for (i = 1; i <= 10; i++) {
                ktime_get_ts64(&start);
                for (j = 0; j < TESTTIMES; j++) {
                        __flush_tlb_range(vma, vma->vm_start,
                                          vma->vm_start + PAGE_SIZE * i,
                                          PAGE_SIZE, false);
                }
                ktime_get_ts64(&end);
                printk("test __flush_tlb_range with %04d pages, used time: %12lld ns\n",
                       i, ((end.tv_sec - start.tv_sec) * 1000000000 +
                           end.tv_nsec - start.tv_nsec) / TESTTIMES);
                msleep(100);
        }

        for (i = 16; i <= 512; i += i) {
                ktime_get_ts64(&start);
                for (j = 0; j < TESTTIMES; j++) {
                        __flush_tlb_range(vma, vma->vm_start,
                                          vma->vm_start + PAGE_SIZE * i,
                                          PAGE_SIZE, false);
                }
                ktime_get_ts64(&end);
                printk("test __flush_tlb_range with %04d pages, used time: %12lld ns\n",
                       i, ((end.tv_sec - start.tv_sec) * 1000000000 +
                           end.tv_nsec - start.tv_nsec) / TESTTIMES);
                msleep(100);
        }

        printk("vma->start: %lx, vma->end: %lx, ttl = 0, PAGE_SIZE = 0x%lx\n",
               vma->vm_start, vma->vm_end, PMD_SIZE);

        for (i = 1; i <= 10; i++) {
                ktime_get_ts64(&start);
                for (j = 0; j < TESTTIMES; j++) {
                        __flush_tlb_range(vma, vma->vm_start,
                                          vma->vm_start + PMD_SIZE * i,
                                          PMD_SIZE, false);
                }
                ktime_get_ts64(&end);
                printk("test __flush_tlb_range with %04d pages, used time: %12lld ns\n",
                       i, ((end.tv_sec - start.tv_sec) * 1000000000 +
                           end.tv_nsec - start.tv_nsec) / TESTTIMES);
                msleep(100);
        }

        for (i = 16; i <= 512; i += i) {
                ktime_get_ts64(&start);
                for (j = 0; j < TESTTIMES; j++) {
                        __flush_tlb_range(vma, vma->vm_start,
                                          vma->vm_start + PMD_SIZE * i,
                                          PMD_SIZE, false);
                }
                ktime_get_ts64(&end);
                printk("test __flush_tlb_range with %04d pages, used time: %12lld ns\n",
                       i, ((end.tv_sec - start.tv_sec) * 1000000000 +
                           end.tv_nsec - start.tv_nsec) / TESTTIMES);
                msleep(100);
        }
}

module_init(test_init)
module_exit(test_exit)

MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Eillon");
MODULE_DESCRIPTION("do TTL test");
MODULE_VERSION("1.0");
diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h index b76df828e6b7..3a1816770bd1 100644 --- a/arch/arm64/include/asm/tlb.h +++ b/arch/arm64/include/asm/tlb.h @@ -38,7 +38,12 @@ static inline void tlb_flush(struct mmu_gather *tlb) return; } - __flush_tlb_range(&vma, tlb->start, tlb->end, stride, last_level); + if (cpus_have_const_cap(ARM64_HAS_TLBI_RANGE)) + __flush_tlb_range_directly(&vma, tlb->start, tlb->end, + stride, last_level); + else + __flush_tlb_range(&vma, tlb->start, tlb->end, + stride, last_level); } static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index bc3949064725..a482188ea563 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -59,6 +59,44 @@ __ta; \ }) +/* + * This macro creates a properly formatted VA operand for the TLBI RANGE. + * The value bit assignments are: + * + * +----------+------+-------+-------+-------+----------------------+ + * | ASID | TG | SCALE | NUM | TTL | BADDR | + * +-----------------+-------+-------+-------+----------------------+ + * |63 48|47 46|45 44|43 39|38 37|36 0| + * + * The address range is determined by below formula: + * [BADDR, BADDR + (NUM + 1) * 2^(5*SCALE + 1) * PAGESIZE) + * + */ +#define __TLBI_VADDR_RANGE(addr, asid, tg, scale, num, ttl) \ + ({ \ + unsigned long __ta = (addr) >> PAGE_SHIFT; \ + __ta &= GENMASK_ULL(36, 0); \ + __ta |= (unsigned long)(ttl) << 37; \ + __ta |= (unsigned long)(num) << 39; \ + __ta |= (unsigned long)(scale) << 44; \ + __ta |= (unsigned long)(tg) << 46; \ + __ta |= (unsigned long)(asid) << 48; \ + __ta; \ + }) + +#define TLB_RANGE_MASK_SHIFT 5 +#define TLB_RANGE_MASK GENMASK_ULL(TLB_RANGE_MASK_SHIFT - 1, 0) + +/* + * __TG defines translation granule of the system, which is defined by + * PAGE_SHIFT. Used by TTL. + * - 4KB : 1 + * - 16KB : 2 + * - 64KB : 3 + */ +#define __TG ((PAGE_SHIFT - 12) / 2 + 1) + + /* * TLB Invalidation * ================ @@ -171,12 +209,83 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, dsb(ish); } +/* The maximum range size of one TLBI-RANGE instruction */ +#define MAX_TLBI_RANGE_SIZE (1UL << 21) + +/* + * This interface uses the *rvale1is* instruction to flush TLBs + * in [start, end) directly. + * This instruction is supported from ARM v8.4. 
+ */ +static inline void __flush_tlb_range_directly(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + unsigned long stride, bool last_level) +{ + int num = 0; + int scale = 0; + unsigned long asid = ASID(vma->vm_mm); + unsigned long addr = 0; + unsigned long range_size; + + start = round_down(start, stride); + end = round_up(end, stride); + range_size = (end - start) >> PAGE_SHIFT; + + if (range_size > MAX_TLBI_RANGE_SIZE) { + flush_tlb_mm(vma->vm_mm); + return; + } + + dsb(ishst); + + /* + * The minimum size of TLB RANGE is 2 PAGE; + * Use normal TLB instruction to handle odd PAGEs + */ + if (range_size % 2 == 1) { + addr = __TLBI_VADDR(start, asid); + if (last_level) { + __tlbi(vale1is, addr); + __tlbi_user(vale1is, addr); + } else { + __tlbi(vae1is, addr); + __tlbi_user(vae1is, addr); + } + start += 1 << PAGE_SHIFT; + range_size -= 1; + } + + range_size >>= 1; + while (range_size > 0) { + num = (range_size & TLB_RANGE_MASK) - 1; + if (num >= 0) { + addr = __TLBI_VADDR_RANGE(start, asid, __TG, + scale, num, 0); + if (last_level) { + __tlbi(rvale1is, addr); + __tlbi_user(rvale1is, addr); + } else { + __tlbi(rvae1is, addr); + __tlbi_user(rvae1is, addr); + } + start += (num + 1) << (5 * scale + 1) << PAGE_SHIFT; + } + scale++; + range_size >>= TLB_RANGE_MASK_SHIFT; + } + dsb(ish); +} + /* * This is meant to avoid soft lock-ups on large TLB flushing ranges and not * necessarily a performance improvement. */ #define MAX_TLBI_OPS PTRS_PER_PTE +/* + * This interface uses the *vae1is* instruction to flush TLBs + * in [start, end) one by one. + */ static inline void __flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long stride, bool last_level) @@ -218,7 +327,10 @@ static inline void flush_tlb_range(struct vm_area_struct *vma, * We cannot use leaf-only invalidation here, since we may be invalidating * table entries as part of collapsing hugepages or moving page tables. */ - __flush_tlb_range(vma, start, end, PAGE_SIZE, false); + if (cpus_have_const_cap(ARM64_HAS_TLBI_RANGE)) + __flush_tlb_range_directly(vma, start, end, PAGE_SIZE, false); + else + __flush_tlb_range(vma, start, end, PAGE_SIZE, false); } static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end)
Add __TLBI_VADDR_RANGE macro and __flush_tlb_range_directly() interface.

Signed-off-by: Zhenyu Ye <yezhenyu2@huawei.com>
---
 arch/arm64/include/asm/tlb.h      |   7 +-
 arch/arm64/include/asm/tlbflush.h | 114 +++++++++++++++++++++++++++++-
 2 files changed, 119 insertions(+), 2 deletions(-)
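As a standalone illustration of the operand layout documented in this patch, a userspace sanity check of __TLBI_VADDR_RANGE (4KB granule assumed, so TG = 1; GENMASK_ULL is re-defined locally and the statement-expression macro needs GCC/Clang):

#include <stdio.h>

#define PAGE_SHIFT      12
#define GENMASK_ULL(h, l) \
        (((~0ULL) << (l)) & (~0ULL >> (63 - (h))))

#define __TLBI_VADDR_RANGE(addr, asid, tg, scale, num, ttl)             \
        ({                                                              \
                unsigned long long __ta = (addr) >> PAGE_SHIFT;         \
                __ta &= GENMASK_ULL(36, 0);                             \
                __ta |= (unsigned long long)(ttl) << 37;                \
                __ta |= (unsigned long long)(num) << 39;                \
                __ta |= (unsigned long long)(scale) << 44;              \
                __ta |= (unsigned long long)(tg) << 46;                 \
                __ta |= (unsigned long long)(asid) << 48;               \
                __ta;                                                   \
        })

int main(void)
{
        /* ASID 42, scale 1, num 7: covers (7+1) * 2^6 = 512 pages (2MB with 4KB pages). */
        unsigned long long op = __TLBI_VADDR_RANGE(0x400000ULL, 42, 1, 1, 7, 0);

        printf("operand = 0x%016llx\n", op);
        /* BADDR is the page number, i.e. addr >> PAGE_SHIFT. */
        printf("BADDR = 0x%llx, NUM = %llu, SCALE = %llu, TG = %llu, ASID = %llu\n",
               op & GENMASK_ULL(36, 0), (op >> 39) & 0x1f,
               (op >> 44) & 0x3, (op >> 46) & 0x3, op >> 48);
        return 0;
}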